staging: add Lustre file system client support
author     Peng Tao <bergwolf@gmail.com>
           Thu, 2 May 2013 08:46:55 +0000 (16:46 +0800)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Tue, 14 May 2013 17:54:50 +0000 (13:54 -0400)
Lustre is the most widely deployed distributed file system
in the HPC (High Performance Computing) world. This patch
adds its client-side support.

The code is not very clean and needs to live in drivers/staging
for some time for continued cleanup work. See
drivers/staging/lustre/TODO for details.

The code is based on Lustre master commit faefbfc04

commit faefbfc0460bc00f2ee4c1c1c86aa1e39b9eea49
Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Date:   Tue Apr 30 23:05:21 2013 +0400

    LU-3244 utils: tunefs.lustre should preserve virgin label

Plus a few under-review patches on Whamcloud gerrit:
3.8 kernel support:
http://review.whamcloud.com/#change,5973
http://review.whamcloud.com/#change,5974
http://review.whamcloud.com/#change,5768
http://review.whamcloud.com/#change,5781
http://review.whamcloud.com/#change,5763
http://review.whamcloud.com/#change,5613
http://review.whamcloud.com/#change,5655

3.9 kernel support:
http://review.whamcloud.com/#change,5898
http://review.whamcloud.com/#change,5899

Kconfig/Kbuild:
http://review.whamcloud.com/#change,4646
http://review.whamcloud.com/#change,4644

libcfs cleanup:
http://review.whamcloud.com/#change,2831
http://review.whamcloud.com/#change,4775
http://review.whamcloud.com/#change,4776
http://review.whamcloud.com/#change,4777
http://review.whamcloud.com/#change,4778
http://review.whamcloud.com/#change,4779
http://review.whamcloud.com/#change,4780

All leading/trailing whitespace is removed, to match kernel
coding style. scripts/cleanfile was also run on all Lustre source files.

[made the Kconfig depend on BROKEN as the recent procfs changes cause
this to fail - gregkh]
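
For reference, such a guard typically looks like the sketch below. Only the
option name CONFIG_LUSTRE_FS is taken from the Makefile hunks in this patch;
the prompt string and surrounding lines are illustrative and are not the
actual contents of drivers/staging/lustre/lustre/Kconfig (not shown in this
excerpt):

    config LUSTRE_FS
            tristate "Lustre file system client support"
            # Illustrative guard only: keeps the client from being built
            # until the procfs-related breakage mentioned above is fixed.
            depends on BROKEN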

Signed-off-by: Peng Tao <tao.peng@emc.com>
Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
427 files changed:
drivers/staging/Kconfig
drivers/staging/Makefile
drivers/staging/lustre/Kconfig [new file with mode: 0644]
drivers/staging/lustre/Makefile [new file with mode: 0644]
drivers/staging/lustre/TODO [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/bitmap.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/curproc.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_private.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_string.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_time.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/kp30.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/lucache.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/params_tree.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/api-support.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/api.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lib-lnet.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lib-types.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/linux/api-support.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/linux/lib-types.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/linux/lnet.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lnet.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lnetctl.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lnetst.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/ptllnd.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/socklnd.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/types.h [new file with mode: 0644]
drivers/staging/lustre/lnet/Kconfig [new file with mode: 0644]
drivers/staging/lustre/lnet/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/acceptor.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/api-errno.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/api-ni.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/config.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-eq.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-md.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-me.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-move.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-msg.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-ptl.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lo.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/module.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/peer.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/router.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/router_proc.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/brw_test.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/conctl.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/conrpc.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/conrpc.h [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/console.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/console.h [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/framework.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/module.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/ping_test.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/rpc.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/rpc.h [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/selftest.h [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/timer.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/timer.h [new file with mode: 0644]
drivers/staging/lustre/lustre/Kconfig [new file with mode: 0644]
drivers/staging/lustre/lustre/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_handler.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_store.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/lproc_fid.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_cache.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_handler.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_index.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/lproc_fld.c [new file with mode: 0644]
drivers/staging/lustre/lustre/include/cl_object.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/dt_object.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/interval_tree.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/ioctl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lclient.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lprocfs_status.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_acl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_common.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_compat25.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_debug.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_dlm.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_handles.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_intent.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_lib.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_lite.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_log.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_net.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_quota.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_user.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lvfs.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lvfs_linux.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/obd.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/obd_class.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/obd_support.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lprocfs_status.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lu_object.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lu_ref.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lu_target.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/libiam.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/liblustreapi.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustre_idl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustre_user.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustreapi.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_acl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_capa.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_cfg.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_debug.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_disk.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_dlm.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_eacl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_export.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_fid.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_fld.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_fsfilt.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_ha.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_handles.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_idmap.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_import.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_lib.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_linkea.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_lite.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_log.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_mdc.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_mds.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_mdt.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_net.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_param.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_quota.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_req_layout.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_sec.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_update.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_ver.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lvfs.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/md_object.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_cache.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_cksum.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_class.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_lov.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_ost.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_support.h [new file with mode: 0644]
drivers/staging/lustre/lustre/lclient/glimpse.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lclient/lcommon_cl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lclient/lcommon_misc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/interval_tree.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/l_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_extent.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_flock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_plain.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_pool.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_resource.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/debug.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/fail.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/hash.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/heap.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/libcfs_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/libcfs_mem.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/libcfs_string.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-module.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/lwt.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/module.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/nidstrings.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/prng.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/tracefile.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/tracefile.h [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/upcall_cache.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/watchdog.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/workitem.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/dcache.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/dir.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/file.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_capa.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_close.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_mmap.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_nfs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_rmtacl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/lloop.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/lproc_llite.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/namei.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/remote_perm.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/rw.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/rw26.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/statahead.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/super25.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/symlink.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_dev.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/xattr.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lmv_fld.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lmv_intent.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lmv_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lmv_obd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lproc_lmv.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_cl_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_dev.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_ea.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_log.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_merge.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_obd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_offset.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_pack.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_pool.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_dev.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lproc_lov.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/fsfilt.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/lvfs_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/lvfs_linux.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/lproc_mdc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_locks.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_reint.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/libmgc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/lproc_mgc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/mgc_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/mgc_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/acl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/capa.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/class_obd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/debug.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/dt_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/genops.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/idmap.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/linkea.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/linux/linux-module.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_cat.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_ioctl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_lvfs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_obd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_osd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_swab.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_test.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/local_storage.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/local_storage.h [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lprocfs_status.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lu_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lu_ref.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lu_ucred.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lustre_handles.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lustre_peer.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/md_attrs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/md_local_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/mea.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/obd_config.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/obd_mount.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/obd_mount_server.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/obdo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/statfs_pack.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/uuid.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/echo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/echo_client.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/echo_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/lproc_echo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/lproc_osc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_cache.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_cl_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_dev.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_quota.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/client.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/connection.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/events.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/import.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/layout.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/llog_client.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/llog_net.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/llog_server.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/niobuf.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/nrs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/pack_generic.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/pers.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/pinger.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/recover.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_config.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_gc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_null.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_plain.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/service.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/wirehdr.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/wiretest.c [new file with mode: 0644]

diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 4e8a1794f50a893120c028bf915616216697ada6..c39fe782ba676f1e24be3bc63353a71d88aaf11c 100644 (file)
@@ -140,4 +140,6 @@ source "drivers/staging/netlogic/Kconfig"
 
 source "drivers/staging/dwc2/Kconfig"
 
+source "drivers/staging/lustre/Kconfig"
+
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 415772ea306dd160a9c19477b557ff16fd746fe6..110c59754dda748c046db15c903eb8df740e0fdd 100644 (file)
@@ -62,3 +62,4 @@ obj-$(CONFIG_FIREWIRE_SERIAL) += fwserial/
 obj-$(CONFIG_ZCACHE)           += zcache/
 obj-$(CONFIG_GOLDFISH)         += goldfish/
 obj-$(CONFIG_USB_DWC2)         += dwc2/
+obj-$(CONFIG_LUSTRE_FS)                += lustre/
diff --git a/drivers/staging/lustre/Kconfig b/drivers/staging/lustre/Kconfig
new file mode 100644 (file)
index 0000000..a224d88
--- /dev/null
@@ -0,0 +1,3 @@
+source "drivers/staging/lustre/lustre/Kconfig"
+
+source "drivers/staging/lustre/lnet/Kconfig"
diff --git a/drivers/staging/lustre/Makefile b/drivers/staging/lustre/Makefile
new file mode 100644 (file)
index 0000000..2616289
--- /dev/null
@@ -0,0 +1,4 @@
+subdir-ccflags-y := -I$(src)/include/
+
+obj-$(CONFIG_LUSTRE_FS)                += lustre/
+obj-$(CONFIG_LNET)             += lnet/
diff --git a/drivers/staging/lustre/TODO b/drivers/staging/lustre/TODO
new file mode 100644 (file)
index 0000000..22742d6
--- /dev/null
@@ -0,0 +1,13 @@
+* Fix possible remaining coding style issues.
+* Remove dead code.
+* Separate client/server functionality. Functions only used by the server can
+  be removed from the client.
+* Clean up libcfs layer. Ideally we can remove include/linux/libcfs entirely.
+* Clean up CLIO layer. Lustre client readahead/writeback control needs to
+  better fit what the kernel provides.
+* Add documentation under Documentation/.
+* Other minor misc cleanups...
+
+Please send any patches to Greg Kroah-Hartman <greg@kroah.com>, Andreas Dilger
+<andreas.dilger@intel.com> and Peng Tao <tao.peng@emc.com>. CCing
+hpdd-discuss <hpdd-discuss@lists.01.org> would be great too.
diff --git a/drivers/staging/lustre/include/linux/libcfs/bitmap.h b/drivers/staging/lustre/include/linux/libcfs/bitmap.h
new file mode 100644 (file)
index 0000000..3f1c37b
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LIBCFS_BITMAP_H_
+#define _LIBCFS_BITMAP_H_
+
+
+typedef struct {
+       int          size;
+       unsigned long   data[0];
+} cfs_bitmap_t;
+
+#define CFS_BITMAP_SIZE(nbits) \
+     (((nbits/BITS_PER_LONG)+1)*sizeof(long)+sizeof(cfs_bitmap_t))
+
+static inline
+cfs_bitmap_t *CFS_ALLOCATE_BITMAP(int size)
+{
+       cfs_bitmap_t *ptr;
+
+       OBD_ALLOC(ptr, CFS_BITMAP_SIZE(size));
+       if (ptr == NULL)
+               RETURN(ptr);
+
+       ptr->size = size;
+
+       RETURN (ptr);
+}
+
+#define CFS_FREE_BITMAP(ptr)   OBD_FREE(ptr, CFS_BITMAP_SIZE(ptr->size))
+
+static inline
+void cfs_bitmap_set(cfs_bitmap_t *bitmap, int nbit)
+{
+       set_bit(nbit, bitmap->data);
+}
+
+static inline
+void cfs_bitmap_clear(cfs_bitmap_t *bitmap, int nbit)
+{
+       test_and_clear_bit(nbit, bitmap->data);
+}
+
+static inline
+int cfs_bitmap_check(cfs_bitmap_t *bitmap, int nbit)
+{
+       return test_bit(nbit, bitmap->data);
+}
+
+static inline
+int cfs_bitmap_test_and_clear(cfs_bitmap_t *bitmap, int nbit)
+{
+       return test_and_clear_bit(nbit, bitmap->data);
+}
+
+/* return 1 if the bitmap has no bits set */
+static inline
+int cfs_bitmap_check_empty(cfs_bitmap_t *bitmap)
+{
+       return find_first_bit(bitmap->data, bitmap->size) == bitmap->size;
+}
+
+static inline
+void cfs_bitmap_copy(cfs_bitmap_t *new, cfs_bitmap_t *old)
+{
+       int newsize;
+
+       LASSERT(new->size >= old->size);
+       newsize = new->size;
+       memcpy(new, old, CFS_BITMAP_SIZE(old->size));
+       new->size = newsize;
+}
+
+#define cfs_foreach_bit(bitmap, pos)                                   \
+       for ((pos) = find_first_bit((bitmap)->data, bitmap->size);      \
+            (pos) < (bitmap)->size;                                    \
+            (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1))
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/curproc.h b/drivers/staging/lustre/include/linux/libcfs/curproc.h
new file mode 100644 (file)
index 0000000..90d7ce6
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/curproc.h
+ *
+ * Lustre curproc API declaration
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LIBCFS_CURPROC_H__
+#define __LIBCFS_CURPROC_H__
+
+/*
+ * Portable API to access common characteristics of "current" UNIX process.
+ *
+ * Implemented in portals/include/libcfs/<os>/
+ */
+int    cfs_curproc_groups_nr(void);
+int    current_is_in_group(gid_t group);
+void   cfs_curproc_groups_dump(gid_t *array, int size);
+
+/*
+ * Plus, platform-specific constant
+ *
+ * CFS_CURPROC_COMM_MAX,
+ *
+ * and opaque scalar type
+ *
+ * kernel_cap_t
+ */
+
+/* check if task is running in compat mode.*/
+int current_is_32bit(void);
+#define current_pid()          (current->pid)
+#define current_comm()         (current->comm)
+int cfs_get_environ(const char *key, char *value, int *val_len);
+
+typedef __u32 cfs_cap_t;
+
+#define CFS_CAP_CHOWN             0
+#define CFS_CAP_DAC_OVERRIDE       1
+#define CFS_CAP_DAC_READ_SEARCH         2
+#define CFS_CAP_FOWNER           3
+#define CFS_CAP_FSETID           4
+#define CFS_CAP_LINUX_IMMUTABLE         9
+#define CFS_CAP_SYS_ADMIN            21
+#define CFS_CAP_SYS_BOOT              23
+#define CFS_CAP_SYS_RESOURCE      24
+
+#define CFS_CAP_FS_MASK ((1 << CFS_CAP_CHOWN) |                 \
+                        (1 << CFS_CAP_DAC_OVERRIDE) |    \
+                        (1 << CFS_CAP_DAC_READ_SEARCH) |       \
+                        (1 << CFS_CAP_FOWNER) |                \
+                        (1 << CFS_CAP_FSETID ) |              \
+                        (1 << CFS_CAP_LINUX_IMMUTABLE) |       \
+                        (1 << CFS_CAP_SYS_ADMIN) |          \
+                        (1 << CFS_CAP_SYS_BOOT) |            \
+                        (1 << CFS_CAP_SYS_RESOURCE))
+
+void cfs_cap_raise(cfs_cap_t cap);
+void cfs_cap_lower(cfs_cap_t cap);
+int cfs_cap_raised(cfs_cap_t cap);
+cfs_cap_t cfs_curproc_cap_pack(void);
+void cfs_curproc_cap_unpack(cfs_cap_t cap);
+int cfs_capable(cfs_cap_t cap);
+
+/* __LIBCFS_CURPROC_H__ */
+#endif
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h
new file mode 100644 (file)
index 0000000..6dd5a7d
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LIBCFS_H__
+#define __LIBCFS_LIBCFS_H__
+
+#if !__GNUC__
+#define __attribute__(x)
+#endif
+
+#include <linux/libcfs/linux/libcfs.h>
+
+#include "curproc.h"
+
+#ifndef offsetof
+# define offsetof(typ,memb) ((long)(long_ptr_t)((char *)&(((typ *)0)->memb)))
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(a) ((sizeof (a)) / (sizeof ((a)[0])))
+#endif
+
+#if !defined(swap)
+#define swap(x,y) do { typeof(x) z = x; x = y; y = z; } while (0)
+#endif
+
+#if !defined(container_of)
+/* given a pointer @ptr to the field @member embedded into type (usually
+ * struct) @type, return pointer to the embedding instance of @type. */
+#define container_of(ptr, type, member) \
+       ((type *)((char *)(ptr)-(char *)(&((type *)0)->member)))
+#endif
+
+static inline int __is_po2(unsigned long long val)
+{
+       return !(val & (val - 1));
+}
+
+#define IS_PO2(val) __is_po2((unsigned long long)(val))
+
+#define LOWEST_BIT_SET(x)       ((x) & ~((x) - 1))
+
+/*
+ * Lustre Error Checksum: calculates checksum
+ * of Hex number by XORing each bit.
+ */
+#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \
+                          ((hexnum) >> 8 & 0xf))
+
+
+/*
+ * Some (nomina odiosa sunt) platforms define NULL as naked 0. This confuses
+ * Lustre RETURN(NULL) macro.
+ */
+#if defined(NULL)
+#undef NULL
+#endif
+
+#define NULL ((void *)0)
+
+#define LUSTRE_SRV_LNET_PID      LUSTRE_LNET_PID
+
+
+#include <linux/list.h>
+
+#ifndef cfs_for_each_possible_cpu
+#  error cfs_for_each_possible_cpu is not supported by kernel!
+#endif
+
+/* libcfs tcpip */
+int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask);
+int libcfs_ipif_enumerate(char ***names);
+void libcfs_ipif_free_enumeration(char **names, int n);
+int libcfs_sock_listen(socket_t **sockp, __u32 ip, int port, int backlog);
+int libcfs_sock_accept(socket_t **newsockp, socket_t *sock);
+void libcfs_sock_abort_accept(socket_t *sock);
+int libcfs_sock_connect(socket_t **sockp, int *fatal,
+                       __u32 local_ip, int local_port,
+                       __u32 peer_ip, int peer_port);
+int libcfs_sock_setbuf(socket_t *socket, int txbufsize, int rxbufsize);
+int libcfs_sock_getbuf(socket_t *socket, int *txbufsize, int *rxbufsize);
+int libcfs_sock_getaddr(socket_t *socket, int remote, __u32 *ip, int *port);
+int libcfs_sock_write(socket_t *sock, void *buffer, int nob, int timeout);
+int libcfs_sock_read(socket_t *sock, void *buffer, int nob, int timeout);
+void libcfs_sock_release(socket_t *sock);
+
+/* libcfs watchdogs */
+struct lc_watchdog;
+
+/* Add a watchdog which fires after "time" milliseconds of delay.  You have to
+ * touch it once to enable it. */
+struct lc_watchdog *lc_watchdog_add(int time,
+                                   void (*cb)(pid_t pid, void *),
+                                   void *data);
+
+/* Enables a watchdog and resets its timer. */
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout);
+#define CFS_GET_TIMEOUT(svc) (max_t(int, obd_timeout,             \
+                         AT_OFF ? 0 : at_get(&svc->srv_at_estimate)) * \
+                         svc->srv_watchdog_factor)
+
+/* Disable a watchdog; touch it to restart it. */
+void lc_watchdog_disable(struct lc_watchdog *lcw);
+
+/* Clean up the watchdog */
+void lc_watchdog_delete(struct lc_watchdog *lcw);
+
+/* Dump a debug log */
+void lc_watchdog_dumplog(pid_t pid, void *data);
+
+
+/* need both kernel and user-land acceptor */
+#define LNET_ACCEPTOR_MIN_RESERVED_PORT    512
+#define LNET_ACCEPTOR_MAX_RESERVED_PORT    1023
+
+/*
+ * libcfs pseudo device operations
+ *
+ * struct psdev_t and
+ * misc_register() and
+ * misc_deregister() are declared in
+ * libcfs/<os>/<os>-prim.h
+ *
+ * It's just a draft for now.
+ */
+
+struct cfs_psdev_file {
+       unsigned long   off;
+       void        *private_data;
+       unsigned long   reserved1;
+       unsigned long   reserved2;
+};
+
+struct cfs_psdev_ops {
+       int (*p_open)(unsigned long, void *);
+       int (*p_close)(unsigned long, void *);
+       int (*p_read)(struct cfs_psdev_file *, char *, unsigned long);
+       int (*p_write)(struct cfs_psdev_file *, char *, unsigned long);
+       int (*p_ioctl)(struct cfs_psdev_file *, unsigned long, void *);
+};
+
+/*
+ * Drop into debugger, if possible. Implementation is provided by platform.
+ */
+
+void cfs_enter_debugger(void);
+
+/*
+ * Defined by platform
+ */
+int unshare_fs_struct(void);
+sigset_t cfs_get_blocked_sigs(void);
+sigset_t cfs_block_allsigs(void);
+sigset_t cfs_block_sigs(unsigned long sigs);
+sigset_t cfs_block_sigsinv(unsigned long sigs);
+void cfs_restore_sigs(sigset_t);
+int cfs_signal_pending(void);
+void cfs_clear_sigpending(void);
+
+int convert_server_error(__u64 ecode);
+int convert_client_oflag(int cflag, int *result);
+
+/*
+ * Stack-tracing filling.
+ */
+
+/*
+ * Platform-dependent data-type to hold stack frames.
+ */
+struct cfs_stack_trace;
+
+/*
+ * Fill @trace with current back-trace.
+ */
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace);
+
+/*
+ * Return instruction pointer for frame @frame_no. NULL if @frame_no is
+ * invalid.
+ */
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no);
+
+#ifndef O_NOACCESS
+#define O_NOACCESS O_NONBLOCK
+#endif
+
+/*
+ * Universal open flags.
+ */
+#define CFS_O_NOACCESS   0003
+#define CFS_O_ACCMODE     CFS_O_NOACCESS
+#define CFS_O_CREAT         0100
+#define CFS_O_EXCL           0200
+#define CFS_O_NOCTTY       0400
+#define CFS_O_TRUNC         01000
+#define CFS_O_APPEND       02000
+#define CFS_O_NONBLOCK   04000
+#define CFS_O_NDELAY       CFS_O_NONBLOCK
+#define CFS_O_SYNC           010000
+#define CFS_O_ASYNC         020000
+#define CFS_O_DIRECT       040000
+#define CFS_O_LARGEFILE         0100000
+#define CFS_O_DIRECTORY         0200000
+#define CFS_O_NOFOLLOW   0400000
+#define CFS_O_NOATIME     01000000
+
+/* convert local open flags to universal open flags */
+int cfs_oflags2univ(int flags);
+/* convert universal open flags to local open flags */
+int cfs_univ2oflags(int flags);
+
+/*
+ * Random number handling
+ */
+
+/* returns a random 32-bit integer */
+unsigned int cfs_rand(void);
+/* seed the generator */
+void cfs_srand(unsigned int, unsigned int);
+void cfs_get_random_bytes(void *buf, int size);
+
+#include <linux/libcfs/libcfs_debug.h>
+#include <linux/libcfs/libcfs_cpu.h>
+#include <linux/libcfs/libcfs_private.h>
+#include <linux/libcfs/libcfs_ioctl.h>
+#include <linux/libcfs/libcfs_prim.h>
+#include <linux/libcfs/libcfs_time.h>
+#include <linux/libcfs/libcfs_string.h>
+#include <linux/libcfs/libcfs_kernelcomm.h>
+#include <linux/libcfs/libcfs_workitem.h>
+#include <linux/libcfs/libcfs_hash.h>
+#include <linux/libcfs/libcfs_heap.h>
+#include <linux/libcfs/libcfs_fail.h>
+#include <linux/libcfs/params_tree.h>
+#include <linux/libcfs/libcfs_crypto.h>
+
+/* container_of depends on "likely" which is defined in libcfs_private.h */
+static inline void *__container_of(void *ptr, unsigned long shift)
+{
+       if (unlikely(IS_ERR(ptr) || ptr == NULL))
+               return ptr;
+       else
+               return (char *)ptr - shift;
+}
+
+#define container_of0(ptr, type, member) \
+       ((type *)__container_of((void *)(ptr), offsetof(type, member)))
+
+#define SET_BUT_UNUSED(a) do { } while(sizeof(a) - sizeof(a))
+
+#define _LIBCFS_H
+
+#endif /* _LIBCFS_H */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h
new file mode 100644 (file)
index 0000000..6ae7415
--- /dev/null
@@ -0,0 +1,214 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_cpu.h
+ *
+ * CPU partition
+ *   . A CPU partition is a virtual processing unit.
+ *
+ *   . A CPU partition can contain 1-N cores, or 1-N NUMA nodes;
+ *     in other words, a CPU partition is a pool of processors.
+ *
+ * CPU Partition Table (CPT)
+ *   . a set of CPU partitions
+ *
+ *   . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP
+ *
+ *   . Users can specify the total number of CPU partitions when creating a
+ *     CPT; CPU partition IDs always start from 0.
+ *
+ *     Example: if there are 8 cores on the system, while creating a CPT
+ *     with cpu_npartitions=4:
+ *           core[0, 1] = partition[0], core[2, 3] = partition[1]
+ *           core[4, 5] = partition[2], core[6, 7] = partition[3]
+ *
+ *       cpu_npartitions=1:
+ *           core[0, 1, ... 7] = partition[0]
+ *
+ *   . Users can also specify CPU partitions by a string pattern
+ *
+ *     Examples: cpu_partitions="0[0,1], 1[2,3]"
+ *            cpu_partitions="N 0[0-3], 1[4-8]"
+ *
+ *     The first character "N" means the following numbers are NUMA IDs.
+ *
+ *   . NUMA allocators and CPU-affinity threads are built on top of CPU
+ *     partitions, instead of HW CPUs or HW nodes.
+ *
+ *   . By default, Lustre modules should refer to the global cfs_cpt_table,
+ *     instead of accessing HW CPUs directly, so concurrency of Lustre can be
+ *     configured by cpu_npartitions of the global cfs_cpt_table
+ *
+ *   . If cpu_npartitions=1 (all CPUs in one pool), Lustre should work the
+ *     same way as 2.2 or earlier versions.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_CPU_H__
+#define __LIBCFS_CPU_H__
+
+#ifndef HAVE_LIBCFS_CPT
+
+typedef unsigned long          cpumask_t;
+typedef unsigned long          nodemask_t;
+
+struct cfs_cpt_table {
+       /* # of CPU partitions */
+       int                     ctb_nparts;
+       /* cpu mask */
+       cpumask_t               ctb_mask;
+       /* node mask */
+       nodemask_t              ctb_nodemask;
+       /* version */
+       __u64                   ctb_version;
+};
+
+#endif /* !HAVE_LIBCFS_CPT */
+
+/* any CPU partition */
+#define CFS_CPT_ANY            (-1)
+
+extern struct cfs_cpt_table    *cfs_cpt_table;
+
+/**
+ * destroy a CPU partition table
+ */
+void cfs_cpt_table_free(struct cfs_cpt_table *cptab);
+/**
+ * create a cfs_cpt_table with \a ncpt number of partitions
+ */
+struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt);
+/**
+ * print string information of cpt-table
+ */
+int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
+ * return total number of CPU partitions in \a cptab
+ */
+int cfs_cpt_number(struct cfs_cpt_table *cptab);
+/**
+ * return number of HW cores or hyper-threads in CPU partition \a cpt
+ */
+int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * is there any online CPU in CPU partition \a cpt
+ */
+int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return cpumask of CPU partition \a cpt
+ */
+cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return nodemask of CPU partition \a cpt
+ */
+nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * map the current HW processor ID to a CPU-partition ID in \a cptab
+ */
+int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap);
+/**
+ * map HW processor ID \a cpu to a CPU-partition ID in \a cptab
+ */
+int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu);
+/**
+ * bind the current thread to CPU partition \a cpt of \a cptab
+ */
+int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * add \a cpu to CPU partition \a cpt of \a cptab, return 1 on success,
+ * otherwise return 0
+ */
+int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * remove \a cpu from CPU partition \a cpt of \a cptab
+ */
+void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * add all cpus in \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab,
+                       int cpt, cpumask_t *mask);
+/**
+ * remove all cpus in \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab,
+                          int cpt, cpumask_t *mask);
+/**
+ * add all cpus in NUMA node \a node to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node);
+/**
+ * remove all cpus in NUMA node \a node from CPU partition \a cpt
+ */
+void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node);
+
+/**
+ * add all cpus in node mask \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab,
+                        int cpt, nodemask_t *mask);
+/**
+ * remove all cpus in node mask \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab,
+                           int cpt, nodemask_t *mask);
+/**
+ * unset all cpus for CPU partition \a cpt
+ */
+void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * convert partition ID \a cpt to a NUMA node ID; if there is more than one
+ * node in this partition, it might return a different node ID each time.
+ */
+int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt);
+
+/**
+ * iterate over all CPU partitions in \a cptab
+ */
+#define cfs_cpt_for_each(i, cptab)     \
+       for (i = 0; i < cfs_cpt_number(cptab); i++)
+
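+/*
+ * Usage sketch (illustrative only; the partition count of 4 is arbitrary and
+ * error handling is omitted): allocate a table, bind the current thread to
+ * each partition in turn, then free the table.
+ *
+ *      struct cfs_cpt_table *cptab = cfs_cpt_table_alloc(4);
+ *      int cpt;
+ *
+ *      if (cptab != NULL) {
+ *              cfs_cpt_for_each(cpt, cptab)
+ *                      cfs_cpt_bind(cptab, cpt);
+ *              cfs_cpt_table_free(cptab);
+ *      }
+ */
+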
+#ifndef __read_mostly
+# define __read_mostly
+#endif
+
+#ifndef ____cacheline_aligned
+#define ____cacheline_aligned
+#endif
+
+int  cfs_cpu_init(void);
+void cfs_cpu_fini(void);
+
+#endif /* __LIBCFS_CPU_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h
new file mode 100644 (file)
index 0000000..64ca62f
--- /dev/null
@@ -0,0 +1,201 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+#ifndef _LIBCFS_CRYPTO_H
+#define _LIBCFS_CRYPTO_H
+
+struct cfs_crypto_hash_type {
+       char            *cht_name;      /**< hash algorithm name, equal to
+                                        * format name for crypto api */
+       unsigned int    cht_key;        /**< default initial key (valid for
+                                        * 4-byte contexts like crc32, adler) */
+       unsigned int    cht_size;       /**< hash digest size */
+};
+
+enum cfs_crypto_hash_alg {
+       CFS_HASH_ALG_NULL       = 0,
+       CFS_HASH_ALG_ADLER32,
+       CFS_HASH_ALG_CRC32,
+       CFS_HASH_ALG_MD5,
+       CFS_HASH_ALG_SHA1,
+       CFS_HASH_ALG_SHA256,
+       CFS_HASH_ALG_SHA384,
+       CFS_HASH_ALG_SHA512,
+       CFS_HASH_ALG_CRC32C,
+       CFS_HASH_ALG_MAX
+};
+
+static struct cfs_crypto_hash_type hash_types[] = {
+       [CFS_HASH_ALG_NULL]    = { "null",     0,      0 },
+       [CFS_HASH_ALG_ADLER32] = { "adler32",  1,      4 },
+       [CFS_HASH_ALG_CRC32]   = { "crc32",   ~0,      4 },
+       [CFS_HASH_ALG_CRC32C]  = { "crc32c",  ~0,      4 },
+       [CFS_HASH_ALG_MD5]     = { "md5",      0,     16 },
+       [CFS_HASH_ALG_SHA1]    = { "sha1",     0,     20 },
+       [CFS_HASH_ALG_SHA256]  = { "sha256",   0,     32 },
+       [CFS_HASH_ALG_SHA384]  = { "sha384",   0,     48 },
+       [CFS_HASH_ALG_SHA512]  = { "sha512",   0,     64 },
+};
+
+/**    Return a pointer to the hash type for a valid hash algorithm identifier */
+static inline const struct cfs_crypto_hash_type *
+                   cfs_crypto_hash_type(unsigned char hash_alg)
+{
+       struct cfs_crypto_hash_type *ht;
+
+       if (hash_alg < CFS_HASH_ALG_MAX) {
+               ht = &hash_types[hash_alg];
+               if (ht->cht_name)
+                       return ht;
+       }
+       return NULL;
+}
+
+/**     Return hash name for valid hash algorithm identifier or "unknown" */
+static inline const char *cfs_crypto_hash_name(unsigned char hash_alg)
+{
+       const struct cfs_crypto_hash_type *ht;
+
+       ht = cfs_crypto_hash_type(hash_alg);
+       if (ht)
+               return ht->cht_name;
+       else
+               return "unknown";
+}
+
+/**     Return digest size for valid algorithm identifier or 0 */
+static inline int cfs_crypto_hash_digestsize(unsigned char hash_alg)
+{
+       const struct cfs_crypto_hash_type *ht;
+
+       ht = cfs_crypto_hash_type(hash_alg);
+       if (ht)
+               return ht->cht_size;
+       else
+               return 0;
+}
+
+/**     Return hash identifier for valid hash algorithm name or 0xFF */
+static inline unsigned char cfs_crypto_hash_alg(const char *algname)
+{
+       unsigned char   i;
+
+       for (i = 0; i < CFS_HASH_ALG_MAX; i++)
+               if (!strcmp(hash_types[i].cht_name, algname))
+                       break;
+       return (i == CFS_HASH_ALG_MAX ? 0xFF : i);
+}
+
+/**    Calculate hash digest for buffer.
+ *     @param alg       id of hash algorithm
+ *     @param buf       buffer of data
+ *     @param buf_len   buffer length
+ *     @param key       initial value for algorithm; if it is NULL,
+ *                      the default initial value is used
+ *     @param key_len   length of initial value
+ *     @param hash      [out] pointer to hash; if it is NULL, hash_len is
+ *                      set to the valid digest size in bytes and -ENOSPC
+ *                      is returned
+ *     @param hash_len  [in,out] size of hash buffer
+ *     @returns         status of operation
+ *     @retval -EINVAL  if buf, buf_len, hash_len or alg is invalid
+ *     @retval -ENODEV  if this algorithm is unsupported
+ *     @retval -ENOSPC  if pointer to hash is NULL, or hash_len is less
+ *                      than the digest size
+ *     @retval 0        for success
+ *     @retval < 0      other errors from lower layers
+ */
+int cfs_crypto_hash_digest(unsigned char alg,
+                          const void *buf, unsigned int buf_len,
+                          unsigned char *key, unsigned int key_len,
+                          unsigned char *hash, unsigned int *hash_len);
+
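+/*
+ * Usage sketch (illustrative; buf and buf_len stand for a caller-supplied
+ * data buffer): probe the digest size by passing a NULL hash pointer, then
+ * compute an adler32 digest in a single call.
+ *
+ *      unsigned char digest[64];
+ *      unsigned int hash_len = 0;
+ *      int rc;
+ *
+ *      rc = cfs_crypto_hash_digest(CFS_HASH_ALG_ADLER32, buf, buf_len,
+ *                                  NULL, 0, NULL, &hash_len);
+ *      if (rc == -ENOSPC && hash_len <= sizeof(digest))
+ *              rc = cfs_crypto_hash_digest(CFS_HASH_ALG_ADLER32, buf, buf_len,
+ *                                          NULL, 0, digest, &hash_len);
+ */
+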
+/* cfs crypto hash descriptor */
+struct cfs_crypto_hash_desc;
+
+/**    Allocate and initialize descriptor for hash algorithm.
+ *     @param alg       algorithm id
+ *     @param key       initial value for algorithm; if it is NULL,
+ *                      the default initial value is used
+ *     @param key_len   length of initial value
+ *     @returns         pointer to descriptor of hash instance
+ *     @retval ERR_PTR(error) when errors occurred
+ */
+struct cfs_crypto_hash_desc*
+       cfs_crypto_hash_init(unsigned char alg,
+                            unsigned char *key, unsigned int key_len);
+
+/**    Update digest with part of data.
+ *     @param desc      hash descriptor
+ *     @param page      data page
+ *     @param offset    data offset
+ *     @param len       data length
+ *     @returns         status of operation
+ *     @retval 0        for success
+ */
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc,
+                               struct page *page, unsigned int offset,
+                               unsigned int len);
+
+/**    Update digest with part of data.
+ *     @param desc      hash descriptor
+ *     @param buf       pointer to data buffer
+ *     @param buf_len   size of data in buffer
+ *     @returns         status of operation
+ *     @retval 0        for success
+ */
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf,
+                          unsigned int buf_len);
+
+/**    Finalize hash calculation, copy hash digest to buffer, destroy hash
+ *     descriptor.
+ *     @param desc      hash descriptor
+ *     @param hash      buffer pointer to store hash digest
+ *     @param hash_len  pointer to hash buffer size; if NULL, only destroy
+ *                      the hash descriptor
+ *     @returns         status of operation
+ *     @retval -ENOSPC  if hash is NULL, or *hash_len is less than the
+ *                      digest size
+ *     @retval 0        for success
+ *     @retval < 0      other errors from lower layers
+ */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc,
+                         unsigned char *hash, unsigned int *hash_len);
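+
+/*
+ * Usage sketch of the descriptor interface (illustrative; buf and buf_len
+ * stand for a caller-supplied data buffer): initialize a crc32 hash, feed
+ * the data incrementally, then finalize into a digest buffer.
+ *
+ *      struct cfs_crypto_hash_desc *desc;
+ *      unsigned char digest[4];
+ *      unsigned int hash_len = sizeof(digest);
+ *
+ *      desc = cfs_crypto_hash_init(CFS_HASH_ALG_CRC32, NULL, 0);
+ *      if (!IS_ERR(desc)) {
+ *              cfs_crypto_hash_update(desc, buf, buf_len);
+ *              cfs_crypto_hash_final(desc, digest, &hash_len);
+ *      }
+ */
+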
+/**
+ *      Register crypto hash algorithms
+ */
+int cfs_crypto_register(void);
+
+/**
+ *      Unregister crypto hash algorithms
+ */
+void cfs_crypto_unregister(void);
+
+/**    Return hash speed in MBytes per second for a valid hash algorithm
+ *     identifier. If the test was unsuccessful, -1 is returned.
+ */
+int cfs_crypto_hash_speed(unsigned char hash_alg);
+#endif /* _LIBCFS_CRYPTO_H */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h
new file mode 100644 (file)
index 0000000..dd8ac2f
--- /dev/null
@@ -0,0 +1,350 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_debug.h
+ *
+ * Debug messages and assertions
+ *
+ */
+
+#ifndef __LIBCFS_DEBUG_H__
+#define __LIBCFS_DEBUG_H__
+
+/*
+ *  Debugging
+ */
+extern unsigned int libcfs_subsystem_debug;
+extern unsigned int libcfs_stack;
+extern unsigned int libcfs_debug;
+extern unsigned int libcfs_printk;
+extern unsigned int libcfs_console_ratelimit;
+extern unsigned int libcfs_watchdog_ratelimit;
+extern unsigned int libcfs_console_max_delay;
+extern unsigned int libcfs_console_min_delay;
+extern unsigned int libcfs_console_backoff;
+extern unsigned int libcfs_debug_binary;
+extern char libcfs_debug_file_path_arr[PATH_MAX];
+
+int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys);
+int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys);
+
+/* Has there been an LBUG? */
+extern unsigned int libcfs_catastrophe;
+extern unsigned int libcfs_panic_on_lbug;
+
+/**
+ * Format for debug message headers
+ */
+struct ptldebug_header {
+       __u32 ph_len;
+       __u32 ph_flags;
+       __u32 ph_subsys;
+       __u32 ph_mask;
+       __u16 ph_cpu_id;
+       __u16 ph_type;
+       __u32 ph_sec;
+       __u64 ph_usec;
+       __u32 ph_stack;
+       __u32 ph_pid;
+       __u32 ph_extern_pid;
+       __u32 ph_line_num;
+} __attribute__((packed));
+
+
+#define PH_FLAG_FIRST_RECORD 1
+
+/* Debugging subsystems (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+#define S_UNDEFINED   0x00000001
+#define S_MDC   0x00000002
+#define S_MDS   0x00000004
+#define S_OSC   0x00000008
+#define S_OST   0x00000010
+#define S_CLASS       0x00000020
+#define S_LOG   0x00000040
+#define S_LLITE       0x00000080
+#define S_RPC   0x00000100
+#define S_MGMT 0x00000200
+#define S_LNET 0x00000400
+#define S_LND   0x00000800 /* ALL LNDs */
+#define S_PINGER      0x00001000
+#define S_FILTER      0x00002000
+/* unused */
+#define S_ECHO 0x00008000
+#define S_LDLM 0x00010000
+#define S_LOV   0x00020000
+#define S_LQUOTA      0x00040000
+#define S_OSD          0x00080000
+/* unused */
+/* unused */
+/* unused */
+#define S_LMV   0x00800000 /* b_new_cmd */
+/* unused */
+#define S_SEC   0x02000000 /* upcall cache */
+#define S_GSS   0x04000000 /* b_new_cmd */
+/* unused */
+#define S_MGC   0x10000000
+#define S_MGS   0x20000000
+#define S_FID   0x40000000 /* b_new_cmd */
+#define S_FLD   0x80000000 /* b_new_cmd */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+
+/* Debugging masks (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+#define D_TRACE       0x00000001 /* ENTRY/EXIT markers */
+#define D_INODE       0x00000002
+#define D_SUPER       0x00000004
+#define D_EXT2 0x00000008 /* anything from ext2_debug */
+#define D_MALLOC      0x00000010 /* print malloc, free information */
+#define D_CACHE       0x00000020 /* cache-related items */
+#define D_INFO 0x00000040 /* general information */
+#define D_IOCTL       0x00000080 /* ioctl related information */
+#define D_NETERROR    0x00000100 /* network errors */
+#define D_NET   0x00000200 /* network communications */
+#define D_WARNING     0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */
+#define D_BUFFS       0x00000800
+#define D_OTHER       0x00001000
+#define D_DENTRY      0x00002000
+#define D_NETTRACE    0x00004000
+#define D_PAGE 0x00008000 /* bulk page handling */
+#define D_DLMTRACE    0x00010000
+#define D_ERROR       0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG       0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA     0x00080000 /* recovery and failover */
+#define D_RPCTRACE    0x00100000 /* for distributed debugging */
+#define D_VFSTRACE    0x00200000
+#define D_READA       0x00400000 /* read-ahead */
+#define D_MMAP 0x00800000
+#define D_CONFIG      0x01000000
+#define D_CONSOLE     0x02000000
+#define D_QUOTA       0x04000000
+#define D_SEC   0x08000000
+#define D_LFSCK              0x10000000 /* For both OI scrub and LFSCK */
+/* keep these in sync with lnet/{utils,libcfs}/debug.c */
+
+#define D_HSM   D_TRACE
+
+#define D_CANTMASK   (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)
+
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
+
+#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600))        /* jiffies */
+#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */
+#define CDEBUG_DEFAULT_BACKOFF   2
+typedef struct {
+       cfs_time_t      cdls_next;
+       unsigned int    cdls_delay;
+       int          cdls_count;
+} cfs_debug_limit_state_t;
+
+struct libcfs_debug_msg_data {
+       const char             *msg_file;
+       const char             *msg_fn;
+       int                   msg_subsys;
+       int                   msg_line;
+       int                   msg_mask;
+       cfs_debug_limit_state_t  *msg_cdls;
+};
+
+#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls)   \
+do {                                                   \
+       (data)->msg_subsys = DEBUG_SUBSYSTEM;          \
+       (data)->msg_file   = __FILE__;                \
+       (data)->msg_fn     = __FUNCTION__;                \
+       (data)->msg_line   = __LINE__;                \
+       (data)->msg_cdls   = (cdls);                    \
+       (data)->msg_mask   = (mask);                    \
+} while (0)
+
+#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls)    \
+       static struct libcfs_debug_msg_data dataname = {    \
+              .msg_subsys = DEBUG_SUBSYSTEM,          \
+              .msg_file   = __FILE__,                \
+              .msg_fn     = __FUNCTION__,                \
+              .msg_line   = __LINE__,                \
+              .msg_cdls   = (cdls)      };           \
+       dataname.msg_mask   = (mask);
+
+
+
+/**
+ * Filters out logging messages based on mask and subsystem.
+ */
+static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem)
+{
+       return mask & D_CANTMASK ||
+               ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem));
+}
+
+#define __CDEBUG(cdls, mask, format, ...)                             \
+do {                                                               \
+       static struct libcfs_debug_msg_data msgdata;                \
+                                                                       \
+       CFS_CHECK_STACK(&msgdata, mask, cdls);                    \
+                                                                       \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
+               LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls);       \
+               libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__);     \
+       }                                                              \
+} while (0)
+
+#define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__)
+
+#define CDEBUG_LIMIT(mask, format, ...)         \
+do {                                       \
+       static cfs_debug_limit_state_t cdls;    \
+                                               \
+       __CDEBUG(&cdls, mask, format, ## __VA_ARGS__);\
+} while (0)
+
+
+
+
+#define CWARN(format, ...)       CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__)
+#define CERROR(format, ...)     CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__)
+#define CNETERR(format, ...)    CDEBUG_LIMIT(D_NETERROR, format, ## __VA_ARGS__)
+#define CEMERG(format, ...)     CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__)
+
+#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__)
+#define LCONSOLE_INFO(format, ...)  CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__)
+#define LCONSOLE_WARN(format, ...)  CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__)
+#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \
+                          "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__)
+#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__)
+
+#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__)
+
+
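+/*
+ * Usage sketch (illustrative; ino and rc stand for caller variables): a .c
+ * file defines DEBUG_SUBSYSTEM before including the libcfs headers, then
+ * emits debug, error and console messages.
+ *
+ *      #define DEBUG_SUBSYSTEM S_LLITE
+ *
+ *      CDEBUG(D_INODE, "revalidating inode %lu\n", ino);
+ *      CERROR("read failed: rc = %d\n", rc);
+ *      LCONSOLE_WARN("file system is low on space\n");
+ */
+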
+void libcfs_log_goto(struct libcfs_debug_msg_data *, const char *, long_ptr_t);
+#define GOTO(label, rc)                                                 \
+do {                                                               \
+       if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {                \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);     \
+               libcfs_log_goto(&msgdata, #label, (long_ptr_t)(rc));    \
+       } else {                                                        \
+               (void)(rc);                                          \
+       }                                                              \
+       goto label;                                                  \
+} while (0)
+
+
+/*
+ * If rc == NULL, it must be coded as RETURN((void *)NULL), otherwise
+ * there will be a warning on OS X.
+ */
+#if defined(__GNUC__)
+
+long libcfs_log_return(struct libcfs_debug_msg_data *, long rc);
+#if BITS_PER_LONG > 32
+#define RETURN(rc)                                                     \
+do {                                                                   \
+       EXIT_NESTING;                                                   \
+       if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {                \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);     \
+               return (typeof(rc))libcfs_log_return(&msgdata,          \
+                                                    (long)(rc));       \
+       }                                                               \
+                                                                       \
+       return (rc);                                                    \
+} while (0)
+#else /* BITS_PER_LONG == 32 */
+/* We need an on-stack variable, because we cannot cast a 32-bit pointer
+ * directly to (long long) without generating a compiler warning/error, yet
+ * casting directly to (long) will truncate 64-bit return values. The log
+ * values will print as 32-bit values, but they always have been. LU-1436
+ */
+#define RETURN(rc)                                                     \
+do {                                                                   \
+       EXIT_NESTING;                                                   \
+       if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {                \
+               typeof(rc) __rc = (rc);                                 \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);     \
+               libcfs_log_return(&msgdata, (long_ptr_t)__rc);          \
+               return __rc;                                            \
+       }                                                               \
+                                                                       \
+       return (rc);                                                    \
+} while (0)
+#endif /* BITS_PER_LONG > 32 */
+
+#elif defined(_MSC_VER)
+#define RETURN(rc)                                                   \
+do {                                                               \
+       CDEBUG(D_TRACE, "Process leaving.\n");                    \
+       EXIT_NESTING;                                              \
+       return (rc);                                                \
+} while (0)
+#else
+# error "Unknown compiler"
+#endif /* __GNUC__ */
+
+#define ENTRY                                                     \
+ENTRY_NESTING;                                                   \
+do {                                                               \
+       CDEBUG(D_TRACE, "Process entered\n");                      \
+} while (0)
+
+#define EXIT                                                       \
+do {                                                               \
+       CDEBUG(D_TRACE, "Process leaving\n");                      \
+       EXIT_NESTING;                                              \
+} while (0)
+
+#define RETURN_EXIT                                                    \
+do {                                                                   \
+       EXIT;                                                           \
+       return;                                                         \
+} while (0)
+
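+/*
+ * Usage sketch (illustrative; do_something() and do_something_else() stand
+ * for arbitrary caller code): a function bracketed with ENTRY/RETURN and
+ * using GOTO for its error path, so that entry, exit and error branches all
+ * leave D_TRACE records in the debug log.
+ *
+ *      static int demo_op(struct inode *inode)
+ *      {
+ *              int rc;
+ *
+ *              ENTRY;
+ *              rc = do_something(inode);
+ *              if (rc != 0)
+ *                      GOTO(out, rc);
+ *              rc = do_something_else(inode);
+ *      out:
+ *              RETURN(rc);
+ *      }
+ */
+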
+extern int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+                           const char *format1, ...)
+       __attribute__ ((format (printf, 2, 3)));
+
+extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+                             const char *format1,
+                             va_list args, const char *format2, ...)
+       __attribute__ ((format (printf, 4, 5)));
+
+/* other external symbols that tracefile provides: */
+extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+                                  const char *usr_buffer, int usr_buffer_nob);
+extern int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+                                   const char *knl_buffer, char *append);
+
+#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log"
+
+#endif /* __LIBCFS_DEBUG_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h
new file mode 100644 (file)
index 0000000..8393c27
--- /dev/null
@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#ifndef _LIBCFS_FAIL_H
+#define _LIBCFS_FAIL_H
+
+extern unsigned long cfs_fail_loc;
+extern unsigned int cfs_fail_val;
+
+extern wait_queue_head_t cfs_race_waitq;
+extern int cfs_race_state;
+
+int __cfs_fail_check_set(__u32 id, __u32 value, int set);
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set);
+
+enum {
+       CFS_FAIL_LOC_NOSET      = 0,
+       CFS_FAIL_LOC_ORSET      = 1,
+       CFS_FAIL_LOC_RESET      = 2,
+       CFS_FAIL_LOC_VALUE      = 3
+};
+
+/* Failure injection control */
+#define CFS_FAIL_MASK_SYS    0x0000FF00
+#define CFS_FAIL_MASK_LOC   (0x000000FF | CFS_FAIL_MASK_SYS)
+
+#define CFS_FAILED_BIT       30
+/* CFS_FAILED is 0x40000000 */
+#define CFS_FAILED       (1 << CFS_FAILED_BIT)
+
+#define CFS_FAIL_ONCE_BIT    31
+/* CFS_FAIL_ONCE is 0x80000000 */
+#define CFS_FAIL_ONCE       (1 << CFS_FAIL_ONCE_BIT)
+
+/* The following flags are not meant to be combined */
+#define CFS_FAIL_SKIP  0x20000000 /* skip N times then fail */
+#define CFS_FAIL_SOME  0x10000000 /* only fail N times */
+#define CFS_FAIL_RAND  0x08000000 /* fail 1/N of the times */
+#define CFS_FAIL_USR1  0x04000000 /* user flag */
+
+#define CFS_FAIL_PRECHECK(id) (cfs_fail_loc &&                         \
+                             (cfs_fail_loc & CFS_FAIL_MASK_LOC) ==        \
+                             ((id) & CFS_FAIL_MASK_LOC))
+
+static inline int cfs_fail_check_set(__u32 id, __u32 value,
+                                    int set, int quiet)
+{
+       int ret = 0;
+
+       if (unlikely(CFS_FAIL_PRECHECK(id) &&
+                    (ret = __cfs_fail_check_set(id, value, set)))) {
+               if (quiet) {
+                       CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n",
+                              id, value);
+               } else {
+                       LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n",
+                                     id, value);
+               }
+       }
+
+       return ret;
+}
+
+/* If id hits cfs_fail_loc, return 1, otherwise return 0 */
+#define CFS_FAIL_CHECK(id) \
+       cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0)
+#define CFS_FAIL_CHECK_QUIET(id) \
+       cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1)
+
+/* If id hits cfs_fail_loc and cfs_fail_val == (-1 or value), return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_VALUE(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0)
+#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc |= value and return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_ORSET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0)
+#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc = value and return 1,
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_RESET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0)
+#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1)
+
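+/*
+ * Usage sketch (illustrative; OBD_FAIL_DEMO stands for a fail-location id
+ * defined elsewhere): abort a code path with -EIO whenever cfs_fail_loc has
+ * been set to the matching id.
+ *
+ *      if (CFS_FAIL_CHECK(OBD_FAIL_DEMO))
+ *              return -EIO;
+ */
+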
+static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+       if (unlikely(CFS_FAIL_PRECHECK(id)))
+               return __cfs_fail_timeout_set(id, value, ms, set);
+       else
+               return 0;
+}
+
+/* If id hits cfs_fail_loc, sleep for seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT(id, secs) \
+       cfs_fail_timeout_set(id, 0, (secs) * 1000, CFS_FAIL_LOC_NOSET)
+
+#define CFS_FAIL_TIMEOUT_MS(id, ms) \
+       cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc |= value and
+ * sleep for seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \
+       cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_ORSET)
+
+#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \
+       cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET)
+
+/* The idea here is to synchronise two threads to force a race. The
+ * first thread that calls this with a matching fail_loc is put to
+ * sleep. The next thread that calls with the same fail_loc wakes up
+ * the first and continues. */
+static inline void cfs_race(__u32 id)
+{
+
+       if (CFS_FAIL_PRECHECK(id)) {
+               if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) {
+                       int rc;
+                       cfs_race_state = 0;
+                       CERROR("cfs_race id %x sleeping\n", id);
+                       cfs_wait_event_interruptible(cfs_race_waitq,
+                                                    cfs_race_state != 0, rc);
+                       CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc);
+               } else {
+                       CERROR("cfs_fail_race id %x waking\n", id);
+                       cfs_race_state = 1;
+                       wake_up(&cfs_race_waitq);
+               }
+       }
+}
+#define CFS_RACE(id) cfs_race(id)
+
+#endif /* _LIBCFS_FAIL_H */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h
new file mode 100644 (file)
index 0000000..c5b3715
--- /dev/null
@@ -0,0 +1,850 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_hash.h
+ *
+ * Hashing routines
+ *
+ */
+
+#ifndef __LIBCFS_HASH_H__
+#define __LIBCFS_HASH_H__
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL
+
+/*
+ * Ideally we would use HAVE_HASH_LONG for this, but on Linux we configure
+ * the kernel and user space at the same time, so we need to differentiate
+ * between them explicitly. If this is not needed on other architectures, then
+ * we'll need to move the functions to arch-specific headers.
+ */
+
+#include <linux/hash.h>
+
+#define cfs_hash_long(val, bits)    hash_long(val, bits)
+
+/** disable debug */
+#define CFS_HASH_DEBUG_NONE     0
+/** record hash depth and output to console when it is too deep;
+ *  computing overhead is low but it consumes more memory */
+#define CFS_HASH_DEBUG_1           1
+/** expensive, check key validation */
+#define CFS_HASH_DEBUG_2           2
+
+#define CFS_HASH_DEBUG_LEVEL   CFS_HASH_DEBUG_NONE
+
+struct cfs_hash_ops;
+struct cfs_hash_lock_ops;
+struct cfs_hash_hlist_ops;
+
+typedef union {
+       rwlock_t                rw;             /**< rwlock */
+       spinlock_t              spin;           /**< spinlock */
+} cfs_hash_lock_t;
+
+/**
+ * cfs_hash_bucket is a container of:
+ * - lock, counter ...
+ * - array of hash-heads starting from hsb_head[0]; a hash-head can be one of
+ *   . cfs_hash_head_t
+ *   . cfs_hash_head_dep_t
+ *   . cfs_hash_dhead_t
+ *   . cfs_hash_dhead_dep_t
+ *   depending on the requirements of the user
+ * - some extra bytes (caller can require it while creating hash)
+ */
+typedef struct cfs_hash_bucket {
+       cfs_hash_lock_t         hsb_lock;       /**< bucket lock */
+       __u32                   hsb_count;      /**< current entries */
+       __u32                   hsb_version;    /**< change version */
+       unsigned int            hsb_index;      /**< index of bucket */
+       int                     hsb_depmax;     /**< max depth on bucket */
+       long                    hsb_head[0];    /**< hash-head array */
+} cfs_hash_bucket_t;
+
+/**
+ * cfs_hash bucket descriptor; it normally lives on the caller's stack
+ */
+typedef struct cfs_hash_bd {
+       cfs_hash_bucket_t         *bd_bucket;      /**< address of bucket */
+       unsigned int            bd_offset;      /**< offset in bucket */
+} cfs_hash_bd_t;
+
+#define CFS_HASH_NAME_LEN         16      /**< default name length */
+#define CFS_HASH_BIGNAME_LEN   64      /**< bigname for param tree */
+
+#define CFS_HASH_BKT_BITS         3       /**< default bits of bucket */
+#define CFS_HASH_BITS_MAX         30      /**< max bits of bucket */
+#define CFS_HASH_BITS_MIN         CFS_HASH_BKT_BITS
+
+/**
+ * common hash attributes.
+ */
+enum cfs_hash_tag {
+       /**
+        * no lock is needed, the caller will protect operations with its
+        * own lock. With this flag:
+        *  . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK
+        *    will be ignored.
+        *  . Some functions will be disabled with this flag, e.g.:
+        *    cfs_hash_for_each_empty, cfs_hash_rehash
+        */
+       CFS_HASH_NO_LOCK        = 1 << 0,
+       /** no bucket lock, use one spinlock to protect the whole hash */
+       CFS_HASH_NO_BKTLOCK     = 1 << 1,
+       /** rwlock to protect bucket */
+       CFS_HASH_RW_BKTLOCK     = 1 << 2,
+       /** spinlock to protect bucket */
+       CFS_HASH_SPIN_BKTLOCK   = 1 << 3,
+       /** always add new item to tail */
+       CFS_HASH_ADD_TAIL       = 1 << 4,
+       /** hash-table doesn't have refcount on item */
+       CFS_HASH_NO_ITEMREF     = 1 << 5,
+       /** big name for param-tree */
+       CFS_HASH_BIGNAME        = 1 << 6,
+       /** track global count */
+       CFS_HASH_COUNTER        = 1 << 7,
+       /** rehash item by new key */
+       CFS_HASH_REHASH_KEY     = 1 << 8,
+       /** Enable dynamic hash resizing */
+       CFS_HASH_REHASH  = 1 << 9,
+       /** can shrink hash-size */
+       CFS_HASH_SHRINK  = 1 << 10,
+       /** assert hash is empty on exit */
+       CFS_HASH_ASSERT_EMPTY   = 1 << 11,
+       /** record hlist depth */
+       CFS_HASH_DEPTH    = 1 << 12,
+       /**
+        * rehash is always scheduled in a different thread, so current
+        * change on hash table is non-blocking
+        */
+       CFS_HASH_NBLK_CHANGE    = 1 << 13,
+       /** NB: hs_flags is typed as __u16, please change it
+        * if you need more than 16 flags */
+};
+
+/** most used attributes */
+#define CFS_HASH_DEFAULT       (CFS_HASH_RW_BKTLOCK | \
+                               CFS_HASH_COUNTER | CFS_HASH_REHASH)
+
+/**
+ * cfs_hash is a general-purpose hash-table implementation; it can support:
+ *    . two refcount modes
+ *      hash-table with & without refcount
+ *    . four lock modes
+ *      nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock
+ *    . general operations
+ *      lookup, add(add_tail or add_head), delete
+ *    . rehash
+ *      grow or shrink
+ *    . iteration
+ *      locked iteration and unlocked iteration
+ *    . bigname
+ *      support long name hash
+ *    . debug
+ *      trace max searching depth
+ *
+ * Rehash:
+ * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker)
+ * is spawned to handle the rehash in the background, it's possible that other
+ * processes can concurrently perform additions, deletions, and lookups
+ * without being blocked on rehash completion, because rehash will release
+ * the global wrlock for each bucket.
+ *
+ * rehash and iteration can't run at the same time because it's too tricky
+ * to keep both of them safe and correct.
+ * As they are relatively rare operations:
+ *   . if iteration is in progress while we try to launch rehash, we just
+ *     give up; the iterator will launch the rehash at the end.
+ *   . if rehash is in progress while we try to iterate the hash table,
+ *     then we just wait (it shouldn't take very long); anyway, nobody
+ *     should expect iteration of the whole hash-table to be non-blocking.
+ *
+ * During rehashing, a (key,object) pair may be in one of two buckets,
+ * depending on whether the worker task has yet to transfer the object
+ * to its new location in the table. Lookups and deletions need to search both
+ * locations; additions must take care to only insert into the new bucket.
+ */
+
+typedef struct cfs_hash {
+       /** serialize with rehash, or serialize all operations if
+        * the hash-table has CFS_HASH_NO_BKTLOCK */
+       cfs_hash_lock_t      hs_lock;
+       /** hash operations */
+       struct cfs_hash_ops     *hs_ops;
+       /** hash lock operations */
+       struct cfs_hash_lock_ops   *hs_lops;
+       /** hash list operations */
+       struct cfs_hash_hlist_ops  *hs_hops;
+       /** hash buckets-table */
+       cfs_hash_bucket_t        **hs_buckets;
+       /** total number of items on this hash-table */
+       atomic_t                hs_count;
+       /** hash flags, see cfs_hash_tag for detail */
+       __u16                  hs_flags;
+       /** # of extra-bytes for bucket, for user saving extended attributes */
+       __u16                  hs_extra_bytes;
+       /** wants to iterate */
+       __u8                    hs_iterating;
+       /** hash-table is dying */
+       __u8                    hs_exiting;
+       /** current hash bits */
+       __u8                    hs_cur_bits;
+       /** min hash bits */
+       __u8                    hs_min_bits;
+       /** max hash bits */
+       __u8                    hs_max_bits;
+       /** bits for rehash */
+       __u8                    hs_rehash_bits;
+       /** bits for each bucket */
+       __u8                    hs_bkt_bits;
+       /** resize min threshold */
+       __u16                  hs_min_theta;
+       /** resize max threshold */
+       __u16                  hs_max_theta;
+       /** resize count */
+       __u32                  hs_rehash_count;
+       /** # of iterators (caller of cfs_hash_for_each_*) */
+       __u32                  hs_iterators;
+       /** rehash workitem */
+       cfs_workitem_t        hs_rehash_wi;
+       /** refcount on this hash table */
+       atomic_t                hs_refcount;
+       /** rehash buckets-table */
+       cfs_hash_bucket_t        **hs_rehash_buckets;
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+       /** serialize debug members */
+       spinlock_t                      hs_dep_lock;
+       /** max depth */
+       unsigned int            hs_dep_max;
+       /** id of the deepest bucket */
+       unsigned int            hs_dep_bkt;
+       /** offset in the deepest bucket */
+       unsigned int            hs_dep_off;
+       /** bits when we found the max depth */
+       unsigned int            hs_dep_bits;
+       /** workitem to output max depth */
+       cfs_workitem_t        hs_dep_wi;
+#endif
+       /** name of htable */
+       char                    hs_name[0];
+} cfs_hash_t;
+
+typedef struct cfs_hash_lock_ops {
+       /** lock the hash table */
+       void    (*hs_lock)(cfs_hash_lock_t *lock, int exclusive);
+       /** unlock the hash table */
+       void    (*hs_unlock)(cfs_hash_lock_t *lock, int exclusive);
+       /** lock the hash bucket */
+       void    (*hs_bkt_lock)(cfs_hash_lock_t *lock, int exclusive);
+       /** unlock the hash bucket */
+       void    (*hs_bkt_unlock)(cfs_hash_lock_t *lock, int exclusive);
+} cfs_hash_lock_ops_t;
+
+typedef struct cfs_hash_hlist_ops {
+       /** return hlist_head of hash-head of @bd */
+       struct hlist_head *(*hop_hhead)(cfs_hash_t *hs, cfs_hash_bd_t *bd);
+       /** return hash-head size */
+       int (*hop_hhead_size)(cfs_hash_t *hs);
+       /** add @hnode to hash-head of @bd */
+       int (*hop_hnode_add)(cfs_hash_t *hs,
+                            cfs_hash_bd_t *bd, struct hlist_node *hnode);
+       /** remove @hnode from hash-head of @bd */
+       int (*hop_hnode_del)(cfs_hash_t *hs,
+                            cfs_hash_bd_t *bd, struct hlist_node *hnode);
+} cfs_hash_hlist_ops_t;
+
+typedef struct cfs_hash_ops {
+       /** return hashed value from @key */
+       unsigned (*hs_hash)(cfs_hash_t *hs, const void *key, unsigned mask);
+       /** return key address of @hnode */
+       void *   (*hs_key)(struct hlist_node *hnode);
+       /** copy key from @hnode to @key */
+       void     (*hs_keycpy)(struct hlist_node *hnode, void *key);
+       /**
+        *  compare @key with key of @hnode
+        *  returns 1 on a match
+        */
+       int      (*hs_keycmp)(const void *key, struct hlist_node *hnode);
+       /** return object address of @hnode, i.e: container_of(...hnode) */
+       void *   (*hs_object)(struct hlist_node *hnode);
+       /** get refcount of item, always called with holding bucket-lock */
+       void     (*hs_get)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** release refcount of item */
+       void     (*hs_put)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** release refcount of item, always called with holding bucket-lock */
+       void     (*hs_put_locked)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** it's called before removing of @hnode */
+       void     (*hs_exit)(cfs_hash_t *hs, struct hlist_node *hnode);
+} cfs_hash_ops_t;
+
+/** total number of buckets in @hs */
+#define CFS_HASH_NBKT(hs)       \
+       (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits))
+
+/** total number of buckets in @hs while rehashing */
+#define CFS_HASH_RH_NBKT(hs)    \
+       (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits))
+
+/** number of hlists in a bucket */
+#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits)
+
+/** total number of hlist in @hs */
+#define CFS_HASH_NHLIST(hs)     (1U << (hs)->hs_cur_bits)
+
+/** total number of hlist in @hs while rehashing */
+#define CFS_HASH_RH_NHLIST(hs)  (1U << (hs)->hs_rehash_bits)
+
+static inline int
+cfs_hash_with_no_lock(cfs_hash_t *hs)
+{
+       /* caller will serialize all operations for this hash-table */
+       return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0;
+}
+
+static inline int
+cfs_hash_with_no_bktlock(cfs_hash_t *hs)
+{
+       /* no bucket lock, one single lock to protect the hash-table */
+       return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0;
+}
+
+static inline int
+cfs_hash_with_rw_bktlock(cfs_hash_t *hs)
+{
+       /* rwlock to protect hash bucket */
+       return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0;
+}
+
+static inline int
+cfs_hash_with_spin_bktlock(cfs_hash_t *hs)
+{
+       /* spinlock to protect hash bucket */
+       return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0;
+}
+
+static inline int
+cfs_hash_with_add_tail(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0;
+}
+
+static inline int
+cfs_hash_with_no_itemref(cfs_hash_t *hs)
+{
+       /* hash-table doesn't keep refcount on item,
+        * item can't be removed from hash unless its
+        * refcount is zero */
+       return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0;
+}
+
+static inline int
+cfs_hash_with_bigname(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_BIGNAME) != 0;
+}
+
+static inline int
+cfs_hash_with_counter(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_COUNTER) != 0;
+}
+
+static inline int
+cfs_hash_with_rehash(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_REHASH) != 0;
+}
+
+static inline int
+cfs_hash_with_rehash_key(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0;
+}
+
+static inline int
+cfs_hash_with_shrink(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_SHRINK) != 0;
+}
+
+static inline int
+cfs_hash_with_assert_empty(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0;
+}
+
+static inline int
+cfs_hash_with_depth(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_DEPTH) != 0;
+}
+
+static inline int
+cfs_hash_with_nblk_change(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0;
+}
+
+static inline int
+cfs_hash_is_exiting(cfs_hash_t *hs)
+{       /* cfs_hash_destroy is called */
+       return hs->hs_exiting;
+}
+
+static inline int
+cfs_hash_is_rehashing(cfs_hash_t *hs)
+{       /* rehash is launched */
+       return hs->hs_rehash_bits != 0;
+}
+
+static inline int
+cfs_hash_is_iterating(cfs_hash_t *hs)
+{       /* someone is calling cfs_hash_for_each_* */
+       return hs->hs_iterating || hs->hs_iterators != 0;
+}
+
+static inline int
+cfs_hash_bkt_size(cfs_hash_t *hs)
+{
+       return offsetof(cfs_hash_bucket_t, hsb_head[0]) +
+              hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) +
+              hs->hs_extra_bytes;
+}
+
+#define CFS_HOP(hs, op)           (hs)->hs_ops->hs_ ## op
+
+static inline unsigned
+cfs_hash_id(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return CFS_HOP(hs, hash)(hs, key, mask);
+}
+
+static inline void *
+cfs_hash_key(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, key)(hnode);
+}
+
+static inline void
+cfs_hash_keycpy(cfs_hash_t *hs, struct hlist_node *hnode, void *key)
+{
+       if (CFS_HOP(hs, keycpy) != NULL)
+               CFS_HOP(hs, keycpy)(hnode, key);
+}
+
+/**
+ * Returns 1 on a match.
+ */
+static inline int
+cfs_hash_keycmp(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, keycmp)(key, hnode);
+}
+
+static inline void *
+cfs_hash_object(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, object)(hnode);
+}
+
+static inline void
+cfs_hash_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, get)(hs, hnode);
+}
+
+static inline void
+cfs_hash_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       LASSERT(CFS_HOP(hs, put_locked) != NULL);
+
+       return CFS_HOP(hs, put_locked)(hs, hnode);
+}
+
+static inline void
+cfs_hash_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       LASSERT(CFS_HOP(hs, put) != NULL);
+
+       return CFS_HOP(hs, put)(hs, hnode);
+}
+
+static inline void
+cfs_hash_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       if (CFS_HOP(hs, exit))
+               CFS_HOP(hs, exit)(hs, hnode);
+}
+
+static inline void cfs_hash_lock(cfs_hash_t *hs, int excl)
+{
+       hs->hs_lops->hs_lock(&hs->hs_lock, excl);
+}
+
+static inline void cfs_hash_unlock(cfs_hash_t *hs, int excl)
+{
+       hs->hs_lops->hs_unlock(&hs->hs_lock, excl);
+}
+
+static inline int cfs_hash_dec_and_lock(cfs_hash_t *hs,
+                                       atomic_t *condition)
+{
+       LASSERT(cfs_hash_with_no_bktlock(hs));
+       return atomic_dec_and_lock(condition, &hs->hs_lock.spin);
+}
+
+static inline void cfs_hash_bd_lock(cfs_hash_t *hs,
+                                   cfs_hash_bd_t *bd, int excl)
+{
+       hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl);
+}
+
+static inline void cfs_hash_bd_unlock(cfs_hash_t *hs,
+                                     cfs_hash_bd_t *bd, int excl)
+{
+       hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl);
+}
+
+/**
+ * operations on cfs_hash bucket (bd: bucket descriptor),
+ * they are normally for hash-table without rehash
+ */
+void cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd);
+
+static inline void cfs_hash_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+                                           cfs_hash_bd_t *bd, int excl)
+{
+       cfs_hash_bd_get(hs, key, bd);
+       cfs_hash_bd_lock(hs, bd, excl);
+}
+
+static inline unsigned cfs_hash_bd_index_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits);
+}
+
+static inline void cfs_hash_bd_index_set(cfs_hash_t *hs,
+                                        unsigned index, cfs_hash_bd_t *bd)
+{
+       bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits];
+       bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U);
+}
+
+static inline void *
+cfs_hash_bd_extra_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       return (void *)bd->bd_bucket +
+              cfs_hash_bkt_size(hs) - hs->hs_extra_bytes;
+}
+
+static inline __u32
+cfs_hash_bd_version_get(cfs_hash_bd_t *bd)
+{
+       /* need hold cfs_hash_bd_lock */
+       return bd->bd_bucket->hsb_version;
+}
+
+static inline __u32
+cfs_hash_bd_count_get(cfs_hash_bd_t *bd)
+{
+       /* need hold cfs_hash_bd_lock */
+       return bd->bd_bucket->hsb_count;
+}
+
+static inline int
+cfs_hash_bd_depmax_get(cfs_hash_bd_t *bd)
+{
+       return bd->bd_bucket->hsb_depmax;
+}
+
+static inline int
+cfs_hash_bd_compare(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+       if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index)
+               return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index;
+
+       if (bd1->bd_offset != bd2->bd_offset)
+               return bd1->bd_offset - bd2->bd_offset;
+
+       return 0;
+}
+
+void cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                           struct hlist_node *hnode);
+void cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                           struct hlist_node *hnode);
+void cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+                            cfs_hash_bd_t *bd_new, struct hlist_node *hnode);
+
+static inline int cfs_hash_bd_dec_and_lock(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                          atomic_t *condition)
+{
+       LASSERT(cfs_hash_with_spin_bktlock(hs));
+       return atomic_dec_and_lock(condition,
+                                      &bd->bd_bucket->hsb_lock.spin);
+}
+
+static inline struct hlist_head *cfs_hash_bd_hhead(cfs_hash_t *hs,
+                                                 cfs_hash_bd_t *bd)
+{
+       return hs->hs_hops->hop_hhead(hs, bd);
+}
+
+struct hlist_node *cfs_hash_bd_lookup_locked(cfs_hash_t *hs,
+                                           cfs_hash_bd_t *bd, const void *key);
+struct hlist_node *cfs_hash_bd_peek_locked(cfs_hash_t *hs,
+                                         cfs_hash_bd_t *bd, const void *key);
+struct hlist_node *cfs_hash_bd_findadd_locked(cfs_hash_t *hs,
+                                            cfs_hash_bd_t *bd, const void *key,
+                                            struct hlist_node *hnode,
+                                            int insist_add);
+struct hlist_node *cfs_hash_bd_finddel_locked(cfs_hash_t *hs,
+                                            cfs_hash_bd_t *bd, const void *key,
+                                            struct hlist_node *hnode);
+
+/**
+ * operations on cfs_hash bucket (bd: bucket descriptor),
+ * they are safe for hash-table with rehash
+ */
+void cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds);
+void cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+void cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+
+static inline void cfs_hash_dual_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+                                                cfs_hash_bd_t *bds, int excl)
+{
+       cfs_hash_dual_bd_get(hs, key, bds);
+       cfs_hash_dual_bd_lock(hs, bds, excl);
+}
+
+struct hlist_node *cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs,
+                                                cfs_hash_bd_t *bds,
+                                                const void *key);
+struct hlist_node *cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs,
+                                                 cfs_hash_bd_t *bds,
+                                                 const void *key,
+                                                 struct hlist_node *hnode,
+                                                 int insist_add);
+struct hlist_node *cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs,
+                                                 cfs_hash_bd_t *bds,
+                                                 const void *key,
+                                                 struct hlist_node *hnode);
+
+/* Hash init/cleanup functions */
+cfs_hash_t *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+                           unsigned bkt_bits, unsigned extra_bytes,
+                           unsigned min_theta, unsigned max_theta,
+                           cfs_hash_ops_t *ops, unsigned flags);
+
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs);
+void cfs_hash_putref(cfs_hash_t *hs);
+
+/* Hash addition functions */
+void cfs_hash_add(cfs_hash_t *hs, const void *key,
+                 struct hlist_node *hnode);
+int cfs_hash_add_unique(cfs_hash_t *hs, const void *key,
+                       struct hlist_node *hnode);
+void *cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+                             struct hlist_node *hnode);
+
+/* Hash deletion functions */
+void *cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode);
+void *cfs_hash_del_key(cfs_hash_t *hs, const void *key);
+
+/* Hash lookup/for_each functions */
+#define CFS_HASH_LOOP_HOG       1024
+
+typedef int (*cfs_hash_for_each_cb_t)(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                     struct hlist_node *node, void *data);
+void *cfs_hash_lookup(cfs_hash_t *hs, const void *key);
+void cfs_hash_for_each(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
+void cfs_hash_for_each_safe(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_nolock(cfs_hash_t *hs,
+                             cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_empty(cfs_hash_t *hs,
+                            cfs_hash_for_each_cb_t, void *data);
+void cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+                          cfs_hash_for_each_cb_t, void *data);
+typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data);
+void cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t, void *data);
+
+void cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+                            cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_is_empty(cfs_hash_t *hs);
+__u64 cfs_hash_size_get(cfs_hash_t *hs);
+
+/*
+ * Rehash - Theta is calculated to be the average chained
+ * hash depth assuming a perfectly uniform hash function.
+ */
+void cfs_hash_rehash_cancel_locked(cfs_hash_t *hs);
+void cfs_hash_rehash_cancel(cfs_hash_t *hs);
+int  cfs_hash_rehash(cfs_hash_t *hs, int do_rehash);
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+                        void *new_key, struct hlist_node *hnode);
+
+#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1
+/* Validate hnode references the correct key */
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+                     struct hlist_node *hnode)
+{
+       LASSERT(cfs_hash_keycmp(hs, key, hnode));
+}
+
+/* Validate hnode is in the correct bucket */
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                        struct hlist_node *hnode)
+{
+       cfs_hash_bd_t   bds[2];
+
+       cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds);
+       LASSERT(bds[0].bd_bucket == bd->bd_bucket ||
+               bds[1].bd_bucket == bd->bd_bucket);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */
+
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+                     struct hlist_node *hnode) {}
+
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                        struct hlist_node *hnode) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL */
+
+#define CFS_HASH_THETA_BITS  10
+#define CFS_HASH_MIN_THETA  (1U << (CFS_HASH_THETA_BITS - 1))
+#define CFS_HASH_MAX_THETA  (1U << (CFS_HASH_THETA_BITS + 1))
+
+/* Return integer component of theta */
+static inline int __cfs_hash_theta_int(int theta)
+{
+       return (theta >> CFS_HASH_THETA_BITS);
+}
+
+/* Return a fractional value between 0 and 999 */
+static inline int __cfs_hash_theta_frac(int theta)
+{
+       return ((theta * 1000) >> CFS_HASH_THETA_BITS) -
+              (__cfs_hash_theta_int(theta) * 1000);
+}
+
+static inline int __cfs_hash_theta(cfs_hash_t *hs)
+{
+       return (atomic_read(&hs->hs_count) <<
+               CFS_HASH_THETA_BITS) >> hs->hs_cur_bits;
+}
+
+static inline void __cfs_hash_set_theta(cfs_hash_t *hs, int min, int max)
+{
+       LASSERT(min < max);
+       hs->hs_min_theta = (__u16)min;
+       hs->hs_max_theta = (__u16)max;
+}
+
+/* Generic debug formatting routines mainly for proc handler */
+int cfs_hash_debug_header(char *str, int size);
+int cfs_hash_debug_str(cfs_hash_t *hs, char *str, int size);
+
+/*
+ * Generic djb2 hash algorithm for character arrays.
+ */
+static inline unsigned
+cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask)
+{
+       unsigned i, hash = 5381;
+
+       LASSERT(key != NULL);
+
+       for (i = 0; i < size; i++)
+               hash = hash * 33 + ((char *)key)[i];
+
+       return (hash & mask);
+}
+
+/*
+ * Generic u32 hash algorithm.
+ */
+static inline unsigned
+cfs_hash_u32_hash(const __u32 key, unsigned mask)
+{
+       return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask);
+}
+
+/*
+ * Generic u64 hash algorithm.
+ */
+static inline unsigned
+cfs_hash_u64_hash(const __u64 key, unsigned mask)
+{
+       return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask);
+}
+
+/** iterate over all buckets in @bds (array of cfs_hash_bd_t) */
+#define cfs_hash_for_each_bd(bds, n, i) \
+       for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++)
+
+/** iterate over all buckets of @hs */
+#define cfs_hash_for_each_bucket(hs, bd, pos)             \
+       for (pos = 0;                                      \
+            pos < CFS_HASH_NBKT(hs) &&                  \
+            ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++)
+
+/** iterate over all hlist of bucket @bd */
+#define cfs_hash_bd_for_each_hlist(hs, bd, hlist)             \
+       for ((bd)->bd_offset = 0;                              \
+            (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) &&       \
+            (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL;       \
+            (bd)->bd_offset++)
+
+/* !__LIBCFS__HASH_H__ */
+#endif
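The for_each interface declared above is callback driven. Below is a minimal hedged sketch of a cfs_hash_for_each_cb_t callback; struct my_obj, its mo_hnode linkage field and the zero-key counting logic are hypothetical, while everything else comes from the declarations in this header.

#include <linux/libcfs/libcfs.h>

struct my_obj {
	struct hlist_node	mo_hnode;	/* linkage owned by the cfs_hash */
	__u64			mo_key;
};

/* callback matching cfs_hash_for_each_cb_t: count objects whose key is zero */
static int my_obj_count_zero(cfs_hash_t *hs, cfs_hash_bd_t *bd,
			     struct hlist_node *hnode, void *data)
{
	struct my_obj	*obj   = hlist_entry(hnode, struct my_obj, mo_hnode);
	int		*count = data;

	if (obj->mo_key == 0)
		(*count)++;
	return 0;
}

static int my_obj_count_zero_keys(cfs_hash_t *hs)
{
	int count = 0;

	cfs_hash_for_each(hs, my_obj_count_zero, &count);
	return count;
}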
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h
new file mode 100644 (file)
index 0000000..bfa6d7b
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/include/libcfs/heap.h
+ *
+ * Author: Eric Barton <eeb@whamcloud.com>
+ *        Liang Zhen   <liang@whamcloud.com>
+ */
+
+#ifndef __LIBCFS_HEAP_H__
+#define __LIBCFS_HEAP_H__
+
+/** \defgroup heap Binary heap
+ *
+ * The binary heap is a scalable data structure built on a binary tree. It
+ * maintains large sets of elements sorted by one or more element properties,
+ * or really by anything that can serve as a binary predicate to determine
+ * the relative ordering of any two nodes in the set. There is no search
+ * operation; instead, the element of lowest priority, which always sits at
+ * the root of the tree (this is a min-heap), is removed by users for
+ * consumption.
+ *
+ * Users of the heap should embed a \e cfs_binheap_node_t object instance on
+ * every object of the set that they wish the binary heap instance to handle,
+ * and (at a minimum) provide a cfs_binheap_ops_t::hop_compare() implementation
+ * which is used by the heap as the binary predicate during its internal sorting
+ * operations.
+ *
+ * The current implementation enforces no locking scheme, and so assumes the
+ * user caters for locking between calls to insert, delete and lookup
+ * operations. Since the only consumers of the data structure at this point
+ * are NRS policies, and these operate on a per-CPT basis, binary heap
+ * instances are tied to a specific CPT. A brief usage sketch follows after
+ * this header.
+ * @{
+ */
+
+/**
+ * Binary heap node.
+ *
+ * Objects of this type are embedded into objects of the ordered set that is to
+ * be maintained by a \e cfs_binheap_t instance.
+ */
+typedef struct {
+       /** Index into the binary tree */
+       unsigned int    chn_index;
+} cfs_binheap_node_t;
+
+#define CBH_SHIFT      9
+#define CBH_SIZE       (1 << CBH_SHIFT)                    /* # ptrs per level */
+#define CBH_MASK       (CBH_SIZE - 1)
+#define CBH_NOB        (CBH_SIZE * sizeof(cfs_binheap_node_t *))
+
+#define CBH_POISON     0xdeadbeef
+
+/**
+ * Binary heap flags.
+ */
+enum {
+       CBH_FLAG_ATOMIC_GROW    = 1,
+};
+
+struct cfs_binheap;
+
+/**
+ * Binary heap operations.
+ */
+typedef struct {
+       /**
+        * Called right before inserting a node into the binary heap.
+        *
+        * Implementing this operation is optional.
+        *
+        * \param[in] h The heap
+        * \param[in] e The node
+        *
+        * \retval 0 success
+        * \retval != 0 error
+        */
+       int             (*hop_enter)(struct cfs_binheap *h,
+                                    cfs_binheap_node_t *e);
+       /**
+        * Called right after removing a node from the binary heap.
+        *
+        * Implementing this operation is optional.
+        *
+        * \param[in] h The heap
+        * \param[in] e The node
+        */
+       void            (*hop_exit)(struct cfs_binheap *h,
+                                   cfs_binheap_node_t *e);
+       /**
+        * A binary predicate which is called during internal heap sorting
+        * operations, and used in order to determine the relevant ordering of
+        * two heap nodes.
+        *
+        * Implementing this operation is mandatory.
+        *
+        * \param[in] a The first heap node
+        * \param[in] b The second heap node
+        *
+        * \retval 0 Node a > node b
+        * \retval 1 Node a < node b
+        *
+        * \see cfs_binheap_bubble()
+        * \see cfs_binheap_sink()
+        */
+       int             (*hop_compare)(cfs_binheap_node_t *a,
+                                      cfs_binheap_node_t *b);
+} cfs_binheap_ops_t;
+
+/**
+ * Binary heap object.
+ *
+ * Sorts elements of type \e cfs_binheap_node_t
+ */
+typedef struct cfs_binheap {
+       /** Triple indirect */
+       cfs_binheap_node_t  ****cbh_elements3;
+       /** double indirect */
+       cfs_binheap_node_t   ***cbh_elements2;
+       /** single indirect */
+       cfs_binheap_node_t    **cbh_elements1;
+       /** # elements referenced */
+       unsigned int            cbh_nelements;
+       /** high water mark */
+       unsigned int            cbh_hwm;
+       /** user flags */
+       unsigned int            cbh_flags;
+       /** operations table */
+       cfs_binheap_ops_t      *cbh_ops;
+       /** private data */
+       void                   *cbh_private;
+       /** associated CPT table */
+       struct cfs_cpt_table   *cbh_cptab;
+       /** associated CPT id of this cfs_binheap_t::cbh_cptab */
+       int                     cbh_cptid;
+} cfs_binheap_t;
+
+void cfs_binheap_destroy(cfs_binheap_t *h);
+cfs_binheap_t *cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags,
+                                 unsigned count, void *arg,
+                                 struct cfs_cpt_table *cptab, int cptid);
+cfs_binheap_node_t *cfs_binheap_find(cfs_binheap_t *h, unsigned int idx);
+int cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e);
+void cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e);
+
+static inline int
+cfs_binheap_size(cfs_binheap_t *h)
+{
+       return h->cbh_nelements;
+}
+
+static inline int
+cfs_binheap_is_empty(cfs_binheap_t *h)
+{
+       return h->cbh_nelements == 0;
+}
+
+static inline cfs_binheap_node_t *
+cfs_binheap_root(cfs_binheap_t *h)
+{
+       return cfs_binheap_find(h, 0);
+}
+
+static inline cfs_binheap_node_t *
+cfs_binheap_remove_root(cfs_binheap_t *h)
+{
+       cfs_binheap_node_t *e = cfs_binheap_find(h, 0);
+
+       if (e != NULL)
+               cfs_binheap_remove(h, e);
+       return e;
+}
+
+/** @} heap */
+
+#endif /* __LIBCFS_HEAP_H__ */
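A minimal usage sketch of the binary heap interface above; struct my_item, its deadline field and the helper functions are hypothetical, and only hop_compare() is filled in since the enter/exit hooks are optional. Per the hop_compare() contract documented above, returning 1 means the first node sorts before the second, so the heap root is always the earliest deadline.

#include <linux/libcfs/libcfs.h>

struct my_item {
	cfs_binheap_node_t	mi_node;	/* embedded heap linkage */
	__u64			mi_deadline;
};

/* binary predicate: return 1 when @a should sort before @b */
static int my_item_compare(cfs_binheap_node_t *a, cfs_binheap_node_t *b)
{
	struct my_item *ia = container_of(a, struct my_item, mi_node);
	struct my_item *ib = container_of(b, struct my_item, mi_node);

	return ia->mi_deadline < ib->mi_deadline;
}

static cfs_binheap_ops_t my_item_heap_ops = {
	.hop_enter	= NULL,			/* optional */
	.hop_exit	= NULL,			/* optional */
	.hop_compare	= my_item_compare,	/* mandatory */
};

static cfs_binheap_t *my_item_heap_create(struct cfs_cpt_table *cptab, int cptid)
{
	/* 512 is an arbitrary initial element count; the private arg is unused */
	return cfs_binheap_create(&my_item_heap_ops, CBH_FLAG_ATOMIC_GROW,
				  512, NULL, cptab, cptid);
}

/* pop the item with the earliest deadline; the caller provides locking */
static struct my_item *my_item_pop(cfs_binheap_t *heap)
{
	cfs_binheap_node_t *node = cfs_binheap_remove_root(heap);

	return node != NULL ? container_of(node, struct my_item, mi_node) : NULL;
}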
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h
new file mode 100644 (file)
index 0000000..5be3679
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_ioctl.h
+ *
+ * Low-level ioctl data structures. Kernel ioctl functions are declared here;
+ * the user space counterparts live in libcfsutil_ioctl.h.
+ *
+ */
+
+#ifndef __LIBCFS_IOCTL_H__
+#define __LIBCFS_IOCTL_H__
+
+
+#define LIBCFS_IOCTL_VERSION 0x0001000a
+
+struct libcfs_ioctl_data {
+       __u32 ioc_len;
+       __u32 ioc_version;
+
+       __u64 ioc_nid;
+       __u64 ioc_u64[1];
+
+       __u32 ioc_flags;
+       __u32 ioc_count;
+       __u32 ioc_net;
+       __u32 ioc_u32[7];
+
+       __u32 ioc_inllen1;
+       char *ioc_inlbuf1;
+       __u32 ioc_inllen2;
+       char *ioc_inlbuf2;
+
+       __u32 ioc_plen1; /* buffers in userspace */
+       char *ioc_pbuf1;
+       __u32 ioc_plen2; /* buffers in userspace */
+       char *ioc_pbuf2;
+
+       char ioc_bulk[0];
+};
+
+
+struct libcfs_ioctl_hdr {
+       __u32 ioc_len;
+       __u32 ioc_version;
+};
+
+struct libcfs_debug_ioctl_data
+{
+       struct libcfs_ioctl_hdr hdr;
+       unsigned int subs;
+       unsigned int debug;
+};
+
+#define LIBCFS_IOC_INIT(data)                     \
+do {                                               \
+       memset(&data, 0, sizeof(data));          \
+       data.ioc_version = LIBCFS_IOCTL_VERSION;        \
+       data.ioc_len = sizeof(data);                \
+} while (0)
+
+
+struct libcfs_ioctl_handler {
+       struct list_head item;
+       int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data);
+};
+
+#define DECLARE_IOCTL_HANDLER(ident, func)                   \
+       struct libcfs_ioctl_handler ident = {              \
+               /* .item = */ LIST_HEAD_INIT(ident.item),   \
+               /* .handle_ioctl = */ func                    \
+       }
+
+
+/* FIXME check conflict with lustre_lib.h */
+#define LIBCFS_IOC_DEBUG_MASK       _IOWR('f', 250, long)
+
+
+/* ioctls for manipulating snapshots 30- */
+#define IOC_LIBCFS_TYPE                   'e'
+#define IOC_LIBCFS_MIN_NR               30
+/* libcfs ioctls */
+#define IOC_LIBCFS_PANIC                  _IOWR('e', 30, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLEAR_DEBUG      _IOWR('e', 31, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MARK_DEBUG        _IOWR('e', 32, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_CONTROL      _IOWR('e', 33, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_SNAPSHOT            _IOWR('e', 34, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_LOOKUP_STRING       _IOWR('e', 35, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MEMHOG                _IOWR('e', 36, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING_TEST          _IOWR('e', 37, IOCTL_LIBCFS_TYPE)
+/* lnet ioctls */
+#define IOC_LIBCFS_GET_NI                _IOWR('e', 50, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_FAIL_NID            _IOWR('e', 51, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_ROUTE          _IOWR('e', 52, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_ROUTE          _IOWR('e', 53, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_ROUTE          _IOWR('e', 54, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_NOTIFY_ROUTER          _IOWR('e', 55, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_UNCONFIGURE      _IOWR('e', 56, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PORTALS_COMPATIBILITY   _IOWR('e', 57, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNET_DIST          _IOWR('e', 58, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CONFIGURE          _IOWR('e', 59, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_TESTPROTOCOMPAT      _IOWR('e', 60, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING                    _IOWR('e', 61, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEBUG_PEER        _IOWR('e', 62, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNETST                _IOWR('e', 63, IOCTL_LIBCFS_TYPE)
+/* lnd ioctls */
+#define IOC_LIBCFS_REGISTER_MYNID        _IOWR('e', 70, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLOSE_CONNECTION    _IOWR('e', 71, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PUSH_CONNECTION      _IOWR('e', 72, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_CONN            _IOWR('e', 73, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_PEER            _IOWR('e', 74, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_PEER            _IOWR('e', 75, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_PEER            _IOWR('e', 76, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_TXDESC        _IOWR('e', 77, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_INTERFACE          _IOWR('e', 78, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_INTERFACE          _IOWR('e', 79, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_INTERFACE          _IOWR('e', 80, IOCTL_LIBCFS_TYPE)
+
+#define IOC_LIBCFS_MAX_NR                           80
+
+static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data)
+{
+       int len = sizeof(*data);
+       len += cfs_size_round(data->ioc_inllen1);
+       len += cfs_size_round(data->ioc_inllen2);
+       return len;
+}
+
+static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data)
+{
+       if (data->ioc_len > (1<<30)) {
+               CERROR ("LIBCFS ioctl: ioc_len larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen1 > (1<<30)) {
+               CERROR ("LIBCFS ioctl: ioc_inllen1 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen2 > (1<<30)) {
+               CERROR ("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+               CERROR ("LIBCFS ioctl: inlbuf1 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+               CERROR ("LIBCFS ioctl: inlbuf2 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_pbuf1 && !data->ioc_plen1) {
+               CERROR ("LIBCFS ioctl: pbuf1 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_pbuf2 && !data->ioc_plen2) {
+               CERROR ("LIBCFS ioctl: pbuf2 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_plen1 && !data->ioc_pbuf1) {
+               CERROR ("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+               return 1;
+       }
+       if (data->ioc_plen2 && !data->ioc_pbuf2) {
+               CERROR ("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+               return 1;
+       }
+       if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len ) {
+               CERROR ("LIBCFS ioctl: packlen != ioc_len\n");
+               return 1;
+       }
+       if (data->ioc_inllen1 &&
+           data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') {
+               CERROR ("LIBCFS ioctl: inlbuf1 not 0 terminated\n");
+               return 1;
+       }
+       if (data->ioc_inllen2 &&
+           data->ioc_bulk[cfs_size_round(data->ioc_inllen1) +
+                          data->ioc_inllen2 - 1] != '\0') {
+               CERROR ("LIBCFS ioctl: inlbuf2 not 0 terminated\n");
+               return 1;
+       }
+       return 0;
+}
+
+
+extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand);
+extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand);
+extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg);
+extern int libcfs_ioctl_popdata(void *arg, void *buf, int size);
+
+
+#endif /* __LIBCFS_IOCTL_H__ */
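The length checks in libcfs_ioctl_is_invalid() only pass when ioc_len matches libcfs_ioctl_packlen(), i.e. the header size plus each inline length rounded with cfs_size_round(). A hedged sketch of that arithmetic is below; the 5-byte inline length is arbitrary, and a real request would also allocate the rounded bulk space after the header so that ioc_bulk[] can hold the NUL-terminated inline strings.

#include <linux/libcfs/libcfs.h>

static __u32 my_ioctl_len_example(void)
{
	struct libcfs_ioctl_data data;

	LIBCFS_IOC_INIT(data);		/* zero, set ioc_version and ioc_len */

	data.ioc_inllen1 = 5;		/* e.g. "eth0" plus its NUL */
	data.ioc_inllen2 = 0;

	/*
	 * ioc_len must equal sizeof(data) + round(5) + round(0), otherwise
	 * libcfs_ioctl_is_invalid() rejects the request.
	 */
	data.ioc_len = libcfs_ioctl_packlen(&data);
	return data.ioc_len;
}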
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h
new file mode 100644 (file)
index 0000000..596a15f
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * libcfs/include/libcfs/libcfs_kernelcomm.h
+ *
+ * Kernel <-> userspace communication routines.
+ * The definitions below are used in the kernel and userspace.
+ *
+ */
+
+#ifndef __LIBCFS_KERNELCOMM_H__
+#define __LIBCFS_KERNELCOMM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+/* KUC message header.
+ * All current and future KUC messages should use this header.
+ * To avoid having to include Lustre headers from libcfs, define this here.
+ */
+struct kuc_hdr {
+       __u16 kuc_magic;
+       __u8  kuc_transport;  /* Each new Lustre feature should use a different
+                                transport */
+       __u8  kuc_flags;
+       __u16 kuc_msgtype;    /* Message type or opcode, transport-specific */
+       __u16 kuc_msglen;     /* Including header */
+} __attribute__((aligned(sizeof(__u64))));
+
+#define KUC_MAGIC  0x191C /*Lustre9etLinC */
+#define KUC_FL_BLOCK 0x01   /* Wait for send */
+
+/* kuc_msgtype values are defined in each transport */
+enum kuc_transport_type {
+       KUC_TRANSPORT_GENERIC   = 1,
+       KUC_TRANSPORT_HSM       = 2,
+       KUC_TRANSPORT_CHANGELOG = 3,
+};
+
+enum kuc_generic_message_type {
+       KUC_MSG_SHUTDOWN = 1,
+};
+
+/* prototype for callback function on kuc groups */
+typedef int (*libcfs_kkuc_cb_t)(__u32 data, void *cb_arg);
+
+/* KUC Broadcast Groups. This determines which userspace process hears which
+ * messages.  Multiple transports may be used within a group, or multiple
+ * groups may use the same transport.  Broadcast
+ * groups need not be used if e.g. a UID is specified instead;
+ * use group 0 to signify unicast.
+ */
+#define KUC_GRP_HSM       0x02
+#define KUC_GRP_MAX       KUC_GRP_HSM
+
+/* Kernel methods */
+extern int libcfs_kkuc_msg_put(struct file *fp, void *payload);
+extern int libcfs_kkuc_group_put(int group, void *payload);
+extern int libcfs_kkuc_group_add(struct file *fp, int uid, int group,
+                                __u32 data);
+extern int libcfs_kkuc_group_rem(int uid, int group);
+extern int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+                                    void *cb_arg);
+
+#define LK_FLG_STOP 0x01
+
+/* kernelcomm control structure, passed from userspace to kernel */
+typedef struct lustre_kernelcomm {
+       __u32 lk_wfd;
+       __u32 lk_rfd;
+       __u32 lk_uid;
+       __u32 lk_group;
+       __u32 lk_data;
+       __u32 lk_flags;
+} __attribute__((packed)) lustre_kernelcomm;
+
+/* Userspace methods */
+extern int libcfs_ukuc_start(lustre_kernelcomm *l, int groups);
+extern int libcfs_ukuc_stop(lustre_kernelcomm *l);
+extern int libcfs_ukuc_msg_get(lustre_kernelcomm *l, char *buf, int maxsize,
+                              int transport);
+
+#endif /* __LIBCFS_KERNELCOMM_H__ */
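A hedged sketch of the kernel-side KUC interface above: build a header-only generic shutdown message and broadcast it to every reader registered in the HSM group. The choice of group and the absence of extra payload are illustrative.

#include <linux/libcfs/libcfs.h>

static int my_kuc_shutdown_broadcast(void)
{
	struct kuc_hdr hdr;

	memset(&hdr, 0, sizeof(hdr));
	hdr.kuc_magic     = KUC_MAGIC;
	hdr.kuc_transport = KUC_TRANSPORT_GENERIC;
	hdr.kuc_msgtype   = KUC_MSG_SHUTDOWN;
	hdr.kuc_msglen    = sizeof(hdr);	/* kuc_msglen includes the header */

	/* deliver to every file descriptor registered in the HSM group */
	return libcfs_kkuc_group_put(KUC_GRP_HSM, &hdr);
}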
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h
new file mode 100644 (file)
index 0000000..9c40ed9
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_prim.h
+ *
+ * General primitives.
+ *
+ */
+
+#ifndef __LIBCFS_PRIM_H__
+#define __LIBCFS_PRIM_H__
+
+#ifndef EXPORT_SYMBOL
+# define EXPORT_SYMBOL(s)
+#endif
+
+/*
+ * Schedule
+ */
+void cfs_pause(cfs_duration_t ticks);
+
+/*
+ * Timer
+ */
+typedef  void (cfs_timer_func_t)(ulong_ptr_t);
+void schedule_timeout_and_set_state(cfs_task_state_t, int64_t);
+
+void init_waitqueue_entry_current(wait_queue_t *link);
+int64_t waitq_timedwait(wait_queue_t *, cfs_task_state_t, int64_t);
+void waitq_wait(wait_queue_t *, cfs_task_state_t);
+void add_wait_queue_exclusive_head(wait_queue_head_t *, wait_queue_t *);
+
+void cfs_init_timer(timer_list_t *t);
+void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg);
+void cfs_timer_done(timer_list_t *t);
+void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline);
+void cfs_timer_disarm(timer_list_t *t);
+int  cfs_timer_is_armed(timer_list_t *t);
+cfs_time_t cfs_timer_deadline(timer_list_t *t);
+
+/*
+ * Memory
+ */
+#ifndef memory_pressure_get
+#define memory_pressure_get() (0)
+#endif
+#ifndef memory_pressure_set
+#define memory_pressure_set() do {} while (0)
+#endif
+#ifndef memory_pressure_clr
+#define memory_pressure_clr() do {} while (0)
+#endif
+
+static inline int cfs_memory_pressure_get_and_set(void)
+{
+       int old = memory_pressure_get();
+
+       if (!old)
+               memory_pressure_set();
+       return old;
+}
+
+static inline void cfs_memory_pressure_restore(int old)
+{
+       if (old)
+               memory_pressure_set();
+       else
+               memory_pressure_clr();
+       return;
+}
+#endif
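A small hedged sketch of the memory-pressure helpers above: the get_and_set/restore pair lets a caller force the pressure hint around an allocation-heavy region without clobbering state that an outer caller may already have set. my_reclaim_path() and the work in the middle are hypothetical.

#include <linux/libcfs/libcfs.h>

static void my_reclaim_path(void)
{
	int old = cfs_memory_pressure_get_and_set();

	/* ... allocation-heavy work that should see the pressure hint ... */

	cfs_memory_pressure_restore(old);	/* clears the hint only if we set it */
}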
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
new file mode 100644 (file)
index 0000000..62bf32f
--- /dev/null
@@ -0,0 +1,568 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_private.h
+ *
+ * Various defines for libcfs.
+ *
+ */
+
+#ifndef __LIBCFS_PRIVATE_H__
+#define __LIBCFS_PRIVATE_H__
+
+/* XXX this layering violation is for nidstrings */
+#include <linux/lnet/types.h>
+
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
+
+
+
+/*
+ * When this is on, LASSERT macro includes check for assignment used instead
+ * of equality check, but doesn't have unlikely(). Turn this on from time to
+ * time to make test-builds. This shouldn't be on for production release.
+ */
+#define LASSERT_CHECKED (0)
+
+
+#define LASSERTF(cond, fmt, ...)                                       \
+do {                                                                   \
+       if (unlikely(!(cond))) {                                        \
+               LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL);  \
+               libcfs_debug_msg(&__msg_data,                           \
+                                "ASSERTION( %s ) failed: " fmt, #cond, \
+                                ## __VA_ARGS__);                       \
+               lbug_with_loc(&__msg_data);                             \
+       }                                                               \
+} while (0)
+
+#define LASSERT(cond) LASSERTF(cond, "\n")
+
+# define LINVRNT(exp) ((void)sizeof!!(exp))
+
+#define KLASSERT(e) LASSERT(e)
+
+void lbug_with_loc(struct libcfs_debug_msg_data *) __attribute__((noreturn));
+
+#define LBUG()                                                   \
+do {                                                               \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL);          \
+       lbug_with_loc(&msgdata);                                        \
+} while(0)
+
+extern atomic_t libcfs_kmemory;
+/*
+ * Memory
+ */
+
+# define libcfs_kmem_inc(ptr, size)            \
+do {                                           \
+       atomic_add(size, &libcfs_kmemory);      \
+} while (0)
+
+# define libcfs_kmem_dec(ptr, size)            \
+do {                                           \
+       atomic_sub(size, &libcfs_kmemory);      \
+} while (0)
+
+# define libcfs_kmem_read()                    \
+       atomic_read(&libcfs_kmemory)
+
+
+#ifndef LIBCFS_VMALLOC_SIZE
+#define LIBCFS_VMALLOC_SIZE    (2 << PAGE_CACHE_SHIFT) /* 2 pages */
+#endif
+
+#define LIBCFS_ALLOC_PRE(size, mask)                                       \
+do {                                                                       \
+       LASSERT(!in_interrupt() ||                                          \
+               ((size) <= LIBCFS_VMALLOC_SIZE &&                           \
+                ((mask) & GFP_ATOMIC)) != 0);                      \
+} while (0)
+
+#define LIBCFS_ALLOC_POST(ptr, size)                                       \
+do {                                                                       \
+       if (unlikely((ptr) == NULL)) {                                      \
+               CERROR("LNET: out of memory at %s:%d (tried to alloc '"     \
+                      #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size));  \
+               CERROR("LNET: %d total bytes allocated by lnet\n",          \
+                      libcfs_kmem_read());                                 \
+       } else {                                                            \
+               memset((ptr), 0, (size));                                   \
+               libcfs_kmem_inc((ptr), (size));                             \
+               CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n",  \
+                      (int)(size), (ptr), libcfs_kmem_read());             \
+       }                                                                  \
+} while (0)
+
+/**
+ * allocate memory with GFP flags @mask
+ */
+#define LIBCFS_ALLOC_GFP(ptr, size, mask)                                  \
+do {                                                                       \
+       LIBCFS_ALLOC_PRE((size), (mask));                                   \
+       (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ?                             \
+               kmalloc((size), (mask)) : vmalloc(size);            \
+       LIBCFS_ALLOC_POST((ptr), (size));                                   \
+} while (0)
+
+/**
+ * default allocator
+ */
+#define LIBCFS_ALLOC(ptr, size) \
+       LIBCFS_ALLOC_GFP(ptr, size, __GFP_IO)
+
+/**
+ * non-sleeping allocator
+ */
+#define LIBCFS_ALLOC_ATOMIC(ptr, size) \
+       LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC)
+
+/**
+ * allocate memory for specified CPU partition
+ *   \a cptab != NULL, \a cpt is CPU partition id of \a cptab
+ *   \a cptab == NULL, \a cpt is HW NUMA node id
+ */
+#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask)                  \
+do {                                                                       \
+       LIBCFS_ALLOC_PRE((size), (mask));                                   \
+       (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ?                             \
+               cfs_cpt_malloc((cptab), (cpt), (size), (mask)) :            \
+               cfs_cpt_vmalloc((cptab), (cpt), (size));                    \
+       LIBCFS_ALLOC_POST((ptr), (size));                                   \
+} while (0)
+
+/** default numa allocator */
+#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size)                                    \
+       LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+#define LIBCFS_FREE(ptr, size)                                   \
+do {                                                               \
+       int s = (size);                                          \
+       if (unlikely((ptr) == NULL)) {                            \
+               CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at "    \
+                      "%s:%d\n", s, __FILE__, __LINE__);              \
+               break;                                            \
+       }                                                              \
+       libcfs_kmem_dec((ptr), s);                                    \
+       CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n",     \
+              s, (ptr), libcfs_kmem_read());                           \
+       if (unlikely(s > LIBCFS_VMALLOC_SIZE))                    \
+               vfree(ptr);                                 \
+       else                                                        \
+               kfree(ptr);                                       \
+} while (0)
+
+/******************************************************************************/
+
+/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */
+#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__)
+#define ___htonl(x) __cpu_to_be32(x)
+#define ___htons(x) __cpu_to_be16(x)
+#define ___ntohl(x) __be32_to_cpu(x)
+#define ___ntohs(x) __be16_to_cpu(x)
+#define htonl(x) ___htonl(x)
+#define ntohl(x) ___ntohl(x)
+#define htons(x) ___htons(x)
+#define ntohs(x) ___ntohs(x)
+#endif
+
+void libcfs_debug_dumpstack(task_t *tsk);
+void libcfs_run_upcall(char **argv);
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *);
+void libcfs_debug_dumplog(void);
+int libcfs_debug_init(unsigned long bufsize);
+int libcfs_debug_cleanup(void);
+int libcfs_debug_clear_buffer(void);
+int libcfs_debug_mark_buffer(const char *text);
+
+void libcfs_debug_set_level(unsigned int debug_level);
+
+
+/*
+ * allocate per-cpu-partition data; the returned value is an array of
+ * pointers that can be indexed by CPU ID.
+ *     cptable != NULL: size of array is number of CPU partitions
+ *     cptable == NULL: size of array is number of HW cores
+ */
+void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size);
+/*
+ * destroy per-cpu-partition variable
+ */
+void  cfs_percpt_free(void *vars);
+int   cfs_percpt_number(void *vars);
+void *cfs_percpt_current(void *vars);
+void *cfs_percpt_index(void *vars, int idx);
+
+#define cfs_percpt_for_each(var, i, vars)              \
+       for (i = 0; i < cfs_percpt_number(vars) &&      \
+                   ((var) = (vars)[i]) != NULL; i++)
+
+/*
+ * allocate a variable array, returned value is an array of pointers.
+ * Caller can specify length of array by count.
+ */
+void *cfs_array_alloc(int count, unsigned int size);
+void  cfs_array_free(void *vars);
+
+#define LASSERT_ATOMIC_ENABLED   (1)
+
+#if LASSERT_ATOMIC_ENABLED
+
+/** assert value of @a is equal to @v */
+#define LASSERT_ATOMIC_EQ(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) == v,                  \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is unequal to @v */
+#define LASSERT_ATOMIC_NE(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) != v,                  \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is less than @v */
+#define LASSERT_ATOMIC_LT(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) < v,                    \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is less than or equal to @v */
+#define LASSERT_ATOMIC_LE(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) <= v,                  \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is greater than @v */
+#define LASSERT_ATOMIC_GT(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) > v,                    \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is greater than or equal to @v */
+#define LASSERT_ATOMIC_GE(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) >= v,                  \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is greater than @v1 and less than @v2 */
+#define LASSERT_ATOMIC_GT_LT(a, v1, v2)                         \
+do {                                                       \
+       int __v = atomic_read(a);                          \
+       LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v);     \
+} while (0)
+
+/** assert value of @a is greater than @v1 and less than or equal to @v2 */
+#define LASSERT_ATOMIC_GT_LE(a, v1, v2)                         \
+do {                                                       \
+       int __v = atomic_read(a);                          \
+       LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v);    \
+} while (0)
+
+/** assert value of @a is greater than or equal to @v1 and less than @v2 */
+#define LASSERT_ATOMIC_GE_LT(a, v1, v2)                         \
+do {                                                       \
+       int __v = atomic_read(a);                          \
+       LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v);    \
+} while (0)
+
+/** assert value of @a is greater than or equal to @v1 and less than or equal to @v2 */
+#define LASSERT_ATOMIC_GE_LE(a, v1, v2)                         \
+do {                                                       \
+       int __v = atomic_read(a);                          \
+       LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v);   \
+} while (0)
+
+#else /* !LASSERT_ATOMIC_ENABLED */
+
+#define LASSERT_ATOMIC_EQ(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_NE(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_LT(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_LE(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_GT(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_GE(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_GT_LT(a, v1, v2)         do {} while (0)
+#define LASSERT_ATOMIC_GT_LE(a, v1, v2)         do {} while (0)
+#define LASSERT_ATOMIC_GE_LT(a, v1, v2)         do {} while (0)
+#define LASSERT_ATOMIC_GE_LE(a, v1, v2)         do {} while (0)
+
+#endif /* LASSERT_ATOMIC_ENABLED */
+
+#define LASSERT_ATOMIC_ZERO(a)           LASSERT_ATOMIC_EQ(a, 0)
+#define LASSERT_ATOMIC_POS(a)             LASSERT_ATOMIC_GT(a, 0)
+
+#define CFS_ALLOC_PTR(ptr)      LIBCFS_ALLOC(ptr, sizeof (*(ptr)));
+#define CFS_FREE_PTR(ptr)       LIBCFS_FREE(ptr, sizeof (*(ptr)));
+
+/*
+ * percpu partition lock
+ *
+ * There are use-cases like this in Lustre:
+ * . each CPU partition has its own private data which is frequently changed,
+ *   mostly by the local CPU partition.
+ * . all CPU partitions share some global data, and these data are rarely
+ *   changed.
+ *
+ * LNet is a typical example.
+ * The CPU partition lock is designed for this kind of use-case:
+ * . each CPU partition has its own private lock
+ * . a change to private data only needs to take the private lock
+ * . a read of shared data only needs to take _any_ of the private locks
+ * . a change to shared data needs to take _all_ private locks,
+ *   which is slow and should be really rare.
+ * A short usage sketch of this pattern follows after this header.
+ */
+
+enum {
+       CFS_PERCPT_LOCK_EX      = -1, /* negative */
+};
+
+
+struct cfs_percpt_lock {
+       /* cpu-partition-table for this lock */
+       struct cfs_cpt_table    *pcl_cptab;
+       /* exclusively locked */
+       unsigned int            pcl_locked;
+       /* private lock table */
+       spinlock_t              **pcl_locks;
+};
+
+/* return number of private locks */
+static inline int
+cfs_percpt_lock_num(struct cfs_percpt_lock *pcl)
+{
+       return cfs_cpt_number(pcl->pcl_cptab);
+}
+
+
+/*
+ * create a cpu-partition lock based on CPU partition table \a cptab;
+ * one private lock is allocated for each partition of \a cptab
+ */
+struct cfs_percpt_lock *cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab);
+/* destroy a cpu-partition lock */
+void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl);
+
+/* lock private lock \a index of \a pcl */
+void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index);
+/* unlock private lock \a index of \a pcl */
+void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index);
+/* create percpt (atomic) refcount based on @cptab */
+atomic_t **cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int val);
+/* destroy percpt refcount */
+void cfs_percpt_atomic_free(atomic_t **refs);
+/* return sum of all percpu refs */
+int cfs_percpt_atomic_summary(atomic_t **refs);
+
+
+/** Compile-time assertion.
+
+ * Check an invariant described by a constant expression at compile time by
+ * forcing a compiler error if it does not hold.  \a cond must be a constant
+ * expression as defined by the ISO C Standard:
+ *
+ *       6.8.4.2  The switch statement
+ *       ....
+ *       [#3] The expression of each case label shall be  an  integer
+ *       constant   expression  and  no  two  of  the  case  constant
+ *       expressions in the same switch statement shall have the same
+ *       value  after  conversion...
+ *
+ */
+#define CLASSERT(cond) do {switch(42) {case (cond): case 0: break;}} while (0)
+
+/* support decl needed both by kernel and liblustre */
+int     libcfs_isknown_lnd(int type);
+char       *libcfs_lnd2modname(int type);
+char       *libcfs_lnd2str(int type);
+int     libcfs_str2lnd(const char *str);
+char       *libcfs_net2str(__u32 net);
+char       *libcfs_nid2str(lnet_nid_t nid);
+__u32       libcfs_str2net(const char *str);
+lnet_nid_t  libcfs_str2nid(const char *str);
+int     libcfs_str2anynid(lnet_nid_t *nid, const char *str);
+char       *libcfs_id2str(lnet_process_id_t id);
+void   cfs_free_nidlist(struct list_head *list);
+int     cfs_parse_nidlist(char *str, int len, struct list_head *list);
+int     cfs_match_nid(lnet_nid_t nid, struct list_head *list);
+
+/** \addtogroup lnet_addr
+ * @{ */
+/* how an LNET NID encodes net:address */
+/** extract the address part of an lnet_nid_t */
+#define LNET_NIDADDR(nid)      ((__u32)((nid) & 0xffffffff))
+/** extract the network part of an lnet_nid_t */
+#define LNET_NIDNET(nid)       ((__u32)(((nid) >> 32)) & 0xffffffff)
+/** make an lnet_nid_t from a network part and an address part */
+#define LNET_MKNID(net,addr)   ((((__u64)(net))<<32)|((__u64)(addr)))
+/* how net encodes type:number */
+#define LNET_NETNUM(net)       ((net) & 0xffff)
+#define LNET_NETTYP(net)       (((net) >> 16) & 0xffff)
+#define LNET_MKNET(typ,num)    ((((__u32)(typ))<<16)|((__u32)(num)))
+/** @} lnet_addr */
+
+/* max value for numeric network address */
+#define MAX_NUMERIC_VALUE 0xffffffff
+
+/* implication */
+#define ergo(a, b) (!(a) || (b))
+/* logical equivalence */
+#define equi(a, b) (!!(a) == !!(b))
+
+#ifndef CFS_CURRENT_TIME
+# define CFS_CURRENT_TIME time(0)
+#endif
+
+/* --------------------------------------------------------------------
+ * Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect.
+ * Everything related to lwt is kept in arch/kp30.h
+ * -------------------------------------------------------------------- */
+
+struct libcfs_device_userstate
+{
+       int        ldu_memhog_pages;
+       struct page   *ldu_memhog_root_page;
+};
+
+/* what used to be in portals_lib.h */
+#ifndef MIN
+# define MIN(a,b) (((a)<(b)) ? (a): (b))
+#endif
+#ifndef MAX
+# define MAX(a,b) (((a)>(b)) ? (a): (b))
+#endif
+
+#define MKSTR(ptr) ((ptr))? (ptr) : ""
+
+static inline int cfs_size_round4 (int val)
+{
+       return (val + 3) & (~0x3);
+}
+
+#ifndef HAVE_CFS_SIZE_ROUND
+static inline int cfs_size_round (int val)
+{
+       return (val + 7) & (~0x7);
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+static inline int cfs_size_round16(int val)
+{
+       return (val + 0xf) & (~0xf);
+}
+
+static inline int cfs_size_round32(int val)
+{
+       return (val + 0x1f) & (~0x1f);
+}
+
+static inline int cfs_size_round0(int val)
+{
+       if (!val)
+               return 0;
+       return (val + 1 + 7) & (~0x7);
+}
+
+static inline size_t cfs_round_strlen(char *fset)
+{
+       return (size_t)cfs_size_round((int)strlen(fset) + 1);
+}
+
+/* round up \a val to the next power of 2 */
+static inline unsigned int cfs_power2_roundup(unsigned int val)
+{
+       if (val != LOWEST_BIT_SET(val)) { /* not a power of 2 already */
+               do {
+                       val &= ~LOWEST_BIT_SET(val);
+               } while (val != LOWEST_BIT_SET(val));
+               /* ...and round up */
+               val <<= 1;
+       }
+       return val;
+}
+
+#define LOGL(var,len,ptr)                                     \
+do {                                                       \
+       if (var)                                                \
+               memcpy((char *)ptr, (const char *)var, len);    \
+       ptr += cfs_size_round(len);                          \
+} while (0)
+
+#define LOGU(var,len,ptr)                                     \
+do {                                                       \
+       if (var)                                                \
+               memcpy((char *)var, (const char *)ptr, len);    \
+       ptr += cfs_size_round(len);                          \
+} while (0)
+
+#define LOGL0(var,len,ptr)                           \
+do {                                               \
+       if (!len)                                      \
+               break;                            \
+       memcpy((char *)ptr, (const char *)var, len);    \
+       *((char *)(ptr) + len) = 0;                  \
+       ptr += cfs_size_round(len + 1);          \
+} while (0)
+
+/**
+ *  Lustre Network Driver types.
+ */
+enum {
+       /* Only add to these values (i.e. don't ever change or redefine them):
+        * network addresses depend on them... */
+       QSWLND    = 1,
+       SOCKLND   = 2,
+       GMLND     = 3, /* obsolete, keep it so that libcfs_nid2str works */
+       PTLLND    = 4,
+       O2IBLND   = 5,
+       CIBLND    = 6,
+       OPENIBLND = 7,
+       IIBLND    = 8,
+       LOLND     = 9,
+       RALND     = 10,
+       VIBLND    = 11,
+       MXLND     = 12,
+       GNILND    = 13,
+};
+
+#endif
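The sketch below illustrates the cfs_percpt_lock pattern described in the comment above within this header: an update of partition-private data takes only that partition's lock, while an update of the shared data takes every private lock through CFS_PERCPT_LOCK_EX. struct my_table and the two update helpers are hypothetical.

#include <linux/libcfs/libcfs.h>

struct my_table {
	struct cfs_percpt_lock	*mt_lock;
	/* per-partition private data and rarely changed shared data ... */
};

static void my_update_private(struct my_table *tab, int cpt)
{
	cfs_percpt_lock(tab->mt_lock, cpt);	/* only this partition */
	/* ... modify data private to partition @cpt ... */
	cfs_percpt_unlock(tab->mt_lock, cpt);
}

static void my_update_shared(struct my_table *tab)
{
	cfs_percpt_lock(tab->mt_lock, CFS_PERCPT_LOCK_EX); /* all partitions */
	/* ... modify the shared, rarely changed data ... */
	cfs_percpt_unlock(tab->mt_lock, CFS_PERCPT_LOCK_EX);
}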
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h
new file mode 100644 (file)
index 0000000..a6bac9c
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_string.h
+ *
+ * Generic string manipulation functions.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#ifndef __LIBCFS_STRING_H__
+#define __LIBCFS_STRING_H__
+
+/* libcfs_string.c */
+/* string comparison ignoring case */
+int cfs_strncasecmp(const char *s1, const char *s2, size_t n);
+/* Convert a text string to a bitmask */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+                int *oldmask, int minmask, int allmask);
+
+/* Allocate space for and copy an existing string.
+ * Must free with kfree().
+ */
+char *cfs_strdup(const char *str, u_int32_t flags);
+
+/* safe vsnprintf */
+int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
+
+/* safe snprintf */
+int cfs_snprintf(char *buf, size_t size, const char *fmt, ...);
+
+/* trim leading and trailing space characters */
+char *cfs_firststr(char *str, size_t size);
+
+/**
+ * Structure to represent a length-delimited (not necessarily NUL-terminated) string.
+ */
+struct cfs_lstr {
+       char            *ls_str;
+       int             ls_len;
+};
+
+/*
+ * Structure to represent \<range_expr\> token of the syntax.
+ */
+struct cfs_range_expr {
+       /*
+        * Link to cfs_expr_list::el_exprs.
+        */
+       struct list_head        re_link;
+       __u32           re_lo;
+       __u32           re_hi;
+       __u32           re_stride;
+};
+
+struct cfs_expr_list {
+       struct list_head        el_link;
+       struct list_head        el_exprs;
+};
+
+static inline int
+cfs_iswhite(char c)
+{
+       switch (c) {
+       case ' ':
+       case '\t':
+       case '\n':
+       case '\r':
+               return 1;
+       default:
+               break;
+       }
+       return 0;
+}
+
+char *cfs_trimwhite(char *str);
+int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res);
+int cfs_str2num_check(char *str, int nob, unsigned *num,
+                     unsigned min, unsigned max);
+int cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+                        int single_tok, struct cfs_range_expr **expr);
+int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list);
+int cfs_expr_list_values(struct cfs_expr_list *expr_list,
+                        int max, __u32 **values);
+static inline void
+cfs_expr_list_values_free(__u32 *values, int num)
+{
+       /* This array is allocated by LIBCFS_ALLOC(), so it must not be freed
+        * with OBD_FREE() when the caller is a module other than libcfs or
+        * LNet, otherwise we would see a fake memory leak */
+       LIBCFS_FREE(values, num * sizeof(values[0]));
+}
+
+void cfs_expr_list_free(struct cfs_expr_list *expr_list);
+void cfs_expr_list_print(struct cfs_expr_list *expr_list);
+int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+                       struct cfs_expr_list **elpp);
+void cfs_expr_list_free_list(struct list_head *list);
+int cfs_ip_addr_parse(char *str, int len, struct list_head *list);
+int cfs_ip_addr_match(__u32 addr, struct list_head *list);
+void cfs_ip_addr_free(struct list_head *list);
+
+#define        strtoul(str, endp, base)        simple_strtoul(str, endp, base)
+
+#endif
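A hedged sketch of the expression-list helpers above: parse a bracketed lo-hi/stride range such as "[0-7/2]" and match a value against it. The expression string, the 0..7 bounds and the even-CPU interpretation are illustrative; error handling is reduced to pass/fail.

#include <linux/libcfs/libcfs.h>

static int my_match_even_cpu(__u32 cpu)
{
	struct cfs_expr_list	*el;
	char			expr[] = "[0-7/2]";
	int			matched;
	int			rc;

	rc = cfs_expr_list_parse(expr, sizeof(expr) - 1, 0, 7, &el);
	if (rc != 0)
		return 0;

	matched = cfs_expr_list_match(cpu, el);
	cfs_expr_list_free(el);
	return matched;
}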
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h
new file mode 100644 (file)
index 0000000..4bdd771
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_time.h
+ *
+ * Time functions.
+ *
+ */
+
+#ifndef __LIBCFS_TIME_H__
+#define __LIBCFS_TIME_H__
+/*
+ * generic time manipulation functions.
+ */
+
+static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d)
+{
+       return (cfs_time_t)(t + d);
+}
+
+static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2)
+{
+       return (cfs_duration_t)(t1 - t2);
+}
+
+static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2)
+{
+       return cfs_time_before(t2, t1);
+}
+
+static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2)
+{
+       return cfs_time_beforeq(t2, t1);
+}
+
+
+static inline cfs_time_t cfs_time_shift(int seconds)
+{
+       return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds));
+}
+
+static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small,
+                                  struct timeval *result)
+{
+       long r = (long) (
+               (large->tv_sec - small->tv_sec) * ONE_MILLION +
+               (large->tv_usec - small->tv_usec));
+       if (result != NULL) {
+               result->tv_usec = r % ONE_MILLION;
+               result->tv_sec = r / ONE_MILLION;
+       }
+       return r;
+}
+
+static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg)
+{
+       if (cfs_time_after(cfs_time_current(),
+                          cfs_time_add(now, cfs_time_seconds(seconds))))
+               CERROR("slow %s "CFS_TIME_T" sec\n", msg,
+                      cfs_duration_sec(cfs_time_sub(cfs_time_current(), now)));
+}
+
+#define CFS_RATELIMIT(seconds)                                          \
+({                                                                      \
+       /*                                                               \
+        * XXX nikita: non-portable initializer                          \
+        */                                                              \
+       static time_t __next_message = 0;                                \
+       int result;                                                      \
+                                                                        \
+       if (cfs_time_after(cfs_time_current(), __next_message)) {        \
+               __next_message = cfs_time_shift(seconds);                \
+               result = 1;                                              \
+       } else {                                                         \
+               result = 0;                                              \
+       }                                                                \
+       result;                                                          \
+})
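+
+/*
+ * Example (sketch): emit a message at most once every 30 seconds; the
+ * interval and message text are illustrative only.
+ *
+ *     if (CFS_RATELIMIT(30))
+ *             CERROR("operation is taking unusually long\n");
+ */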
+
+/*
+ * helper function similar to do_gettimeofday() in the Linux kernel
+ */
+static inline void cfs_fs_timeval(struct timeval *tv)
+{
+       cfs_fs_time_t time;
+
+       cfs_fs_time_current(&time);
+       cfs_fs_time_usec(&time, tv);
+}
+
+/*
+ * Return a valid time-out based on the user-supplied one. Currently we only
+ * check that the time-out is not shorter than allowed.
+ */
+static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout)
+{
+       if (timeout < CFS_TICK)
+               timeout = CFS_TICK;
+       return timeout;
+}
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h
new file mode 100644 (file)
index 0000000..5cc64f3
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_workitem.h
+ *
+ * Author: Isaac Huang  <he.h.huang@oracle.com>
+ *      Liang Zhen   <zhen.liang@sun.com>
+ *
+ * A workitem is deferred work with these semantics:
+ * - a workitem always runs in thread context.
+ * - a workitem can be concurrent with other workitems but is strictly
+ *   serialized with respect to itself.
+ * - no CPU affinity, a workitem does not necessarily run on the same CPU
+ *   that schedules it. However, this might change in the future.
+ * - if a workitem is scheduled again before it has a chance to run, it
+ *   runs only once.
+ * - if a workitem is scheduled while it runs, it runs again after it
+ *   completes; this ensures that events occurring while other events are
+ *   being processed receive due attention. This behavior also allows a
+ *   workitem to reschedule itself.
+ *
+ * Usage notes:
+ * - a workitem can sleep but it should be aware of how that sleep might
+ *   affect others.
+ * - a workitem runs inside a kernel thread so there's no user space to access.
+ * - do not use a workitem if the scheduling latency can't be tolerated.
+ *
+ * When wi_action returns non-zero, it means the workitem has either been
+ * freed or reused, and the workitem scheduler won't touch it any more.
+ */
+
+#ifndef __LIBCFS_WORKITEM_H__
+#define __LIBCFS_WORKITEM_H__
+
+struct cfs_wi_sched;
+
+void cfs_wi_sched_destroy(struct cfs_wi_sched *);
+int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt,
+                       int nthrs, struct cfs_wi_sched **);
+
+struct cfs_workitem;
+
+typedef int (*cfs_wi_action_t) (struct cfs_workitem *);
+typedef struct cfs_workitem {
+       /** chain on runq or rerunq */
+       struct list_head       wi_list;
+       /** working function */
+       cfs_wi_action_t  wi_action;
+       /** arg for working function */
+       void        *wi_data;
+       /** in running */
+       unsigned short   wi_running:1;
+       /** scheduled */
+       unsigned short   wi_scheduled:1;
+} cfs_workitem_t;
+
+static inline void
+cfs_wi_init(cfs_workitem_t *wi, void *data, cfs_wi_action_t action)
+{
+       INIT_LIST_HEAD(&wi->wi_list);
+
+       wi->wi_running   = 0;
+       wi->wi_scheduled = 0;
+       wi->wi_data      = data;
+       wi->wi_action    = action;
+}
+
+void cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+int  cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+void cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+
+int  cfs_wi_startup(void);
+void cfs_wi_shutdown(void);
+
+/** # workitem scheduler loops before reschedule */
+#define CFS_WI_RESCHED    128
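+
+/*
+ * Example (sketch) of the API above: returning 0 from the action keeps
+ * the workitem owned by the scheduler (see the header comment); the
+ * scheduler handle and all other names below are illustrative only.
+ *
+ *     static int sample_wi_action(cfs_workitem_t *wi)
+ *     {
+ *             struct sample_obj *obj = wi->wi_data;
+ *
+ *             sample_process(obj);
+ *             return 0;
+ *     }
+ *
+ *     cfs_wi_init(&obj->so_wi, obj, sample_wi_action);
+ *     cfs_wi_schedule(sample_sched, &obj->so_wi);
+ */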
+
+#endif /* __LIBCFS_WORKITEM_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h b/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h
new file mode 100644 (file)
index 0000000..4b7ae1c
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_KP30_H__
+#define __LIBCFS_LINUX_KP30_H__
+
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/rwsem.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/smp.h>
+#include <linux/ctype.h>
+#include <linux/compiler.h>
+#ifdef HAVE_MM_INLINE
+# include <linux/mm_inline.h>
+#endif
+#include <linux/kallsyms.h>
+#include <linux/moduleparam.h>
+#include <linux/scatterlist.h>
+
+#include <linux/libcfs/linux/portals_compat25.h>
+
+
+#define prepare_work(wq,cb,cbdata)                                         \
+do {                                                                     \
+       INIT_WORK((wq), (void *)(cb));                                  \
+} while (0)
+
+#define cfs_get_work_data(type,field,data) container_of(data,type,field)
+
+
+#define our_recalc_sigpending(current) recalc_sigpending()
+#define strtok(a,b) strpbrk(a, b)
+#define work_struct_t      struct work_struct
+
+#ifdef CONFIG_SMP
+#else
+#endif
+
+
+#define SEM_COUNT(sem)   ((sem)->count)
+
+
+/* ------------------------------------------------------------------- */
+
+#define PORTAL_SYMBOL_REGISTER(x)
+#define PORTAL_SYMBOL_UNREGISTER(x)
+
+
+
+
+/******************************************************************************/
+/* Module parameter support */
+#define CFS_MODULE_PARM(name, t, type, perm, desc) \
+       module_param(name, type, perm);\
+       MODULE_PARM_DESC(name, desc)
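+
+/*
+ * Example (sketch): declaring an integer module parameter.  Note the "t"
+ * (type string) argument is not used by this Linux definition; the
+ * parameter name and description below are illustrative only.
+ *
+ *     static int sample_timeout = 30;
+ *     CFS_MODULE_PARM(sample_timeout, "i", int, 0644,
+ *                     "sample timeout in seconds");
+ */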
+
+#define CFS_SYSFS_MODULE_PARM  1 /* module parameters accessible via sysfs */
+
+/******************************************************************************/
+
+#if (__GNUC__)
+/* Use the special GNU C __attribute__ hack to have the compiler check the
+ * printf style argument string against the actual argument count and
+ * types.
+ */
+#ifdef printf
+# warning printf has been defined as a macro...
+# undef printf
+#endif
+
+#endif /* __GNUC__ */
+
+# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b)
+# define printf(format, b...) CDEBUG(D_OTHER, format , ## b)
+# define time(a) CURRENT_TIME
+
+# define cfs_num_present_cpus()  num_present_cpus()
+
+/******************************************************************************/
+/* Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect. */
+#define LWT_SUPPORT  0
+
+#define LWT_MEMORY   (16<<20)
+
+#ifndef KLWT_SUPPORT
+#  if !defined(BITS_PER_LONG)
+#   error "BITS_PER_LONG not defined"
+#  endif
+
+/* kernel hasn't defined this? */
+typedef struct {
+       long long   lwte_when;
+       char       *lwte_where;
+       void       *lwte_task;
+       long    lwte_p1;
+       long    lwte_p2;
+       long    lwte_p3;
+       long    lwte_p4;
+# if BITS_PER_LONG > 32
+       long    lwte_pad;
+# endif
+} lwt_event_t;
+#endif /* !KLWT_SUPPORT */
+
+#if LWT_SUPPORT
+#  if !KLWT_SUPPORT
+
+typedef struct _lwt_page {
+       struct list_head               lwtp_list;
+       struct page          *lwtp_page;
+       lwt_event_t          *lwtp_events;
+} lwt_page_t;
+
+typedef struct {
+       int             lwtc_current_index;
+       lwt_page_t      *lwtc_current_page;
+} lwt_cpu_t;
+
+extern int       lwt_enabled;
+extern lwt_cpu_t lwt_cpus[];
+
+/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set.
+ * This stuff is meant for finding specific problems; it never stays in
+ * production code... */
+
+#define LWTSTR(n)       #n
+#define LWTWHERE(f,l)   f ":" LWTSTR(l)
+#define LWT_EVENTS_PER_PAGE (PAGE_CACHE_SIZE / sizeof (lwt_event_t))
+
+#define LWT_EVENT(p1, p2, p3, p4)                                     \
+do {                                                               \
+       unsigned long    flags;                                  \
+       lwt_cpu_t       *cpu;                                      \
+       lwt_page_t      *p;                                          \
+       lwt_event_t     *e;                                          \
+                                                                       \
+       if (lwt_enabled) {                                            \
+               local_irq_save (flags);                          \
+                                                                       \
+               cpu = &lwt_cpus[smp_processor_id()];                \
+               p = cpu->lwtc_current_page;                          \
+               e = &p->lwtp_events[cpu->lwtc_current_index++];  \
+                                                                       \
+               if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) {   \
+                       cpu->lwtc_current_page =                        \
+                               list_entry (p->lwtp_list.next,      \
+                                               lwt_page_t, lwtp_list); \
+                       cpu->lwtc_current_index = 0;                \
+               }                                                      \
+                                                                       \
+               e->lwte_when  = get_cycles();                      \
+               e->lwte_where = LWTWHERE(__FILE__,__LINE__);        \
+               e->lwte_task  = current;                                \
+               e->lwte_p1    = (long)(p1);                          \
+               e->lwte_p2    = (long)(p2);                          \
+               e->lwte_p3    = (long)(p3);                          \
+               e->lwte_p4    = (long)(p4);                          \
+                                                                       \
+               local_irq_restore (flags);                            \
+       }                                                              \
+} while (0)
+
+#endif /* !KLWT_SUPPORT */
+
+extern int  lwt_init (void);
+extern void lwt_fini (void);
+extern int  lwt_lookup_string (int *size, char *knlptr,
+                              char *usrptr, int usrsize);
+extern int  lwt_control (int enable, int clear);
+extern int  lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size,
+                         void *user_ptr, int user_size);
+#endif /* LWT_SUPPORT */
+
+/* ------------------------------------------------------------------ */
+
+#define IOCTL_LIBCFS_TYPE long
+
+#ifdef __CYGWIN__
+# ifndef BITS_PER_LONG
+#   define BITS_PER_LONG 64
+# endif
+#endif
+
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+
+/* this is a bit chunky */
+
+#define _LWORDSIZE BITS_PER_LONG
+
+# define LPU64 "%llu"
+# define LPD64 "%lld"
+# define LPX64 "%#llx"
+# define LPX64i "%llx"
+# define LPO64 "%#llo"
+# define LPF64 "L"
+
+/*
+ * long_ptr_t & ulong_ptr_t, same as "long" for gcc
+ */
+# define LPLU "%lu"
+# define LPLD "%ld"
+# define LPLX "%#lx"
+
+/*
+ * pid_t
+ */
+# define LPPID "%d"
+
+
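+/*
+ * Example (sketch): the LP* helpers above keep printk formats for 64-bit
+ * values portable; the variables below are illustrative only.
+ *
+ *     __u64 used, total;
+ *
+ *     CDEBUG(D_OTHER, "used "LPU64" of "LPU64" bytes\n", used, total);
+ */
+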
+#undef _LWORDSIZE
+
+/* compat macros */
+
+
+#ifndef get_cpu
+# ifdef CONFIG_PREEMPT
+#  define get_cpu()  ({ preempt_disable(); smp_processor_id(); })
+#  define put_cpu()  preempt_enable()
+# else
+#  define get_cpu()  smp_processor_id()
+#  define put_cpu()
+# endif
+#else
+#endif /* get_cpu & put_cpu */
+
+#define INIT_CTL_NAME(a)
+#define INIT_STRATEGY(a)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h
new file mode 100644 (file)
index 0000000..757e6dc
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_LIBCFS_H__
+#define __LIBCFS_LINUX_LIBCFS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+
+#include <stdarg.h>
+#include <linux/libcfs/linux/linux-cpu.h>
+#include <linux/libcfs/linux/linux-time.h>
+#include <linux/libcfs/linux/linux-mem.h>
+#include <linux/libcfs/linux/linux-prim.h>
+#include <linux/libcfs/linux/linux-lock.h>
+#include <linux/libcfs/linux/linux-fs.h>
+#include <linux/libcfs/linux/linux-tcpip.h>
+#include <linux/libcfs/linux/linux-bitops.h>
+#include <linux/libcfs/linux/linux-types.h>
+#include <linux/libcfs/linux/kp30.h>
+
+#include <asm/types.h>
+#include <linux/types.h>
+#include <asm/timex.h>
+#include <linux/sched.h> /* THREAD_SIZE */
+#include <linux/rbtree.h>
+
+#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
+
+#if !defined(__x86_64__)
+# ifdef  __ia64__
+#  define CDEBUG_STACK() (THREAD_SIZE -                                 \
+                         ((unsigned long)__builtin_dwarf_cfa() &       \
+                          (THREAD_SIZE - 1)))
+# else
+#  define CDEBUG_STACK() (THREAD_SIZE -                                 \
+                         ((unsigned long)__builtin_frame_address(0) &  \
+                          (THREAD_SIZE - 1)))
+# endif /* __ia64__ */
+
+#define __CHECK_STACK(msgdata, mask, cdls)                           \
+do {                                                               \
+       if (unlikely(CDEBUG_STACK() > libcfs_stack)) {            \
+               LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL);   \
+               libcfs_stack = CDEBUG_STACK();                    \
+               libcfs_debug_msg(msgdata,                              \
+                                "maximum lustre stack %lu\n",    \
+                                CDEBUG_STACK());                      \
+               (msgdata)->msg_mask = mask;                          \
+               (msgdata)->msg_cdls = cdls;                          \
+               dump_stack();                                      \
+             /*panic("LBUG");*/                                        \
+       }                                                              \
+} while (0)
+#define CFS_CHECK_STACK(msgdata, mask, cdls)  __CHECK_STACK(msgdata, mask, cdls)
+#else /* __x86_64__ */
+#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0)
+#define CDEBUG_STACK() (0L)
+#endif /* __x86_64__ */
+
+/* initial pid  */
+#define LUSTRE_LNET_PID          12345
+
+#define ENTRY_NESTING_SUPPORT (1)
+#define ENTRY_NESTING   do {;} while (0)
+#define EXIT_NESTING   do {;} while (0)
+#define __current_nesting_level() (0)
+
+/**
+ * Platform specific declarations for cfs_curproc API (libcfs/curproc.h)
+ *
+ * Implementation is in linux-curproc.c
+ */
+#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm)
+
+#include <linux/capability.h>
+
+/*
+ * No stack-back-tracing in Linux for now.
+ */
+struct cfs_stack_trace {
+};
+
+/* long integer with size equal to pointer */
+typedef unsigned long ulong_ptr_t;
+typedef long long_ptr_t;
+
+#ifndef WITH_WATCHDOG
+#define WITH_WATCHDOG
+#endif
+
+
+
+
+#endif /* __LIBCFS_LINUX_LIBCFS_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h
new file mode 100644 (file)
index 0000000..43936e3
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-bitops.h
+ */
+#include <linux/bitops.h>
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h
new file mode 100644 (file)
index 0000000..224371c
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-cpu.h
+ *
+ * Basic library routines.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_LINUX_CPU_H__
+#define __LIBCFS_LINUX_CPU_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/topology.h>
+#include <linux/version.h>
+
+
+#ifdef CONFIG_SMP
+
+#define HAVE_LIBCFS_CPT
+
+/** virtual processing unit */
+struct cfs_cpu_partition {
+       /* CPUs mask for this partition */
+       cpumask_t                       *cpt_cpumask;
+       /* nodes mask for this partition */
+       nodemask_t                      *cpt_nodemask;
+       /* spread rotor for NUMA allocator */
+       unsigned                        cpt_spread_rotor;
+};
+
+/** descriptor for CPU partitions */
+struct cfs_cpt_table {
+       /* version, reserved for hotplug */
+       unsigned                        ctb_version;
+       /* spread rotor for NUMA allocator */
+       unsigned                        ctb_spread_rotor;
+       /* # of CPU partitions */
+       unsigned                        ctb_nparts;
+       /* partitions tables */
+       struct cfs_cpu_partition        *ctb_parts;
+       /* shadow HW CPU to CPU partition ID */
+       int                             *ctb_cpu2cpt;
+       /* all cpus in this partition table */
+       cpumask_t                       *ctb_cpumask;
+       /* all nodes in this partition table */
+       nodemask_t                      *ctb_nodemask;
+};
+
+void cfs_cpu_core_siblings(int cpu, cpumask_t *mask);
+void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask);
+void cfs_node_to_cpumask(int node, cpumask_t *mask);
+int cfs_cpu_core_nsiblings(int cpu);
+int cfs_cpu_ht_nsiblings(int cpu);
+
+/**
+ * Commented-out definitions kept for the compatibility layer:
+ * #define CFS_CPU_NR                    NR_CPUS
+ *
+ * typedef cpumask_t                      cfs_cpumask_t;
+ *
+ * #define cfs_cpu_current()              smp_processor_id()
+ * #define cfs_cpu_online(i)              cpu_online(i)
+ * #define cfs_cpu_online_num()                num_online_cpus()
+ * #define cfs_cpu_online_for_each(i)    for_each_online_cpu(i)
+ * #define cfs_cpu_possible_num()            num_possible_cpus()
+ * #define cfs_cpu_possible_for_each(i)        for_each_possible_cpu(i)
+ *
+ * #ifdef CONFIG_CPUMASK_SIZE
+ * #define cfs_cpu_mask_size()          cpumask_size()
+ * #else
+ * #define cfs_cpu_mask_size()          sizeof(cfs_cpumask_t)
+ * #endif
+ *
+ * #define cfs_cpu_mask_set(i, mask)      cpu_set(i, mask)
+ * #define cfs_cpu_mask_unset(i, mask)  cpu_clear(i, mask)
+ * #define cfs_cpu_mask_isset(i, mask)  cpu_isset(i, mask)
+ * #define cfs_cpu_mask_clear(mask)        cpus_clear(mask)
+ * #define cfs_cpu_mask_empty(mask)        cpus_empty(mask)
+ * #define cfs_cpu_mask_weight(mask)      cpus_weight(mask)
+ * #define cfs_cpu_mask_first(mask)        first_cpu(mask)
+ * #define cfs_cpu_mask_any_online(mask)      (any_online_cpu(mask) != NR_CPUS)
+ * #define cfs_cpu_mask_for_each(i, mask)      for_each_cpu_mask(i, mask)
+ * #define cfs_cpu_mask_bind(t, mask)    set_cpus_allowed(t, mask)
+ *
+ * #ifdef HAVE_CPUMASK_COPY
+ * #define cfs_cpu_mask_copy(dst, src)  cpumask_copy(dst, src)
+ * #else
+ * #define cfs_cpu_mask_copy(dst, src)  memcpy(dst, src, sizeof(*src))
+ * #endif
+ *
+ * static inline void
+ * cfs_cpu_mask_of_online(cfs_cpumask_t *mask)
+ * {
+ * cfs_cpu_mask_copy(mask, &cpu_online_map);
+ * }
+ *
+ * #ifdef CONFIG_NUMA
+ *
+ * #define CFS_NODE_NR                  MAX_NUMNODES
+ *
+ * typedef nodemask_t                    cfs_node_mask_t;
+ *
+ * #define cfs_node_of_cpu(cpu)                cpu_to_node(cpu)
+ * #define cfs_node_online(i)            node_online(i)
+ * #define cfs_node_online_num()              num_online_nodes()
+ * #define cfs_node_online_for_each(i)  for_each_online_node(i)
+ * #define cfs_node_possible_num()          num_possible_nodes()
+ * #define cfs_node_possible_for_each(i)       for_each_node(i)
+ *
+ * static inline void cfs_node_to_cpumask(int node, cfs_cpumask_t *mask)
+ * {
+ * #if defined(HAVE_NODE_TO_CPUMASK)
+ *      *mask = node_to_cpumask(node);
+ * #elif defined(HAVE_CPUMASK_OF_NODE)
+ *      cfs_cpu_mask_copy(mask, cpumask_of_node(node));
+ * #else
+ * # error "Needs node_to_cpumask or cpumask_of_node"
+ * #endif
+ * }
+ *
+ * #define cfs_node_mask_set(i, mask)    node_set(i, mask)
+ * #define cfs_node_mask_unset(i, mask)        node_clear(i, mask)
+ * #define cfs_node_mask_isset(i, mask)        node_isset(i, mask)
+ * #define cfs_node_mask_clear(mask)      nodes_reset(mask)
+ * #define cfs_node_mask_empty(mask)      nodes_empty(mask)
+ * #define cfs_node_mask_weight(mask)    nodes_weight(mask)
+ * #define cfs_node_mask_for_each(i, mask)     for_each_node_mask(i, mask)
+ * #define cfs_node_mask_copy(dst, src)        memcpy(dst, src, sizeof(*src))
+ *
+ * static inline void
+ * cfs_node_mask_of_online(cfs_node_mask_t *mask)
+ * {
+ *       cfs_node_mask_copy(mask, &node_online_map);
+ * }
+ *
+ * #endif
+ */
+
+#endif /* CONFIG_SMP */
+#endif /* __LIBCFS_LINUX_CPU_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h
new file mode 100644 (file)
index 0000000..97c771c
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/**
+ * Linux crypto hash specific functions.
+ */
+
+/**
+ * Functions for start/stop shash CRC32 algorithm.
+ */
+int cfs_crypto_crc32_register(void);
+void cfs_crypto_crc32_unregister(void);
+
+/**
+ * Functions for start/stop shash adler32 algorithm.
+ */
+int cfs_crypto_adler32_register(void);
+void cfs_crypto_adler32_unregister(void);
+
+/**
+ * Functions for start/stop shash crc32 pclmulqdq algorithm.
+ */
+int cfs_crypto_crc32_pclmul_register(void);
+void cfs_crypto_crc32_pclmul_unregister(void);
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h
new file mode 100644 (file)
index 0000000..90ff47a
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-fs.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_FS_H__
+#define __LIBCFS_LINUX_CFS_FS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/mount.h>
+#include <linux/backing-dev.h>
+#include <linux/posix_acl_xattr.h>
+
+#define filp_size(f)                                   \
+       (i_size_read((f)->f_dentry->d_inode))
+#define filp_poff(f)                                   \
+       (&(f)->f_pos)
+
+# define do_fsync(fp, flag)                            \
+       ((fp)->f_op->fsync(fp, 0, LLONG_MAX, flag))
+
+#define filp_read(fp, buf, size, pos)                  \
+       ((fp)->f_op->read((fp), (buf), (size), pos))
+
+#define filp_write(fp, buf, size, pos)                 \
+       ((fp)->f_op->write((fp), (buf), (size), pos))
+
+#define filp_fsync(fp)                                 \
+       do_fsync(fp, 1)
+
+#define flock_type(fl)                 ((fl)->fl_type)
+#define flock_set_type(fl, type)       do { (fl)->fl_type = (type); } while (0)
+#define flock_pid(fl)                  ((fl)->fl_pid)
+#define flock_set_pid(fl, pid)         do { (fl)->fl_pid = (pid); } while (0)
+#define flock_start(fl)                        ((fl)->fl_start)
+#define flock_set_start(fl, st)                do { (fl)->fl_start = (st); } while (0)
+#define flock_end(fl)                  ((fl)->fl_end)
+#define flock_set_end(fl, end)         do { (fl)->fl_end = (end); } while (0)
+
+ssize_t filp_user_write(struct file *filp, const void *buf, size_t count,
+                       loff_t *offset);
+
+#ifndef IFSHIFT
+#define IFSHIFT                        12
+#endif
+
+#ifndef IFTODT
+#define IFTODT(type)           (((type) & S_IFMT) >> IFSHIFT)
+#endif
+#ifndef DTTOIF
+#define DTTOIF(dirtype)                ((dirtype) << IFSHIFT)
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h
new file mode 100644 (file)
index 0000000..6fbcbf3
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-lock.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_LOCK_H__
+#define __LIBCFS_LINUX_CFS_LOCK_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/mutex.h>
+
+/*
+ * IMPORTANT !!!!!!!!
+ *
+ * Lock declarations are not guaranteed to be initialized, although
+ * some of them are initialized in Linux. All locks declared by
+ * CFS_DECL_* should be initialized explicitly.
+ */
+
+/*
+ * spin_lock "implementation" (use Linux kernel's primitives)
+ *
+ * - spin_lock_init(x)
+ * - spin_lock(x)
+ * - spin_lock_bh(x)
+ * - spin_lock_bh_init(x)
+ * - spin_unlock(x)
+ * - spin_unlock_bh(x)
+ * - spin_trylock(x)
+ * - spin_is_locked(x)
+ *
+ * - spin_lock_irq(x)
+ * - spin_lock_irqsave(x, f)
+ * - spin_unlock_irqrestore(x, f)
+ * - read_lock_irqsave(lock, f)
+ * - write_lock_irqsave(lock, f)
+ * - write_unlock_irqrestore(lock, f)
+ */
+
+/*
+ * spinlock "implementation"
+ */
+
+
+
+
+/*
+ * rw_semaphore "implementation" (use Linux kernel's primitives)
+ *
+ * - sema_init(x)
+ * - init_rwsem(x)
+ * - down_read(x)
+ * - up_read(x)
+ * - down_write(x)
+ * - up_write(x)
+ */
+
+
+#define fini_rwsem(s)          do {} while (0)
+
+
+/*
+ * rwlock_t "implementation" (use Linux kernel's primitives)
+ *
+ * - rwlock_init(x)
+ * - read_lock(x)
+ * - read_unlock(x)
+ * - write_lock(x)
+ * - write_unlock(x)
+ * - write_lock_bh(x)
+ * - write_unlock_bh(x)
+ *
+ * - RW_LOCK_UNLOCKED
+ */
+
+
+#ifndef DEFINE_RWLOCK
+#define DEFINE_RWLOCK(lock)    rwlock_t lock = __RW_LOCK_UNLOCKED(lock)
+#endif
+
+/*
+ * completion "implementation" (use Linux kernel's primitives)
+ *
+ * - DECLARE_COMPLETION(work)
+ * - INIT_COMPLETION(c)
+ * - COMPLETION_INITIALIZER(work)
+ * - init_completion(c)
+ * - complete(c)
+ * - wait_for_completion(c)
+ * - wait_for_completion_interruptible(c)
+ * - fini_completion(c)
+ */
+#define fini_completion(c) do { } while (0)
+
+/*
+ * semaphore "implementation" (use Linux kernel's primitives)
+ * - DEFINE_SEMAPHORE(name)
+ * - sema_init(sem, val)
+ * - up(sem)
+ * - down(sem)
+ * - down_interruptible(sem)
+ * - down_trylock(sem)
+ */
+
+/*
+ * mutex "implementation" (use Linux kernel's primitives)
+ *
+ * - DEFINE_MUTEX(name)
+ * - mutex_init(x)
+ * - mutex_lock(x)
+ * - mutex_unlock(x)
+ * - mutex_trylock(x)
+ * - mutex_is_locked(x)
+ * - mutex_destroy(x)
+ */
+
+#ifndef lockdep_set_class
+
+/**************************************************************************
+ *
+ * Lockdep "implementation". Also see liblustre.h
+ *
+ **************************************************************************/
+
+struct lock_class_key {
+       ;
+};
+
+#define lockdep_set_class(lock, key) \
+       do { (void)sizeof(lock); (void)sizeof(key); } while (0)
+/* This has to be a macro, so that `subclass' can be undefined in kernels
+ * that do not support lockdep. */
+
+
+static inline void lockdep_off(void)
+{
+}
+
+static inline void lockdep_on(void)
+{
+}
+#else
+
+#endif /* lockdep_set_class */
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#ifndef mutex_lock_nested
+#define mutex_lock_nested(mutex, subclass) mutex_lock(mutex)
+#endif
+
+#ifndef spin_lock_nested
+#define spin_lock_nested(lock, subclass) spin_lock(lock)
+#endif
+
+#ifndef down_read_nested
+#define down_read_nested(lock, subclass) down_read(lock)
+#endif
+
+#ifndef down_write_nested
+#define down_write_nested(lock, subclass) down_write(lock)
+#endif
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+
+#endif /* __LIBCFS_LINUX_CFS_LOCK_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h
new file mode 100644 (file)
index 0000000..f6cb463
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-mem.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_MEM_H__
+#define __LIBCFS_LINUX_CFS_MEM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+
+#define CFS_PAGE_MASK             (~((__u64)PAGE_CACHE_SIZE-1))
+#define page_index(p)       ((p)->index)
+
+#define memory_pressure_get() (current->flags & PF_MEMALLOC)
+#define memory_pressure_set() do { current->flags |= PF_MEMALLOC; } while (0)
+#define memory_pressure_clr() do { current->flags &= ~PF_MEMALLOC; } while (0)
+
+#if BITS_PER_LONG == 32
+/* limit to lowmem on 32-bit systems */
+#define NUM_CACHEPAGES \
+       min(num_physpages, (1UL << (30 - PAGE_CACHE_SHIFT)) * 3 / 4)
+#else
+#define NUM_CACHEPAGES num_physpages
+#endif
+
+/*
+ * In Linux there is no way to determine whether the current execution
+ * context is blockable.
+ */
+#define ALLOC_ATOMIC_TRY   GFP_ATOMIC
+
+#define DECL_MMSPACE           mm_segment_t __oldfs
+#define MMSPACE_OPEN \
+       do { __oldfs = get_fs(); set_fs(get_ds());} while(0)
+#define MMSPACE_CLOSE         set_fs(__oldfs)
+
+
+/*
+ * NUMA allocators
+ *
+ * NB: we will rename these functions in a separate patch:
+ * - rename kmalloc to cfs_malloc
+ * - rename kmalloc/free_page to cfs_page_alloc/free
+ * - rename kmalloc/free_large to cfs_vmalloc/vfree
+ */
+extern void *cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt,
+                           size_t nr_bytes, unsigned int flags);
+extern void *cfs_cpt_vmalloc(struct cfs_cpt_table *cptab, int cpt,
+                            size_t nr_bytes);
+extern struct page *cfs_page_cpt_alloc(struct cfs_cpt_table *cptab,
+                                     int cpt, unsigned int flags);
+extern void *cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep,
+                                    struct cfs_cpt_table *cptab,
+                                    int cpt, unsigned int flags);
+
+/*
+ * Shrinker
+ */
+
+# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)  \
+                      struct shrinker *shrinker, \
+                      struct shrink_control *sc
+# define shrink_param(sc, var) ((sc)->var)
+
+typedef int (*shrinker_t)(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask));
+
+static inline
+struct shrinker *set_shrinker(int seek, shrinker_t func)
+{
+       struct shrinker *s;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (s == NULL)
+               return (NULL);
+
+       s->shrink = func;
+       s->seeks = seek;
+
+       register_shrinker(s);
+
+       return s;
+}
+
+static inline
+void remove_shrinker(struct shrinker *shrinker)
+{
+       if (shrinker == NULL)
+               return;
+
+       unregister_shrinker(shrinker);
+       kfree(shrinker);
+}
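+
+/*
+ * Example (sketch) of the wrappers above.  The callback is expected to
+ * return the number of objects left in the cache; all names below are
+ * illustrative only.
+ *
+ *     static int sample_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+ *     {
+ *             if (shrink_param(sc, nr_to_scan) != 0)
+ *                     sample_reclaim(shrink_param(sc, nr_to_scan));
+ *             return sample_cached_count();
+ *     }
+ *
+ *     sample_shrinker = set_shrinker(DEFAULT_SEEKS, sample_shrink);
+ *     remove_shrinker(sample_shrinker);
+ */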
+
+#endif /* __LIBCFS_LINUX_CFS_MEM_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h
new file mode 100644 (file)
index 0000000..c346bcd
--- /dev/null
@@ -0,0 +1,243 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-prim.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_PRIM_H__
+#define __LIBCFS_LINUX_CFS_PRIM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/mm.h>
+#include <linux/timer.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+
+#include <linux/miscdevice.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+#include <asm/div64.h>
+
+#include <linux/libcfs/linux/linux-time.h>
+
+
+/*
+ * CPU
+ */
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#ifdef NR_CPUS
+#else
+#define NR_CPUS     1
+#endif
+
+#define cfs_set_cpus_allowed(t, mask)  set_cpus_allowed(t, mask)
+
+/*
+ * cache
+ */
+
+/*
+ * IRQs
+ */
+
+
+/*
+ * Pseudo device register
+ */
+typedef struct miscdevice              psdev_t;
+
+/*
+ * Sysctl register
+ */
+typedef struct ctl_table               ctl_table_t;
+typedef struct ctl_table_header                ctl_table_header_t;
+
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t)
+
+#define DECLARE_PROC_HANDLER(name)                   \
+static int                                           \
+LL_PROC_PROTO(name)                                 \
+{                                                     \
+       DECLARE_LL_PROC_PPOS_DECL;                    \
+                                                       \
+       return proc_call_handler(table->data, write,    \
+                                ppos, buffer, lenp,    \
+                                __##name);          \
+}
+
+/*
+ * Symbol register
+ */
+#define cfs_symbol_register(s, p)       do {} while(0)
+#define cfs_symbol_unregister(s)       do {} while(0)
+#define cfs_symbol_get(s)             symbol_get(s)
+#define cfs_symbol_put(s)             symbol_put(s)
+
+typedef struct module module_t;
+
+/*
+ * Proc file system APIs
+ */
+typedef struct proc_dir_entry     proc_dir_entry_t;
+
+/*
+ * Wait Queue
+ */
+
+
+typedef long                       cfs_task_state_t;
+
+#define CFS_DECL_WAITQ(wq)             DECLARE_WAIT_QUEUE_HEAD(wq)
+
+/*
+ * Task struct
+ */
+typedef struct task_struct           task_t;
+#define DECL_JOURNAL_DATA         void *journal_info
+#define PUSH_JOURNAL           do {    \
+       journal_info = current->journal_info;   \
+       current->journal_info = NULL;      \
+       } while(0)
+#define POP_JOURNAL             do {    \
+       current->journal_info = journal_info;   \
+       } while(0)
+
+/* Module interfaces */
+#define cfs_module(name, version, init, fini) \
+       module_init(init);                  \
+       module_exit(fini)
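+
+/*
+ * Example (sketch): wiring module entry points through cfs_module(); the
+ * name and version arguments are unused by this Linux definition, and
+ * all names below are illustrative only.
+ *
+ *     static int __init sample_module_init(void)
+ *     {
+ *             return 0;
+ *     }
+ *
+ *     static void __exit sample_module_fini(void)
+ *     {
+ *     }
+ *
+ *     cfs_module(sample, "1.0.0", sample_module_init, sample_module_fini);
+ */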
+
+/*
+ * Signal
+ */
+
+/*
+ * Timer
+ */
+typedef struct timer_list timer_list_t;
+
+
+#ifndef wait_event_timeout /* Only for RHEL3 2.4.21 kernel */
+#define __wait_event_timeout(wq, condition, timeout, ret)      \
+do {                                                        \
+       int __ret = 0;                                     \
+       if (!(condition)) {                                   \
+               wait_queue_t __wait;                         \
+               unsigned long expire;                       \
+                                                                \
+               init_waitqueue_entry(&__wait, current);   \
+               expire = timeout + jiffies;                   \
+               add_wait_queue(&wq, &__wait);               \
+               for (;;) {                                     \
+                       set_current_state(TASK_UNINTERRUPTIBLE); \
+                       if (condition)                     \
+                               break;                     \
+                       if (jiffies > expire) {           \
+                               ret = jiffies - expire;   \
+                               break;                     \
+                       }                                       \
+                       schedule_timeout(timeout);             \
+               }                                               \
+               current->state = TASK_RUNNING;             \
+               remove_wait_queue(&wq, &__wait);                 \
+       }                                                       \
+} while (0)
+/*
+ * retval == 0: condition met; we're good.
+ * retval  > 0: timed out.
+ */
+#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret)    \
+do {                                                            \
+       ret = 0;                                                     \
+       if (!(condition))                                           \
+               __wait_event_timeout(wq, condition, timeout, ret);   \
+} while (0)
+#else
+#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret)    \
+       ret = wait_event_timeout(wq, condition, timeout)
+#endif
+
+#define cfs_waitq_wait_event_interruptible_timeout(wq, c, timeout, ret) \
+       ret = wait_event_interruptible_timeout(wq, c, timeout)
+
+/*
+ * atomic
+ */
+
+
+#define cfs_atomic_add_unless(atom, a, u)    atomic_add_unless(atom, a, u)
+#define cfs_atomic_cmpxchg(atom, old, nv)    atomic_cmpxchg(atom, old, nv)
+
+/*
+ * membar
+ */
+
+
+/*
+ * interrupt
+ */
+
+
+/*
+ * might_sleep
+ */
+
+/*
+ * group_info
+ */
+typedef struct group_info group_info_t;
+
+
+/*
+ * Random bytes
+ */
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h
new file mode 100644 (file)
index 0000000..687f33f
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-tcpip.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_TCP_H__
+#define __LIBCFS_LINUX_CFS_TCP_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <net/sock.h>
+
+#ifndef HIPQUAD
+// XXX Should just kill all users
+#if defined(__LITTLE_ENDIAN)
+#define HIPQUAD(addr) \
+       ((unsigned char *)&addr)[3], \
+       ((unsigned char *)&addr)[2], \
+       ((unsigned char *)&addr)[1], \
+       ((unsigned char *)&addr)[0]
+#elif defined(__BIG_ENDIAN)
+#define HIPQUAD NIPQUAD
+#else
+#error "Please fix asm/byteorder.h"
+#endif /* __LITTLE_ENDIAN */
+#endif
+
+typedef struct socket   socket_t;
+
+#define SOCK_SNDBUF(so)         ((so)->sk->sk_sndbuf)
+#define SOCK_TEST_NOSPACE(so)   test_bit(SOCK_NOSPACE, &(so)->flags)
+
+static inline int
+cfs_sock_error(struct socket *sock)
+{
+       return sock->sk->sk_err;
+}
+
+static inline int
+cfs_sock_wmem_queued(struct socket *sock)
+{
+       return sock->sk->sk_wmem_queued;
+}
+
+#define cfs_sk_sleep(sk)       sk_sleep(sk)
+
+#define DEFAULT_NET    (&init_net)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h
new file mode 100644 (file)
index 0000000..4a48b91
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-time.h
+ *
+ * Implementation of portable time API for Linux (kernel and user-level).
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LIBCFS_LINUX_LINUX_TIME_H__
+#define __LIBCFS_LINUX_LINUX_TIME_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+/* Portable time API */
+
+/*
+ * Platform provides three opaque data-types:
+ *
+ *  cfs_time_t        represents a point in time. This is internal kernel
+ *                 time rather than "wall clock" time. This time bears no
+ *                 relation to gettimeofday().
+ *
+ *  cfs_duration_t    represents a time interval with the resolution of the
+ *                 internal platform clock
+ *
+ *  cfs_fs_time_t     represents an instant in world-visible time. This is
+ *                 used in file-system time-stamps
+ *
+ *  cfs_time_t     cfs_time_current(void);
+ *  cfs_time_t     cfs_time_add    (cfs_time_t, cfs_duration_t);
+ *  cfs_duration_t cfs_time_sub    (cfs_time_t, cfs_time_t);
+ *  int            cfs_impl_time_before (cfs_time_t, cfs_time_t);
+ *  int            cfs_impl_time_before_eq(cfs_time_t, cfs_time_t);
+ *
+ *  cfs_duration_t cfs_duration_build(int64_t);
+ *
+ *  time_t         cfs_duration_sec (cfs_duration_t);
+ *  void           cfs_duration_usec(cfs_duration_t, struct timeval *);
+ *  void           cfs_duration_nsec(cfs_duration_t, struct timespec *);
+ *
+ *  void           cfs_fs_time_current(cfs_fs_time_t *);
+ *  time_t         cfs_fs_time_sec    (cfs_fs_time_t *);
+ *  void           cfs_fs_time_usec   (cfs_fs_time_t *, struct timeval *);
+ *  void           cfs_fs_time_nsec   (cfs_fs_time_t *, struct timespec *);
+ *  int            cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
+ *  int            cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
+ *
+ *  CFS_TIME_FORMAT
+ *  CFS_DURATION_FORMAT
+ *
+ */
+
+#define ONE_BILLION ((u_int64_t)1000000000)
+#define ONE_MILLION 1000000
+
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/time.h>
+#include <asm/div64.h>
+
+#include <linux/libcfs/linux/portals_compat25.h>
+
+/*
+ * post 2.5 kernels.
+ */
+
+#include <linux/jiffies.h>
+
+typedef struct timespec cfs_fs_time_t;
+
+static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
+{
+       v->tv_sec  = t->tv_sec;
+       v->tv_usec = t->tv_nsec / 1000;
+}
+
+static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
+{
+       *s = *t;
+}
+
+/*
+ * internal helper function used by cfs_fs_time_before*()
+ */
+static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t)
+{
+       return (unsigned long long)t->tv_sec * ONE_BILLION + t->tv_nsec;
+}
+
+
+/*
+ * Generic kernel stuff
+ */
+
+typedef unsigned long cfs_time_t;      /* jiffies */
+typedef long cfs_duration_t;
+typedef cycles_t cfs_cycles_t;
+
+static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2)
+{
+       return time_before(t1, t2);
+}
+
+static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
+{
+       return time_before_eq(t1, t2);
+}
+
+static inline cfs_time_t cfs_time_current(void)
+{
+       return jiffies;
+}
+
+static inline time_t cfs_time_current_sec(void)
+{
+       return get_seconds();
+}
+
+static inline void cfs_fs_time_current(cfs_fs_time_t *t)
+{
+       *t = CURRENT_TIME;
+}
+
+static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
+{
+       return t->tv_sec;
+}
+
+static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+       return __cfs_fs_time_flat(t1) <  __cfs_fs_time_flat(t2);
+}
+
+static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+       return __cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2);
+}
+
+#if 0
+static inline cfs_duration_t cfs_duration_build(int64_t nano)
+{
+#if (BITS_PER_LONG == 32)
+       /* We cannot use do_div(t, ONE_BILLION), do_div can only process
+        * 64 bits n and 32 bits base */
+       int64_t  t = nano * HZ;
+       do_div(t, 1000);
+       do_div(t, 1000000);
+       return (cfs_duration_t)t;
+#else
+       return (nano * HZ / ONE_BILLION);
+#endif
+}
+#endif
+
+static inline cfs_duration_t cfs_time_seconds(int seconds)
+{
+       return ((cfs_duration_t)seconds) * HZ;
+}
+
+static inline time_t cfs_duration_sec(cfs_duration_t d)
+{
+       return d / HZ;
+}
+
+static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
+{
+#if (BITS_PER_LONG == 32) && (HZ > 4096)
+       __u64 t;
+
+       s->tv_sec = d / HZ;
+       t = (d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION;
+       do_div(t, HZ);
+       s->tv_usec = t;
+#else
+       s->tv_sec = d / HZ;
+       s->tv_usec = ((d - (cfs_duration_t)s->tv_sec * HZ) * \
+               ONE_MILLION) / HZ;
+#endif
+}
+
+static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
+{
+#if (BITS_PER_LONG == 32)
+       __u64 t;
+
+       s->tv_sec = d / HZ;
+       t = (d - s->tv_sec * HZ) * ONE_BILLION;
+       do_div(t, HZ);
+       s->tv_nsec = t;
+#else
+       s->tv_sec = d / HZ;
+       s->tv_nsec = ((d - s->tv_sec * HZ) * ONE_BILLION) / HZ;
+#endif
+}
+
+#define cfs_time_current_64 get_jiffies_64
+
+static inline __u64 cfs_time_add_64(__u64 t, __u64 d)
+{
+       return t + d;
+}
+
+static inline __u64 cfs_time_shift_64(int seconds)
+{
+       return cfs_time_add_64(cfs_time_current_64(),
+                              cfs_time_seconds(seconds));
+}
+
+static inline int cfs_time_before_64(__u64 t1, __u64 t2)
+{
+       return (__s64)t2 - (__s64)t1 > 0;
+}
+
+static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2)
+{
+       return (__s64)t2 - (__s64)t1 >= 0;
+}
+
+
+/*
+ * One jiffy
+ */
+#define CFS_TICK               (1)
+
+#define CFS_TIME_T           "%lu"
+#define CFS_DURATION_T   "%ld"
+
+
+#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
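
A hedged usage sketch of the jiffies-based helpers declared above; the 5-second timeout, the done() callback, and the polling loop are invented for illustration and are not part of this patch.

/* Hypothetical deadline handling built on the helpers above (sketch only). */
static int example_wait_deadline(int (*done)(void))
{
        cfs_duration_t timeout  = cfs_time_seconds(5);          /* 5 s in jiffies */
        cfs_time_t     deadline = cfs_time_current() + timeout; /* plain addition; cfs_time_add() is declared elsewhere */
        struct timeval tv;

        cfs_duration_usec(timeout, &tv);        /* e.g. { .tv_sec = 5, .tv_usec = 0 } */

        while (cfs_time_before(cfs_time_current(), deadline)) {
                if (done())
                        return 0;
                /* sleep or reschedule here */
        }
        return -ETIMEDOUT;
}
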
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h
new file mode 100644 (file)
index 0000000..1423949
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-types.h
+ */
+#include <linux/types.h>
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h b/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h
new file mode 100644 (file)
index 0000000..2b94872
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_PORTALS_COMPAT_H__
+#define __LIBCFS_LINUX_PORTALS_COMPAT_H__
+
+// XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved
+#if defined(SPINLOCK_DEBUG) && SPINLOCK_DEBUG
+#  define SIGNAL_MASK_ASSERT() \
+   LASSERT(current->sighand->siglock.magic == SPINLOCK_MAGIC)
+#else
+# define SIGNAL_MASK_ASSERT()
+#endif
+// XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved
+
+#define SIGNAL_MASK_LOCK(task, flags)                            \
+       spin_lock_irqsave(&task->sighand->siglock, flags)
+#define SIGNAL_MASK_UNLOCK(task, flags)                                \
+       spin_unlock_irqrestore(&task->sighand->siglock, flags)
+#define USERMODEHELPER(path, argv, envp)                              \
+       call_usermodehelper(path, argv, envp, 1)
+#define clear_tsk_thread_flag(current, TIF_SIGPENDING)   clear_tsk_thread_flag(current,       \
+                                                       TIF_SIGPENDING)
+# define smp_num_cpus        num_online_cpus()
+
+#define cfs_wait_event_interruptible(wq, condition, ret)              \
+       ret = wait_event_interruptible(wq, condition)
+#define cfs_wait_event_interruptible_exclusive(wq, condition, ret)     \
+       ret = wait_event_interruptible_exclusive(wq, condition)
+
+#define THREAD_NAME(comm, len, fmt, a...)                            \
+       snprintf(comm, len, fmt, ## a)
+
+/* 2.6 alloc_page users can use page->lru */
+#define PAGE_LIST_ENTRY lru
+#define PAGE_LIST(page) ((page)->lru)
+
+#ifndef __user
+#define __user
+#endif
+
+#ifndef __fls
+#define __cfs_fls fls
+#else
+#define __cfs_fls __fls
+#endif
+
+#define ll_proc_dointvec(table, write, filp, buffer, lenp, ppos)       \
+       proc_dointvec(table, write, buffer, lenp, ppos);
+
+#define ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos)      \
+       proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+#define ll_proc_dostring(table, write, filp, buffer, lenp, ppos)       \
+       proc_dostring(table, write, buffer, lenp, ppos);
+#define LL_PROC_PROTO(name)                                         \
+       name(ctl_table_t *table, int write,                   \
+            void __user *buffer, size_t *lenp, loff_t *ppos)
+#define DECLARE_LL_PROC_PPOS_DECL
+
+/* helper for sysctl handlers */
+int proc_call_handler(void *data, int write,
+                     loff_t *ppos, void *buffer, size_t *lenp,
+                     int (*handler)(void *data, int write,
+                                    loff_t pos, void *buffer, int len));
+/*
+ * CPU
+ */
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#ifdef NR_CPUS
+#else
+#define NR_CPUS     1
+#endif
+
+#define cfs_set_cpus_allowed(t, mask)  set_cpus_allowed(t, mask)
+
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t)
+
+#endif /* __LIBCFS_LINUX_PORTALS_COMPAT_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/lucache.h b/drivers/staging/lustre/include/linux/libcfs/lucache.h
new file mode 100644 (file)
index 0000000..7ae36fc
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUCACHE_H
+#define _LUCACHE_H
+
+#include <linux/libcfs/libcfs.h>
+
+/** \defgroup ucache ucache
+ *
+ * @{
+ */
+
+#define UC_CACHE_NEW       0x01
+#define UC_CACHE_ACQUIRING      0x02
+#define UC_CACHE_INVALID       0x04
+#define UC_CACHE_EXPIRED       0x08
+
+#define UC_CACHE_IS_NEW(i)       ((i)->ue_flags & UC_CACHE_NEW)
+#define UC_CACHE_IS_INVALID(i)      ((i)->ue_flags & UC_CACHE_INVALID)
+#define UC_CACHE_IS_ACQUIRING(i)    ((i)->ue_flags & UC_CACHE_ACQUIRING)
+#define UC_CACHE_IS_EXPIRED(i)      ((i)->ue_flags & UC_CACHE_EXPIRED)
+#define UC_CACHE_IS_VALID(i)   ((i)->ue_flags == 0)
+
+#define UC_CACHE_SET_NEW(i)     (i)->ue_flags |= UC_CACHE_NEW
+#define UC_CACHE_SET_INVALID(i)     (i)->ue_flags |= UC_CACHE_INVALID
+#define UC_CACHE_SET_ACQUIRING(i)   (i)->ue_flags |= UC_CACHE_ACQUIRING
+#define UC_CACHE_SET_EXPIRED(i)     (i)->ue_flags |= UC_CACHE_EXPIRED
+#define UC_CACHE_SET_VALID(i)       (i)->ue_flags = 0
+
+#define UC_CACHE_CLEAR_NEW(i)       (i)->ue_flags &= ~UC_CACHE_NEW
+#define UC_CACHE_CLEAR_ACQUIRING(i) (i)->ue_flags &= ~UC_CACHE_ACQUIRING
+#define UC_CACHE_CLEAR_INVALID(i)   (i)->ue_flags &= ~UC_CACHE_INVALID
+#define UC_CACHE_CLEAR_EXPIRED(i)   (i)->ue_flags &= ~UC_CACHE_EXPIRED
+
+struct upcall_cache_entry;
+
+struct md_perm {
+       lnet_nid_t      mp_nid;
+       __u32      mp_perm;
+};
+
+struct md_identity {
+       struct upcall_cache_entry *mi_uc_entry;
+       uid_t                 mi_uid;
+       gid_t                 mi_gid;
+       group_info_t      *mi_ginfo;
+       int                     mi_nperms;
+       struct md_perm      *mi_perms;
+};
+
+struct upcall_cache_entry {
+       struct list_head              ue_hash;
+       __u64              ue_key;
+       atomic_t            ue_refcount;
+       int                  ue_flags;
+       wait_queue_head_t            ue_waitq;
+       cfs_time_t            ue_acquire_expire;
+       cfs_time_t            ue_expire;
+       union {
+               struct md_identity     identity;
+       } u;
+};
+
+#define UC_CACHE_HASH_SIZE     (128)
+#define UC_CACHE_HASH_INDEX(id)   ((id) & (UC_CACHE_HASH_SIZE - 1))
+#define UC_CACHE_UPCALL_MAXPATH   (1024UL)
+
+struct upcall_cache;
+
+struct upcall_cache_ops {
+       void        (*init_entry)(struct upcall_cache_entry *, void *args);
+       void        (*free_entry)(struct upcall_cache *,
+                                     struct upcall_cache_entry *);
+       int          (*upcall_compare)(struct upcall_cache *,
+                                         struct upcall_cache_entry *,
+                                         __u64 key, void *args);
+       int          (*downcall_compare)(struct upcall_cache *,
+                                           struct upcall_cache_entry *,
+                                           __u64 key, void *args);
+       int          (*do_upcall)(struct upcall_cache *,
+                                    struct upcall_cache_entry *);
+       int          (*parse_downcall)(struct upcall_cache *,
+                                         struct upcall_cache_entry *, void *);
+};
+
+struct upcall_cache {
+       struct list_head                uc_hashtable[UC_CACHE_HASH_SIZE];
+       spinlock_t              uc_lock;
+       rwlock_t                uc_upcall_rwlock;
+
+       char                    uc_name[40];            /* for upcall */
+       char                    uc_upcall[UC_CACHE_UPCALL_MAXPATH];
+       int                     uc_acquire_expire;      /* seconds */
+       int                     uc_entry_expire;        /* seconds */
+       struct upcall_cache_ops *uc_ops;
+};
+
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache,
+                                                 __u64 key, void *args);
+void upcall_cache_put_entry(struct upcall_cache *cache,
+                           struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key,
+                         void *args);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args);
+struct upcall_cache *upcall_cache_init(const char *name, const char *upcall,
+                                      struct upcall_cache_ops *ops);
+void upcall_cache_cleanup(struct upcall_cache *cache);
+
+#if 0
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *hash,
+                                                 __u64 key, __u32 primary,
+                                                 __u32 ngroups, __u32 *groups);
+void upcall_cache_put_entry(struct upcall_cache *hash,
+                           struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *hash, __u32 err, __u64 key,
+                         __u32 primary, __u32 ngroups, __u32 *groups);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+struct upcall_cache *upcall_cache_init(const char *name);
+void upcall_cache_cleanup(struct upcall_cache *hash);
+
+#endif
+
+/** @} ucache */
+
+#endif /* _LUCACHE_H */
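
As a rough illustration of how the declarations above fit together: a cache would be created with upcall_cache_init(name, upcall, ops), where ops is a filled-in struct upcall_cache_ops, and entries are then looked up and released as in the hedged sketch below. The function, the cache pointer, and the error convention are assumptions, not code from this patch.

/* Hypothetical consumer of the upcall cache (sketch only). */
static struct upcall_cache *example_cache;      /* set up via upcall_cache_init() */

static int example_lookup_identity(__u64 uid)
{
        struct upcall_cache_entry *entry;

        entry = upcall_cache_get_entry(example_cache, uid, NULL);
        if (entry == NULL)                      /* error convention assumed */
                return -ENOENT;

        /* ... inspect entry->u.identity here ... */

        upcall_cache_put_entry(example_cache, entry);
        return 0;
}
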
diff --git a/drivers/staging/lustre/include/linux/libcfs/params_tree.h b/drivers/staging/lustre/include/linux/libcfs/params_tree.h
new file mode 100644 (file)
index 0000000..6551f4b
--- /dev/null
@@ -0,0 +1,230 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * API and structure definitions for params_tree.
+ *
+ * Author: LiuYing <emoly.liu@oracle.com>
+ */
+#ifndef __PARAMS_TREE_H__
+#define __PARAMS_TREE_H__
+
+#include <linux/libcfs/libcfs.h>
+
+#undef LPROCFS
+#if  defined(CONFIG_PROC_FS)
+# define LPROCFS
+#endif
+
+#ifdef LPROCFS
+typedef struct file                         cfs_param_file_t;
+typedef struct inode                       cfs_inode_t;
+typedef struct proc_inode                     cfs_proc_inode_t;
+typedef struct seq_file                         cfs_seq_file_t;
+typedef struct seq_operations             cfs_seq_ops_t;
+typedef struct file_operations           cfs_param_file_ops_t;
+typedef module_t                          *cfs_param_module_t;
+typedef struct proc_dir_entry             cfs_param_dentry_t;
+typedef struct poll_table_struct               cfs_poll_table_t;
+#define CFS_PARAM_MODULE                       THIS_MODULE
+#define CFS_PDE(value)                   PDE(value)
+#define cfs_file_private(file)           (file->private_data)
+#define cfs_dentry_data(dentry)                 (dentry->data)
+#define cfs_proc_inode_pde(proc_inode)   (proc_inode->pde)
+#define cfs_proc_inode(proc_inode)           (proc_inode->vfs_inode)
+#define cfs_seq_read_common                 seq_read
+#define cfs_seq_lseek_common               seq_lseek
+#define cfs_seq_private(seq)               (seq->private)
+#define cfs_seq_printf(seq, format, ...)       seq_printf(seq, format,  \
+                                                          ## __VA_ARGS__)
+#define cfs_seq_release(inode, file)       seq_release(inode, file)
+#define cfs_seq_puts(seq, s)               seq_puts(seq, s)
+#define cfs_seq_putc(seq, s)               seq_putc(seq, s)
+#define cfs_seq_read(file, buf, count, ppos, rc) (rc = seq_read(file, buf, \
+                                                           count, ppos))
+#define cfs_seq_open(file, ops, rc)         (rc = seq_open(file, ops))
+
+/* in lprocfs_stat.c, to protect the private data for proc entries */
+extern struct rw_semaphore             _lprocfs_lock;
+
+/* Starting with 2.6.23, Linux defines its own file_operations
+ * (proc_reg_file_ops) in procfs; the proc file_operations defined by Lustre
+ * (lprocfs_generic_fops) are wrapped into proc_reg_file_ops, which introduces
+ * a user count in proc_dir_entry (pde_users) to protect the proc entry from
+ * being deleted. The protection lock (_lprocfs_lock) defined by Lustre is
+ * therefore no longer necessary for lprocfs_generic_fops (e.g.
+ * lprocfs_fops_read). See bug 19706 for details.
+ */
+#define LPROCFS_ENTRY() do{ }while(0)
+#define LPROCFS_EXIT()  do{ }while(0)
+
+static inline
+int LPROCFS_ENTRY_AND_CHECK(struct proc_dir_entry *dp)
+{
+       int deleted = 0;
+
+       spin_lock(&(dp)->pde_unload_lock);
+       if (dp->proc_fops == NULL)
+               deleted = 1;
+       spin_unlock(&(dp)->pde_unload_lock);
+       if (deleted)
+               return -ENODEV;
+       return 0;
+}
+#define LPROCFS_SRCH_ENTRY()       \
+do {                               \
+       down_read(&_lprocfs_lock);      \
+} while(0)
+
+#define LPROCFS_SRCH_EXIT()         \
+do {                               \
+       up_read(&_lprocfs_lock);        \
+} while(0)
+
+#define LPROCFS_WRITE_ENTRY()          \
+do {                                   \
+       down_write(&_lprocfs_lock);     \
+} while(0)
+
+#define LPROCFS_WRITE_EXIT()           \
+do {                                   \
+       up_write(&_lprocfs_lock);       \
+} while(0)
+#else /* !LPROCFS */
+
+typedef struct cfs_params_file {
+       void       *param_private;
+       loff_t    param_pos;
+       unsigned int    param_flags;
+} cfs_param_file_t;
+
+typedef struct cfs_param_inode {
+       void    *param_private;
+} cfs_inode_t;
+
+typedef struct cfs_param_dentry {
+       void *param_data;
+} cfs_param_dentry_t;
+
+typedef struct cfs_proc_inode {
+       cfs_param_dentry_t *param_pde;
+       cfs_inode_t      param_inode;
+} cfs_proc_inode_t;
+
+struct cfs_seq_operations;
+typedef struct cfs_seq_file {
+       char                  *buf;
+       size_t               size;
+       size_t               from;
+       size_t               count;
+       loff_t               index;
+       loff_t               version;
+       struct mutex                    lock;
+       struct cfs_seq_operations *op;
+       void                  *private;
+} cfs_seq_file_t;
+
+typedef struct cfs_seq_operations {
+       void *(*start) (cfs_seq_file_t *m, loff_t *pos);
+       void  (*stop) (cfs_seq_file_t *m, void *v);
+       void *(*next) (cfs_seq_file_t *m, void *v, loff_t *pos);
+       int   (*show) (cfs_seq_file_t *m, void *v);
+} cfs_seq_ops_t;
+
+typedef void *cfs_param_module_t;
+typedef void *cfs_poll_table_t;
+
+typedef struct cfs_param_file_ops {
+       cfs_param_module_t owner;
+       int (*open) (cfs_inode_t *, struct file *);
+       loff_t (*llseek)(struct file *, loff_t, int);
+       int (*release) (cfs_inode_t *, cfs_param_file_t *);
+       unsigned int (*poll) (struct file *, cfs_poll_table_t *);
+       ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
+       ssize_t (*read)(struct file *, char *, size_t, loff_t *);
+} cfs_param_file_ops_t;
+typedef cfs_param_file_ops_t *cfs_lproc_filep_t;
+
+static inline cfs_proc_inode_t *FAKE_PROC_I(const cfs_inode_t *inode)
+{
+       return container_of(inode, cfs_proc_inode_t, param_inode);
+}
+
+static inline cfs_param_dentry_t *FAKE_PDE(cfs_inode_t *inode)
+{
+       return FAKE_PROC_I(inode)->param_pde;
+}
+
+#define CFS_PARAM_MODULE                       NULL
+#define CFS_PDE(value)                   FAKE_PDE(value)
+#define cfs_file_private(file)           (file->param_private)
+#define cfs_dentry_data(dentry)                 (dentry->param_data)
+#define cfs_proc_inode(proc_inode)           (proc_inode->param_inode)
+#define cfs_proc_inode_pde(proc_inode)   (proc_inode->param_pde)
+#define cfs_seq_read_common                 NULL
+#define cfs_seq_lseek_common               NULL
+#define cfs_seq_private(seq)               (seq->private)
+#define cfs_seq_read(file, buf, count, ppos, rc) do {} while(0)
+#define cfs_seq_open(file, ops, rc)                 \
+do {                                               \
+        cfs_seq_file_t *p = cfs_file_private(file);    \
+        if (!p) {                                    \
+               LIBCFS_ALLOC(p, sizeof(*p));        \
+               if (!p) {                              \
+                       rc = -ENOMEM;              \
+                       break;                    \
+               }                                      \
+               cfs_file_private(file) = p;          \
+       }                                              \
+       memset(p, 0, sizeof(*p));                      \
+       p->op = ops;                                \
+       rc = 0;                                  \
+} while(0)
+
+#define LPROCFS_ENTRY()             do {} while(0)
+#define LPROCFS_EXIT()       do {} while(0)
+static inline
+int LPROCFS_ENTRY_AND_CHECK(cfs_param_dentry_t *dp)
+{
+       LPROCFS_ENTRY();
+       return 0;
+}
+#define LPROCFS_WRITE_ENTRY()       do {} while(0)
+#define LPROCFS_WRITE_EXIT()   do {} while(0)
+
+#endif /* LPROCFS */
+
+/* XXX: params_tree APIs */
+
+#endif  /* __PARAMS_TREE_H__ */
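
Under LPROCFS the cfs_seq_* wrappers above resolve to the kernel seq_file API, so a single-record proc handler written against them would look roughly like the following hedged sketch; the counter and all example_* names are invented and not part of this patch.

/* Hypothetical single-record seq_file handler using the wrappers above. */
static void *example_stats_seq_start(cfs_seq_file_t *m, loff_t *pos)
{
        return *pos == 0 ? (void *)1 : NULL;    /* one record only */
}

static void *example_stats_seq_next(cfs_seq_file_t *m, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void example_stats_seq_stop(cfs_seq_file_t *m, void *v)
{
}

static int example_stats_seq_show(cfs_seq_file_t *m, void *v)
{
        cfs_seq_printf(m, "hits: %lu\n",
                       *(unsigned long *)cfs_seq_private(m));
        return 0;
}

static cfs_seq_ops_t example_stats_seq_ops = {
        .start = example_stats_seq_start,
        .stop  = example_stats_seq_stop,
        .next  = example_stats_seq_next,
        .show  = example_stats_seq_show,
};
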
diff --git a/drivers/staging/lustre/include/linux/lnet/api-support.h b/drivers/staging/lustre/include/linux/lnet/api-support.h
new file mode 100644 (file)
index 0000000..a8d91db
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_API_SUPPORT_H__
+#define __LNET_API_SUPPORT_H__
+
+#include <linux/lnet/linux/api-support.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+#include <linux/lnet/lnet.h>
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/api.h b/drivers/staging/lustre/include/linux/lnet/api.h
new file mode 100644 (file)
index 0000000..e8642e3
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_API_H__
+#define __LNET_API_H__
+
+/** \defgroup lnet LNet
+ *
+ * The Lustre Networking subsystem.
+ *
+ * LNet is an asynchronous message-passing API that provides an unreliable,
+ * connectionless service with no ordering guarantees. It supports OFA IB,
+ * TCP/IP, and Cray Portals, and routes between heterogeneous networks.
+ *
+ * LNet can run both in OS kernel space and in userspace as a library.
+ * @{
+ */
+
+#include <linux/lnet/types.h>
+
+/** \defgroup lnet_init_fini Initialization and cleanup
+ * The LNet must be properly initialized before any LNet calls can be made.
+ * @{ */
+int LNetInit(void);
+void LNetFini(void);
+
+int LNetNIInit(lnet_pid_t requested_pid);
+int LNetNIFini(void);
+/** @} lnet_init_fini */
+
+/** \defgroup lnet_addr LNet addressing and basic types
+ *
+ * Addressing scheme and basic data types of LNet.
+ *
+ * The LNet API is memory-oriented, so LNet must be able to address not only
+ * end-points but also memory regions within a process address space.
+ * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process
+ * in a node. A portal represents an opening in the address space of a
+ * process. Match bits are the criteria used to identify a region of memory
+ * inside a portal, and an offset specifies a position within that region.
+ *
+ * LNet creates a table of portals for each process during initialization.
+ * This table has MAX_PORTALS entries and its size can't be dynamically
+ * changed. A portal stays empty until the owning process starts to add
+ * memory regions to it. A portal is sometimes called an index because
+ * it's an entry in the portals table of a process.
+ *
+ * \see LNetMEAttach
+ * @{ */
+int LNetGetId(unsigned int index, lnet_process_id_t *id);
+int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order);
+void LNetSnprintHandle(char *str, int str_len, lnet_handle_any_t handle);
+
+/** @} lnet_addr */
+
+
+/** \defgroup lnet_me Match entries
+ *
+ * A match entry (abbreviated as ME) describes a set of criteria to accept
+ * incoming requests.
+ *
+ * A portal is essentially a match list plus a set of attributes. A match
+ * list is a chain of MEs. Each ME includes a pointer to a memory descriptor
+ * and a set of match criteria. The match criteria can be used to reject
+ * incoming requests based on process ID or the match bits provided in the
+ * request. MEs can be dynamically inserted into a match list by LNetMEAttach()
+ * and LNetMEInsert(), and removed from its list by LNetMEUnlink().
+ * @{ */
+int LNetMEAttach(unsigned int      portal,
+                lnet_process_id_t match_id_in,
+                __u64       match_bits_in,
+                __u64       ignore_bits_in,
+                lnet_unlink_t     unlink_in,
+                lnet_ins_pos_t    pos_in,
+                lnet_handle_me_t *handle_out);
+
+int LNetMEInsert(lnet_handle_me_t  current_in,
+                lnet_process_id_t match_id_in,
+                __u64       match_bits_in,
+                __u64       ignore_bits_in,
+                lnet_unlink_t     unlink_in,
+                lnet_ins_pos_t    position_in,
+                lnet_handle_me_t *handle_out);
+
+int LNetMEUnlink(lnet_handle_me_t current_in);
+/** @} lnet_me */
+
+/** \defgroup lnet_md Memory descriptors
+ *
+ * A memory descriptor contains information about a region of a user's
+ * memory (either in kernel or user space) and optionally points to an
+ * event queue where information about the operations performed on the
+ * memory descriptor is recorded. "Memory descriptor" is abbreviated as
+ * MD and is used interchangeably with the memory region it describes.
+ *
+ * The LNet API provides two operations to create MDs: LNetMDAttach()
+ * and LNetMDBind(); one operation to unlink and release the resources
+ * associated with an MD: LNetMDUnlink().
+ * @{ */
+int LNetMDAttach(lnet_handle_me_t  current_in,
+                lnet_md_t       md_in,
+                lnet_unlink_t     unlink_in,
+                lnet_handle_md_t *handle_out);
+
+int LNetMDBind(lnet_md_t        md_in,
+              lnet_unlink_t     unlink_in,
+              lnet_handle_md_t *handle_out);
+
+int LNetMDUnlink(lnet_handle_md_t md_in);
+/** @} lnet_md */
+
+/** \defgroup lnet_eq Events and event queues
+ *
+ * Event queues (abbreviated as EQ) are used to log operations performed on
+ * local MDs. In particular, they signal the completion of a data transmission
+ * into or out of an MD. They can also be used to hold acknowledgments for
+ * completed PUT operations and indicate when an MD has been unlinked. Multiple
+ * MDs can share a single EQ. An EQ may have an optional event handler
+ * associated with it. If an event handler exists, it will be run for each
+ * event that is deposited into the EQ.
+ *
+ * In addition to the lnet_handle_eq_t, the LNet API defines two types
+ * associated with events: The ::lnet_event_kind_t defines the kinds of events
+ * that can be stored in an EQ. The lnet_event_t defines a structure that
+ * holds the information about an event.
+ *
+ * There are five functions for dealing with EQs: LNetEQAlloc() is used to
+ * create an EQ and allocate the resources needed, while LNetEQFree()
+ * releases these resources and frees the EQ. LNetEQGet() retrieves the next
+ * event from an EQ, and LNetEQWait() can be used to block a process until
+ * an EQ has at least one event. LNetEQPoll() can be used to test or wait
+ * on multiple EQs.
+ * @{ */
+int LNetEQAlloc(unsigned int       count_in,
+               lnet_eq_handler_t  handler,
+               lnet_handle_eq_t  *handle_out);
+
+int LNetEQFree(lnet_handle_eq_t eventq_in);
+
+int LNetEQGet(lnet_handle_eq_t  eventq_in,
+             lnet_event_t     *event_out);
+
+
+int LNetEQWait(lnet_handle_eq_t  eventq_in,
+              lnet_event_t     *event_out);
+
+int LNetEQPoll(lnet_handle_eq_t *eventqs_in,
+              int             neq_in,
+              int             timeout_ms,
+              lnet_event_t     *event_out,
+              int            *which_eq_out);
+/** @} lnet_eq */
+
+/** \defgroup lnet_data Data movement operations
+ *
+ * The LNet API provides two data movement operations: LNetPut()
+ * and LNetGet().
+ * @{ */
+int LNetPut(lnet_nid_t self,
+           lnet_handle_md_t  md_in,
+           lnet_ack_req_t    ack_req_in,
+           lnet_process_id_t target_in,
+           unsigned int      portal_in,
+           __u64            match_bits_in,
+           unsigned int      offset_in,
+           __u64            hdr_data_in);
+
+int LNetGet(lnet_nid_t self,
+           lnet_handle_md_t  md_in,
+           lnet_process_id_t target_in,
+           unsigned int      portal_in,
+           __u64            match_bits_in,
+           unsigned int      offset_in);
+/** @} lnet_data */
+
+
+/** \defgroup lnet_misc Miscellaneous operations.
+ * Miscellaneous operations.
+ * @{ */
+
+int LNetSetLazyPortal(int portal);
+int LNetClearLazyPortal(int portal);
+int LNetCtl(unsigned int cmd, void *arg);
+int LNetSetAsync(lnet_process_id_t id, int nasync);
+
+/** @} lnet_misc */
+
+/** @} lnet */
+#endif
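
To show how the groups documented above connect, here is a hedged sender-side sketch using only prototypes from this header; the portal number, match bits, lnet_md_t field names, and constants such as LNET_UNLINK, LNET_ACK_REQ, and LNET_NID_ANY come from lnet/types.h and are assumed here rather than quoted from it. A receiver would have posted a matching buffer on the same portal and match bits with LNetMEAttach() followed by LNetMDAttach().

/* Hypothetical sender-side PUT (sketch only; cleanup omitted). */
static int example_put(lnet_process_id_t peer, void *buf, unsigned int len)
{
        lnet_handle_eq_t eqh;
        lnet_handle_md_t mdh;
        lnet_md_t        md;
        int              rc;

        rc = LNetEQAlloc(64, NULL, &eqh);       /* polled EQ, no handler */
        if (rc != 0)
                return rc;

        memset(&md, 0, sizeof(md));             /* field names assumed from lnet/types.h */
        md.start     = buf;
        md.length    = len;
        md.threshold = 2;                       /* one SEND + one ACK event */
        md.eq_handle = eqh;

        rc = LNetMDBind(md, LNET_UNLINK, &mdh);
        if (rc != 0)
                return rc;

        /* One-sided PUT to portal 12, match bits 0x1 (both illustrative). */
        return LNetPut(LNET_NID_ANY /* let LNet pick the source NID */, mdh,
                       LNET_ACK_REQ, peer, 12, 0x1ULL, 0, 0);
        /* Completion events would then be drained with LNetEQGet()/LNetEQWait(). */
}
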
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
new file mode 100644 (file)
index 0000000..59bff0b
--- /dev/null
@@ -0,0 +1,874 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lib-lnet.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef __LNET_LIB_LNET_H__
+#define __LNET_LIB_LNET_H__
+
+#include <linux/lnet/linux/lib-lnet.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+
+extern lnet_t  the_lnet;                       /* THE network */
+
+#if  defined(LNET_USE_LIB_FREELIST)
+/* 1 CPT, simplify implementation... */
+# define LNET_CPT_MAX_BITS      0
+
+#else /* KERNEL and no freelist */
+
+# if (BITS_PER_LONG == 32)
+/* 2 CPTs; allowing more CPTs could put us under memory pressure */
+#  define LNET_CPT_MAX_BITS     1
+
+# else /* 64-bit system */
+/*
+ * 256 CPTs for thousands of CPUs; allowing more CPTs could put us at
+ * risk of consuming all lh_cookie values.
+ */
+#  define LNET_CPT_MAX_BITS     8
+# endif /* BITS_PER_LONG == 32 */
+#endif
+
+/* max allowed CPT number */
+#define LNET_CPT_MAX       (1 << LNET_CPT_MAX_BITS)
+
+#define LNET_CPT_NUMBER         (the_lnet.ln_cpt_number)
+#define LNET_CPT_BITS     (the_lnet.ln_cpt_bits)
+#define LNET_CPT_MASK     ((1ULL << LNET_CPT_BITS) - 1)
+
+/** exclusive lock */
+#define LNET_LOCK_EX       CFS_PERCPT_LOCK_EX
+
+static inline int lnet_is_wire_handle_none (lnet_handle_wire_t *wh)
+{
+       return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE &&
+               wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE);
+}
+
+static inline int lnet_md_exhausted (lnet_libmd_t *md)
+{
+       return (md->md_threshold == 0 ||
+               ((md->md_options & LNET_MD_MAX_SIZE) != 0 &&
+                md->md_offset + md->md_max_size > md->md_length));
+}
+
+static inline int lnet_md_unlinkable (lnet_libmd_t *md)
+{
+       /* Should unlink md when its refcount is 0 and either:
+        *  - md has been flagged for deletion (by auto unlink or LNetM[DE]Unlink,
+        *    in the latter case md may not be exhausted).
+        *  - auto unlink is on and md is exhausted.
+        */
+       if (md->md_refcount != 0)
+               return 0;
+
+       if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0)
+               return 1;
+
+       return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 &&
+               lnet_md_exhausted(md));
+}
+
+#define lnet_cpt_table()       (the_lnet.ln_cpt_table)
+#define lnet_cpt_current()     cfs_cpt_current(the_lnet.ln_cpt_table, 1)
+
+static inline int
+lnet_cpt_of_cookie(__u64 cookie)
+{
+       unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK;
+
+       /* LNET_CPT_NUMBER doesn't have to be a power of 2, which means we
+        * can get an illegal cpt from an invalid cookie */
+       return cpt < LNET_CPT_NUMBER ? cpt : cpt % LNET_CPT_NUMBER;
+}
+
+static inline void
+lnet_res_lock(int cpt)
+{
+       cfs_percpt_lock(the_lnet.ln_res_lock, cpt);
+}
+
+static inline void
+lnet_res_unlock(int cpt)
+{
+       cfs_percpt_unlock(the_lnet.ln_res_lock, cpt);
+}
+
+static inline int
+lnet_res_lock_current(void)
+{
+       int cpt = lnet_cpt_current();
+
+       lnet_res_lock(cpt);
+       return cpt;
+}
+
+static inline void
+lnet_net_lock(int cpt)
+{
+       cfs_percpt_lock(the_lnet.ln_net_lock, cpt);
+}
+
+static inline void
+lnet_net_unlock(int cpt)
+{
+       cfs_percpt_unlock(the_lnet.ln_net_lock, cpt);
+}
+
+static inline int
+lnet_net_lock_current(void)
+{
+       int cpt = lnet_cpt_current();
+
+       lnet_net_lock(cpt);
+       return cpt;
+}
+
+#define LNET_LOCK()            lnet_net_lock(LNET_LOCK_EX)
+#define LNET_UNLOCK()          lnet_net_unlock(LNET_LOCK_EX)
+
+
+#define lnet_ptl_lock(ptl)     spin_lock(&(ptl)->ptl_lock)
+#define lnet_ptl_unlock(ptl)   spin_unlock(&(ptl)->ptl_lock)
+#define lnet_eq_wait_lock()    spin_lock(&the_lnet.ln_eq_wait_lock)
+#define lnet_eq_wait_unlock()  spin_unlock(&the_lnet.ln_eq_wait_lock)
+#define lnet_ni_lock(ni)       spin_lock(&(ni)->ni_lock)
+#define lnet_ni_unlock(ni)     spin_unlock(&(ni)->ni_lock)
+#define LNET_MUTEX_LOCK(m)     mutex_lock(m)
+#define LNET_MUTEX_UNLOCK(m)   mutex_unlock(m)
+
+
+#define MAX_PORTALS     64
+
+/* these are only used by code with LNET_USE_LIB_FREELIST, but we still
+ * export them to !LNET_USE_LIB_FREELIST to keep the implementation simple */
+#define LNET_FL_MAX_MES                2048
+#define LNET_FL_MAX_MDS                2048
+#define LNET_FL_MAX_EQS                512
+#define LNET_FL_MAX_MSGS       2048    /* Outstanding messages */
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int lnet_freelist_init(lnet_freelist_t *fl, int n, int size);
+void lnet_freelist_fini(lnet_freelist_t *fl);
+
+static inline void *
+lnet_freelist_alloc (lnet_freelist_t *fl)
+{
+       /* ALWAYS called with liblock held */
+       lnet_freeobj_t *o;
+
+       if (list_empty (&fl->fl_list))
+               return (NULL);
+
+       o = list_entry (fl->fl_list.next, lnet_freeobj_t, fo_list);
+       list_del (&o->fo_list);
+       return ((void *)&o->fo_contents);
+}
+
+static inline void
+lnet_freelist_free (lnet_freelist_t *fl, void *obj)
+{
+       /* ALWAYS called with liblock held */
+       lnet_freeobj_t *o = list_entry (obj, lnet_freeobj_t, fo_contents);
+
+       list_add (&o->fo_list, &fl->fl_list);
+}
+
+
+static inline lnet_eq_t *
+lnet_eq_alloc (void)
+{
+       /* NEVER called with resource lock held */
+       struct lnet_res_container *rec = &the_lnet.ln_eq_container;
+       lnet_eq_t                 *eq;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_res_lock(0);
+       eq = (lnet_eq_t *)lnet_freelist_alloc(&rec->rec_freelist);
+       lnet_res_unlock(0);
+
+       return eq;
+}
+
+static inline void
+lnet_eq_free_locked(lnet_eq_t *eq)
+{
+       /* ALWAYS called with resource lock held */
+       struct lnet_res_container *rec = &the_lnet.ln_eq_container;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+       lnet_freelist_free(&rec->rec_freelist, eq);
+}
+
+static inline void
+lnet_eq_free(lnet_eq_t *eq)
+{
+       lnet_res_lock(0);
+       lnet_eq_free_locked(eq);
+       lnet_res_unlock(0);
+}
+
+static inline lnet_libmd_t *
+lnet_md_alloc (lnet_md_t *umd)
+{
+       /* NEVER called with resource lock held */
+       struct lnet_res_container *rec = the_lnet.ln_md_containers[0];
+       lnet_libmd_t              *md;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_res_lock(0);
+       md = (lnet_libmd_t *)lnet_freelist_alloc(&rec->rec_freelist);
+       lnet_res_unlock(0);
+
+       if (md != NULL)
+               INIT_LIST_HEAD(&md->md_list);
+
+       return md;
+}
+
+static inline void
+lnet_md_free_locked(lnet_libmd_t *md)
+{
+       /* ALWAYS called with resource lock held */
+       struct lnet_res_container *rec = the_lnet.ln_md_containers[0];
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+       lnet_freelist_free(&rec->rec_freelist, md);
+}
+
+static inline void
+lnet_md_free(lnet_libmd_t *md)
+{
+       lnet_res_lock(0);
+       lnet_md_free_locked(md);
+       lnet_res_unlock(0);
+}
+
+static inline lnet_me_t *
+lnet_me_alloc(void)
+{
+       /* NEVER called with resource lock held */
+       struct lnet_res_container *rec = the_lnet.ln_me_containers[0];
+       lnet_me_t                 *me;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_res_lock(0);
+       me = (lnet_me_t *)lnet_freelist_alloc(&rec->rec_freelist);
+       lnet_res_unlock(0);
+
+       return me;
+}
+
+static inline void
+lnet_me_free_locked(lnet_me_t *me)
+{
+       /* ALWAYS called with resource lock held */
+       struct lnet_res_container *rec = the_lnet.ln_me_containers[0];
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+       lnet_freelist_free(&rec->rec_freelist, me);
+}
+
+static inline void
+lnet_me_free(lnet_me_t *me)
+{
+       lnet_res_lock(0);
+       lnet_me_free_locked(me);
+       lnet_res_unlock(0);
+}
+
+static inline lnet_msg_t *
+lnet_msg_alloc (void)
+{
+       /* NEVER called with network lock held */
+       struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0];
+       lnet_msg_t                *msg;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_net_lock(0);
+       msg = (lnet_msg_t *)lnet_freelist_alloc(&msc->msc_freelist);
+       lnet_net_unlock(0);
+
+       if (msg != NULL) {
+               /* NULL pointers, clear flags etc */
+               memset(msg, 0, sizeof(*msg));
+       }
+       return msg;
+}
+
+static inline void
+lnet_msg_free_locked(lnet_msg_t *msg)
+{
+       /* ALWAYS called with network lock held */
+       struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0];
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+       LASSERT(!msg->msg_onactivelist);
+       lnet_freelist_free(&msc->msc_freelist, msg);
+}
+
+static inline void
+lnet_msg_free (lnet_msg_t *msg)
+{
+       lnet_net_lock(0);
+       lnet_msg_free_locked(msg);
+       lnet_net_unlock(0);
+}
+
+#else /* !LNET_USE_LIB_FREELIST */
+
+static inline lnet_eq_t *
+lnet_eq_alloc (void)
+{
+       /* NEVER called with liblock held */
+       lnet_eq_t *eq;
+
+       LIBCFS_ALLOC(eq, sizeof(*eq));
+       return (eq);
+}
+
+static inline void
+lnet_eq_free(lnet_eq_t *eq)
+{
+       /* ALWAYS called with resource lock held */
+       LIBCFS_FREE(eq, sizeof(*eq));
+}
+
+static inline lnet_libmd_t *
+lnet_md_alloc (lnet_md_t *umd)
+{
+       /* NEVER called with liblock held */
+       lnet_libmd_t *md;
+       unsigned int  size;
+       unsigned int  niov;
+
+       if ((umd->options & LNET_MD_KIOV) != 0) {
+               niov = umd->length;
+               size = offsetof(lnet_libmd_t, md_iov.kiov[niov]);
+       } else {
+               niov = ((umd->options & LNET_MD_IOVEC) != 0) ?
+                      umd->length : 1;
+               size = offsetof(lnet_libmd_t, md_iov.iov[niov]);
+       }
+
+       LIBCFS_ALLOC(md, size);
+
+       if (md != NULL) {
+               /* Set here in case of early free */
+               md->md_options = umd->options;
+               md->md_niov = niov;
+               INIT_LIST_HEAD(&md->md_list);
+       }
+
+       return (md);
+}
+
+static inline void
+lnet_md_free(lnet_libmd_t *md)
+{
+       /* ALWAYS called with resource lock held */
+       unsigned int  size;
+
+       if ((md->md_options & LNET_MD_KIOV) != 0)
+               size = offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]);
+       else
+               size = offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]);
+
+       LIBCFS_FREE(md, size);
+}
+
+static inline lnet_me_t *
+lnet_me_alloc (void)
+{
+       /* NEVER called with liblock held */
+       lnet_me_t *me;
+
+       LIBCFS_ALLOC(me, sizeof(*me));
+       return (me);
+}
+
+static inline void
+lnet_me_free(lnet_me_t *me)
+{
+       /* ALWAYS called with resource lock held */
+       LIBCFS_FREE(me, sizeof(*me));
+}
+
+static inline lnet_msg_t *
+lnet_msg_alloc(void)
+{
+       /* NEVER called with liblock held */
+       lnet_msg_t *msg;
+
+       LIBCFS_ALLOC(msg, sizeof(*msg));
+
+       /* no need to zero, LIBCFS_ALLOC does it for us */
+       return (msg);
+}
+
+static inline void
+lnet_msg_free(lnet_msg_t *msg)
+{
+       /* ALWAYS called with network lock held */
+       LASSERT(!msg->msg_onactivelist);
+       LIBCFS_FREE(msg, sizeof(*msg));
+}
+
+#define lnet_eq_free_locked(eq)                lnet_eq_free(eq)
+#define lnet_md_free_locked(md)                lnet_md_free(md)
+#define lnet_me_free_locked(me)                lnet_me_free(me)
+#define lnet_msg_free_locked(msg)      lnet_msg_free(msg)
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+lnet_libhandle_t *lnet_res_lh_lookup(struct lnet_res_container *rec,
+                                    __u64 cookie);
+void lnet_res_lh_initialize(struct lnet_res_container *rec,
+                           lnet_libhandle_t *lh);
+static inline void
+lnet_res_lh_invalidate(lnet_libhandle_t *lh)
+{
+       /* ALWAYS called with resource lock held */
+       /* NB: cookie is still useful, don't reset it */
+       list_del(&lh->lh_hash_chain);
+}
+
+static inline void
+lnet_eq2handle (lnet_handle_eq_t *handle, lnet_eq_t *eq)
+{
+       if (eq == NULL) {
+               LNetInvalidateHandle(handle);
+               return;
+       }
+
+       handle->cookie = eq->eq_lh.lh_cookie;
+}
+
+static inline lnet_eq_t *
+lnet_handle2eq(lnet_handle_eq_t *handle)
+{
+       /* ALWAYS called with resource lock held */
+       lnet_libhandle_t *lh;
+
+       lh = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie);
+       if (lh == NULL)
+               return NULL;
+
+       return lh_entry(lh, lnet_eq_t, eq_lh);
+}
+
+static inline void
+lnet_md2handle (lnet_handle_md_t *handle, lnet_libmd_t *md)
+{
+       handle->cookie = md->md_lh.lh_cookie;
+}
+
+static inline lnet_libmd_t *
+lnet_handle2md(lnet_handle_md_t *handle)
+{
+       /* ALWAYS called with resource lock held */
+       lnet_libhandle_t *lh;
+       int              cpt;
+
+       cpt = lnet_cpt_of_cookie(handle->cookie);
+       lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt],
+                               handle->cookie);
+       if (lh == NULL)
+               return NULL;
+
+       return lh_entry(lh, lnet_libmd_t, md_lh);
+}
+
+static inline lnet_libmd_t *
+lnet_wire_handle2md(lnet_handle_wire_t *wh)
+{
+       /* ALWAYS called with resource lock held */
+       lnet_libhandle_t *lh;
+       int              cpt;
+
+       if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie)
+               return NULL;
+
+       cpt = lnet_cpt_of_cookie(wh->wh_object_cookie);
+       lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt],
+                               wh->wh_object_cookie);
+       if (lh == NULL)
+               return NULL;
+
+       return lh_entry(lh, lnet_libmd_t, md_lh);
+}
+
+static inline void
+lnet_me2handle (lnet_handle_me_t *handle, lnet_me_t *me)
+{
+       handle->cookie = me->me_lh.lh_cookie;
+}
+
+static inline lnet_me_t *
+lnet_handle2me(lnet_handle_me_t *handle)
+{
+       /* ALWAYS called with resource lock held */
+       lnet_libhandle_t *lh;
+       int              cpt;
+
+       cpt = lnet_cpt_of_cookie(handle->cookie);
+       lh = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt],
+                               handle->cookie);
+       if (lh == NULL)
+               return NULL;
+
+       return lh_entry(lh, lnet_me_t, me_lh);
+}
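
/*
 * Illustrative (hypothetical) round trip through the handle helpers above,
 * not part of this patch: an MD is published as an opaque cookie and later
 * resolved under the resource lock of the CPT encoded in that cookie:
 *
 *      lnet_handle_md_t h;
 *      int cpt;
 *
 *      lnet_md2handle(&h, md);                 // cookie copied from md->md_lh
 *      ...
 *      cpt = lnet_cpt_of_cookie(h.cookie);
 *      lnet_res_lock(cpt);
 *      md = lnet_handle2md(&h);                // NULL if the cookie is stale
 *      lnet_res_unlock(cpt);
 */
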
+
+static inline void
+lnet_peer_addref_locked(lnet_peer_t *lp)
+{
+       LASSERT (lp->lp_refcount > 0);
+       lp->lp_refcount++;
+}
+
+extern void lnet_destroy_peer_locked(lnet_peer_t *lp);
+
+static inline void
+lnet_peer_decref_locked(lnet_peer_t *lp)
+{
+       LASSERT (lp->lp_refcount > 0);
+       lp->lp_refcount--;
+       if (lp->lp_refcount == 0)
+               lnet_destroy_peer_locked(lp);
+}
+
+static inline int
+lnet_isrouter(lnet_peer_t *lp)
+{
+       return lp->lp_rtr_refcount != 0;
+}
+
+static inline void
+lnet_ni_addref_locked(lnet_ni_t *ni, int cpt)
+{
+       LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER);
+       LASSERT(*ni->ni_refs[cpt] >= 0);
+
+       (*ni->ni_refs[cpt])++;
+}
+
+static inline void
+lnet_ni_addref(lnet_ni_t *ni)
+{
+       lnet_net_lock(0);
+       lnet_ni_addref_locked(ni, 0);
+       lnet_net_unlock(0);
+}
+
+static inline void
+lnet_ni_decref_locked(lnet_ni_t *ni, int cpt)
+{
+       LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER);
+       LASSERT(*ni->ni_refs[cpt] > 0);
+
+       (*ni->ni_refs[cpt])--;
+}
+
+static inline void
+lnet_ni_decref(lnet_ni_t *ni)
+{
+       lnet_net_lock(0);
+       lnet_ni_decref_locked(ni, 0);
+       lnet_net_unlock(0);
+}
+
+void lnet_ni_free(lnet_ni_t *ni);
+
+static inline int
+lnet_nid2peerhash(lnet_nid_t nid)
+{
+       return cfs_hash_long(nid, LNET_PEER_HASH_BITS);
+}
+
+static inline struct list_head *
+lnet_net2rnethash(__u32 net)
+{
+       return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) +
+               LNET_NETTYP(net)) &
+               ((1U << the_lnet.ln_remote_nets_hbits) - 1)];
+}
+
+extern lnd_t the_lolnd;
+
+
+extern int lnet_cpt_of_nid_locked(lnet_nid_t nid);
+extern int lnet_cpt_of_nid(lnet_nid_t nid);
+extern lnet_ni_t *lnet_nid2ni_locked(lnet_nid_t nid, int cpt);
+extern lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt);
+extern lnet_ni_t *lnet_net2ni(__u32 net);
+
+int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, cfs_time_t when);
+void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when);
+int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid);
+int lnet_check_routes(void);
+int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
+void lnet_destroy_routes(void);
+int lnet_get_route(int idx, __u32 *net, __u32 *hops,
+                  lnet_nid_t *gateway, __u32 *alive);
+void lnet_proc_init(void);
+void lnet_proc_fini(void);
+int  lnet_rtrpools_alloc(int im_a_router);
+void lnet_rtrpools_free(void);
+lnet_remotenet_t *lnet_find_net_locked (__u32 net);
+
+int lnet_islocalnid(lnet_nid_t nid);
+int lnet_islocalnet(__u32 net);
+
+void lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+                       unsigned int offset, unsigned int mlen);
+void lnet_msg_detach_md(lnet_msg_t *msg, int status);
+void lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev);
+void lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type);
+void lnet_msg_commit(lnet_msg_t *msg, int cpt);
+void lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status);
+
+void lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev);
+void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+                   unsigned int offset, unsigned int len);
+int lnet_send(lnet_nid_t nid, lnet_msg_t *msg, lnet_nid_t rtr_nid);
+void lnet_return_tx_credits_locked(lnet_msg_t *msg);
+void lnet_return_rx_credits_locked(lnet_msg_t *msg);
+
+/* portals functions */
+/* portals attributes */
+static inline int
+lnet_ptl_is_lazy(lnet_portal_t *ptl)
+{
+       return !!(ptl->ptl_options & LNET_PTL_LAZY);
+}
+
+static inline int
+lnet_ptl_is_unique(lnet_portal_t *ptl)
+{
+       return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE);
+}
+
+static inline int
+lnet_ptl_is_wildcard(lnet_portal_t *ptl)
+{
+       return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD);
+}
+
+static inline void
+lnet_ptl_setopt(lnet_portal_t *ptl, int opt)
+{
+       ptl->ptl_options |= opt;
+}
+
+static inline void
+lnet_ptl_unsetopt(lnet_portal_t *ptl, int opt)
+{
+       ptl->ptl_options &= ~opt;
+}
+
+/* match-table functions */
+struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable,
+                              lnet_process_id_t id, __u64 mbits);
+struct lnet_match_table *lnet_mt_of_attach(unsigned int index,
+                                          lnet_process_id_t id, __u64 mbits,
+                                          __u64 ignore_bits,
+                                          lnet_ins_pos_t pos);
+int lnet_mt_match_md(struct lnet_match_table *mtable,
+                    struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* portals match/attach functions */
+void lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+                       struct list_head *matches, struct list_head *drops);
+void lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md);
+int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* initialize and finalize portals */
+int lnet_portals_create(void);
+void lnet_portals_destroy(void);
+
+/* message functions */
+int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr,
+               lnet_nid_t fromnid, void *private, int rdma_req);
+void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+              unsigned int offset, unsigned int mlen, unsigned int rlen);
+lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg);
+void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len);
+void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc);
+void lnet_drop_delayed_msg_list(struct list_head *head, char *reason);
+void lnet_recv_delayed_msg_list(struct list_head *head);
+
+int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt);
+void lnet_msg_container_cleanup(struct lnet_msg_container *container);
+void lnet_msg_containers_destroy(void);
+int lnet_msg_containers_create(void);
+
+char *lnet_msgtyp2str (int type);
+void lnet_print_hdr (lnet_hdr_t * hdr);
+int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold);
+
+void lnet_counters_get(lnet_counters_t *counters);
+void lnet_counters_reset(void);
+
+unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov);
+int lnet_extract_iov (int dst_niov, struct iovec *dst,
+                     int src_niov, struct iovec *src,
+                     unsigned int offset, unsigned int len);
+
+unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov);
+int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+                     int src_niov, lnet_kiov_t *src,
+                     unsigned int offset, unsigned int len);
+
+void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov,
+                       unsigned int doffset,
+                       unsigned int nsiov, struct iovec *siov,
+                       unsigned int soffset, unsigned int nob);
+void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov,
+                        unsigned int iovoffset,
+                        unsigned int nkiov, lnet_kiov_t *kiov,
+                        unsigned int kiovoffset, unsigned int nob);
+void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov,
+                        unsigned int kiovoffset,
+                        unsigned int niov, struct iovec *iov,
+                        unsigned int iovoffset, unsigned int nob);
+void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov,
+                         unsigned int doffset,
+                         unsigned int nskiov, lnet_kiov_t *skiov,
+                         unsigned int soffset, unsigned int nob);
+
+static inline void
+lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset,
+                  unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+                  unsigned int nob)
+{
+       struct iovec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen};
+
+       lnet_copy_iov2iov(1, &diov, doffset,
+                         nsiov, siov, soffset, nob);
+}
+
+static inline void
+lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset,
+                   unsigned int nsiov, lnet_kiov_t *skiov, unsigned int soffset,
+                   unsigned int nob)
+{
+       struct iovec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen};
+
+       lnet_copy_kiov2iov(1, &diov, doffset,
+                          nsiov, skiov, soffset, nob);
+}
+
+static inline void
+lnet_copy_flat2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+                  int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+       struct iovec siov = {/*.iov_base = */ src, /*.iov_len = */slen};
+       lnet_copy_iov2iov(ndiov, diov, doffset,
+                         1, &siov, soffset, nob);
+}
+
+static inline void
+lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset,
+                   int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+       struct iovec siov = {/* .iov_base = */ src, /* .iov_len = */ slen};
+       lnet_copy_iov2kiov(ndiov, dkiov, doffset,
+                          1, &siov, soffset, nob);
+}
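
The *_flat helpers above wrap a contiguous buffer in a single on-stack iovec so the generic iov/kiov copy routines can be reused. A small usage sketch, with a hypothetical staging buffer and fragment list supplied by the caller:

/* Hedged sketch: stage the first 'len' bytes of a received fragment list
 * into a flat buffer.  'buf', 'buflen', 'iov' and 'niov' are hypothetical
 * caller-supplied values, not fields of this header. */
static void stage_payload_sketch(void *buf, int buflen,
				 unsigned int niov, struct iovec *iov,
				 unsigned int len)
{
	lnet_copy_iov2flat(buflen, buf, 0,	/* dest: flat buffer */
			   niov, iov, 0,	/* src: fragment list */
			   len);		/* bytes to copy */
}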
+
+void lnet_me_unlink(lnet_me_t *me);
+
+void lnet_md_unlink(lnet_libmd_t *md);
+void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd);
+
+void lnet_register_lnd(lnd_t *lnd);
+void lnet_unregister_lnd(lnd_t *lnd);
+int lnet_set_ip_niaddr (lnet_ni_t *ni);
+
+int lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+                __u32 local_ip, __u32 peer_ip, int peer_port);
+void lnet_connect_console_error(int rc, lnet_nid_t peer_nid,
+                               __u32 peer_ip, int port);
+int lnet_count_acceptor_nis(void);
+int lnet_acceptor_timeout(void);
+int lnet_acceptor_port(void);
+
+int lnet_acceptor_start(void);
+void lnet_acceptor_stop(void);
+
+void lnet_get_tunables(void);
+int lnet_peers_start_down(void);
+int lnet_peer_buffer_credits(lnet_ni_t *ni);
+
+int lnet_router_checker_start(void);
+void lnet_router_checker_stop(void);
+void lnet_swap_pinginfo(lnet_ping_info_t *info);
+
+int lnet_ping_target_init(void);
+void lnet_ping_target_fini(void);
+int lnet_ping(lnet_process_id_t id, int timeout_ms,
+             lnet_process_id_t *ids, int n_ids);
+
+int lnet_parse_ip2nets (char **networksp, char *ip2nets);
+int lnet_parse_routes (char *route_str, int *im_a_router);
+int lnet_parse_networks (struct list_head *nilist, char *networks);
+
+int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt);
+lnet_peer_t *lnet_find_peer_locked(struct lnet_peer_table *ptable,
+                                  lnet_nid_t nid);
+void lnet_peer_tables_cleanup(void);
+void lnet_peer_tables_destroy(void);
+int lnet_peer_tables_create(void);
+void lnet_debug_peer(lnet_nid_t nid);
+
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
new file mode 100644 (file)
index 0000000..86428d4
--- /dev/null
@@ -0,0 +1,765 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lib-types.h
+ *
+ * Types used by the library side routines that do not need to be
+ * exposed to the user application
+ */
+
+#ifndef __LNET_LIB_TYPES_H__
+#define __LNET_LIB_TYPES_H__
+
+#include <linux/lnet/linux/lib-types.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/list.h>
+#include <linux/lnet/types.h>
+
+#define WIRE_ATTR       __attribute__((packed))
+
+/* Packed version of lnet_process_id_t to transfer via network */
+typedef struct {
+       lnet_nid_t nid;
+       lnet_pid_t pid;   /* node id / process id */
+} WIRE_ATTR lnet_process_id_packed_t;
+
+/* The wire handle's interface cookie only matches one network interface in
+ * one epoch (i.e. new cookie when the interface restarts or the node
+ * reboots).  The object cookie only matches one object on that interface
+ * during that object's lifetime (i.e. no cookie re-use). */
+typedef struct {
+       __u64 wh_interface_cookie;
+       __u64 wh_object_cookie;
+} WIRE_ATTR lnet_handle_wire_t;
+
+typedef enum {
+       LNET_MSG_ACK = 0,
+       LNET_MSG_PUT,
+       LNET_MSG_GET,
+       LNET_MSG_REPLY,
+       LNET_MSG_HELLO,
+} lnet_msg_type_t;
+
+/* The variant fields of the portals message header are aligned on an 8
+ * byte boundary in the message header.  Note that all types used in these
+ * wire structs MUST be fixed size and the smaller types are placed at the
+ * end. */
+typedef struct lnet_ack {
+       lnet_handle_wire_t  dst_wmd;
+       __u64          match_bits;
+       __u32          mlength;
+} WIRE_ATTR lnet_ack_t;
+
+typedef struct lnet_put {
+       lnet_handle_wire_t  ack_wmd;
+       __u64          match_bits;
+       __u64          hdr_data;
+       __u32          ptl_index;
+       __u32          offset;
+} WIRE_ATTR lnet_put_t;
+
+typedef struct lnet_get {
+       lnet_handle_wire_t  return_wmd;
+       __u64          match_bits;
+       __u32          ptl_index;
+       __u32          src_offset;
+       __u32          sink_length;
+} WIRE_ATTR lnet_get_t;
+
+typedef struct lnet_reply {
+       lnet_handle_wire_t  dst_wmd;
+} WIRE_ATTR lnet_reply_t;
+
+typedef struct lnet_hello {
+       __u64         incarnation;
+       __u32         type;
+} WIRE_ATTR lnet_hello_t;
+
+typedef struct {
+       lnet_nid_t        dest_nid;
+       lnet_nid_t        src_nid;
+       lnet_pid_t        dest_pid;
+       lnet_pid_t        src_pid;
+       __u32          type;           /* lnet_msg_type_t */
+       __u32          payload_length;     /* payload data to follow */
+       /*<------__u64 aligned------->*/
+       union {
+               lnet_ack_t   ack;
+               lnet_put_t   put;
+               lnet_get_t   get;
+               lnet_reply_t reply;
+               lnet_hello_t hello;
+       } msg;
+} WIRE_ATTR lnet_hdr_t;
+
+/* A HELLO message contains a magic number and protocol version
+ * code in the header's dest_nid, the peer's NID in the src_nid, and
+ * LNET_MSG_HELLO in the type field.  All other common fields are zero
+ * (including payload_length; i.e. no payload).
+ * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID. These LNDs should
+ * exchange HELLO messages when a connection is first established.  Individual
+ * LNDs can put whatever else they fancy in lnet_hdr_t::msg.
+ */
+typedef struct {
+       __u32   magic;                    /* LNET_PROTO_TCP_MAGIC */
+       __u16   version_major;            /* increment on incompatible change */
+       __u16   version_minor;            /* increment on compatible change */
+} WIRE_ATTR lnet_magicversion_t;
+
+/* PROTO MAGIC for LNDs */
+#define LNET_PROTO_IB_MAGIC             0x0be91b91
+#define LNET_PROTO_RA_MAGIC             0x0be91b92
+#define LNET_PROTO_QSW_MAGIC           0x0be91b93
+#define LNET_PROTO_GNI_MAGIC           0xb00fbabe /* ask Kim */
+#define LNET_PROTO_TCP_MAGIC           0xeebc0ded
+#define LNET_PROTO_PTL_MAGIC           0x50746C4E /* 'PtlN' unique magic */
+#define LNET_PROTO_MX_MAGIC             0x4d583130 /* 'MX10'! */
+#define LNET_PROTO_ACCEPTOR_MAGIC         0xacce7100
+#define LNET_PROTO_PING_MAGIC         0x70696E67 /* 'ping' */
+
+/* Placeholder for a future "unified" protocol across all LNDs */
+/* Current LNDs that receive a request with this magic will respond with a
+ * "stub" reply using their current protocol */
+#define LNET_PROTO_MAGIC                   0x45726963 /* ! */
+
+
+#define LNET_PROTO_TCP_VERSION_MAJOR   1
+#define LNET_PROTO_TCP_VERSION_MINOR   0
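
As described in the HELLO comment above, a byte-stream LND is expected to validate the peer's magic and major version before trusting anything else on the connection. A minimal sketch of that check, built from the TCP magic/version constants above; the -EPROTO error code is an assumption for illustration:

/* Hedged sketch of the handshake check a TCP-style LND performs on a new
 * connection; returning -EPROTO here is an illustrative choice. */
static int check_peer_proto_sketch(lnet_magicversion_t *hello)
{
	if (hello->magic != LNET_PROTO_TCP_MAGIC)
		return -EPROTO;		/* peer speaks a different protocol */
	if (hello->version_major != LNET_PROTO_TCP_VERSION_MAJOR)
		return -EPROTO;		/* incompatible major version */
	return 0;			/* minor version changes are compatible */
}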
+
+/* Acceptor connection request */
+typedef struct {
+       __u32       acr_magic;            /* LNET_PROTO_ACCEPTOR_MAGIC */
+       __u32       acr_version;                /* protocol version */
+       __u64       acr_nid;                /* target NID */
+} WIRE_ATTR lnet_acceptor_connreq_t;
+
+#define LNET_PROTO_ACCEPTOR_VERSION       1
+
+/* forward refs */
+struct lnet_libmd;
+
+typedef struct lnet_msg {
+       struct list_head            msg_activelist;
+       struct list_head            msg_list;      /* Q for credits/MD */
+
+       lnet_process_id_t     msg_target;
+       /* where is it from, it's only for building event */
+       lnet_nid_t              msg_from;
+       __u32                   msg_type;
+
+       /* committed for sending */
+       unsigned int            msg_tx_committed:1;
+       /* CPT # this message committed for sending */
+       unsigned int            msg_tx_cpt:15;
+       /* committed for receiving */
+       unsigned int            msg_rx_committed:1;
+       /* CPT # this message committed for receiving */
+       unsigned int            msg_rx_cpt:15;
+       /* queued for tx credit */
+       unsigned int            msg_tx_delayed:1;
+       /* queued for RX buffer */
+       unsigned int            msg_rx_delayed:1;
+       /* ready for pending on RX delay list */
+       unsigned int            msg_rx_ready_delay:1;
+
+       unsigned int      msg_vmflush:1;      /* VM trying to free memory */
+       unsigned int      msg_target_is_router:1; /* sending to a router */
+       unsigned int      msg_routing:1;      /* being forwarded */
+       unsigned int      msg_ack:1;      /* ack on finalize (PUT) */
+       unsigned int      msg_sending:1;      /* outgoing message */
+       unsigned int      msg_receiving:1;    /* being received */
+       unsigned int      msg_txcredit:1;     /* taken an NI send credit */
+       unsigned int      msg_peertxcredit:1; /* taken a peer send credit */
+       unsigned int      msg_rtrcredit:1;    /* taken a global router credit */
+       unsigned int      msg_peerrtrcredit:1; /* taken a peer router credit */
+       unsigned int      msg_onactivelist:1; /* on the activelist */
+
+       struct lnet_peer     *msg_txpeer;        /* peer I'm sending to */
+       struct lnet_peer     *msg_rxpeer;        /* peer I received from */
+
+       void             *msg_private;
+       struct lnet_libmd    *msg_md;
+
+       unsigned int      msg_len;
+       unsigned int      msg_wanted;
+       unsigned int      msg_offset;
+       unsigned int      msg_niov;
+       struct iovec     *msg_iov;
+       lnet_kiov_t       *msg_kiov;
+
+       lnet_event_t      msg_ev;
+       lnet_hdr_t          msg_hdr;
+} lnet_msg_t;
+
+
+typedef struct lnet_libhandle {
+       struct list_head            lh_hash_chain;
+       __u64            lh_cookie;
+} lnet_libhandle_t;
+
+#define lh_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(char *)(&((type *)0)->member)))
+
+typedef struct lnet_eq {
+       struct list_head                eq_list;
+       lnet_libhandle_t        eq_lh;
+       lnet_seq_t              eq_enq_seq;
+       lnet_seq_t              eq_deq_seq;
+       unsigned int            eq_size;
+       lnet_eq_handler_t       eq_callback;
+       lnet_event_t            *eq_events;
+       int                     **eq_refs;      /* percpt refcount for EQ */
+} lnet_eq_t;
+
+typedef struct lnet_me {
+       struct list_head             me_list;
+       lnet_libhandle_t       me_lh;
+       lnet_process_id_t      me_match_id;
+       unsigned int       me_portal;
+       unsigned int       me_pos;              /* hash offset in mt_hash */
+       __u64             me_match_bits;
+       __u64             me_ignore_bits;
+       lnet_unlink_t     me_unlink;
+       struct lnet_libmd     *me_md;
+} lnet_me_t;
+
+typedef struct lnet_libmd {
+       struct list_head            md_list;
+       lnet_libhandle_t      md_lh;
+       lnet_me_t           *md_me;
+       char             *md_start;
+       unsigned int      md_offset;
+       unsigned int      md_length;
+       unsigned int      md_max_size;
+       int                md_threshold;
+       int                md_refcount;
+       unsigned int      md_options;
+       unsigned int      md_flags;
+       void             *md_user_ptr;
+       lnet_eq_t           *md_eq;
+       unsigned int      md_niov;              /* # frags */
+       union {
+               struct iovec  iov[LNET_MAX_IOV];
+               lnet_kiov_t   kiov[LNET_MAX_IOV];
+       } md_iov;
+} lnet_libmd_t;
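
The lh_entry() macro defined above is a container_of-style helper: given a pointer to an embedded lnet_libhandle_t found on a handle-hash chain, it recovers the enclosing object. A hedged sketch, assuming the handle belongs to an MD:

/* Hedged sketch: map a libhandle found on a handle-hash chain back to its
 * enclosing MD; the lookup loop itself lives elsewhere in the LNet code. */
static lnet_libmd_t *handle2md_sketch(lnet_libhandle_t *lh)
{
	return lh_entry(lh, lnet_libmd_t, md_lh);
}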
+
+#define LNET_MD_FLAG_ZOMBIE       (1 << 0)
+#define LNET_MD_FLAG_AUTO_UNLINK      (1 << 1)
+
+#ifdef LNET_USE_LIB_FREELIST
+typedef struct
+{
+       void              *fl_objs;       /* single contiguous array of objects */
+       int                 fl_nobjs;    /* the number of them */
+       int                 fl_objsize;       /* the size (including overhead) of each of them */
+       struct list_head             fl_list;     /* where they are enqueued */
+} lnet_freelist_t;
+
+typedef struct
+{
+       struct list_head             fo_list;        /* enqueue on fl_list */
+       void              *fo_contents;  /* aligned contents */
+} lnet_freeobj_t;
+#endif
+
+typedef struct {
+       /* info about peers we are trying to fail */
+       struct list_head             tp_list;        /* ln_test_peers */
+       lnet_nid_t           tp_nid;          /* matching nid */
+       unsigned int       tp_threshold;        /* # failures to simulate */
+} lnet_test_peer_t;
+
+#define LNET_COOKIE_TYPE_MD    1
+#define LNET_COOKIE_TYPE_ME    2
+#define LNET_COOKIE_TYPE_EQ    3
+#define LNET_COOKIE_TYPE_BITS  2
+#define LNET_COOKIE_MASK       ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL)
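
The low LNET_COOKIE_TYPE_BITS of an object cookie encode whether the handle refers to an MD, ME or EQ, so a wire handle can be type-checked before the hash lookup. A sketch of that decoding; the helper names are made up for illustration:

/* Hedged sketch of cookie type decoding implied by the masks above. */
static inline int cookie_type_sketch(__u64 cookie)
{
	return (int)(cookie & LNET_COOKIE_MASK);	/* MD, ME or EQ */
}

static inline int cookie_is_md_sketch(__u64 cookie)
{
	return cookie_type_sketch(cookie) == LNET_COOKIE_TYPE_MD;
}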
+
+struct lnet_ni;                                  /* forward ref */
+
+typedef struct lnet_lnd
+{
+       /* fields managed by portals */
+       struct list_head            lnd_list;        /* stash in the LND table */
+       int                lnd_refcount;         /* # active instances */
+
+       /* fields initialised by the LND */
+       unsigned int      lnd_type;
+
+       int  (*lnd_startup) (struct lnet_ni *ni);
+       void (*lnd_shutdown) (struct lnet_ni *ni);
+       int  (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg);
+
+       /* In data movement APIs below, payload buffers are described as a set
+        * of 'niov' fragments which are...
+        * EITHER
+        *    in virtual memory (struct iovec *iov != NULL)
+        * OR
+        *    in pages (kernel only: lnet_kiov_t *kiov != NULL).
+        * The LND may NOT overwrite these fragment descriptors.
+        * An 'offset' may specify a byte offset within the set of
+        * fragments to start from.
+        */
+
+       /* Start sending a preformatted message.  'private' is NULL for PUT and
+        * GET messages; otherwise this is a response to an incoming message
+        * and 'private' is the 'private' passed to lnet_parse().  Return
+        * non-zero for immediate failure, otherwise complete later with
+        * lnet_finalize() */
+       int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg);
+
+       /* Start receiving 'mlen' bytes of payload data, skipping the following
+        * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to
+        * lnet_parse().  Return non-zero for immediate failure, otherwise
+        * complete later with lnet_finalize().  This also gives back a receive
+        * credit if the LND does flow control. */
+       int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+                       int delayed, unsigned int niov,
+                       struct iovec *iov, lnet_kiov_t *kiov,
+                       unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+       /* lnet_parse() has had to delay processing of this message
+        * (e.g. waiting for a forwarding buffer or send credits).  Give the
+        * LND a chance to free urgently needed resources.  If called, return 0
+        * for success and do NOT give back a receive credit; that has to wait
+        * until lnd_recv() gets called.  On failure return < 0 and
+        * release resources; lnd_recv() will not be called. */
+       int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+                             void **new_privatep);
+
+       /* notification of peer health */
+       void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive);
+
+       /* query of peer aliveness */
+       void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when);
+
+       /* accept a new connection */
+       int (*lnd_accept)(struct lnet_ni *ni, socket_t *sock);
+
+} lnd_t;
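
A driver fills in an lnd_t with its callbacks and registers it via lnet_register_lnd()/lnet_unregister_lnd() (declared in lib-lnet.h). A hedged skeleton of that wiring; the my_lnd_* entry points and the LNET_LND_MYLND type value are placeholders, not symbols defined by LNet:

/* Hedged skeleton of how a driver wires its callbacks into lnd_t.  The
 * my_lnd_* functions and LNET_LND_MYLND are placeholders. */
static lnd_t my_lnd_sketch = {
	.lnd_type	= LNET_LND_MYLND,	/* hypothetical LND type */
	.lnd_startup	= my_lnd_startup,
	.lnd_shutdown	= my_lnd_shutdown,
	.lnd_send	= my_lnd_send,
	.lnd_recv	= my_lnd_recv,
};

/* Registration normally happens at module init/exit:
 *	lnet_register_lnd(&my_lnd_sketch);
 *	lnet_unregister_lnd(&my_lnd_sketch);
 */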
+
+#define LNET_NI_STATUS_UP      0x15aac0de
+#define LNET_NI_STATUS_DOWN    0xdeadface
+#define LNET_NI_STATUS_INVALID 0x00000000
+typedef struct {
+       lnet_nid_t ns_nid;
+       __u32      ns_status;
+       __u32      ns_unused;
+} WIRE_ATTR lnet_ni_status_t;
+
+struct lnet_tx_queue {
+       int                     tq_credits;     /* # tx credits free */
+       int                     tq_credits_min; /* lowest it's been */
+       int                     tq_credits_max; /* total # tx credits */
+       struct list_head                tq_delayed;     /* delayed TXs */
+};
+
+#define LNET_MAX_INTERFACES   16
+
+typedef struct lnet_ni {
+       spinlock_t              ni_lock;
+       struct list_head                ni_list;        /* chain on ln_nis */
+       struct list_head                ni_cptlist;     /* chain on ln_nis_cpt */
+       int                     ni_maxtxcredits; /* # tx credits  */
+       /* # per-peer send credits */
+       int                     ni_peertxcredits;
+       /* # per-peer router buffer credits */
+       int                     ni_peerrtrcredits;
+       /* seconds to consider peer dead */
+       int                     ni_peertimeout;
+       int                     ni_ncpts;       /* number of CPTs */
+       __u32                   *ni_cpts;       /* bond NI on some CPTs */
+       lnet_nid_t              ni_nid;         /* interface's NID */
+       void                    *ni_data;       /* instance-specific data */
+       lnd_t                   *ni_lnd;        /* procedural interface */
+       struct lnet_tx_queue    **ni_tx_queues; /* percpt TX queues */
+       int                     **ni_refs;      /* percpt reference count */
+       long                    ni_last_alive;  /* when I was last alive */
+       lnet_ni_status_t        *ni_status;     /* my health status */
+       /* equivalent interfaces to use */
+       char                    *ni_interfaces[LNET_MAX_INTERFACES];
+} lnet_ni_t;
+
+#define LNET_PROTO_PING_MATCHBITS      0x8000000000000000LL
+
+/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x
+ * of old LNet, so there shouldn't be any compatibility issue */
+#define LNET_PING_FEAT_INVAL           (0)             /* no feature */
+#define LNET_PING_FEAT_BASE            (1 << 0)        /* just a ping */
+#define LNET_PING_FEAT_NI_STATUS       (1 << 1)        /* return NI status */
+
+#define LNET_PING_FEAT_MASK            (LNET_PING_FEAT_BASE | \
+                                        LNET_PING_FEAT_NI_STATUS)
+
+typedef struct {
+       __u32                   pi_magic;
+       __u32                   pi_features;
+       lnet_pid_t              pi_pid;
+       __u32                   pi_nnis;
+       lnet_ni_status_t        pi_ni[0];
+} WIRE_ATTR lnet_ping_info_t;
+
+/* router checker data, per router */
+#define LNET_MAX_RTR_NIS   16
+#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS])
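
lnet_ping_info_t ends in a zero-length pi_ni[] array, so ping buffers are sized for however many NI status slots are needed; LNET_PINGINFO_SIZE above does this for the router checker's fixed LNET_MAX_RTR_NIS count. A sketch of the same sizing for a variable count; the helper name is illustrative:

/* Hedged sketch: size a ping buffer for 'nnis' interfaces using the
 * flexible pi_ni[] tail, mirroring the LNET_PINGINFO_SIZE pattern. */
static inline size_t ping_info_size_sketch(unsigned int nnis)
{
	return offsetof(lnet_ping_info_t, pi_ni) +
	       nnis * sizeof(lnet_ni_status_t);
}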
+typedef struct {
+       /* chain on the_lnet.ln_rcd_deathrow or ln_rcd_zombie */
+       struct list_head                rcd_list;
+       lnet_handle_md_t        rcd_mdh;        /* ping buffer MD */
+       struct lnet_peer        *rcd_gateway;   /* reference to gateway */
+       lnet_ping_info_t        *rcd_pinginfo;  /* ping buffer */
+} lnet_rc_data_t;
+
+typedef struct lnet_peer {
+       struct list_head        lp_hashlist;      /* chain on peer hash */
+       struct list_head        lp_txq;        /* messages blocking for tx credits */
+       struct list_head        lp_rtrq;              /* messages blocking for router credits */
+       struct list_head        lp_rtr_list;      /* chain on router list */
+       int            lp_txcredits;     /* # tx credits available */
+       int            lp_mintxcredits;      /* low water mark */
+       int            lp_rtrcredits;   /* # router credits */
+       int            lp_minrtrcredits;     /* low water mark */
+       unsigned int      lp_alive:1;      /* alive/dead? */
+       unsigned int      lp_notify:1;    /* notification outstanding? */
+       unsigned int      lp_notifylnd:1;       /* outstanding notification for LND? */
+       unsigned int      lp_notifying:1;       /* some thread is handling notification */
+       unsigned int      lp_ping_notsent;      /* SEND event outstanding from ping */
+       int            lp_alive_count;       /* # times router went dead<->alive */
+       long          lp_txqnob;            /* bytes queued for sending */
+       cfs_time_t      lp_timestamp;    /* time of last aliveness news */
+       cfs_time_t      lp_ping_timestamp;    /* time of last ping attempt */
+       cfs_time_t      lp_ping_deadline;     /* != 0 if ping reply expected */
+       cfs_time_t      lp_last_alive;  /* when I was last alive */
+       cfs_time_t      lp_last_query;  /* when lp_ni was queried last time */
+       lnet_ni_t       *lp_ni;         /* interface peer is on */
+       lnet_nid_t      lp_nid;        /* peer's NID */
+       int            lp_refcount;       /* # refs */
+       int                     lp_cpt;         /* CPT this peer attached on */
+       /* # refs from lnet_route_t::lr_gateway */
+       int                     lp_rtr_refcount;
+       /* returned RC ping features */
+       unsigned int            lp_ping_feats;
+       struct list_head                lp_routes;      /* routers on this peer */
+       lnet_rc_data_t          *lp_rcd;        /* router checker state */
+} lnet_peer_t;
+
+
+/* peer hash size */
+#define LNET_PEER_HASH_BITS     9
+#define LNET_PEER_HASH_SIZE     (1 << LNET_PEER_HASH_BITS)
+
+/* peer hash table */
+struct lnet_peer_table {
+       int                     pt_version;     /* /proc validity stamp */
+       int                     pt_number;      /* # peers extant */
+       struct list_head                pt_deathrow;    /* zombie peers */
+       struct list_head                *pt_hash;       /* NID->peer hash */
+};
+
+/* peer aliveness is enabled only on routers for peers in a network where the
+ * lnet_ni_t::ni_peertimeout has been set to a positive value */
+#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \
+                                        (lp)->lp_ni->ni_peertimeout > 0)
+
+typedef struct {
+       struct list_head                lr_list;        /* chain on net */
+       struct list_head                lr_gwlist;      /* chain on gateway */
+       lnet_peer_t             *lr_gateway;    /* router node */
+       __u32                   lr_net;         /* remote network number */
+       int                     lr_seq;         /* sequence for round-robin */
+       unsigned int            lr_downis;      /* number of down NIs */
+       unsigned int            lr_hops;        /* how far I am */
+} lnet_route_t;
+
+#define LNET_REMOTE_NETS_HASH_DEFAULT  (1U << 7)
+#define LNET_REMOTE_NETS_HASH_MAX      (1U << 16)
+#define LNET_REMOTE_NETS_HASH_SIZE     (1 << the_lnet.ln_remote_nets_hbits)
+
+typedef struct {
+       struct list_head              lrn_list;       /* chain on ln_remote_nets_hash */
+       struct list_head              lrn_routes;     /* routes to me */
+       __u32              lrn_net;     /* my net number */
+} lnet_remotenet_t;
+
+typedef struct {
+       struct list_head rbp_bufs;           /* my free buffer pool */
+       struct list_head rbp_msgs;           /* messages blocking for a buffer */
+       int     rbp_npages;        /* # pages in each buffer */
+       int     rbp_nbuffers;    /* # buffers */
+       int     rbp_credits;      /* # free buffers / blocked messages */
+       int     rbp_mincredits;       /* low water mark */
+} lnet_rtrbufpool_t;
+
+typedef struct {
+       struct list_head             rb_list;        /* chain on rbp_bufs */
+       lnet_rtrbufpool_t     *rb_pool;      /* owning pool */
+       lnet_kiov_t         rb_kiov[0];   /* the buffer space */
+} lnet_rtrbuf_t;
+
+typedef struct {
+       __u32   msgs_alloc;
+       __u32   msgs_max;
+       __u32   errors;
+       __u32   send_count;
+       __u32   recv_count;
+       __u32   route_count;
+       __u32   drop_count;
+       __u64   send_length;
+       __u64   recv_length;
+       __u64   route_length;
+       __u64   drop_length;
+} WIRE_ATTR lnet_counters_t;
+
+#define LNET_PEER_HASHSIZE   503               /* prime! */
+
+#define LNET_NRBPOOLS   3               /* # different router buffer pools */
+
+enum {
+       /* Didn't match anything */
+       LNET_MATCHMD_NONE       = (1 << 0),
+       /* Matched OK */
+       LNET_MATCHMD_OK         = (1 << 1),
+       /* Must be discarded */
+       LNET_MATCHMD_DROP       = (1 << 2),
+       /* match and buffer is exhausted */
+       LNET_MATCHMD_EXHAUSTED  = (1 << 3),
+       /* match or drop */
+       LNET_MATCHMD_FINISH     = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP),
+};
+
+/* Options for lnet_portal_t::ptl_options */
+#define LNET_PTL_LAZY         (1 << 0)
+#define LNET_PTL_MATCH_UNIQUE       (1 << 1)    /* unique match, for RDMA */
+#define LNET_PTL_MATCH_WILDCARD     (1 << 2)    /* wildcard match, request portal */
+
+/* parameter for matching operations (GET, PUT) */
+struct lnet_match_info {
+       __u64                   mi_mbits;
+       lnet_process_id_t       mi_id;
+       unsigned int            mi_opc;
+       unsigned int            mi_portal;
+       unsigned int            mi_rlength;
+       unsigned int            mi_roffset;
+};
+
+/* ME hash of RDMA portal */
+#define LNET_MT_HASH_BITS              8
+#define LNET_MT_HASH_SIZE              (1 << LNET_MT_HASH_BITS)
+#define LNET_MT_HASH_MASK              (LNET_MT_HASH_SIZE - 1)
+/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash,
+ * the last entry is reserved for MEs with ignore-bits */
+#define LNET_MT_HASH_IGNORE            LNET_MT_HASH_SIZE
+/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which
+ * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the
+ * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */
+#define LNET_MT_BITS_U64               6       /* 2^6 bits */
+#define LNET_MT_EXHAUSTED_BITS         (LNET_MT_HASH_BITS - LNET_MT_BITS_U64)
+#define LNET_MT_EXHAUSTED_BMAP         ((1 << LNET_MT_EXHAUSTED_BITS) + 1)
+
+/* portal match table */
+struct lnet_match_table {
+       /* reserved for upcoming patches, CPU partition ID */
+       unsigned int            mt_cpt;
+       unsigned int            mt_portal;      /* portal index */
+       /* match table is set as "enabled" if there is a non-exhausted MD
+        * attached on mt_mhash; it's only valid for wildcard portals */
+       unsigned int            mt_enabled;
+       /* bitmap to flag whether MEs on mt_hash are exhausted or not */
+       __u64                   mt_exhausted[LNET_MT_EXHAUSTED_BMAP];
+       struct list_head                *mt_mhash;      /* matching hash */
+};
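
mt_exhausted[] is a small bitmap with one bit per mt_mhash bucket (plus one word covering the ignore-bits list), as laid out by the LNET_MT_* macros above. A hedged sketch of the bucket-to-bit mapping those macros imply; the real set/test helpers live in the portals implementation:

/* Hedged sketch of the bucket-to-bit mapping the LNET_MT_* macros imply. */
static inline void mt_set_exhausted_sketch(__u64 *bmap, int bucket)
{
	bmap[bucket >> LNET_MT_BITS_U64] |= 1ULL << (bucket & 63);
}

static inline int mt_test_exhausted_sketch(__u64 *bmap, int bucket)
{
	return (bmap[bucket >> LNET_MT_BITS_U64] &
		(1ULL << (bucket & 63))) != 0;
}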
+
+/* these are only useful for wildcard portal */
+/* Turn off message rotor for wildcard portals */
+#define        LNET_PTL_ROTOR_OFF      0
+/* round-robin dispatch all PUT messages for wildcard portals */
+#define        LNET_PTL_ROTOR_ON       1
+/* round-robin dispatch routed PUT message for wildcard portals */
+#define        LNET_PTL_ROTOR_RR_RT    2
+/* dispatch routed PUT message by hashing source NID for wildcard portals */
+#define        LNET_PTL_ROTOR_HASH_RT  3
+
+typedef struct lnet_portal {
+       spinlock_t              ptl_lock;
+       unsigned int            ptl_index;      /* portal ID, reserved */
+       /* flags on this portal: lazy, unique... */
+       unsigned int            ptl_options;
+       /* list of messages which are stealing buffers */
+       struct list_head                ptl_msg_stealing;
+       /* messages blocking for MD */
+       struct list_head                ptl_msg_delayed;
+       /* Match table for each CPT */
+       struct lnet_match_table **ptl_mtables;
+       /* spread rotor of incoming "PUT" */
+       int                     ptl_rotor;
+       /* # active entries for this portal */
+       int                  ptl_mt_nmaps;
+       /* array of active entries' cpu-partition-id */
+       int                  ptl_mt_maps[0];
+} lnet_portal_t;
+
+#define LNET_LH_HASH_BITS      12
+#define LNET_LH_HASH_SIZE      (1ULL << LNET_LH_HASH_BITS)
+#define LNET_LH_HASH_MASK      (LNET_LH_HASH_SIZE - 1)
+
+/* resource container (ME, MD, EQ) */
+struct lnet_res_container {
+       unsigned int            rec_type;       /* container type */
+       __u64                   rec_lh_cookie;  /* cookie generator */
+       struct list_head                rec_active;     /* active resource list */
+       struct list_head                *rec_lh_hash;   /* handle hash */
+#ifdef LNET_USE_LIB_FREELIST
+       lnet_freelist_t         rec_freelist;   /* freelist for resources */
+#endif
+};
+
+/* message container */
+struct lnet_msg_container {
+       int                     msc_init;       /* initialized or not */
+       /* max # threads finalizing */
+       int                     msc_nfinalizers;
+       /* msgs waiting to complete finalizing */
+       struct list_head                msc_finalizing;
+       struct list_head                msc_active;     /* active message list */
+       /* threads doing finalization */
+       void                    **msc_finalizers;
+#ifdef LNET_USE_LIB_FREELIST
+       lnet_freelist_t         msc_freelist;   /* freelist for messages */
+#endif
+};
+
+/* Router Checker states */
+#define LNET_RC_STATE_SHUTDOWN         0       /* not started */
+#define LNET_RC_STATE_RUNNING          1       /* started up OK */
+#define LNET_RC_STATE_STOPPING         2       /* telling thread to stop */
+
+typedef struct
+{
+       /* CPU partition table of LNet */
+       struct cfs_cpt_table            *ln_cpt_table;
+       /* number of CPTs in ln_cpt_table */
+       unsigned int                    ln_cpt_number;
+       unsigned int                    ln_cpt_bits;
+
+       /* protect LNet resources (ME/MD/EQ) */
+       struct cfs_percpt_lock          *ln_res_lock;
+       /* # portals */
+       int                             ln_nportals;
+       /* the vector of portals */
+       lnet_portal_t                   **ln_portals;
+       /* percpt ME containers */
+       struct lnet_res_container       **ln_me_containers;
+       /* percpt MD container */
+       struct lnet_res_container       **ln_md_containers;
+
+       /* Event Queue container */
+       struct lnet_res_container       ln_eq_container;
+       wait_queue_head_t                       ln_eq_waitq;
+       spinlock_t                      ln_eq_wait_lock;
+       unsigned int                    ln_remote_nets_hbits;
+
+       /* protect NI, peer table, credits, routers, rtrbuf... */
+       struct cfs_percpt_lock          *ln_net_lock;
+       /* percpt message containers for active/finalizing/freed message */
+       struct lnet_msg_container       **ln_msg_containers;
+       lnet_counters_t                 **ln_counters;
+       struct lnet_peer_table          **ln_peer_tables;
+       /* failure simulation */
+       struct list_head                        ln_test_peers;
+
+       struct list_head                        ln_nis;         /* LND instances */
+       /* NIs bond on specific CPT(s) */
+       struct list_head                        ln_nis_cpt;
+       /* dying LND instances */
+       struct list_head                        ln_nis_zombie;
+       lnet_ni_t                       *ln_loni;       /* the loopback NI */
+       /* NI to wait for events in */
+       lnet_ni_t                       *ln_eq_waitni;
+
+       /* remote networks with routes to them */
+       struct list_head                        *ln_remote_nets_hash;
+       /* validity stamp */
+       __u64                           ln_remote_nets_version;
+       /* list of all known routers */
+       struct list_head                        ln_routers;
+       /* validity stamp */
+       __u64                           ln_routers_version;
+       /* percpt router buffer pools */
+       lnet_rtrbufpool_t               **ln_rtrpools;
+
+       lnet_handle_md_t                ln_ping_target_md;
+       lnet_handle_eq_t                ln_ping_target_eq;
+       lnet_ping_info_t                *ln_ping_info;
+
+       /* router checker startup/shutdown state */
+       int                             ln_rc_state;
+       /* router checker's event queue */
+       lnet_handle_eq_t                ln_rc_eqh;
+       /* rcd still pending on net */
+       struct list_head                        ln_rcd_deathrow;
+       /* rcd ready for free */
+       struct list_head                        ln_rcd_zombie;
+       /* serialise startup/shutdown */
+       struct semaphore                ln_rc_signal;
+
+       struct mutex                    ln_api_mutex;
+       struct mutex                    ln_lnd_mutex;
+       int                             ln_init;        /* LNetInit() called? */
+       /* Have I called LNetNIInit myself? */
+       int                             ln_niinit_self;
+       /* LNetNIInit/LNetNIFini counter */
+       int                             ln_refcount;
+       /* shutdown in progress */
+       int                             ln_shutdown;
+
+       int                             ln_routing;     /* am I a router? */
+       lnet_pid_t                      ln_pid;         /* requested pid */
+       /* uniquely identifies this ni in this epoch */
+       __u64                           ln_interface_cookie;
+       /* registered LNDs */
+       struct list_head                        ln_lnds;
+
+       /* space for network names */
+       char                            *ln_network_tokens;
+       int                             ln_network_tokens_nob;
+       /* test protocol compatibility flags */
+       int                             ln_testprotocompat;
+
+} lnet_t;
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/api-support.h b/drivers/staging/lustre/include/linux/lnet/linux/api-support.h
new file mode 100644 (file)
index 0000000..ca78a0a
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_API_SUPPORT_H__
+#define __LINUX_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <lnet/api-support.h> instead
+#endif
+
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h
new file mode 100644 (file)
index 0000000..d2c0a70
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LIB_LNET_H__
+#define __LNET_LINUX_LIB_LNET_H__
+
+#ifndef __LNET_LIB_LNET_H__
+#error Do not #include this file directly. #include <linux/lnet/lib-lnet.h> instead
+#endif
+
+# include <asm/page.h>
+# include <linux/string.h>
+# include <asm/io.h>
+# include <linux/libcfs/libcfs.h>
+
+static inline __u64
+lnet_page2phys (struct page *p)
+{
+       /* compiler optimizer will elide unused branches */
+
+       switch (sizeof(typeof(page_to_phys(p)))) {
+       case 4:
+               /* page_to_phys returns a 32 bit physical address.  This must
+                * be a 32 bit machine with <= 4G memory and we must ensure we
+                * don't sign extend when converting to 64 bits. */
+               return (unsigned long)page_to_phys(p);
+
+       case 8:
+               /* page_to_phys returns a 64 bit physical address :) */
+               return page_to_phys(p);
+
+       default:
+               LBUG();
+               return 0;
+       }
+}
+
+
+#define LNET_ROUTER
+
+#endif /* __LNET_LINUX_LIB_LNET_H__ */
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h
new file mode 100644 (file)
index 0000000..669e8c0
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LIB_TYPES_H__
+#define __LNET_LINUX_LIB_TYPES_H__
+
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include <linux/lnet/lib-types.h> instead
+#endif
+
+# include <linux/uio.h>
+# include <linux/types.h>
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lnet.h
new file mode 100644 (file)
index 0000000..1e888f1
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LNET_H__
+#define __LNET_LINUX_LNET_H__
+
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include <linux/lnet/lnet.h> instead
+#endif
+
+/*
+ * lnet.h
+ *
+ * User application interface file
+ */
+
+#include <linux/uio.h>
+#include <linux/types.h>
+
+#define cfs_tcp_sendpage(sk, page, offset, size, flags) \
+       tcp_sendpage(sk, page, offset, size, flags)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h b/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h
new file mode 100644 (file)
index 0000000..1bde44e
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_SYSCTL_H__
+#define __LNET_SYSCTL_H__
+
+#if defined(CONFIG_SYSCTL)
+
+
+#define CTL_KRANAL      201
+#define CTL_O2IBLND     205
+#define CTL_PTLLND      206
+#define CTL_QSWNAL      207
+#define CTL_SOCKLND     208
+#define CTL_GNILND      210
+
+
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnet.h b/drivers/staging/lustre/include/linux/lnet/lnet.h
new file mode 100644 (file)
index 0000000..c532b15
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_H__
+#define __LNET_H__
+
+/*
+ * lnet.h
+ *
+ * User application interface file
+ */
+#include <linux/lnet/linux/lnet.h>
+
+#include <linux/lnet/types.h>
+#include <linux/lnet/api.h>
+
+#define LNET_NIDSTR_COUNT  1024    /* # of nidstrings */
+#define LNET_NIDSTR_SIZE   32      /* size of each one (see below for usage) */
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnetctl.h b/drivers/staging/lustre/include/linux/lnet/lnetctl.h
new file mode 100644 (file)
index 0000000..b22daa2
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+#define LNET_DEV_ID 0
+#define LNET_DEV_PATH "/dev/lnet"
+#define LNET_DEV_MAJOR 10
+#define LNET_DEV_MINOR 240
+#define OBD_DEV_ID 1
+#define OBD_DEV_NAME "obd"
+#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME
+#define OBD_DEV_MAJOR 10
+#define OBD_DEV_MINOR 241
+#define SMFS_DEV_ID  2
+#define SMFS_DEV_PATH "/dev/snapdev"
+#define SMFS_DEV_MAJOR 10
+#define SMFS_DEV_MINOR 242
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_list_nids(int argc, char **argv);
+int jt_ptl_which_nid(int argc, char **argv);
+int jt_ptl_print_interfaces(int argc, char **argv);
+int jt_ptl_add_interface(int argc, char **argv);
+int jt_ptl_del_interface(int argc, char **argv);
+int jt_ptl_print_peers (int argc, char **argv);
+int jt_ptl_add_peer (int argc, char **argv);
+int jt_ptl_del_peer (int argc, char **argv);
+int jt_ptl_print_connections (int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_print_active_txs(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_notify_router (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+int jt_ptl_lwt(int argc, char **argv);
+int jt_ptl_testprotocompat(int argc, char **argv);
+int jt_ptl_memhog(int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnetst.h b/drivers/staging/lustre/include/linux/lnet/lnetst.h
new file mode 100644 (file)
index 0000000..d90f94e
--- /dev/null
@@ -0,0 +1,491 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lnetst.h
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LNET_ST_H__
+#define __LNET_ST_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+
+#define LST_FEAT_NONE          (0)
+#define LST_FEAT_BULK_LEN      (1 << 0)        /* enable variable page size */
+
+#define LST_FEATS_EMPTY                (LST_FEAT_NONE)
+#define LST_FEATS_MASK         (LST_FEAT_NONE | LST_FEAT_BULK_LEN)
+
+#define LST_NAME_SIZE     32         /* max name buffer length */
+
+#define LSTIO_DEBUG         0xC00         /* debug */
+#define LSTIO_SESSION_NEW       0xC01     /* create session */
+#define LSTIO_SESSION_END       0xC02     /* end session */
+#define LSTIO_SESSION_INFO      0xC03     /* query session */
+#define LSTIO_GROUP_ADD         0xC10     /* add group */
+#define LSTIO_GROUP_LIST       0xC11      /* list all groups in session */
+#define LSTIO_GROUP_INFO       0xC12      /* query default information of specified group */
+#define LSTIO_GROUP_DEL         0xC13     /* delete group */
+#define LSTIO_NODES_ADD         0xC14     /* add nodes to specified group */
+#define LSTIO_GROUP_UPDATE      0xC15     /* update group */
+#define LSTIO_BATCH_ADD         0xC20     /* add batch */
+#define LSTIO_BATCH_START       0xC21     /* start batch */
+#define LSTIO_BATCH_STOP       0xC22      /* stop batch */
+#define LSTIO_BATCH_DEL         0xC23     /* delete batch */
+#define LSTIO_BATCH_LIST       0xC24      /* show all batches in the session */
+#define LSTIO_BATCH_INFO       0xC25      /* show detail of specified batch */
+#define LSTIO_TEST_ADD   0xC26    /* add test (to batch) */
+#define LSTIO_BATCH_QUERY       0xC27     /* query batch status */
+#define LSTIO_STAT_QUERY       0xC30      /* get stats */
+
+typedef struct {
+       lnet_nid_t            ses_nid;          /* nid of console node */
+       __u64              ses_stamp;         /* time stamp */
+} lst_sid_t;                                       /*** session id */
+
+extern lst_sid_t LST_INVALID_SID;
+
+typedef struct {
+       __u64              bat_id;               /* unique id in session */
+} lst_bid_t;                                       /*** batch id (group of tests) */
+
+/* Status of test node */
+#define LST_NODE_ACTIVE         0x1                 /* node in this session */
+#define LST_NODE_BUSY     0x2               /* node is taken by other session */
+#define LST_NODE_DOWN     0x4               /* node is down */
+#define LST_NODE_UNKNOWN       0x8                  /* node not in session */
+
+typedef struct {
+       lnet_process_id_t       nde_id;          /* id of node */
+       int                  nde_state;       /* state of node */
+} lstcon_node_ent_t;                               /*** node entry, for list_group command */
+
+typedef struct {
+       int                  nle_nnode;       /* # of nodes */
+       int                  nle_nactive;           /* # of active nodes */
+       int                  nle_nbusy;       /* # of busy nodes */
+       int                  nle_ndown;       /* # of down nodes */
+       int                  nle_nunknown;         /* # of unknown nodes */
+} lstcon_ndlist_ent_t;                           /*** node_list entry, for list_batch command */
+
+typedef struct {
+       int                  tse_type;         /* test type */
+       int                  tse_loop;         /* loop count */
+       int                  tse_concur;             /* concurrency of test */
+} lstcon_test_ent_t;                               /*** test summary entry, for list_batch command */
+
+typedef struct {
+       int                  bae_state;       /* batch status */
+       int                  bae_timeout;           /* batch timeout */
+       int                  bae_ntest;       /* # of tests in the batch */
+} lstcon_batch_ent_t;                             /*** batch summary entry, for list_batch command */
+
+typedef struct {
+       lstcon_ndlist_ent_t     tbe_cli_nle;        /* client (group) node_list entry */
+       lstcon_ndlist_ent_t     tbe_srv_nle;        /* server (group) node_list entry */
+       union {
+               lstcon_test_ent_t  tbe_test;        /* test entry */
+               lstcon_batch_ent_t tbe_batch;      /* batch entry */
+       } u;
+} lstcon_test_batch_ent_t;                           /*** test/batch verbose information entry,
+                                                        *** for list_batch command */
+
+typedef struct {
+       struct list_head              rpe_link;        /* link chain */
+       lnet_process_id_t       rpe_peer;              /* peer's id */
+       struct timeval    rpe_stamp;          /* time stamp of RPC */
+       int                  rpe_state;       /* peer's state */
+       int                  rpe_rpc_errno;       /* RPC errno */
+
+       lst_sid_t              rpe_sid;         /* peer's session id */
+       int                  rpe_fwk_errno;       /* framework errno */
+       int                  rpe_priv[4];           /* private data */
+       char                rpe_payload[0];      /* private reply payload */
+} lstcon_rpc_ent_t;
+
+typedef struct {
+       int                  trs_rpc_stat[4];   /* RPCs stat (0: total, 1: success, 2: failure, 3: reserved) */
+       int                  trs_rpc_errno;       /* RPC errno */
+       int                  trs_fwk_stat[8];   /* framework stat */
+       int                  trs_fwk_errno;       /* errno of the first remote error */
+       void               *trs_fwk_private;    /* private framework stat */
+} lstcon_trans_stat_t;
+
+static inline int
+lstcon_rpc_stat_total(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0];
+}
+
+static inline int
+lstcon_rpc_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1];
+}
+
+static inline int
+lstcon_rpc_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2];
+}
+
+static inline int
+lstcon_sesop_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesop_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_active(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesqry_stat_busy(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_unknown(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_tsbop_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbop_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_idle(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbqry_stat_run(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_statqry_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_statqry_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
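+
+/*
+ * Usage sketch (illustrative only; 'stat' is a hypothetical caller-owned
+ * lstcon_trans_stat_t *): each accessor above reads the counter it names
+ * and, when 'inc' is non-zero, increments it first, e.g.
+ *
+ *     lstcon_rpc_stat_total(stat, 1);         count one more RPC
+ *     lstcon_rpc_stat_failure(stat, 0);       read-only query
+ */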
+
+/* create a session */
+typedef struct {
+       int                  lstio_ses_key;       /* IN: local key */
+       int                  lstio_ses_timeout;      /* IN: session timeout */
+       int                  lstio_ses_force;   /* IN: force create ? */
+       /** IN: session features */
+       unsigned                lstio_ses_feats;
+       lst_sid_t             *lstio_ses_idp;     /* OUT: session id */
+       int                  lstio_ses_nmlen;   /* IN: name length */
+       char               *lstio_ses_namep;    /* IN: session name */
+} lstio_session_new_args_t;
+
+/* query current session */
+typedef struct {
+       lst_sid_t             *lstio_ses_idp;     /* OUT: session id */
+       int                 *lstio_ses_keyp;     /* OUT: local key */
+       /** OUT: session features */
+       unsigned               *lstio_ses_featp;
+       lstcon_ndlist_ent_t    *lstio_ses_ndinfo;       /* OUT: */
+       int                  lstio_ses_nmlen;   /* IN: name length */
+       char               *lstio_ses_namep;    /* OUT: session name */
+} lstio_session_info_args_t;
+
+/* delete a session */
+typedef struct {
+       int                  lstio_ses_key;       /* IN: session key */
+} lstio_session_end_args_t;
+
+#define LST_OPC_SESSION         1
+#define LST_OPC_GROUP     2
+#define LST_OPC_NODES     3
+#define LST_OPC_BATCHCLI       4
+#define LST_OPC_BATCHSRV       5
+
+typedef struct {
+       int                  lstio_dbg_key;       /* IN: session key */
+       int                  lstio_dbg_type;     /* IN: debug session|batch|group|nodes list */
+       int                  lstio_dbg_flags;   /* IN: reserved debug flags */
+       int                  lstio_dbg_timeout;      /* IN: timeout of debug */
+
+       int                  lstio_dbg_nmlen;   /* IN: len of name */
+       char               *lstio_dbg_namep;    /* IN: name of group|batch */
+       int                  lstio_dbg_count;   /* IN: # of test nodes to debug */
+       lnet_process_id_t      *lstio_dbg_idsp;  /* IN: id of test nodes */
+       struct list_head             *lstio_dbg_resultp;      /* OUT: list head of result buffer */
+} lstio_debug_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_nmlen;   /* IN: name length */
+       char               *lstio_grp_namep;    /* IN: group name */
+} lstio_group_add_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_nmlen;   /* IN: name length */
+       char               *lstio_grp_namep;    /* IN: group name */
+} lstio_group_del_args_t;
+
+#define LST_GROUP_CLEAN         1                     /* remove inactive nodes in the group */
+#define LST_GROUP_REFRESH       2                     /* refresh inactive nodes in the group */
+#define LST_GROUP_RMND   3                    /* delete nodes from the group */
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_opc;       /* IN: OPC */
+       int                  lstio_grp_args;     /* IN: arguments */
+       int                  lstio_grp_nmlen;   /* IN: name length */
+       char               *lstio_grp_namep;    /* IN: group name */
+       int                  lstio_grp_count;   /* IN: # of nodes id */
+       lnet_process_id_t      *lstio_grp_idsp;  /* IN: array of nodes */
+       struct list_head             *lstio_grp_resultp;      /* OUT: list head of result buffer */
+} lstio_group_update_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_nmlen;   /* IN: name length */
+       char               *lstio_grp_namep;    /* IN: group name */
+       int                  lstio_grp_count;   /* IN: # of nodes */
+       /** OUT: session features */
+       unsigned               *lstio_grp_featp;
+       lnet_process_id_t      *lstio_grp_idsp;  /* IN: nodes */
+       struct list_head             *lstio_grp_resultp;      /* OUT: list head of result buffer */
+} lstio_group_nodes_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_idx;       /* IN: group idx */
+       int                  lstio_grp_nmlen;   /* IN: name len */
+       char               *lstio_grp_namep;    /* OUT: name */
+} lstio_group_list_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_nmlen;   /* IN: name len */
+       char               *lstio_grp_namep;    /* IN: name */
+       lstcon_ndlist_ent_t    *lstio_grp_entp;  /* OUT: description of group */
+
+       int                 *lstio_grp_idxp;     /* IN/OUT: node index */
+       int                 *lstio_grp_ndentp;       /* IN/OUT: # of nodent */
+       lstcon_node_ent_t      *lstio_grp_dentsp;       /* OUT: nodent array */
+} lstio_group_info_args_t;
+
+#define LST_DEFAULT_BATCH       "batch"                 /* default batch name */
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+} lstio_batch_add_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+} lstio_batch_del_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_timeout;      /* IN: timeout for the batch */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+       struct list_head             *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_run_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_force;   /* IN: abort unfinished test RPC */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+       struct list_head             *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_stop_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_testidx;      /* IN: test index */
+       int                  lstio_bat_client;       /* IN: is test client? */
+       int                  lstio_bat_timeout;      /* IN: timeout for waiting */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+       struct list_head             *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_query_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_idx;       /* IN: index */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+} lstio_batch_list_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: name */
+       int                  lstio_bat_server;       /* IN: query server or not */
+       int                  lstio_bat_testidx;      /* IN: test index */
+       lstcon_test_batch_ent_t *lstio_bat_entp;        /* OUT: batch ent */
+
+       int                 *lstio_bat_idxp;     /* IN/OUT: index of node */
+       int                 *lstio_bat_ndentp;       /* IN/OUT: # of nodent */
+       lstcon_node_ent_t      *lstio_bat_dentsp;       /* array of nodent */
+} lstio_batch_info_args_t;
+
+/* add stat in session */
+typedef struct {
+       int                  lstio_sta_key;       /* IN: session key */
+       int                  lstio_sta_timeout;      /* IN: timeout for stat request */
+       int                  lstio_sta_nmlen;   /* IN: group name length */
+       char               *lstio_sta_namep;    /* IN: group name */
+       int                  lstio_sta_count;   /* IN: # of pid */
+       lnet_process_id_t      *lstio_sta_idsp;  /* IN: pid */
+       struct list_head             *lstio_sta_resultp;      /* OUT: list head of result buffer */
+} lstio_stat_args_t;
+
+typedef enum {
+       LST_TEST_BULK   = 1,
+       LST_TEST_PING   = 2
+} lst_test_type_t;
+
+/* create a test in a batch */
+#define LST_MAX_CONCUR   1024              /* Max concurrency of test */
+
+typedef struct {
+       int                  lstio_tes_key;       /* IN: session key */
+       int                  lstio_tes_bat_nmlen;    /* IN: batch name len */
+       char               *lstio_tes_bat_name;     /* IN: batch name */
+       int                  lstio_tes_type;     /* IN: test type */
+       int                  lstio_tes_oneside;      /* IN: one sided test */
+       int                  lstio_tes_loop;     /* IN: loop count */
+       int                  lstio_tes_concur;       /* IN: concurrency */
+
+       int                  lstio_tes_dist;     /* IN: node distribution in destination groups */
+       int                  lstio_tes_span;     /* IN: node span in destination groups */
+       int                  lstio_tes_sgrp_nmlen;   /* IN: source group name length */
+       char               *lstio_tes_sgrp_name;    /* IN: group name */
+       int                  lstio_tes_dgrp_nmlen;   /* IN: destination group name length */
+       char               *lstio_tes_dgrp_name;    /* IN: group name */
+
+       int                  lstio_tes_param_len;    /* IN: param buffer len */
+       void               *lstio_tes_param;    /* IN: parameter for specified test:
+                                                              lstio_bulk_param_t,
+                                                              lstio_ping_param_t,
+                                                              ... more */
+       int                 *lstio_tes_retp;     /* OUT: private returned value */
+       struct list_head             *lstio_tes_resultp;      /* OUT: list head of result buffer */
+} lstio_test_args_t;
+
+typedef enum {
+       LST_BRW_READ    = 1,
+       LST_BRW_WRITE   = 2
+} lst_brw_type_t;
+
+typedef enum {
+       LST_BRW_CHECK_NONE   = 1,
+       LST_BRW_CHECK_SIMPLE = 2,
+       LST_BRW_CHECK_FULL   = 3
+} lst_brw_flags_t;
+
+typedef struct {
+       int                  blk_opc;           /* bulk operation code */
+       int                  blk_size;         /* size (bytes) */
+       int                  blk_time;         /* time of running the test*/
+       int                  blk_flags;       /* reserved flags */
+} lst_test_bulk_param_t;
+
+typedef struct {
+       int                  png_size;         /* size of ping message */
+       int                  png_time;         /* time */
+       int                  png_loop;         /* loop */
+       int                  png_flags;       /* reserved flags */
+} lst_test_ping_param_t;
+
+/* more tests */
+typedef struct {
+       __u32 errors;
+       __u32 rpcs_sent;
+       __u32 rpcs_rcvd;
+       __u32 rpcs_dropped;
+       __u32 rpcs_expired;
+       __u64 bulk_get;
+       __u64 bulk_put;
+} WIRE_ATTR srpc_counters_t;
+
+typedef struct {
+       /** milliseconds since current session started */
+       __u32 running_ms;
+       __u32 active_batches;
+       __u32 zombie_sessions;
+       __u32 brw_errors;
+       __u32 ping_errors;
+} WIRE_ATTR sfw_counters_t;
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd.h b/drivers/staging/lustre/include/linux/lnet/ptllnd.h
new file mode 100644 (file)
index 0000000..fc1ce8e
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/ptllnd.h
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ */
+
+/*
+ * The PTLLND was designed to support Portals with
+ * Lustre and non-lustre UNLINK semantics.
+ * However for now the two targets are Cray Portals
+ * on the XT3 and Lustre Portals (for testing) both
+ * have Lustre UNLINK semantics, so this is defined
+ * by default.
+ */
+#define LUSTRE_PORTALS_UNLINK_SEMANTICS
+
+
+#ifdef _USING_LUSTRE_PORTALS_
+
+/* NIDs are 64-bits on Lustre Portals */
+#define FMT_NID LPU64
+#define FMT_PID "%d"
+
+/* When using Lustre Portals, Lustre completion semantics are implicit */
+#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS      0
+
+#else /* _USING_CRAY_PORTALS_ */
+
+/* NIDs are integers on Cray Portals */
+#define FMT_NID "%u"
+#define FMT_PID "%d"
+
+/* When using Cray Portals this is defined in the Cray Portals header */
+/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */
+
+/* Can compare handles directly on Cray Portals */
+#define PtlHandleIsEqual(a,b) ((a) == (b))
+
+/* Different error types on Cray Portals */
+#define ptl_err_t ptl_ni_fail_t
+
+/*
+ * Cray Portals has no maximum number of IOVs.  The maximum is limited
+ * only by memory and the size of the int parameters (2^31-1).
+ * Lustre only really requires that the underlying implementation
+ * support at least LNET_MAX_IOV, so for Cray Portals we can safely
+ * just use that value here.
+ */
+#define PTL_MD_MAX_IOV   LNET_MAX_IOV
+
+#endif
+
+#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID
+
+/* Align incoming small request messages to an 8 byte boundary if this is
+ * supported to avoid alignment issues on some architectures */
+#ifndef PTL_MD_LOCAL_ALIGN8
+# define PTL_MD_LOCAL_ALIGN8 0
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h b/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h
new file mode 100644 (file)
index 0000000..7d12b3a
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/ptllnd_wire.h
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ */
+
+/* Minimum buffer size that any peer will post to receive ptllnd messages */
+#define PTLLND_MIN_BUFFER_SIZE  256
+
+/************************************************************************
+ * Tunable defaults that {u,k}lnds/ptllnd should have in common.
+ */
+
+#define PTLLND_PORTAL     9      /* The same portal PTLRPC uses when talking to Cray Portals */
+#define PTLLND_PID           9   /* The Portals PID */
+#define PTLLND_PEERCREDITS      8        /* concurrent sends to 1 peer */
+
+/* Default buffer size for kernel ptllnds (guaranteed eager) */
+#define PTLLND_MAX_KLND_MSG_SIZE 512
+
+/* Default buffer size for catamount ptllnds (not guaranteed eager) - large
+ * enough to avoid RDMA for anything sent while control is not in liblustre */
+#define PTLLND_MAX_ULND_MSG_SIZE 512
+
+
+/************************************************************************
+ * Portals LND Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+#define PTL_RESERVED_MATCHBITS  0x100  /* below this value is reserved;
+                                        * above is for bulk data transfer */
+#define LNET_MSG_MATCHBITS       0      /* the value for the message channel */
+
+typedef struct
+{
+       lnet_hdr_t      kptlim_hdr;          /* portals header */
+       char          kptlim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kptl_immediate_msg_t;
+
+typedef struct
+{
+       lnet_hdr_t      kptlrm_hdr;          /* portals header */
+       __u64        kptlrm_matchbits;       /* matchbits */
+} WIRE_ATTR kptl_rdma_msg_t;
+
+typedef struct
+{
+       __u64        kptlhm_matchbits;       /* matchbits */
+       __u32        kptlhm_max_msg_size;    /* max message size */
+} WIRE_ATTR kptl_hello_msg_t;
+
+typedef struct
+{
+       /* First 2 fields fixed FOR ALL TIME */
+       __u32      ptlm_magic;     /* I'm a Portals LND message */
+       __u16      ptlm_version;   /* this is my version number */
+       __u8        ptlm_type;      /* the message type */
+       __u8        ptlm_credits;   /* returned credits */
+       __u32      ptlm_nob;       /* # bytes in whole message */
+       __u32      ptlm_cksum;     /* checksum (0 == no checksum) */
+       __u64      ptlm_srcnid;    /* sender's NID */
+       __u64      ptlm_srcstamp;  /* sender's incarnation */
+       __u64      ptlm_dstnid;    /* destination's NID */
+       __u64      ptlm_dststamp;  /* destination's incarnation */
+       __u32      ptlm_srcpid;    /* sender's PID */
+       __u32      ptlm_dstpid;    /* destination's PID */
+
+        union {
+               kptl_immediate_msg_t    immediate;
+               kptl_rdma_msg_t  rdma;
+               kptl_hello_msg_t        hello;
+       } WIRE_ATTR ptlm_u;
+
+} kptl_msg_t;
+
+/* kptl_msg_t::ptlm_credits is only a __u8 */
+#define PTLLND_MSG_MAX_CREDITS ((typeof(((kptl_msg_t*) 0)->ptlm_credits)) -1)
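+/* i.e. the all-ones value of an 8-bit field, so at most 255 credits */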
+
+#define PTLLND_MSG_MAGIC               LNET_PROTO_PTL_MAGIC
+#define PTLLND_MSG_VERSION           0x04
+
+#define PTLLND_RDMA_OK           0x00
+#define PTLLND_RDMA_FAIL               0x01
+
+#define PTLLND_MSG_TYPE_INVALID         0x00
+#define PTLLND_MSG_TYPE_PUT         0x01
+#define PTLLND_MSG_TYPE_GET         0x02
+#define PTLLND_MSG_TYPE_IMMEDIATE       0x03    /* No bulk data xfer*/
+#define PTLLND_MSG_TYPE_NOOP       0x04
+#define PTLLND_MSG_TYPE_HELLO     0x05
+#define PTLLND_MSG_TYPE_NAK         0x06
diff --git a/drivers/staging/lustre/include/linux/lnet/socklnd.h b/drivers/staging/lustre/include/linux/lnet/socklnd.h
new file mode 100644 (file)
index 0000000..bacc749
--- /dev/null
@@ -0,0 +1,103 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/socklnd.h
+ *
+ * #defines shared between socknal implementation and utilities
+ */
+#ifndef __LNET_LNET_SOCKLND_H__
+#define __LNET_LNET_SOCKLND_H__
+
+#include <linux/lnet/types.h>
+#include <linux/lnet/lib-types.h>
+
+#define SOCKLND_CONN_NONE     (-1)
+#define SOCKLND_CONN_ANY       0
+#define SOCKLND_CONN_CONTROL    1
+#define SOCKLND_CONN_BULK_IN    2
+#define SOCKLND_CONN_BULK_OUT   3
+#define SOCKLND_CONN_NTYPES     4
+
+#define SOCKLND_CONN_ACK       SOCKLND_CONN_BULK_IN
+
+typedef struct {
+       __u32              kshm_magic;     /* magic number of socklnd message */
+       __u32              kshm_version;   /* version of socklnd message */
+       lnet_nid_t            kshm_src_nid;   /* sender's nid */
+       lnet_nid_t            kshm_dst_nid;   /* destination nid */
+       lnet_pid_t            kshm_src_pid;   /* sender's pid */
+       lnet_pid_t            kshm_dst_pid;   /* destination pid */
+       __u64              kshm_src_incarnation; /* sender's incarnation */
+       __u64              kshm_dst_incarnation; /* destination's incarnation */
+       __u32              kshm_ctype;     /* connection type */
+       __u32              kshm_nips;      /* # IP addrs */
+       __u32              kshm_ips[0];    /* IP addrs */
+} WIRE_ATTR ksock_hello_msg_t;
+
+typedef struct {
+       lnet_hdr_t            ksnm_hdr;       /* lnet hdr */
+
+       /*
+        * ksnm_payload is removed because of a winnt compiler limitation:
+        * a zero-sized array can only be placed at the tail of [nested]
+        * structure definitions. The lnet payload will be stored just after
+        * the body of structure ksock_lnet_msg_t.
+        */
+} WIRE_ATTR ksock_lnet_msg_t;
+
+typedef struct {
+       __u32              ksm_type;       /* type of socklnd message */
+       __u32              ksm_csum;       /* checksum if != 0 */
+       __u64              ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */
+       union {
+               ksock_lnet_msg_t lnetmsg;       /* lnet message, it's empty if it's NOOP */
+       } WIRE_ATTR ksm_u;
+} WIRE_ATTR ksock_msg_t;
+
+static inline void
+socklnd_init_msg(ksock_msg_t *msg, int type)
+{
+       msg->ksm_csum      = 0;
+       msg->ksm_type      = type;
+       msg->ksm_zc_cookies[0]  = msg->ksm_zc_cookies[1]  = 0;
+}
+
+#define KSOCK_MSG_NOOP   0xc0      /* ksm_u empty */
+#define KSOCK_MSG_LNET   0xc1      /* lnet msg */
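+
+/* Usage sketch (illustrative only): a NOOP carries no LNet message, so a
+ * sender would typically just do
+ *
+ *     ksock_msg_t msg;
+ *
+ *     socklnd_init_msg(&msg, KSOCK_MSG_NOOP);
+ *
+ * and fill in ksm_zc_cookies[] / ksm_csum afterwards as needed. */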
+
+/* We need to know this number to parse hello msgs from ksocklnd in
+ * other LNDs (usocklnd, for example) */
+#define KSOCK_PROTO_V2   2
+#define KSOCK_PROTO_V3   3
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/types.h b/drivers/staging/lustre/include/linux/lnet/types.h
new file mode 100644 (file)
index 0000000..4f63b7a
--- /dev/null
@@ -0,0 +1,503 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_TYPES_H__
+#define __LNET_TYPES_H__
+
+/** \addtogroup lnet
+ * @{ */
+
+#include <linux/libcfs/libcfs.h>
+
+/** \addtogroup lnet_addr
+ * @{ */
+
+/** Portal reserved for LNet's own use.
+ * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments.
+ */
+#define LNET_RESERVED_PORTAL      0
+
+/**
+ * Address of an end-point in an LNet network.
+ *
+ * A node can have multiple end-points and hence multiple addresses.
+ * An LNet network can be a simple network (e.g. tcp0) or a network of
+ * LNet networks connected by LNet routers. Therefore an end-point address
+ * has two parts: network ID, and address within a network.
+ *
+ * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID.
+ */
+typedef __u64 lnet_nid_t;
+/**
+ * ID of a process in a node. Shortened as PID to distinguish from
+ * lnet_process_id_t, the global process ID.
+ */
+typedef __u32 lnet_pid_t;
+
+/** wildcard NID that matches any end-point address */
+#define LNET_NID_ANY      ((lnet_nid_t) -1)
+/** wildcard PID that matches any lnet_pid_t */
+#define LNET_PID_ANY      ((lnet_pid_t) -1)
+
+#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */
+#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */
+
+#define LNET_TIME_FOREVER    (-1)
+
+/**
+ * Objects maintained by the LNet are accessed through handles. Handle types
+ * have names of the form lnet_handle_xx_t, where xx is one of the two letter
+ * object type codes ('eq' for event queue, 'md' for memory descriptor, and
+ * 'me' for match entry).
+ * Each type of object is given a unique handle type to enhance type checking.
+ * The type lnet_handle_any_t can be used when a generic handle is needed.
+ * Every handle value can be converted into a value of type lnet_handle_any_t
+ * without loss of information.
+ */
+typedef struct {
+       __u64    cookie;
+} lnet_handle_any_t;
+
+typedef lnet_handle_any_t lnet_handle_eq_t;
+typedef lnet_handle_any_t lnet_handle_md_t;
+typedef lnet_handle_any_t lnet_handle_me_t;
+
+#define LNET_WIRE_HANDLE_COOKIE_NONE   (-1)
+
+/**
+ * Invalidate handle \a h.
+ */
+static inline void LNetInvalidateHandle(lnet_handle_any_t *h)
+{
+       h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE;
+}
+
+/**
+ * Compare handles \a h1 and \a h2.
+ *
+ * \return 1 if handles are equal, 0 otherwise.
+ */
+static inline int LNetHandleIsEqual (lnet_handle_any_t h1, lnet_handle_any_t h2)
+{
+       return (h1.cookie == h2.cookie);
+}
+
+/**
+ * Check whether handle \a h is invalid.
+ *
+ * \return 1 if handle is invalid, 0 if valid.
+ */
+static inline int LNetHandleIsInvalid(lnet_handle_any_t h)
+{
+       return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie);
+}
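+
+/* Usage sketch (illustrative only; 'mdh' is a hypothetical caller-owned
+ * handle): handles carry no meaning until an LNet call fills them in, so
+ * a typical pattern is
+ *
+ *     lnet_handle_md_t mdh;
+ *
+ *     LNetInvalidateHandle(&mdh);
+ *     ...
+ *     if (!LNetHandleIsInvalid(mdh))
+ *             the MD behind 'mdh' is still live and may be compared
+ *             against another handle with LNetHandleIsEqual()
+ */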
+
+/**
+ * Global process ID.
+ */
+typedef struct {
+       /** node id */
+       lnet_nid_t nid;
+       /** process id */
+       lnet_pid_t pid;
+} lnet_process_id_t;
+/** @} lnet_addr */
+
+/** \addtogroup lnet_me
+ * @{ */
+
+/**
+ * Specifies whether the match entry or memory descriptor should be unlinked
+ * automatically (LNET_UNLINK) or not (LNET_RETAIN).
+ */
+typedef enum {
+       LNET_RETAIN = 0,
+       LNET_UNLINK
+} lnet_unlink_t;
+
+/**
+ * Values of the type lnet_ins_pos_t are used to control where a new match
+ * entry is inserted. The value LNET_INS_BEFORE is used to insert the new
+ * entry before the current entry or before the head of the list. The value
+ * LNET_INS_AFTER is used to insert the new entry after the current entry
+ * or after the last item in the list.
+ */
+typedef enum {
+       /** insert ME before current position or head of the list */
+       LNET_INS_BEFORE,
+       /** insert ME after current position or tail of the list */
+       LNET_INS_AFTER,
+       /** attach ME at tail of local CPU partition ME list */
+       LNET_INS_LOCAL
+} lnet_ins_pos_t;
+
+/** @} lnet_me */
+
+/** \addtogroup lnet_md
+ * @{ */
+
+/**
+ * Defines the visible parts of a memory descriptor. Values of this type
+ * are used to initialize memory descriptors.
+ */
+typedef struct {
+       /**
+        * Specify the memory region associated with the memory descriptor.
+        * If the options field has:
+        * - LNET_MD_KIOV bit set: The start field points to the starting
+        * address of an array of lnet_kiov_t and the length field specifies
+        * the number of entries in the array. The length can't be bigger
+        * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based
+        * fragments that are not necessarily mapped in virtual memory.
+        * - LNET_MD_IOVEC bit set: The start field points to the starting
+        * address of an array of struct iovec and the length field specifies
+        * the number of entries in the array. The length can't be bigger
+        * than LNET_MAX_IOV. The struct iovec is used to describe fragments
+        * that have virtual addresses.
+        * - Otherwise: The memory region is contiguous. The start field
+        * specifies the starting address for the memory region and the
+        * length field specifies its length.
+        *
+        * When the memory region is fragmented, all fragments but the first
+        * one must start on page boundary, and all but the last must end on
+        * page boundary.
+        */
+       void        *start;
+       unsigned int     length;
+       /**
+        * Specifies the maximum number of operations that can be performed
+        * on the memory descriptor. An operation is any action that could
+        * possibly generate an event. In the usual case, the threshold value
+        * is decremented for each operation on the MD. When the threshold
+        * drops to zero, the MD becomes inactive and does not respond to
+        * operations. A threshold value of LNET_MD_THRESH_INF indicates that
+        * there is no bound on the number of operations that may be applied
+        * to a MD.
+        */
+       int           threshold;
+       /**
+        * Specifies the largest incoming request that the memory descriptor
+        * should respond to. When the unused portion of a MD (length -
+        * local offset) falls below this value, the MD becomes inactive and
+        * does not respond to further operations. This value is only used
+        * if the LNET_MD_MAX_SIZE option is set.
+        */
+       int           max_size;
+       /**
+        * Specifies the behavior of the memory descriptor. A bitwise OR
+        * of the following values can be used:
+        * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD.
+        * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD.
+        * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory
+        *   region is provided by the incoming request. By default, the
+        *   offset is maintained locally. When maintained locally, the
+        *   offset is incremented by the length of the request so that
+        *   the next operation (PUT or GET) will access the next part of
+        *   the memory region. Note that only one offset variable exists
+        *   per memory descriptor. If both PUT and GET operations are
+        *   performed on a memory descriptor, the offset is updated each time.
+        * - LNET_MD_TRUNCATE: The length provided in the incoming request can
+        *   be reduced to match the memory available in the region (determined
+        *   by subtracting the offset from the length of the memory region).
+        *   By default, if the length in the incoming operation is greater
+        *   than the amount of memory available, the operation is rejected.
+        * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for
+        *   incoming PUT operations, even if requested. By default,
+        *   acknowledgments are sent for PUT operations that request an
+        *   acknowledgment. Acknowledgments are never sent for GET operations.
+        *   The data sent in the REPLY serves as an implicit acknowledgment.
+        * - LNET_MD_KIOV: The start and length fields specify an array of
+        *   lnet_kiov_t.
+        * - LNET_MD_IOVEC: The start and length fields specify an array of
+        *   struct iovec.
+        * - LNET_MD_MAX_SIZE: The max_size field is valid.
+        *
+        * Note:
+        * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather
+        *   capability for memory descriptors. They can't be both set.
+        * - When LNET_MD_MAX_SIZE is set, the total length of the memory
+        *   region (i.e. sum of all fragment lengths) must not be less than
+        *   \a max_size.
+        */
+       unsigned int     options;
+       /**
+        * A user-specified value that is associated with the memory
+        * descriptor. The value does not need to be a pointer, but must fit
+        * in the space used by a pointer. This value is recorded in events
+        * associated with operations on this MD.
+        */
+       void        *user_ptr;
+       /**
+        * A handle for the event queue used to log the operations performed on
+        * the memory region. If this argument is a NULL handle (i.e. nullified
+        * by LNetInvalidateHandle()), operations performed on this memory
+        * descriptor are not logged.
+        */
+       lnet_handle_eq_t eq_handle;
+} lnet_md_t;
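+
+/* Usage sketch (illustrative only; 'buffer', 'buffer_size' and 'eqh' are
+ * hypothetical caller-owned names): describing one contiguous region that
+ * accepts a single incoming PUT and logs its events to 'eqh':
+ *
+ *     lnet_md_t md = {
+ *             .start     = buffer,
+ *             .length    = buffer_size,
+ *             .threshold = 1,
+ *             .options   = LNET_MD_OP_PUT,
+ *             .user_ptr  = NULL,
+ *             .eq_handle = eqh,
+ *     };
+ */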
+
+/* Max Transfer Unit (minimum supported everywhere).
+ * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks)
+ * these limits are system wide and not interface-local. */
+#define LNET_MTU_BITS  20
+#define LNET_MTU       (1 << LNET_MTU_BITS)
+
+/** limit on the number of fragments in discontiguous MDs */
+#define LNET_MAX_IOV    256
+
+/* Max payload size */
+# define LNET_MAX_PAYLOAD      CONFIG_LNET_MAX_PAYLOAD
+# if (LNET_MAX_PAYLOAD < LNET_MTU)
+#  error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb"
+# else
+#  if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV))
+/*  PAGE_SIZE is a constant: check with cpp! */
+#   error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb"
+#  endif
+# endif
+
+/**
+ * Options for the MD structure. See lnet_md_t::options.
+ */
+#define LNET_MD_OP_PUT        (1 << 0)
+/** See lnet_md_t::options. */
+#define LNET_MD_OP_GET        (1 << 1)
+/** See lnet_md_t::options. */
+#define LNET_MD_MANAGE_REMOTE  (1 << 2)
+/* unused                          (1 << 3) */
+/** See lnet_md_t::options. */
+#define LNET_MD_TRUNCATE            (1 << 4)
+/** See lnet_md_t::options. */
+#define LNET_MD_ACK_DISABLE      (1 << 5)
+/** See lnet_md_t::options. */
+#define LNET_MD_IOVEC          (1 << 6)
+/** See lnet_md_t::options. */
+#define LNET_MD_MAX_SIZE            (1 << 7)
+/** See lnet_md_t::options. */
+#define LNET_MD_KIOV            (1 << 8)
+
+/* For compatibility with Cray Portals */
+#define LNET_MD_PHYS                    0
+
+/** Infinite threshold on MD operations. See lnet_md_t::threshold */
+#define LNET_MD_THRESH_INF       (-1)
+
+/* NB lustre portals uses struct iovec internally! */
+typedef struct iovec lnet_md_iovec_t;
+
+/**
+ * A page-based fragment of a MD.
+ */
+typedef struct {
+       /** Pointer to the page where the fragment resides */
+       struct page      *kiov_page;
+       /** Length in bytes of the fragment */
+       unsigned int     kiov_len;
+       /**
+        * Starting offset of the fragment within the page. Note that the
+        * end of the fragment must not pass the end of the page; i.e.,
+        * kiov_len + kiov_offset <= PAGE_CACHE_SIZE.
+        */
+       unsigned int     kiov_offset;
+} lnet_kiov_t;
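+
+/* Usage sketch (illustrative only; 'page' is a hypothetical struct page *):
+ * one whole-page fragment for an MD built with LNET_MD_KIOV:
+ *
+ *     lnet_kiov_t kiov = {
+ *             .kiov_page   = page,
+ *             .kiov_len    = PAGE_CACHE_SIZE,
+ *             .kiov_offset = 0,
+ *     };
+ */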
+/** @} lnet_md */
+
+/** \addtogroup lnet_eq
+ * @{ */
+
+/**
+ * Six types of events can be logged in an event queue.
+ */
+typedef enum {
+       /** An incoming GET operation has completed on the MD. */
+       LNET_EVENT_GET          = 1,
+       /**
+        * An incoming PUT operation has completed on the MD. The
+        * underlying layers will not alter the memory (on behalf of this
+        * operation) once this event has been logged.
+        */
+       LNET_EVENT_PUT,
+       /**
+        * A REPLY operation has completed. This event is logged after the
+        * data (if any) from the REPLY has been written into the MD.
+        */
+       LNET_EVENT_REPLY,
+       /** An acknowledgment has been received. */
+       LNET_EVENT_ACK,
+       /**
+        * An outgoing send (PUT or GET) operation has completed. This event
+        * is logged after the entire buffer has been sent and it is safe for
+        * the caller to reuse the buffer.
+        *
+        * Note:
+        * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can
+        *   happen even when the message has not yet been put out on wire.
+        * - It's unsafe to assume that in an outgoing GET operation
+        *   the LNET_EVENT_SEND event would happen before the
+        *   LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and
+        *   LNET_EVENT_ACK events in an outgoing PUT operation.
+        */
+       LNET_EVENT_SEND,
+       /**
+        * A MD has been unlinked. Note that LNetMDUnlink() does not
+        * necessarily trigger an LNET_EVENT_UNLINK event.
+        * \see LNetMDUnlink
+        */
+       LNET_EVENT_UNLINK,
+} lnet_event_kind_t;
+
+#define LNET_SEQ_BASETYPE       long
+typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t;
+#define LNET_SEQ_GT(a,b)       (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0)
+
+/* XXX
+ * cygwin needs the pragma line; it is not clear whether it is needed
+ * in other places.
+ */
+#ifdef __CYGWIN__
+#pragma pack(push, 4)
+#endif
+
+/**
+ * Information about an event on a MD.
+ */
+typedef struct {
+       /** The identifier (nid, pid) of the target. */
+       lnet_process_id_t   target;
+       /** The identifier (nid, pid) of the initiator. */
+       lnet_process_id_t   initiator;
+       /**
+        * The NID of the immediate sender. If the request has been forwarded
+        * by routers, this is the NID of the last hop; otherwise it's the
+        * same as the initiator.
+        */
+       lnet_nid_t        sender;
+       /** Indicates the type of the event. */
+       lnet_event_kind_t   type;
+       /** The portal table index specified in the request */
+       unsigned int    pt_index;
+       /** A copy of the match bits specified in the request. */
+       __u64          match_bits;
+       /** The length (in bytes) specified in the request. */
+       unsigned int    rlength;
+       /**
+        * The length (in bytes) of the data that was manipulated by the
+        * operation. For truncated operations, the manipulated length will be
+        * the number of bytes specified by the MD (possibly with an offset,
+        * see lnet_md_t). For all other operations, the manipulated length
+        * will be the length of the requested operation, i.e. rlength.
+        */
+       unsigned int    mlength;
+       /**
+        * The handle to the MD associated with the event. The handle may be
+        * invalid if the MD has been unlinked.
+        */
+       lnet_handle_md_t    md_handle;
+       /**
+        * A snapshot of the state of the MD immediately after the event has
+        * been processed. In particular, the threshold field in md will
+        * reflect the value of the threshold after the operation occurred.
+        */
+       lnet_md_t          md;
+       /**
+        * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT.
+        * \see LNetPut
+        */
+       __u64          hdr_data;
+       /**
+        * Indicates the completion status of the operation. It's 0 for
+        * successful operations, otherwise it's an error code.
+        */
+       int              status;
+       /**
+        * Indicates whether the MD has been unlinked. Note that:
+        * - An event with unlinked set is the last event on the MD.
+        * - This field is also set for an explicit LNET_EVENT_UNLINK event.
+        * \see LNetMDUnlink
+        */
+       int              unlinked;
+       /**
+        * The displacement (in bytes) into the memory region that the
+        * operation used. The offset can be determined by the operation for
+        * a remote managed MD or by the local MD.
+        * \see lnet_md_t::options
+        */
+       unsigned int    offset;
+       /**
+        * The sequence number for this event. Sequence numbers are unique
+        * to each event.
+        */
+       volatile lnet_seq_t sequence;
+} lnet_event_t;
+#ifdef __CYGWIN__
+#pragma pack(pop)
+#endif
+
+/**
+ * Event queue handler function type.
+ *
+ * The EQ handler runs for each event that is deposited into the EQ. The
+ * handler is supplied with a pointer to the event that triggered the
+ * handler invocation.
+ *
+ * The handler must not block, must be reentrant, and must not call any LNet
+ * API functions. It should return as quickly as possible.
+ */
+typedef void (*lnet_eq_handler_t)(lnet_event_t *event);
+#define LNET_EQ_HANDLER_NONE NULL
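+
+/* Sketch of a conforming handler (illustrative only): it neither blocks
+ * nor calls back into LNet; it only records failed events for a worker
+ * thread to act on (the 'note_failure()' helper is hypothetical):
+ *
+ *     static void my_eq_handler(lnet_event_t *ev)
+ *     {
+ *             if (ev->status != 0)
+ *                     note_failure(ev->type, ev->status);
+ *     }
+ */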
+/** @} lnet_eq */
+
+/** \addtogroup lnet_data
+ * @{ */
+
+/**
+ * Specify whether an acknowledgment should be sent by target when the PUT
+ * operation completes (i.e., when the data has been written to a MD of the
+ * target process).
+ *
+ * \see lnet_md_t::options for the discussion on LNET_MD_ACK_DISABLE by which
+ * acknowledgments can be disabled for a MD.
+ */
+typedef enum {
+       /** Request an acknowledgment */
+       LNET_ACK_REQ,
+       /** Request that no acknowledgment should be generated. */
+       LNET_NOACK_REQ
+} lnet_ack_req_t;
+/** @} lnet_data */
+
+/** @} lnet */
+#endif
diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig
new file mode 100644 (file)
index 0000000..00850ee
--- /dev/null
@@ -0,0 +1,40 @@
+config LNET
+       tristate "Lustre networking subsystem"
+       depends on LUSTRE_FS
+
+config LNET_MAX_PAYLOAD
+       int "Lustre lnet max transfer payload (default 1MB)"
+       depends on LUSTRE_FS
+       default "1048576"
+       help
+         This option defines the maximum size of payload in bytes that lnet
+         can put into its transport.
+
+         If unsure, use default.
+
+config LNET_SELFTEST
+       tristate "Lustre networking self testing"
+       depends on LNET
+       help
+         Choose Y here if you want to do lnet self testing. To compile this
+         as a module, choose M here: the module will be called lnet_selftest.
+
+         If unsure, say N.
+
+         See also http://wiki.lustre.org/
+
+config LNET_XPRT_IB
+       tristate "LNET infiniband support"
+       depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS
+       default LNET && INFINIBAND
+       help
+         This option allows LNET users to use InfiniBand as an RDMA-enabled
+         transport.
+
+         To compile this as a kernel module, choose M here and it will be
+         called ko2iblnd.
+
+         If unsure, say N.
diff --git a/drivers/staging/lustre/lnet/Makefile b/drivers/staging/lustre/lnet/Makefile
new file mode 100644 (file)
index 0000000..374212b
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) := klnds/ lnet/ selftest/
diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile
new file mode 100644 (file)
index 0000000..c23e4f6
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += o2iblnd/  socklnd/
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
new file mode 100644 (file)
index 0000000..71b7d84
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
+ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
+
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
new file mode 100644 (file)
index 0000000..f4b958b
--- /dev/null
@@ -0,0 +1,3256 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+lnd_t the_o2iblnd = {
+       .lnd_type       = O2IBLND,
+       .lnd_startup    = kiblnd_startup,
+       .lnd_shutdown   = kiblnd_shutdown,
+       .lnd_ctl        = kiblnd_ctl,
+       .lnd_query      = kiblnd_query,
+       .lnd_send       = kiblnd_send,
+       .lnd_recv       = kiblnd_recv,
+};
+
+kib_data_t           kiblnd_data;
+
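+/* Simple rotate-left-by-one-and-add checksum over 'nob' bytes; a result of
+ * zero is remapped to 1 so that 0 can keep meaning "no checksum". */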
+__u32
+kiblnd_cksum (void *ptr, int nob)
+{
+       char  *c  = ptr;
+       __u32  sum = 0;
+
+       while (nob-- > 0)
+               sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+       /* ensure I don't return 0 (== no checksum) */
+       return (sum == 0) ? 1 : sum;
+}
+
+static char *
+kiblnd_msgtype2str(int type)
+{
+       switch (type) {
+       case IBLND_MSG_CONNREQ:
+               return "CONNREQ";
+
+       case IBLND_MSG_CONNACK:
+               return "CONNACK";
+
+       case IBLND_MSG_NOOP:
+               return "NOOP";
+
+       case IBLND_MSG_IMMEDIATE:
+               return "IMMEDIATE";
+
+       case IBLND_MSG_PUT_REQ:
+               return "PUT_REQ";
+
+       case IBLND_MSG_PUT_NAK:
+               return "PUT_NAK";
+
+       case IBLND_MSG_PUT_ACK:
+               return "PUT_ACK";
+
+       case IBLND_MSG_PUT_DONE:
+               return "PUT_DONE";
+
+       case IBLND_MSG_GET_REQ:
+               return "GET_REQ";
+
+       case IBLND_MSG_GET_DONE:
+               return "GET_DONE";
+
+       default:
+               return "???";
+       }
+}
+
+static int
+kiblnd_msgtype2size(int type)
+{
+       const int hdr_size = offsetof(kib_msg_t, ibm_u);
+
+       switch (type) {
+       case IBLND_MSG_CONNREQ:
+       case IBLND_MSG_CONNACK:
+               return hdr_size + sizeof(kib_connparams_t);
+
+       case IBLND_MSG_NOOP:
+               return hdr_size;
+
+       case IBLND_MSG_IMMEDIATE:
+               return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
+
+       case IBLND_MSG_PUT_REQ:
+               return hdr_size + sizeof(kib_putreq_msg_t);
+
+       case IBLND_MSG_PUT_ACK:
+               return hdr_size + sizeof(kib_putack_msg_t);
+
+       case IBLND_MSG_GET_REQ:
+               return hdr_size + sizeof(kib_get_msg_t);
+
+       case IBLND_MSG_PUT_NAK:
+       case IBLND_MSG_PUT_DONE:
+       case IBLND_MSG_GET_DONE:
+               return hdr_size + sizeof(kib_completion_msg_t);
+       default:
+               return -1;
+       }
+}
+
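+/* Validate, and byte-swap if 'flip' is set, the RDMA descriptor carried by
+ * a GET_REQ or PUT_ACK message; returns non-zero if the descriptor is bad. */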
+static int
+kiblnd_unpack_rd(kib_msg_t *msg, int flip)
+{
+       kib_rdma_desc_t   *rd;
+       int             nob;
+       int             n;
+       int             i;
+
+       LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ ||
+                msg->ibm_type == IBLND_MSG_PUT_ACK);
+
+       rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
+                             &msg->ibm_u.get.ibgm_rd :
+                             &msg->ibm_u.putack.ibpam_rd;
+
+       if (flip) {
+               __swab32s(&rd->rd_key);
+               __swab32s(&rd->rd_nfrags);
+       }
+
+       n = rd->rd_nfrags;
+
+       if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
+               CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
+                      n, IBLND_MAX_RDMA_FRAGS);
+               return 1;
+       }
+
+       nob = offsetof (kib_msg_t, ibm_u) +
+             kiblnd_rd_msg_size(rd, msg->ibm_type, n);
+
+       if (msg->ibm_nob < nob) {
+               CERROR("Short %s: %d(%d)\n",
+                      kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
+               return 1;
+       }
+
+       if (!flip)
+               return 0;
+
+       for (i = 0; i < n; i++) {
+               __swab32s(&rd->rd_frags[i].rf_nob);
+               __swab64s(&rd->rd_frags[i].rf_addr);
+       }
+
+       return 0;
+}
+
+void
+kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+                int credits, lnet_nid_t dstnid, __u64 dststamp)
+{
+       kib_net_t *net = ni->ni_data;
+
+       /* CAVEAT EMPTOR! all message fields not set here should have been
+        * initialised previously. */
+       msg->ibm_magic    = IBLND_MSG_MAGIC;
+       msg->ibm_version  = version;
+       /*   ibm_type */
+       msg->ibm_credits  = credits;
+       /*   ibm_nob */
+       msg->ibm_cksum    = 0;
+       msg->ibm_srcnid   = ni->ni_nid;
+       msg->ibm_srcstamp = net->ibn_incarnation;
+       msg->ibm_dstnid   = dstnid;
+       msg->ibm_dststamp = dststamp;
+
+       if (*kiblnd_tunables.kib_cksum) {
+               /* NB ibm_cksum zero while computing cksum */
+               msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+       }
+}
+
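+/* Sanity-check an incoming message: magic, version, size and checksum, then
+ * byte-swap the header and type-specific fields if the sender's endianness
+ * differs from ours. */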
+int
+kiblnd_unpack_msg(kib_msg_t *msg, int nob)
+{
+       const int hdr_size = offsetof(kib_msg_t, ibm_u);
+       __u32     msg_cksum;
+       __u16     version;
+       int       msg_nob;
+       int       flip;
+
+       /* 6 bytes are enough to have received magic + version */
+       if (nob < 6) {
+               CERROR("Short message: %d\n", nob);
+               return -EPROTO;
+       }
+
+       if (msg->ibm_magic == IBLND_MSG_MAGIC) {
+               flip = 0;
+       } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
+               flip = 1;
+       } else {
+               CERROR("Bad magic: %08x\n", msg->ibm_magic);
+               return -EPROTO;
+       }
+
+       version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+       if (version != IBLND_MSG_VERSION &&
+           version != IBLND_MSG_VERSION_1) {
+               CERROR("Bad version: %x\n", version);
+               return -EPROTO;
+       }
+
+       if (nob < hdr_size) {
+               CERROR("Short message: %d\n", nob);
+               return -EPROTO;
+       }
+
+       msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+       if (msg_nob > nob) {
+               CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+               return -EPROTO;
+       }
+
+       /* checksum must be computed with ibm_cksum zero and BEFORE anything
+        * gets flipped */
+       msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+       msg->ibm_cksum = 0;
+       if (msg_cksum != 0 &&
+           msg_cksum != kiblnd_cksum(msg, msg_nob)) {
+               CERROR("Bad checksum\n");
+               return -EPROTO;
+       }
+
+       msg->ibm_cksum = msg_cksum;
+
+       if (flip) {
+               /* leave magic unflipped as a clue to peer endianness */
+               msg->ibm_version = version;
+               CLASSERT (sizeof(msg->ibm_type) == 1);
+               CLASSERT (sizeof(msg->ibm_credits) == 1);
+               msg->ibm_nob     = msg_nob;
+               __swab64s(&msg->ibm_srcnid);
+               __swab64s(&msg->ibm_srcstamp);
+               __swab64s(&msg->ibm_dstnid);
+               __swab64s(&msg->ibm_dststamp);
+       }
+
+       if (msg->ibm_srcnid == LNET_NID_ANY) {
+               CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+               return -EPROTO;
+       }
+
+       if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
+               CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
+                      msg_nob, kiblnd_msgtype2size(msg->ibm_type));
+               return -EPROTO;
+       }
+
+       switch (msg->ibm_type) {
+       default:
+               CERROR("Unknown message type %x\n", msg->ibm_type);
+               return -EPROTO;
+
+       case IBLND_MSG_NOOP:
+       case IBLND_MSG_IMMEDIATE:
+       case IBLND_MSG_PUT_REQ:
+               break;
+
+       case IBLND_MSG_PUT_ACK:
+       case IBLND_MSG_GET_REQ:
+               if (kiblnd_unpack_rd(msg, flip))
+                       return -EPROTO;
+               break;
+
+       case IBLND_MSG_PUT_NAK:
+       case IBLND_MSG_PUT_DONE:
+       case IBLND_MSG_GET_DONE:
+               if (flip)
+                       __swab32s(&msg->ibm_u.completion.ibcm_status);
+               break;
+
+       case IBLND_MSG_CONNREQ:
+       case IBLND_MSG_CONNACK:
+               if (flip) {
+                       __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
+                       __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
+                       __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+               }
+               break;
+       }
+       return 0;
+}
+
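+/* Allocate and initialise a peer for 'nid'; on success *peerp holds one
+ * reference for the caller. */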
+int
+kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+{
+       kib_peer_t      *peer;
+       kib_net_t       *net = ni->ni_data;
+       int             cpt = lnet_cpt_of_nid(nid);
+       unsigned long   flags;
+
+       LASSERT(net != NULL);
+       LASSERT(nid != LNET_NID_ANY);
+
+       LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
+       if (peer == NULL) {
+               CERROR("Cannot allocate peer\n");
+               return -ENOMEM;
+       }
+
+       memset(peer, 0, sizeof(*peer));  /* zero flags etc */
+
+       peer->ibp_ni = ni;
+       peer->ibp_nid = nid;
+       peer->ibp_error = 0;
+       peer->ibp_last_alive = 0;
+       atomic_set(&peer->ibp_refcount, 1);  /* 1 ref for caller */
+
+       INIT_LIST_HEAD(&peer->ibp_list);     /* not in the peer table yet */
+       INIT_LIST_HEAD(&peer->ibp_conns);
+       INIT_LIST_HEAD(&peer->ibp_tx_queue);
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       /* always called with a ref on ni, which prevents ni being shutdown */
+       LASSERT (net->ibn_shutdown == 0);
+
+       /* npeers only grows with the global lock held */
+       atomic_inc(&net->ibn_npeers);
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       *peerp = peer;
+       return 0;
+}
+
+void
+kiblnd_destroy_peer (kib_peer_t *peer)
+{
+       kib_net_t *net = peer->ibp_ni->ni_data;
+
+       LASSERT (net != NULL);
+       LASSERT (atomic_read(&peer->ibp_refcount) == 0);
+       LASSERT (!kiblnd_peer_active(peer));
+       LASSERT (peer->ibp_connecting == 0);
+       LASSERT (peer->ibp_accepting == 0);
+       LASSERT (list_empty(&peer->ibp_conns));
+       LASSERT (list_empty(&peer->ibp_tx_queue));
+
+       LIBCFS_FREE(peer, sizeof(*peer));
+
+       /* NB a peer's connections keep a reference on their peer until
+        * they are destroyed, so we can be assured that _all_ state to do
+        * with this peer has been cleaned up when its refcount drops to
+        * zero. */
+       atomic_dec(&net->ibn_npeers);
+}
+
+kib_peer_t *
+kiblnd_find_peer_locked (lnet_nid_t nid)
+{
+       /* the caller is responsible for accounting the additional reference
+        * that this creates */
+       struct list_head       *peer_list = kiblnd_nid2peerlist(nid);
+       struct list_head       *tmp;
+       kib_peer_t       *peer;
+
+       list_for_each (tmp, peer_list) {
+
+               peer = list_entry(tmp, kib_peer_t, ibp_list);
+
+               LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+                        peer->ibp_accepting > 0 ||
+                        !list_empty(&peer->ibp_conns));  /* active conn */
+
+               if (peer->ibp_nid != nid)
+                       continue;
+
+               CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
+                      peer, libcfs_nid2str(nid),
+                      atomic_read(&peer->ibp_refcount),
+                      peer->ibp_version);
+               return peer;
+       }
+       return NULL;
+}
+
+void
+kiblnd_unlink_peer_locked (kib_peer_t *peer)
+{
+       LASSERT (list_empty(&peer->ibp_conns));
+
+       LASSERT (kiblnd_peer_active(peer));
+       list_del_init(&peer->ibp_list);
+       /* lose peerlist's ref */
+       kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_get_peer_info (lnet_ni_t *ni, int index,
+                     lnet_nid_t *nidp, int *count)
+{
+       kib_peer_t          *peer;
+       struct list_head            *ptmp;
+       int                 i;
+       unsigned long     flags;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+
+               list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+
+                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                       LASSERT (peer->ibp_connecting > 0 ||
+                                peer->ibp_accepting > 0 ||
+                                !list_empty(&peer->ibp_conns));
+
+                       if (peer->ibp_ni != ni)
+                               continue;
+
+                       if (index-- > 0)
+                               continue;
+
+                       *nidp = peer->ibp_nid;
+                       *count = atomic_read(&peer->ibp_refcount);
+
+                       read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                              flags);
+                       return 0;
+               }
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+       return -ENOENT;
+}
+
+void
+kiblnd_del_peer_locked (kib_peer_t *peer)
+{
+       struct list_head           *ctmp;
+       struct list_head           *cnxt;
+       kib_conn_t         *conn;
+
+       if (list_empty(&peer->ibp_conns)) {
+               kiblnd_unlink_peer_locked(peer);
+       } else {
+               list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                       conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                       kiblnd_close_conn_locked(conn, 0);
+               }
+               /* NB closing peer's last conn unlinked it. */
+       }
+       /* NB peer now unlinked; might even be freed if the peer table had the
+        * last ref on it. */
+}
+
+int
+kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
+{
+       LIST_HEAD        (zombies);
+       struct list_head            *ptmp;
+       struct list_head            *pnxt;
+       kib_peer_t          *peer;
+       int                 lo;
+       int                 hi;
+       int                 i;
+       unsigned long     flags;
+       int                 rc = -ENOENT;
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (nid != LNET_NID_ANY) {
+               lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+       } else {
+               lo = 0;
+               hi = kiblnd_data.kib_peer_hash_size - 1;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                       LASSERT (peer->ibp_connecting > 0 ||
+                                peer->ibp_accepting > 0 ||
+                                !list_empty(&peer->ibp_conns));
+
+                       if (peer->ibp_ni != ni)
+                               continue;
+
+                       if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
+                               continue;
+
+                       if (!list_empty(&peer->ibp_tx_queue)) {
+                               LASSERT (list_empty(&peer->ibp_conns));
+
+                               list_splice_init(&peer->ibp_tx_queue,
+                                                    &zombies);
+                       }
+
+                       kiblnd_del_peer_locked(peer);
+                       rc = 0;  /* matched something */
+               }
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       kiblnd_txlist_done(ni, &zombies, -EIO);
+
+       return rc;
+}
+
+kib_conn_t *
+kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+       kib_peer_t          *peer;
+       struct list_head            *ptmp;
+       kib_conn_t          *conn;
+       struct list_head            *ctmp;
+       int                 i;
+       unsigned long     flags;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+               list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+
+                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                       LASSERT (peer->ibp_connecting > 0 ||
+                                peer->ibp_accepting > 0 ||
+                                !list_empty(&peer->ibp_conns));
+
+                       if (peer->ibp_ni != ni)
+                               continue;
+
+                       list_for_each (ctmp, &peer->ibp_conns) {
+                               if (index-- > 0)
+                                       continue;
+
+                               conn = list_entry(ctmp, kib_conn_t,
+                                                     ibc_list);
+                               kiblnd_conn_addref(conn);
+                               read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                                      flags);
+                               return conn;
+                       }
+               }
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+       return NULL;
+}
+
+void
+kiblnd_debug_rx (kib_rx_t *rx)
+{
+       CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
+              rx, rx->rx_status, rx->rx_msg->ibm_type,
+              rx->rx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_tx (kib_tx_t *tx)
+{
+       CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
+              "cookie "LPX64" msg %s%s type %x cred %d\n",
+              tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+              tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+              tx->tx_lntmsg[0] == NULL ? "-" : "!",
+              tx->tx_lntmsg[1] == NULL ? "-" : "!",
+              tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_conn (kib_conn_t *conn)
+{
+       struct list_head        *tmp;
+       int             i;
+
+       spin_lock(&conn->ibc_lock);
+
+       CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n",
+              atomic_read(&conn->ibc_refcount), conn,
+              conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+       CDEBUG(D_CONSOLE, "   state %d nposted %d/%d cred %d o_cred %d r_cred %d\n",
+              conn->ibc_state, conn->ibc_noops_posted,
+              conn->ibc_nsends_posted, conn->ibc_credits,
+              conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+       CDEBUG(D_CONSOLE, "   comms_err %d\n", conn->ibc_comms_error);
+
+       CDEBUG(D_CONSOLE, "   early_rxs:\n");
+       list_for_each(tmp, &conn->ibc_early_rxs)
+               kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
+
+       CDEBUG(D_CONSOLE, "   tx_noops:\n");
+       list_for_each(tmp, &conn->ibc_tx_noops)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
+       list_for_each(tmp, &conn->ibc_tx_queue_nocred)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
+       list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   tx_queue:\n");
+       list_for_each(tmp, &conn->ibc_tx_queue)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   active_txs:\n");
+       list_for_each(tmp, &conn->ibc_active_txs)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   rxs:\n");
+       for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++)
+               kiblnd_debug_rx(&conn->ibc_rxs[i]);
+
+       spin_unlock(&conn->ibc_lock);
+}
+
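+/* Map the MTU tunable (in bytes) onto the IB_MTU_* enum; 0 means "use the
+ * path default" and -1 means the value is unsupported. */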
+int
+kiblnd_translate_mtu(int value)
+{
+       switch (value) {
+       default:
+               return -1;
+       case 0:
+               return 0;
+       case 256:
+               return IB_MTU_256;
+       case 512:
+               return IB_MTU_512;
+       case 1024:
+               return IB_MTU_1024;
+       case 2048:
+               return IB_MTU_2048;
+       case 4096:
+               return IB_MTU_4096;
+       }
+}
+
+static void
+kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
+{
+       int        mtu;
+
+       /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
+       if (cmid->route.path_rec == NULL)
+               return;
+
+       mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
+       LASSERT (mtu >= 0);
+       if (mtu != 0)
+               cmid->route.path_rec->mtu = mtu;
+}
+
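+/* Spread connections over the device's completion vectors by hashing the
+ * peer NID across the CPUs of this connection's CPT. */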
+static int
+kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
+{
+       cpumask_t       *mask;
+       int             vectors;
+       int             off;
+       int             i;
+
+       vectors = conn->ibc_cmid->device->num_comp_vectors;
+       if (vectors <= 1)
+               return 0;
+
+       mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
+
+       /* hash NID to CPU id in this partition... */
+       off = conn->ibc_peer->ibp_nid % cpus_weight(*mask);
+       for_each_cpu_mask(i, *mask) {
+               if (off-- == 0)
+                       return i % vectors;
+       }
+
+       LBUG();
+       return 1;
+}
+
+kib_conn_t *
+kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+                  int state, int version)
+{
+       /* CAVEAT EMPTOR:
+        * If the new conn is created successfully it takes over the caller's
+        * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
+        * is destroyed.  On failure, the caller's ref on 'peer' remains and
+        * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
+        * to destroy 'cmid' here since I'm called from the CM which still has
+        * its ref on 'cmid'). */
+       rwlock_t                *glock = &kiblnd_data.kib_global_lock;
+       kib_net_t             *net = peer->ibp_ni->ni_data;
+       kib_dev_t             *dev;
+       struct ib_qp_init_attr *init_qp_attr;
+       struct kib_sched_info   *sched;
+       kib_conn_t              *conn;
+       struct ib_cq            *cq;
+       unsigned long           flags;
+       int                     cpt;
+       int                     rc;
+       int                     i;
+
+       LASSERT(net != NULL);
+       LASSERT(!in_interrupt());
+
+       dev = net->ibn_dev;
+
+       cpt = lnet_cpt_of_nid(peer->ibp_nid);
+       sched = kiblnd_data.kib_scheds[cpt];
+
+       LASSERT(sched->ibs_nthreads > 0);
+
+       LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
+                        sizeof(*init_qp_attr));
+       if (init_qp_attr == NULL) {
+               CERROR("Can't allocate qp_attr for %s\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               goto failed_0;
+       }
+
+       LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
+       if (conn == NULL) {
+               CERROR("Can't allocate connection for %s\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               goto failed_1;
+       }
+
+       conn->ibc_state = IBLND_CONN_INIT;
+       conn->ibc_version = version;
+       conn->ibc_peer = peer;            /* I take the caller's ref */
+       cmid->context = conn;              /* for future CM callbacks */
+       conn->ibc_cmid = cmid;
+
+       INIT_LIST_HEAD(&conn->ibc_early_rxs);
+       INIT_LIST_HEAD(&conn->ibc_tx_noops);
+       INIT_LIST_HEAD(&conn->ibc_tx_queue);
+       INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
+       INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
+       INIT_LIST_HEAD(&conn->ibc_active_txs);
+       spin_lock_init(&conn->ibc_lock);
+
+       LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
+                        sizeof(*conn->ibc_connvars));
+       if (conn->ibc_connvars == NULL) {
+               CERROR("Can't allocate in-progress connection state\n");
+               goto failed_2;
+       }
+
+       write_lock_irqsave(glock, flags);
+       if (dev->ibd_failover) {
+               write_unlock_irqrestore(glock, flags);
+               CERROR("%s: failover in progress\n", dev->ibd_ifname);
+               goto failed_2;
+       }
+
+       if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
+               /* wakeup failover thread and teardown connection */
+               if (kiblnd_dev_can_failover(dev)) {
+                       list_add_tail(&dev->ibd_fail_list,
+                                     &kiblnd_data.kib_failed_devs);
+                       wake_up(&kiblnd_data.kib_failover_waitq);
+               }
+
+               write_unlock_irqrestore(glock, flags);
+               CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
+                      cmid->device->name, dev->ibd_ifname);
+               goto failed_2;
+       }
+
+       kiblnd_hdev_addref_locked(dev->ibd_hdev);
+       conn->ibc_hdev = dev->ibd_hdev;
+
+       kiblnd_setup_mtu_locked(cmid);
+
+       write_unlock_irqrestore(glock, flags);
+
+       LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
+                        IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
+       if (conn->ibc_rxs == NULL) {
+               CERROR("Cannot allocate RX buffers\n");
+               goto failed_2;
+       }
+
+       rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
+                               IBLND_RX_MSG_PAGES(version));
+       if (rc != 0)
+               goto failed_2;
+
+       kiblnd_map_rx_descs(conn);
+
+       cq = ib_create_cq(cmid->device,
+                         kiblnd_cq_completion, kiblnd_cq_event, conn,
+                         IBLND_CQ_ENTRIES(version),
+                         kiblnd_get_completion_vector(conn, cpt));
+       if (IS_ERR(cq)) {
+               CERROR("Can't create CQ: %ld, cqe: %d\n",
+                      PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
+               goto failed_2;
+       }
+
+       conn->ibc_cq = cq;
+
+       rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+       if (rc != 0) {
+               CERROR("Can't request completion notification: %d\n", rc);
+               goto failed_2;
+       }
+
+       init_qp_attr->event_handler = kiblnd_qp_event;
+       init_qp_attr->qp_context = conn;
+       init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
+       init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
+       init_qp_attr->cap.max_send_sge = 1;
+       init_qp_attr->cap.max_recv_sge = 1;
+       init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+       init_qp_attr->qp_type = IB_QPT_RC;
+       init_qp_attr->send_cq = cq;
+       init_qp_attr->recv_cq = cq;
+
+       conn->ibc_sched = sched;
+
+       rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
+       if (rc != 0) {
+               CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
+                      rc, init_qp_attr->cap.max_send_wr,
+                      init_qp_attr->cap.max_recv_wr);
+               goto failed_2;
+       }
+
+       LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+
+       /* 1 ref for caller and each rxmsg */
+       atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
+       conn->ibc_nrx = IBLND_RX_MSGS(version);
+
+       /* post receives */
+       for (i = 0; i < IBLND_RX_MSGS(version); i++) {
+               rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+                                   IBLND_POSTRX_NO_CREDIT);
+               if (rc != 0) {
+                       CERROR("Can't post rxmsg: %d\n", rc);
+
+                       /* Make posted receives complete */
+                       kiblnd_abort_receives(conn);
+
+                       /* correct # of posted buffers
+                        * NB locking needed now I'm racing with completion */
+                       spin_lock_irqsave(&sched->ibs_lock, flags);
+                       conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
+                       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+                       /* cmid will be destroyed by CM(ofed) after cm_callback
+                        * returned, so we can't refer it anymore
+                        * (by kiblnd_connd()->kiblnd_destroy_conn) */
+                       rdma_destroy_qp(conn->ibc_cmid);
+                       conn->ibc_cmid = NULL;
+
+                       /* Drop my own and unused rxbuffer refcounts */
+                       while (i++ <= IBLND_RX_MSGS(version))
+                               kiblnd_conn_decref(conn);
+
+                       return NULL;
+               }
+       }
+
+       /* Init successful! */
+       LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
+                state == IBLND_CONN_PASSIVE_WAIT);
+       conn->ibc_state = state;
+
+       /* 1 more conn */
+       atomic_inc(&net->ibn_nconns);
+       return conn;
+
+ failed_2:
+       kiblnd_destroy_conn(conn);
+ failed_1:
+       LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+ failed_0:
+       return NULL;
+}
+
+void
+kiblnd_destroy_conn (kib_conn_t *conn)
+{
+       struct rdma_cm_id *cmid = conn->ibc_cmid;
+       kib_peer_t      *peer = conn->ibc_peer;
+       int             rc;
+
+       LASSERT (!in_interrupt());
+       LASSERT (atomic_read(&conn->ibc_refcount) == 0);
+       LASSERT (list_empty(&conn->ibc_early_rxs));
+       LASSERT (list_empty(&conn->ibc_tx_noops));
+       LASSERT (list_empty(&conn->ibc_tx_queue));
+       LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+       LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
+       LASSERT (list_empty(&conn->ibc_active_txs));
+       LASSERT (conn->ibc_noops_posted == 0);
+       LASSERT (conn->ibc_nsends_posted == 0);
+
+       switch (conn->ibc_state) {
+       default:
+               /* conn must be completely disengaged from the network */
+               LBUG();
+
+       case IBLND_CONN_DISCONNECTED:
+               /* connvars should have been freed already */
+               LASSERT (conn->ibc_connvars == NULL);
+               break;
+
+       case IBLND_CONN_INIT:
+               break;
+       }
+
+       /* conn->ibc_cmid might be destroyed by CM already */
+       if (cmid != NULL && cmid->qp != NULL)
+               rdma_destroy_qp(cmid);
+
+       if (conn->ibc_cq != NULL) {
+               rc = ib_destroy_cq(conn->ibc_cq);
+               if (rc != 0)
+                       CWARN("Error destroying CQ: %d\n", rc);
+       }
+
+       if (conn->ibc_rx_pages != NULL)
+               kiblnd_unmap_rx_descs(conn);
+
+       if (conn->ibc_rxs != NULL) {
+               LIBCFS_FREE(conn->ibc_rxs,
+                           IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
+       }
+
+       if (conn->ibc_connvars != NULL)
+               LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
+       if (conn->ibc_hdev != NULL)
+               kiblnd_hdev_decref(conn->ibc_hdev);
+
+       /* See CAVEAT EMPTOR above in kiblnd_create_conn */
+       if (conn->ibc_state != IBLND_CONN_INIT) {
+               kib_net_t *net = peer->ibp_ni->ni_data;
+
+               kiblnd_peer_decref(peer);
+               rdma_destroy_id(cmid);
+               atomic_dec(&net->ibn_nconns);
+       }
+
+       LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+int
+kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+       kib_conn_t           *conn;
+       struct list_head             *ctmp;
+       struct list_head             *cnxt;
+       int                  count = 0;
+
+       list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+               conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+               CDEBUG(D_NET, "Closing conn -> %s, "
+                             "version: %x, reason: %d\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      conn->ibc_version, why);
+
+               kiblnd_close_conn_locked(conn, why);
+               count++;
+       }
+
+       return count;
+}
+
+int
+kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+                                int version, __u64 incarnation)
+{
+       kib_conn_t           *conn;
+       struct list_head             *ctmp;
+       struct list_head             *cnxt;
+       int                  count = 0;
+
+       list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+               conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+               if (conn->ibc_version     == version &&
+                   conn->ibc_incarnation == incarnation)
+                       continue;
+
+               CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
+                             "incarnation:"LPX64"(%x, "LPX64")\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      conn->ibc_version, conn->ibc_incarnation,
+                      version, incarnation);
+
+               kiblnd_close_conn_locked(conn, -ESTALE);
+               count++;
+       }
+
+       return count;
+}
+
+int
+kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid)
+{
+       kib_peer_t           *peer;
+       struct list_head             *ptmp;
+       struct list_head             *pnxt;
+       int                  lo;
+       int                  hi;
+       int                  i;
+       unsigned long      flags;
+       int                  count = 0;
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (nid != LNET_NID_ANY)
+               lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+       else {
+               lo = 0;
+               hi = kiblnd_data.kib_peer_hash_size - 1;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+
+                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                       LASSERT (peer->ibp_connecting > 0 ||
+                                peer->ibp_accepting > 0 ||
+                                !list_empty(&peer->ibp_conns));
+
+                       if (peer->ibp_ni != ni)
+                               continue;
+
+                       if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
+                               continue;
+
+                       count += kiblnd_close_peer_conns_locked(peer, 0);
+               }
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       /* wildcards always succeed */
+       if (nid == LNET_NID_ANY)
+               return 0;
+
+       return (count == 0) ? -ENOENT : 0;
+}
+
+int
+kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+       struct libcfs_ioctl_data *data = arg;
+       int                    rc = -EINVAL;
+
+       switch (cmd) {
+       case IOC_LIBCFS_GET_PEER: {
+               lnet_nid_t   nid = 0;
+               int       count = 0;
+
+               rc = kiblnd_get_peer_info(ni, data->ioc_count,
+                                         &nid, &count);
+               data->ioc_nid    = nid;
+               data->ioc_count  = count;
+               break;
+       }
+
+       case IOC_LIBCFS_DEL_PEER: {
+               rc = kiblnd_del_peer(ni, data->ioc_nid);
+               break;
+       }
+       case IOC_LIBCFS_GET_CONN: {
+               kib_conn_t *conn;
+
+               rc = 0;
+               conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
+               if (conn == NULL) {
+                       rc = -ENOENT;
+                       break;
+               }
+
+               LASSERT (conn->ibc_cmid != NULL);
+               data->ioc_nid = conn->ibc_peer->ibp_nid;
+               if (conn->ibc_cmid->route.path_rec == NULL)
+                       data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+               else
+                       data->ioc_u32[0] =
+                       ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+               kiblnd_conn_decref(conn);
+               break;
+       }
+       case IOC_LIBCFS_CLOSE_CONNECTION: {
+               rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
+               break;
+       }
+
+       default:
+               break;
+       }
+
+       return rc;
+}
+
+void
+kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+       cfs_time_t      last_alive = 0;
+       cfs_time_t      now = cfs_time_current();
+       rwlock_t        *glock = &kiblnd_data.kib_global_lock;
+       kib_peer_t      *peer;
+       unsigned long   flags;
+
+       read_lock_irqsave(glock, flags);
+
+       peer = kiblnd_find_peer_locked(nid);
+       if (peer != NULL) {
+               LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+                        peer->ibp_accepting > 0 ||
+                        !list_empty(&peer->ibp_conns));  /* active conn */
+               last_alive = peer->ibp_last_alive;
+       }
+
+       read_unlock_irqrestore(glock, flags);
+
+       if (last_alive != 0)
+               *when = last_alive;
+
+       /* peer is not persistent in hash, trigger peer creation
+        * and connection establishment with a NULL tx */
+       if (peer == NULL)
+               kiblnd_launch_tx(ni, NULL, nid);
+
+       CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+              libcfs_nid2str(nid), peer,
+              last_alive ? cfs_duration_sec(now - last_alive) : -1);
+       return;
+}
+
+void
+kiblnd_free_pages(kib_pages_t *p)
+{
+       int     npages = p->ibp_npages;
+       int     i;
+
+       for (i = 0; i < npages; i++) {
+               if (p->ibp_pages[i] != NULL)
+                       __free_page(p->ibp_pages[i]);
+       }
+
+       LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
+{
+       kib_pages_t     *p;
+       int             i;
+
+       LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
+                        offsetof(kib_pages_t, ibp_pages[npages]));
+       if (p == NULL) {
+               CERROR("Can't allocate descriptor for %d pages\n", npages);
+               return -ENOMEM;
+       }
+
+       memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+       p->ibp_npages = npages;
+
+       for (i = 0; i < npages; i++) {
+               p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
+                                                    __GFP_IO);
+               if (p->ibp_pages[i] == NULL) {
+                       CERROR("Can't allocate page %d of %d\n", i, npages);
+                       kiblnd_free_pages(p);
+                       return -ENOMEM;
+               }
+       }
+
+       *pp = p;
+       return 0;
+}
+
+void
+kiblnd_unmap_rx_descs(kib_conn_t *conn)
+{
+       kib_rx_t *rx;
+       int       i;
+
+       LASSERT (conn->ibc_rxs != NULL);
+       LASSERT (conn->ibc_hdev != NULL);
+
+       for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+               rx = &conn->ibc_rxs[i];
+
+               LASSERT (rx->rx_nob >= 0); /* not posted */
+
+               kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
+                                       KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
+                                                         rx->rx_msgaddr),
+                                       IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+       }
+
+       kiblnd_free_pages(conn->ibc_rx_pages);
+
+       conn->ibc_rx_pages = NULL;
+}
+
+void
+kiblnd_map_rx_descs(kib_conn_t *conn)
+{
+       kib_rx_t       *rx;
+       struct page    *pg;
+       int          pg_off;
+       int          ipg;
+       int          i;
+
+       for (pg_off = ipg = i = 0;
+            i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+               pg = conn->ibc_rx_pages->ibp_pages[ipg];
+               rx = &conn->ibc_rxs[i];
+
+               rx->rx_conn = conn;
+               rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
+
+               rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
+                                                      rx->rx_msg, IBLND_MSG_SIZE,
+                                                      DMA_FROM_DEVICE);
+               LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
+                                                  rx->rx_msgaddr));
+               KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
+
+               CDEBUG(D_NET, "rx %d: %p "LPX64"("LPX64")\n",
+                      i, rx->rx_msg, rx->rx_msgaddr,
+                      lnet_page2phys(pg) + pg_off);
+
+               pg_off += IBLND_MSG_SIZE;
+               LASSERT (pg_off <= PAGE_SIZE);
+
+               if (pg_off == PAGE_SIZE) {
+                       pg_off = 0;
+                       ipg++;
+                       LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
+               }
+       }
+}
+
+static void
+kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
+{
+       kib_hca_dev_t  *hdev = tpo->tpo_hdev;
+       kib_tx_t       *tx;
+       int          i;
+
+       LASSERT (tpo->tpo_pool.po_allocated == 0);
+
+       if (hdev == NULL)
+               return;
+
+       for (i = 0; i < tpo->tpo_pool.po_size; i++) {
+               tx = &tpo->tpo_tx_descs[i];
+               kiblnd_dma_unmap_single(hdev->ibh_ibdev,
+                                       KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
+                                                         tx->tx_msgaddr),
+                                       IBLND_MSG_SIZE, DMA_TO_DEVICE);
+       }
+
+       kiblnd_hdev_decref(hdev);
+       tpo->tpo_hdev = NULL;
+}
+
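+/* Wait for any failover in progress to complete, then return the device's
+ * current HCA descriptor with a reference held. */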
+static kib_hca_dev_t *
+kiblnd_current_hdev(kib_dev_t *dev)
+{
+       kib_hca_dev_t *hdev;
+       unsigned long  flags;
+       int         i = 0;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       while (dev->ibd_failover) {
+               read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+               if (i++ % 50 == 0)
+                       CDEBUG(D_NET, "%s: Wait for failover\n",
+                              dev->ibd_ifname);
+               schedule_timeout(cfs_time_seconds(1) / 100);
+
+               read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       }
+
+       kiblnd_hdev_addref_locked(dev->ibd_hdev);
+       hdev = dev->ibd_hdev;
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       return hdev;
+}
+
+static void
+kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
+{
+       kib_pages_t    *txpgs = tpo->tpo_tx_pages;
+       kib_pool_t     *pool  = &tpo->tpo_pool;
+       kib_net_t      *net   = pool->po_owner->ps_net;
+       kib_dev_t      *dev;
+       struct page    *page;
+       kib_tx_t       *tx;
+       int          page_offset;
+       int          ipage;
+       int          i;
+
+       LASSERT (net != NULL);
+
+       dev = net->ibn_dev;
+
+       /* pre-mapped messages are not bigger than 1 page */
+       CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
+
+       /* No fancy arithmetic when we do the buffer calculations */
+       CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
+
+       tpo->tpo_hdev = kiblnd_current_hdev(dev);
+
+       for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
+               page = txpgs->ibp_pages[ipage];
+               tx = &tpo->tpo_tx_descs[i];
+
+               tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+                                          page_offset);
+
+               tx->tx_msgaddr = kiblnd_dma_map_single(
+                       tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
+                       IBLND_MSG_SIZE, DMA_TO_DEVICE);
+               LASSERT (!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
+                                                  tx->tx_msgaddr));
+               KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
+
+               list_add(&tx->tx_list, &pool->po_free_list);
+
+               page_offset += IBLND_MSG_SIZE;
+               LASSERT (page_offset <= PAGE_SIZE);
+
+               if (page_offset == PAGE_SIZE) {
+                       page_offset = 0;
+                       ipage++;
+                       LASSERT (ipage <= txpgs->ibp_npages);
+               }
+       }
+}
+
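+/* Return the pre-registered DMA MR covering [addr, addr + size), or NULL if
+ * the range spans more than one MR. */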
+struct ib_mr *
+kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
+{
+       __u64   index;
+
+       LASSERT (hdev->ibh_mrs[0] != NULL);
+
+       if (hdev->ibh_nmrs == 1)
+               return hdev->ibh_mrs[0];
+
+       index = addr >> hdev->ibh_mr_shift;
+
+       if (index <  hdev->ibh_nmrs &&
+           index == ((addr + size - 1) >> hdev->ibh_mr_shift))
+               return hdev->ibh_mrs[index];
+
+       return NULL;
+}
+
+struct ib_mr *
+kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
+{
+       struct ib_mr *prev_mr;
+       struct ib_mr *mr;
+       int        i;
+
+       LASSERT (hdev->ibh_mrs[0] != NULL);
+
+       if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+           *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
+               return NULL;
+
+       if (hdev->ibh_nmrs == 1)
+               return hdev->ibh_mrs[0];
+
+       for (i = 0, mr = prev_mr = NULL;
+            i < rd->rd_nfrags; i++) {
+               mr = kiblnd_find_dma_mr(hdev,
+                                       rd->rd_frags[i].rf_addr,
+                                       rd->rd_frags[i].rf_nob);
+               if (prev_mr == NULL)
+                       prev_mr = mr;
+
+               if (mr == NULL || prev_mr != mr) {
+                       /* Can't be covered by a single MR */
+                       mr = NULL;
+                       break;
+               }
+       }
+
+       return mr;
+}
+
+void
+kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+{
+       LASSERT (pool->fpo_map_count == 0);
+
+       if (pool->fpo_fmr_pool != NULL)
+               ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+
+       if (pool->fpo_hdev != NULL)
+               kiblnd_hdev_decref(pool->fpo_hdev);
+
+       LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+}
+
+void
+kiblnd_destroy_fmr_pool_list(struct list_head *head)
+{
+       kib_fmr_pool_t *pool;
+
+       while (!list_empty(head)) {
+               pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
+               list_del(&pool->fpo_list);
+               kiblnd_destroy_fmr_pool(pool);
+       }
+}
+
+static int kiblnd_fmr_pool_size(int ncpts)
+{
+       int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+
+       return max(IBLND_FMR_POOL, size);
+}
+
+static int kiblnd_fmr_flush_trigger(int ncpts)
+{
+       int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+
+       return max(IBLND_FMR_POOL_FLUSH, size);
+}
+
+int
+kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
+{
+       /* FMR pool for RDMA */
+       kib_dev_t              *dev = fps->fps_net->ibn_dev;
+       kib_fmr_pool_t    *fpo;
+       struct ib_fmr_pool_param param = {
+               .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
+               .page_shift        = PAGE_SHIFT,
+               .access            = (IB_ACCESS_LOCAL_WRITE |
+                                     IB_ACCESS_REMOTE_WRITE),
+               .pool_size         = fps->fps_pool_size,
+               .dirty_watermark   = fps->fps_flush_trigger,
+               .flush_function    = NULL,
+               .flush_arg         = NULL,
+               .cache             = !!*kiblnd_tunables.kib_fmr_cache};
+       int rc;
+
+       LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
+       if (fpo == NULL)
+               return -ENOMEM;
+
+       fpo->fpo_hdev = kiblnd_current_hdev(dev);
+
+       fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
+       if (IS_ERR(fpo->fpo_fmr_pool)) {
+               rc = PTR_ERR(fpo->fpo_fmr_pool);
+               CERROR("Failed to create FMR pool: %d\n", rc);
+
+               kiblnd_hdev_decref(fpo->fpo_hdev);
+               LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
+               return rc;
+       }
+
+       fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+       fpo->fpo_owner    = fps;
+       *pp_fpo = fpo;
+
+       return 0;
+}
+
+static void
+kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies)
+{
+       if (fps->fps_net == NULL) /* initialized? */
+               return;
+
+       spin_lock(&fps->fps_lock);
+
+       while (!list_empty(&fps->fps_pool_list)) {
+               kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next,
+                                                kib_fmr_pool_t, fpo_list);
+               fpo->fpo_failed = 1;
+               list_del(&fpo->fpo_list);
+               if (fpo->fpo_map_count == 0)
+                       list_add(&fpo->fpo_list, zombies);
+               else
+                       list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
+       }
+
+       spin_unlock(&fps->fps_lock);
+}
+
+static void
+kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
+{
+       if (fps->fps_net != NULL) { /* initialized? */
+               kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
+               kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
+       }
+}
+
+static int
+kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net,
+                       int pool_size, int flush_trigger)
+{
+       kib_fmr_pool_t *fpo;
+       int          rc;
+
+       memset(fps, 0, sizeof(kib_fmr_poolset_t));
+
+       fps->fps_net = net;
+       fps->fps_cpt = cpt;
+       fps->fps_pool_size = pool_size;
+       fps->fps_flush_trigger = flush_trigger;
+       spin_lock_init(&fps->fps_lock);
+       INIT_LIST_HEAD(&fps->fps_pool_list);
+       INIT_LIST_HEAD(&fps->fps_failed_pool_list);
+
+       rc = kiblnd_create_fmr_pool(fps, &fpo);
+       if (rc == 0)
+               list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+
+       return rc;
+}
+
+static int
+kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now)
+{
+       if (fpo->fpo_map_count != 0) /* still in use */
+               return 0;
+       if (fpo->fpo_failed)
+               return 1;
+       return cfs_time_aftereq(now, fpo->fpo_deadline);
+}
+
+void
+kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
+{
+       LIST_HEAD     (zombies);
+       kib_fmr_pool_t    *fpo = fmr->fmr_pool;
+       kib_fmr_poolset_t *fps = fpo->fpo_owner;
+       cfs_time_t       now = cfs_time_current();
+       kib_fmr_pool_t    *tmp;
+       int             rc;
+
+       rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+       LASSERT (rc == 0);
+
+       if (status != 0) {
+               rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
+               LASSERT (rc == 0);
+       }
+
+       fmr->fmr_pool = NULL;
+       fmr->fmr_pfmr = NULL;
+
+       spin_lock(&fps->fps_lock);
+       fpo->fpo_map_count--;   /* decref the pool */
+
+       list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
+               /* the first pool is persistent */
+               if (fps->fps_pool_list.next == &fpo->fpo_list)
+                       continue;
+
+               if (kiblnd_fmr_pool_is_idle(fpo, now)) {
+                       list_move(&fpo->fpo_list, &zombies);
+                       fps->fps_version++;
+               }
+       }
+       spin_unlock(&fps->fps_lock);
+
+       if (!list_empty(&zombies))
+               kiblnd_destroy_fmr_pool_list(&zombies);
+}
+
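+/* Map 'npages' pages through an FMR pool of this poolset, growing the
+ * poolset if every pool is busy; only one thread grows it at a time. */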
+int
+kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
+                   __u64 iov, kib_fmr_t *fmr)
+{
+       struct ib_pool_fmr *pfmr;
+       kib_fmr_pool_t     *fpo;
+       __u64          version;
+       int              rc;
+
+ again:
+       spin_lock(&fps->fps_lock);
+       version = fps->fps_version;
+       list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
+               fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+               fpo->fpo_map_count++;
+               spin_unlock(&fps->fps_lock);
+
+               pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
+                                           pages, npages, iov);
+               if (likely(!IS_ERR(pfmr))) {
+                       fmr->fmr_pool = fpo;
+                       fmr->fmr_pfmr = pfmr;
+                       return 0;
+               }
+
+               spin_lock(&fps->fps_lock);
+               fpo->fpo_map_count--;
+               if (PTR_ERR(pfmr) != -EAGAIN) {
+                       spin_unlock(&fps->fps_lock);
+                       return PTR_ERR(pfmr);
+               }
+
+               /* EAGAIN and ... */
+               if (version != fps->fps_version) {
+                       spin_unlock(&fps->fps_lock);
+                       goto again;
+               }
+       }
+
+       if (fps->fps_increasing) {
+               spin_unlock(&fps->fps_lock);
+               CDEBUG(D_NET, "Another thread is allocating a new "
+                      "FMR pool, waiting for it to complete\n");
+               schedule();
+               goto again;
+
+       }
+
+       if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) {
+               /* someone failed recently */
+               spin_unlock(&fps->fps_lock);
+               return -EAGAIN;
+       }
+
+       fps->fps_increasing = 1;
+       spin_unlock(&fps->fps_lock);
+
+       CDEBUG(D_NET, "Allocate new FMR pool\n");
+       rc = kiblnd_create_fmr_pool(fps, &fpo);
+       spin_lock(&fps->fps_lock);
+       fps->fps_increasing = 0;
+       if (rc == 0) {
+               fps->fps_version++;
+               list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+       } else {
+               fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+       }
+       spin_unlock(&fps->fps_lock);
+
+       goto again;
+}
+
+static void
+kiblnd_fini_pool(kib_pool_t *pool)
+{
+       LASSERT (list_empty(&pool->po_free_list));
+       LASSERT (pool->po_allocated == 0);
+
+       CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
+}
+
+static void
+kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
+{
+       CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
+
+       memset(pool, 0, sizeof(kib_pool_t));
+       INIT_LIST_HEAD(&pool->po_free_list);
+       pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+       pool->po_owner    = ps;
+       pool->po_size     = size;
+}
+
+void
+kiblnd_destroy_pool_list(struct list_head *head)
+{
+       kib_pool_t *pool;
+
+       while (!list_empty(head)) {
+               pool = list_entry(head->next, kib_pool_t, po_list);
+               list_del(&pool->po_list);
+
+               LASSERT (pool->po_owner != NULL);
+               pool->po_owner->ps_pool_destroy(pool);
+       }
+}
+
+static void
+kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
+{
+       if (ps->ps_net == NULL) /* initialized? */
+               return;
+
+       spin_lock(&ps->ps_lock);
+       while (!list_empty(&ps->ps_pool_list)) {
+               kib_pool_t *po = list_entry(ps->ps_pool_list.next,
+                                           kib_pool_t, po_list);
+               po->po_failed = 1;
+               list_del(&po->po_list);
+               if (po->po_allocated == 0)
+                       list_add(&po->po_list, zombies);
+               else
+                       list_add(&po->po_list, &ps->ps_failed_pool_list);
+       }
+       spin_unlock(&ps->ps_lock);
+}
+
+static void
+kiblnd_fini_poolset(kib_poolset_t *ps)
+{
+       if (ps->ps_net != NULL) { /* initialized? */
+               kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
+               kiblnd_destroy_pool_list(&ps->ps_pool_list);
+       }
+}
+
+static int
+kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
+                   kib_net_t *net, char *name, int size,
+                   kib_ps_pool_create_t po_create,
+                   kib_ps_pool_destroy_t po_destroy,
+                   kib_ps_node_init_t nd_init,
+                   kib_ps_node_fini_t nd_fini)
+{
+       kib_pool_t      *pool;
+       int             rc;
+
+       memset(ps, 0, sizeof(kib_poolset_t));
+
+       ps->ps_cpt          = cpt;
+       ps->ps_net        = net;
+       ps->ps_pool_create  = po_create;
+       ps->ps_pool_destroy = po_destroy;
+       ps->ps_node_init    = nd_init;
+       ps->ps_node_fini    = nd_fini;
+       ps->ps_pool_size    = size;
+       if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
+           >= sizeof(ps->ps_name))
+               return -E2BIG;
+       spin_lock_init(&ps->ps_lock);
+       INIT_LIST_HEAD(&ps->ps_pool_list);
+       INIT_LIST_HEAD(&ps->ps_failed_pool_list);
+
+       rc = ps->ps_pool_create(ps, size, &pool);
+       if (rc == 0)
+               list_add(&pool->po_list, &ps->ps_pool_list);
+       else
+               CERROR("Failed to create the first pool for %s\n", ps->ps_name);
+
+       return rc;
+}
+
+static int
+kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
+{
+       if (pool->po_allocated != 0) /* still in use */
+               return 0;
+       if (pool->po_failed)
+               return 1;
+       return cfs_time_aftereq(now, pool->po_deadline);
+}
+
+void
+kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
+{
+       LIST_HEAD  (zombies);
+       kib_poolset_t  *ps = pool->po_owner;
+       kib_pool_t     *tmp;
+       cfs_time_t      now = cfs_time_current();
+
+       spin_lock(&ps->ps_lock);
+
+       if (ps->ps_node_fini != NULL)
+               ps->ps_node_fini(pool, node);
+
+       LASSERT (pool->po_allocated > 0);
+       list_add(node, &pool->po_free_list);
+       pool->po_allocated--;
+
+       list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
+               /* the first pool is persistent */
+               if (ps->ps_pool_list.next == &pool->po_list)
+                       continue;
+
+               if (kiblnd_pool_is_idle(pool, now))
+                       list_move(&pool->po_list, &zombies);
+       }
+       spin_unlock(&ps->ps_lock);
+
+       if (!list_empty(&zombies))
+               kiblnd_destroy_pool_list(&zombies);
+}
+
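+/* Take a free node from the first pool that has one; if all pools are
+ * exhausted, create a new pool (only one thread may do so at a time). */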
+struct list_head *
+kiblnd_pool_alloc_node(kib_poolset_t *ps)
+{
+       struct list_head            *node;
+       kib_pool_t          *pool;
+       int                 rc;
+
+ again:
+       spin_lock(&ps->ps_lock);
+       list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
+               if (list_empty(&pool->po_free_list))
+                       continue;
+
+               pool->po_allocated++;
+               pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+               node = pool->po_free_list.next;
+               list_del(node);
+
+               if (ps->ps_node_init != NULL) {
+                       /* still hold the lock */
+                       ps->ps_node_init(pool, node);
+               }
+               spin_unlock(&ps->ps_lock);
+               return node;
+       }
+
+       /* no free node in any pool and ... */
+       if (ps->ps_increasing) {
+               /* another thread is allocating a new pool */
+               spin_unlock(&ps->ps_lock);
+               CDEBUG(D_NET, "Another thread is allocating a new "
+                      "%s pool, waiting for it to complete\n",
+                      ps->ps_name);
+               schedule();
+               goto again;
+       }
+
+       if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
+               /* someone failed recently */
+               spin_unlock(&ps->ps_lock);
+               return NULL;
+       }
+
+       ps->ps_increasing = 1;
+       spin_unlock(&ps->ps_lock);
+
+       CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
+
+       rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
+
+       spin_lock(&ps->ps_lock);
+       ps->ps_increasing = 0;
+       if (rc == 0) {
+               list_add_tail(&pool->po_list, &ps->ps_pool_list);
+       } else {
+               ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+               CERROR("Can't allocate new %s pool: out of memory\n",
+                      ps->ps_name);
+       }
+       spin_unlock(&ps->ps_lock);
+
+       goto again;
+}
+
+void
+kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
+{
+       kib_pmr_pool_t  *ppo = pmr->pmr_pool;
+       struct ib_mr    *mr  = pmr->pmr_mr;
+
+       pmr->pmr_mr = NULL;
+       kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list);
+       if (mr != NULL)
+               ib_dereg_mr(mr);
+}
+
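+/*
+ * Map @rd with a physical MR: take a descriptor from the PMR pool set, fill
+ * in its physical buffer list and register it with ib_reg_phys_mr().
+ * Returns -EAGAIN if the descriptor's pool was created on a previous HCA
+ * (i.e. a failover has happened) so the caller can retry.
+ */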
+int
+kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+                   kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
+{
+       kib_phys_mr_t           *pmr;
+       struct list_head        *node;
+       int                      rc;
+       int                      i;
+
+       node = kiblnd_pool_alloc_node(&pps->pps_poolset);
+       if (node == NULL) {
+               CERROR("Failed to allocate PMR descriptor\n");
+               return -ENOMEM;
+       }
+
+       pmr = container_of(node, kib_phys_mr_t, pmr_list);
+       if (pmr->pmr_pool->ppo_hdev != hdev) {
+               kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+               return -EAGAIN;
+       }
+
+       for (i = 0; i < rd->rd_nfrags; i++) {
+               pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr;
+               pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob;
+       }
+
+       pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
+                                    pmr->pmr_ipb, rd->rd_nfrags,
+                                    IB_ACCESS_LOCAL_WRITE |
+                                    IB_ACCESS_REMOTE_WRITE,
+                                    iova);
+       if (!IS_ERR(pmr->pmr_mr)) {
+               pmr->pmr_iova = *iova;
+               *pp_pmr = pmr;
+               return 0;
+       }
+
+       rc = PTR_ERR(pmr->pmr_mr);
+       CERROR("Failed ib_reg_phys_mr: %d\n", rc);
+
+       pmr->pmr_mr = NULL;
+       kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+
+       return rc;
+}
+
+static void
+kiblnd_destroy_pmr_pool(kib_pool_t *pool)
+{
+       kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
+       kib_phys_mr_t  *pmr;
+
+       LASSERT (pool->po_allocated == 0);
+
+       while (!list_empty(&pool->po_free_list)) {
+               pmr = list_entry(pool->po_free_list.next,
+                                kib_phys_mr_t, pmr_list);
+
+               LASSERT (pmr->pmr_mr == NULL);
+               list_del(&pmr->pmr_list);
+
+               if (pmr->pmr_ipb != NULL) {
+                       LIBCFS_FREE(pmr->pmr_ipb,
+                                   IBLND_MAX_RDMA_FRAGS *
+                                   sizeof(struct ib_phys_buf));
+               }
+
+               LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+       }
+
+       kiblnd_fini_pool(pool);
+       if (ppo->ppo_hdev != NULL)
+               kiblnd_hdev_decref(ppo->ppo_hdev);
+
+       LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
+}
+
+static inline int kiblnd_pmr_pool_size(int ncpts)
+{
+       int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts;
+
+       return max(IBLND_PMR_POOL, size);
+}
+
+static int
+kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+       struct kib_pmr_pool     *ppo;
+       struct kib_pool         *pool;
+       kib_phys_mr_t           *pmr;
+       int                     i;
+
+       LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(),
+                        ps->ps_cpt, sizeof(kib_pmr_pool_t));
+       if (ppo == NULL) {
+               CERROR("Failed to allocate PMR pool\n");
+               return -ENOMEM;
+       }
+
+       pool = &ppo->ppo_pool;
+       kiblnd_init_pool(ps, pool, size);
+
+       for (i = 0; i < size; i++) {
+               LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(),
+                                ps->ps_cpt, sizeof(kib_phys_mr_t));
+               if (pmr == NULL)
+                       break;
+
+               pmr->pmr_pool = ppo;
+               LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt,
+                                IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb));
+               if (pmr->pmr_ipb == NULL)
+                       break;
+
+               list_add(&pmr->pmr_list, &pool->po_free_list);
+       }
+
+       if (i < size) {
+               ps->ps_pool_destroy(pool);
+               return -ENOMEM;
+       }
+
+       ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
+       *pp_po = pool;
+       return 0;
+}
+
+static void
+kiblnd_destroy_tx_pool(kib_pool_t *pool)
+{
+       kib_tx_pool_t  *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
+       int          i;
+
+       LASSERT (pool->po_allocated == 0);
+
+       if (tpo->tpo_tx_pages != NULL) {
+               kiblnd_unmap_tx_pool(tpo);
+               kiblnd_free_pages(tpo->tpo_tx_pages);
+       }
+
+       if (tpo->tpo_tx_descs == NULL)
+               goto out;
+
+       for (i = 0; i < pool->po_size; i++) {
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+               list_del(&tx->tx_list);
+               if (tx->tx_pages != NULL)
+                       LIBCFS_FREE(tx->tx_pages,
+                                   LNET_MAX_IOV *
+                                   sizeof(*tx->tx_pages));
+               if (tx->tx_frags != NULL)
+                       LIBCFS_FREE(tx->tx_frags,
+                                   IBLND_MAX_RDMA_FRAGS *
+                                   sizeof(*tx->tx_frags));
+               if (tx->tx_wrq != NULL)
+                       LIBCFS_FREE(tx->tx_wrq,
+                                   (1 + IBLND_MAX_RDMA_FRAGS) *
+                                   sizeof(*tx->tx_wrq));
+               if (tx->tx_sge != NULL)
+                       LIBCFS_FREE(tx->tx_sge,
+                                   (1 + IBLND_MAX_RDMA_FRAGS) *
+                                   sizeof(*tx->tx_sge));
+               if (tx->tx_rd != NULL)
+                       LIBCFS_FREE(tx->tx_rd,
+                                   offsetof(kib_rdma_desc_t,
+                                            rd_frags[IBLND_MAX_RDMA_FRAGS]));
+       }
+
+       LIBCFS_FREE(tpo->tpo_tx_descs,
+                   pool->po_size * sizeof(kib_tx_t));
+out:
+       kiblnd_fini_pool(pool);
+       LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+}
+
+static int kiblnd_tx_pool_size(int ncpts)
+{
+       int ntx = *kiblnd_tunables.kib_ntx / ncpts;
+
+       return max(IBLND_TX_POOL, ntx);
+}
+
+static int
+kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+       int              i;
+       int              npg;
+       kib_pool_t      *pool;
+       kib_tx_pool_t   *tpo;
+
+       LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
+       if (tpo == NULL) {
+               CERROR("Failed to allocate TX pool\n");
+               return -ENOMEM;
+       }
+
+       pool = &tpo->tpo_pool;
+       kiblnd_init_pool(ps, pool, size);
+       tpo->tpo_tx_descs = NULL;
+       tpo->tpo_tx_pages = NULL;
+
+       npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
+       if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
+               CERROR("Can't allocate tx pages: %d\n", npg);
+               LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+               return -ENOMEM;
+       }
+
+       LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
+                        size * sizeof(kib_tx_t));
+       if (tpo->tpo_tx_descs == NULL) {
+               CERROR("Can't allocate %d tx descriptors\n", size);
+               ps->ps_pool_destroy(pool);
+               return -ENOMEM;
+       }
+
+       memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
+
+       for (i = 0; i < size; i++) {
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+               tx->tx_pool = tpo;
+               if (ps->ps_net->ibn_fmr_ps != NULL) {
+                       LIBCFS_CPT_ALLOC(tx->tx_pages,
+                                        lnet_cpt_table(), ps->ps_cpt,
+                                        LNET_MAX_IOV * sizeof(*tx->tx_pages));
+                       if (tx->tx_pages == NULL)
+                               break;
+               }
+
+               LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
+                                IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
+               if (tx->tx_frags == NULL)
+                       break;
+
+               sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
+
+               LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
+                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                sizeof(*tx->tx_wrq));
+               if (tx->tx_wrq == NULL)
+                       break;
+
+               LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
+                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                sizeof(*tx->tx_sge));
+               if (tx->tx_sge == NULL)
+                       break;
+
+               LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
+                                offsetof(kib_rdma_desc_t,
+                                         rd_frags[IBLND_MAX_RDMA_FRAGS]));
+               if (tx->tx_rd == NULL)
+                       break;
+       }
+
+       if (i == size) {
+               kiblnd_map_tx_pool(tpo);
+               *pp_po = pool;
+               return 0;
+       }
+
+       ps->ps_pool_destroy(pool);
+       return -ENOMEM;
+}
+
+static void
+kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
+{
+       kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
+                                            tps_poolset);
+       kib_tx_t         *tx  = list_entry(node, kib_tx_t, tx_list);
+
+       tx->tx_cookie = tps->tps_next_tx_cookie++;
+}
+
+void
+kiblnd_net_fini_pools(kib_net_t *net)
+{
+       int     i;
+
+       cfs_cpt_for_each(i, lnet_cpt_table()) {
+               kib_tx_poolset_t        *tps;
+               kib_fmr_poolset_t       *fps;
+               kib_pmr_poolset_t       *pps;
+
+               if (net->ibn_tx_ps != NULL) {
+                       tps = net->ibn_tx_ps[i];
+                       kiblnd_fini_poolset(&tps->tps_poolset);
+               }
+
+               if (net->ibn_fmr_ps != NULL) {
+                       fps = net->ibn_fmr_ps[i];
+                       kiblnd_fini_fmr_poolset(fps);
+               }
+
+               if (net->ibn_pmr_ps != NULL) {
+                       pps = net->ibn_pmr_ps[i];
+                       kiblnd_fini_poolset(&pps->pps_poolset);
+               }
+       }
+
+       if (net->ibn_tx_ps != NULL) {
+               cfs_percpt_free(net->ibn_tx_ps);
+               net->ibn_tx_ps = NULL;
+       }
+
+       if (net->ibn_fmr_ps != NULL) {
+               cfs_percpt_free(net->ibn_fmr_ps);
+               net->ibn_fmr_ps = NULL;
+       }
+
+       if (net->ibn_pmr_ps != NULL) {
+               cfs_percpt_free(net->ibn_pmr_ps);
+               net->ibn_pmr_ps = NULL;
+       }
+}
+
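+/*
+ * Create the per-CPT pool sets for @net.  When map-on-demand is disabled
+ * and the HCA needs only one global MR, TX pools alone are sufficient.
+ * Otherwise FMR pool sets are created first and, if the HCA does not
+ * support FMR (-ENOSYS), PMR pool sets are used instead; TX pool sets are
+ * always created last (see LU-2268).
+ */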
+int
+kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+{
+       unsigned long   flags;
+       int             cpt;
+       int             rc;
+       int             i;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       if (*kiblnd_tunables.kib_map_on_demand == 0 &&
+           net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
+               read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+               goto create_tx_pool;
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (*kiblnd_tunables.kib_fmr_pool_size <
+           *kiblnd_tunables.kib_ntx / 4) {
+               CERROR("Can't set fmr pool size (%d) < ntx / 4 (%d)\n",
+                      *kiblnd_tunables.kib_fmr_pool_size,
+                      *kiblnd_tunables.kib_ntx / 4);
+               rc = -EINVAL;
+               goto failed;
+       }
+
+       /* TX pool must be created later than FMR/PMR, see LU-2268
+        * for details */
+       LASSERT(net->ibn_tx_ps == NULL);
+
+       /* premapping can fail if ibd_nmr > 1, so we always create an
+        * FMR/PMR pool and use map-on-demand when premapping fails */
+
+       net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+                                          sizeof(kib_fmr_poolset_t));
+       if (net->ibn_fmr_ps == NULL) {
+               CERROR("Failed to allocate FMR pool array\n");
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       for (i = 0; i < ncpts; i++) {
+               cpt = (cpts == NULL) ? i : cpts[i];
+               rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
+                                            kiblnd_fmr_pool_size(ncpts),
+                                            kiblnd_fmr_flush_trigger(ncpts));
+               if (rc == -ENOSYS && i == 0) /* no FMR */
+                       break; /* create PMR pool */
+
+               if (rc != 0) { /* a real error */
+                       CERROR("Can't initialize FMR pool for CPT %d: %d\n",
+                              cpt, rc);
+                       goto failed;
+               }
+       }
+
+       if (i > 0) {
+               LASSERT(i == ncpts);
+               goto create_tx_pool;
+       }
+
+       cfs_percpt_free(net->ibn_fmr_ps);
+       net->ibn_fmr_ps = NULL;
+
+       CWARN("Device does not support FMR, falling back to PMR\n");
+
+       if (*kiblnd_tunables.kib_pmr_pool_size <
+           *kiblnd_tunables.kib_ntx / 4) {
+               CERROR("Can't set pmr pool size (%d) < ntx / 4 (%d)\n",
+                      *kiblnd_tunables.kib_pmr_pool_size,
+                      *kiblnd_tunables.kib_ntx / 4);
+               rc = -EINVAL;
+               goto failed;
+       }
+
+       net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+                                          sizeof(kib_pmr_poolset_t));
+       if (net->ibn_pmr_ps == NULL) {
+               CERROR("Failed to allocate PMR pool array\n");
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       for (i = 0; i < ncpts; i++) {
+               cpt = (cpts == NULL) ? i : cpts[i];
+               rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset,
+                                        cpt, net, "PMR",
+                                        kiblnd_pmr_pool_size(ncpts),
+                                        kiblnd_create_pmr_pool,
+                                        kiblnd_destroy_pmr_pool, NULL, NULL);
+               if (rc != 0) {
+                       CERROR("Can't initialize PMR pool for CPT %d: %d\n",
+                              cpt, rc);
+                       goto failed;
+               }
+       }
+
+ create_tx_pool:
+       net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
+                                         sizeof(kib_tx_poolset_t));
+       if (net->ibn_tx_ps == NULL) {
+               CERROR("Failed to allocate tx pool array\n");
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       for (i = 0; i < ncpts; i++) {
+               cpt = (cpts == NULL) ? i : cpts[i];
+               rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
+                                        cpt, net, "TX",
+                                        kiblnd_tx_pool_size(ncpts),
+                                        kiblnd_create_tx_pool,
+                                        kiblnd_destroy_tx_pool,
+                                        kiblnd_tx_init, NULL);
+               if (rc != 0) {
+                       CERROR("Can't initialize TX pool for CPT %d: %d\n",
+                              cpt, rc);
+                       goto failed;
+               }
+       }
+
+       return 0;
+ failed:
+       kiblnd_net_fini_pools(net);
+       LASSERT(rc != 0);
+       return rc;
+}
+
+static int
+kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
+{
+       struct ib_device_attr   *attr;
+       int                      rc;
+
+       /* It's safe to assume an HCA can handle a page size
+        * matching that of the native system */
+       hdev->ibh_page_shift = PAGE_SHIFT;
+       hdev->ibh_page_size  = 1 << PAGE_SHIFT;
+       hdev->ibh_page_mask  = ~((__u64)hdev->ibh_page_size - 1);
+
+       LIBCFS_ALLOC(attr, sizeof(*attr));
+       if (attr == NULL) {
+               CERROR("Out of memory\n");
+               return -ENOMEM;
+       }
+
+       rc = ib_query_device(hdev->ibh_ibdev, attr);
+       if (rc == 0)
+               hdev->ibh_mr_size = attr->max_mr_size;
+
+       LIBCFS_FREE(attr, sizeof(*attr));
+
+       if (rc != 0) {
+               CERROR("Failed to query IB device: %d\n", rc);
+               return rc;
+       }
+
+       if (hdev->ibh_mr_size == ~0ULL) {
+               hdev->ibh_mr_shift = 64;
+               return 0;
+       }
+
+       for (hdev->ibh_mr_shift = 0;
+            hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift++) {
+               if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
+                   hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
+                       return 0;
+       }
+
+       CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size);
+       return -EINVAL;
+}
+
+void
+kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
+{
+       int     i;
+
+       if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
+               return;
+
+       for (i = 0; i < hdev->ibh_nmrs; i++) {
+               if (hdev->ibh_mrs[i] == NULL)
+                       break;
+
+               ib_dereg_mr(hdev->ibh_mrs[i]);
+       }
+
+       LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+       hdev->ibh_mrs  = NULL;
+       hdev->ibh_nmrs = 0;
+}
+
+void
+kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
+{
+       kiblnd_hdev_cleanup_mrs(hdev);
+
+       if (hdev->ibh_pd != NULL)
+               ib_dealloc_pd(hdev->ibh_pd);
+
+       if (hdev->ibh_cmid != NULL)
+               rdma_destroy_id(hdev->ibh_cmid);
+
+       LIBCFS_FREE(hdev, sizeof(*hdev));
+}
+
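+/*
+ * Register the global memory regions for @hdev: a single DMA MR when the
+ * HCA reports an unlimited max MR size (ibh_mr_shift == 64), otherwise an
+ * array of physical MRs, each ibh_mr_size bytes, covering all of memory.
+ */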
+int
+kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
+{
+       struct ib_mr    *mr;
+       int              i;
+       int              rc;
+       __u64            mm_size;
+       __u64            mr_size;
+       int              acflags = IB_ACCESS_LOCAL_WRITE |
+                                  IB_ACCESS_REMOTE_WRITE;
+
+       rc = kiblnd_hdev_get_attr(hdev);
+       if (rc != 0)
+               return rc;
+
+       if (hdev->ibh_mr_shift == 64) {
+               LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
+               if (hdev->ibh_mrs == NULL) {
+                       CERROR("Failed to allocate MRs table\n");
+                       return -ENOMEM;
+               }
+
+               hdev->ibh_mrs[0] = NULL;
+               hdev->ibh_nmrs   = 1;
+
+               mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
+               if (IS_ERR(mr)) {
+                       CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr));
+                       kiblnd_hdev_cleanup_mrs(hdev);
+                       return PTR_ERR(mr);
+               }
+
+               hdev->ibh_mrs[0] = mr;
+
+               goto out;
+       }
+
+       mr_size = (1ULL << hdev->ibh_mr_shift);
+       mm_size = (unsigned long)high_memory - PAGE_OFFSET;
+
+       hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);
+
+       if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
+               /* either the HCA's MRs are smaller than 4G, or covering
+                * memory needs more than 1024 of them (i.e. >4T of RAM);
+                * assume we will re-code when such configurations appear */
+               CERROR("Can't support memory size: x"LPX64
+                      " with MR size: x"LPX64"\n", mm_size, mr_size);
+               return -EINVAL;
+       }
+
+       /* create an array of MRs to cover all memory */
+       LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+       if (hdev->ibh_mrs == NULL) {
+               CERROR("Failed to allocate MRs' table\n");
+               return -ENOMEM;
+       }
+
+       memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+
+       for (i = 0; i < hdev->ibh_nmrs; i++) {
+               struct ib_phys_buf ipb;
+               __u64              iova;
+
+               ipb.size = hdev->ibh_mr_size;
+               ipb.addr = i * mr_size;
+               iova     = ipb.addr;
+
+               mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
+               if (IS_ERR(mr)) {
+                       CERROR("Failed ib_reg_phys_mr addr "LPX64
+                              " size "LPX64" : %ld\n",
+                              ipb.addr, ipb.size, PTR_ERR(mr));
+                       kiblnd_hdev_cleanup_mrs(hdev);
+                       return PTR_ERR(mr);
+               }
+
+               LASSERT (iova == ipb.addr);
+
+               hdev->ibh_mrs[i] = mr;
+       }
+
+out:
+       if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
+               LCONSOLE_INFO("Register global MR array, MR size: "
+                             LPX64", array size: %d\n",
+                             hdev->ibh_mr_size, hdev->ibh_nmrs);
+       return 0;
+}
+
+static int
+kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{       /* DUMMY */
+       return 0;
+}
+
+static int
+kiblnd_dev_need_failover(kib_dev_t *dev)
+{
+       struct rdma_cm_id   *cmid;
+       struct sockaddr_in   srcaddr;
+       struct sockaddr_in   dstaddr;
+       int                  rc;
+
+       if (dev->ibd_hdev == NULL || /* initializing */
+           dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
+           *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
+               return 1;
+
+       /* XXX: it's ugly, but there is no better way to detect
+        * ib-bonding HCA failover because:
+        *
+        * a. there is no reliable CM event for HCA failover...
+        * b. there is no OFED API to get the ib_device for the
+        *    current net_device...
+        *
+        * We have only two choices at this point:
+        *
+        * a. rdma_bind_addr(), which would conflict with the listener cmid
+        * b. rdma_resolve_addr() to the zero address */
+       cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
+                                    IB_QPT_RC);
+       if (IS_ERR(cmid)) {
+               rc = PTR_ERR(cmid);
+               CERROR("Failed to create cmid for failover: %d\n", rc);
+               return rc;
+       }
+
+       memset(&srcaddr, 0, sizeof(srcaddr));
+       srcaddr.sin_family      = AF_INET;
+       srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+
+       memset(&dstaddr, 0, sizeof(dstaddr));
+       dstaddr.sin_family = AF_INET;
+       rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
+                              (struct sockaddr *)&dstaddr, 1);
+       if (rc != 0 || cmid->device == NULL) {
+               CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+                      dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+                      cmid->device, rc);
+               rdma_destroy_id(cmid);
+               return rc;
+       }
+
+       if (dev->ibd_hdev->ibh_ibdev == cmid->device) {
+               /* don't need device failover */
+               rdma_destroy_id(cmid);
+               return 0;
+       }
+
+       return 1;
+}
+
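+/*
+ * Fail @dev over to the currently active HCA/port: destroy the old
+ * listener, bind a new cmid to the interface address, allocate a new PD
+ * and global MRs, swap the new kib_hca_dev into ibd_hdev, and move the
+ * existing TX/FMR/PMR pools onto zombie lists so they are destroyed and
+ * recreated against the new HCA on demand.
+ */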
+int
+kiblnd_dev_failover(kib_dev_t *dev)
+{
+       LIST_HEAD(zombie_tpo);
+       LIST_HEAD(zombie_ppo);
+       LIST_HEAD(zombie_fpo);
+       struct rdma_cm_id   *cmid = NULL;
+       kib_hca_dev_t       *hdev = NULL;
+       kib_hca_dev_t       *old;
+       struct ib_pd        *pd;
+       kib_net_t           *net;
+       struct sockaddr_in   addr;
+       unsigned long        flags;
+       int                  rc = 0;
+       int                  i;
+
+       LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
+                dev->ibd_can_failover ||
+                dev->ibd_hdev == NULL);
+
+       rc = kiblnd_dev_need_failover(dev);
+       if (rc <= 0)
+               goto out;
+
+       if (dev->ibd_hdev != NULL &&
+           dev->ibd_hdev->ibh_cmid != NULL) {
+               /* XXX it's not good to close the old listener here, since
+                * creating the new one may still fail.  But we have to
+                * close it now, otherwise rdma_bind_addr() below will
+                * return EADDRINUSE */
+               write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+               cmid = dev->ibd_hdev->ibh_cmid;
+               /* make the next call to kiblnd_dev_need_failover()
+                * return 1 for this device */
+               dev->ibd_hdev->ibh_cmid  = NULL;
+               write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+               rdma_destroy_id(cmid);
+       }
+
+       cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
+                                    IB_QPT_RC);
+       if (IS_ERR(cmid)) {
+               rc = PTR_ERR(cmid);
+               CERROR("Failed to create cmid for failover: %d\n", rc);
+               goto out;
+       }
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sin_family      = AF_INET;
+       addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+       addr.sin_port        = htons(*kiblnd_tunables.kib_service);
+
+       /* Bind to failover device or port */
+       rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
+       if (rc != 0 || cmid->device == NULL) {
+               CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+                      dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+                      cmid->device, rc);
+               rdma_destroy_id(cmid);
+               goto out;
+       }
+
+       LIBCFS_ALLOC(hdev, sizeof(*hdev));
+       if (hdev == NULL) {
+               CERROR("Failed to allocate kib_hca_dev\n");
+               rdma_destroy_id(cmid);
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       atomic_set(&hdev->ibh_ref, 1);
+       hdev->ibh_dev   = dev;
+       hdev->ibh_cmid  = cmid;
+       hdev->ibh_ibdev = cmid->device;
+
+       pd = ib_alloc_pd(cmid->device);
+       if (IS_ERR(pd)) {
+               rc = PTR_ERR(pd);
+               CERROR("Can't allocate PD: %d\n", rc);
+               goto out;
+       }
+
+       hdev->ibh_pd = pd;
+
+       rc = rdma_listen(cmid, 0);
+       if (rc != 0) {
+               CERROR("Can't start new listener: %d\n", rc);
+               goto out;
+       }
+
+       rc = kiblnd_hdev_setup_mrs(hdev);
+       if (rc != 0) {
+               CERROR("Can't setup device: %d\n", rc);
+               goto out;
+       }
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       old = dev->ibd_hdev;
+       dev->ibd_hdev = hdev; /* take over the refcount */
+       hdev = old;
+
+       list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
+               cfs_cpt_for_each(i, lnet_cpt_table()) {
+                       kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
+                                           &zombie_tpo);
+
+                       if (net->ibn_fmr_ps != NULL) {
+                               kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
+                                                       &zombie_fpo);
+
+                       } else if (net->ibn_pmr_ps != NULL) {
+                               kiblnd_fail_poolset(&net->ibn_pmr_ps[i]->
+                                                   pps_poolset, &zombie_ppo);
+                       }
+               }
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ out:
+       if (!list_empty(&zombie_tpo))
+               kiblnd_destroy_pool_list(&zombie_tpo);
+       if (!list_empty(&zombie_ppo))
+               kiblnd_destroy_pool_list(&zombie_ppo);
+       if (!list_empty(&zombie_fpo))
+               kiblnd_destroy_fmr_pool_list(&zombie_fpo);
+       if (hdev != NULL)
+               kiblnd_hdev_decref(hdev);
+
+       if (rc != 0)
+               dev->ibd_failed_failover++;
+       else
+               dev->ibd_failed_failover = 0;
+
+       return rc;
+}
+
+void
+kiblnd_destroy_dev (kib_dev_t *dev)
+{
+       LASSERT (dev->ibd_nnets == 0);
+       LASSERT (list_empty(&dev->ibd_nets));
+
+       list_del(&dev->ibd_fail_list);
+       list_del(&dev->ibd_list);
+
+       if (dev->ibd_hdev != NULL)
+               kiblnd_hdev_decref(dev->ibd_hdev);
+
+       LIBCFS_FREE(dev, sizeof(*dev));
+}
+
+kib_dev_t *
+kiblnd_create_dev(char *ifname)
+{
+       struct net_device *netdev;
+       kib_dev_t         *dev;
+       __u32              netmask;
+       __u32              ip;
+       int                up;
+       int                rc;
+
+       rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
+       if (rc != 0) {
+               CERROR("Can't query IPoIB interface %s: %d\n",
+                      ifname, rc);
+               return NULL;
+       }
+
+       if (!up) {
+               CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(dev, sizeof(*dev));
+       if (dev == NULL)
+               return NULL;
+
+       memset(dev, 0, sizeof(*dev));
+       netdev = dev_get_by_name(&init_net, ifname);
+       if (netdev == NULL) {
+               dev->ibd_can_failover = 0;
+       } else {
+               dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
+               dev_put(netdev);
+       }
+
+       INIT_LIST_HEAD(&dev->ibd_nets);
+       INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
+       INIT_LIST_HEAD(&dev->ibd_fail_list);
+       dev->ibd_ifip = ip;
+       strcpy(&dev->ibd_ifname[0], ifname);
+
+       /* initialize the device */
+       rc = kiblnd_dev_failover(dev);
+       if (rc != 0) {
+               CERROR("Can't initialize device: %d\n", rc);
+               LIBCFS_FREE(dev, sizeof(*dev));
+               return NULL;
+       }
+
+       list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
+       return dev;
+}
+
+void
+kiblnd_base_shutdown(void)
+{
+       struct kib_sched_info   *sched;
+       int                     i;
+
+       LASSERT (list_empty(&kiblnd_data.kib_devs));
+
+       CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       switch (kiblnd_data.kib_init) {
+       default:
+               LBUG();
+
+       case IBLND_INIT_ALL:
+       case IBLND_INIT_DATA:
+               LASSERT (kiblnd_data.kib_peers != NULL);
+               for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+                       LASSERT (list_empty(&kiblnd_data.kib_peers[i]));
+               }
+               LASSERT (list_empty(&kiblnd_data.kib_connd_zombies));
+               LASSERT (list_empty(&kiblnd_data.kib_connd_conns));
+
+               /* flag threads to terminate; wake and wait for them to die */
+               kiblnd_data.kib_shutdown = 1;
+
+               /* NB: we really want to stop scheduler threads net by net
+                * instead of for the whole module; this should be improved
+                * once LNet supports dynamic configuration */
+               cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
+                       wake_up_all(&sched->ibs_waitq);
+
+               wake_up_all(&kiblnd_data.kib_connd_waitq);
+               wake_up_all(&kiblnd_data.kib_failover_waitq);
+
+               i = 2;
+               while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
+                       i++;
+                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                              "Waiting for %d threads to terminate\n",
+                              atomic_read(&kiblnd_data.kib_nthreads));
+                       cfs_pause(cfs_time_seconds(1));
+               }
+
+               /* fall through */
+
+       case IBLND_INIT_NOTHING:
+               break;
+       }
+
+       if (kiblnd_data.kib_peers != NULL) {
+               LIBCFS_FREE(kiblnd_data.kib_peers,
+                           sizeof(struct list_head) *
+                           kiblnd_data.kib_peer_hash_size);
+       }
+
+       if (kiblnd_data.kib_scheds != NULL)
+               cfs_percpt_free(kiblnd_data.kib_scheds);
+
+       CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       kiblnd_data.kib_init = IBLND_INIT_NOTHING;
+       module_put(THIS_MODULE);
+}
+
+void
+kiblnd_shutdown (lnet_ni_t *ni)
+{
+       kib_net_t       *net = ni->ni_data;
+       rwlock_t        *g_lock = &kiblnd_data.kib_global_lock;
+       int              i;
+       unsigned long    flags;
+
+       LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
+
+       if (net == NULL)
+               goto out;
+
+       CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       write_lock_irqsave(g_lock, flags);
+       net->ibn_shutdown = 1;
+       write_unlock_irqrestore(g_lock, flags);
+
+       switch (net->ibn_init) {
+       default:
+               LBUG();
+
+       case IBLND_INIT_ALL:
+               /* nuke all existing peers within this net */
+               kiblnd_del_peer(ni, LNET_NID_ANY);
+
+               /* Wait for all peer state to clean up */
+               i = 2;
+               while (atomic_read(&net->ibn_npeers) != 0) {
+                       i++;
+                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
+                              "%s: waiting for %d peers to disconnect\n",
+                              libcfs_nid2str(ni->ni_nid),
+                              atomic_read(&net->ibn_npeers));
+                       cfs_pause(cfs_time_seconds(1));
+               }
+
+               kiblnd_net_fini_pools(net);
+
+               write_lock_irqsave(g_lock, flags);
+               LASSERT(net->ibn_dev->ibd_nnets > 0);
+               net->ibn_dev->ibd_nnets--;
+               list_del(&net->ibn_list);
+               write_unlock_irqrestore(g_lock, flags);
+
+               /* fall through */
+
+       case IBLND_INIT_NOTHING:
+               LASSERT (atomic_read(&net->ibn_nconns) == 0);
+
+               if (net->ibn_dev != NULL &&
+                   net->ibn_dev->ibd_nnets == 0)
+                       kiblnd_destroy_dev(net->ibn_dev);
+
+               break;
+       }
+
+       CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       net->ibn_init = IBLND_INIT_NOTHING;
+       ni->ni_data = NULL;
+
+       LIBCFS_FREE(net, sizeof(*net));
+
+out:
+       if (list_empty(&kiblnd_data.kib_devs))
+               kiblnd_base_shutdown();
+       return;
+}
+
+int
+kiblnd_base_startup(void)
+{
+       struct kib_sched_info   *sched;
+       int                     rc;
+       int                     i;
+
+       LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);
+
+       try_module_get(THIS_MODULE);
+       memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
+
+       rwlock_init(&kiblnd_data.kib_global_lock);
+
+       INIT_LIST_HEAD(&kiblnd_data.kib_devs);
+       INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
+
+       kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
+       LIBCFS_ALLOC(kiblnd_data.kib_peers,
+                    sizeof(struct list_head) *
+                           kiblnd_data.kib_peer_hash_size);
+       if (kiblnd_data.kib_peers == NULL) {
+               goto failed;
+       }
+       for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+               INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
+
+       spin_lock_init(&kiblnd_data.kib_connd_lock);
+       INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+       INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
+       init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
+       init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
+
+       kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
+                                                 sizeof(*sched));
+       if (kiblnd_data.kib_scheds == NULL)
+               goto failed;
+
+       cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+               int     nthrs;
+
+               spin_lock_init(&sched->ibs_lock);
+               INIT_LIST_HEAD(&sched->ibs_conns);
+               init_waitqueue_head(&sched->ibs_waitq);
+
+               nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+               if (*kiblnd_tunables.kib_nscheds > 0) {
+                       nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
+               } else {
+                       /* at most half of the CPUs; the other half is
+                        * reserved for upper layer modules */
+                       nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+               }
+
+               sched->ibs_nthreads_max = nthrs;
+               sched->ibs_cpt = i;
+       }
+
+       kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
+
+       /* lists/ptrs/locks initialised */
+       kiblnd_data.kib_init = IBLND_INIT_DATA;
+       /*****************************************************/
+
+       rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
+       if (rc != 0) {
+               CERROR("Can't spawn o2iblnd connd: %d\n", rc);
+               goto failed;
+       }
+
+       if (*kiblnd_tunables.kib_dev_failover != 0)
+               rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+                                        "kiblnd_failover");
+
+       if (rc != 0) {
+               CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
+               goto failed;
+       }
+
+       /* flag everything initialised */
+       kiblnd_data.kib_init = IBLND_INIT_ALL;
+       /*****************************************************/
+
+       return 0;
+
+ failed:
+       kiblnd_base_shutdown();
+       return -ENETDOWN;
+}
+
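+/*
+ * Start scheduler threads for one CPT.  The first call starts either the
+ * number given by the nscheds tunable or roughly half the CPUs in the CPT
+ * (capped at IBLND_N_SCHED_HIGH); later calls add at most one thread, for
+ * a newly configured interface.
+ */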
+int
+kiblnd_start_schedulers(struct kib_sched_info *sched)
+{
+       int     rc = 0;
+       int     nthrs;
+       int     i;
+
+       if (sched->ibs_nthreads == 0) {
+               if (*kiblnd_tunables.kib_nscheds > 0) {
+                       nthrs = sched->ibs_nthreads_max;
+               } else {
+                       nthrs = cfs_cpt_weight(lnet_cpt_table(),
+                                              sched->ibs_cpt);
+                       nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+                       nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
+               }
+       } else {
+               LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
+               /* add at most one more thread for a new interface */
+               nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max);
+       }
+
+       for (i = 0; i < nthrs; i++) {
+               long    id;
+               char    name[20];
+               id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
+               snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
+                        KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
+               rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
+               if (rc == 0)
+                       continue;
+
+               CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+                      sched->ibs_cpt, sched->ibs_nthreads + i, rc);
+               break;
+       }
+
+       sched->ibs_nthreads += i;
+       return rc;
+}
+
+int
+kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts)
+{
+       int     cpt;
+       int     rc;
+       int     i;
+
+       for (i = 0; i < ncpts; i++) {
+               struct kib_sched_info *sched;
+
+               cpt = (cpts == NULL) ? i : cpts[i];
+               sched = kiblnd_data.kib_scheds[cpt];
+
+               if (!newdev && sched->ibs_nthreads > 0)
+                       continue;
+
+               rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
+               if (rc != 0) {
+                       CERROR("Failed to start scheduler threads for %s\n",
+                              dev->ibd_ifname);
+                       return rc;
+               }
+       }
+       return 0;
+}
+
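+/*
+ * Find a kib_dev_t by interface name.  An exact match wins; otherwise
+ * names are compared with any ":alias" suffix stripped and the first such
+ * match is returned.
+ */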
+kib_dev_t *
+kiblnd_dev_search(char *ifname)
+{
+       kib_dev_t       *alias = NULL;
+       kib_dev_t       *dev;
+       char            *colon;
+       char            *colon2;
+
+       colon = strchr(ifname, ':');
+       list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+               if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+                       return dev;
+
+               if (alias != NULL)
+                       continue;
+
+               colon2 = strchr(dev->ibd_ifname, ':');
+               if (colon != NULL)
+                       *colon = 0;
+               if (colon2 != NULL)
+                       *colon2 = 0;
+
+               if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+                       alias = dev;
+
+               if (colon != NULL)
+                       *colon = ':';
+               if (colon2 != NULL)
+                       *colon2 = ':';
+       }
+       return alias;
+}
+
+int
+kiblnd_startup (lnet_ni_t *ni)
+{
+       char                *ifname;
+       kib_dev_t           *ibdev = NULL;
+       kib_net_t           *net;
+       struct timeval       tv;
+       unsigned long        flags;
+       int                  rc;
+       int                  newdev;
+
+       LASSERT (ni->ni_lnd == &the_o2iblnd);
+
+       if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+               rc = kiblnd_base_startup();
+               if (rc != 0)
+                       return rc;
+       }
+
+       LIBCFS_ALLOC(net, sizeof(*net));
+       ni->ni_data = net;
+       if (net == NULL)
+               goto failed;
+
+       memset(net, 0, sizeof(*net));
+
+       do_gettimeofday(&tv);
+       net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+       ni->ni_peertimeout    = *kiblnd_tunables.kib_peertimeout;
+       ni->ni_maxtxcredits   = *kiblnd_tunables.kib_credits;
+       ni->ni_peertxcredits  = *kiblnd_tunables.kib_peertxcredits;
+       ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+
+       if (ni->ni_interfaces[0] != NULL) {
+               /* Use the IPoIB interface specified in 'networks=' */
+
+               CLASSERT (LNET_MAX_INTERFACES > 1);
+               if (ni->ni_interfaces[1] != NULL) {
+                       CERROR("Multiple interfaces not supported\n");
+                       goto failed;
+               }
+
+               ifname = ni->ni_interfaces[0];
+       } else {
+               ifname = *kiblnd_tunables.kib_default_ipif;
+       }
+
+       if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+               CERROR("IPoIB interface name too long: %s\n", ifname);
+               goto failed;
+       }
+
+       ibdev = kiblnd_dev_search(ifname);
+
+       newdev = ibdev == NULL;
+       /* create a kib_dev even for an alias */
+       if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
+               ibdev = kiblnd_create_dev(ifname);
+
+       if (ibdev == NULL)
+               goto failed;
+
+       net->ibn_dev = ibdev;
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+
+       rc = kiblnd_dev_start_threads(ibdev, newdev,
+                                     ni->ni_cpts, ni->ni_ncpts);
+       if (rc != 0)
+               goto failed;
+
+       rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+       if (rc != 0) {
+               CERROR("Failed to initialize NI pools: %d\n", rc);
+               goto failed;
+       }
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       ibdev->ibd_nnets++;
+       list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       net->ibn_init = IBLND_INIT_ALL;
+
+       return 0;
+
+failed:
+       if (net->ibn_dev == NULL && ibdev != NULL)
+               kiblnd_destroy_dev(ibdev);
+
+       kiblnd_shutdown(ni);
+
+       CDEBUG(D_NET, "kiblnd_startup failed\n");
+       return -ENETDOWN;
+}
+
+void __exit
+kiblnd_module_fini (void)
+{
+       lnet_unregister_lnd(&the_o2iblnd);
+       kiblnd_tunables_fini();
+}
+
+int __init
+kiblnd_module_init (void)
+{
+       int    rc;
+
+       CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
+       CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+                 <= IBLND_MSG_SIZE);
+       CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+                 <= IBLND_MSG_SIZE);
+
+       rc = kiblnd_tunables_init();
+       if (rc != 0)
+               return rc;
+
+       lnet_register_lnd(&the_o2iblnd);
+
+       return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
+MODULE_LICENSE("GPL");
+
+module_init(kiblnd_module_init);
+module_exit(kiblnd_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
new file mode 100644 (file)
index 0000000..e4626bf
--- /dev/null
@@ -0,0 +1,1057 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/pci.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
+#define IBLND_PEER_HASH_SIZE           101     /* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED                  100
+
+#define IBLND_N_SCHED                  2
+#define IBLND_N_SCHED_HIGH             4
+
+typedef struct
+{
+       int           *kib_dev_failover;     /* HCA failover */
+       unsigned int     *kib_service;    /* IB service number */
+       int           *kib_min_reconnect_interval; /* first failed connection retry... */
+       int           *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+       int           *kib_cksum;           /* checksum kib_msg_t? */
+       int           *kib_timeout;       /* comms timeout (seconds) */
+       int           *kib_keepalive;   /* keepalive timeout (seconds) */
+       int           *kib_ntx;       /* # tx descs */
+       int           *kib_credits;       /* # concurrent sends */
+       int           *kib_peertxcredits;    /* # concurrent sends to 1 peer */
+       int           *kib_peerrtrcredits;   /* # per-peer router buffer credits */
+       int           *kib_peercredits_hiw;  /* # when eagerly to return credits */
+       int           *kib_peertimeout;      /* seconds to consider peer dead */
+       char        **kib_default_ipif;     /* default IPoIB interface */
+       int           *kib_retry_count;
+       int           *kib_rnr_retry_count;
+       int           *kib_concurrent_sends; /* send work queue sizing */
+       int              *kib_ib_mtu;           /* IB MTU */
+       int           *kib_map_on_demand;    /* map-on-demand if RD has more fragments
+                                             * than this value; 0 disables map-on-demand */
+       int           *kib_pmr_pool_size;    /* # physical MR in pool */
+       int           *kib_fmr_pool_size;    /* # FMRs in pool */
+       int           *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+       int           *kib_fmr_cache;   /* enable FMR pool cache? */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+       ctl_table_header_t *kib_sysctl;  /* sysctl interface */
+#endif
+       int           *kib_require_priv_port;/* accept only privileged ports */
+       int           *kib_use_priv_port;    /* use privileged port for active connect */
+       /* # threads on each CPT */
+       int              *kib_nscheds;
+} kib_tunables_t;
+
+extern kib_tunables_t  kiblnd_tunables;
+
+#define IBLND_MSG_QUEUE_SIZE_V1      8   /* V1 only : # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1    7   /* V1 only : when eagerly to return credits */
+
+#define IBLND_CREDITS_DEFAULT  8         /* default # of peer credits */
+#define IBLND_CREDITS_MAX        ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer credits */
+
+#define IBLND_MSG_QUEUE_SIZE(v)    ((v) == IBLND_MSG_VERSION_1 ? \
+                                    IBLND_MSG_QUEUE_SIZE_V1 :   \
+                                    *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
+#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
+                                    IBLND_CREDIT_HIGHWATER_V1 : \
+                                    *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+
+static inline int
+kiblnd_concurrent_sends_v1(void)
+{
+       if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+               return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+       if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+               return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+
+       return *kiblnd_tunables.kib_concurrent_sends;
+}
+
+#define IBLND_CONCURRENT_SENDS(v)  ((v) == IBLND_MSG_VERSION_1 ? \
+                                    kiblnd_concurrent_sends_v1() : \
+                                    *kiblnd_tunables.kib_concurrent_sends)
+/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
+#define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
+#define IBLND_OOB_MSGS(v)         (IBLND_OOB_CAPABLE(v) ? 2 : 0)
+
+#define IBLND_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS    LNET_MAX_IOV      /* max # of fragments supported */
+#define IBLND_CFG_RDMA_FRAGS       (*kiblnd_tunables.kib_map_on_demand != 0 ? \
+                                   *kiblnd_tunables.kib_map_on_demand :      \
+                                    IBLND_MAX_RDMA_FRAGS)  /* max # of fragments configured by user */
+#define IBLND_RDMA_FRAGS(v)    ((v) == IBLND_MSG_VERSION_1 ? \
+                                    IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
+
+/************************/
+/* derived constants... */
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so we don't need to give them a very
+ * large initial size */
+#define IBLND_TX_POOL                  256
+#define IBLND_PMR_POOL                 256
+#define IBLND_FMR_POOL                 256
+#define IBLND_FMR_POOL_FLUSH           192
+
+/* TX messages (shared by all connections) */
+#define IBLND_TX_MSGS()            (*kiblnd_tunables.kib_ntx)
+
+/* RX messages (per connection) */
+#define IBLND_RX_MSGS(v)           (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
+#define IBLND_RX_MSG_BYTES(v)       (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(v)      ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS(v)          IBLND_RX_MSGS(v)
+#define IBLND_SEND_WRS(v)        ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
+#define IBLND_CQ_ENTRIES(v)     (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
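+
+/*
+ * Illustrative sizing (not used by the code): for an IBLND_MSG_VERSION_1
+ * peer, IBLND_MSG_QUEUE_SIZE() is 8 and IBLND_OOB_MSGS() is 0, so
+ * IBLND_RX_MSGS() = 8 * 2 + 0 = 16 and, with 4K pages and 4K messages,
+ * IBLND_RX_MSG_PAGES() = 16.  IBLND_RDMA_FRAGS() is then LNET_MAX_IOV and
+ * kiblnd_concurrent_sends_v1() is clamped to [4, 16], so the CQ needs
+ * IBLND_CQ_ENTRIES() = 16 + (LNET_MAX_IOV + 1) * IBLND_CONCURRENT_SENDS()
+ * completion entries.
+ */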
+
+struct kib_hca_dev;
+
+/* o2iblnd can run over an aliased interface */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE              IFALIASZ
+#else
+#define KIB_IFNAME_SIZE              256
+#endif
+
+typedef struct
+{
+       struct list_head           ibd_list;      /* chain on kib_devs */
+       struct list_head           ibd_fail_list;     /* chain on kib_failed_devs */
+       __u32           ibd_ifip;         /* IPoIB interface IP */
+       /** IPoIB interface name */
+       char             ibd_ifname[KIB_IFNAME_SIZE];
+       int               ibd_nnets;     /* # nets extant */
+
+       cfs_time_t         ibd_next_failover;
+       int               ibd_failed_failover; /* # failover failures */
+       unsigned int     ibd_failover;      /* failover in progress */
+       unsigned int     ibd_can_failover;  /* IPoIB interface is a bonding master */
+       struct list_head           ibd_nets;
+       struct kib_hca_dev  *ibd_hdev;
+} kib_dev_t;
+
+typedef struct kib_hca_dev
+{
+       struct rdma_cm_id   *ibh_cmid;    /* listener cmid */
+       struct ib_device    *ibh_ibdev;  /* IB device */
+       int               ibh_page_shift;    /* page shift of current HCA */
+       int               ibh_page_size;     /* page size of current HCA */
+       __u64           ibh_page_mask;     /* page mask of current HCA */
+       int               ibh_mr_shift;      /* bits shift of max MR size */
+       __u64           ibh_mr_size;       /* size of MR */
+       int               ibh_nmrs;       /* # of global MRs */
+       struct ib_mr       **ibh_mrs;      /* global MR */
+       struct ib_pd    *ibh_pd;            /* PD */
+       kib_dev_t          *ibh_dev;       /* owner */
+       atomic_t         ibh_ref;          /* refcount */
+} kib_hca_dev_t;
+
+/** # of seconds to keep pool alive */
+#define IBLND_POOL_DEADLINE     300
+/** # of seconds to retry if allocation failed */
+#define IBLND_POOL_RETRY       1
+
+typedef struct
+{
+       int                  ibp_npages;             /* # pages */
+       struct page         *ibp_pages[0];         /* page array */
+} kib_pages_t;
+
+struct kib_pmr_pool;
+
+typedef struct {
+       struct list_head              pmr_list;        /* chain node */
+       struct ib_phys_buf     *pmr_ipb;                /* physical buffer */
+       struct ib_mr       *pmr_mr;              /* IB MR */
+       struct kib_pmr_pool    *pmr_pool;              /* owner of this MR */
+       __u64              pmr_iova;           /* Virtual I/O address */
+       int                  pmr_refcount;         /* reference count */
+} kib_phys_mr_t;
+
+struct kib_pool;
+struct kib_poolset;
+
+typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+                                    int inc, struct kib_pool **pp_po);
+typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
+
+struct kib_net;
+
+#define IBLND_POOL_NAME_LEN     32
+
+typedef struct kib_poolset
+{
+       spinlock_t              ps_lock;                /* serialize */
+       struct kib_net   *ps_net;                /* network it belongs to */
+       char                ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
+       struct list_head              ps_pool_list;        /* list of pools */
+       struct list_head              ps_failed_pool_list;    /* failed pool list */
+       cfs_time_t            ps_next_retry;      /* time stamp for retry if failed to allocate */
+       int                  ps_increasing;       /* is allocating new pool */
+       int                  ps_pool_size;         /* new pool size */
+       int                     ps_cpt;                 /* CPT id */
+
+       kib_ps_pool_create_t    ps_pool_create;  /* create a new pool */
+       kib_ps_pool_destroy_t   ps_pool_destroy;        /* destroy a pool */
+       kib_ps_node_init_t      ps_node_init;      /* initialize a newly allocated node */
+       kib_ps_node_fini_t      ps_node_fini;      /* finalize node */
+} kib_poolset_t;
+
+typedef struct kib_pool
+{
+       struct list_head              po_list;          /* chain on pool list */
+       struct list_head              po_free_list;        /* pre-allocated node */
+       kib_poolset_t     *po_owner;           /* pool_set of this pool */
+       cfs_time_t            po_deadline;          /* deadline of this pool */
+       int                  po_allocated;         /* # of elements in use */
+       int                  po_failed;       /* pool is created on failed HCA */
+       int                  po_size;           /* # of pre-allocated elements */
+} kib_pool_t;
+
+typedef struct {
+       kib_poolset_t      tps_poolset;     /* pool-set */
+       __u64              tps_next_tx_cookie;     /* cookie of TX */
+} kib_tx_poolset_t;
+
+typedef struct {
+       kib_pool_t            tpo_pool;        /* pool */
+       struct kib_hca_dev     *tpo_hdev;              /* device for this pool */
+       struct kib_tx     *tpo_tx_descs;           /* all the tx descriptors */
+       kib_pages_t         *tpo_tx_pages;         /* premapped tx msg pages */
+} kib_tx_pool_t;
+
+typedef struct {
+       kib_poolset_t      pps_poolset;     /* pool-set */
+} kib_pmr_poolset_t;
+
+typedef struct kib_pmr_pool {
+       struct kib_hca_dev     *ppo_hdev;              /* device for this pool */
+       kib_pool_t            ppo_pool;        /* pool */
+} kib_pmr_pool_t;
+
+typedef struct
+{
+       spinlock_t              fps_lock;               /* serialize */
+       struct kib_net   *fps_net;              /* IB network */
+       struct list_head              fps_pool_list;      /* FMR pool list */
+       struct list_head              fps_failed_pool_list;   /* failed FMR pool list */
+       __u64              fps_version;     /* validity stamp */
+       int                     fps_cpt;                /* CPT id */
+       int                     fps_pool_size;
+       int                     fps_flush_trigger;
+       /* is allocating new pool */
+       int                     fps_increasing;
+       /* time stamp for retry if failed to allocate */
+       cfs_time_t              fps_next_retry;
+} kib_fmr_poolset_t;
+
+typedef struct
+{
+       struct list_head              fpo_list;        /* chain on pool list */
+       struct kib_hca_dev     *fpo_hdev;              /* device for this pool */
+       kib_fmr_poolset_t      *fpo_owner;            /* owner of this pool */
+       struct ib_fmr_pool     *fpo_fmr_pool;      /* IB FMR pool */
+       cfs_time_t            fpo_deadline;        /* deadline of this pool */
+       int                  fpo_failed;             /* FMR pool has failed */
+       int                  fpo_map_count;       /* # of mapped FMR */
+} kib_fmr_pool_t;
+
+typedef struct {
+       struct ib_pool_fmr     *fmr_pfmr;              /* IB pool fmr */
+       kib_fmr_pool_t   *fmr_pool;            /* pool of FMR */
+} kib_fmr_t;
+
+typedef struct kib_net
+{
+       struct list_head           ibn_list;      /* chain on kib_dev_t::ibd_nets */
+       __u64           ibn_incarnation;   /* my epoch */
+       int               ibn_init;       /* initialisation state */
+       int               ibn_shutdown;      /* shutting down? */
+
+       atomic_t                ibn_npeers;     /* # peers extant */
+       atomic_t                ibn_nconns;     /* # connections extant */
+
+       kib_tx_poolset_t        **ibn_tx_ps;    /* tx pool-set */
+       kib_fmr_poolset_t       **ibn_fmr_ps;   /* fmr pool-set */
+       kib_pmr_poolset_t       **ibn_pmr_ps;   /* pmr pool-set */
+
+       kib_dev_t               *ibn_dev;       /* underlying IB device */
+} kib_net_t;
+
+#define KIB_THREAD_SHIFT               16
+#define KIB_THREAD_ID(cpt, tid)                ((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id)             ((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id)             ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
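+/* A scheduler thread id encodes its CPT in the high bits and its index in
+ * the low 16 bits, e.g. KIB_THREAD_ID(2, 5) == 0x20005, from which
+ * KIB_THREAD_CPT()/KIB_THREAD_TID() recover 2 and 5. */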
+
+struct kib_sched_info {
+       /* serialise */
+       spinlock_t              ibs_lock;
+       /* schedulers sleep here */
+       wait_queue_head_t               ibs_waitq;
+       /* conns to check for rx completions */
+       struct list_head                ibs_conns;
+       /* number of scheduler threads */
+       int                     ibs_nthreads;
+       /* max allowed scheduler threads */
+       int                     ibs_nthreads_max;
+       int                     ibs_cpt;        /* CPT id */
+};
+
+typedef struct
+{
+       int                     kib_init;       /* initialisation state */
+       int                     kib_shutdown;   /* shut down? */
+       struct list_head                kib_devs;       /* IB devices extant */
+       /* list head of failed devices */
+       struct list_head                kib_failed_devs;
+       /* failover thread sleeps here */
+       wait_queue_head_t               kib_failover_waitq;
+       atomic_t                kib_nthreads;   /* # live threads */
+       /* stabilize net/dev/peer/conn ops */
+       rwlock_t                kib_global_lock;
+       /* hash table of all my known peers */
+       struct list_head                *kib_peers;
+       /* size of kib_peers */
+       int                     kib_peer_hash_size;
+       /* the connd task (serialisation assertions) */
+       void                    *kib_connd;
+       /* connections to setup/teardown */
+       struct list_head                kib_connd_conns;
+       /* connections with zero refcount */
+       struct list_head                kib_connd_zombies;
+       /* connection daemon sleeps here */
+       wait_queue_head_t               kib_connd_waitq;
+       spinlock_t              kib_connd_lock; /* serialise */
+       struct ib_qp_attr       kib_error_qpa;  /* QP->ERROR */
+       /* percpt data for schedulers */
+       struct kib_sched_info   **kib_scheds;
+} kib_data_t;
+
+#define IBLND_INIT_NOTHING      0
+#define IBLND_INIT_DATA            1
+#define IBLND_INIT_ALL      2
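+/* Progress markers for kib_init/ibn_init; teardown paths use them to undo
+ * only the initialisation that actually completed. */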
+
+/************************************************************************
+ * IB Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+typedef struct kib_connparams
+{
+       __u16        ibcp_queue_depth;
+       __u16        ibcp_max_frags;
+       __u32        ibcp_max_msg_size;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct
+{
+       lnet_hdr_t      ibim_hdr;            /* portals header */
+       char          ibim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kib_immediate_msg_t;
+
+typedef struct
+{
+       __u32        rf_nob;           /* # bytes this frag */
+       __u64        rf_addr;         /* CAVEAT EMPTOR: misaligned!! */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct
+{
+       __u32        rd_key;           /* local/remote key */
+       __u32        rd_nfrags;     /* # fragments */
+       kib_rdma_frag_t   rd_frags[0];    /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
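+/* rd_frags[] is a variable-length tail: the wire size of a descriptor with
+ * n fragments is offsetof(..., rd_frags[n]), as computed by
+ * kiblnd_rd_msg_size() below. */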
+
+typedef struct
+{
+       lnet_hdr_t      ibprm_hdr;          /* portals header */
+       __u64        ibprm_cookie;       /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct
+{
+       __u64        ibpam_src_cookie;     /* reflected completion cookie */
+       __u64        ibpam_dst_cookie;     /* opaque completion cookie */
+       kib_rdma_desc_t   ibpam_rd;          /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct
+{
+       lnet_hdr_t      ibgm_hdr;            /* portals header */
+       __u64        ibgm_cookie;         /* opaque completion cookie */
+       kib_rdma_desc_t   ibgm_rd;            /* rdma descriptor */
+} WIRE_ATTR kib_get_msg_t;
+
+typedef struct
+{
+       __u64        ibcm_cookie;         /* opaque completion cookie */
+       __s32        ibcm_status;         /* < 0 failure; >= 0 length */
+} WIRE_ATTR kib_completion_msg_t;
+
+typedef struct
+{
+       /* First 2 fields fixed FOR ALL TIME */
+       __u32        ibm_magic;     /* I'm an ibnal message */
+       __u16        ibm_version;         /* this is my version number */
+
+       __u8          ibm_type;      /* msg type */
+       __u8          ibm_credits;        /* returned credits */
+       __u32        ibm_nob;         /* # bytes in whole message */
+       __u32        ibm_cksum;     /* checksum (0 == no checksum) */
+       __u64        ibm_srcnid;           /* sender's NID */
+       __u64        ibm_srcstamp;       /* sender's incarnation */
+       __u64        ibm_dstnid;           /* destination's NID */
+       __u64        ibm_dststamp;       /* destination's incarnation */
+
+       union {
+               kib_connparams_t      connparams;
+               kib_immediate_msg_t   immediate;
+               kib_putreq_msg_t      putreq;
+               kib_putack_msg_t      putack;
+               kib_get_msg_t    get;
+               kib_completion_msg_t  completion;
+       } WIRE_ATTR ibm_u;
+} WIRE_ATTR kib_msg_t;
+
+#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC    /* unique magic */
+
+#define IBLND_MSG_VERSION_1     0x11
+#define IBLND_MSG_VERSION_2     0x12
+#define IBLND_MSG_VERSION         IBLND_MSG_VERSION_2
+
+#define IBLND_MSG_CONNREQ         0xc0 /* connection request */
+#define IBLND_MSG_CONNACK         0xc1 /* connection acknowledge */
+#define IBLND_MSG_NOOP       0xd0      /* nothing (just credits) */
+#define IBLND_MSG_IMMEDIATE     0xd1   /* immediate */
+#define IBLND_MSG_PUT_REQ         0xd2 /* putreq (src->sink) */
+#define IBLND_MSG_PUT_NAK         0xd3 /* completion (sink->src) */
+#define IBLND_MSG_PUT_ACK         0xd4 /* putack (sink->src) */
+#define IBLND_MSG_PUT_DONE       0xd5  /* completion (src->sink) */
+#define IBLND_MSG_GET_REQ         0xd6 /* getreq (sink->src) */
+#define IBLND_MSG_GET_DONE       0xd7  /* completion (src->sink: all OK) */
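+/* A PUT is PUT_REQ -> PUT_ACK (carrying the sink's RDMA descriptor) ->
+ * RDMA by the source -> PUT_DONE, with PUT_NAK as the failure reply.
+ * A GET_REQ carries the requester's reply descriptor and is answered by
+ * RDMA from the source followed by GET_DONE. */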
+
+typedef struct {
+       __u32       ibr_magic;       /* sender's magic */
+       __u16       ibr_version;           /* sender's version */
+       __u8         ibr_why;          /* reject reason */
+       __u8         ibr_padding;          /* padding */
+       __u64       ibr_incarnation;       /* incarnation of peer */
+       kib_connparams_t ibr_cp;                /* connection parameters */
+} WIRE_ATTR kib_rej_t;
+
+/* connection rejection reasons */
+#define IBLND_REJECT_CONN_RACE       1   /* You lost connection race */
+#define IBLND_REJECT_NO_RESOURCES    2   /* Out of memory/conns etc */
+#define IBLND_REJECT_FATAL        3      /* Anything else */
+
+#define IBLND_REJECT_CONN_UNCOMPAT   4   /* incompatible version peer */
+#define IBLND_REJECT_CONN_STALE      5   /* stale peer */
+
+#define IBLND_REJECT_RDMA_FRAGS      6   /* Fatal: peer's rdma frags can't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE  7   /* Fatal: peer's msg queue size can't match mine */
+
+/***********************************************************************/
+
+typedef struct kib_rx                     /* receive message */
+{
+       struct list_head                rx_list;      /* queue for attention */
+       struct kib_conn   *rx_conn;      /* owning conn */
+       int                    rx_nob;       /* # bytes received (-1 while posted) */
+       enum ib_wc_status        rx_status;    /* completion status */
+       kib_msg_t               *rx_msg;       /* message buffer (host vaddr) */
+       __u64                rx_msgaddr;   /* message buffer (I/O addr) */
+       DECLARE_PCI_UNMAP_ADDR   (rx_msgunmap); /* for dma_unmap_single() */
+       struct ib_recv_wr        rx_wrq;       /* receive work item... */
+       struct ib_sge        rx_sge;       /* ...and its memory */
+} kib_rx_t;
+
+#define IBLND_POSTRX_DONT_POST    0         /* don't post */
+#define IBLND_POSTRX_NO_CREDIT    1         /* post: no credits */
+#define IBLND_POSTRX_PEER_CREDIT  2         /* post: give peer back 1 credit */
+#define IBLND_POSTRX_RSRVD_CREDIT 3         /* post: give myself back 1 reserved credit */
+
+typedef struct kib_tx                     /* transmit message */
+{
+       struct list_head                tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+       kib_tx_pool_t       *tx_pool;      /* pool I'm from */
+       struct kib_conn   *tx_conn;      /* owning conn */
+       short                tx_sending;   /* # tx callbacks outstanding */
+       short                tx_queued;    /* queued for sending */
+       short                tx_waiting;   /* waiting for peer */
+       int                    tx_status;    /* LNET completion status */
+       unsigned long        tx_deadline;  /* completion deadline */
+       __u64                tx_cookie;    /* completion cookie */
+       lnet_msg_t             *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
+       kib_msg_t               *tx_msg;       /* message buffer (host vaddr) */
+       __u64                tx_msgaddr;   /* message buffer (I/O addr) */
+       DECLARE_PCI_UNMAP_ADDR   (tx_msgunmap); /* for dma_unmap_single() */
+       int                    tx_nwrq;      /* # send work items */
+       struct ib_send_wr       *tx_wrq;       /* send work items... */
+       struct ib_sge       *tx_sge;       /* ...and their memory */
+       kib_rdma_desc_t   *tx_rd;       /* rdma descriptor */
+       int                    tx_nfrags;    /* # entries in... */
+       struct scatterlist       *tx_frags;     /* dma_map_sg descriptor */
+       __u64               *tx_pages;     /* rdma phys page addrs */
+       union {
+               kib_phys_mr_t      *pmr;        /* MR for physical buffer */
+               kib_fmr_t          fmr; /* FMR */
+       }                        tx_u;
+       int                    tx_dmadir;    /* dma direction */
+} kib_tx_t;
+
+typedef struct kib_connvars
+{
+       /* connection-in-progress variables */
+       kib_msg_t                cv_msg;
+} kib_connvars_t;
+
+typedef struct kib_conn
+{
+       struct kib_sched_info *ibc_sched;       /* scheduler information */
+       struct kib_peer     *ibc_peer;    /* owning peer */
+       kib_hca_dev_t       *ibc_hdev;    /* HCA bound on */
+       struct list_head           ibc_list;      /* stash on peer's conn list */
+       struct list_head           ibc_sched_list;    /* schedule for attention */
+       __u16           ibc_version;       /* version of connection */
+       __u64           ibc_incarnation;   /* which instance of the peer */
+       atomic_t         ibc_refcount;      /* # users */
+       int               ibc_state;     /* what's happening */
+       int               ibc_nsends_posted; /* # uncompleted sends */
+       int               ibc_noops_posted;  /* # uncompleted NOOPs */
+       int               ibc_credits;       /* # credits I have */
+       int               ibc_outstanding_credits; /* # credits to return */
+       int               ibc_reserved_credits; /* # ACK/DONE msg credits */
+       int               ibc_comms_error;   /* set on comms error */
+       unsigned int         ibc_nrx:16;        /* receive buffers owned */
+       unsigned int         ibc_scheduled:1;   /* scheduled for attention */
+       unsigned int         ibc_ready:1;       /* CQ callback fired */
+       /* time of last send */
+       unsigned long   ibc_last_send;
+       /** link chain for kiblnd_check_conns only */
+       struct list_head           ibc_connd_list;
+       /** rxs completed before ESTABLISHED */
+       struct list_head           ibc_early_rxs;
+       /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+       struct list_head           ibc_tx_noops;
+       struct list_head           ibc_tx_queue;       /* sends that need a credit */
+       struct list_head           ibc_tx_queue_nocred; /* sends that don't need a credit */
+       struct list_head           ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
+       struct list_head           ibc_active_txs;     /* active tx awaiting completion */
+       spinlock_t           ibc_lock;           /* serialise */
+       kib_rx_t            *ibc_rxs;       /* the rx descs */
+       kib_pages_t      *ibc_rx_pages;       /* premapped rx msg pages */
+
+       struct rdma_cm_id   *ibc_cmid;     /* CM id */
+       struct ib_cq    *ibc_cq;             /* completion queue */
+
+       kib_connvars_t      *ibc_connvars;       /* in-progress connection state */
+} kib_conn_t;
+
+#define IBLND_CONN_INIT               0         /* being initialised */
+#define IBLND_CONN_ACTIVE_CONNECT     1         /* active sending req */
+#define IBLND_CONN_PASSIVE_WAIT       2         /* passive waiting for rtu */
+#define IBLND_CONN_ESTABLISHED 3        /* connection established */
+#define IBLND_CONN_CLOSING         4    /* being closed */
+#define IBLND_CONN_DISCONNECTED       5         /* disconnected */
+
+typedef struct kib_peer
+{
+       struct list_head           ibp_list;       /* stash on global peer list */
+       lnet_nid_t         ibp_nid;         /* who's on the other end(s) */
+       lnet_ni_t          *ibp_ni;          /* LNet interface */
+       atomic_t         ibp_refcount;       /* # users */
+       struct list_head           ibp_conns;     /* all active connections */
+       struct list_head           ibp_tx_queue;       /* msgs waiting for a conn */
+       __u16           ibp_version;    /* version of peer */
+       __u64           ibp_incarnation;    /* incarnation of peer */
+       int               ibp_connecting;     /* current active connection attempts */
+       int               ibp_accepting;      /* current passive connection attempts */
+       int               ibp_error;      /* errno on closing this peer */
+       cfs_time_t         ibp_last_alive;     /* when (in jiffies) this peer was last seen alive */
+} kib_peer_t;
+
+extern kib_data_t      kiblnd_data;
+
+extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+
+static inline void
+kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+{
+       LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+       atomic_inc(&hdev->ibh_ref);
+}
+
+static inline void
+kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+{
+       LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+       if (atomic_dec_and_test(&hdev->ibh_ref))
+               kiblnd_hdev_destroy(hdev);
+}
+
+static inline int
+kiblnd_dev_can_failover(kib_dev_t *dev)
+{
+       if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
+               return 0;
+
+       if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
+               return 0;
+
+       if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
+               return 1;
+
+       return dev->ibd_can_failover;
+}
+
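+/* Refcounting helpers: dropping the last reference on a connection hands it
+ * to the connd thread via kib_connd_zombies; dropping the last reference on
+ * a peer destroys it directly. */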
+#define kiblnd_conn_addref(conn)                               \
+do {                                                       \
+       CDEBUG(D_NET, "conn[%p] (%d)++\n",                    \
+              (conn), atomic_read(&(conn)->ibc_refcount)); \
+       atomic_inc(&(conn)->ibc_refcount);                \
+} while (0)
+
+#define kiblnd_conn_decref(conn)                                       \
+do {                                                                   \
+       unsigned long flags;                                            \
+                                                                       \
+       CDEBUG(D_NET, "conn[%p] (%d)--\n",                              \
+              (conn), atomic_read(&(conn)->ibc_refcount));             \
+       LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);                      \
+       if (atomic_dec_and_test(&(conn)->ibc_refcount)) {               \
+               spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);  \
+               list_add_tail(&(conn)->ibc_list,                        \
+                                 &kiblnd_data.kib_connd_zombies);      \
+               wake_up(&kiblnd_data.kib_connd_waitq);          \
+               spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+       }                                                               \
+} while (0)
+
+#define kiblnd_peer_addref(peer)                               \
+do {                                                       \
+       CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
+              (peer), libcfs_nid2str((peer)->ibp_nid),  \
+              atomic_read (&(peer)->ibp_refcount));    \
+       atomic_inc(&(peer)->ibp_refcount);                \
+} while (0)
+
+#define kiblnd_peer_decref(peer)                               \
+do {                                                       \
+       CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
+              (peer), libcfs_nid2str((peer)->ibp_nid),  \
+              atomic_read (&(peer)->ibp_refcount));    \
+       LASSERT_ATOMIC_POS(&(peer)->ibp_refcount);            \
+       if (atomic_dec_and_test(&(peer)->ibp_refcount))     \
+               kiblnd_destroy_peer(peer);                    \
+} while (0)
+
+static inline struct list_head *
+kiblnd_nid2peerlist (lnet_nid_t nid)
+{
+       unsigned int hash =
+               ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
+
+       return &kiblnd_data.kib_peers[hash];
+}
+
+static inline int
+kiblnd_peer_active (kib_peer_t *peer)
+{
+       /* Am I in the peer hash table? */
+       return (!list_empty(&peer->ibp_list));
+}
+
+static inline kib_conn_t *
+kiblnd_get_conn_locked (kib_peer_t *peer)
+{
+       LASSERT (!list_empty(&peer->ibp_conns));
+
+       /* just return the first connection */
+       return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
+}
+
+static inline int
+kiblnd_send_keepalive(kib_conn_t *conn)
+{
+       return (*kiblnd_tunables.kib_keepalive > 0) &&
+               cfs_time_after(jiffies, conn->ibc_last_send +
+                              *kiblnd_tunables.kib_keepalive*HZ);
+}
+
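+/* A NOOP is needed (roughly) when enough credits are owed to the peer, or a
+ * keepalive is due, and there is no queued tx the credits could piggy-back
+ * on. */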
+static inline int
+kiblnd_need_noop(kib_conn_t *conn)
+{
+       LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       if (conn->ibc_outstanding_credits <
+           IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+           !kiblnd_send_keepalive(conn))
+               return 0; /* No need to send NOOP */
+
+       if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+               if (!list_empty(&conn->ibc_tx_queue_nocred))
+                       return 0; /* NOOP can be piggybacked */
+
+               /* No tx to piggyback NOOP onto or no credit to send a tx */
+               return (list_empty(&conn->ibc_tx_queue) ||
+                       conn->ibc_credits == 0);
+       }
+
+       if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+           !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
+           conn->ibc_credits == 0)                 /* no credit */
+               return 0;
+
+       if (conn->ibc_credits == 1 &&      /* last credit reserved for */
+           conn->ibc_outstanding_credits == 0) /* giving back credits */
+               return 0;
+
+       /* No tx to piggyback NOOP onto or no credit to send a tx */
+       return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
+}
+
+static inline void
+kiblnd_abort_receives(kib_conn_t *conn)
+{
+       ib_modify_qp(conn->ibc_cmid->qp,
+                    &kiblnd_data.kib_error_qpa, IB_QP_STATE);
+}
+
+static inline const char *
+kiblnd_queue2str (kib_conn_t *conn, struct list_head *q)
+{
+       if (q == &conn->ibc_tx_queue)
+               return "tx_queue";
+
+       if (q == &conn->ibc_tx_queue_rsrvd)
+               return "tx_queue_rsrvd";
+
+       if (q == &conn->ibc_tx_queue_nocred)
+               return "tx_queue_nocred";
+
+       if (q == &conn->ibc_active_txs)
+               return "active_txs";
+
+       LBUG();
+       return NULL;
+}
+
+/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
+ * lowest bits of the work request id to stash the work item type. */
+
+#define IBLND_WID_TX    0
+#define IBLND_WID_RDMA  1
+#define IBLND_WID_RX    2
+#define IBLND_WID_MASK  3UL
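+/* e.g. an aligned kib_tx_t at address A posted for RDMA gets work request id
+ * (A | IBLND_WID_RDMA); kiblnd_wreqid2ptr()/kiblnd_wreqid2type() below split
+ * the id back into pointer and type. */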
+
+static inline __u64
+kiblnd_ptr2wreqid (void *ptr, int type)
+{
+       unsigned long lptr = (unsigned long)ptr;
+
+       LASSERT ((lptr & IBLND_WID_MASK) == 0);
+       LASSERT ((type & ~IBLND_WID_MASK) == 0);
+       return (__u64)(lptr | type);
+}
+
+static inline void *
+kiblnd_wreqid2ptr (__u64 wreqid)
+{
+       return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
+}
+
+static inline int
+kiblnd_wreqid2type (__u64 wreqid)
+{
+       return (wreqid & IBLND_WID_MASK);
+}
+
+static inline void
+kiblnd_set_conn_state (kib_conn_t *conn, int state)
+{
+       conn->ibc_state = state;
+       mb();
+}
+
+static inline void
+kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
+{
+       msg->ibm_type = type;
+       msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
+
+static inline int
+kiblnd_rd_size (kib_rdma_desc_t *rd)
+{
+       int   i;
+       int   size;
+
+       for (i = size = 0; i < rd->rd_nfrags; i++)
+               size += rd->rd_frags[i].rf_nob;
+
+       return size;
+}
+
+static inline __u64
+kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+{
+       return rd->rd_frags[index].rf_addr;
+}
+
+static inline __u32
+kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+{
+       return rd->rd_frags[index].rf_nob;
+}
+
+static inline __u32
+kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+{
+       return rd->rd_key;
+}
+
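+/* Consume 'nob' bytes of fragment 'index': if the fragment still has data
+ * left, shrink it in place and stay on it, otherwise advance to the next
+ * fragment. */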
+static inline int
+kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+{
+       if (nob < rd->rd_frags[index].rf_nob) {
+               rd->rd_frags[index].rf_addr += nob;
+               rd->rd_frags[index].rf_nob  -= nob;
+       } else {
+               index++;
+       }
+
+       return index;
+}
+
+static inline int
+kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+{
+       LASSERT (msgtype == IBLND_MSG_GET_REQ ||
+                msgtype == IBLND_MSG_PUT_ACK);
+
+       return msgtype == IBLND_MSG_GET_REQ ?
+              offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) :
+              offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+}
+
+
+static inline __u64
+kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       return ib_dma_mapping_error(dev, dma_addr);
+}
+
+static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
+                                         void *msg, size_t size,
+                                         enum dma_data_direction direction)
+{
+       return ib_dma_map_single(dev, msg, size, direction);
+}
+
+static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
+                                          __u64 addr, size_t size,
+                                         enum dma_data_direction direction)
+{
+       ib_dma_unmap_single(dev, addr, size, direction);
+}
+
+#define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
+#define KIBLND_UNMAP_ADDR(p, m, a)      (a)
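+/* No separate unmap cookie is kept: KIBLND_UNMAP_ADDR_SET() is a no-op and
+ * KIBLND_UNMAP_ADDR() simply returns the address it is given. */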
+
+static inline int kiblnd_dma_map_sg(struct ib_device *dev,
+                                   struct scatterlist *sg, int nents,
+                                   enum dma_data_direction direction)
+{
+       return ib_dma_map_sg(dev, sg, nents, direction);
+}
+
+static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
+                                      struct scatterlist *sg, int nents,
+                                      enum dma_data_direction direction)
+{
+       ib_dma_unmap_sg(dev, sg, nents, direction);
+}
+
+static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
+                                         struct scatterlist *sg)
+{
+       return ib_sg_dma_address(dev, sg);
+}
+
+static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
+                                            struct scatterlist *sg)
+{
+       return ib_sg_dma_len(dev, sg);
+}
+
+/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly
+ * right because OFED1.2 defines it as const, to use it we have to add
+ * (void *) cast to overcome "const" */
+
+#define KIBLND_CONN_PARAM(e)       ((e)->param.conn.private_data)
+#define KIBLND_CONN_PARAM_LEN(e)       ((e)->param.conn.private_data_len)
+
+
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
+                                   kib_rdma_desc_t *rd);
+struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev,
+                                __u64 addr, __u64 size);
+void kiblnd_map_rx_descs(kib_conn_t *conn);
+void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+                 kib_rdma_desc_t *rd, int nfrags);
+void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
+
+int  kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
+                        int npages, __u64 iov, kib_fmr_t *fmr);
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
+
+int  kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+                        kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr);
+void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
+
+int  kiblnd_startup (lnet_ni_t *ni);
+void kiblnd_shutdown (lnet_ni_t *ni);
+int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+
+int  kiblnd_tunables_init(void);
+void kiblnd_tunables_fini(void);
+
+int  kiblnd_connd (void *arg);
+int  kiblnd_scheduler(void *arg);
+int  kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+int  kiblnd_failover_thread (void *arg);
+
+int  kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
+void kiblnd_free_pages (kib_pages_t *p);
+
+int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
+                       struct rdma_cm_event *event);
+int  kiblnd_translate_mtu(int value);
+
+int  kiblnd_dev_failover(kib_dev_t *dev);
+int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_destroy_dev (kib_dev_t *dev);
+void kiblnd_unlink_peer_locked (kib_peer_t *peer);
+void kiblnd_peer_alive (kib_peer_t *peer);
+kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
+void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
+int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+                                     int version, __u64 incarnation);
+int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
+
+void kiblnd_connreq_done(kib_conn_t *conn, int status);
+kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
+                               int state, int version);
+void kiblnd_destroy_conn (kib_conn_t *conn);
+void kiblnd_close_conn (kib_conn_t *conn, int error);
+void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
+
+int  kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+                      int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
+
+void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
+void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
+void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+                        int status);
+void kiblnd_check_sends (kib_conn_t *conn);
+
+void kiblnd_qp_event(struct ib_event *event, void *arg);
+void kiblnd_cq_event(struct ib_event *event, void *arg);
+void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
+
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+                     int credits, lnet_nid_t dstnid, __u64 dststamp);
+int  kiblnd_unpack_msg(kib_msg_t *msg, int nob);
+int  kiblnd_post_rx (kib_rx_t *rx, int credit);
+
+int  kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+                unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+                unsigned int offset, unsigned int mlen, unsigned int rlen);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
new file mode 100644 (file)
index 0000000..cc62321
--- /dev/null
@@ -0,0 +1,3529 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_cb.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+void
+kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx)
+{
+       lnet_msg_t *lntmsg[2];
+       kib_net_t  *net = ni->ni_data;
+       int      rc;
+       int      i;
+
+       LASSERT (net != NULL);
+       LASSERT (!in_interrupt());
+       LASSERT (!tx->tx_queued);              /* mustn't be queued for sending */
+       LASSERT (tx->tx_sending == 0);    /* mustn't be awaiting sent callback */
+       LASSERT (!tx->tx_waiting);            /* mustn't be awaiting peer response */
+       LASSERT (tx->tx_pool != NULL);
+
+       kiblnd_unmap_tx(ni, tx);
+
+       /* tx may have up to 2 lnet msgs to finalise */
+       lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+       lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+       rc = tx->tx_status;
+
+       if (tx->tx_conn != NULL) {
+               LASSERT (ni == tx->tx_conn->ibc_peer->ibp_ni);
+
+               kiblnd_conn_decref(tx->tx_conn);
+               tx->tx_conn = NULL;
+       }
+
+       tx->tx_nwrq = 0;
+       tx->tx_status = 0;
+
+       kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
+
+       /* delay finalize until my descs have been freed */
+       for (i = 0; i < 2; i++) {
+               if (lntmsg[i] == NULL)
+                       continue;
+
+               lnet_finalize(ni, lntmsg[i], rc);
+       }
+}
+
+void
+kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status)
+{
+       kib_tx_t *tx;
+
+       while (!list_empty (txlist)) {
+               tx = list_entry (txlist->next, kib_tx_t, tx_list);
+
+               list_del(&tx->tx_list);
+               /* complete now */
+               tx->tx_waiting = 0;
+               tx->tx_status = status;
+               kiblnd_tx_done(ni, tx);
+       }
+}
+
+kib_tx_t *
+kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
+{
+       kib_net_t               *net = (kib_net_t *)ni->ni_data;
+       struct list_head                *node;
+       kib_tx_t                *tx;
+       kib_tx_poolset_t        *tps;
+
+       tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+       node = kiblnd_pool_alloc_node(&tps->tps_poolset);
+       if (node == NULL)
+               return NULL;
+       tx = container_of(node, kib_tx_t, tx_list);
+
+       LASSERT (tx->tx_nwrq == 0);
+       LASSERT (!tx->tx_queued);
+       LASSERT (tx->tx_sending == 0);
+       LASSERT (!tx->tx_waiting);
+       LASSERT (tx->tx_status == 0);
+       LASSERT (tx->tx_conn == NULL);
+       LASSERT (tx->tx_lntmsg[0] == NULL);
+       LASSERT (tx->tx_lntmsg[1] == NULL);
+       LASSERT (tx->tx_u.pmr == NULL);
+       LASSERT (tx->tx_nfrags == 0);
+
+       return tx;
+}
+
+void
+kiblnd_drop_rx(kib_rx_t *rx)
+{
+       kib_conn_t              *conn   = rx->rx_conn;
+       struct kib_sched_info   *sched  = conn->ibc_sched;
+       unsigned long           flags;
+
+       spin_lock_irqsave(&sched->ibs_lock, flags);
+       LASSERT(conn->ibc_nrx > 0);
+       conn->ibc_nrx--;
+       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+       kiblnd_conn_decref(conn);
+}
+
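+/* Re-post a receive buffer on the connection's QP.  'credit' says whether
+ * the repost also owes the peer a credit (PEER_CREDIT), replenishes a
+ * reserved ACK/DONE credit (RSRVD_CREDIT) or returns nothing (NO_CREDIT). */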
+int
+kiblnd_post_rx (kib_rx_t *rx, int credit)
+{
+       kib_conn_t       *conn = rx->rx_conn;
+       kib_net_t         *net = conn->ibc_peer->ibp_ni->ni_data;
+       struct ib_recv_wr  *bad_wrq = NULL;
+       struct ib_mr       *mr;
+       int              rc;
+
+       LASSERT (net != NULL);
+       LASSERT (!in_interrupt());
+       LASSERT (credit == IBLND_POSTRX_NO_CREDIT ||
+                credit == IBLND_POSTRX_PEER_CREDIT ||
+                credit == IBLND_POSTRX_RSRVD_CREDIT);
+
+       mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE);
+       LASSERT (mr != NULL);
+
+       rx->rx_sge.lkey   = mr->lkey;
+       rx->rx_sge.addr   = rx->rx_msgaddr;
+       rx->rx_sge.length = IBLND_MSG_SIZE;
+
+       rx->rx_wrq.next = NULL;
+       rx->rx_wrq.sg_list = &rx->rx_sge;
+       rx->rx_wrq.num_sge = 1;
+       rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
+
+       LASSERT (conn->ibc_state >= IBLND_CONN_INIT);
+       LASSERT (rx->rx_nob >= 0);            /* not posted */
+
+       if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
+               kiblnd_drop_rx(rx);          /* No more posts for this rx */
+               return 0;
+       }
+
+       rx->rx_nob = -1;                        /* flag posted */
+
+       rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+       if (rc != 0) {
+               CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
+               rx->rx_nob = 0;
+       }
+
+       if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
+               return rc;
+
+       if (rc != 0) {
+               kiblnd_close_conn(conn, rc);
+               kiblnd_drop_rx(rx);          /* No more posts for this rx */
+               return rc;
+       }
+
+       if (credit == IBLND_POSTRX_NO_CREDIT)
+               return 0;
+
+       spin_lock(&conn->ibc_lock);
+       if (credit == IBLND_POSTRX_PEER_CREDIT)
+               conn->ibc_outstanding_credits++;
+       else
+               conn->ibc_reserved_credits++;
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_check_sends(conn);
+       return 0;
+}
+
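+/* Find the active tx that a completion refers to, matching on the cookie
+ * reflected by the peer and on the expected message type. */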
+kib_tx_t *
+kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
+{
+       struct list_head   *tmp;
+
+       list_for_each(tmp, &conn->ibc_active_txs) {
+               kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+
+               LASSERT (!tx->tx_queued);
+               LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
+
+               if (tx->tx_cookie != cookie)
+                       continue;
+
+               if (tx->tx_waiting &&
+                   tx->tx_msg->ibm_type == txtype)
+                       return tx;
+
+               CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+                     tx->tx_waiting ? "" : "NOT ",
+                     tx->tx_msg->ibm_type, txtype);
+       }
+       return NULL;
+}
+
+void
+kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+       kib_tx_t    *tx;
+       lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+       int       idle;
+
+       spin_lock(&conn->ibc_lock);
+
+       tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
+       if (tx == NULL) {
+               spin_unlock(&conn->ibc_lock);
+
+               CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+                     txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               kiblnd_close_conn(conn, -EPROTO);
+               return;
+       }
+
+       if (tx->tx_status == 0) {              /* success so far */
+               if (status < 0) {              /* failed? */
+                       tx->tx_status = status;
+               } else if (txtype == IBLND_MSG_GET_REQ) {
+                       lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+               }
+       }
+
+       tx->tx_waiting = 0;
+
+       idle = !tx->tx_queued && (tx->tx_sending == 0);
+       if (idle)
+               list_del(&tx->tx_list);
+
+       spin_unlock(&conn->ibc_lock);
+
+       if (idle)
+               kiblnd_tx_done(ni, tx);
+}
+
+void
+kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+       lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+       kib_tx_t    *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+
+       if (tx == NULL) {
+               CERROR("Can't get tx for completion %x for %s\n",
+                      type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               return;
+       }
+
+       tx->tx_msg->ibm_u.completion.ibcm_status = status;
+       tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+       kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));
+
+       kiblnd_queue_tx(tx, conn);
+}
+
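+/* Dispatch one received message: bank any piggy-backed credits first
+ * (possibly waking queued sends), then let the message type decide how the
+ * receive buffer is re-posted. */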
+void
+kiblnd_handle_rx (kib_rx_t *rx)
+{
+       kib_msg_t    *msg = rx->rx_msg;
+       kib_conn_t   *conn = rx->rx_conn;
+       lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+       int        credits = msg->ibm_credits;
+       kib_tx_t     *tx;
+       int        rc = 0;
+       int        rc2;
+       int        post_credit;
+
+       LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       CDEBUG (D_NET, "Received %x[%d] from %s\n",
+               msg->ibm_type, credits,
+               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+       if (credits != 0) {
+               /* Have I received credits that will let me send? */
+               spin_lock(&conn->ibc_lock);
+
+               if (conn->ibc_credits + credits >
+                   IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
+                       rc2 = conn->ibc_credits;
+                       spin_unlock(&conn->ibc_lock);
+
+                       CERROR("Bad credits from %s: %d + %d > %d\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                              rc2, credits,
+                              IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+
+                       kiblnd_close_conn(conn, -EPROTO);
+                       kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+                       return;
+               }
+
+               conn->ibc_credits += credits;
+
+               /* This ensures the credit taken by NOOP can be returned */
+               if (msg->ibm_type == IBLND_MSG_NOOP &&
+                   !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+                       conn->ibc_outstanding_credits++;
+
+               spin_unlock(&conn->ibc_lock);
+               kiblnd_check_sends(conn);
+       }
+
+       switch (msg->ibm_type) {
+       default:
+               CERROR("Bad IBLND message type %x from %s\n",
+                      msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               post_credit = IBLND_POSTRX_NO_CREDIT;
+               rc = -EPROTO;
+               break;
+
+       case IBLND_MSG_NOOP:
+               if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+                       post_credit = IBLND_POSTRX_NO_CREDIT;
+                       break;
+               }
+
+               if (credits != 0) /* credit already posted */
+                       post_credit = IBLND_POSTRX_NO_CREDIT;
+               else          /* a keepalive NOOP */
+                       post_credit = IBLND_POSTRX_PEER_CREDIT;
+               break;
+
+       case IBLND_MSG_IMMEDIATE:
+               post_credit = IBLND_POSTRX_DONT_POST;
+               rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
+                               msg->ibm_srcnid, rx, 0);
+               if (rc < 0)                  /* repost on error */
+                       post_credit = IBLND_POSTRX_PEER_CREDIT;
+               break;
+
+       case IBLND_MSG_PUT_REQ:
+               post_credit = IBLND_POSTRX_DONT_POST;
+               rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
+                               msg->ibm_srcnid, rx, 1);
+               if (rc < 0)                  /* repost on error */
+                       post_credit = IBLND_POSTRX_PEER_CREDIT;
+               break;
+
+       case IBLND_MSG_PUT_NAK:
+               CWARN ("PUT_NACK from %s\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+               kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
+                                        msg->ibm_u.completion.ibcm_status,
+                                        msg->ibm_u.completion.ibcm_cookie);
+               break;
+
+       case IBLND_MSG_PUT_ACK:
+               post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+
+               spin_lock(&conn->ibc_lock);
+               tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
+                                       msg->ibm_u.putack.ibpam_src_cookie);
+               if (tx != NULL)
+                       list_del(&tx->tx_list);
+               spin_unlock(&conn->ibc_lock);
+
+               if (tx == NULL) {
+                       CERROR("Unmatched PUT_ACK from %s\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       rc = -EPROTO;
+                       break;
+               }
+
+               LASSERT (tx->tx_waiting);
+               /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+                * (a) I can overwrite tx_msg since my peer has received it!
+                * (b) tx_waiting set tells tx_complete() it's not done. */
+
+               tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
+
+               rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
+                                      kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
+                                      &msg->ibm_u.putack.ibpam_rd,
+                                      msg->ibm_u.putack.ibpam_dst_cookie);
+               if (rc2 < 0)
+                       CERROR("Can't setup rdma for PUT to %s: %d\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+               spin_lock(&conn->ibc_lock);
+               tx->tx_waiting = 0;     /* clear waiting and queue atomically */
+               kiblnd_queue_tx_locked(tx, conn);
+               spin_unlock(&conn->ibc_lock);
+               break;
+
+       case IBLND_MSG_PUT_DONE:
+               post_credit = IBLND_POSTRX_PEER_CREDIT;
+               kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
+                                        msg->ibm_u.completion.ibcm_status,
+                                        msg->ibm_u.completion.ibcm_cookie);
+               break;
+
+       case IBLND_MSG_GET_REQ:
+               post_credit = IBLND_POSTRX_DONT_POST;
+               rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
+                               msg->ibm_srcnid, rx, 1);
+               if (rc < 0)                  /* repost on error */
+                       post_credit = IBLND_POSTRX_PEER_CREDIT;
+               break;
+
+       case IBLND_MSG_GET_DONE:
+               post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+               kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
+                                        msg->ibm_u.completion.ibcm_status,
+                                        msg->ibm_u.completion.ibcm_cookie);
+               break;
+       }
+
+       if (rc < 0)                          /* protocol error */
+               kiblnd_close_conn(conn, rc);
+
+       if (post_credit != IBLND_POSTRX_DONT_POST)
+               kiblnd_post_rx(rx, post_credit);
+}
+
+void
+kiblnd_rx_complete (kib_rx_t *rx, int status, int nob)
+{
+       kib_msg_t    *msg = rx->rx_msg;
+       kib_conn_t   *conn = rx->rx_conn;
+       lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+       kib_net_t    *net = ni->ni_data;
+       int        rc;
+       int        err = -EIO;
+
+       LASSERT (net != NULL);
+       LASSERT (rx->rx_nob < 0);              /* was posted */
+       rx->rx_nob = 0;                  /* isn't now */
+
+       if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
+               goto ignore;
+
+       if (status != IB_WC_SUCCESS) {
+               CNETERR("Rx from %s failed: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
+               goto failed;
+       }
+
+       LASSERT (nob >= 0);
+       rx->rx_nob = nob;
+
+       rc = kiblnd_unpack_msg(msg, rx->rx_nob);
+       if (rc != 0) {
+               CERROR ("Error %d unpacking rx from %s\n",
+                       rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               goto failed;
+       }
+
+       if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+           msg->ibm_dstnid != ni->ni_nid ||
+           msg->ibm_srcstamp != conn->ibc_incarnation ||
+           msg->ibm_dststamp != net->ibn_incarnation) {
+               CERROR ("Stale rx from %s\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               err = -ESTALE;
+               goto failed;
+       }
+
+       /* set time last known alive */
+       kiblnd_peer_alive(conn->ibc_peer);
+
+       /* racing with connection establishment/teardown! */
+
+       if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+               rwlock_t  *g_lock = &kiblnd_data.kib_global_lock;
+               unsigned long  flags;
+
+               write_lock_irqsave(g_lock, flags);
+               /* must check holding global lock to eliminate race */
+               if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+                       list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+                       write_unlock_irqrestore(g_lock, flags);
+                       return;
+               }
+               write_unlock_irqrestore(g_lock, flags);
+       }
+       kiblnd_handle_rx(rx);
+       return;
+
+ failed:
+       CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+       kiblnd_close_conn(conn, err);
+ ignore:
+       kiblnd_drop_rx(rx);                  /* Don't re-post rx. */
+}
+
+struct page *
+kiblnd_kvaddr_to_page (unsigned long vaddr)
+{
+       struct page *page;
+
+       if (vaddr >= VMALLOC_START &&
+           vaddr < VMALLOC_END) {
+               page = vmalloc_to_page ((void *)vaddr);
+               LASSERT (page != NULL);
+               return page;
+       }
+#ifdef CONFIG_HIGHMEM
+       if (vaddr >= PKMAP_BASE &&
+           vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+               /* Highmem pages are only used for bulk (kiov) I/O, so an
+                * iov address should never map to highmem */
+               CERROR("find page for address in highmem\n");
+               LBUG();
+       }
+#endif
+       page = virt_to_page (vaddr);
+       LASSERT (page != NULL);
+       return page;
+}
+
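+/* Map the fragments of 'rd' through an FMR pool: build the page list the
+ * pool expects, then rewrite 'rd' as a single virtually contiguous fragment
+ * covered by the FMR's key. */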
+static int
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+       kib_hca_dev_t           *hdev;
+       __u64                   *pages = tx->tx_pages;
+       kib_fmr_poolset_t       *fps;
+       int                     npages;
+       int                     size;
+       int                     cpt;
+       int                     rc;
+       int                     i;
+
+       LASSERT(tx->tx_pool != NULL);
+       LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+       hdev  = tx->tx_pool->tpo_hdev;
+
+       for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
+               for (size = 0; size <  rd->rd_frags[i].rf_nob;
+                              size += hdev->ibh_page_size) {
+                       pages[npages ++] = (rd->rd_frags[i].rf_addr &
+                                           hdev->ibh_page_mask) + size;
+               }
+       }
+
+       cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+       fps = net->ibn_fmr_ps[cpt];
+       rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr);
+       if (rc != 0) {
+               CERROR ("Can't map %d pages: %d\n", npages, rc);
+               return rc;
+       }
+
+       /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+        * the rkey */
+       rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey :
+                                        tx->tx_u.fmr.fmr_pfmr->fmr->lkey;
+       rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+       rd->rd_frags[0].rf_nob   = nob;
+       rd->rd_nfrags = 1;
+
+       return 0;
+}
+
+static int
+kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+       kib_hca_dev_t           *hdev;
+       kib_pmr_poolset_t       *pps;
+       __u64                   iova;
+       int                     cpt;
+       int                     rc;
+
+       LASSERT(tx->tx_pool != NULL);
+       LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+       hdev = tx->tx_pool->tpo_hdev;
+
+       iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
+
+       cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+       pps = net->ibn_pmr_ps[cpt];
+       rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr);
+       if (rc != 0) {
+               CERROR("Failed to create MR from physical buffer: %d\n", rc);
+               return rc;
+       }
+
+       /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+        * the rkey */
+       rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey :
+                                        tx->tx_u.pmr->pmr_mr->lkey;
+       rd->rd_nfrags = 1;
+       rd->rd_frags[0].rf_addr = iova;
+       rd->rd_frags[0].rf_nob  = nob;
+
+       return 0;
+}
+
+void
+kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
+{
+       kib_net_t  *net = ni->ni_data;
+
+       LASSERT(net != NULL);
+
+       if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) {
+               kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
+               tx->tx_u.fmr.fmr_pfmr = NULL;
+
+       } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) {
+               kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
+               tx->tx_u.pmr = NULL;
+       }
+
+       if (tx->tx_nfrags != 0) {
+               kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+                                   tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+               tx->tx_nfrags = 0;
+       }
+}
+
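+/* DMA-map the tx scatterlist and fill in 'rd'.  Prefer the HCA's
+ * pre-registered DMA MR; fall back to the FMR or PMR pools when the
+ * fragments need an explicit mapping. */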
+int
+kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+             kib_rdma_desc_t *rd, int nfrags)
+{
+       kib_hca_dev_t      *hdev  = tx->tx_pool->tpo_hdev;
+       kib_net_t         *net   = ni->ni_data;
+       struct ib_mr       *mr    = NULL;
+       __u32          nob;
+       int              i;
+
+       /* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+        * RDMA sink */
+       tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+       tx->tx_nfrags = nfrags;
+
+       rd->rd_nfrags =
+               kiblnd_dma_map_sg(hdev->ibh_ibdev,
+                                 tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+
+       for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
+               rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
+                       hdev->ibh_ibdev, &tx->tx_frags[i]);
+               rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
+                       hdev->ibh_ibdev, &tx->tx_frags[i]);
+               nob += rd->rd_frags[i].rf_nob;
+       }
+
+       /* looking for pre-mapping MR */
+       mr = kiblnd_find_rd_dma_mr(hdev, rd);
+       if (mr != NULL) {
+               /* found pre-mapping MR */
+               rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
+               return 0;
+       }
+
+       if (net->ibn_fmr_ps != NULL)
+               return kiblnd_fmr_map_tx(net, tx, rd, nob);
+       else if (net->ibn_pmr_ps != NULL)
+               return kiblnd_pmr_map_tx(net, tx, rd, nob);
+
+       return -EINVAL;
+}
+
+int
+kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+                   unsigned int niov, struct iovec *iov, int offset, int nob)
+{
+       kib_net_t         *net = ni->ni_data;
+       struct page     *page;
+       struct scatterlist *sg;
+       unsigned long       vaddr;
+       int              fragnob;
+       int              page_offset;
+
+       LASSERT (nob > 0);
+       LASSERT (niov > 0);
+       LASSERT (net != NULL);
+
+       while (offset >= iov->iov_len) {
+               offset -= iov->iov_len;
+               niov--;
+               iov++;
+               LASSERT (niov > 0);
+       }
+
+       sg = tx->tx_frags;
+       do {
+               LASSERT (niov > 0);
+
+               vaddr = ((unsigned long)iov->iov_base) + offset;
+               page_offset = vaddr & (PAGE_SIZE - 1);
+               page = kiblnd_kvaddr_to_page(vaddr);
+               if (page == NULL) {
+                       CERROR ("Can't find page\n");
+                       return -EFAULT;
+               }
+
+               fragnob = min((int)(iov->iov_len - offset), nob);
+               fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+               sg_set_page(sg, page, fragnob, page_offset);
+               sg++;
+
+               if (offset + fragnob < iov->iov_len) {
+                       offset += fragnob;
+               } else {
+                       offset = 0;
+                       iov++;
+                       niov--;
+               }
+               nob -= fragnob;
+       } while (nob > 0);
+
+       return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
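For illustration only (not part of the patch): a minimal user-space sketch of the fragment-splitting rule used in kiblnd_setup_rd_iov() above, showing just the page-boundary cap (the real code also caps each fragment by the bytes left in the current iovec entry). The names PAGE_SZ, struct frag and split_frags are invented for this sketch.

    #include <stdio.h>

    #define PAGE_SZ 4096UL          /* stand-in for PAGE_SIZE */

    struct frag { unsigned long off; int nob; };

    /* Split 'nob' bytes starting at virtual offset 'vaddr' into page-bounded
     * fragments, mirroring the sg_set_page() loop above. */
    static int split_frags(unsigned long vaddr, int nob, struct frag *out, int max)
    {
            int n = 0;

            while (nob > 0 && n < max) {
                    int page_off = vaddr & (PAGE_SZ - 1);
                    int fragnob  = nob;

                    if (fragnob > (int)(PAGE_SZ - page_off))
                            fragnob = PAGE_SZ - page_off;  /* stop at page boundary */

                    out[n].off = vaddr;
                    out[n].nob = fragnob;
                    n++;

                    vaddr += fragnob;
                    nob   -= fragnob;
            }
            return n;
    }

    int main(void)
    {
            struct frag f[8];
            int n = split_frags(4096UL - 100, 300, f, 8); /* straddles a boundary */
            int i;

            for (i = 0; i < n; i++)
                    printf("frag %d: off=%lu nob=%d\n", i, f[i].off, f[i].nob);
            return 0;
    }

A 300-byte region starting 100 bytes before a page boundary comes out as two fragments (100 + 200 bytes), which is exactly why rd_nfrags can exceed the iovec count.
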
+int
+kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+                     int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+       kib_net_t         *net = ni->ni_data;
+       struct scatterlist *sg;
+       int              fragnob;
+
+       CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+       LASSERT (nob > 0);
+       LASSERT (nkiov > 0);
+       LASSERT (net != NULL);
+
+       while (offset >= kiov->kiov_len) {
+               offset -= kiov->kiov_len;
+               nkiov--;
+               kiov++;
+               LASSERT (nkiov > 0);
+       }
+
+       sg = tx->tx_frags;
+       do {
+               LASSERT (nkiov > 0);
+
+               fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+               sg_set_page(sg, kiov->kiov_page, fragnob,
+                           kiov->kiov_offset + offset);
+               sg++;
+
+               offset = 0;
+               kiov++;
+               nkiov--;
+               nob -= fragnob;
+       } while (nob > 0);
+
+       return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+int
+kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit)
+{
+       kib_msg_t        *msg = tx->tx_msg;
+       kib_peer_t      *peer = conn->ibc_peer;
+       int             ver = conn->ibc_version;
+       int             rc;
+       int             done;
+       struct ib_send_wr *bad_wrq;
+
+       LASSERT (tx->tx_queued);
+       /* We rely on this for QP sizing */
+       LASSERT (tx->tx_nwrq > 0);
+       LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));
+
+       LASSERT (credit == 0 || credit == 1);
+       LASSERT (conn->ibc_outstanding_credits >= 0);
+       LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+       LASSERT (conn->ibc_credits >= 0);
+       LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+
+       if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+               /* tx completions outstanding... */
+               CDEBUG(D_NET, "%s: posted enough\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               return -EAGAIN;
+       }
+
+       if (credit != 0 && conn->ibc_credits == 0) {   /* no credits */
+               CDEBUG(D_NET, "%s: no credits\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               return -EAGAIN;
+       }
+
+       if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
+           conn->ibc_credits == 1 &&   /* last credit reserved */
+           msg->ibm_type != IBLND_MSG_NOOP) {      /* for NOOP */
+               CDEBUG(D_NET, "%s: not using last credit\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               return -EAGAIN;
+       }
+
+       /* NB don't drop ibc_lock before bumping tx_sending */
+       list_del(&tx->tx_list);
+       tx->tx_queued = 0;
+
+       if (msg->ibm_type == IBLND_MSG_NOOP &&
+           (!kiblnd_need_noop(conn) ||     /* redundant NOOP */
+            (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
+             conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
+               /* OK to drop when posted enough NOOPs, since
+                * kiblnd_check_sends will queue NOOP again when
+                * posted NOOPs complete */
+               spin_unlock(&conn->ibc_lock);
+               kiblnd_tx_done(peer->ibp_ni, tx);
+               spin_lock(&conn->ibc_lock);
+               CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      conn->ibc_noops_posted);
+               return 0;
+       }
+
+       kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
+                       peer->ibp_nid, conn->ibc_incarnation);
+
+       conn->ibc_credits -= credit;
+       conn->ibc_outstanding_credits = 0;
+       conn->ibc_nsends_posted++;
+       if (msg->ibm_type == IBLND_MSG_NOOP)
+               conn->ibc_noops_posted++;
+
+       /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+        * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+        * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+        * and then re-queued here.  It's (just) possible that
+        * tx_sending is non-zero if we've not done the tx_complete()
+        * from the first send; hence the ++ rather than = below. */
+       tx->tx_sending++;
+       list_add(&tx->tx_list, &conn->ibc_active_txs);
+
+       /* I'm still holding ibc_lock! */
+       if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
+               rc = -ECONNABORTED;
+       } else if (tx->tx_pool->tpo_pool.po_failed ||
+                conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
+               /* close_conn will launch failover */
+               rc = -ENETDOWN;
+       } else {
+               rc = ib_post_send(conn->ibc_cmid->qp,
+                                 tx->tx_wrq, &bad_wrq);
+       }
+
+       conn->ibc_last_send = jiffies;
+
+       if (rc == 0)
+               return 0;
+
+       /* NB credits are transferred in the actual
+        * message, which can only be the last work item */
+       conn->ibc_credits += credit;
+       conn->ibc_outstanding_credits += msg->ibm_credits;
+       conn->ibc_nsends_posted--;
+       if (msg->ibm_type == IBLND_MSG_NOOP)
+               conn->ibc_noops_posted--;
+
+       tx->tx_status = rc;
+       tx->tx_waiting = 0;
+       tx->tx_sending--;
+
+       done = (tx->tx_sending == 0);
+       if (done)
+               list_del(&tx->tx_list);
+
+       spin_unlock(&conn->ibc_lock);
+
+       if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+               CERROR("Error %d posting transmit to %s\n",
+                      rc, libcfs_nid2str(peer->ibp_nid));
+       else
+               CDEBUG(D_NET, "Error %d posting transmit to %s\n",
+                      rc, libcfs_nid2str(peer->ibp_nid));
+
+       kiblnd_close_conn(conn, rc);
+
+       if (done)
+               kiblnd_tx_done(peer->ibp_ni, tx);
+
+       spin_lock(&conn->ibc_lock);
+
+       return -EIO;
+}
+
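As a minimal sketch of the credit checks that gate a post in kiblnd_post_tx_locked() above (struct conn_state and can_post are illustrative assumptions, not the driver's API): a send that consumes a credit is deferred when no credits remain, and, for peers without out-of-band NOOP support, the last credit is reserved so a NOOP can always be returned.

    #include <stdbool.h>
    #include <stdio.h>

    struct conn_state {
            int  nsends_posted;     /* sends currently posted to the QP */
            int  concurrent_sends;  /* queue-depth limit */
            int  credits;           /* send credits granted by the peer */
            bool oob_capable;       /* peer accepts out-of-band NOOPs */
    };

    /* Mirror the three -EAGAIN cases in the function above. */
    static bool can_post(const struct conn_state *c, bool needs_credit, bool is_noop)
    {
            if (c->nsends_posted == c->concurrent_sends)
                    return false;   /* QP send queue is full */
            if (needs_credit && c->credits == 0)
                    return false;   /* peer has no receive buffers */
            if (needs_credit && !c->oob_capable && c->credits == 1 && !is_noop)
                    return false;   /* keep the last credit for a NOOP */
            return true;
    }

    int main(void)
    {
            struct conn_state c = { .nsends_posted = 3, .concurrent_sends = 8,
                                    .credits = 1, .oob_capable = false };

            printf("PUT_REQ with last credit: %s\n",
                   can_post(&c, true, false) ? "post" : "defer");
            printf("NOOP with last credit:    %s\n",
                   can_post(&c, true, true) ? "post" : "defer");
            return 0;
    }
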
+void
+kiblnd_check_sends (kib_conn_t *conn)
+{
+       int     ver = conn->ibc_version;
+       lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+       kib_tx_t  *tx;
+
+       /* Don't send anything until after the connection is established */
+       if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+               CDEBUG(D_NET, "%s too soon\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               return;
+       }
+
+       spin_lock(&conn->ibc_lock);
+
+       LASSERT (conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+       LASSERT (!IBLND_OOB_CAPABLE(ver) ||
+                conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
+       LASSERT (conn->ibc_reserved_credits >= 0);
+
+       while (conn->ibc_reserved_credits > 0 &&
+              !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+               tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+                                   kib_tx_t, tx_list);
+               list_del(&tx->tx_list);
+               list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+               conn->ibc_reserved_credits--;
+       }
+
+       if (kiblnd_need_noop(conn)) {
+               spin_unlock(&conn->ibc_lock);
+
+               tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+               if (tx != NULL)
+                       kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
+
+               spin_lock(&conn->ibc_lock);
+               if (tx != NULL)
+                       kiblnd_queue_tx_locked(tx, conn);
+       }
+
+       kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */
+
+       for (;;) {
+               int credit;
+
+               if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+                       credit = 0;
+                       tx = list_entry(conn->ibc_tx_queue_nocred.next,
+                                           kib_tx_t, tx_list);
+               } else if (!list_empty(&conn->ibc_tx_noops)) {
+                       LASSERT (!IBLND_OOB_CAPABLE(ver));
+                       credit = 1;
+                       tx = list_entry(conn->ibc_tx_noops.next,
+                                       kib_tx_t, tx_list);
+               } else if (!list_empty(&conn->ibc_tx_queue)) {
+                       credit = 1;
+                       tx = list_entry(conn->ibc_tx_queue.next,
+                                           kib_tx_t, tx_list);
+               } else
+                       break;
+
+               if (kiblnd_post_tx_locked(conn, tx, credit) != 0)
+                       break;
+       }
+
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_conn_decref(conn); /* ...until here */
+}
+
+void
+kiblnd_tx_complete (kib_tx_t *tx, int status)
+{
+       int        failed = (status != IB_WC_SUCCESS);
+       kib_conn_t   *conn = tx->tx_conn;
+       int        idle;
+
+       LASSERT (tx->tx_sending > 0);
+
+       if (failed) {
+               if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+                       CNETERR("Tx -> %s cookie "LPX64
+                               " sending %d waiting %d: failed %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+                               status);
+
+               kiblnd_close_conn(conn, -EIO);
+       } else {
+               kiblnd_peer_alive(conn->ibc_peer);
+       }
+
+       spin_lock(&conn->ibc_lock);
+
+       /* I could be racing with rdma completion.  Whoever makes 'tx' idle
+        * gets to free it, which also drops its ref on 'conn'. */
+
+       tx->tx_sending--;
+       conn->ibc_nsends_posted--;
+       if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
+               conn->ibc_noops_posted--;
+
+       if (failed) {
+               tx->tx_waiting = 0;          /* don't wait for peer */
+               tx->tx_status = -EIO;
+       }
+
+       idle = (tx->tx_sending == 0) &&  /* This is the final callback */
+              !tx->tx_waiting &&              /* Not waiting for peer */
+              !tx->tx_queued;            /* Not re-queued (PUT_DONE) */
+       if (idle)
+               list_del(&tx->tx_list);
+
+       kiblnd_conn_addref(conn);              /* 1 ref for me.... */
+
+       spin_unlock(&conn->ibc_lock);
+
+       if (idle)
+               kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
+
+       kiblnd_check_sends(conn);
+
+       kiblnd_conn_decref(conn);              /* ...until here */
+}
+
+void
+kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
+{
+       kib_hca_dev_t     *hdev = tx->tx_pool->tpo_hdev;
+       struct ib_sge     *sge = &tx->tx_sge[tx->tx_nwrq];
+       struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
+       int             nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+       struct ib_mr      *mr;
+
+       LASSERT (tx->tx_nwrq >= 0);
+       LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+       LASSERT (nob <= IBLND_MSG_SIZE);
+
+       kiblnd_init_msg(tx->tx_msg, type, body_nob);
+
+       mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob);
+       LASSERT (mr != NULL);
+
+       sge->lkey   = mr->lkey;
+       sge->addr   = tx->tx_msgaddr;
+       sge->length = nob;
+
+       memset(wrq, 0, sizeof(*wrq));
+
+       wrq->next       = NULL;
+       wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+       wrq->sg_list    = sge;
+       wrq->num_sge    = 1;
+       wrq->opcode     = IB_WR_SEND;
+       wrq->send_flags = IB_SEND_SIGNALED;
+
+       tx->tx_nwrq++;
+}
+
+int
+kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+                 int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
+{
+       kib_msg_t        *ibmsg = tx->tx_msg;
+       kib_rdma_desc_t   *srcrd = tx->tx_rd;
+       struct ib_sge     *sge = &tx->tx_sge[0];
+       struct ib_send_wr *wrq = &tx->tx_wrq[0];
+       int             rc  = resid;
+       int             srcidx;
+       int             dstidx;
+       int             wrknob;
+
+       LASSERT (!in_interrupt());
+       LASSERT (tx->tx_nwrq == 0);
+       LASSERT (type == IBLND_MSG_GET_DONE ||
+                type == IBLND_MSG_PUT_DONE);
+
+       srcidx = dstidx = 0;
+
+       while (resid > 0) {
+               if (srcidx >= srcrd->rd_nfrags) {
+                       CERROR("Src buffer exhausted: %d frags\n", srcidx);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (dstidx == dstrd->rd_nfrags) {
+                       CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
+                       CERROR("RDMA too fragmented for %s (%d): "
+                              "%d/%d src %d/%d dst frags\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                              IBLND_RDMA_FRAGS(conn->ibc_version),
+                              srcidx, srcrd->rd_nfrags,
+                              dstidx, dstrd->rd_nfrags);
+                       rc = -EMSGSIZE;
+                       break;
+               }
+
+               wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+                                kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+
+               sge = &tx->tx_sge[tx->tx_nwrq];
+               sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
+               sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
+               sge->length = wrknob;
+
+               wrq = &tx->tx_wrq[tx->tx_nwrq];
+
+               wrq->next       = wrq + 1;
+               wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+               wrq->sg_list    = sge;
+               wrq->num_sge    = 1;
+               wrq->opcode     = IB_WR_RDMA_WRITE;
+               wrq->send_flags = 0;
+
+               wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
+               wrq->wr.rdma.rkey       = kiblnd_rd_frag_key(dstrd, dstidx);
+
+               srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
+               dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
+
+               resid -= wrknob;
+
+               tx->tx_nwrq++;
+               wrq++;
+               sge++;
+       }
+
+       if (rc < 0)                          /* no RDMA if completing with failure */
+               tx->tx_nwrq = 0;
+
+       ibmsg->ibm_u.completion.ibcm_status = rc;
+       ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+       kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
+                          type, sizeof (kib_completion_msg_t));
+
+       return rc;
+}
+
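Purely as an illustration of kiblnd_init_rdma() above (all names here are invented): the work-request construction advances through the source and destination fragment lists in lockstep, each step transferring min(source bytes left, destination bytes left, residual), so the number of RDMA_WRITE work requests is bounded by the combined fragment count.

    #include <stdio.h>

    static int min3(int a, int b, int c)
    {
            int m = a < b ? a : b;
            return m < c ? m : c;
    }

    /* Walk two fragment lists the way the loop above builds work requests:
     * one request per step, sized to the smaller remainder. */
    static int count_wrqs(const int *src, int nsrc, const int *dst, int ndst, int resid)
    {
            int si = 0, di = 0, nwrq = 0;
            int sleft = src[0], dleft = dst[0];

            while (resid > 0 && si < nsrc && di < ndst) {
                    int nob = min3(sleft, dleft, resid);

                    nwrq++;
                    resid -= nob;
                    sleft -= nob;
                    dleft -= nob;
                    if (sleft == 0 && ++si < nsrc)
                            sleft = src[si];
                    if (dleft == 0 && ++di < ndst)
                            dleft = dst[di];
            }
            return nwrq;
    }

    int main(void)
    {
            int src[] = { 4096, 4096 };
            int dst[] = { 1024, 3072, 4096 };

            printf("work requests needed: %d\n", count_wrqs(src, 2, dst, 3, 8192));
            return 0;
    }

With two 4 KiB source fragments and a 1 KiB/3 KiB/4 KiB destination, an 8 KiB transfer needs three work requests, matching the per-step sizing above.
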
+void
+kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+       struct list_head   *q;
+
+       LASSERT (tx->tx_nwrq > 0);            /* work items set up */
+       LASSERT (!tx->tx_queued);              /* not queued for sending already */
+       LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       tx->tx_queued = 1;
+       tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);
+
+       if (tx->tx_conn == NULL) {
+               kiblnd_conn_addref(conn);
+               tx->tx_conn = conn;
+               LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
+       } else {
+               /* PUT_DONE first attached to conn as a PUT_REQ */
+               LASSERT (tx->tx_conn == conn);
+               LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+       }
+
+       switch (tx->tx_msg->ibm_type) {
+       default:
+               LBUG();
+
+       case IBLND_MSG_PUT_REQ:
+       case IBLND_MSG_GET_REQ:
+               q = &conn->ibc_tx_queue_rsrvd;
+               break;
+
+       case IBLND_MSG_PUT_NAK:
+       case IBLND_MSG_PUT_ACK:
+       case IBLND_MSG_PUT_DONE:
+       case IBLND_MSG_GET_DONE:
+               q = &conn->ibc_tx_queue_nocred;
+               break;
+
+       case IBLND_MSG_NOOP:
+               if (IBLND_OOB_CAPABLE(conn->ibc_version))
+                       q = &conn->ibc_tx_queue_nocred;
+               else
+                       q = &conn->ibc_tx_noops;
+               break;
+
+       case IBLND_MSG_IMMEDIATE:
+               q = &conn->ibc_tx_queue;
+               break;
+       }
+
+       list_add_tail(&tx->tx_list, q);
+}
+
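A small decision-table sketch of the queue selection in kiblnd_queue_tx_locked() above (enum names invented for illustration): requests that consume a reserved credit, completions that never consume one, NOOPs, and immediate payloads each land on their own list.

    #include <stdio.h>

    enum msg_type { MSG_PUT_REQ, MSG_GET_REQ, MSG_PUT_ACK, MSG_PUT_NAK,
                    MSG_PUT_DONE, MSG_GET_DONE, MSG_NOOP, MSG_IMMEDIATE };

    enum tx_queue { Q_RSRVD, Q_NOCRED, Q_NOOPS, Q_NORMAL };

    /* Same decision table as the switch above. */
    static enum tx_queue pick_queue(enum msg_type t, int oob_capable)
    {
            switch (t) {
            case MSG_PUT_REQ:
            case MSG_GET_REQ:
                    return Q_RSRVD;                 /* needs a reserved credit */
            case MSG_PUT_ACK:
            case MSG_PUT_NAK:
            case MSG_PUT_DONE:
            case MSG_GET_DONE:
                    return Q_NOCRED;                /* completions: no credit needed */
            case MSG_NOOP:
                    return oob_capable ? Q_NOCRED : Q_NOOPS;
            default:
                    return Q_NORMAL;                /* IMMEDIATE payloads */
            }
    }

    int main(void)
    {
            printf("NOOP, no OOB support -> queue %d\n", pick_queue(MSG_NOOP, 0));
            printf("PUT_REQ              -> queue %d\n", pick_queue(MSG_PUT_REQ, 1));
            return 0;
    }
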
+void
+kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+       spin_lock(&conn->ibc_lock);
+       kiblnd_queue_tx_locked(tx, conn);
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_check_sends(conn);
+}
+
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+                              struct sockaddr_in *srcaddr,
+                              struct sockaddr_in *dstaddr,
+                              int timeout_ms)
+{
+       unsigned short port;
+       int rc;
+
+       /* allow the port to be reused */
+       rc = rdma_set_reuseaddr(cmid, 1);
+       if (rc != 0) {
+               CERROR("Unable to set reuse on cmid: %d\n", rc);
+               return rc;
+       }
+
+       /* look for a free privileged port */
+       for (port = PROT_SOCK-1; port > 0; port--) {
+               srcaddr->sin_port = htons(port);
+               rc = rdma_resolve_addr(cmid,
+                                      (struct sockaddr *)srcaddr,
+                                      (struct sockaddr *)dstaddr,
+                                      timeout_ms);
+               if (rc == 0) {
+                       CDEBUG(D_NET, "bound to port %hu\n", port);
+                       return 0;
+               } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
+                       CDEBUG(D_NET, "bind to port %hu failed: %d\n",
+                              port, rc);
+               } else {
+                       return rc;
+               }
+       }
+
+       CERROR("Failed to bind to a free privileged port\n");
+       return rc;
+}
+
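A rough user-space analogue of the privileged-port probe in kiblnd_resolve_addr() above, using plain sockets rather than the RDMA CM (bind_priv_port is an assumed helper name): walk down from just below 1024 and keep trying until a bind succeeds or fails for a reason other than the port being taken.

    #include <arpa/inet.h>
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Try to bind 'fd' to a free privileged port (< 1024), highest first,
     * mirroring the search order above. Needs CAP_NET_BIND_SERVICE. */
    static int bind_priv_port(int fd)
    {
            struct sockaddr_in sa;
            unsigned short port;

            memset(&sa, 0, sizeof(sa));
            sa.sin_family = AF_INET;
            sa.sin_addr.s_addr = htonl(INADDR_ANY);

            for (port = 1023; port > 0; port--) {
                    sa.sin_port = htons(port);
                    if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
                            return port;            /* bound */
                    if (errno != EADDRINUSE && errno != EADDRNOTAVAIL)
                            return -errno;          /* hard failure */
            }
            return -EADDRNOTAVAIL;                  /* nothing free */
    }

    int main(void)
    {
            int fd = socket(AF_INET, SOCK_STREAM, 0);
            int rc = bind_priv_port(fd);

            printf(rc > 0 ? "bound to port %d\n" : "bind failed: %d\n", rc);
            close(fd);
            return 0;
    }
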
+void
+kiblnd_connect_peer (kib_peer_t *peer)
+{
+       struct rdma_cm_id *cmid;
+       kib_dev_t        *dev;
+       kib_net_t        *net = peer->ibp_ni->ni_data;
+       struct sockaddr_in srcaddr;
+       struct sockaddr_in dstaddr;
+       int             rc;
+
+       LASSERT (net != NULL);
+       LASSERT (peer->ibp_connecting > 0);
+
+       cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+                                    IB_QPT_RC);
+
+       if (IS_ERR(cmid)) {
+               CERROR("Can't create CMID for %s: %ld\n",
+                      libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+               rc = PTR_ERR(cmid);
+               goto failed;
+       }
+
+       dev = net->ibn_dev;
+       memset(&srcaddr, 0, sizeof(srcaddr));
+       srcaddr.sin_family = AF_INET;
+       srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
+
+       memset(&dstaddr, 0, sizeof(dstaddr));
+       dstaddr.sin_family = AF_INET;
+       dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+       dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+
+       kiblnd_peer_addref(peer);              /* cmid's ref */
+
+       if (*kiblnd_tunables.kib_use_priv_port) {
+               rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+                                        *kiblnd_tunables.kib_timeout * 1000);
+       } else {
+               rc = rdma_resolve_addr(cmid,
+                                      (struct sockaddr *)&srcaddr,
+                                      (struct sockaddr *)&dstaddr,
+                                      *kiblnd_tunables.kib_timeout * 1000);
+       }
+       if (rc != 0) {
+               /* Can't initiate address resolution */
+               CERROR("Can't resolve addr for %s: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), rc);
+               goto failed2;
+       }
+
+       LASSERT (cmid->device != NULL);
+       CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
+              libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+              HIPQUAD(dev->ibd_ifip), cmid->device->name);
+
+       return;
+
+ failed2:
+       kiblnd_peer_decref(peer);              /* cmid's ref */
+       rdma_destroy_id(cmid);
+ failed:
+       kiblnd_peer_connect_failed(peer, 1, rc);
+}
+
+void
+kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
+{
+       kib_peer_t      *peer;
+       kib_peer_t      *peer2;
+       kib_conn_t      *conn;
+       rwlock_t        *g_lock = &kiblnd_data.kib_global_lock;
+       unsigned long      flags;
+       int             rc;
+
+       /* If I get here, I've committed to send, so I complete the tx with
+        * failure on any problems */
+
+       LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */
+       LASSERT (tx == NULL || tx->tx_nwrq > 0);     /* work items have been set up */
+
+       /* First time, just use a read lock since I expect to find my peer
+        * connected */
+       read_lock_irqsave(g_lock, flags);
+
+       peer = kiblnd_find_peer_locked(nid);
+       if (peer != NULL && !list_empty(&peer->ibp_conns)) {
+               /* Found a peer with an established connection */
+               conn = kiblnd_get_conn_locked(peer);
+               kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+               read_unlock_irqrestore(g_lock, flags);
+
+               if (tx != NULL)
+                       kiblnd_queue_tx(tx, conn);
+               kiblnd_conn_decref(conn); /* ...to here */
+               return;
+       }
+
+       read_unlock(g_lock);
+       /* Re-try with a write lock */
+       write_lock(g_lock);
+
+       peer = kiblnd_find_peer_locked(nid);
+       if (peer != NULL) {
+               if (list_empty(&peer->ibp_conns)) {
+                       /* found a peer, but it's still connecting... */
+                       LASSERT (peer->ibp_connecting != 0 ||
+                                peer->ibp_accepting != 0);
+                       if (tx != NULL)
+                               list_add_tail(&tx->tx_list,
+                                                 &peer->ibp_tx_queue);
+                       write_unlock_irqrestore(g_lock, flags);
+               } else {
+                       conn = kiblnd_get_conn_locked(peer);
+                       kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+                       write_unlock_irqrestore(g_lock, flags);
+
+                       if (tx != NULL)
+                               kiblnd_queue_tx(tx, conn);
+                       kiblnd_conn_decref(conn); /* ...to here */
+               }
+               return;
+       }
+
+       write_unlock_irqrestore(g_lock, flags);
+
+       /* Allocate a peer ready to add to the peer table and retry */
+       rc = kiblnd_create_peer(ni, &peer, nid);
+       if (rc != 0) {
+               CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+               if (tx != NULL) {
+                       tx->tx_status = -EHOSTUNREACH;
+                       tx->tx_waiting = 0;
+                       kiblnd_tx_done(ni, tx);
+               }
+               return;
+       }
+
+       write_lock_irqsave(g_lock, flags);
+
+       peer2 = kiblnd_find_peer_locked(nid);
+       if (peer2 != NULL) {
+               if (list_empty(&peer2->ibp_conns)) {
+                       /* found a peer, but it's still connecting... */
+                       LASSERT (peer2->ibp_connecting != 0 ||
+                                peer2->ibp_accepting != 0);
+                       if (tx != NULL)
+                               list_add_tail(&tx->tx_list,
+                                                 &peer2->ibp_tx_queue);
+                       write_unlock_irqrestore(g_lock, flags);
+               } else {
+                       conn = kiblnd_get_conn_locked(peer2);
+                       kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+                       write_unlock_irqrestore(g_lock, flags);
+
+                       if (tx != NULL)
+                               kiblnd_queue_tx(tx, conn);
+                       kiblnd_conn_decref(conn); /* ...to here */
+               }
+
+               kiblnd_peer_decref(peer);
+               return;
+       }
+
+       /* Brand new peer */
+       LASSERT (peer->ibp_connecting == 0);
+       peer->ibp_connecting = 1;
+
+       /* always called with a ref on ni, which prevents ni being shutdown */
+       LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
+       if (tx != NULL)
+               list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+
+       kiblnd_peer_addref(peer);
+       list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+       write_unlock_irqrestore(g_lock, flags);
+
+       kiblnd_connect_peer(peer);
+       kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+       lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+       int            type = lntmsg->msg_type;
+       lnet_process_id_t target = lntmsg->msg_target;
+       int            target_is_router = lntmsg->msg_target_is_router;
+       int            routing = lntmsg->msg_routing;
+       unsigned int      payload_niov = lntmsg->msg_niov;
+       struct iovec     *payload_iov = lntmsg->msg_iov;
+       lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+       unsigned int      payload_offset = lntmsg->msg_offset;
+       unsigned int      payload_nob = lntmsg->msg_len;
+       kib_msg_t       *ibmsg;
+       kib_tx_t         *tx;
+       int            nob;
+       int            rc;
+
+       /* NB 'private' is different depending on what we're sending.... */
+
+       CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+              payload_nob, payload_niov, libcfs_id2str(target));
+
+       LASSERT (payload_nob == 0 || payload_niov > 0);
+       LASSERT (payload_niov <= LNET_MAX_IOV);
+
+       /* Thread context */
+       LASSERT (!in_interrupt());
+       /* payload is either all vaddrs or all pages */
+       LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+       switch (type) {
+       default:
+               LBUG();
+               return (-EIO);
+
+       case LNET_MSG_ACK:
+               LASSERT (payload_nob == 0);
+               break;
+
+       case LNET_MSG_GET:
+               if (routing || target_is_router)
+                       break;            /* send IMMEDIATE */
+
+               /* is the REPLY message too small for RDMA? */
+               nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+               if (nob <= IBLND_MSG_SIZE)
+                       break;            /* send IMMEDIATE */
+
+               tx = kiblnd_get_idle_tx(ni, target.nid);
+               if (tx == NULL) {
+                       CERROR("Can't allocate txd for GET to %s\n",
+                              libcfs_nid2str(target.nid));
+                       return -ENOMEM;
+               }
+
+               ibmsg = tx->tx_msg;
+
+               if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                       rc = kiblnd_setup_rd_iov(ni, tx,
+                                                &ibmsg->ibm_u.get.ibgm_rd,
+                                                lntmsg->msg_md->md_niov,
+                                                lntmsg->msg_md->md_iov.iov,
+                                                0, lntmsg->msg_md->md_length);
+               else
+                       rc = kiblnd_setup_rd_kiov(ni, tx,
+                                                 &ibmsg->ibm_u.get.ibgm_rd,
+                                                 lntmsg->msg_md->md_niov,
+                                                 lntmsg->msg_md->md_iov.kiov,
+                                                 0, lntmsg->msg_md->md_length);
+               if (rc != 0) {
+                       CERROR("Can't setup GET sink for %s: %d\n",
+                              libcfs_nid2str(target.nid), rc);
+                       kiblnd_tx_done(ni, tx);
+                       return -EIO;
+               }
+
+               nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+               ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+               ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+
+               kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+
+               tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+               if (tx->tx_lntmsg[1] == NULL) {
+                       CERROR("Can't create reply for GET -> %s\n",
+                              libcfs_nid2str(target.nid));
+                       kiblnd_tx_done(ni, tx);
+                       return -EIO;
+               }
+
+               tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+               tx->tx_waiting = 1;          /* waiting for GET_DONE */
+               kiblnd_launch_tx(ni, tx, target.nid);
+               return 0;
+
+       case LNET_MSG_REPLY:
+       case LNET_MSG_PUT:
+               /* Is the payload small enough not to need RDMA? */
+               nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+               if (nob <= IBLND_MSG_SIZE)
+                       break;            /* send IMMEDIATE */
+
+               tx = kiblnd_get_idle_tx(ni, target.nid);
+               if (tx == NULL) {
+                       CERROR("Can't allocate %s txd for %s\n",
+                              type == LNET_MSG_PUT ? "PUT" : "REPLY",
+                              libcfs_nid2str(target.nid));
+                       return -ENOMEM;
+               }
+
+               if (payload_kiov == NULL)
+                       rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+                                                payload_niov, payload_iov,
+                                                payload_offset, payload_nob);
+               else
+                       rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+                                                 payload_niov, payload_kiov,
+                                                 payload_offset, payload_nob);
+               if (rc != 0) {
+                       CERROR("Can't setup PUT src for %s: %d\n",
+                              libcfs_nid2str(target.nid), rc);
+                       kiblnd_tx_done(ni, tx);
+                       return -EIO;
+               }
+
+               ibmsg = tx->tx_msg;
+               ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+               ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+               kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+               tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+               tx->tx_waiting = 1;          /* waiting for PUT_{ACK,NAK} */
+               kiblnd_launch_tx(ni, tx, target.nid);
+               return 0;
+       }
+
+       /* send IMMEDIATE */
+
+       LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+                <= IBLND_MSG_SIZE);
+
+       tx = kiblnd_get_idle_tx(ni, target.nid);
+       if (tx == NULL) {
+               CERROR ("Can't send %d to %s: tx descs exhausted\n",
+                       type, libcfs_nid2str(target.nid));
+               return -ENOMEM;
+       }
+
+       ibmsg = tx->tx_msg;
+       ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+       if (payload_kiov != NULL)
+               lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+                                   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                   payload_niov, payload_kiov,
+                                   payload_offset, payload_nob);
+       else
+               lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
+                                  offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                  payload_niov, payload_iov,
+                                  payload_offset, payload_nob);
+
+       nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+       kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+
+       tx->tx_lntmsg[0] = lntmsg;            /* finalise lntmsg on completion */
+       kiblnd_launch_tx(ni, tx, target.nid);
+       return 0;
+}
+
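A small sketch of the send-path decision in kiblnd_send() above (MSG_SIZE and MSG_HDR values are assumptions for illustration, not the driver's constants): a message goes out as IMMEDIATE when header plus payload fit in one pre-posted message buffer, otherwise a PUT/GET rendezvous with RDMA is used.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define MSG_SIZE 4096           /* stand-in for IBLND_MSG_SIZE */
    #define MSG_HDR  96             /* assumed fixed header overhead */

    /* True when the payload can be copied inline into a single IMMEDIATE
     * message, mirroring the "nob <= IBLND_MSG_SIZE" tests above. */
    static bool send_immediate(size_t payload_nob)
    {
            return MSG_HDR + payload_nob <= MSG_SIZE;
    }

    int main(void)
    {
            printf("512 B payload:  %s\n", send_immediate(512) ? "IMMEDIATE" : "RDMA");
            printf("64 KiB payload: %s\n", send_immediate(64 * 1024) ? "IMMEDIATE" : "RDMA");
            return 0;
    }
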
+void
+kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+       lnet_process_id_t target = lntmsg->msg_target;
+       unsigned int      niov = lntmsg->msg_niov;
+       struct iovec     *iov = lntmsg->msg_iov;
+       lnet_kiov_t      *kiov = lntmsg->msg_kiov;
+       unsigned int      offset = lntmsg->msg_offset;
+       unsigned int      nob = lntmsg->msg_len;
+       kib_tx_t         *tx;
+       int            rc;
+
+       tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
+       if (tx == NULL) {
+               CERROR("Can't get tx for REPLY to %s\n",
+                      libcfs_nid2str(target.nid));
+               goto failed_0;
+       }
+
+       if (nob == 0)
+               rc = 0;
+       else if (kiov == NULL)
+               rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+                                        niov, iov, offset, nob);
+       else
+               rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+                                         niov, kiov, offset, nob);
+
+       if (rc != 0) {
+               CERROR("Can't setup GET src for %s: %d\n",
+                      libcfs_nid2str(target.nid), rc);
+               goto failed_1;
+       }
+
+       rc = kiblnd_init_rdma(rx->rx_conn, tx,
+                             IBLND_MSG_GET_DONE, nob,
+                             &rx->rx_msg->ibm_u.get.ibgm_rd,
+                             rx->rx_msg->ibm_u.get.ibgm_cookie);
+       if (rc < 0) {
+               CERROR("Can't setup rdma for GET from %s: %d\n",
+                      libcfs_nid2str(target.nid), rc);
+               goto failed_1;
+       }
+
+       if (nob == 0) {
+               /* No RDMA: local completion may happen now! */
+               lnet_finalize(ni, lntmsg, 0);
+       } else {
+               /* RDMA: lnet_finalize(lntmsg) when it
+                * completes */
+               tx->tx_lntmsg[0] = lntmsg;
+       }
+
+       kiblnd_queue_tx(tx, rx->rx_conn);
+       return;
+
+ failed_1:
+       kiblnd_tx_done(ni, tx);
+ failed_0:
+       lnet_finalize(ni, lntmsg, -EIO);
+}
+
+int
+kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+            unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+            unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       kib_rx_t    *rx = private;
+       kib_msg_t   *rxmsg = rx->rx_msg;
+       kib_conn_t  *conn = rx->rx_conn;
+       kib_tx_t    *tx;
+       kib_msg_t   *txmsg;
+       int       nob;
+       int       post_credit = IBLND_POSTRX_PEER_CREDIT;
+       int       rc = 0;
+
+       LASSERT (mlen <= rlen);
+       LASSERT (!in_interrupt());
+       /* Either all pages or all vaddrs */
+       LASSERT (!(kiov != NULL && iov != NULL));
+
+       switch (rxmsg->ibm_type) {
+       default:
+               LBUG();
+
+       case IBLND_MSG_IMMEDIATE:
+               nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+               if (nob > rx->rx_nob) {
+                       CERROR ("Immediate message from %s too big: %d(%d)\n",
+                               libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+                               nob, rx->rx_nob);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (kiov != NULL)
+                       lnet_copy_flat2kiov(niov, kiov, offset,
+                                           IBLND_MSG_SIZE, rxmsg,
+                                           offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                           mlen);
+               else
+                       lnet_copy_flat2iov(niov, iov, offset,
+                                          IBLND_MSG_SIZE, rxmsg,
+                                          offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                          mlen);
+               lnet_finalize (ni, lntmsg, 0);
+               break;
+
+       case IBLND_MSG_PUT_REQ:
+               if (mlen == 0) {
+                       lnet_finalize(ni, lntmsg, 0);
+                       kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
+                                              rxmsg->ibm_u.putreq.ibprm_cookie);
+                       break;
+               }
+
+               tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+               if (tx == NULL) {
+                       CERROR("Can't allocate tx for %s\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       /* Not replying will break the connection */
+                       rc = -ENOMEM;
+                       break;
+               }
+
+               txmsg = tx->tx_msg;
+               if (kiov == NULL)
+                       rc = kiblnd_setup_rd_iov(ni, tx,
+                                                &txmsg->ibm_u.putack.ibpam_rd,
+                                                niov, iov, offset, mlen);
+               else
+                       rc = kiblnd_setup_rd_kiov(ni, tx,
+                                                 &txmsg->ibm_u.putack.ibpam_rd,
+                                                 niov, kiov, offset, mlen);
+               if (rc != 0) {
+                       CERROR("Can't setup PUT sink for %s: %d\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                       kiblnd_tx_done(ni, tx);
+                       /* tell peer it's over */
+                       kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
+                                              rxmsg->ibm_u.putreq.ibprm_cookie);
+                       break;
+               }
+
+               nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]);
+               txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+               txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+
+               kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
+
+               tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+               tx->tx_waiting = 1;          /* waiting for PUT_DONE */
+               kiblnd_queue_tx(tx, conn);
+
+               /* reposted buffer reserved for PUT_DONE */
+               post_credit = IBLND_POSTRX_NO_CREDIT;
+               break;
+
+       case IBLND_MSG_GET_REQ:
+               if (lntmsg != NULL) {
+                       /* Optimized GET; RDMA lntmsg's payload */
+                       kiblnd_reply(ni, rx, lntmsg);
+               } else {
+                       /* GET didn't match anything */
+                       kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
+                                              -ENODATA,
+                                              rxmsg->ibm_u.get.ibgm_cookie);
+               }
+               break;
+       }
+
+       kiblnd_post_rx(rx, post_credit);
+       return rc;
+}
+
+int
+kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+       task_t *task = kthread_run(fn, arg, name);
+
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       atomic_inc(&kiblnd_data.kib_nthreads);
+       return 0;
+}
+
+void
+kiblnd_thread_fini (void)
+{
+       atomic_dec (&kiblnd_data.kib_nthreads);
+}
+
+void
+kiblnd_peer_alive (kib_peer_t *peer)
+{
+       /* This is racy, but everyone's only writing cfs_time_current() */
+       peer->ibp_last_alive = cfs_time_current();
+       mb();
+}
+
+void
+kiblnd_peer_notify (kib_peer_t *peer)
+{
+       int        error = 0;
+       cfs_time_t    last_alive = 0;
+       unsigned long flags;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (list_empty(&peer->ibp_conns) &&
+           peer->ibp_accepting == 0 &&
+           peer->ibp_connecting == 0 &&
+           peer->ibp_error != 0) {
+               error = peer->ibp_error;
+               peer->ibp_error = 0;
+
+               last_alive = peer->ibp_last_alive;
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (error != 0)
+               lnet_notify(peer->ibp_ni,
+                           peer->ibp_nid, 0, last_alive);
+}
+
+void
+kiblnd_close_conn_locked (kib_conn_t *conn, int error)
+{
+       /* This just does the immediate housekeeping.  'error' is zero for a
+        * normal shutdown which can happen only after the connection has been
+        * established.  If the connection is established, schedule the
+        * connection to be finished off by the connd.  Otherwise the connd is
+        * already dealing with it (either to set it up or tear it down).
+        * Caller holds kib_global_lock exclusively in irq context */
+       kib_peer_t       *peer = conn->ibc_peer;
+       kib_dev_t       *dev;
+       unsigned long     flags;
+
+       LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       if (error != 0 && conn->ibc_comms_error == 0)
+               conn->ibc_comms_error = error;
+
+       if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+               return; /* already being handled  */
+
+       if (error == 0 &&
+           list_empty(&conn->ibc_tx_noops) &&
+           list_empty(&conn->ibc_tx_queue) &&
+           list_empty(&conn->ibc_tx_queue_rsrvd) &&
+           list_empty(&conn->ibc_tx_queue_nocred) &&
+           list_empty(&conn->ibc_active_txs)) {
+               CDEBUG(D_NET, "closing conn to %s\n",
+                      libcfs_nid2str(peer->ibp_nid));
+       } else {
+               CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
+                      libcfs_nid2str(peer->ibp_nid), error,
+                      list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+                      list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
+                      list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+                      list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+                      list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
+       }
+
+       dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
+       list_del(&conn->ibc_list);
+       /* connd (see below) takes over ibc_list's ref */
+
+       if (list_empty (&peer->ibp_conns) &&    /* no more conns */
+           kiblnd_peer_active(peer)) {  /* still in peer table */
+               kiblnd_unlink_peer_locked(peer);
+
+               /* set/clear error on last conn */
+               peer->ibp_error = conn->ibc_comms_error;
+       }
+
+       kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
+
+       if (error != 0 &&
+           kiblnd_dev_can_failover(dev)) {
+               list_add_tail(&dev->ibd_fail_list,
+                             &kiblnd_data.kib_failed_devs);
+               wake_up(&kiblnd_data.kib_failover_waitq);
+       }
+
+       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+       list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
+       wake_up(&kiblnd_data.kib_connd_waitq);
+
+       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+}
+
+void
+kiblnd_close_conn(kib_conn_t *conn, int error)
+{
+       unsigned long flags;
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       kiblnd_close_conn_locked(conn, error);
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+void
+kiblnd_handle_early_rxs(kib_conn_t *conn)
+{
+       unsigned long    flags;
+       kib_rx_t        *rx;
+
+       LASSERT(!in_interrupt());
+       LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       while (!list_empty(&conn->ibc_early_rxs)) {
+               rx = list_entry(conn->ibc_early_rxs.next,
+                                   kib_rx_t, rx_list);
+               list_del(&rx->rx_list);
+               write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+               kiblnd_handle_rx(rx);
+
+               write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       }
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+void
+kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+       LIST_HEAD       (zombies);
+       struct list_head          *tmp;
+       struct list_head          *nxt;
+       kib_tx_t            *tx;
+
+       spin_lock(&conn->ibc_lock);
+
+       list_for_each_safe (tmp, nxt, txs) {
+               tx = list_entry (tmp, kib_tx_t, tx_list);
+
+               if (txs == &conn->ibc_active_txs) {
+                       LASSERT (!tx->tx_queued);
+                       LASSERT (tx->tx_waiting ||
+                                tx->tx_sending != 0);
+               } else {
+                       LASSERT (tx->tx_queued);
+               }
+
+               tx->tx_status = -ECONNABORTED;
+               tx->tx_waiting = 0;
+
+               if (tx->tx_sending == 0) {
+                       tx->tx_queued = 0;
+                       list_del (&tx->tx_list);
+                       list_add (&tx->tx_list, &zombies);
+               }
+       }
+
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
+}
+
+void
+kiblnd_finalise_conn (kib_conn_t *conn)
+{
+       LASSERT (!in_interrupt());
+       LASSERT (conn->ibc_state > IBLND_CONN_INIT);
+
+       kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
+       /* abort_receives moves QP state to IB_QPS_ERR.  This is only required
+        * for connections that didn't get as far as being connected, because
+        * rdma_disconnect() does this for free. */
+       kiblnd_abort_receives(conn);
+
+       /* Complete all tx descs not waiting for sends to complete.
+        * NB we should be safe from RDMA now that the QP has changed state */
+
+       kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
+       kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
+       kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+       kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+       kiblnd_abort_txs(conn, &conn->ibc_active_txs);
+
+       kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error)
+{
+       LIST_HEAD    (zombies);
+       unsigned long     flags;
+
+       LASSERT (error != 0);
+       LASSERT (!in_interrupt());
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (active) {
+               LASSERT (peer->ibp_connecting > 0);
+               peer->ibp_connecting--;
+       } else {
+               LASSERT (peer->ibp_accepting > 0);
+               peer->ibp_accepting--;
+       }
+
+       if (peer->ibp_connecting != 0 ||
+           peer->ibp_accepting != 0) {
+               /* another connection attempt under way... */
+               write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                           flags);
+               return;
+       }
+
+       if (list_empty(&peer->ibp_conns)) {
+               /* Take peer's blocked transmits to complete with error */
+               list_add(&zombies, &peer->ibp_tx_queue);
+               list_del_init(&peer->ibp_tx_queue);
+
+               if (kiblnd_peer_active(peer))
+                       kiblnd_unlink_peer_locked(peer);
+
+               peer->ibp_error = error;
+       } else {
+               /* Can't have blocked transmits if there are connections */
+               LASSERT (list_empty(&peer->ibp_tx_queue));
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       kiblnd_peer_notify(peer);
+
+       if (list_empty (&zombies))
+               return;
+
+       CNETERR("Deleting messages for %s: connection failed\n",
+               libcfs_nid2str(peer->ibp_nid));
+
+       kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
+}
+
+void
+kiblnd_connreq_done(kib_conn_t *conn, int status)
+{
+       kib_peer_t      *peer = conn->ibc_peer;
+       kib_tx_t          *tx;
+       struct list_head         txs;
+       unsigned long      flags;
+       int             active;
+
+       active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+       CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n",
+              libcfs_nid2str(peer->ibp_nid), active,
+              conn->ibc_version, status);
+
+       LASSERT (!in_interrupt());
+       LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+                 peer->ibp_connecting > 0) ||
+                (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
+                 peer->ibp_accepting > 0));
+
+       LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+       conn->ibc_connvars = NULL;
+
+       if (status != 0) {
+               /* failed to establish connection */
+               kiblnd_peer_connect_failed(peer, active, status);
+               kiblnd_finalise_conn(conn);
+               return;
+       }
+
+       /* connection established */
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       conn->ibc_last_send = jiffies;
+       kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+       kiblnd_peer_alive(peer);
+
+       /* Add conn to peer's list and nuke any dangling conns from a different
+        * peer instance... */
+       kiblnd_conn_addref(conn);              /* +1 ref for ibc_list */
+       list_add(&conn->ibc_list, &peer->ibp_conns);
+       if (active)
+               peer->ibp_connecting--;
+       else
+               peer->ibp_accepting--;
+
+       if (peer->ibp_version == 0) {
+               peer->ibp_version     = conn->ibc_version;
+               peer->ibp_incarnation = conn->ibc_incarnation;
+       }
+
+       if (peer->ibp_version     != conn->ibc_version ||
+           peer->ibp_incarnation != conn->ibc_incarnation) {
+               kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
+                                               conn->ibc_incarnation);
+               peer->ibp_version     = conn->ibc_version;
+               peer->ibp_incarnation = conn->ibc_incarnation;
+       }
+
+       /* grab pending txs while I have the lock */
+       list_add(&txs, &peer->ibp_tx_queue);
+       list_del_init(&peer->ibp_tx_queue);
+
+       if (!kiblnd_peer_active(peer) ||        /* peer has been deleted */
+           conn->ibc_comms_error != 0) {       /* error has happened already */
+               lnet_ni_t *ni = peer->ibp_ni;
+
+               /* start to shut down connection */
+               kiblnd_close_conn_locked(conn, -ECONNABORTED);
+               write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+               kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
+
+               return;
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       /* Schedule blocked txs */
+       spin_lock(&conn->ibc_lock);
+       while (!list_empty(&txs)) {
+               tx = list_entry(txs.next, kib_tx_t, tx_list);
+               list_del(&tx->tx_list);
+
+               kiblnd_queue_tx_locked(tx, conn);
+       }
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_check_sends(conn);
+
+       /* schedule blocked rxs */
+       kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej)
+{
+       int       rc;
+
+       rc = rdma_reject(cmid, rej, sizeof(*rej));
+
+       if (rc != 0)
+               CWARN("Error %d sending reject\n", rc);
+}
+
+int
+kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
+{
+       rwlock_t                *g_lock = &kiblnd_data.kib_global_lock;
+       kib_msg_t            *reqmsg = priv;
+       kib_msg_t            *ackmsg;
+       kib_dev_t            *ibdev;
+       kib_peer_t          *peer;
+       kib_peer_t          *peer2;
+       kib_conn_t          *conn;
+       lnet_ni_t            *ni  = NULL;
+       kib_net_t            *net = NULL;
+       lnet_nid_t           nid;
+       struct rdma_conn_param cp;
+       kib_rej_t             rej;
+       int                 version = IBLND_MSG_VERSION;
+       unsigned long     flags;
+       int                 rc;
+       struct sockaddr_in    *peer_addr;
+       LASSERT (!in_interrupt());
+
+       /* cmid inherits 'context' from the corresponding listener id */
+       ibdev = (kib_dev_t *)cmid->context;
+       LASSERT (ibdev != NULL);
+
+       memset(&rej, 0, sizeof(rej));
+       rej.ibr_magic           = IBLND_MSG_MAGIC;
+       rej.ibr_why               = IBLND_REJECT_FATAL;
+       rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+       peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+       if (*kiblnd_tunables.kib_require_priv_port &&
+           ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+               __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+               CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n",
+                      HIPQUAD(ip), ntohs(peer_addr->sin_port));
+               goto failed;
+       }
+
+       if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+               CERROR("Short connection request\n");
+               goto failed;
+       }
+
+       /* Future protocol version compatibility support!  If the
+        * o2iblnd-specific protocol changes, or when LNET unifies
+        * protocols over all LNDs, the initial connection will
+        * negotiate a protocol version.  I trap this here to avoid
+        * console errors; the reject tells the peer which protocol I
+        * speak. */
+       if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+           reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+               goto failed;
+       if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+           reqmsg->ibm_version != IBLND_MSG_VERSION &&
+           reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+               goto failed;
+       if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+           reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+           reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+               goto failed;
+
+       rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+       if (rc != 0) {
+               CERROR("Can't parse connection request: %d\n", rc);
+               goto failed;
+       }
+
+       nid = reqmsg->ibm_srcnid;
+       ni  = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+
+       if (ni != NULL) {
+               net = (kib_net_t *)ni->ni_data;
+               rej.ibr_incarnation = net->ibn_incarnation;
+       }
+
+       if (ni == NULL ||                        /* no matching net */
+           ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
+           net->ibn_dev != ibdev) {          /* wrong device */
+               CERROR("Can't accept %s on %s (%s:%d:%u.%u.%u.%u): "
+                      "bad dst nid %s\n", libcfs_nid2str(nid),
+                      ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+                      ibdev->ibd_ifname, ibdev->ibd_nnets,
+                      HIPQUAD(ibdev->ibd_ifip),
+                      libcfs_nid2str(reqmsg->ibm_dstnid));
+
+               goto failed;
+       }
+
+       /* check time stamp as soon as possible */
+       if (reqmsg->ibm_dststamp != 0 &&
+           reqmsg->ibm_dststamp != net->ibn_incarnation) {
+               CWARN("Stale connection request\n");
+               rej.ibr_why = IBLND_REJECT_CONN_STALE;
+               goto failed;
+       }
+
+       /* I can accept peer's version */
+       version = reqmsg->ibm_version;
+
+       if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+               CERROR("Unexpected connreq msg type: %x from %s\n",
+                      reqmsg->ibm_type, libcfs_nid2str(nid));
+               goto failed;
+       }
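+       /* Connection parameters are fixed per protocol version: queue
+        * depth, RDMA fragment count and maximum message size must all
+        * match what this version expects.  Mismatches are rejected, where
+        * possible with a specific reason the peer can act on. */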
+
+       if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
+           IBLND_MSG_QUEUE_SIZE(version)) {
+               CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+                      libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
+                      IBLND_MSG_QUEUE_SIZE(version));
+
+               if (version == IBLND_MSG_VERSION)
+                       rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
+
+               goto failed;
+       }
+
+       if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
+           IBLND_RDMA_FRAGS(version)) {
+               CERROR("Can't accept %s(version %x): "
+                      "incompatible max_frags %d (%d wanted)\n",
+                      libcfs_nid2str(nid), version,
+                      reqmsg->ibm_u.connparams.ibcp_max_frags,
+                      IBLND_RDMA_FRAGS(version));
+
+               if (version == IBLND_MSG_VERSION)
+                       rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+               goto failed;
+
+       }
+
+       if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+               CERROR("Can't accept %s: message size %d too big (%d max)\n",
+                      libcfs_nid2str(nid),
+                      reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+                      IBLND_MSG_SIZE);
+               goto failed;
+       }
+
+       /* assume 'nid' is a new peer; create */
+       rc = kiblnd_create_peer(ni, &peer, nid);
+       if (rc != 0) {
+               CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+               rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+               goto failed;
+       }
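+       /* A concurrent connection attempt (active or passive) may already
+        * have created a peer entry for this NID; under the global lock,
+        * either adopt that existing peer or install the new one. */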
+
+       write_lock_irqsave(g_lock, flags);
+
+       peer2 = kiblnd_find_peer_locked(nid);
+       if (peer2 != NULL) {
+               if (peer2->ibp_version == 0) {
+                       peer2->ibp_version     = version;
+                       peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
+               }
+
+               /* not the guy I've talked with */
+               if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
+                   peer2->ibp_version     != version) {
+                       kiblnd_close_peer_conns_locked(peer2, -ESTALE);
+                       write_unlock_irqrestore(g_lock, flags);
+
+                       CWARN("Conn stale %s [old ver: %x, new ver: %x]\n",
+                             libcfs_nid2str(nid), peer2->ibp_version, version);
+
+                       kiblnd_peer_decref(peer);
+                       rej.ibr_why = IBLND_REJECT_CONN_STALE;
+                       goto failed;
+               }
+
+               /* tie-break connection race in favour of the higher NID */
+               if (peer2->ibp_connecting != 0 &&
+                   nid < ni->ni_nid) {
+                       write_unlock_irqrestore(g_lock, flags);
+
+                       CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
+
+                       kiblnd_peer_decref(peer);
+                       rej.ibr_why = IBLND_REJECT_CONN_RACE;
+                       goto failed;
+               }
+
+               peer2->ibp_accepting++;
+               kiblnd_peer_addref(peer2);
+
+               write_unlock_irqrestore(g_lock, flags);
+               kiblnd_peer_decref(peer);
+               peer = peer2;
+       } else {
+               /* Brand new peer */
+               LASSERT (peer->ibp_accepting == 0);
+               LASSERT (peer->ibp_version == 0 &&
+                        peer->ibp_incarnation == 0);
+
+               peer->ibp_accepting   = 1;
+               peer->ibp_version     = version;
+               peer->ibp_incarnation = reqmsg->ibm_srcstamp;
+
+               /* I have a ref on ni that prevents it being shutdown */
+               LASSERT (net->ibn_shutdown == 0);
+
+               kiblnd_peer_addref(peer);
+               list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+               write_unlock_irqrestore(g_lock, flags);
+       }
+
+       conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+       if (conn == NULL) {
+               kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+               kiblnd_peer_decref(peer);
+               rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+               goto failed;
+       }
+
+       /* conn now "owns" cmid, so I return success from here on to ensure the
+        * CM callback doesn't destroy cmid. */
+
+       conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
+       conn->ibc_credits         = IBLND_MSG_QUEUE_SIZE(version);
+       conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
+       LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
+                <= IBLND_RX_MSGS(version));
+
+       ackmsg = &conn->ibc_connvars->cv_msg;
+       memset(ackmsg, 0, sizeof(*ackmsg));
+
+       kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
+                       sizeof(ackmsg->ibm_u.connparams));
+       ackmsg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+       ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+       ackmsg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+
+       kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
+
+       memset(&cp, 0, sizeof(cp));
+       cp.private_data = ackmsg;
+       cp.private_data_len    = ackmsg->ibm_nob;
+       cp.responder_resources = 0;          /* No atomic ops or RDMA reads */
+       cp.initiator_depth     = 0;
+       cp.flow_control = 1;
+       cp.retry_count   = *kiblnd_tunables.kib_retry_count;
+       cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+       CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+
+       rc = rdma_accept(cmid, &cp);
+       if (rc != 0) {
+               CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+               rej.ibr_version = version;
+               rej.ibr_why     = IBLND_REJECT_FATAL;
+
+               kiblnd_reject(cmid, &rej);
+               kiblnd_connreq_done(conn, rc);
+               kiblnd_conn_decref(conn);
+       }
+
+       lnet_ni_decref(ni);
+       return 0;
+
+ failed:
+       if (ni != NULL)
+               lnet_ni_decref(ni);
+
+       rej.ibr_version = version;
+       rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+       rej.ibr_cp.ibcp_max_frags   = IBLND_RDMA_FRAGS(version);
+       kiblnd_reject(cmid, &rej);
+
+       return -ECONNREFUSED;
+}
+
+void
+kiblnd_reconnect (kib_conn_t *conn, int version,
+                 __u64 incarnation, int why, kib_connparams_t *cp)
+{
+       kib_peer_t    *peer = conn->ibc_peer;
+       char      *reason;
+       int         retry = 0;
+       unsigned long  flags;
+
+       LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+       LASSERT (peer->ibp_connecting > 0);     /* 'conn' at least */
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       /* retry connection if it's still needed and no other connection
+        * attempts (active or passive) are in progress
+        * NB: reconnect is still needed even when ibp_tx_queue is
+        * empty if ibp_version != version because reconnect may be
+        * initiated by kiblnd_query() */
+       if ((!list_empty(&peer->ibp_tx_queue) ||
+            peer->ibp_version != version) &&
+           peer->ibp_connecting == 1 &&
+           peer->ibp_accepting == 0) {
+               retry = 1;
+               peer->ibp_connecting++;
+
+               peer->ibp_version     = version;
+               peer->ibp_incarnation = incarnation;
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (!retry)
+               return;
+
+       switch (why) {
+       default:
+               reason = "Unknown";
+               break;
+
+       case IBLND_REJECT_CONN_STALE:
+               reason = "stale";
+               break;
+
+       case IBLND_REJECT_CONN_RACE:
+               reason = "conn race";
+               break;
+
+       case IBLND_REJECT_CONN_UNCOMPAT:
+               reason = "version negotiation";
+               break;
+       }
+
+       CNETERR("%s: retrying (%s), %x, %x, "
+               "queue_dep: %d, max_frag: %d, msg_size: %d\n",
+               libcfs_nid2str(peer->ibp_nid),
+               reason, IBLND_MSG_VERSION, version,
+               cp != NULL ? cp->ibcp_queue_depth  : IBLND_MSG_QUEUE_SIZE(version),
+               cp != NULL ? cp->ibcp_max_frags    : IBLND_RDMA_FRAGS(version),
+               cp != NULL ? cp->ibcp_max_msg_size : IBLND_MSG_SIZE);
+
+       kiblnd_connect_peer(peer);
+}
+
+void
+kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
+{
+       kib_peer_t    *peer = conn->ibc_peer;
+
+       LASSERT (!in_interrupt());
+       LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
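+       /* Some reject reasons are generated by the CM/IB stack itself;
+        * IB_CM_REJ_CONSUMER_DEFINED carries an o2iblnd kib_rej_t in the
+        * private data, which may ask me to reconnect with a different
+        * protocol version or different connection parameters. */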
+       switch (reason) {
+       case IB_CM_REJ_STALE_CONN:
+               kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0,
+                                IBLND_REJECT_CONN_STALE, NULL);
+               break;
+
+       case IB_CM_REJ_INVALID_SERVICE_ID:
+               CNETERR("%s rejected: no listener at %d\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       *kiblnd_tunables.kib_service);
+               break;
+
+       case IB_CM_REJ_CONSUMER_DEFINED:
+               if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) {
+                       kib_rej_t       *rej     = priv;
+                       kib_connparams_t *cp      = NULL;
+                       int            flip     = 0;
+                       __u64        incarnation = -1;
+
+                       /* NB. default incarnation is -1 because:
+                        * a) V1 will ignore the dst incarnation in the connreq.
+                        * b) V2 will provide its incarnation while rejecting me,
+                        *    so -1 will be overwritten.
+                        *
+                        * If I try to connect to a V1 peer with the V2 protocol,
+                        * it rejects me and is then upgraded to V2.  Knowing
+                        * nothing about the upgrade, I retry with V1; the
+                        * upgraded V2 peer can then tell I'm talking to the old
+                        * version and rejects me (incarnation is -1).
+                        */
+
+                       if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+                           rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+                               __swab32s(&rej->ibr_magic);
+                               __swab16s(&rej->ibr_version);
+                               flip = 1;
+                       }
+
+                       if (priv_nob >= sizeof(kib_rej_t) &&
+                           rej->ibr_version > IBLND_MSG_VERSION_1) {
+                               /* priv_nob is always 148 (the define of
+                                * IB_CM_REJ_PRIVATE_DATA_SIZE) in current
+                                * versions of OFED, so we still need to check
+                                * the version. */
+                               cp = &rej->ibr_cp;
+
+                               if (flip) {
+                                       __swab64s(&rej->ibr_incarnation);
+                                       __swab16s(&cp->ibcp_queue_depth);
+                                       __swab16s(&cp->ibcp_max_frags);
+                                       __swab32s(&cp->ibcp_max_msg_size);
+                               }
+
+                               incarnation = rej->ibr_incarnation;
+                       }
+
+                       if (rej->ibr_magic != IBLND_MSG_MAGIC &&
+                           rej->ibr_magic != LNET_PROTO_MAGIC) {
+                               CERROR("%s rejected: consumer defined fatal error\n",
+                                      libcfs_nid2str(peer->ibp_nid));
+                               break;
+                       }
+
+                       if (rej->ibr_version != IBLND_MSG_VERSION &&
+                           rej->ibr_version != IBLND_MSG_VERSION_1) {
+                               CERROR("%s rejected: o2iblnd version %x error\n",
+                                      libcfs_nid2str(peer->ibp_nid),
+                                      rej->ibr_version);
+                               break;
+                       }
+
+                       if (rej->ibr_why     == IBLND_REJECT_FATAL &&
+                           rej->ibr_version == IBLND_MSG_VERSION_1) {
+                               CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
+                                      libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
+
+                               if (conn->ibc_version != IBLND_MSG_VERSION_1)
+                                       rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
+                       }
+
+                       switch (rej->ibr_why) {
+                       case IBLND_REJECT_CONN_RACE:
+                       case IBLND_REJECT_CONN_STALE:
+                       case IBLND_REJECT_CONN_UNCOMPAT:
+                               kiblnd_reconnect(conn, rej->ibr_version,
+                                                incarnation, rej->ibr_why, cp);
+                               break;
+
+                       case IBLND_REJECT_MSG_QUEUE_SIZE:
+                               CERROR("%s rejected: incompatible message queue depth %d, %d\n",
+                                      libcfs_nid2str(peer->ibp_nid), cp->ibcp_queue_depth,
+                                      IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+                               break;
+
+                       case IBLND_REJECT_RDMA_FRAGS:
+                               CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
+                                      libcfs_nid2str(peer->ibp_nid), cp->ibcp_max_frags,
+                                      IBLND_RDMA_FRAGS(conn->ibc_version));
+                               break;
+
+                       case IBLND_REJECT_NO_RESOURCES:
+                               CERROR("%s rejected: o2iblnd no resources\n",
+                                      libcfs_nid2str(peer->ibp_nid));
+                               break;
+
+                       case IBLND_REJECT_FATAL:
+                               CERROR("%s rejected: o2iblnd fatal error\n",
+                                      libcfs_nid2str(peer->ibp_nid));
+                               break;
+
+                       default:
+                               CERROR("%s rejected: o2iblnd reason %d\n",
+                                      libcfs_nid2str(peer->ibp_nid),
+                                      rej->ibr_why);
+                               break;
+                       }
+                       break;
+               }
+               /* fall through */
+       default:
+               CNETERR("%s rejected: reason %d, size %d\n",
+                       libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+               break;
+       }
+
+       kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+void
+kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
+{
+       kib_peer_t    *peer = conn->ibc_peer;
+       lnet_ni_t     *ni   = peer->ibp_ni;
+       kib_net_t     *net  = ni->ni_data;
+       kib_msg_t     *msg  = priv;
+       int         ver  = conn->ibc_version;
+       int         rc   = kiblnd_unpack_msg(msg, priv_nob);
+       unsigned long  flags;
+
+       LASSERT (net != NULL);
+
+       if (rc != 0) {
+               CERROR("Can't unpack connack from %s: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), rc);
+               goto failed;
+       }
+
+       if (msg->ibm_type != IBLND_MSG_CONNACK) {
+               CERROR("Unexpected message %d from %s\n",
+                      msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       if (ver != msg->ibm_version) {
+               CERROR("%s replied version %x is different from "
+                      "requested version %x\n",
+                      libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       if (msg->ibm_u.connparams.ibcp_queue_depth !=
+           IBLND_MSG_QUEUE_SIZE(ver)) {
+               CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      msg->ibm_u.connparams.ibcp_queue_depth,
+                      IBLND_MSG_QUEUE_SIZE(ver));
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       if (msg->ibm_u.connparams.ibcp_max_frags !=
+           IBLND_RDMA_FRAGS(ver)) {
+               CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      msg->ibm_u.connparams.ibcp_max_frags,
+                      IBLND_RDMA_FRAGS(ver));
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+               CERROR("%s max message size %d too big (%d max)\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      msg->ibm_u.connparams.ibcp_max_msg_size,
+                      IBLND_MSG_SIZE);
+               rc = -EPROTO;
+               goto failed;
+       }
+
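+       /* Finally check that the connack is addressed to this NI and to the
+        * current incarnation of the network; anything else is stale. */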
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       if (msg->ibm_dstnid == ni->ni_nid &&
+           msg->ibm_dststamp == net->ibn_incarnation)
+               rc = 0;
+       else
+               rc = -ESTALE;
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (rc != 0) {
+               CERROR("Bad connection reply from %s, rc = %d, "
+                      "version: %x max_frags: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), rc,
+                      msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
+               goto failed;
+       }
+
+       conn->ibc_incarnation      = msg->ibm_srcstamp;
+       conn->ibc_credits         =
+       conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
+       LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
+                <= IBLND_RX_MSGS(ver));
+
+       kiblnd_connreq_done(conn, 0);
+       return;
+
+ failed:
+       /* NB My QP has already established itself, so I handle anything going
+        * wrong here by setting ibc_comms_error.
+        * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
+        * immediately tears it down. */
+
+       LASSERT (rc != 0);
+       conn->ibc_comms_error = rc;
+       kiblnd_connreq_done(conn, 0);
+}
+
+int
+kiblnd_active_connect (struct rdma_cm_id *cmid)
+{
+       kib_peer_t            *peer = (kib_peer_t *)cmid->context;
+       kib_conn_t            *conn;
+       kib_msg_t              *msg;
+       struct rdma_conn_param   cp;
+       int                   version;
+       __u64               incarnation;
+       unsigned long       flags;
+       int                   rc;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       incarnation = peer->ibp_incarnation;
+       version     = (peer->ibp_version == 0) ? IBLND_MSG_VERSION :
+                                                peer->ibp_version;
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
+       if (conn == NULL) {
+               kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+               kiblnd_peer_decref(peer); /* lose cmid's ref */
+               return -ENOMEM;
+       }
+
+       /* conn "owns" cmid now, so I return success from here on to ensure the
+        * CM callback doesn't destroy cmid. conn also takes over cmid's ref
+        * on peer */
+
+       msg = &conn->ibc_connvars->cv_msg;
+
+       memset(msg, 0, sizeof(*msg));
+       kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
+       msg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+       msg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+       msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+       kiblnd_pack_msg(peer->ibp_ni, msg, version,
+                       0, peer->ibp_nid, incarnation);
+
+       memset(&cp, 0, sizeof(cp));
+       cp.private_data = msg;
+       cp.private_data_len    = msg->ibm_nob;
+       cp.responder_resources = 0;          /* No atomic ops or RDMA reads */
+       cp.initiator_depth     = 0;
+       cp.flow_control = 1;
+       cp.retry_count   = *kiblnd_tunables.kib_retry_count;
+       cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+       LASSERT(cmid->context == (void *)conn);
+       LASSERT(conn->ibc_cmid == cmid);
+
+       rc = rdma_connect(cmid, &cp);
+       if (rc != 0) {
+               CERROR("Can't connect to %s: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), rc);
+               kiblnd_connreq_done(conn, rc);
+               kiblnd_conn_decref(conn);
+       }
+
+       return 0;
+}
+
+int
+kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{
+       kib_peer_t  *peer;
+       kib_conn_t  *conn;
+       int       rc;
+
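+       /* Single entry point for all RDMA CM events: a CONNECT_REQUEST
+        * arrives on a child of the listener cmid (context is the
+        * kib_dev_t), address/route resolution events carry the connecting
+        * peer in cmid->context, and all later events carry the conn. */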
+       switch (event->event) {
+       default:
+               CERROR("Unexpected event: %d, status: %d\n",
+                      event->event, event->status);
+               LBUG();
+
+       case RDMA_CM_EVENT_CONNECT_REQUEST:
+               /* destroy cmid on failure */
+               rc = kiblnd_passive_connect(cmid,
+                                           (void *)KIBLND_CONN_PARAM(event),
+                                           KIBLND_CONN_PARAM_LEN(event));
+               CDEBUG(D_NET, "connreq: %d\n", rc);
+               return rc;
+
+       case RDMA_CM_EVENT_ADDR_ERROR:
+               peer = (kib_peer_t *)cmid->context;
+               CNETERR("%s: ADDR ERROR %d\n",
+                      libcfs_nid2str(peer->ibp_nid), event->status);
+               kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+               kiblnd_peer_decref(peer);
+               return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
+
+       case RDMA_CM_EVENT_ADDR_RESOLVED:
+               peer = (kib_peer_t *)cmid->context;
+
+               CDEBUG(D_NET,"%s Addr resolved: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), event->status);
+
+               if (event->status != 0) {
+                       CNETERR("Can't resolve address for %s: %d\n",
+                               libcfs_nid2str(peer->ibp_nid), event->status);
+                       rc = event->status;
+               } else {
+                       rc = rdma_resolve_route(
+                               cmid, *kiblnd_tunables.kib_timeout * 1000);
+                       if (rc == 0)
+                               return 0;
+                       /* Can't initiate route resolution */
+                       CERROR("Can't resolve route for %s: %d\n",
+                              libcfs_nid2str(peer->ibp_nid), rc);
+               }
+               kiblnd_peer_connect_failed(peer, 1, rc);
+               kiblnd_peer_decref(peer);
+               return rc;                    /* rc != 0 destroys cmid */
+
+       case RDMA_CM_EVENT_ROUTE_ERROR:
+               peer = (kib_peer_t *)cmid->context;
+               CNETERR("%s: ROUTE ERROR %d\n",
+                       libcfs_nid2str(peer->ibp_nid), event->status);
+               kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+               kiblnd_peer_decref(peer);
+               return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
+
+       case RDMA_CM_EVENT_ROUTE_RESOLVED:
+               peer = (kib_peer_t *)cmid->context;
+               CDEBUG(D_NET,"%s Route resolved: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), event->status);
+
+               if (event->status == 0)
+                       return kiblnd_active_connect(cmid);
+
+               CNETERR("Can't resolve route for %s: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), event->status);
+               kiblnd_peer_connect_failed(peer, 1, event->status);
+               kiblnd_peer_decref(peer);
+               return event->status;      /* rc != 0 destroys cmid */
+
+       case RDMA_CM_EVENT_UNREACHABLE:
+               conn = (kib_conn_t *)cmid->context;
+               LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+                       conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+               CNETERR("%s: UNREACHABLE %d\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+               kiblnd_connreq_done(conn, -ENETDOWN);
+               kiblnd_conn_decref(conn);
+               return 0;
+
+       case RDMA_CM_EVENT_CONNECT_ERROR:
+               conn = (kib_conn_t *)cmid->context;
+               LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+                       conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+               CNETERR("%s: CONNECT ERROR %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+               kiblnd_connreq_done(conn, -ENOTCONN);
+               kiblnd_conn_decref(conn);
+               return 0;
+
+       case RDMA_CM_EVENT_REJECTED:
+               conn = (kib_conn_t *)cmid->context;
+               switch (conn->ibc_state) {
+               default:
+                       LBUG();
+
+               case IBLND_CONN_PASSIVE_WAIT:
+                       CERROR ("%s: REJECTED %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               event->status);
+                       kiblnd_connreq_done(conn, -ECONNRESET);
+                       break;
+
+               case IBLND_CONN_ACTIVE_CONNECT:
+                       kiblnd_rejected(conn, event->status,
+                                       (void *)KIBLND_CONN_PARAM(event),
+                                       KIBLND_CONN_PARAM_LEN(event));
+                       break;
+               }
+               kiblnd_conn_decref(conn);
+               return 0;
+
+       case RDMA_CM_EVENT_ESTABLISHED:
+               conn = (kib_conn_t *)cmid->context;
+               switch (conn->ibc_state) {
+               default:
+                       LBUG();
+
+               case IBLND_CONN_PASSIVE_WAIT:
+                       CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       kiblnd_connreq_done(conn, 0);
+                       break;
+
+               case IBLND_CONN_ACTIVE_CONNECT:
+                       CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       kiblnd_check_connreply(conn,
+                                              (void *)KIBLND_CONN_PARAM(event),
+                                              KIBLND_CONN_PARAM_LEN(event));
+                       break;
+               }
+               /* net keeps its ref on conn! */
+               return 0;
+
+       case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+               CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
+               return 0;
+       case RDMA_CM_EVENT_DISCONNECTED:
+               conn = (kib_conn_t *)cmid->context;
+               if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+                       CERROR("%s DISCONNECTED\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       kiblnd_connreq_done(conn, -ECONNRESET);
+               } else {
+                       kiblnd_close_conn(conn, 0);
+               }
+               kiblnd_conn_decref(conn);
+               cmid->context = NULL;
+               return 0;
+
+       case RDMA_CM_EVENT_DEVICE_REMOVAL:
+               LCONSOLE_ERROR_MSG(0x131,
+                                  "Received notification of device removal\n"
+                                  "Please shutdown LNET to allow this to proceed\n");
+               /* Can't remove network from underneath LNET for now, so I have
+                * to ignore this */
+               return 0;
+
+       case RDMA_CM_EVENT_ADDR_CHANGE:
+               LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
+               return 0;
+       }
+}
+
+static int
+kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs)
+{
+       kib_tx_t          *tx;
+       struct list_head        *ttmp;
+
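+       /* A tx on any send queue must still be flagged as queued, while a
+        * tx on the active list is in flight (still sending or waiting for
+        * a reply); either way it has timed out once tx_deadline passes. */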
+       list_for_each (ttmp, txs) {
+               tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+               if (txs != &conn->ibc_active_txs) {
+                       LASSERT (tx->tx_queued);
+               } else {
+                       LASSERT (!tx->tx_queued);
+                       LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+               }
+
+               if (cfs_time_aftereq (jiffies, tx->tx_deadline)) {
+                       CERROR("Timed out tx: %s, %lu seconds\n",
+                              kiblnd_queue2str(conn, txs),
+                              cfs_duration_sec(jiffies - tx->tx_deadline));
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+static int
+kiblnd_conn_timed_out_locked(kib_conn_t *conn)
+{
+       return  kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
+               kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
+               kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
+               kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
+               kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
+}
+
+void
+kiblnd_check_conns (int idx)
+{
+       LIST_HEAD (closes);
+       LIST_HEAD (checksends);
+       struct list_head    *peers = &kiblnd_data.kib_peers[idx];
+       struct list_head    *ptmp;
+       kib_peer_t    *peer;
+       kib_conn_t    *conn;
+       struct list_head    *ctmp;
+       unsigned long  flags;
+
+       /* NB. We expect to have a look at all the peers and not find any
+        * RDMAs to time out, so we just use a shared lock while we
+        * take a look... */
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       list_for_each (ptmp, peers) {
+               peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+               list_for_each (ctmp, &peer->ibp_conns) {
+                       int timedout;
+                       int sendnoop;
+
+                       conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                       LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+                       spin_lock(&conn->ibc_lock);
+
+                       sendnoop = kiblnd_need_noop(conn);
+                       timedout = kiblnd_conn_timed_out_locked(conn);
+                       if (!sendnoop && !timedout) {
+                               spin_unlock(&conn->ibc_lock);
+                               continue;
+                       }
+
+                       if (timedout) {
+                               CERROR("Timed out RDMA with %s (%lu): "
+                                      "c: %u, oc: %u, rc: %u\n",
+                                      libcfs_nid2str(peer->ibp_nid),
+                                      cfs_duration_sec(cfs_time_current() -
+                                                       peer->ibp_last_alive),
+                                      conn->ibc_credits,
+                                      conn->ibc_outstanding_credits,
+                                      conn->ibc_reserved_credits);
+                               list_add(&conn->ibc_connd_list, &closes);
+                       } else {
+                               list_add(&conn->ibc_connd_list,
+                                            &checksends);
+                       }
+                       /* +ref for 'closes' or 'checksends' */
+                       kiblnd_conn_addref(conn);
+
+                       spin_unlock(&conn->ibc_lock);
+               }
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       /* Handle timeout by closing the whole
+        * connection. We can only be sure RDMA activity
+        * has ceased once the QP has been modified. */
+       while (!list_empty(&closes)) {
+               conn = list_entry(closes.next,
+                                     kib_conn_t, ibc_connd_list);
+               list_del(&conn->ibc_connd_list);
+               kiblnd_close_conn(conn, -ETIMEDOUT);
+               kiblnd_conn_decref(conn);
+       }
+
+       /* In case we have enough credits to return via a
+        * NOOP, but there were no non-blocking tx descs
+        * free to do it last time... */
+       while (!list_empty(&checksends)) {
+               conn = list_entry(checksends.next,
+                                     kib_conn_t, ibc_connd_list);
+               list_del(&conn->ibc_connd_list);
+               kiblnd_check_sends(conn);
+               kiblnd_conn_decref(conn);
+       }
+}
+
+void
+kiblnd_disconnect_conn (kib_conn_t *conn)
+{
+       LASSERT (!in_interrupt());
+       LASSERT (current == kiblnd_data.kib_connd);
+       LASSERT (conn->ibc_state == IBLND_CONN_CLOSING);
+
+       rdma_disconnect(conn->ibc_cmid);
+       kiblnd_finalise_conn(conn);
+
+       kiblnd_peer_notify(conn->ibc_peer);
+}
+
+int
+kiblnd_connd (void *arg)
+{
+       wait_queue_t     wait;
+       unsigned long      flags;
+       kib_conn_t      *conn;
+       int             timeout;
+       int             i;
+       int             dropped_lock;
+       int             peer_index = 0;
+       unsigned long      deadline = jiffies;
+
+       cfs_block_allsigs ();
+
+       init_waitqueue_entry_current (&wait);
+       kiblnd_data.kib_connd = current;
+
+       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+       while (!kiblnd_data.kib_shutdown) {
+
+               dropped_lock = 0;
+
+               if (!list_empty (&kiblnd_data.kib_connd_zombies)) {
+                       conn = list_entry(kiblnd_data.kib_connd_zombies.next,
+                                             kib_conn_t, ibc_list);
+                       list_del(&conn->ibc_list);
+
+                       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+                                              flags);
+                       dropped_lock = 1;
+
+                       kiblnd_destroy_conn(conn);
+
+                       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+               }
+
+               if (!list_empty(&kiblnd_data.kib_connd_conns)) {
+                       conn = list_entry(kiblnd_data.kib_connd_conns.next,
+                                             kib_conn_t, ibc_list);
+                       list_del(&conn->ibc_list);
+
+                       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+                                              flags);
+                       dropped_lock = 1;
+
+                       kiblnd_disconnect_conn(conn);
+                       kiblnd_conn_decref(conn);
+
+                       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+               }
+
+               /* careful with the jiffy wrap... */
+               timeout = (int)(deadline - jiffies);
+               if (timeout <= 0) {
+                       const int n = 4;
+                       const int p = 1;
+                       int       chunk = kiblnd_data.kib_peer_hash_size;
+
+                       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+                       dropped_lock = 1;
+
+                       /* Time to check for RDMA timeouts on a few more
+                        * peers: I do checks every 'p' seconds on a
+                        * proportion of the peer table and I need to check
+                        * every connection 'n' times within a timeout
+                        * interval, to ensure I detect a timeout on any
+                        * connection within (n+1)/n times the timeout
+                        * interval. */
+
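+                       /* e.g. with a 1024-bucket peer table and a 50s
+                        * timeout (n = 4, p = 1): chunk = 1024 * 4 / 50 = 81
+                        * buckets scanned per second, so every bucket is
+                        * visited about 4 times per timeout interval. */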
+                       if (*kiblnd_tunables.kib_timeout > n * p)
+                               chunk = (chunk * n * p) /
+                                       *kiblnd_tunables.kib_timeout;
+                       if (chunk == 0)
+                               chunk = 1;
+
+                       for (i = 0; i < chunk; i++) {
+                               kiblnd_check_conns(peer_index);
+                               peer_index = (peer_index + 1) %
+                                            kiblnd_data.kib_peer_hash_size;
+                       }
+
+                       deadline += p * HZ;
+                       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+               }
+
+               if (dropped_lock)
+                       continue;
+
+               /* Nothing to do for 'timeout'  */
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+               spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+               waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+               spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+       kiblnd_thread_fini();
+       return 0;
+}
+
+void
+kiblnd_qp_event(struct ib_event *event, void *arg)
+{
+       kib_conn_t *conn = arg;
+
+       switch (event->event) {
+       case IB_EVENT_COMM_EST:
+               CDEBUG(D_NET, "%s established\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               return;
+
+       default:
+               CERROR("%s: Async QP event type %d\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+               return;
+       }
+}
+
+void
+kiblnd_complete (struct ib_wc *wc)
+{
+       switch (kiblnd_wreqid2type(wc->wr_id)) {
+       default:
+               LBUG();
+
+       case IBLND_WID_RDMA:
+               /* We only get RDMA completion notification if it fails.  All
+                * subsequent work items, including the final SEND will fail
+                * too.  However we can't print out any more info about the
+                * failing RDMA because 'tx' might be back on the idle list or
+                * even reused already if we didn't manage to post all our work
+                * items */
+               CNETERR("RDMA (tx: %p) failed: %d\n",
+                       kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+               return;
+
+       case IBLND_WID_TX:
+               kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+               return;
+
+       case IBLND_WID_RX:
+               kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
+                                  wc->byte_len);
+               return;
+       }
+}
+
+void
+kiblnd_cq_completion(struct ib_cq *cq, void *arg)
+{
+       /* NB I'm not allowed to schedule this conn once its refcount has
+        * reached 0.  Since fundamentally I'm racing with scheduler threads
+        * consuming my CQ I could be called after all completions have
+        * occurred.  But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+        * and this CQ is about to be destroyed so I NOOP. */
+       kib_conn_t              *conn = (kib_conn_t *)arg;
+       struct kib_sched_info   *sched = conn->ibc_sched;
+       unsigned long           flags;
+
+       LASSERT(cq == conn->ibc_cq);
+
+       spin_lock_irqsave(&sched->ibs_lock, flags);
+
+       conn->ibc_ready = 1;
+
+       if (!conn->ibc_scheduled &&
+           (conn->ibc_nrx > 0 ||
+            conn->ibc_nsends_posted > 0)) {
+               kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+               conn->ibc_scheduled = 1;
+               list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
+
+               if (waitqueue_active(&sched->ibs_waitq))
+                       wake_up(&sched->ibs_waitq);
+       }
+
+       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+}
+
+void
+kiblnd_cq_event(struct ib_event *event, void *arg)
+{
+       kib_conn_t *conn = arg;
+
+       CERROR("%s: async CQ event type %d\n",
+              libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+int
+kiblnd_scheduler(void *arg)
+{
+       long                    id = (long)arg;
+       struct kib_sched_info   *sched;
+       kib_conn_t              *conn;
+       wait_queue_t            wait;
+       unsigned long           flags;
+       struct ib_wc            wc;
+       int                     did_something;
+       int                     busy_loops = 0;
+       int                     rc;
+
+       cfs_block_allsigs();
+
+       init_waitqueue_entry_current(&wait);
+
+       sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
+
+       rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
+       if (rc != 0) {
+               CWARN("Failed to bind on CPT %d, please verify whether "
+                     "all CPUs are healthy and reload modules if necessary, "
+                     "otherwise your system might be at risk of low "
+                     "performance\n", sched->ibs_cpt);
+       }
+
+       spin_lock_irqsave(&sched->ibs_lock, flags);
+
+       while (!kiblnd_data.kib_shutdown) {
+               if (busy_loops++ >= IBLND_RESCHED) {
+                       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+                       cond_resched();
+                       busy_loops = 0;
+
+                       spin_lock_irqsave(&sched->ibs_lock, flags);
+               }
+
+               did_something = 0;
+
+               if (!list_empty(&sched->ibs_conns)) {
+                       conn = list_entry(sched->ibs_conns.next,
+                                             kib_conn_t, ibc_sched_list);
+                       /* take over kib_sched_conns' ref on conn... */
+                       LASSERT(conn->ibc_scheduled);
+                       list_del(&conn->ibc_sched_list);
+                       conn->ibc_ready = 0;
+
+                       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
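+                       /* Poll one completion; if the CQ is empty, re-arm
+                        * the completion notification and poll once more to
+                        * close the race with a completion that arrived
+                        * between the first poll and the re-arm. */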
+                       rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+                       if (rc == 0) {
+                               rc = ib_req_notify_cq(conn->ibc_cq,
+                                                     IB_CQ_NEXT_COMP);
+                               if (rc < 0) {
+                                       CWARN("%s: ib_req_notify_cq failed: %d, "
+                                             "closing connection\n",
+                                             libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                                       kiblnd_close_conn(conn, -EIO);
+                                       kiblnd_conn_decref(conn);
+                                       spin_lock_irqsave(&sched->ibs_lock,
+                                                             flags);
+                                       continue;
+                               }
+
+                               rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+                       }
+
+                       if (rc < 0) {
+                               CWARN("%s: ib_poll_cq failed: %d, "
+                                     "closing connection\n",
+                                     libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                                     rc);
+                               kiblnd_close_conn(conn, -EIO);
+                               kiblnd_conn_decref(conn);
+                               spin_lock_irqsave(&sched->ibs_lock, flags);
+                               continue;
+                       }
+
+                       spin_lock_irqsave(&sched->ibs_lock, flags);
+
+                       if (rc != 0 || conn->ibc_ready) {
+                               /* There may be another completion waiting; get
+                                * another scheduler to check while I handle
+                                * this one... */
+                               /* +1 ref for sched_conns */
+                               kiblnd_conn_addref(conn);
+                               list_add_tail(&conn->ibc_sched_list,
+                                                 &sched->ibs_conns);
+                               if (waitqueue_active(&sched->ibs_waitq))
+                                       wake_up(&sched->ibs_waitq);
+                       } else {
+                               conn->ibc_scheduled = 0;
+                       }
+
+                       if (rc != 0) {
+                               spin_unlock_irqrestore(&sched->ibs_lock, flags);
+                               kiblnd_complete(&wc);
+
+                               spin_lock_irqsave(&sched->ibs_lock, flags);
+                       }
+
+                       kiblnd_conn_decref(conn); /* ...drop my ref from above */
+                       did_something = 1;
+               }
+
+               if (did_something)
+                       continue;
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
+               spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+               waitq_wait(&wait, TASK_INTERRUPTIBLE);
+               busy_loops = 0;
+
+               remove_wait_queue(&sched->ibs_waitq, &wait);
+               set_current_state(TASK_RUNNING);
+               spin_lock_irqsave(&sched->ibs_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+       kiblnd_thread_fini();
+       return 0;
+}
+
+int
+kiblnd_failover_thread(void *arg)
+{
+       rwlock_t                *glock = &kiblnd_data.kib_global_lock;
+       kib_dev_t        *dev;
+       wait_queue_t     wait;
+       unsigned long      flags;
+       int             rc;
+
+       LASSERT (*kiblnd_tunables.kib_dev_failover != 0);
+
+       cfs_block_allsigs ();
+
+       init_waitqueue_entry_current(&wait);
+       write_lock_irqsave(glock, flags);
+
+       while (!kiblnd_data.kib_shutdown) {
+               int     do_failover = 0;
+               int     long_sleep;
+
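+               /* Pick the first failed device whose failover holdoff has
+                * expired; at most one device is failed over per pass. */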
+               list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
+                                   ibd_fail_list) {
+                       if (cfs_time_before(cfs_time_current(),
+                                           dev->ibd_next_failover))
+                               continue;
+                       do_failover = 1;
+                       break;
+               }
+
+               if (do_failover) {
+                       list_del_init(&dev->ibd_fail_list);
+                       dev->ibd_failover = 1;
+                       write_unlock_irqrestore(glock, flags);
+
+                       rc = kiblnd_dev_failover(dev);
+
+                       write_lock_irqsave(glock, flags);
+
+                       LASSERT (dev->ibd_failover);
+                       dev->ibd_failover = 0;
+                       if (rc >= 0) { /* Device is OK or failover succeeded */
+                               dev->ibd_next_failover = cfs_time_shift(3);
+                               continue;
+                       }
+
+                       /* failed to failover, retry later */
+                       dev->ibd_next_failover =
+                               cfs_time_shift(min(dev->ibd_failed_failover, 10));
+                       if (kiblnd_dev_can_failover(dev)) {
+                               list_add_tail(&dev->ibd_fail_list,
+                                             &kiblnd_data.kib_failed_devs);
+                       }
+
+                       continue;
+               }
+
+               /* long sleep if no more pending failover */
+               long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+               write_unlock_irqrestore(glock, flags);
+
+               rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
+                                                  cfs_time_seconds(1));
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+               write_lock_irqsave(glock, flags);
+
+               if (!long_sleep || rc != 0)
+                       continue;
+
+               /* Before a long sleep, routinely check all active devices:
+                * if a device has no active connection and no local SEND,
+                * we could otherwise keep listening on the wrong HCA forever
+                * after a bonding failover */
+               list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+                       if (kiblnd_dev_can_failover(dev)) {
+                               list_add_tail(&dev->ibd_fail_list,
+                                             &kiblnd_data.kib_failed_devs);
+                       }
+               }
+       }
+
+       write_unlock_irqrestore(glock, flags);
+
+       kiblnd_thread_fini();
+       return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
new file mode 100644 (file)
index 0000000..e21028b
--- /dev/null
@@ -0,0 +1,493 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_modparams.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+static int service = 987;
+CFS_MODULE_PARM(service, "i", int, 0444,
+               "service number (within RDMA_PS_TCP)");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+/* Number of threads in each scheduler pool (which is per-CPT);
+ * a reasonable value is estimated from the number of CPUs if this is set to zero. */
+static int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+               "number of threads in each scheduler pool");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int ntx = 512;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of message descriptors allocated for each pool");
+
+/* NB: this value is shared by all CPTs */
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int peer_credits_hiw = 0;
+CFS_MODULE_PARM(peer_credits_hiw, "i", int, 0444,
+               "when to eagerly return credits");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+               "# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+               "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+static char *ipif_name = "ib0";
+CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
+               "IPoIB interface name");
+
+static int retry_count = 5;
+CFS_MODULE_PARM(retry_count, "i", int, 0644,
+               "Retransmissions when no ACK received");
+
+static int rnr_retry_count = 6;
+CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
+               "RNR retransmissions");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+               "Idle time in seconds before sending a keepalive");
+
+static int ib_mtu = 0;
+CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
+               "IB MTU 256/512/1024/2048/4096");
+
+static int concurrent_sends = 0;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+               "send work-queue sizing");
+
+static int map_on_demand = 0;
+CFS_MODULE_PARM(map_on_demand, "i", int, 0444,
+               "map on demand");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_pool_size = 512;
+CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
+               "size of fmr pool on each CPT (>= ntx / 4)");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_flush_trigger = 384;
+CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
+               "# dirty FMRs that triggers pool flush");
+
+static int fmr_cache = 1;
+CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
+               "non-zero to enable FMR caching");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int pmr_pool_size = 512;
+CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444,
+               "size of MR cache pmr pool on each CPT");
+
+/*
+ * 0: disable failover
+ * 1: enable failover if necessary
+ * 2: force to failover (for debug)
+ */
+static int dev_failover = 0;
+CFS_MODULE_PARM(dev_failover, "i", int, 0444,
+              "HCA failover for bonding (0 off, 1 on, other values reserved)");
+
+
+static int require_privileged_port = 0;
+CFS_MODULE_PARM(require_privileged_port, "i", int, 0644,
+               "require privileged port when accepting connection");
+
+static int use_privileged_port = 1;
+CFS_MODULE_PARM(use_privileged_port, "i", int, 0644,
+               "use privileged port when initiating connection");
+
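+/* The LND reads all tunables through these pointers, so runtime changes to
+ * the writable (mode 0644) module parameters take effect immediately. */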
+kib_tunables_t kiblnd_tunables = {
+       .kib_dev_failover       = &dev_failover,
+       .kib_service            = &service,
+       .kib_cksum              = &cksum,
+       .kib_timeout            = &timeout,
+       .kib_keepalive          = &keepalive,
+       .kib_ntx                = &ntx,
+       .kib_credits            = &credits,
+       .kib_peertxcredits      = &peer_credits,
+       .kib_peercredits_hiw    = &peer_credits_hiw,
+       .kib_peerrtrcredits     = &peer_buffer_credits,
+       .kib_peertimeout        = &peer_timeout,
+       .kib_default_ipif       = &ipif_name,
+       .kib_retry_count        = &retry_count,
+       .kib_rnr_retry_count    = &rnr_retry_count,
+       .kib_concurrent_sends   = &concurrent_sends,
+       .kib_ib_mtu             = &ib_mtu,
+       .kib_map_on_demand      = &map_on_demand,
+       .kib_fmr_pool_size      = &fmr_pool_size,
+       .kib_fmr_flush_trigger  = &fmr_flush_trigger,
+       .kib_fmr_cache          = &fmr_cache,
+       .kib_pmr_pool_size      = &pmr_pool_size,
+       .kib_require_priv_port  = &require_privileged_port,
+       .kib_use_priv_port      = &use_privileged_port,
+       .kib_nscheds            = &nscheds
+};
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+static char ipif_basename_space[32];
+
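+/* Fallback: when module parameters are not exposed through sysfs, the
+ * tunables above are mirrored as sysctl entries registered in
+ * kiblnd_sysctl_init() below. */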
+enum {
+       O2IBLND_SERVICE  = 1,
+       O2IBLND_CKSUM,
+       O2IBLND_TIMEOUT,
+       O2IBLND_NTX,
+       O2IBLND_CREDITS,
+       O2IBLND_PEER_TXCREDITS,
+       O2IBLND_PEER_CREDITS_HIW,
+       O2IBLND_PEER_RTRCREDITS,
+       O2IBLND_PEER_TIMEOUT,
+       O2IBLND_IPIF_BASENAME,
+       O2IBLND_RETRY_COUNT,
+       O2IBLND_RNR_RETRY_COUNT,
+       O2IBLND_KEEPALIVE,
+       O2IBLND_CONCURRENT_SENDS,
+       O2IBLND_IB_MTU,
+       O2IBLND_MAP_ON_DEMAND,
+       O2IBLND_FMR_POOL_SIZE,
+       O2IBLND_FMR_FLUSH_TRIGGER,
+       O2IBLND_FMR_CACHE,
+       O2IBLND_PMR_POOL_SIZE,
+       O2IBLND_DEV_FAILOVER
+};
+
+static ctl_table_t kiblnd_ctl_table[] = {
+       {
+               .ctl_name = O2IBLND_SERVICE,
+               .procname = "service",
+               .data     = &service,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_CKSUM,
+               .procname = "cksum",
+               .data     = &cksum,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_TIMEOUT,
+               .procname = "timeout",
+               .data     = &timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_NTX,
+               .procname = "ntx",
+               .data     = &ntx,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_CREDITS,
+               .procname = "credits",
+               .data     = &credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PEER_TXCREDITS,
+               .procname = "peer_credits",
+               .data     = &peer_credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PEER_CREDITS_HIW,
+               .procname = "peer_credits_hiw",
+               .data     = &peer_credits_hiw,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PEER_RTRCREDITS,
+               .procname = "peer_buffer_credits",
+               .data     = &peer_buffer_credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PEER_TIMEOUT,
+               .procname = "peer_timeout",
+               .data     = &peer_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_IPIF_BASENAME,
+               .procname = "ipif_name",
+               .data     = ipif_basename_space,
+               .maxlen   = sizeof(ipif_basename_space),
+               .mode     = 0444,
+               .proc_handler = &proc_dostring
+       },
+       {
+               .ctl_name = O2IBLND_RETRY_COUNT,
+               .procname = "retry_count",
+               .data     = &retry_count,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_RNR_RETRY_COUNT,
+               .procname = "rnr_retry_count",
+               .data     = &rnr_retry_count,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_KEEPALIVE,
+               .procname = "keepalive",
+               .data     = &keepalive,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_CONCURRENT_SENDS,
+               .procname = "concurrent_sends",
+               .data     = &concurrent_sends,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_IB_MTU,
+               .procname = "ib_mtu",
+               .data     = &ib_mtu,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_MAP_ON_DEMAND,
+               .procname = "map_on_demand",
+               .data     = &map_on_demand,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_FMR_POOL_SIZE,
+               .procname = "fmr_pool_size",
+               .data     = &fmr_pool_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_FMR_FLUSH_TRIGGER,
+               .procname = "fmr_flush_trigger",
+               .data     = &fmr_flush_trigger,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_FMR_CACHE,
+               .procname = "fmr_cache",
+               .data     = &fmr_cache,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PMR_POOL_SIZE,
+               .procname = "pmr_pool_size",
+               .data     = &pmr_pool_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_DEV_FAILOVER,
+               .procname = "dev_failover",
+               .data     = &dev_failover,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {0}
+};
+
+static ctl_table_t kiblnd_top_ctl_table[] = {
+       {
+               .ctl_name = CTL_O2IBLND,
+               .procname = "o2iblnd",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0555,
+               .child    = kiblnd_ctl_table
+       },
+       {0}
+};
+
+void
+kiblnd_initstrtunable(char *space, char *str, int size)
+{
+       strncpy(space, str, size);
+       space[size-1] = 0;
+}
+
+void
+kiblnd_sysctl_init (void)
+{
+       kiblnd_initstrtunable(ipif_basename_space, ipif_name,
+                             sizeof(ipif_basename_space));
+
+       kiblnd_tunables.kib_sysctl =
+               cfs_register_sysctl_table(kiblnd_top_ctl_table, 0);
+
+       if (kiblnd_tunables.kib_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+       if (kiblnd_tunables.kib_sysctl != NULL)
+               unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
+}
+
+#else
+
+void
+kiblnd_sysctl_init (void)
+{
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+}
+
+#endif
+
+int
+kiblnd_tunables_init (void)
+{
+       if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
+               CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
+                      *kiblnd_tunables.kib_ib_mtu);
+               return -EINVAL;
+       }
+
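+       /* Clamp peer_credits to [IBLND_CREDITS_DEFAULT, IBLND_CREDITS_MAX]
+        * and never let it exceed the global credit pool. */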
+       if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT)
+               *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT;
+
+       if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX)
+               *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX;
+
+       if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits)
+               *kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits;
+
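+       /* Keep the credit high-water mark in [peer_credits / 2, peer_credits - 1]. */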
+       if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2)
+               *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2;
+
+       if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits)
+               *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1;
+
+       if (*kiblnd_tunables.kib_map_on_demand < 0 ||
+           *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS)
+               *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */
+
+       if (*kiblnd_tunables.kib_map_on_demand == 1)
+               *kiblnd_tunables.kib_map_on_demand = 2; /* it makes no sense to create a map for a single fragment */
+
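+       /* If concurrent_sends is unset, derive a default from peer_credits
+        * (doubled when map_on_demand keeps the map small), then clamp it
+        * to [peer_credits / 2, peer_credits * 2] below. */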
+       if (*kiblnd_tunables.kib_concurrent_sends == 0) {
+               if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+                   *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8)
+                       *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2;
+               else
+                       *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits);
+       }
+
+       if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2)
+               *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2;
+
+       if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2)
+               *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2;
+
+       if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) {
+               CWARN("Concurrent sends %d is lower than message queue size: %d, "
+                     "performance may drop slightly.\n",
+                     *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits);
+       }
+
+       kiblnd_sysctl_init();
+       return 0;
+}
+
+void
+kiblnd_tunables_fini (void)
+{
+       kiblnd_sysctl_fini();
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
new file mode 100644 (file)
index 0000000..6494b2b
--- /dev/null
@@ -0,0 +1,7 @@
+obj-$(CONFIG_LNET) += ksocklnd.o
+
+ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
new file mode 100644 (file)
index 0000000..c826bf9
--- /dev/null
@@ -0,0 +1,2902 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "socklnd.h"
+
+lnd_t             the_ksocklnd;
+ksock_nal_data_t       ksocknal_data;
+
+ksock_interface_t *
+ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
+{
+       ksock_net_t       *net = ni->ni_data;
+       int             i;
+       ksock_interface_t *iface;
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               LASSERT(i < LNET_MAX_INTERFACES);
+               iface = &net->ksnn_interfaces[i];
+
+               if (iface->ksni_ipaddr == ip)
+                       return (iface);
+       }
+
+       return (NULL);
+}
+
+ksock_route_t *
+ksocknal_create_route (__u32 ipaddr, int port)
+{
+       ksock_route_t *route;
+
+       LIBCFS_ALLOC (route, sizeof (*route));
+       if (route == NULL)
+               return (NULL);
+
+       atomic_set (&route->ksnr_refcount, 1);
+       route->ksnr_peer = NULL;
+       route->ksnr_retry_interval = 0;  /* OK to connect at any time */
+       route->ksnr_ipaddr = ipaddr;
+       route->ksnr_port = port;
+       route->ksnr_scheduled = 0;
+       route->ksnr_connecting = 0;
+       route->ksnr_connected = 0;
+       route->ksnr_deleted = 0;
+       route->ksnr_conn_count = 0;
+       route->ksnr_share_count = 0;
+
+       return (route);
+}
+
+void
+ksocknal_destroy_route (ksock_route_t *route)
+{
+       LASSERT (atomic_read(&route->ksnr_refcount) == 0);
+
+       if (route->ksnr_peer != NULL)
+               ksocknal_peer_decref(route->ksnr_peer);
+
+       LIBCFS_FREE (route, sizeof (*route));
+}
+
+int
+ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
+{
+       ksock_net_t   *net = ni->ni_data;
+       ksock_peer_t  *peer;
+
+       LASSERT (id.nid != LNET_NID_ANY);
+       LASSERT (id.pid != LNET_PID_ANY);
+       LASSERT (!in_interrupt());
+
+       LIBCFS_ALLOC (peer, sizeof (*peer));
+       if (peer == NULL)
+               return -ENOMEM;
+
+       memset (peer, 0, sizeof (*peer));       /* NULL pointers/clear flags etc */
+
+       peer->ksnp_ni = ni;
+       peer->ksnp_id = id;
+       atomic_set (&peer->ksnp_refcount, 1);   /* 1 ref for caller */
+       peer->ksnp_closing = 0;
+       peer->ksnp_accepting = 0;
+       peer->ksnp_proto = NULL;
+       peer->ksnp_last_alive = 0;
+       peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+       INIT_LIST_HEAD (&peer->ksnp_conns);
+       INIT_LIST_HEAD (&peer->ksnp_routes);
+       INIT_LIST_HEAD (&peer->ksnp_tx_queue);
+       INIT_LIST_HEAD (&peer->ksnp_zc_req_list);
+       spin_lock_init(&peer->ksnp_lock);
+
+       spin_lock_bh(&net->ksnn_lock);
+
+       if (net->ksnn_shutdown) {
+               spin_unlock_bh(&net->ksnn_lock);
+
+               LIBCFS_FREE(peer, sizeof(*peer));
+               CERROR("Can't create peer: network shutdown\n");
+               return -ESHUTDOWN;
+       }
+
+       net->ksnn_npeers++;
+
+       spin_unlock_bh(&net->ksnn_lock);
+
+       *peerp = peer;
+       return 0;
+}
+
+void
+ksocknal_destroy_peer (ksock_peer_t *peer)
+{
+       ksock_net_t    *net = peer->ksnp_ni->ni_data;
+
+       CDEBUG (D_NET, "peer %s %p deleted\n",
+               libcfs_id2str(peer->ksnp_id), peer);
+
+       LASSERT (atomic_read (&peer->ksnp_refcount) == 0);
+       LASSERT (peer->ksnp_accepting == 0);
+       LASSERT (list_empty (&peer->ksnp_conns));
+       LASSERT (list_empty (&peer->ksnp_routes));
+       LASSERT (list_empty (&peer->ksnp_tx_queue));
+       LASSERT (list_empty (&peer->ksnp_zc_req_list));
+
+       LIBCFS_FREE (peer, sizeof (*peer));
+
+       /* NB a peer's connections and routes keep a reference on their peer
+        * until they are destroyed, so we can be assured that _all_ state to
+        * do with this peer has been cleaned up when its refcount drops to
+        * zero. */
+       spin_lock_bh(&net->ksnn_lock);
+       net->ksnn_npeers--;
+       spin_unlock_bh(&net->ksnn_lock);
+}
+
+ksock_peer_t *
+ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id)
+{
+       struct list_head       *peer_list = ksocknal_nid2peerlist(id.nid);
+       struct list_head       *tmp;
+       ksock_peer_t     *peer;
+
+       list_for_each (tmp, peer_list) {
+
+               peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+               LASSERT (!peer->ksnp_closing);
+
+               if (peer->ksnp_ni != ni)
+                       continue;
+
+               if (peer->ksnp_id.nid != id.nid ||
+                   peer->ksnp_id.pid != id.pid)
+                       continue;
+
+               CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+                      peer, libcfs_id2str(id),
+                      atomic_read(&peer->ksnp_refcount));
+               return (peer);
+       }
+       return (NULL);
+}
+
+ksock_peer_t *
+ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id)
+{
+       ksock_peer_t     *peer;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+       peer = ksocknal_find_peer_locked(ni, id);
+       if (peer != NULL)                       /* +1 ref for caller? */
+               ksocknal_peer_addref(peer);
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       return (peer);
+}
+
+void
+ksocknal_unlink_peer_locked (ksock_peer_t *peer)
+{
+       int             i;
+       __u32         ip;
+       ksock_interface_t *iface;
+
+       for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
+               LASSERT (i < LNET_MAX_INTERFACES);
+               ip = peer->ksnp_passive_ips[i];
+
+               iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+               /* All IPs in peer->ksnp_passive_ips[] come from the
+                * interface list, therefore the call must succeed. */
+               LASSERT (iface != NULL);
+
+               CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
+                      peer, iface, iface->ksni_nroutes);
+               iface->ksni_npeers--;
+       }
+
+       LASSERT (list_empty(&peer->ksnp_conns));
+       LASSERT (list_empty(&peer->ksnp_routes));
+       LASSERT (!peer->ksnp_closing);
+       peer->ksnp_closing = 1;
+       list_del (&peer->ksnp_list);
+       /* lose peerlist's ref */
+       ksocknal_peer_decref(peer);
+}
+
+int
+ksocknal_get_peer_info (lnet_ni_t *ni, int index,
+                       lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip,
+                       int *port, int *conn_count, int *share_count)
+{
+       ksock_peer_t      *peer;
+       struct list_head        *ptmp;
+       ksock_route_t     *route;
+       struct list_head        *rtmp;
+       int             i;
+       int             j;
+       int             rc = -ENOENT;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+
+               list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+                       peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                       if (peer->ksnp_ni != ni)
+                               continue;
+
+                       if (peer->ksnp_n_passive_ips == 0 &&
+                           list_empty(&peer->ksnp_routes)) {
+                               if (index-- > 0)
+                                       continue;
+
+                               *id = peer->ksnp_id;
+                               *myip = 0;
+                               *peer_ip = 0;
+                               *port = 0;
+                               *conn_count = 0;
+                               *share_count = 0;
+                               rc = 0;
+                               goto out;
+                       }
+
+                       for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
+                               if (index-- > 0)
+                                       continue;
+
+                               *id = peer->ksnp_id;
+                               *myip = peer->ksnp_passive_ips[j];
+                               *peer_ip = 0;
+                               *port = 0;
+                               *conn_count = 0;
+                               *share_count = 0;
+                               rc = 0;
+                               goto out;
+                       }
+
+                       list_for_each (rtmp, &peer->ksnp_routes) {
+                               if (index-- > 0)
+                                       continue;
+
+                               route = list_entry(rtmp, ksock_route_t,
+                                                      ksnr_list);
+
+                               *id = peer->ksnp_id;
+                               *myip = route->ksnr_myipaddr;
+                               *peer_ip = route->ksnr_ipaddr;
+                               *port = route->ksnr_port;
+                               *conn_count = route->ksnr_conn_count;
+                               *share_count = route->ksnr_share_count;
+                               rc = 0;
+                               goto out;
+                       }
+               }
+       }
+ out:
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+       return (rc);
+}
+
+void
+ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
+{
+       ksock_peer_t      *peer = route->ksnr_peer;
+       int             type = conn->ksnc_type;
+       ksock_interface_t *iface;
+
+       conn->ksnc_route = route;
+       ksocknal_route_addref(route);
+
+       if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+               if (route->ksnr_myipaddr == 0) {
+                       /* route wasn't bound locally yet (the initial route) */
+                       CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+                              libcfs_id2str(peer->ksnp_id),
+                              HIPQUAD(route->ksnr_ipaddr),
+                              HIPQUAD(conn->ksnc_myipaddr));
+               } else {
+                       CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+                              "%u.%u.%u.%u to %u.%u.%u.%u\n",
+                              libcfs_id2str(peer->ksnp_id),
+                              HIPQUAD(route->ksnr_ipaddr),
+                              HIPQUAD(route->ksnr_myipaddr),
+                              HIPQUAD(conn->ksnc_myipaddr));
+
+                       iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                                 route->ksnr_myipaddr);
+                       if (iface != NULL)
+                               iface->ksni_nroutes--;
+               }
+               route->ksnr_myipaddr = conn->ksnc_myipaddr;
+               iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                         route->ksnr_myipaddr);
+               if (iface != NULL)
+                       iface->ksni_nroutes++;
+       }
+
+       route->ksnr_connected |= (1<<type);
+       route->ksnr_conn_count++;
+
+       /* Successful connection => further attempts can
+        * proceed immediately */
+       route->ksnr_retry_interval = 0;
+}
+
+void
+ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
+{
+       struct list_head        *tmp;
+       ksock_conn_t      *conn;
+       ksock_route_t     *route2;
+
+       LASSERT (!peer->ksnp_closing);
+       LASSERT (route->ksnr_peer == NULL);
+       LASSERT (!route->ksnr_scheduled);
+       LASSERT (!route->ksnr_connecting);
+       LASSERT (route->ksnr_connected == 0);
+
+       /* LASSERT(unique) */
+       list_for_each(tmp, &peer->ksnp_routes) {
+               route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+               if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
+                       CERROR ("Duplicate route %s %u.%u.%u.%u\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(route->ksnr_ipaddr));
+                       LBUG();
+               }
+       }
+
+       route->ksnr_peer = peer;
+       ksocknal_peer_addref(peer);
+       /* peer's routelist takes over my ref on 'route' */
+       list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+
+       list_for_each(tmp, &peer->ksnp_conns) {
+               conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+               if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
+                       continue;
+
+               ksocknal_associate_route_conn_locked(route, conn);
+               /* keep going (typed routes) */
+       }
+}
+
+void
+ksocknal_del_route_locked (ksock_route_t *route)
+{
+       ksock_peer_t      *peer = route->ksnr_peer;
+       ksock_interface_t *iface;
+       ksock_conn_t      *conn;
+       struct list_head        *ctmp;
+       struct list_head        *cnxt;
+
+       LASSERT (!route->ksnr_deleted);
+
+       /* Close associated conns */
+       list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+               conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+
+               if (conn->ksnc_route != route)
+                       continue;
+
+               ksocknal_close_conn_locked (conn, 0);
+       }
+
+       if (route->ksnr_myipaddr != 0) {
+               iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                         route->ksnr_myipaddr);
+               if (iface != NULL)
+                       iface->ksni_nroutes--;
+       }
+
+       route->ksnr_deleted = 1;
+       list_del (&route->ksnr_list);
+       ksocknal_route_decref(route);        /* drop peer's ref */
+
+       if (list_empty (&peer->ksnp_routes) &&
+           list_empty (&peer->ksnp_conns)) {
+               /* I've just removed the last route to a peer with no active
+                * connections */
+               ksocknal_unlink_peer_locked (peer);
+       }
+}
+
+int
+ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
+{
+       struct list_head        *tmp;
+       ksock_peer_t      *peer;
+       ksock_peer_t      *peer2;
+       ksock_route_t     *route;
+       ksock_route_t     *route2;
+       int             rc;
+
+       if (id.nid == LNET_NID_ANY ||
+           id.pid == LNET_PID_ANY)
+               return (-EINVAL);
+
+       /* Have a brand new peer ready... */
+       rc = ksocknal_create_peer(&peer, ni, id);
+       if (rc != 0)
+               return rc;
+
+       route = ksocknal_create_route (ipaddr, port);
+       if (route == NULL) {
+               ksocknal_peer_decref(peer);
+               return (-ENOMEM);
+       }
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       /* always called with a ref on ni, so shutdown can't have started */
+       LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+       peer2 = ksocknal_find_peer_locked (ni, id);
+       if (peer2 != NULL) {
+               ksocknal_peer_decref(peer);
+               peer = peer2;
+       } else {
+               /* peer table takes my ref on peer */
+               list_add_tail (&peer->ksnp_list,
+                                  ksocknal_nid2peerlist (id.nid));
+       }
+
+       route2 = NULL;
+       list_for_each (tmp, &peer->ksnp_routes) {
+               route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+               if (route2->ksnr_ipaddr == ipaddr)
+                       break;
+
+               route2 = NULL;
+       }
+       if (route2 == NULL) {
+               ksocknal_add_route_locked(peer, route);
+               route->ksnr_share_count++;
+       } else {
+               ksocknal_route_decref(route);
+               route2->ksnr_share_count++;
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       return (0);
+}
+
+void
+ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
+{
+       ksock_conn_t     *conn;
+       ksock_route_t    *route;
+       struct list_head       *tmp;
+       struct list_head       *nxt;
+       int            nshared;
+
+       LASSERT (!peer->ksnp_closing);
+
+       /* Extra ref prevents peer disappearing until I'm done with it */
+       ksocknal_peer_addref(peer);
+
+       list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+               route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+               /* no match */
+               if (!(ip == 0 || route->ksnr_ipaddr == ip))
+                       continue;
+
+               route->ksnr_share_count = 0;
+               /* This deletes associated conns too */
+               ksocknal_del_route_locked (route);
+       }
+
+       nshared = 0;
+       list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+               route = list_entry(tmp, ksock_route_t, ksnr_list);
+               nshared += route->ksnr_share_count;
+       }
+
+       if (nshared == 0) {
+               /* remove everything else if there are no explicit entries
+                * left */
+
+               list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+                       route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                       /* we should only be removing auto-entries */
+                       LASSERT(route->ksnr_share_count == 0);
+                       ksocknal_del_route_locked (route);
+               }
+
+               list_for_each_safe (tmp, nxt, &peer->ksnp_conns) {
+                       conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                       ksocknal_close_conn_locked(conn, 0);
+               }
+       }
+
+       ksocknal_peer_decref(peer);
+       /* NB peer unlinks itself when last conn/route is removed */
+}
+
+int
+ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
+{
+       LIST_HEAD     (zombies);
+       struct list_head        *ptmp;
+       struct list_head        *pnxt;
+       ksock_peer_t      *peer;
+       int             lo;
+       int             hi;
+       int             i;
+       int             rc = -ENOENT;
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       if (id.nid != LNET_NID_ANY)
+               lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+       else {
+               lo = 0;
+               hi = ksocknal_data.ksnd_peer_hash_size - 1;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe (ptmp, pnxt,
+                                       &ksocknal_data.ksnd_peers[i]) {
+                       peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                       if (peer->ksnp_ni != ni)
+                               continue;
+
+                       if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
+                             (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
+                               continue;
+
+                       ksocknal_peer_addref(peer);     /* a ref for me... */
+
+                       ksocknal_del_peer_locked (peer, ip);
+
+                       if (peer->ksnp_closing &&
+                           !list_empty(&peer->ksnp_tx_queue)) {
+                               LASSERT (list_empty(&peer->ksnp_conns));
+                               LASSERT (list_empty(&peer->ksnp_routes));
+
+                               list_splice_init(&peer->ksnp_tx_queue,
+                                                    &zombies);
+                       }
+
+                       ksocknal_peer_decref(peer);     /* ...till here */
+
+                       rc = 0;          /* matched! */
+               }
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       ksocknal_txlist_done(ni, &zombies, 1);
+
+       return (rc);
+}
+
+ksock_conn_t *
+ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+       ksock_peer_t      *peer;
+       struct list_head        *ptmp;
+       ksock_conn_t      *conn;
+       struct list_head        *ctmp;
+       int             i;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+               list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+                       peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                       LASSERT (!peer->ksnp_closing);
+
+                       if (peer->ksnp_ni != ni)
+                               continue;
+
+                       list_for_each (ctmp, &peer->ksnp_conns) {
+                               if (index-- > 0)
+                                       continue;
+
+                               conn = list_entry (ctmp, ksock_conn_t,
+                                                      ksnc_list);
+                               ksocknal_conn_addref(conn);
+                               read_unlock(&ksocknal_data.ksnd_global_lock);
+                               return (conn);
+                       }
+               }
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+       return (NULL);
+}
+
+ksock_sched_t *
+ksocknal_choose_scheduler_locked(unsigned int cpt)
+{
+       struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt];
+       ksock_sched_t           *sched;
+       int                     i;
+
+       LASSERT(info->ksi_nthreads > 0);
+
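+       /* Pick the scheduler on this CPT with the fewest connections. */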
+       sched = &info->ksi_scheds[0];
+       /*
+        * NB: it's safe so far, but info->ksi_nthreads could be changed
+        * at runtime when we have dynamic LNet configuration, then we
+        * need to take care of this.
+        */
+       for (i = 1; i < info->ksi_nthreads; i++) {
+               if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
+                       sched = &info->ksi_scheds[i];
+       }
+
+       return sched;
+}
+
+int
+ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs)
+{
+       ksock_net_t       *net = ni->ni_data;
+       int             i;
+       int             nip;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       nip = net->ksnn_ninterfaces;
+       LASSERT (nip <= LNET_MAX_INTERFACES);
+
+       /* Only offer interfaces for additional connections if I have
+        * more than one. */
+       if (nip < 2) {
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+               return 0;
+       }
+
+       for (i = 0; i < nip; i++) {
+               ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
+               LASSERT (ipaddrs[i] != 0);
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+       return (nip);
+}
+
+int
+ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
+{
+       int   best_netmatch = 0;
+       int   best_xor      = 0;
+       int   best        = -1;
+       int   this_xor;
+       int   this_netmatch;
+       int   i;
+
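+       /* Choose the peer IP that best matches this interface: prefer an
+        * address on the same subnet, break ties by smallest XOR distance. */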
+       for (i = 0; i < nips; i++) {
+               if (ips[i] == 0)
+                       continue;
+
+               this_xor = (ips[i] ^ iface->ksni_ipaddr);
+               this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+               if (!(best < 0 ||
+                     best_netmatch < this_netmatch ||
+                     (best_netmatch == this_netmatch &&
+                      best_xor > this_xor)))
+                       continue;
+
+               best = i;
+               best_netmatch = this_netmatch;
+               best_xor = this_xor;
+       }
+
+       LASSERT (best >= 0);
+       return (best);
+}
+
+int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+       rwlock_t                *global_lock = &ksocknal_data.ksnd_global_lock;
+       ksock_net_t     *net = peer->ksnp_ni->ni_data;
+       ksock_interface_t  *iface;
+       ksock_interface_t  *best_iface;
+       int              n_ips;
+       int              i;
+       int              j;
+       int              k;
+       __u32          ip;
+       __u32          xor;
+       int              this_netmatch;
+       int              best_netmatch;
+       int              best_npeers;
+
+       /* CAVEAT EMPTOR: We do all our interface matching with an
+        * exclusive hold of global lock at IRQ priority.  We're only
+        * expecting to be dealing with small numbers of interfaces, so the
+        * O(n**3)-ness shouldn't matter */
+
+       /* Also note that I'm not going to return more than n_peerips
+        * interfaces, even if I have more myself */
+
+       write_lock_bh(global_lock);
+
+       LASSERT (n_peerips <= LNET_MAX_INTERFACES);
+       LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+       /* Only match interfaces for additional connections
+        * if I have > 1 interface */
+       n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
+               MIN(n_peerips, net->ksnn_ninterfaces);
+
+       for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+               /*            ^ yes really... */
+
+               /* If we have any new interfaces, first tick off all the
+                * peer IPs that match old interfaces, then choose new
+                * interfaces to match the remaining peer IPs.
+                * We don't forget interfaces we've stopped using; we might
+                * start using them again... */
+
+               if (i < peer->ksnp_n_passive_ips) {
+                       /* Old interface. */
+                       ip = peer->ksnp_passive_ips[i];
+                       best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+
+                       /* peer passive ips are kept up to date */
+                       LASSERT(best_iface != NULL);
+               } else {
+                       /* choose a new interface */
+                       LASSERT (i == peer->ksnp_n_passive_ips);
+
+                       best_iface = NULL;
+                       best_netmatch = 0;
+                       best_npeers = 0;
+
+                       for (j = 0; j < net->ksnn_ninterfaces; j++) {
+                               iface = &net->ksnn_interfaces[j];
+                               ip = iface->ksni_ipaddr;
+
+                               for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+                                       if (peer->ksnp_passive_ips[k] == ip)
+                                               break;
+
+                               if (k < peer->ksnp_n_passive_ips) /* using it already */
+                                       continue;
+
+                               k = ksocknal_match_peerip(iface, peerips, n_peerips);
+                               xor = (ip ^ peerips[k]);
+                               this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+                               if (!(best_iface == NULL ||
+                                     best_netmatch < this_netmatch ||
+                                     (best_netmatch == this_netmatch &&
+                                      best_npeers > iface->ksni_npeers)))
+                                       continue;
+
+                               best_iface = iface;
+                               best_netmatch = this_netmatch;
+                               best_npeers = iface->ksni_npeers;
+                       }
+
+                       best_iface->ksni_npeers++;
+                       ip = best_iface->ksni_ipaddr;
+                       peer->ksnp_passive_ips[i] = ip;
+                       peer->ksnp_n_passive_ips = i+1;
+               }
+
+               LASSERT (best_iface != NULL);
+
+               /* mark the best matching peer IP used */
+               j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+               peerips[j] = 0;
+       }
+
+       /* Overwrite input peer IP addresses */
+       memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+
+       write_unlock_bh(global_lock);
+
+       return (n_ips);
+}
+
+void
+ksocknal_create_routes(ksock_peer_t *peer, int port,
+                      __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+       ksock_route_t       *newroute = NULL;
+       rwlock_t                *global_lock = &ksocknal_data.ksnd_global_lock;
+       lnet_ni_t          *ni = peer->ksnp_ni;
+       ksock_net_t      *net = ni->ni_data;
+       struct list_head          *rtmp;
+       ksock_route_t       *route;
+       ksock_interface_t   *iface;
+       ksock_interface_t   *best_iface;
+       int               best_netmatch;
+       int               this_netmatch;
+       int               best_nroutes;
+       int               i;
+       int               j;
+
+       /* CAVEAT EMPTOR: We do all our interface matching with an
+        * exclusive hold of global lock at IRQ priority.  We're only
+        * expecting to be dealing with small numbers of interfaces, so the
+        * O(n**3)-ness here shouldn't matter */
+
+       write_lock_bh(global_lock);
+
+       if (net->ksnn_ninterfaces < 2) {
+               /* Only create additional connections
+                * if I have > 1 interface */
+               write_unlock_bh(global_lock);
+               return;
+       }
+
+       LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES);
+
+       for (i = 0; i < npeer_ipaddrs; i++) {
+               if (newroute != NULL) {
+                       newroute->ksnr_ipaddr = peer_ipaddrs[i];
+               } else {
+                       write_unlock_bh(global_lock);
+
+                       newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+                       if (newroute == NULL)
+                               return;
+
+                       write_lock_bh(global_lock);
+               }
+
+               if (peer->ksnp_closing) {
+                       /* peer got closed under me */
+                       break;
+               }
+
+               /* Already got a route? */
+               route = NULL;
+               list_for_each(rtmp, &peer->ksnp_routes) {
+                       route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+                       if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+                               break;
+
+                       route = NULL;
+               }
+               if (route != NULL)
+                       continue;
+
+               best_iface = NULL;
+               best_nroutes = 0;
+               best_netmatch = 0;
+
+               LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+               /* Select interface to connect from */
+               for (j = 0; j < net->ksnn_ninterfaces; j++) {
+                       iface = &net->ksnn_interfaces[j];
+
+                       /* Using this interface already? */
+                       list_for_each(rtmp, &peer->ksnp_routes) {
+                               route = list_entry(rtmp, ksock_route_t,
+                                                      ksnr_list);
+
+                               if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+                                       break;
+
+                               route = NULL;
+                       }
+                       if (route != NULL)
+                               continue;
+
+                       this_netmatch = (((iface->ksni_ipaddr ^
+                                          newroute->ksnr_ipaddr) &
+                                          iface->ksni_netmask) == 0) ? 1 : 0;
+
+                       if (!(best_iface == NULL ||
+                             best_netmatch < this_netmatch ||
+                             (best_netmatch == this_netmatch &&
+                              best_nroutes > iface->ksni_nroutes)))
+                               continue;
+
+                       best_iface = iface;
+                       best_netmatch = this_netmatch;
+                       best_nroutes = iface->ksni_nroutes;
+               }
+
+               if (best_iface == NULL)
+                       continue;
+
+               newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+               best_iface->ksni_nroutes++;
+
+               ksocknal_add_route_locked(peer, newroute);
+               newroute = NULL;
+       }
+
+       write_unlock_bh(global_lock);
+       if (newroute != NULL)
+               ksocknal_route_decref(newroute);
+}
+
+int
+ksocknal_accept (lnet_ni_t *ni, socket_t *sock)
+{
+       ksock_connreq_t    *cr;
+       int              rc;
+       __u32          peer_ip;
+       int              peer_port;
+
+       rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+       LASSERT (rc == 0);                    /* we succeeded before */
+
+       LIBCFS_ALLOC(cr, sizeof(*cr));
+       if (cr == NULL) {
+               LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from "
+                                  "%u.%u.%u.%u: memory exhausted\n",
+                                  HIPQUAD(peer_ip));
+               return -ENOMEM;
+       }
+
+       lnet_ni_addref(ni);
+       cr->ksncr_ni   = ni;
+       cr->ksncr_sock = sock;
+
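+       /* Queue the request for a connd thread and wake one up to
+        * complete the handshake. */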
+       spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+       list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
+       wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+       spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+       return 0;
+}
+
+int
+ksocknal_connecting (ksock_peer_t *peer, __u32 ipaddr)
+{
+       ksock_route_t   *route;
+
+       list_for_each_entry (route, &peer->ksnp_routes, ksnr_list) {
+
+               if (route->ksnr_ipaddr == ipaddr)
+                       return route->ksnr_connecting;
+       }
+       return 0;
+}
+
+int
+ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+                     socket_t *sock, int type)
+{
+       rwlock_t                *global_lock = &ksocknal_data.ksnd_global_lock;
+       LIST_HEAD     (zombies);
+       lnet_process_id_t  peerid;
+       struct list_head        *tmp;
+       __u64         incarnation;
+       ksock_conn_t      *conn;
+       ksock_conn_t      *conn2;
+       ksock_peer_t      *peer = NULL;
+       ksock_peer_t      *peer2;
+       ksock_sched_t     *sched;
+       ksock_hello_msg_t *hello;
+       int                cpt;
+       ksock_tx_t      *tx;
+       ksock_tx_t      *txtmp;
+       int             rc;
+       int             active;
+       char          *warn = NULL;
+
+       active = (route != NULL);
+
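+       /* 'active': we initiated this connection via a route; otherwise it
+        * was accepted passively and the peer is discovered from its HELLO. */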
+       LASSERT (active == (type != SOCKLND_CONN_NONE));
+
+       LIBCFS_ALLOC(conn, sizeof(*conn));
+       if (conn == NULL) {
+               rc = -ENOMEM;
+               goto failed_0;
+       }
+
+       memset (conn, 0, sizeof (*conn));
+
+       conn->ksnc_peer = NULL;
+       conn->ksnc_route = NULL;
+       conn->ksnc_sock = sock;
+       /* 2 refs: 1 for the conn itself, plus an extra ref that prevents the
+        * socket being closed before the connection is established */
+       atomic_set (&conn->ksnc_sock_refcount, 2);
+       conn->ksnc_type = type;
+       ksocknal_lib_save_callback(sock, conn);
+       atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+       conn->ksnc_rx_ready = 0;
+       conn->ksnc_rx_scheduled = 0;
+
+       INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+       conn->ksnc_tx_ready = 0;
+       conn->ksnc_tx_scheduled = 0;
+       conn->ksnc_tx_carrier = NULL;
+       atomic_set (&conn->ksnc_tx_nob, 0);
+
+       LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t,
+                                    kshm_ips[LNET_MAX_INTERFACES]));
+       if (hello == NULL) {
+               rc = -ENOMEM;
+               goto failed_1;
+       }
+
+       /* stash conn's local and remote addrs */
+       rc = ksocknal_lib_get_conn_addrs (conn);
+       if (rc != 0)
+               goto failed_1;
+
+       /* Find out/confirm peer's NID and connection type and get the
+        * vector of interfaces she's willing to let me connect to.
+        * Passive connections use the listener timeout since the peer sends
+        * eagerly */
+
+       if (active) {
+               peer = route->ksnr_peer;
+               LASSERT(ni == peer->ksnp_ni);
+
+               /* Active connection sends HELLO eagerly */
+               hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
+               peerid = peer->ksnp_id;
+
+               write_lock_bh(global_lock);
+               conn->ksnc_proto = peer->ksnp_proto;
+               write_unlock_bh(global_lock);
+
+               if (conn->ksnc_proto == NULL) {
+                        conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+                        if (*ksocknal_tunables.ksnd_protocol == 2)
+                                conn->ksnc_proto = &ksocknal_protocol_v2x;
+                        else if (*ksocknal_tunables.ksnd_protocol == 1)
+                                conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+               }
+
+               rc = ksocknal_send_hello (ni, conn, peerid.nid, hello);
+               if (rc != 0)
+                       goto failed_1;
+       } else {
+               peerid.nid = LNET_NID_ANY;
+               peerid.pid = LNET_PID_ANY;
+
+               /* Passive, get protocol from peer */
+               conn->ksnc_proto = NULL;
+       }
+
+       rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation);
+       if (rc < 0)
+               goto failed_1;
+
+       LASSERT (rc == 0 || active);
+       LASSERT (conn->ksnc_proto != NULL);
+       LASSERT (peerid.nid != LNET_NID_ANY);
+
+       cpt = lnet_cpt_of_nid(peerid.nid);
+
+       if (active) {
+               ksocknal_peer_addref(peer);
+               write_lock_bh(global_lock);
+       } else {
+               rc = ksocknal_create_peer(&peer, ni, peerid);
+               if (rc != 0)
+                       goto failed_1;
+
+               write_lock_bh(global_lock);
+
+               /* called with a ref on ni, so shutdown can't have started */
+               LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+               peer2 = ksocknal_find_peer_locked(ni, peerid);
+               if (peer2 == NULL) {
+                       /* NB this puts an "empty" peer in the peer
+                        * table (which takes my ref) */
+                       list_add_tail(&peer->ksnp_list,
+                                         ksocknal_nid2peerlist(peerid.nid));
+               } else {
+                       ksocknal_peer_decref(peer);
+                       peer = peer2;
+               }
+
+               /* +1 ref for me */
+               ksocknal_peer_addref(peer);
+               peer->ksnp_accepting++;
+
+               /* Am I already connecting to this guy?  Resolve in
+                * favour of higher NID... */
+               if (peerid.nid < ni->ni_nid &&
+                   ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
+                       rc = EALREADY;
+                       warn = "connection race resolution";
+                       goto failed_2;
+               }
+       }
+
+       if (peer->ksnp_closing ||
+           (active && route->ksnr_deleted)) {
+               /* peer/route got closed under me */
+               rc = -ESTALE;
+               warn = "peer/route removed";
+               goto failed_2;
+       }
+
+       if (peer->ksnp_proto == NULL) {
+               /* Never connected before.
+                * NB recv_hello may have returned EPROTO to signal my peer
+                * wants a different protocol than the one I asked for.
+                */
+               LASSERT (list_empty(&peer->ksnp_conns));
+
+               peer->ksnp_proto = conn->ksnc_proto;
+               peer->ksnp_incarnation = incarnation;
+       }
+
+       if (peer->ksnp_proto != conn->ksnc_proto ||
+           peer->ksnp_incarnation != incarnation) {
+               /* Peer rebooted or I've got the wrong protocol version */
+               ksocknal_close_peer_conns_locked(peer, 0, 0);
+
+               peer->ksnp_proto = NULL;
+               rc = ESTALE;
+               warn = peer->ksnp_incarnation != incarnation ?
+                      "peer rebooted" :
+                      "wrong proto version";
+               goto failed_2;
+       }
+
+       switch (rc) {
+       default:
+               LBUG();
+       case 0:
+               break;
+       case EALREADY:
+               warn = "lost conn race";
+               goto failed_2;
+       case EPROTO:
+               warn = "retry with different protocol version";
+               goto failed_2;
+       }
+
+       /* Refuse to duplicate an existing connection, unless this is a
+        * loopback connection */
+       if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+               list_for_each(tmp, &peer->ksnp_conns) {
+                       conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                       if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+                           conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+                           conn2->ksnc_type != conn->ksnc_type)
+                               continue;
+
+                       /* Reply on a passive connection attempt so the peer
+                        * realises we're connected. */
+                       LASSERT (rc == 0);
+                       if (!active)
+                               rc = EALREADY;
+
+                       warn = "duplicate";
+                       goto failed_2;
+               }
+       }
+
+       /* If the connection created by this route didn't bind to the IP
+        * address the route connected to, the connection/route matching
+        * code below probably isn't going to work. */
+       if (active &&
+           route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+               CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+                      libcfs_id2str(peer->ksnp_id),
+                      HIPQUAD(route->ksnr_ipaddr),
+                      HIPQUAD(conn->ksnc_ipaddr));
+       }
+
+       /* Search for a route corresponding to the new connection and
+        * create an association.  This allows incoming connections created
+        * by routes in my peer to match my own route entries so I don't
+        * continually create duplicate routes. */
+       list_for_each (tmp, &peer->ksnp_routes) {
+               route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+               if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+                       continue;
+
+               ksocknal_associate_route_conn_locked(route, conn);
+               break;
+       }
+
+       conn->ksnc_peer = peer;          /* conn takes my ref on peer */
+       peer->ksnp_last_alive = cfs_time_current();
+       peer->ksnp_send_keepalive = 0;
+       peer->ksnp_error = 0;
+
+       sched = ksocknal_choose_scheduler_locked(cpt);
+       sched->kss_nconns++;
+       conn->ksnc_scheduler = sched;
+
+       conn->ksnc_tx_last_post = cfs_time_current();
+       /* Set the deadline for the outgoing HELLO to drain */
+       conn->ksnc_tx_bufnob = cfs_sock_wmem_queued(sock);
+       conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+       mb();   /* order with adding to peer's conn list */
+
+       list_add (&conn->ksnc_list, &peer->ksnp_conns);
+       ksocknal_conn_addref(conn);
+
+       ksocknal_new_packet(conn, 0);
+
+       conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
+
+       /* Take packets blocking for this connection. */
+       list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
+               if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO)
+                       continue;
+
+               list_del (&tx->tx_list);
+               ksocknal_queue_tx_locked (tx, conn);
+       }
+
+       write_unlock_bh(global_lock);
+
+       /* We've now got a new connection.  Any errors from here on are just
+        * like "normal" comms errors and we close the connection normally.
+        * NB (a) we still have to send the reply HELLO for passive
+        *      connections,
+        *    (b) normal I/O on the conn is blocked until I setup and call the
+        *      socket callbacks.
+        */
+
+       CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+              " incarnation:"LPD64" sched[%d:%d]\n",
+              libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+              HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
+              conn->ksnc_port, incarnation, cpt,
+              (int)(sched - &sched->kss_info->ksi_scheds[0]));
+
+       if (active) {
+               /* additional routes after interface exchange? */
+               ksocknal_create_routes(peer, conn->ksnc_port,
+                                      hello->kshm_ips, hello->kshm_nips);
+       } else {
+               hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+                                                      hello->kshm_nips);
+               rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+       }
+
+       LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+                                   kshm_ips[LNET_MAX_INTERFACES]));
+
+       /* setup the socket AFTER I've received hello (it disables
+        * SO_LINGER).  I might call back to the acceptor who may want
+        * to send a protocol version response and then close the
+        * socket; this ensures the socket only tears down after the
+        * response has been sent. */
+       if (rc == 0)
+               rc = ksocknal_lib_setup_sock(sock);
+
+       write_lock_bh(global_lock);
+
+       /* NB my callbacks block while I hold ksnd_global_lock */
+       ksocknal_lib_set_callback(sock, conn);
+
+       if (!active)
+               peer->ksnp_accepting--;
+
+       write_unlock_bh(global_lock);
+
+       if (rc != 0) {
+               write_lock_bh(global_lock);
+               if (!conn->ksnc_closing) {
+                       /* could be closed by another thread */
+                       ksocknal_close_conn_locked(conn, rc);
+               }
+               write_unlock_bh(global_lock);
+       } else if (ksocknal_connsock_addref(conn) == 0) {
+               /* Allow I/O to proceed. */
+               ksocknal_read_callback(conn);
+               ksocknal_write_callback(conn);
+               ksocknal_connsock_decref(conn);
+       }
+
+       ksocknal_connsock_decref(conn);
+       ksocknal_conn_decref(conn);
+       return rc;
+
+ failed_2:
+       if (!peer->ksnp_closing &&
+           list_empty (&peer->ksnp_conns) &&
+           list_empty (&peer->ksnp_routes)) {
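+               /* take over the peer's queued txs so they can be finalized
+                * as zombies below, then unlink the now-idle peer */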
+               list_add(&zombies, &peer->ksnp_tx_queue);
+               list_del_init(&peer->ksnp_tx_queue);
+               ksocknal_unlink_peer_locked(peer);
+       }
+
+       write_unlock_bh(global_lock);
+
+       if (warn != NULL) {
+               if (rc < 0)
+                       CERROR("Not creating conn %s type %d: %s\n",
+                              libcfs_id2str(peerid), conn->ksnc_type, warn);
+               else
+                       CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+                             libcfs_id2str(peerid), conn->ksnc_type, warn);
+       }
+
+       if (!active) {
+               if (rc > 0) {
+                       /* Request retry by replying with CONN_NONE;
+                        * ksnc_proto has already been set */
+                       conn->ksnc_type = SOCKLND_CONN_NONE;
+                       hello->kshm_nips = 0;
+                       ksocknal_send_hello(ni, conn, peerid.nid, hello);
+               }
+
+               write_lock_bh(global_lock);
+               peer->ksnp_accepting--;
+               write_unlock_bh(global_lock);
+       }
+
+       ksocknal_txlist_done(ni, &zombies, 1);
+       ksocknal_peer_decref(peer);
+
+ failed_1:
+       if (hello != NULL)
+               LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+                                           kshm_ips[LNET_MAX_INTERFACES]));
+
+       LIBCFS_FREE (conn, sizeof(*conn));
+
+ failed_0:
+       libcfs_sock_release(sock);
+       return rc;
+}
+
+void
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
+{
+       /* This just does the immediate housekeeping, and queues the
+        * connection for the reaper to terminate.
+        * Caller holds ksnd_global_lock exclusively in irq context */
+       ksock_peer_t      *peer = conn->ksnc_peer;
+       ksock_route_t     *route;
+       ksock_conn_t      *conn2;
+       struct list_head        *tmp;
+
+       LASSERT (peer->ksnp_error == 0);
+       LASSERT (!conn->ksnc_closing);
+       conn->ksnc_closing = 1;
+
+       /* ksnd_deathrow_conns takes over peer's ref */
+       list_del (&conn->ksnc_list);
+
+       route = conn->ksnc_route;
+       if (route != NULL) {
+               /* dissociate conn from route... */
+               LASSERT (!route->ksnr_deleted);
+               LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+               conn2 = NULL;
+               list_for_each(tmp, &peer->ksnp_conns) {
+                       conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                       if (conn2->ksnc_route == route &&
+                           conn2->ksnc_type == conn->ksnc_type)
+                               break;
+
+                       conn2 = NULL;
+               }
+               if (conn2 == NULL)
+                       route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+               conn->ksnc_route = NULL;
+
+#if 0     /* irrelevant with only eager routes */
+               /* make route least favourite */
+               list_del (&route->ksnr_list);
+               list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
+#endif
+               ksocknal_route_decref(route);     /* drop conn's ref on route */
+       }
+
+       if (list_empty (&peer->ksnp_conns)) {
+               /* No more connections to this peer */
+
+               if (!list_empty(&peer->ksnp_tx_queue)) {
+                       ksock_tx_t *tx;
+
+                       LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+                       /* throw them to the last connection...,
+                        * these TXs will be sent to /dev/null by the scheduler */
+                       list_for_each_entry(tx, &peer->ksnp_tx_queue,
+                                               tx_list)
+                               ksocknal_tx_prep(conn, tx);
+
+                       spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+                       list_splice_init(&peer->ksnp_tx_queue,
+                                            &conn->ksnc_tx_queue);
+                       spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+               }
+
+               peer->ksnp_proto = NULL;        /* renegotiate protocol version */
+               peer->ksnp_error = error;       /* stash last conn close reason */
+
+               if (list_empty (&peer->ksnp_routes)) {
+                       /* I've just closed last conn belonging to a
+                        * peer with no routes to it */
+                       ksocknal_unlink_peer_locked (peer);
+               }
+       }
+
+       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+       list_add_tail(&conn->ksnc_list,
+                         &ksocknal_data.ksnd_deathrow_conns);
+       wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed (ksock_peer_t *peer)
+{
+       int     notify = 0;
+       cfs_time_t last_alive = 0;
+
+       /* There has been a connection failure or comms error; but I'll only
+        * tell LNET I think the peer is dead if it's to another kernel and
+        * there are no connections or connection attempts in existence. */
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+           list_empty(&peer->ksnp_conns) &&
+           peer->ksnp_accepting == 0 &&
+           ksocknal_find_connecting_route_locked(peer) == NULL) {
+               notify = 1;
+               last_alive = peer->ksnp_last_alive;
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       if (notify)
+               lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0,
+                            last_alive);
+}
+
+void
+ksocknal_finalize_zcreq(ksock_conn_t *conn)
+{
+       ksock_peer_t     *peer = conn->ksnc_peer;
+       ksock_tx_t       *tx;
+       ksock_tx_t       *tmp;
+       LIST_HEAD    (zlist);
+
+       /* NB safe to finalize TXs because closing of socket will
+        * abort all buffered data */
+       LASSERT (conn->ksnc_sock == NULL);
+
+       spin_lock(&peer->ksnp_lock);
+
+       list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) {
+               if (tx->tx_conn != conn)
+                       continue;
+
+               LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0);
+
+               tx->tx_msg.ksm_zc_cookies[0] = 0;
+               tx->tx_zc_aborted = 1; /* mark it as not-acked */
+               list_del(&tx->tx_zc_list);
+               list_add(&tx->tx_zc_list, &zlist);
+       }
+
+       spin_unlock(&peer->ksnp_lock);
+
+       while (!list_empty(&zlist)) {
+               tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+
+               list_del(&tx->tx_zc_list);
+               ksocknal_tx_decref(tx);
+       }
+}
+
+void
+ksocknal_terminate_conn (ksock_conn_t *conn)
+{
+       /* This gets called by the reaper (guaranteed thread context) to
+        * disengage the socket from its callbacks and close it.
+        * ksnc_refcount will eventually hit zero, and then the reaper will
+        * destroy it. */
+       ksock_peer_t     *peer = conn->ksnc_peer;
+       ksock_sched_t    *sched = conn->ksnc_scheduler;
+       int            failed = 0;
+
+       LASSERT(conn->ksnc_closing);
+
+       /* wake up the scheduler to "send" all remaining packets to /dev/null */
+       spin_lock_bh(&sched->kss_lock);
+
+       /* a closing conn is always ready to tx */
+       conn->ksnc_tx_ready = 1;
+
+       if (!conn->ksnc_tx_scheduled &&
+           !list_empty(&conn->ksnc_tx_queue)){
+               list_add_tail (&conn->ksnc_tx_list,
+                              &sched->kss_tx_conns);
+               conn->ksnc_tx_scheduled = 1;
+               /* extra ref for scheduler */
+               ksocknal_conn_addref(conn);
+
+               wake_up (&sched->kss_waitq);
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+
+       /* serialise with callbacks */
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
+
+       /* OK, so this conn may not be completely disengaged from its
+        * scheduler yet, but it _has_ committed to terminate... */
+       conn->ksnc_scheduler->kss_nconns--;
+
+       if (peer->ksnp_error != 0) {
+               /* peer's last conn closed in error */
+               LASSERT (list_empty (&peer->ksnp_conns));
+               failed = 1;
+               peer->ksnp_error = 0;     /* avoid multiple notifications */
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       if (failed)
+               ksocknal_peer_failed(peer);
+
+       /* The socket is closed on the final put; either here, or in
+        * ksocknal_{send,recv}msg().  Since we set up the linger2 option
+        * when the connection was established, this will close the socket
+        * immediately, aborting anything buffered in it. Any hung
+        * zero-copy transmits will therefore complete in finite time. */
+       ksocknal_connsock_decref(conn);
+}
+
+void
+ksocknal_queue_zombie_conn (ksock_conn_t *conn)
+{
+       /* Queue the conn for the reaper to destroy */
+
+       LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+       list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
+       wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_destroy_conn (ksock_conn_t *conn)
+{
+       cfs_time_t      last_rcv;
+
+       /* Final coup-de-grace of the reaper */
+       CDEBUG (D_NET, "connection %p\n", conn);
+
+       LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0);
+       LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0);
+       LASSERT (conn->ksnc_sock == NULL);
+       LASSERT (conn->ksnc_route == NULL);
+       LASSERT (!conn->ksnc_tx_scheduled);
+       LASSERT (!conn->ksnc_rx_scheduled);
+       LASSERT (list_empty(&conn->ksnc_tx_queue));
+
+       /* complete current receive if any */
+       switch (conn->ksnc_rx_state) {
+       case SOCKNAL_RX_LNET_PAYLOAD:
+               last_rcv = conn->ksnc_rx_deadline -
+                          cfs_time_seconds(*ksocknal_tunables.ksnd_timeout);
+               CERROR("Completing partial receive from %s[%d]"
+                      ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, "
+                      "last alive is %ld secs ago\n",
+                      libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
+                      HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                      conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+                      cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+                                       last_rcv)));
+               lnet_finalize (conn->ksnc_peer->ksnp_ni,
+                              conn->ksnc_cookie, -EIO);
+               break;
+       case SOCKNAL_RX_LNET_HEADER:
+               if (conn->ksnc_rx_started)
+                       CERROR("Incomplete receive of lnet header from %s"
+                              ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                              conn->ksnc_proto->pro_version);
+               break;
+       case SOCKNAL_RX_KSM_HEADER:
+               if (conn->ksnc_rx_started)
+                       CERROR("Incomplete receive of ksock message from %s"
+                              ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                              conn->ksnc_proto->pro_version);
+               break;
+       case SOCKNAL_RX_SLOP:
+               if (conn->ksnc_rx_started)
+                       CERROR("Incomplete receive of slops from %s"
+                              ", ip %d.%d.%d.%d:%d, with error\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+               break;
+       default:
+               LBUG ();
+               break;
+       }
+
+       ksocknal_peer_decref(conn->ksnc_peer);
+
+       LIBCFS_FREE (conn, sizeof (*conn));
+}
+
+int
+ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
+{
+       ksock_conn_t       *conn;
+       struct list_head         *ctmp;
+       struct list_head         *cnxt;
+       int              count = 0;
+
+       list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+               conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+               if (ipaddr == 0 ||
+                   conn->ksnc_ipaddr == ipaddr) {
+                       count++;
+                       ksocknal_close_conn_locked (conn, why);
+               }
+       }
+
+       return (count);
+}
+
+int
+ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
+{
+       ksock_peer_t     *peer = conn->ksnc_peer;
+       __u32        ipaddr = conn->ksnc_ipaddr;
+       int            count;
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       count = ksocknal_close_peer_conns_locked (peer, ipaddr, why);
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       return (count);
+}
+
+int
+ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
+{
+       ksock_peer_t       *peer;
+       struct list_head         *ptmp;
+       struct list_head         *pnxt;
+       int              lo;
+       int              hi;
+       int              i;
+       int              count = 0;
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       if (id.nid != LNET_NID_ANY)
+               lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+       else {
+               lo = 0;
+               hi = ksocknal_data.ksnd_peer_hash_size - 1;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe (ptmp, pnxt,
+                                       &ksocknal_data.ksnd_peers[i]) {
+
+                       peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                       if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
+                             (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
+                               continue;
+
+                       count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
+               }
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       /* wildcards always succeed */
+       if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+               return (0);
+
+       return (count == 0 ? -ENOENT : 0);
+}
+
+void
+ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
+{
+       /* The router is telling me she's been notified of a change in
+        * gateway state.... */
+       lnet_process_id_t  id = {0};
+
+       id.nid = gw_nid;
+       id.pid = LNET_PID_ANY;
+
+       CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
+               alive ? "up" : "down");
+
+       if (!alive) {
+               /* If the gateway crashed, close all open connections... */
+               ksocknal_close_matching_conns (id, 0);
+               return;
+       }
+
+       /* ...otherwise do nothing.  We can only establish new connections
+        * if we have autoroutes, and these connect on demand. */
+}
+
+void
+ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+       int             connect = 1;
+       cfs_time_t       last_alive = 0;
+       cfs_time_t       now = cfs_time_current();
+       ksock_peer_t      *peer = NULL;
+       rwlock_t                *glock = &ksocknal_data.ksnd_global_lock;
+       lnet_process_id_t  id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+
+       read_lock(glock);
+
+       peer = ksocknal_find_peer_locked(ni, id);
+       if (peer != NULL) {
+               struct list_head       *tmp;
+               ksock_conn_t     *conn;
+               int            bufnob;
+
+               list_for_each (tmp, &peer->ksnp_conns) {
+                       conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                       bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+
+                       if (bufnob < conn->ksnc_tx_bufnob) {
+                               /* something got ACKed */
+                               conn->ksnc_tx_deadline =
+                                       cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+                               peer->ksnp_last_alive = now;
+                               conn->ksnc_tx_bufnob = bufnob;
+                       }
+               }
+
+               last_alive = peer->ksnp_last_alive;
+               if (ksocknal_find_connectable_route_locked(peer) == NULL)
+                       connect = 0;
+       }
+
+       read_unlock(glock);
+
+       if (last_alive != 0)
+               *when = last_alive;
+
+       CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+              libcfs_nid2str(nid), peer,
+              last_alive ? cfs_duration_sec(now - last_alive) : -1,
+              connect);
+
+       if (!connect)
+               return;
+
+       ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port());
+
+       write_lock_bh(glock);
+
+       peer = ksocknal_find_peer_locked(ni, id);
+       if (peer != NULL)
+               ksocknal_launch_all_connections_locked(peer);
+
+       write_unlock_bh(glock);
+       return;
+}
+
+void
+ksocknal_push_peer (ksock_peer_t *peer)
+{
+       int            index;
+       int            i;
+       struct list_head       *tmp;
+       ksock_conn_t     *conn;
+
+       for (index = 0; ; index++) {
+               read_lock(&ksocknal_data.ksnd_global_lock);
+
+               i = 0;
+               conn = NULL;
+
+               list_for_each (tmp, &peer->ksnp_conns) {
+                       if (i++ == index) {
+                               conn = list_entry (tmp, ksock_conn_t,
+                                                      ksnc_list);
+                               ksocknal_conn_addref(conn);
+                               break;
+                       }
+               }
+
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+
+               if (conn == NULL)
+                       break;
+
+               ksocknal_lib_push_conn (conn);
+               ksocknal_conn_decref(conn);
+       }
+}
+
+int
+ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
+{
+       ksock_peer_t      *peer;
+       struct list_head        *tmp;
+       int             index;
+       int             i;
+       int             j;
+       int             rc = -ENOENT;
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+               for (j = 0; ; j++) {
+                       read_lock(&ksocknal_data.ksnd_global_lock);
+
+                       index = 0;
+                       peer = NULL;
+
+                       list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+                               peer = list_entry(tmp, ksock_peer_t,
+                                                     ksnp_list);
+
+                               if (!((id.nid == LNET_NID_ANY ||
+                                      id.nid == peer->ksnp_id.nid) &&
+                                     (id.pid == LNET_PID_ANY ||
+                                      id.pid == peer->ksnp_id.pid))) {
+                                       peer = NULL;
+                                       continue;
+                               }
+
+                               if (index++ == j) {
+                                       ksocknal_peer_addref(peer);
+                                       break;
+                               }
+
+                               /* matched the id, but not the j'th match yet */
+                               peer = NULL;
+                       }
+
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+                       if (peer == NULL) /* no more matches in this chain */
+                               break;
+
+                       rc = 0;
+                       ksocknal_push_peer (peer);
+                       ksocknal_peer_decref(peer);
+               }
+       }
+
+       return (rc);
+}
+
+int
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
+{
+       ksock_net_t       *net = ni->ni_data;
+       ksock_interface_t *iface;
+       int             rc;
+       int             i;
+       int             j;
+       struct list_head        *ptmp;
+       ksock_peer_t      *peer;
+       struct list_head        *rtmp;
+       ksock_route_t     *route;
+
+       if (ipaddress == 0 ||
+           netmask == 0)
+               return (-EINVAL);
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       iface = ksocknal_ip2iface(ni, ipaddress);
+       if (iface != NULL) {
+               /* silently ignore dups */
+               rc = 0;
+       } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
+               rc = -ENOSPC;
+       } else {
+               iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
+
+               iface->ksni_ipaddr = ipaddress;
+               iface->ksni_netmask = netmask;
+               iface->ksni_nroutes = 0;
+               iface->ksni_npeers = 0;
+
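+               /* count peers and routes already using this local IP */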
+               for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                       list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+                               peer = list_entry(ptmp, ksock_peer_t,
+                                                     ksnp_list);
+
+                               for (j = 0; j < peer->ksnp_n_passive_ips; j++)
+                                       if (peer->ksnp_passive_ips[j] == ipaddress)
+                                               iface->ksni_npeers++;
+
+                               list_for_each(rtmp, &peer->ksnp_routes) {
+                                       route = list_entry(rtmp,
+                                                              ksock_route_t,
+                                                              ksnr_list);
+
+                                       if (route->ksnr_myipaddr == ipaddress)
+                                               iface->ksni_nroutes++;
+                               }
+                       }
+               }
+
+               rc = 0;
+               /* NB only new connections will pay attention to the new interface! */
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       return (rc);
+}
+
+void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+       struct list_head         *tmp;
+       struct list_head         *nxt;
+       ksock_route_t      *route;
+       ksock_conn_t       *conn;
+       int              i;
+       int              j;
+
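+       /* remove ipaddr from the peer's list of passive IPs */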
+       for (i = 0; i < peer->ksnp_n_passive_ips; i++)
+               if (peer->ksnp_passive_ips[i] == ipaddr) {
+                       for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
+                               peer->ksnp_passive_ips[j-1] =
+                                       peer->ksnp_passive_ips[j];
+                       peer->ksnp_n_passive_ips--;
+                       break;
+               }
+
+       list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+               route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+               if (route->ksnr_myipaddr != ipaddr)
+                       continue;
+
+               if (route->ksnr_share_count != 0) {
+                       /* Manually created; keep, but unbind */
+                       route->ksnr_myipaddr = 0;
+               } else {
+                       ksocknal_del_route_locked(route);
+               }
+       }
+
+       list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+               conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+               if (conn->ksnc_myipaddr == ipaddr)
+                       ksocknal_close_conn_locked (conn, 0);
+       }
+}
+
+int
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
+{
+       ksock_net_t       *net = ni->ni_data;
+       int             rc = -ENOENT;
+       struct list_head        *tmp;
+       struct list_head        *nxt;
+       ksock_peer_t      *peer;
+       __u32         this_ip;
+       int             i;
+       int             j;
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
+
+               if (!(ipaddress == 0 ||
+                     ipaddress == this_ip))
+                       continue;
+
+               rc = 0;
+
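+               /* compact the interface array over the deleted entry */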
+               for (j = i+1; j < net->ksnn_ninterfaces; j++)
+                       net->ksnn_interfaces[j-1] =
+                               net->ksnn_interfaces[j];
+
+               net->ksnn_ninterfaces--;
+
+               for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
+                       list_for_each_safe(tmp, nxt,
+                                              &ksocknal_data.ksnd_peers[j]) {
+                               peer = list_entry(tmp, ksock_peer_t,
+                                                     ksnp_list);
+
+                               if (peer->ksnp_ni != ni)
+                                       continue;
+
+                               ksocknal_peer_del_interface_locked(peer, this_ip);
+                       }
+               }
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       return (rc);
+}
+
+int
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+       lnet_process_id_t id = {0};
+       struct libcfs_ioctl_data *data = arg;
+       int rc;
+
+       switch(cmd) {
+       case IOC_LIBCFS_GET_INTERFACE: {
+               ksock_net_t       *net = ni->ni_data;
+               ksock_interface_t *iface;
+
+               read_lock(&ksocknal_data.ksnd_global_lock);
+
+               if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) {
+                       rc = -ENOENT;
+               } else {
+                       rc = 0;
+                       iface = &net->ksnn_interfaces[data->ioc_count];
+
+                       data->ioc_u32[0] = iface->ksni_ipaddr;
+                       data->ioc_u32[1] = iface->ksni_netmask;
+                       data->ioc_u32[2] = iface->ksni_npeers;
+                       data->ioc_u32[3] = iface->ksni_nroutes;
+               }
+
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+               return rc;
+       }
+
+       case IOC_LIBCFS_ADD_INTERFACE:
+               return ksocknal_add_interface(ni,
+                                             data->ioc_u32[0], /* IP address */
+                                             data->ioc_u32[1]); /* net mask */
+
+       case IOC_LIBCFS_DEL_INTERFACE:
+               return ksocknal_del_interface(ni,
+                                             data->ioc_u32[0]); /* IP address */
+
+       case IOC_LIBCFS_GET_PEER: {
+               __u32       myip = 0;
+               __u32       ip = 0;
+               int           port = 0;
+               int           conn_count = 0;
+               int           share_count = 0;
+
+               rc = ksocknal_get_peer_info(ni, data->ioc_count,
+                                           &id, &myip, &ip, &port,
+                                           &conn_count,  &share_count);
+               if (rc != 0)
+                       return rc;
+
+               data->ioc_nid    = id.nid;
+               data->ioc_count  = share_count;
+               data->ioc_u32[0] = ip;
+               data->ioc_u32[1] = port;
+               data->ioc_u32[2] = myip;
+               data->ioc_u32[3] = conn_count;
+               data->ioc_u32[4] = id.pid;
+               return 0;
+       }
+
+       case IOC_LIBCFS_ADD_PEER:
+               id.nid = data->ioc_nid;
+               id.pid = LUSTRE_SRV_LNET_PID;
+               return ksocknal_add_peer (ni, id,
+                                         data->ioc_u32[0], /* IP */
+                                         data->ioc_u32[1]); /* port */
+
+       case IOC_LIBCFS_DEL_PEER:
+               id.nid = data->ioc_nid;
+               id.pid = LNET_PID_ANY;
+               return ksocknal_del_peer (ni, id,
+                                         data->ioc_u32[0]); /* IP */
+
+       case IOC_LIBCFS_GET_CONN: {
+               int        txmem;
+               int        rxmem;
+               int        nagle;
+               ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
+
+               if (conn == NULL)
+                       return -ENOENT;
+
+               ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+               data->ioc_count  = txmem;
+               data->ioc_nid    = conn->ksnc_peer->ksnp_id.nid;
+               data->ioc_flags  = nagle;
+               data->ioc_u32[0] = conn->ksnc_ipaddr;
+               data->ioc_u32[1] = conn->ksnc_port;
+               data->ioc_u32[2] = conn->ksnc_myipaddr;
+               data->ioc_u32[3] = conn->ksnc_type;
+               data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
+               data->ioc_u32[5] = rxmem;
+               data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+               ksocknal_conn_decref(conn);
+               return 0;
+       }
+
+       case IOC_LIBCFS_CLOSE_CONNECTION:
+               id.nid = data->ioc_nid;
+               id.pid = LNET_PID_ANY;
+               return ksocknal_close_matching_conns (id,
+                                                     data->ioc_u32[0]);
+
+       case IOC_LIBCFS_REGISTER_MYNID:
+               /* Ignore if this is a noop */
+               if (data->ioc_nid == ni->ni_nid)
+                       return 0;
+
+               CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                      libcfs_nid2str(data->ioc_nid),
+                      libcfs_nid2str(ni->ni_nid));
+               return -EINVAL;
+
+       case IOC_LIBCFS_PUSH_CONNECTION:
+               id.nid = data->ioc_nid;
+               id.pid = LNET_PID_ANY;
+               return ksocknal_push(ni, id);
+
+       default:
+               return -EINVAL;
+       }
+       /* not reached */
+}
+
+void
+ksocknal_free_buffers (void)
+{
+       LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+       if (ksocknal_data.ksnd_sched_info != NULL) {
+               struct ksock_sched_info *info;
+               int                     i;
+
+               cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+                       if (info->ksi_scheds != NULL) {
+                               LIBCFS_FREE(info->ksi_scheds,
+                                           info->ksi_nthreads_max *
+                                           sizeof(info->ksi_scheds[0]));
+                       }
+               }
+               cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+       }
+
+       LIBCFS_FREE (ksocknal_data.ksnd_peers,
+                    sizeof (struct list_head) *
+                    ksocknal_data.ksnd_peer_hash_size);
+
+       spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+       if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+               struct list_head        zlist;
+               ksock_tx_t      *tx;
+
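+               /* take ownership of the whole idle noop-tx list so it can
+                * be freed after dropping ksnd_tx_lock */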
+               list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs);
+               list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
+               spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+               while (!list_empty(&zlist)) {
+                       tx = list_entry(zlist.next, ksock_tx_t, tx_list);
+                       list_del(&tx->tx_list);
+                       LIBCFS_FREE(tx, tx->tx_desc_size);
+               }
+       } else {
+               spin_unlock(&ksocknal_data.ksnd_tx_lock);
+       }
+}
+
+void
+ksocknal_base_shutdown(void)
+{
+       struct ksock_sched_info *info;
+       ksock_sched_t           *sched;
+       int                     i;
+       int                     j;
+
+       CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+              atomic_read (&libcfs_kmemory));
+       LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+       switch (ksocknal_data.ksnd_init) {
+       default:
+               LASSERT (0);
+
+       case SOCKNAL_INIT_ALL:
+       case SOCKNAL_INIT_DATA:
+               LASSERT (ksocknal_data.ksnd_peers != NULL);
+               for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                       LASSERT (list_empty (&ksocknal_data.ksnd_peers[i]));
+               }
+
+               LASSERT(list_empty(&ksocknal_data.ksnd_nets));
+               LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
+               LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
+               LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs));
+               LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes));
+
+               if (ksocknal_data.ksnd_sched_info != NULL) {
+                       cfs_percpt_for_each(info, i,
+                                           ksocknal_data.ksnd_sched_info) {
+                               if (info->ksi_scheds == NULL)
+                                       continue;
+
+                               for (j = 0; j < info->ksi_nthreads_max; j++) {
+
+                                       sched = &info->ksi_scheds[j];
+                                       LASSERT(list_empty(&sched->kss_tx_conns));
+                                       LASSERT(list_empty(&sched->kss_rx_conns));
+                                       LASSERT(list_empty(&sched->kss_zombie_noop_txs));
+                                       LASSERT(sched->kss_nconns == 0);
+                               }
+                       }
+               }
+
+               /* flag threads to terminate; wake and wait for them to die */
+               ksocknal_data.ksnd_shuttingdown = 1;
+               wake_up_all(&ksocknal_data.ksnd_connd_waitq);
+               wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
+
+               if (ksocknal_data.ksnd_sched_info != NULL) {
+                       cfs_percpt_for_each(info, i,
+                                           ksocknal_data.ksnd_sched_info) {
+                               if (info->ksi_scheds == NULL)
+                                       continue;
+
+                               for (j = 0; j < info->ksi_nthreads_max; j++) {
+                                       sched = &info->ksi_scheds[j];
+                                       wake_up_all(&sched->kss_waitq);
+                               }
+                       }
+               }
+
+               i = 4;
+               read_lock(&ksocknal_data.ksnd_global_lock);
+               while (ksocknal_data.ksnd_nthreads != 0) {
+                       i++;
+                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                              "waiting for %d threads to terminate\n",
+                               ksocknal_data.ksnd_nthreads);
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+                       cfs_pause(cfs_time_seconds(1));
+                       read_lock(&ksocknal_data.ksnd_global_lock);
+               }
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+
+               ksocknal_free_buffers();
+
+               ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
+               break;
+       }
+
+       CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+              atomic_read (&libcfs_kmemory));
+
+       module_put(THIS_MODULE);
+}
+
+__u64
+ksocknal_new_incarnation (void)
+{
+       struct timeval tv;
+
+       /* The incarnation number is the time this module loaded and it
+        * identifies this particular instance of the socknal.  Hopefully
+        * we won't be able to reboot more frequently than 1MHz for the
+        * foreseeable future :) */
+
+       do_gettimeofday(&tv);
+
+       return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+}
+
+int
+ksocknal_base_startup(void)
+{
+       struct ksock_sched_info *info;
+       int                     rc;
+       int                     i;
+
+       LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+       LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+       memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+       ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
+       LIBCFS_ALLOC (ksocknal_data.ksnd_peers,
+                     sizeof (struct list_head) *
+                     ksocknal_data.ksnd_peer_hash_size);
+       if (ksocknal_data.ksnd_peers == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
+               INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
+
+       rwlock_init(&ksocknal_data.ksnd_global_lock);
+       INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
+
+       spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
+       init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+       spin_lock_init(&ksocknal_data.ksnd_connd_lock);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes);
+       init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
+
+       spin_lock_init(&ksocknal_data.ksnd_tx_lock);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
+
+       /* NB memset above zeros whole of ksocknal_data */
+
+       /* flag lists/ptrs/locks initialised */
+       ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+       try_module_get(THIS_MODULE);
+
+       ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+                                                        sizeof(*info));
+       if (ksocknal_data.ksnd_sched_info == NULL)
+               goto failed;
+
+       cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+               ksock_sched_t   *sched;
+               int             nthrs;
+
+               nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+               if (*ksocknal_tunables.ksnd_nscheds > 0) {
+                       nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+               } else {
+                       /* cap at half of the CPUs; assume the other half is
+                        * reserved for upper layer modules */
+                       nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+               }
+
+               info->ksi_nthreads_max = nthrs;
+               info->ksi_cpt = i;
+
+               LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+                                info->ksi_nthreads_max * sizeof(*sched));
+               if (info->ksi_scheds == NULL)
+                       goto failed;
+
+               for (; nthrs > 0; nthrs--) {
+                       sched = &info->ksi_scheds[nthrs - 1];
+
+                       sched->kss_info = info;
+                       spin_lock_init(&sched->kss_lock);
+                       INIT_LIST_HEAD(&sched->kss_rx_conns);
+                       INIT_LIST_HEAD(&sched->kss_tx_conns);
+                       INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+                       init_waitqueue_head(&sched->kss_waitq);
+               }
+       }
+
+       ksocknal_data.ksnd_connd_starting        = 0;
+       ksocknal_data.ksnd_connd_failed_stamp     = 0;
+       ksocknal_data.ksnd_connd_starting_stamp   = cfs_time_current_sec();
+       /* must have at least 2 connds to remain responsive to accepts while
+        * connecting */
+       if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
+               *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
+
+       if (*ksocknal_tunables.ksnd_nconnds_max <
+           *ksocknal_tunables.ksnd_nconnds) {
+               ksocknal_tunables.ksnd_nconnds_max =
+                       ksocknal_tunables.ksnd_nconnds;
+       }
+
+       for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+               char name[16];
+               spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+               ksocknal_data.ksnd_connd_starting++;
+               spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+               snprintf(name, sizeof(name), "socknal_cd%02d", i);
+               rc = ksocknal_thread_start(ksocknal_connd,
+                                          (void *)((ulong_ptr_t)i), name);
+               if (rc != 0) {
+                       spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+                       ksocknal_data.ksnd_connd_starting--;
+                       spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+                       CERROR("Can't spawn socknal connd: %d\n", rc);
+                       goto failed;
+               }
+       }
+
+       rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
+       if (rc != 0) {
+               CERROR ("Can't spawn socknal reaper: %d\n", rc);
+               goto failed;
+       }
+
+       /* flag everything initialised */
+       ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+       return 0;
+
+ failed:
+       ksocknal_base_shutdown();
+       return -ENETDOWN;
+}
+
+void
+ksocknal_debug_peerhash (lnet_ni_t *ni)
+{
+       ksock_peer_t    *peer = NULL;
+       struct list_head        *tmp;
+       int             i;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+               list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+                       peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+                       if (peer->ksnp_ni == ni)
+                               break;
+
+                       peer = NULL;
+               }
+       }
+
+       if (peer != NULL) {
+               ksock_route_t *route;
+               ksock_conn_t  *conn;
+
+               CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+                      "closing %d, accepting %d, err %d, zcookie "LPU64", "
+                      "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
+                      atomic_read(&peer->ksnp_refcount),
+                      peer->ksnp_sharecount, peer->ksnp_closing,
+                      peer->ksnp_accepting, peer->ksnp_error,
+                      peer->ksnp_zc_next_cookie,
+                      !list_empty(&peer->ksnp_tx_queue),
+                      !list_empty(&peer->ksnp_zc_req_list));
+
+               list_for_each (tmp, &peer->ksnp_routes) {
+                       route = list_entry(tmp, ksock_route_t, ksnr_list);
+                       CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
+                              "del %d\n", atomic_read(&route->ksnr_refcount),
+                              route->ksnr_scheduled, route->ksnr_connecting,
+                              route->ksnr_connected, route->ksnr_deleted);
+               }
+
+               list_for_each (tmp, &peer->ksnp_conns) {
+                       conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                       CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
+                              atomic_read(&conn->ksnc_conn_refcount),
+                              atomic_read(&conn->ksnc_sock_refcount),
+                              conn->ksnc_type, conn->ksnc_closing);
+               }
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+       return;
+}
+
+void
+ksocknal_shutdown (lnet_ni_t *ni)
+{
+       ksock_net_t      *net = ni->ni_data;
+       int            i;
+       lnet_process_id_t anyid = {0};
+
+       anyid.nid =  LNET_NID_ANY;
+       anyid.pid =  LNET_PID_ANY;
+
+       LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+       LASSERT(ksocknal_data.ksnd_nnets > 0);
+
+       spin_lock_bh(&net->ksnn_lock);
+       net->ksnn_shutdown = 1;          /* prevent new peers */
+       spin_unlock_bh(&net->ksnn_lock);
+
+       /* Delete all peers */
+       ksocknal_del_peer(ni, anyid, 0);
+
+       /* Wait for all peer state to clean up */
+       i = 2;
+       spin_lock_bh(&net->ksnn_lock);
+       while (net->ksnn_npeers != 0) {
+               spin_unlock_bh(&net->ksnn_lock);
+
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                      "waiting for %d peers to disconnect\n",
+                      net->ksnn_npeers);
+               cfs_pause(cfs_time_seconds(1));
+
+               ksocknal_debug_peerhash(ni);
+
+               spin_lock_bh(&net->ksnn_lock);
+       }
+       spin_unlock_bh(&net->ksnn_lock);
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0);
+               LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
+       }
+
+       list_del(&net->ksnn_list);
+       LIBCFS_FREE(net, sizeof(*net));
+
+       ksocknal_data.ksnd_nnets--;
+       if (ksocknal_data.ksnd_nnets == 0)
+               ksocknal_base_shutdown();
+}
+
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+       char      **names;
+       int      i;
+       int      j;
+       int      rc;
+       int      n;
+
+       n = libcfs_ipif_enumerate(&names);
+       if (n <= 0) {
+               CERROR("Can't enumerate interfaces: %d\n", n);
+               return n;
+       }
+
+       for (i = j = 0; i < n; i++) {
+               int     up;
+               __u32      ip;
+               __u32      mask;
+
+               if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+                       continue;
+
+               rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+               if (rc != 0) {
+                       CWARN("Can't get interface %s info: %d\n",
+                             names[i], rc);
+                       continue;
+               }
+
+               if (!up) {
+                       CWARN("Ignoring interface %s (down)\n",
+                             names[i]);
+                       continue;
+               }
+
+               if (j == LNET_MAX_INTERFACES) {
+                       CWARN("Ignoring interface %s (too many interfaces)\n",
+                             names[i]);
+                       continue;
+               }
+
+               net->ksnn_interfaces[j].ksni_ipaddr = ip;
+               net->ksnn_interfaces[j].ksni_netmask = mask;
+               strncpy(&net->ksnn_interfaces[j].ksni_name[0],
+                       names[i], IFNAMSIZ);
+               j++;
+       }
+
+       libcfs_ipif_free_enumeration(names, n);
+
+       if (j == 0)
+               CERROR("Can't find any usable interfaces\n");
+
+       return j;
+}
+
+int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+       int     new_ipif = 0;
+       int     i;
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               char            *ifnam = &net->ksnn_interfaces[i].ksni_name[0];
+               char            *colon = strchr(ifnam, ':');
+               int             found  = 0;
+               ksock_net_t     *tmp;
+               int             j;
+
+               if (colon != NULL) /* ignore alias device */
+                       *colon = 0;
+
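+               /* compare base device names so aliases like "eth0:1" match
+                * "eth0"; the colon is restored after the comparison */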
+               list_for_each_entry(tmp, &ksocknal_data.ksnd_nets,
+                                       ksnn_list) {
+                       for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
+                               char *ifnam2 =
+                                       &tmp->ksnn_interfaces[j].ksni_name[0];
+                               char *colon2 = strchr(ifnam2, ':');
+
+                               if (colon2 != NULL)
+                                       *colon2 = 0;
+
+                               found = strcmp(ifnam, ifnam2) == 0;
+                               if (colon2 != NULL)
+                                       *colon2 = ':';
+                       }
+                       if (found)
+                               break;
+               }
+
+               new_ipif += !found;
+               if (colon != NULL)
+                       *colon = ':';
+       }
+
+       return new_ipif;
+}
+
+int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+       int     nthrs;
+       int     rc = 0;
+       int     i;
+
+       if (info->ksi_nthreads == 0) {
+               if (*ksocknal_tunables.ksnd_nscheds > 0) {
+                       nthrs = info->ksi_nthreads_max;
+               } else {
+                       nthrs = cfs_cpt_weight(lnet_cpt_table(),
+                                              info->ksi_cpt);
+                       nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+                       nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
+               }
+               nthrs = min(nthrs, info->ksi_nthreads_max);
+       } else {
+               LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+               /* start two more threads if there is a new interface */
+               nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+       }
+
+       for (i = 0; i < nthrs; i++) {
+               long            id;
+               char            name[20];
+               ksock_sched_t   *sched;
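+               /* the thread id encodes both the CPT and the scheduler index */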
+               id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
+               sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+               snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
+                        info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
+
+               rc = ksocknal_thread_start(ksocknal_scheduler,
+                                          (void *)id, name);
+               if (rc == 0)
+                       continue;
+
+               CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+                      info->ksi_cpt, info->ksi_nthreads + i, rc);
+               break;
+       }
+
+       info->ksi_nthreads += i;
+       return rc;
+}
+
+int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+       int     newif = ksocknal_search_new_ipif(net);
+       int     rc;
+       int     i;
+
+       LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+       for (i = 0; i < ncpts; i++) {
+               struct ksock_sched_info *info;
+               int cpt = (cpts == NULL) ? i : cpts[i];
+
+               LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+               info = ksocknal_data.ksnd_sched_info[cpt];
+
+               if (!newif && info->ksi_nthreads > 0)
+                       continue;
+
+               rc = ksocknal_start_schedulers(info);
+               if (rc != 0)
+                       return rc;
+       }
+       return 0;
+}
+
+int
+ksocknal_startup (lnet_ni_t *ni)
+{
+       ksock_net_t  *net;
+       int        rc;
+       int        i;
+
+       LASSERT (ni->ni_lnd == &the_ksocklnd);
+
+       if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+               rc = ksocknal_base_startup();
+               if (rc != 0)
+                       return rc;
+       }
+
+       LIBCFS_ALLOC(net, sizeof(*net));
+       if (net == NULL)
+               goto fail_0;
+
+       spin_lock_init(&net->ksnn_lock);
+       net->ksnn_incarnation = ksocknal_new_incarnation();
+       ni->ni_data = net;
+       ni->ni_peertimeout    = *ksocknal_tunables.ksnd_peertimeout;
+       ni->ni_maxtxcredits   = *ksocknal_tunables.ksnd_credits;
+       ni->ni_peertxcredits  = *ksocknal_tunables.ksnd_peertxcredits;
+       ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
+
+       if (ni->ni_interfaces[0] == NULL) {
+               rc = ksocknal_enumerate_interfaces(net);
+               if (rc <= 0)
+                       goto fail_1;
+
+               net->ksnn_ninterfaces = 1;
+       } else {
+               for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+                       int    up;
+
+                       if (ni->ni_interfaces[i] == NULL)
+                               break;
+
+                       rc = libcfs_ipif_query(
+                               ni->ni_interfaces[i], &up,
+                               &net->ksnn_interfaces[i].ksni_ipaddr,
+                               &net->ksnn_interfaces[i].ksni_netmask);
+
+                       if (rc != 0) {
+                               CERROR("Can't get interface %s info: %d\n",
+                                      ni->ni_interfaces[i], rc);
+                               goto fail_1;
+                       }
+
+                       if (!up) {
+                               CERROR("Interface %s is down\n",
+                                      ni->ni_interfaces[i]);
+                               goto fail_1;
+                       }
+
+                       strncpy(&net->ksnn_interfaces[i].ksni_name[0],
+                               ni->ni_interfaces[i], IFNAMSIZ);
+               }
+               net->ksnn_ninterfaces = i;
+       }
+
+       /* call it before adding the net to ksocknal_data.ksnd_nets */
+       rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+       if (rc != 0)
+               goto fail_1;
+
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+                               net->ksnn_interfaces[0].ksni_ipaddr);
+       list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+
+       ksocknal_data.ksnd_nnets++;
+
+       return 0;
+
+ fail_1:
+       LIBCFS_FREE(net, sizeof(*net));
+ fail_0:
+       if (ksocknal_data.ksnd_nnets == 0)
+               ksocknal_base_shutdown();
+
+       return -ENETDOWN;
+}
+
+
+void __exit
+ksocknal_module_fini (void)
+{
+       lnet_unregister_lnd(&the_ksocklnd);
+       ksocknal_tunables_fini();
+}
+
+int __init
+ksocknal_module_init (void)
+{
+       int    rc;
+
+       /* check ksnr_connected/connecting field large enough */
+       CLASSERT (SOCKLND_CONN_NTYPES <= 4);
+       CLASSERT (SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN);
+
+       /* initialize the_ksocklnd */
+       the_ksocklnd.lnd_type     = SOCKLND;
+       the_ksocklnd.lnd_startup  = ksocknal_startup;
+       the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
+       the_ksocklnd.lnd_ctl      = ksocknal_ctl;
+       the_ksocklnd.lnd_send     = ksocknal_send;
+       the_ksocklnd.lnd_recv     = ksocknal_recv;
+       the_ksocklnd.lnd_notify   = ksocknal_notify;
+       the_ksocklnd.lnd_query    = ksocknal_query;
+       the_ksocklnd.lnd_accept   = ksocknal_accept;
+
+       rc = ksocknal_tunables_init();
+       if (rc != 0)
+               return rc;
+
+       lnet_register_lnd(&the_ksocklnd);
+
+       return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0");
+MODULE_LICENSE("GPL");
+
+cfs_module(ksocknal, "3.0.0", ksocknal_module_init, ksocknal_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
new file mode 100644 (file)
index 0000000..b483e0c
--- /dev/null
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "socklnd_lib-linux.h"
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/socklnd.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#define SOCKNAL_PEER_HASH_SIZE  101         /* # peer lists */
+#define SOCKNAL_RESCHED         100         /* # scheduler loops before reschedule */
+#define SOCKNAL_INSANITY_RECONN 5000       /* connd is retrying reconnection endlessly */
+#define SOCKNAL_ENOMEM_RETRY    CFS_TICK       /* jiffies between retries */
+
+#define SOCKNAL_SINGLE_FRAG_TX      0     /* disable multi-fragment sends */
+#define SOCKNAL_SINGLE_FRAG_RX      0     /* disable multi-fragment receives */
+
+#define SOCKNAL_VERSION_DEBUG       0     /* enable protocol version debugging */
+
+/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
+ * no risk if we're not running on a CONFIG_HIGHMEM platform. */
+#ifdef CONFIG_HIGHMEM
+# define SOCKNAL_RISK_KMAP_DEADLOCK  0
+#else
+# define SOCKNAL_RISK_KMAP_DEADLOCK  1
+#endif
+
+struct ksock_sched_info;
+
+typedef struct                           /* per scheduler state */
+{
+       spinlock_t              kss_lock;       /* serialise */
+       struct list_head                kss_rx_conns;   /* conn waiting to be read */
+       /* conn waiting to be written */
+       struct list_head                kss_tx_conns;
+       /* zombie noop tx list */
+       struct list_head                kss_zombie_noop_txs;
+       wait_queue_head_t               kss_waitq;      /* where scheduler sleeps */
+       /* # connections assigned to this scheduler */
+       int                     kss_nconns;
+       struct ksock_sched_info *kss_info;      /* owner of it */
+       struct page             *kss_rx_scratch_pgs[LNET_MAX_IOV];
+       struct iovec            kss_scratch_iov[LNET_MAX_IOV];
+} ksock_sched_t;
+
+struct ksock_sched_info {
+       int                     ksi_nthreads_max; /* max allowed threads */
+       int                     ksi_nthreads;   /* number of threads */
+       int                     ksi_cpt;        /* CPT id */
+       ksock_sched_t           *ksi_scheds;    /* array of schedulers */
+};
+
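+/* A scheduler thread id encodes the CPT in its high bits and the per-CPT
+ * scheduler index in its low bits. */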
+#define KSOCK_CPT_SHIFT                        16
+#define KSOCK_THREAD_ID(cpt, sid)      (((cpt) << KSOCK_CPT_SHIFT) | (sid))
+#define KSOCK_THREAD_CPT(id)           ((id) >> KSOCK_CPT_SHIFT)
+#define KSOCK_THREAD_SID(id)           ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
+
+typedef struct                           /* in-use interface */
+{
+       __u32           ksni_ipaddr;            /* interface's IP address */
+       __u32           ksni_netmask;           /* interface's network mask */
+       int             ksni_nroutes;           /* # routes using (active) */
+       int             ksni_npeers;            /* # peers using (passive) */
+       char            ksni_name[IFNAMSIZ];    /* interface name */
+} ksock_interface_t;
+
+typedef struct
+{
+       /* "stuck" socket timeout (seconds) */
+       int           *ksnd_timeout;
+       /* # scheduler threads in each pool while starting */
+       int              *ksnd_nscheds;
+       int           *ksnd_nconnds;     /* # connection daemons */
+       int           *ksnd_nconnds_max;     /* max # connection daemons */
+       int           *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+       int           *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+       int           *ksnd_eager_ack;       /* make TCP ack eagerly? */
+       int           *ksnd_typed_conns;     /* drive sockets by type? */
+       int           *ksnd_min_bulk;   /* smallest "large" message */
+       int           *ksnd_tx_buffer_size;  /* socket tx buffer size */
+       int           *ksnd_rx_buffer_size;  /* socket rx buffer size */
+       int           *ksnd_nagle;         /* enable NAGLE? */
+       int           *ksnd_round_robin;     /* round robin for multiple interfaces */
+       int           *ksnd_keepalive;       /* # secs for sending keepalive NOOP */
+       int           *ksnd_keepalive_idle;  /* # idle secs before 1st probe */
+       int           *ksnd_keepalive_count; /* # probes */
+       int           *ksnd_keepalive_intvl; /* time between probes */
+       int           *ksnd_credits;     /* # concurrent sends */
+       int           *ksnd_peertxcredits;   /* # concurrent sends to 1 peer */
+       int           *ksnd_peerrtrcredits;  /* # per-peer router buffer credits */
+       int           *ksnd_peertimeout;     /* seconds to consider peer dead */
+       int           *ksnd_enable_csum;     /* enable checksum */
+       int           *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+       int           *ksnd_nonblk_zcack;    /* always send zc-ack on non-blocking connection */
+       unsigned int     *ksnd_zc_min_payload;  /* minimum zero copy payload size */
+       int           *ksnd_zc_recv;     /* enable ZC receive (for Chelsio TOE) */
+       int           *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+       ctl_table_header_t *ksnd_sysctl;   /* sysctl interface */
+#endif
+} ksock_tunables_t;
+
+typedef struct
+{
+       __u64             ksnn_incarnation;     /* my epoch */
+       spinlock_t        ksnn_lock;            /* serialise */
+       struct list_head          ksnn_list;            /* chain on global list */
+       int               ksnn_npeers;          /* # peers */
+       int               ksnn_shutdown;        /* shutting down? */
+       int               ksnn_ninterfaces;     /* IP interfaces */
+       ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+/** connd timeout */
+#define SOCKNAL_CONND_TIMEOUT  120
+/** reserved thread for accepting & creating new connd */
+#define SOCKNAL_CONND_RESV     1
+
+typedef struct
+{
+       int                     ksnd_init;      /* initialisation state */
+       int                     ksnd_nnets;     /* # networks set up */
+       struct list_head                ksnd_nets;      /* list of nets */
+       /* stabilize peer/conn ops */
+       rwlock_t                ksnd_global_lock;
+       /* hash table of all my known peers */
+       struct list_head                *ksnd_peers;
+       int                     ksnd_peer_hash_size; /* size of ksnd_peers */
+
+       int                     ksnd_nthreads;  /* # live threads */
+       int                     ksnd_shuttingdown; /* tell threads to exit */
+       /* schedulers information */
+       struct ksock_sched_info **ksnd_sched_info;
+
+       atomic_t      ksnd_nactive_txs;    /* #active txs */
+
+       struct list_head        ksnd_deathrow_conns; /* conns to close: reaper_lock*/
+       struct list_head        ksnd_zombie_conns;   /* conns to free: reaper_lock */
+       struct list_head        ksnd_enomem_conns;   /* conns to retry: reaper_lock*/
+       wait_queue_head_t       ksnd_reaper_waitq;   /* reaper sleeps here */
+       cfs_time_t      ksnd_reaper_waketime;/* when reaper will wake */
+       spinlock_t        ksnd_reaper_lock;     /* serialise */
+
+       int            ksnd_enomem_tx;      /* test ENOMEM sender */
+       int            ksnd_stall_tx;       /* test sluggish sender */
+       int            ksnd_stall_rx;       /* test sluggish receiver */
+
+       struct list_head        ksnd_connd_connreqs; /* incoming connection requests */
+       struct list_head        ksnd_connd_routes;   /* routes waiting to be connected */
+       wait_queue_head_t       ksnd_connd_waitq;    /* connds sleep here */
+       int            ksnd_connd_connecting;/* # connds connecting */
+       /** time stamp of the last failed connecting attempt */
+       long          ksnd_connd_failed_stamp;
+       /** # starting connd */
+       unsigned          ksnd_connd_starting;
+       /** time stamp of the last starting connd */
+       long          ksnd_connd_starting_stamp;
+       /** # running connd */
+       unsigned          ksnd_connd_running;
+       spinlock_t        ksnd_connd_lock;      /* serialise */
+
+       struct list_head          ksnd_idle_noop_txs;   /* list head for freed noop tx */
+       spinlock_t        ksnd_tx_lock;         /* serialise, g_lock unsafe */
+
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_ALL       2
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more lnet_kiov_t fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, the payload is
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
+
+struct ksock_conn;                           /* forward ref */
+struct ksock_peer;                           /* forward ref */
+struct ksock_route;                         /* forward ref */
+struct ksock_proto;                         /* forward ref */
+
+typedef struct                           /* transmit packet */
+{
+       struct list_head     tx_list;   /* queue on conn for transmission etc */
+       struct list_head     tx_zc_list;     /* queue on peer for ZC request */
+       atomic_t   tx_refcount;    /* tx reference count */
+       int         tx_nob;      /* # packet bytes */
+       int         tx_resid;       /* residual bytes */
+       int         tx_niov;    /* # packet iovec frags */
+       struct iovec  *tx_iov;   /* packet iovec frags */
+       int         tx_nkiov;       /* # packet page frags */
+       unsigned short tx_zc_aborted;  /* aborted ZC request */
+       unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
+       unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
+       unsigned short tx_nonblk:1;    /* it's a non-blocking ACK */
+       lnet_kiov_t   *tx_kiov; /* packet page frags */
+       struct ksock_conn  *tx_conn;    /* owning conn */
+       lnet_msg_t    *tx_lnetmsg;     /* lnet message for lnet_finalize() */
+       cfs_time_t     tx_deadline;    /* when (in jiffies) tx times out */
+       ksock_msg_t    tx_msg;   /* socklnd message buffer */
+       int         tx_desc_size;   /* size of this descriptor */
+       union {
+               struct {
+                       struct iovec iov;       /* virt hdr */
+                       lnet_kiov_t  kiov[0];   /* paged payload */
+               }                 paged;
+               struct {
+                       struct iovec iov[1];    /* virt hdr + payload */
+               }                 virt;
+       }                      tx_frags;
+} ksock_tx_t;
+
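+/* A NOOP tx carries no payload, so its descriptor ends at the (empty)
+ * paged-payload frag array. */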
+#define KSOCK_NOOP_TX_SIZE  ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0]))
+
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
+typedef union {
+       struct iovec     iov[LNET_MAX_IOV];
+       lnet_kiov_t      kiov[LNET_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_KSM_HEADER   1             /* reading ksock message header */
+#define SOCKNAL_RX_LNET_HEADER  2             /* reading lnet message header */
+#define SOCKNAL_RX_PARSE       3              /* Calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT   4             /* waiting to be told to read the body */
+#define SOCKNAL_RX_LNET_PAYLOAD 5             /* reading lnet payload (to deliver here) */
+#define SOCKNAL_RX_SLOP         6             /* skipping body */
+
+typedef struct ksock_conn
+{
+       struct ksock_peer  *ksnc_peer;   /* owning peer */
+       struct ksock_route *ksnc_route; /* owning route */
+       struct list_head          ksnc_list;     /* stash on peer's conn list */
+       socket_t       *ksnc_sock;       /* actual socket */
+       void           *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+       void           *ksnc_saved_write_space; /* socket's original write_space() callback */
+       atomic_t        ksnc_conn_refcount; /* conn refcount */
+       atomic_t        ksnc_sock_refcount; /* sock refcount */
+       ksock_sched_t      *ksnc_scheduler;  /* who schedules this connection */
+       __u32          ksnc_myipaddr;   /* my IP */
+       __u32          ksnc_ipaddr;     /* peer's IP */
+       int              ksnc_port;       /* peer's port */
+       signed int        ksnc_type:3;     /* type of connection,
+                                             * should be signed value */
+       unsigned int        ksnc_closing:1;  /* being shut down */
+       unsigned int        ksnc_flip:1;     /* flip or not, only for V2.x */
+       unsigned int        ksnc_zc_capable:1; /* enable to ZC */
+       struct ksock_proto *ksnc_proto;      /* protocol for the connection */
+
+       /* reader */
+       struct list_head  ksnc_rx_list;     /* where I enq waiting input or a forwarding descriptor */
+       cfs_time_t          ksnc_rx_deadline; /* when (in jiffies) receive times out */
+       __u8              ksnc_rx_started;  /* started receiving a message */
+       __u8              ksnc_rx_ready;    /* data ready to read */
+       __u8              ksnc_rx_scheduled;/* being progressed */
+       __u8              ksnc_rx_state;    /* what is being read */
+       int                ksnc_rx_nob_left; /* # bytes to next hdr/body */
+       int                ksnc_rx_nob_wanted; /* bytes actually wanted */
+       int                ksnc_rx_niov;     /* # iovec frags */
+       struct iovec     *ksnc_rx_iov;      /* the iovec frags */
+       int                ksnc_rx_nkiov;    /* # page frags */
+       lnet_kiov_t       *ksnc_rx_kiov;     /* the page frags */
+       ksock_rxiovspace_t    ksnc_rx_iov_space;/* space for frag descriptors */
+       __u32            ksnc_rx_csum;     /* partial checksum for incoming data */
+       void             *ksnc_cookie;      /* rx lnet_finalize passthru arg */
+       ksock_msg_t        ksnc_msg;     /* incoming message buffer:
+                                                * V2.x message takes the
+                                                * whole struct
+                                                * V1.x message is a bare
+                                                * lnet_hdr_t, it's stored in
+                                                * ksnc_msg.ksm_u.lnetmsg */
+
+       /* WRITER */
+       struct list_head            ksnc_tx_list;     /* where I enq waiting for output space */
+       struct list_head            ksnc_tx_queue;    /* packets waiting to be sent */
+       ksock_tx_t         *ksnc_tx_carrier;  /* next TX that can carry a LNet message or ZC-ACK */
+       cfs_time_t          ksnc_tx_deadline; /* when (in jiffies) tx times out */
+       int                ksnc_tx_bufnob;     /* send buffer marker */
+       atomic_t          ksnc_tx_nob;  /* # bytes queued */
+       int                ksnc_tx_ready;      /* write space */
+       int                ksnc_tx_scheduled;  /* being progressed */
+       cfs_time_t          ksnc_tx_last_post;  /* time stamp of the last posted TX */
+} ksock_conn_t;
+
+typedef struct ksock_route
+{
+       struct list_head            ksnr_list;  /* chain on peer route list */
+       struct list_head            ksnr_connd_list;  /* chain on ksnr_connd_routes */
+       struct ksock_peer    *ksnr_peer;        /* owning peer */
+       atomic_t          ksnr_refcount;    /* # users */
+       cfs_time_t          ksnr_timeout;     /* when (in jiffies) reconnection can happen next */
+       cfs_duration_t  ksnr_retry_interval; /* how long between retries */
+       __u32            ksnr_myipaddr;    /* my IP */
+       __u32            ksnr_ipaddr;      /* IP address to connect to */
+       int                ksnr_port;   /* port to connect to */
+       unsigned int      ksnr_scheduled:1; /* scheduled for attention */
+       unsigned int      ksnr_connecting:1;/* connection establishment in progress */
+       unsigned int      ksnr_connected:4; /* connections established by type */
+       unsigned int      ksnr_deleted:1;   /* been removed from peer? */
+       unsigned int      ksnr_share_count; /* created explicitly? */
+       int                ksnr_conn_count;  /* # conns established by this route */
+} ksock_route_t;
+
+#define SOCKNAL_KEEPALIVE_PING   1       /* cookie for keepalive ping */
+
+typedef struct ksock_peer
+{
+       struct list_head            ksnp_list;  /* stash on global peer list */
+       cfs_time_t          ksnp_last_alive;  /* when (in jiffies) I was last alive */
+       lnet_process_id_t     ksnp_id;       /* who's on the other end(s) */
+       atomic_t          ksnp_refcount; /* # users */
+       int                ksnp_sharecount;  /* lconf usage counter */
+       int                ksnp_closing;  /* being closed */
+       int                ksnp_accepting;/* # passive connections pending */
+       int                ksnp_error;    /* errno on closing last conn */
+       __u64            ksnp_zc_next_cookie;/* ZC completion cookie */
+       __u64            ksnp_incarnation;   /* latest known peer incarnation */
+       struct ksock_proto   *ksnp_proto;    /* latest known peer protocol */
+       struct list_head            ksnp_conns;    /* all active connections */
+       struct list_head            ksnp_routes;   /* routes */
+       struct list_head            ksnp_tx_queue; /* waiting packets */
+       spinlock_t            ksnp_lock;        /* serialize, g_lock unsafe */
+       struct list_head            ksnp_zc_req_list;   /* zero copy requests wait for ACK  */
+       cfs_time_t          ksnp_send_keepalive; /* time to send keepalive */
+       lnet_ni_t           *ksnp_ni;       /* which network */
+       int                ksnp_n_passive_ips; /* # of... */
+       __u32            ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
+} ksock_peer_t;
+
+typedef struct ksock_connreq
+{
+       struct list_head            ksncr_list;     /* stash on ksnd_connd_connreqs */
+       lnet_ni_t           *ksncr_ni;       /* chosen NI */
+       socket_t         *ksncr_sock;     /* accepted socket */
+} ksock_connreq_t;
+
+extern ksock_nal_data_t ksocknal_data;
+extern ksock_tunables_t ksocknal_tunables;
+
+#define SOCKNAL_MATCH_NO       0       /* TX can't match type of connection */
+#define SOCKNAL_MATCH_YES       1      /* TX matches type of connection */
+#define SOCKNAL_MATCH_MAY       2      /* TX can be sent on the connection, but not preferred */
+
+typedef struct ksock_proto
+{
+       int        pro_version;                                       /* version number of protocol */
+       int      (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *);     /* handshake function */
+       int      (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */
+       void    (*pro_pack)(ksock_tx_t *);                                /* message pack */
+       void    (*pro_unpack)(ksock_msg_t *);                          /* message unpack */
+       ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *);    /* queue tx on the connection */
+       int      (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */
+       int      (*pro_handle_zcreq)(ksock_conn_t *, __u64, int);           /* handle ZC request */
+       int      (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64);       /* handle ZC ACK */
+       int      (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int);     /* msg type matches the connection type:
+                                                                                * return value:
+                                                                                *   return MATCH_NO  : no
+                                                                                *   return MATCH_YES : matching type
+                                                                                *   return MATCH_MAY : can be backup */
+} ksock_proto_t;
+
+extern ksock_proto_t ksocknal_protocol_v1x;
+extern ksock_proto_t ksocknal_protocol_v2x;
+extern ksock_proto_t ksocknal_protocol_v3x;
+
+#define KSOCK_PROTO_V1_MAJOR    LNET_PROTO_TCP_VERSION_MAJOR
+#define KSOCK_PROTO_V1_MINOR    LNET_PROTO_TCP_VERSION_MINOR
+#define KSOCK_PROTO_V1   KSOCK_PROTO_V1_MAJOR
+
+#ifndef CPU_MASK_NONE
+#define CPU_MASK_NONE   0UL
+#endif
+
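+/* Connection types a route should establish: a single ANY connection, or one
+ * each of CONTROL, BULK_IN and BULK_OUT when typed connections are enabled. */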
+static inline int
+ksocknal_route_mask(void)
+{
+       if (!*ksocknal_tunables.ksnd_typed_conns)
+               return (1 << SOCKLND_CONN_ANY);
+
+       return ((1 << SOCKLND_CONN_CONTROL) |
+               (1 << SOCKLND_CONN_BULK_IN) |
+               (1 << SOCKLND_CONN_BULK_OUT));
+}
+
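+/* Hash a peer NID onto one of the ksnd_peers hash chains. */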
+static inline struct list_head *
+ksocknal_nid2peerlist (lnet_nid_t nid)
+{
+       unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
+
+       return (&ksocknal_data.ksnd_peers [hash]);
+}
+
+static inline void
+ksocknal_conn_addref (ksock_conn_t *conn)
+{
+       LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+       atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn);
+extern void ksocknal_finalize_zcreq(ksock_conn_t *conn);
+
+static inline void
+ksocknal_conn_decref (ksock_conn_t *conn)
+{
+       LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+       if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
+               ksocknal_queue_zombie_conn(conn);
+}
+
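+/* Take a reference on the connection's socket; fails with -ESHUTDOWN once the
+ * connection has started closing. */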
+static inline int
+ksocknal_connsock_addref (ksock_conn_t *conn)
+{
+       int   rc = -ESHUTDOWN;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+       if (!conn->ksnc_closing) {
+               LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+               atomic_inc(&conn->ksnc_sock_refcount);
+               rc = 0;
+       }
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       return (rc);
+}
+
+static inline void
+ksocknal_connsock_decref (ksock_conn_t *conn)
+{
+       LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+       if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
+               LASSERT (conn->ksnc_closing);
+               libcfs_sock_release(conn->ksnc_sock);
+               conn->ksnc_sock = NULL;
+               ksocknal_finalize_zcreq(conn);
+       }
+}
+
+static inline void
+ksocknal_tx_addref (ksock_tx_t *tx)
+{
+       LASSERT (atomic_read(&tx->tx_refcount) > 0);
+       atomic_inc(&tx->tx_refcount);
+}
+
+extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx);
+extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx);
+
+static inline void
+ksocknal_tx_decref (ksock_tx_t *tx)
+{
+       LASSERT (atomic_read(&tx->tx_refcount) > 0);
+       if (atomic_dec_and_test(&tx->tx_refcount))
+               ksocknal_tx_done(NULL, tx);
+}
+
+static inline void
+ksocknal_route_addref (ksock_route_t *route)
+{
+       LASSERT (atomic_read(&route->ksnr_refcount) > 0);
+       atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route (ksock_route_t *route);
+
+static inline void
+ksocknal_route_decref (ksock_route_t *route)
+{
+       LASSERT (atomic_read (&route->ksnr_refcount) > 0);
+       if (atomic_dec_and_test(&route->ksnr_refcount))
+               ksocknal_destroy_route (route);
+}
+
+static inline void
+ksocknal_peer_addref (ksock_peer_t *peer)
+{
+       LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+       atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+
+static inline void
+ksocknal_peer_decref (ksock_peer_t *peer)
+{
+       LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+       if (atomic_dec_and_test(&peer->ksnp_refcount))
+               ksocknal_destroy_peer (peer);
+}
+
+int ksocknal_startup (lnet_ni_t *ni);
+void ksocknal_shutdown (lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                 int delayed, unsigned int niov,
+                 struct iovec *iov, lnet_kiov_t *kiov,
+                 unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, socket_t *sock);
+
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+                                socket_t *sock, int type);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern void ksocknal_terminate_conn (ksock_conn_t *conn);
+extern void ksocknal_destroy_conn (ksock_conn_t *conn);
+extern int  ksocknal_close_peer_conns_locked (ksock_peer_t *peer,
+                                             __u32 ipaddr, int why);
+extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr);
+extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer,
+                                              ksock_tx_t *tx, int nonblk);
+
+extern int  ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx,
+                                  lnet_process_id_t id);
+extern ksock_tx_t *ksocknal_alloc_tx(int type, int size);
+extern void ksocknal_free_tx (ksock_tx_t *tx);
+extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
+extern void ksocknal_next_tx_carrier(ksock_conn_t *conn);
+extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
+extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+                                 int error);
+extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
+extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
+extern void ksocknal_thread_fini (void);
+extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_connd (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                               lnet_nid_t peer_nid, ksock_hello_msg_t *hello);
+extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                               ksock_hello_msg_t *hello, lnet_process_id_t *id,
+                               __u64 *incarnation);
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+
+extern int ksocknal_lib_zc_capable(ksock_conn_t *conn);
+extern void ksocknal_lib_save_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(socket_t *sock,  ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
+extern int ksocknal_lib_setup_sock (socket_t *so);
+extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem,
+                                          int *rxmem, int *nagle);
+
+extern int ksocknal_tunables_init(void);
+extern void ksocknal_tunables_fini(void);
+extern int ksocknal_lib_tunables_init(void);
+extern void ksocknal_lib_tunables_fini(void);
+
+extern void ksocknal_lib_csum_tx(ksock_tx_t *tx);
+
+extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn);
+extern int ksocknal_lib_bind_thread_to_cpu(int id);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644 (file)
index 0000000..ad5e241
--- /dev/null
@@ -0,0 +1,2664 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+ksock_tx_t *
+ksocknal_alloc_tx(int type, int size)
+{
+       ksock_tx_t *tx = NULL;
+
+       if (type == KSOCK_MSG_NOOP) {
+               LASSERT(size == KSOCK_NOOP_TX_SIZE);
+
+               /* search for a noop tx in the free list */
+               spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+               if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+                       tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next,
+                                       ksock_tx_t, tx_list);
+                       LASSERT(tx->tx_desc_size == size);
+                       list_del(&tx->tx_list);
+               }
+
+               spin_unlock(&ksocknal_data.ksnd_tx_lock);
+       }
+
+       if (tx == NULL)
+               LIBCFS_ALLOC(tx, size);
+
+       if (tx == NULL)
+               return NULL;
+
+       atomic_set(&tx->tx_refcount, 1);
+       tx->tx_zc_aborted = 0;
+       tx->tx_zc_capable = 0;
+       tx->tx_zc_checked = 0;
+       tx->tx_desc_size  = size;
+
+       atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+
+       return tx;
+}
+
+ksock_tx_t *
+ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
+{
+       ksock_tx_t *tx;
+
+       tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
+       if (tx == NULL) {
+               CERROR("Can't allocate noop tx desc\n");
+               return NULL;
+       }
+
+       tx->tx_conn     = NULL;
+       tx->tx_lnetmsg  = NULL;
+       tx->tx_kiov     = NULL;
+       tx->tx_nkiov    = 0;
+       tx->tx_iov      = tx->tx_frags.virt.iov;
+       tx->tx_niov     = 1;
+       tx->tx_nonblk   = nonblk;
+
+       socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP);
+       tx->tx_msg.ksm_zc_cookies[1] = cookie;
+
+       return tx;
+}
+
+
+void
+ksocknal_free_tx (ksock_tx_t *tx)
+{
+       atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+
+       if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) {
+               /* it's a noop tx */
+               spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+               list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
+
+               spin_unlock(&ksocknal_data.ksnd_tx_lock);
+       } else {
+               LIBCFS_FREE(tx, tx->tx_desc_size);
+       }
+}
+
+int
+ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       struct iovec  *iov = tx->tx_iov;
+       int    nob;
+       int    rc;
+
+       LASSERT (tx->tx_niov > 0);
+
+       /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
+       rc = ksocknal_lib_send_iov(conn, tx);
+
+       if (rc <= 0)                        /* sent nothing? */
+               return (rc);
+
+       nob = rc;
+       LASSERT (nob <= tx->tx_resid);
+       tx->tx_resid -= nob;
+
+       /* "consume" iov */
+       do {
+               LASSERT (tx->tx_niov > 0);
+
+               if (nob < (int) iov->iov_len) {
+                       iov->iov_base = (void *)((char *)iov->iov_base + nob);
+                       iov->iov_len -= nob;
+                       return (rc);
+               }
+
+               nob -= iov->iov_len;
+               tx->tx_iov = ++iov;
+               tx->tx_niov--;
+       } while (nob != 0);
+
+       return (rc);
+}
+
+int
+ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       lnet_kiov_t    *kiov = tx->tx_kiov;
+       int     nob;
+       int     rc;
+
+       LASSERT (tx->tx_niov == 0);
+       LASSERT (tx->tx_nkiov > 0);
+
+       /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
+       rc = ksocknal_lib_send_kiov(conn, tx);
+
+       if (rc <= 0)                        /* sent nothing? */
+               return (rc);
+
+       nob = rc;
+       LASSERT (nob <= tx->tx_resid);
+       tx->tx_resid -= nob;
+
+       /* "consume" kiov */
+       do {
+               LASSERT(tx->tx_nkiov > 0);
+
+               if (nob < (int)kiov->kiov_len) {
+                       kiov->kiov_offset += nob;
+                       kiov->kiov_len -= nob;
+                       return rc;
+               }
+
+               nob -= (int)kiov->kiov_len;
+               tx->tx_kiov = ++kiov;
+               tx->tx_nkiov--;
+       } while (nob != 0);
+
+       return (rc);
+}
+
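+/* Write as much of tx as the socket will accept: returns 0 once the whole tx
+ * has been sent, -EAGAIN if the socket is full, or a negative errno on error. */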
+int
+ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       int      rc;
+       int      bufnob;
+
+       if (ksocknal_data.ksnd_stall_tx != 0) {
+               cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+       }
+
+       LASSERT (tx->tx_resid != 0);
+
+       rc = ksocknal_connsock_addref(conn);
+       if (rc != 0) {
+               LASSERT (conn->ksnc_closing);
+               return (-ESHUTDOWN);
+       }
+
+       do {
+               if (ksocknal_data.ksnd_enomem_tx > 0) {
+                       /* testing... */
+                       ksocknal_data.ksnd_enomem_tx--;
+                       rc = -EAGAIN;
+               } else if (tx->tx_niov != 0) {
+                       rc = ksocknal_send_iov (conn, tx);
+               } else {
+                       rc = ksocknal_send_kiov (conn, tx);
+               }
+
+               bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+               if (rc > 0)                  /* sent something? */
+                       conn->ksnc_tx_bufnob += rc; /* account it */
+
+               if (bufnob < conn->ksnc_tx_bufnob) {
+                       /* allocated send buffer bytes < computed; infer
+                        * something got ACKed */
+                       conn->ksnc_tx_deadline =
+                               cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+                       conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+                       conn->ksnc_tx_bufnob = bufnob;
+                       mb();
+               }
+
+               if (rc <= 0) { /* Didn't write anything? */
+
+                       if (rc == 0) /* some stacks return 0 instead of -EAGAIN */
+                               rc = -EAGAIN;
+
+                       /* Check if EAGAIN is due to memory pressure */
+                       if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
+                               rc = -ENOMEM;
+
+                       break;
+               }
+
+               /* socket's wmem_queued now includes 'rc' bytes */
+               atomic_sub (rc, &conn->ksnc_tx_nob);
+               rc = 0;
+
+       } while (tx->tx_resid != 0);
+
+       ksocknal_connsock_decref(conn);
+       return (rc);
+}
+
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+       struct iovec *iov = conn->ksnc_rx_iov;
+       int     nob;
+       int     rc;
+
+       LASSERT (conn->ksnc_rx_niov > 0);
+
+       /* Never touch conn->ksnc_rx_iov or change connection
+        * status inside ksocknal_lib_recv_iov */
+       rc = ksocknal_lib_recv_iov(conn);
+
+       if (rc <= 0)
+               return (rc);
+
+       /* received something... */
+       nob = rc;
+
+       conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+       conn->ksnc_rx_deadline =
+               cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+       mb();                  /* order with setting rx_started */
+       conn->ksnc_rx_started = 1;
+
+       conn->ksnc_rx_nob_wanted -= nob;
+       conn->ksnc_rx_nob_left -= nob;
+
+       do {
+               LASSERT (conn->ksnc_rx_niov > 0);
+
+               if (nob < (int)iov->iov_len) {
+                       iov->iov_len -= nob;
+                       iov->iov_base = (void *)((char *)iov->iov_base + nob);
+                       return (-EAGAIN);
+               }
+
+               nob -= iov->iov_len;
+               conn->ksnc_rx_iov = ++iov;
+               conn->ksnc_rx_niov--;
+       } while (nob != 0);
+
+       return (rc);
+}
+
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+       lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+       int     nob;
+       int     rc;
+       LASSERT (conn->ksnc_rx_nkiov > 0);
+
+       /* Never touch conn->ksnc_rx_kiov or change connection
+        * status inside ksocknal_lib_recv_kiov */
+       rc = ksocknal_lib_recv_kiov(conn);
+
+       if (rc <= 0)
+               return (rc);
+
+       /* received something... */
+       nob = rc;
+
+       conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+       conn->ksnc_rx_deadline =
+               cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+       mb();                  /* order with setting rx_started */
+       conn->ksnc_rx_started = 1;
+
+       conn->ksnc_rx_nob_wanted -= nob;
+       conn->ksnc_rx_nob_left -= nob;
+
+       do {
+               LASSERT (conn->ksnc_rx_nkiov > 0);
+
+               if (nob < (int) kiov->kiov_len) {
+                       kiov->kiov_offset += nob;
+                       kiov->kiov_len -= nob;
+                       return -EAGAIN;
+               }
+
+               nob -= kiov->kiov_len;
+               conn->ksnc_rx_kiov = ++kiov;
+               conn->ksnc_rx_nkiov--;
+       } while (nob != 0);
+
+       return 1;
+}
+
+int
+ksocknal_receive (ksock_conn_t *conn)
+{
+       /* Return 1 on success, 0 on EOF, < 0 on error.
+        * Caller checks ksnc_rx_nob_wanted to determine
+        * progress/completion. */
+       int     rc;
+       ENTRY;
+
+       if (ksocknal_data.ksnd_stall_rx != 0) {
+               cfs_pause(cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
+       }
+
+       rc = ksocknal_connsock_addref(conn);
+       if (rc != 0) {
+               LASSERT (conn->ksnc_closing);
+               return (-ESHUTDOWN);
+       }
+
+       for (;;) {
+               if (conn->ksnc_rx_niov != 0)
+                       rc = ksocknal_recv_iov (conn);
+               else
+                       rc = ksocknal_recv_kiov (conn);
+
+               if (rc <= 0) {
+                       /* error/EOF or partial receive */
+                       if (rc == -EAGAIN) {
+                               rc = 1;
+                       } else if (rc == 0 && conn->ksnc_rx_started) {
+                               /* EOF in the middle of a message */
+                               rc = -EPROTO;
+                       }
+                       break;
+               }
+
+               /* Completed a fragment */
+
+               if (conn->ksnc_rx_nob_wanted == 0) {
+                       rc = 1;
+                       break;
+               }
+       }
+
+       ksocknal_connsock_decref(conn);
+       RETURN (rc);
+}
+
+void
+ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx)
+{
+       lnet_msg_t  *lnetmsg = tx->tx_lnetmsg;
+       int       rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO;
+       ENTRY;
+
+       LASSERT(ni != NULL || tx->tx_conn != NULL);
+
+       if (tx->tx_conn != NULL)
+               ksocknal_conn_decref(tx->tx_conn);
+
+       if (ni == NULL && tx->tx_conn != NULL)
+               ni = tx->tx_conn->ksnc_peer->ksnp_ni;
+
+       ksocknal_free_tx (tx);
+       if (lnetmsg != NULL) /* a KSOCK_MSG_NOOP goes without an lnetmsg */
+               lnet_finalize (ni, lnetmsg, rc);
+
+       EXIT;
+}
+
+void
+ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error)
+{
+       ksock_tx_t *tx;
+
+       while (!list_empty (txlist)) {
+               tx = list_entry (txlist->next, ksock_tx_t, tx_list);
+
+               if (error && tx->tx_lnetmsg != NULL) {
+                       CNETERR("Deleting packet type %d len %d %s->%s\n",
+                               le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
+                               le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
+                               libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+                               libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
+               } else if (error) {
+                       CNETERR("Deleting noop packet\n");
+               }
+
+               list_del (&tx->tx_list);
+
+               LASSERT (atomic_read(&tx->tx_refcount) == 1);
+               ksocknal_tx_done (ni, tx);
+       }
+}
+
+static void
+ksocknal_check_zc_req(ksock_tx_t *tx)
+{
+       ksock_conn_t   *conn = tx->tx_conn;
+       ksock_peer_t   *peer = conn->ksnc_peer;
+
+       /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
+        * to ksnp_zc_req_list if some fragment of this message should be sent
+        * zero-copy.  Our peer will send an ACK containing this cookie when
+        * she has received this message to tell us we can signal completion.
+        * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
+        * ksnp_zc_req_list. */
+       LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+       LASSERT (tx->tx_zc_capable);
+
+       tx->tx_zc_checked = 1;
+
+       if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
+           !conn->ksnc_zc_capable)
+               return;
+
+       /* assign cookie and queue tx to pending list, it will be released when
+        * a matching ack is received. See ksocknal_handle_zcack() */
+
+       ksocknal_tx_addref(tx);
+
+       spin_lock(&peer->ksnp_lock);
+
+       /* ZC_REQ is going to be pinned to the peer */
+       tx->tx_deadline =
+               cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+       LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+
+       tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+
+       if (peer->ksnp_zc_next_cookie == 0)
+               peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+       list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+
+       spin_unlock(&peer->ksnp_lock);
+}
+
+static void
+ksocknal_uncheck_zc_req(ksock_tx_t *tx)
+{
+       ksock_peer_t   *peer = tx->tx_conn->ksnc_peer;
+
+       LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+       LASSERT(tx->tx_zc_capable);
+
+       tx->tx_zc_checked = 0;
+
+       spin_lock(&peer->ksnp_lock);
+
+       if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+               /* Not waiting for an ACK */
+               spin_unlock(&peer->ksnp_lock);
+               return;
+       }
+
+       tx->tx_msg.ksm_zc_cookies[0] = 0;
+       list_del(&tx->tx_zc_list);
+
+       spin_unlock(&peer->ksnp_lock);
+
+       ksocknal_tx_decref(tx);
+}
+
+int
+ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       int         rc;
+
+       if (tx->tx_zc_capable && !tx->tx_zc_checked)
+               ksocknal_check_zc_req(tx);
+
+       rc = ksocknal_transmit (conn, tx);
+
+       CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
+
+       if (tx->tx_resid == 0) {
+               /* Sent everything OK */
+               LASSERT (rc == 0);
+
+               return (0);
+       }
+
+       if (rc == -EAGAIN)
+               return (rc);
+
+       if (rc == -ENOMEM) {
+               static int counter;
+
+               counter++;   /* warn with exponential backoff */
+               if ((counter & (-counter)) == counter) /* counter is a power of 2 */
+                       CWARN("%u ENOMEM tx %p (%u allocated)\n",
+                             counter, conn, atomic_read(&libcfs_kmemory));
+
+               /* Queue on ksnd_enomem_conns for retry after a timeout */
+               spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+               /* enomem list takes over scheduler's ref... */
+               LASSERT (conn->ksnc_tx_scheduled);
+               list_add_tail(&conn->ksnc_tx_list,
+                                 &ksocknal_data.ksnd_enomem_conns);
+               if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+                                                  SOCKNAL_ENOMEM_RETRY),
+                                  ksocknal_data.ksnd_reaper_waketime))
+                       wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+               spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+               return (rc);
+       }
+
+       /* Actual error */
+       LASSERT (rc < 0);
+
+       if (!conn->ksnc_closing) {
+               switch (rc) {
+               case -ECONNRESET:
+                       LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+                                     "while we were sending data; it may have "
+                                     "rebooted.\n",
+                                     HIPQUAD(conn->ksnc_ipaddr));
+                       break;
+               default:
+                       LCONSOLE_WARN("There was an unexpected network error "
+                                     "while writing to %u.%u.%u.%u: %d.\n",
+                                     HIPQUAD(conn->ksnc_ipaddr), rc);
+                       break;
+               }
+               CDEBUG(D_NET, "[%p] Error %d on write to %s"
+                      " ip %d.%d.%d.%d:%d\n", conn, rc,
+                      libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                      HIPQUAD(conn->ksnc_ipaddr),
+                      conn->ksnc_port);
+       }
+
+       if (tx->tx_zc_checked)
+               ksocknal_uncheck_zc_req(tx);
+
+       /* it's not an error if conn is being closed */
+       ksocknal_close_conn_and_siblings (conn,
+                                         (conn->ksnc_closing) ? 0 : rc);
+
+       return (rc);
+}
+
+void
+ksocknal_launch_connection_locked (ksock_route_t *route)
+{
+
+       /* called holding write lock on ksnd_global_lock */
+
+       LASSERT (!route->ksnr_scheduled);
+       LASSERT (!route->ksnr_connecting);
+       LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0);
+
+       route->ksnr_scheduled = 1;            /* scheduling conn for connd */
+       ksocknal_route_addref(route);      /* extra ref for connd */
+
+       spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+       list_add_tail(&route->ksnr_connd_list,
+                         &ksocknal_data.ksnd_connd_routes);
+       wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+       spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+}
+
+void
+ksocknal_launch_all_connections_locked (ksock_peer_t *peer)
+{
+       ksock_route_t *route;
+
+       /* called holding write lock on ksnd_global_lock */
+       for (;;) {
+               /* launch any/all connections that need it */
+               route = ksocknal_find_connectable_route_locked(peer);
+               if (route == NULL)
+                       return;
+
+               ksocknal_launch_connection_locked(route);
+       }
+}
+
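+/* Pick the connection to send tx on: prefer a typed match with the fewest
+ * bytes already queued, falling back to a SOCKNAL_MATCH_MAY connection; with
+ * round-robin enabled, ties go to the least recently used connection. */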
+ksock_conn_t *
+ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
+{
+       struct list_head       *tmp;
+       ksock_conn_t     *conn;
+       ksock_conn_t     *typed = NULL;
+       ksock_conn_t     *fallback = NULL;
+       int            tnob     = 0;
+       int            fnob     = 0;
+
+       list_for_each (tmp, &peer->ksnp_conns) {
+               ksock_conn_t *c  = list_entry(tmp, ksock_conn_t, ksnc_list);
+               int        nob = atomic_read(&c->ksnc_tx_nob) +
+                                   cfs_sock_wmem_queued(c->ksnc_sock);
+               int        rc;
+
+               LASSERT (!c->ksnc_closing);
+               LASSERT (c->ksnc_proto != NULL &&
+                        c->ksnc_proto->pro_match_tx != NULL);
+
+               rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
+
+               switch (rc) {
+               default:
+                       LBUG();
+               case SOCKNAL_MATCH_NO: /* protocol rejected the tx */
+                       continue;
+
+               case SOCKNAL_MATCH_YES: /* typed connection */
+                       if (typed == NULL || tnob > nob ||
+                           (tnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+                            cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+                               typed = c;
+                               tnob  = nob;
+                       }
+                       break;
+
+               case SOCKNAL_MATCH_MAY: /* fallback connection */
+                       if (fallback == NULL || fnob > nob ||
+                           (fnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+                            cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+                               fallback = c;
+                               fnob     = nob;
+                       }
+                       break;
+               }
+       }
+
+       /* prefer the typed selection */
+       conn = (typed != NULL) ? typed : fallback;
+
+       if (conn != NULL)
+               conn->ksnc_tx_last_post = cfs_time_current();
+
+       return conn;
+}
+
+void
+ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       conn->ksnc_proto->pro_pack(tx);
+
+       atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
+       ksocknal_conn_addref(conn); /* +1 ref for tx */
+       tx->tx_conn = conn;
+}
+
+void
+ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
+{
+       ksock_sched_t *sched = conn->ksnc_scheduler;
+       ksock_msg_t   *msg = &tx->tx_msg;
+       ksock_tx_t    *ztx = NULL;
+       int         bufnob = 0;
+
+       /* called holding global lock (read or irq-write) and caller may
+        * not have dropped this lock between finding conn and calling me,
+        * so we don't need the {get,put}connsock dance to deref
+        * ksnc_sock... */
+       LASSERT(!conn->ksnc_closing);
+
+       CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n",
+               libcfs_id2str(conn->ksnc_peer->ksnp_id),
+               HIPQUAD(conn->ksnc_ipaddr),
+               conn->ksnc_port);
+
+       ksocknal_tx_prep(conn, tx);
+
+       /* Ensure the frags we've been given EXACTLY match the number of
+        * bytes we want to send.  Many TCP/IP stacks disregard any total
+        * size parameters passed to them and just look at the frags.
+        *
+        * We always expect at least 1 mapped fragment containing the
+        * complete ksocknal message header. */
+       LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+                lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+                (unsigned int)tx->tx_nob);
+       LASSERT (tx->tx_niov >= 1);
+       LASSERT (tx->tx_resid == tx->tx_nob);
+
+       CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+               tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type:
+                                              KSOCK_MSG_NOOP,
+               tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+       /*
+        * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
+        * but they're used inside spinlocks a lot.
+        */
+       bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+       spin_lock_bh(&sched->kss_lock);
+
+       if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
+               /* First packet starts the timeout */
+               conn->ksnc_tx_deadline =
+                       cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+               if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
+                       conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+               conn->ksnc_tx_bufnob = 0;
+               mb(); /* order with adding to tx_queue */
+       }
+
+       if (msg->ksm_type == KSOCK_MSG_NOOP) {
+               /* The packet is a noop ZC ACK: try to piggyback the ack_cookie
+                * on a normal packet so I don't need to send it separately */
+               LASSERT (msg->ksm_zc_cookies[1] != 0);
+               LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+               if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+                       ztx = tx; /* ZC ACK piggybacked on ztx release tx later */
+
+       } else {
+               /* It's a normal packet - can it piggyback a noop zc-ack that
+                * has been queued already? */
+               LASSERT (msg->ksm_zc_cookies[1] == 0);
+               LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+               ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+               /* ztx will be released later */
+       }
+
+       if (ztx != NULL) {
+               atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+               list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+       }
+
+       if (conn->ksnc_tx_ready &&      /* able to send */
+           !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+               /* +1 ref for scheduler */
+               ksocknal_conn_addref(conn);
+               list_add_tail (&conn->ksnc_tx_list,
+                                  &sched->kss_tx_conns);
+               conn->ksnc_tx_scheduled = 1;
+               wake_up (&sched->kss_waitq);
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+}
+
+
+ksock_route_t *
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+{
+       cfs_time_t     now = cfs_time_current();
+       struct list_head    *tmp;
+       ksock_route_t *route;
+
+       list_for_each (tmp, &peer->ksnp_routes) {
+               route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+               LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+               if (route->ksnr_scheduled)      /* connections being established */
+                       continue;
+
+               /* all route types connected ? */
+               if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
+                       continue;
+
+               if (!(route->ksnr_retry_interval == 0 || /* first attempt */
+                     cfs_time_aftereq(now, route->ksnr_timeout))) {
+                       CDEBUG(D_NET,
+                              "Too soon to retry route %u.%u.%u.%u "
+                              "(cnted %d, interval %ld, %ld secs later)\n",
+                              HIPQUAD(route->ksnr_ipaddr),
+                              route->ksnr_connected,
+                              route->ksnr_retry_interval,
+                              cfs_duration_sec(route->ksnr_timeout - now));
+                       continue;
+               }
+
+               return (route);
+       }
+
+       return (NULL);
+}
+
+ksock_route_t *
+ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
+{
+       struct list_head        *tmp;
+       ksock_route_t     *route;
+
+       list_for_each (tmp, &peer->ksnp_routes) {
+               route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+               LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+               if (route->ksnr_scheduled)
+                       return (route);
+       }
+
+       return (NULL);
+}
+
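+/*
+ * Queue 'tx' for 'id': use an existing connection if one is ready, otherwise
+ * create the peer if needed, launch connection attempts and park the tx on
+ * the peer's pending queue until a connection is established.  Returns 0 on
+ * success or a negative errno if the tx cannot be launched.
+ */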
+int
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
+{
+       ksock_peer_t     *peer;
+       ksock_conn_t     *conn;
+       rwlock_t     *g_lock;
+       int            retry;
+       int            rc;
+
+       LASSERT (tx->tx_conn == NULL);
+
+       g_lock = &ksocknal_data.ksnd_global_lock;
+
+       for (retry = 0;; retry = 1) {
+               read_lock(g_lock);
+               peer = ksocknal_find_peer_locked(ni, id);
+               if (peer != NULL) {
+                       if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+                               conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+                               if (conn != NULL) {
+                                       /* I've got no routes that need
+                                        * connecting and I do have an actual
+                                        * connection... */
+                                       ksocknal_queue_tx_locked (tx, conn);
+                                       read_unlock(g_lock);
+                                       return (0);
+                               }
+                       }
+               }
+
+               /* I'll need a write lock... */
+               read_unlock(g_lock);
+
+               write_lock_bh(g_lock);
+
+               peer = ksocknal_find_peer_locked(ni, id);
+               if (peer != NULL)
+                       break;
+
+               write_unlock_bh(g_lock);
+
+               if ((id.pid & LNET_PID_USERFLAG) != 0) {
+                       CERROR("Refusing to create a connection to "
+                              "userspace process %s\n", libcfs_id2str(id));
+                       return -EHOSTUNREACH;
+               }
+
+               if (retry) {
+                       CERROR("Can't find peer %s\n", libcfs_id2str(id));
+                       return -EHOSTUNREACH;
+               }
+
+               rc = ksocknal_add_peer(ni, id,
+                                      LNET_NIDADDR(id.nid),
+                                      lnet_acceptor_port());
+               if (rc != 0) {
+                       CERROR("Can't add peer %s: %d\n",
+                              libcfs_id2str(id), rc);
+                       return rc;
+               }
+       }
+
+       ksocknal_launch_all_connections_locked(peer);
+
+       conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+       if (conn != NULL) {
+               /* Connection exists; queue message on it */
+               ksocknal_queue_tx_locked (tx, conn);
+               write_unlock_bh(g_lock);
+               return (0);
+       }
+
+       if (peer->ksnp_accepting > 0 ||
+           ksocknal_find_connecting_route_locked (peer) != NULL) {
+               /* the message is going to be pinned to the peer */
+               tx->tx_deadline =
+                       cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+               /* Queue the message until a connection is established */
+               list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
+               write_unlock_bh(g_lock);
+               return 0;
+       }
+
+       write_unlock_bh(g_lock);
+
+       /* NB Routes may be ignored if connections to them failed recently */
+       CNETERR("No usable routes to %s\n", libcfs_id2str(id));
+       return (-EHOSTUNREACH);
+}
+
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+       int            mpflag = 0;
+       int            type = lntmsg->msg_type;
+       lnet_process_id_t target = lntmsg->msg_target;
+       unsigned int      payload_niov = lntmsg->msg_niov;
+       struct iovec     *payload_iov = lntmsg->msg_iov;
+       lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+       unsigned int      payload_offset = lntmsg->msg_offset;
+       unsigned int      payload_nob = lntmsg->msg_len;
+       ksock_tx_t       *tx;
+       int            desc_size;
+       int            rc;
+
+       /* NB 'private' is different depending on what we're sending.
+        * Just ignore it... */
+
+       CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+              payload_nob, payload_niov, libcfs_id2str(target));
+
+       LASSERT (payload_nob == 0 || payload_niov > 0);
+       LASSERT (payload_niov <= LNET_MAX_IOV);
+       /* payload is either all vaddrs or all pages */
+       LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+       LASSERT (!in_interrupt ());
+
+       if (payload_iov != NULL)
+               desc_size = offsetof(ksock_tx_t,
+                                    tx_frags.virt.iov[1 + payload_niov]);
+       else
+               desc_size = offsetof(ksock_tx_t,
+                                    tx_frags.paged.kiov[payload_niov]);
+
+       if (lntmsg->msg_vmflush)
+               mpflag = cfs_memory_pressure_get_and_set();
+       tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
+       if (tx == NULL) {
+               CERROR("Can't allocate tx desc type %d size %d\n",
+                      type, desc_size);
+               if (lntmsg->msg_vmflush)
+                       cfs_memory_pressure_restore(mpflag);
+               return (-ENOMEM);
+       }
+
+       tx->tx_conn = NULL;                  /* set when assigned a conn */
+       tx->tx_lnetmsg = lntmsg;
+
+       if (payload_iov != NULL) {
+               tx->tx_kiov = NULL;
+               tx->tx_nkiov = 0;
+               tx->tx_iov = tx->tx_frags.virt.iov;
+               tx->tx_niov = 1 +
+                             lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+                                              payload_niov, payload_iov,
+                                              payload_offset, payload_nob);
+       } else {
+               tx->tx_niov = 1;
+               tx->tx_iov = &tx->tx_frags.paged.iov;
+               tx->tx_kiov = tx->tx_frags.paged.kiov;
+               tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+                                                payload_niov, payload_kiov,
+                                                payload_offset, payload_nob);
+
+               if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
+                       tx->tx_zc_capable = 1;
+       }
+
+       socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET);
+
+       /* The first fragment will be set later in pro_pack */
+       rc = ksocknal_launch_packet(ni, tx, target);
+       if (lntmsg->msg_vmflush)
+               cfs_memory_pressure_restore(mpflag);
+       if (rc == 0)
+               return (0);
+
+       ksocknal_free_tx(tx);
+       return (-EIO);
+}
+
+int
+ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+       task_t *task = kthread_run(fn, arg, name);
+
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+       ksocknal_data.ksnd_nthreads++;
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+       return 0;
+}
+
+void
+ksocknal_thread_fini (void)
+{
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+       ksocknal_data.ksnd_nthreads--;
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+}
+
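+/*
+ * Prepare 'conn' to receive the next packet.  With nob_to_skip == 0 the rx
+ * iov is pointed at the message (or lnet) header for the negotiated protocol
+ * and 1 is returned; otherwise the rx iov is pointed at a static slop buffer
+ * to discard nob_to_skip bytes and 0 is returned.
+ */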
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+       static char ksocknal_slop_buffer[4096];
+
+       int         nob;
+       unsigned int   niov;
+       int         skipped;
+
+       LASSERT(conn->ksnc_proto != NULL);
+
+       if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) {
+               /* Remind the socket to ack eagerly... */
+               ksocknal_lib_eager_ack(conn);
+       }
+
+       if (nob_to_skip == 0) {  /* right at next packet boundary now */
+               conn->ksnc_rx_started = 0;
+               mb();                  /* racing with timeout thread */
+
+               switch (conn->ksnc_proto->pro_version) {
+               case  KSOCK_PROTO_V2:
+               case  KSOCK_PROTO_V3:
+                       conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
+                       conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                       conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg;
+
+                       conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u);
+                       conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u);
+                       conn->ksnc_rx_iov[0].iov_len  = offsetof(ksock_msg_t, ksm_u);
+                       break;
+
+               case KSOCK_PROTO_V1:
+                       /* Receiving bare lnet_hdr_t */
+                       conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+                       conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t);
+                       conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t);
+
+                       conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                       conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+                       conn->ksnc_rx_iov[0].iov_len  = sizeof (lnet_hdr_t);
+                       break;
+
+               default:
+                       LBUG ();
+               }
+               conn->ksnc_rx_niov = 1;
+
+               conn->ksnc_rx_kiov = NULL;
+               conn->ksnc_rx_nkiov = 0;
+               conn->ksnc_rx_csum = ~0;
+               return (1);
+       }
+
+       /* Set up to skip as much as possible now.  If there's more left
+        * (ran out of iov entries) we'll get called again */
+
+       conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+       conn->ksnc_rx_nob_left = nob_to_skip;
+       conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+       skipped = 0;
+       niov = 0;
+
+       do {
+               nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+               conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+               conn->ksnc_rx_iov[niov].iov_len  = nob;
+               niov++;
+               skipped += nob;
+               nob_to_skip -= nob;
+
+       } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+       conn->ksnc_rx_niov = niov;
+       conn->ksnc_rx_kiov = NULL;
+       conn->ksnc_rx_nkiov = 0;
+       conn->ksnc_rx_nob_wanted = skipped;
+       return (0);
+}
+
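+/*
+ * Receive state machine: read whatever the current rx state still wants,
+ * then advance from KSM header to lnet header to payload to slop.  Returns
+ * 0 when this packet is done (or handed to lnet_parse()), -EAGAIN on a
+ * short read, or a negative errno after closing the connection on error.
+ */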
+int
+ksocknal_process_receive (ksock_conn_t *conn)
+{
+       lnet_hdr_t      *lhdr;
+       lnet_process_id_t *id;
+       int             rc;
+
+       LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+       /* NB: sched lock NOT held */
+       /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */
+       LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+                conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+                conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
+                conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+       if (conn->ksnc_rx_nob_wanted != 0) {
+               rc = ksocknal_receive(conn);
+
+               if (rc <= 0) {
+                       LASSERT (rc != -EAGAIN);
+
+                       if (rc == 0)
+                               CDEBUG (D_NET, "[%p] EOF from %s"
+                                       " ip %d.%d.%d.%d:%d\n", conn,
+                                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+                       else if (!conn->ksnc_closing)
+                               CERROR ("[%p] Error %d on read from %s"
+                                       " ip %d.%d.%d.%d:%d\n",
+                                       conn, rc,
+                                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+
+                       /* it's not an error if conn is being closed */
+                       ksocknal_close_conn_and_siblings (conn,
+                                                         (conn->ksnc_closing) ? 0 : rc);
+                       return (rc == 0 ? -ESHUTDOWN : rc);
+               }
+
+               if (conn->ksnc_rx_nob_wanted != 0) {
+                       /* short read */
+                       return (-EAGAIN);
+               }
+       }
+       switch (conn->ksnc_rx_state) {
+       case SOCKNAL_RX_KSM_HEADER:
+               if (conn->ksnc_flip) {
+                       __swab32s(&conn->ksnc_msg.ksm_type);
+                       __swab32s(&conn->ksnc_msg.ksm_csum);
+                       __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
+                       __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
+               }
+
+               if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
+                   conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
+                       CERROR("%s: Unknown message type: %x\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              conn->ksnc_msg.ksm_type);
+                       ksocknal_new_packet(conn, 0);
+                       ksocknal_close_conn_and_siblings(conn, -EPROTO);
+                       return (-EPROTO);
+               }
+
+               if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
+                   conn->ksnc_msg.ksm_csum != 0 &&     /* has checksum */
+                   conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+                       /* NOOP Checksum error */
+                       CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+                       ksocknal_new_packet(conn, 0);
+                       ksocknal_close_conn_and_siblings(conn, -EPROTO);
+                       return (-EIO);
+               }
+
+               if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) {
+                       __u64 cookie = 0;
+
+                       LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+                       if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
+                               cookie = conn->ksnc_msg.ksm_zc_cookies[0];
+
+                       rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
+                                              conn->ksnc_msg.ksm_zc_cookies[1]);
+
+                       if (rc != 0) {
+                               CERROR("%s: Unknown ZC-ACK cookie: "LPU64", "LPU64"\n",
+                                      libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                      cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
+                               ksocknal_new_packet(conn, 0);
+                               ksocknal_close_conn_and_siblings(conn, -EPROTO);
+                               return (rc);
+                       }
+               }
+
+               if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
+                       ksocknal_new_packet (conn, 0);
+                       return 0;       /* NOOP is done and just return */
+               }
+
+               conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+               conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t);
+               conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t);
+
+               conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+               conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+               conn->ksnc_rx_iov[0].iov_len  = sizeof(ksock_lnet_msg_t);
+
+               conn->ksnc_rx_niov = 1;
+               conn->ksnc_rx_kiov = NULL;
+               conn->ksnc_rx_nkiov = 0;
+
+               goto again;     /* read lnet header now */
+
+       case SOCKNAL_RX_LNET_HEADER:
+               /* unpack message header */
+               conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
+
+               if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) {
+                       /* Userspace peer */
+                       lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+                       id   = &conn->ksnc_peer->ksnp_id;
+
+                       /* Substitute process ID assigned at connection time */
+                       lhdr->src_pid = cpu_to_le32(id->pid);
+                       lhdr->src_nid = cpu_to_le64(id->nid);
+               }
+
+               conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+               ksocknal_conn_addref(conn);     /* ++ref while parsing */
+
+               rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
+                               &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
+                               conn->ksnc_peer->ksnp_id.nid, conn, 0);
+               if (rc < 0) {
+                       /* I just received garbage: give up on this conn */
+                       ksocknal_new_packet(conn, 0);
+                       ksocknal_close_conn_and_siblings (conn, rc);
+                       ksocknal_conn_decref(conn);
+                       return (-EPROTO);
+               }
+
+               /* I'm racing with ksocknal_recv() */
+               LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+                        conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
+
+               if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
+                       return 0;
+
+               /* ksocknal_recv() got called */
+               goto again;
+
+       case SOCKNAL_RX_LNET_PAYLOAD:
+               /* payload all received */
+               rc = 0;
+
+               if (conn->ksnc_rx_nob_left == 0 &&   /* not truncating */
+                   conn->ksnc_msg.ksm_csum != 0 &&  /* has checksum */
+                   conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+                       CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+                       rc = -EIO;
+               }
+
+               if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) {
+                       LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+                       lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+                       id   = &conn->ksnc_peer->ksnp_id;
+
+                       rc = conn->ksnc_proto->pro_handle_zcreq(conn,
+                                       conn->ksnc_msg.ksm_zc_cookies[0],
+                                       *ksocknal_tunables.ksnd_nonblk_zcack ||
+                                       le64_to_cpu(lhdr->src_nid) != id->nid);
+               }
+
+               lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
+
+               if (rc != 0) {
+                       ksocknal_new_packet(conn, 0);
+                       ksocknal_close_conn_and_siblings (conn, rc);
+                       return (-EPROTO);
+               }
+               /* Fall through */
+
+       case SOCKNAL_RX_SLOP:
+               /* starting new packet? */
+               if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+                       return 0;       /* come back later */
+               goto again;          /* try to finish reading slop now */
+
+       default:
+               break;
+       }
+
+       /* Not Reached */
+       LBUG ();
+       return (-EINVAL);                      /* keep gcc happy */
+}
+
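+/*
+ * LND receive callback: lnet_parse() calls this to say where the payload
+ * should go.  'private' is the conn; set up its rx descriptors for 'mlen'
+ * bytes and reschedule the conn so the scheduler finishes the receive.
+ */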
+int
+ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+              unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+              unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       ksock_conn_t  *conn = (ksock_conn_t *)private;
+       ksock_sched_t *sched = conn->ksnc_scheduler;
+
+       LASSERT (mlen <= rlen);
+       LASSERT (niov <= LNET_MAX_IOV);
+
+       conn->ksnc_cookie = msg;
+       conn->ksnc_rx_nob_wanted = mlen;
+       conn->ksnc_rx_nob_left   = rlen;
+
+       if (mlen == 0 || iov != NULL) {
+               conn->ksnc_rx_nkiov = 0;
+               conn->ksnc_rx_kiov = NULL;
+               conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+               conn->ksnc_rx_niov =
+                       lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+                                        niov, iov, offset, mlen);
+       } else {
+               conn->ksnc_rx_niov = 0;
+               conn->ksnc_rx_iov  = NULL;
+               conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+               conn->ksnc_rx_nkiov =
+                       lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+                                         niov, kiov, offset, mlen);
+       }
+
+       LASSERT (mlen ==
+                lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+                lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+       LASSERT (conn->ksnc_rx_scheduled);
+
+       spin_lock_bh(&sched->kss_lock);
+
+       switch (conn->ksnc_rx_state) {
+       case SOCKNAL_RX_PARSE_WAIT:
+               list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+               wake_up (&sched->kss_waitq);
+               LASSERT (conn->ksnc_rx_ready);
+               break;
+
+       case SOCKNAL_RX_PARSE:
+               /* scheduler hasn't noticed I'm parsing yet */
+               break;
+       }
+
+       conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
+
+       spin_unlock_bh(&sched->kss_lock);
+       ksocknal_conn_decref(conn);
+       return 0;
+}
+
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+       int        rc;
+
+       spin_lock_bh(&sched->kss_lock);
+
+       rc = (!ksocknal_data.ksnd_shuttingdown &&
+             list_empty(&sched->kss_rx_conns) &&
+             list_empty(&sched->kss_tx_conns));
+
+       spin_unlock_bh(&sched->kss_lock);
+       return rc;
+}
+
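+/*
+ * Scheduler thread: bind to its CPT, then alternately service one conn from
+ * kss_rx_conns and one from kss_tx_conns per loop so neither side starves,
+ * sleeping when there is nothing to do and yielding every SOCKNAL_RESCHED
+ * loops.
+ */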
+int ksocknal_scheduler(void *arg)
+{
+       struct ksock_sched_info *info;
+       ksock_sched_t           *sched;
+       ksock_conn_t            *conn;
+       ksock_tx_t              *tx;
+       int                     rc;
+       int                     nloops = 0;
+       long                    id = (long)arg;
+
+       info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
+       sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+       cfs_block_allsigs();
+
+       rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
+       if (rc != 0) {
+               CERROR("Can't set CPT affinity to %d: %d\n",
+                      info->ksi_cpt, rc);
+       }
+
+       spin_lock_bh(&sched->kss_lock);
+
+       while (!ksocknal_data.ksnd_shuttingdown) {
+               int did_something = 0;
+
+               /* Ensure I progress everything semi-fairly */
+
+               if (!list_empty (&sched->kss_rx_conns)) {
+                       conn = list_entry(sched->kss_rx_conns.next,
+                                             ksock_conn_t, ksnc_rx_list);
+                       list_del(&conn->ksnc_rx_list);
+
+                       LASSERT(conn->ksnc_rx_scheduled);
+                       LASSERT(conn->ksnc_rx_ready);
+
+                       /* clear rx_ready in case receive isn't complete.
+                        * Do it BEFORE we call process_recv, since
+                        * data_ready can set it any time after we release
+                        * kss_lock. */
+                       conn->ksnc_rx_ready = 0;
+                       spin_unlock_bh(&sched->kss_lock);
+
+                       rc = ksocknal_process_receive(conn);
+
+                       spin_lock_bh(&sched->kss_lock);
+
+                       /* I'm the only one that can clear this flag */
+                       LASSERT(conn->ksnc_rx_scheduled);
+
+                       /* Did process_receive get everything it wanted? */
+                       if (rc == 0)
+                               conn->ksnc_rx_ready = 1;
+
+                       if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+                               /* Conn blocked waiting for ksocknal_recv()
+                                * I change its state (under lock) to signal
+                                * it can be rescheduled */
+                               conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+                       } else if (conn->ksnc_rx_ready) {
+                               /* reschedule for rx */
+                               list_add_tail (&conn->ksnc_rx_list,
+                                                  &sched->kss_rx_conns);
+                       } else {
+                               conn->ksnc_rx_scheduled = 0;
+                               /* drop my ref */
+                               ksocknal_conn_decref(conn);
+                       }
+
+                       did_something = 1;
+               }
+
+               if (!list_empty (&sched->kss_tx_conns)) {
+                       LIST_HEAD    (zlist);
+
+                       if (!list_empty(&sched->kss_zombie_noop_txs)) {
+                               list_add(&zlist,
+                                            &sched->kss_zombie_noop_txs);
+                               list_del_init(&sched->kss_zombie_noop_txs);
+                       }
+
+                       conn = list_entry(sched->kss_tx_conns.next,
+                                             ksock_conn_t, ksnc_tx_list);
+                       list_del (&conn->ksnc_tx_list);
+
+                       LASSERT(conn->ksnc_tx_scheduled);
+                       LASSERT(conn->ksnc_tx_ready);
+                       LASSERT(!list_empty(&conn->ksnc_tx_queue));
+
+                       tx = list_entry(conn->ksnc_tx_queue.next,
+                                           ksock_tx_t, tx_list);
+
+                       if (conn->ksnc_tx_carrier == tx)
+                               ksocknal_next_tx_carrier(conn);
+
+                       /* dequeue now so empty list => more to send */
+                       list_del(&tx->tx_list);
+
+                       /* Clear tx_ready in case send isn't complete.  Do
+                        * it BEFORE we call process_transmit, since
+                        * write_space can set it any time after we release
+                        * kss_lock. */
+                       conn->ksnc_tx_ready = 0;
+                       spin_unlock_bh(&sched->kss_lock);
+
+                       if (!list_empty(&zlist)) {
+                               /* free zombie noop txs; it's fast because
+                                * noop txs are just put back on the freelist */
+                               ksocknal_txlist_done(NULL, &zlist, 0);
+                       }
+
+                       rc = ksocknal_process_transmit(conn, tx);
+
+                       if (rc == -ENOMEM || rc == -EAGAIN) {
+                               /* Incomplete send: replace tx on HEAD of tx_queue */
+                               spin_lock_bh(&sched->kss_lock);
+                               list_add(&tx->tx_list,
+                                            &conn->ksnc_tx_queue);
+                       } else {
+                               /* Complete send; tx -ref */
+                               ksocknal_tx_decref(tx);
+
+                               spin_lock_bh(&sched->kss_lock);
+                               /* assume space for more */
+                               conn->ksnc_tx_ready = 1;
+                       }
+
+                       if (rc == -ENOMEM) {
+                               /* Do nothing; after a short timeout, this
+                                * conn will be reposted on kss_tx_conns. */
+                       } else if (conn->ksnc_tx_ready &&
+                                  !list_empty (&conn->ksnc_tx_queue)) {
+                               /* reschedule for tx */
+                               list_add_tail (&conn->ksnc_tx_list,
+                                                  &sched->kss_tx_conns);
+                       } else {
+                               conn->ksnc_tx_scheduled = 0;
+                               /* drop my ref */
+                               ksocknal_conn_decref(conn);
+                       }
+
+                       did_something = 1;
+               }
+               if (!did_something ||      /* nothing to do */
+                   ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+                       spin_unlock_bh(&sched->kss_lock);
+
+                       nloops = 0;
+
+                       if (!did_something) {   /* wait for something to do */
+                               cfs_wait_event_interruptible_exclusive(
+                                       sched->kss_waitq,
+                                       !ksocknal_sched_cansleep(sched), rc);
+                               LASSERT (rc == 0);
+                       } else {
+                               cond_resched();
+                       }
+
+                       spin_lock_bh(&sched->kss_lock);
+               }
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+       ksocknal_thread_fini();
+       return 0;
+}
+
+/*
+ * Add connection to kss_rx_conns of scheduler
+ * and wake up the scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
+{
+       ksock_sched_t *sched;
+       ENTRY;
+
+       sched = conn->ksnc_scheduler;
+
+       spin_lock_bh(&sched->kss_lock);
+
+       conn->ksnc_rx_ready = 1;
+
+       if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+               list_add_tail(&conn->ksnc_rx_list,
+                                 &sched->kss_rx_conns);
+               conn->ksnc_rx_scheduled = 1;
+               /* extra ref for scheduler */
+               ksocknal_conn_addref(conn);
+
+               wake_up (&sched->kss_waitq);
+       }
+       spin_unlock_bh(&sched->kss_lock);
+
+       EXIT;
+}
+
+/*
+ * Add connection to kss_tx_conns of scheduler
+ * and wake up the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{
+       ksock_sched_t *sched;
+       ENTRY;
+
+       sched = conn->ksnc_scheduler;
+
+       spin_lock_bh(&sched->kss_lock);
+
+       conn->ksnc_tx_ready = 1;
+
+       if (!conn->ksnc_tx_scheduled &&         /* not being progressed */
+           !list_empty(&conn->ksnc_tx_queue)) {        /* packets to send */
+               list_add_tail (&conn->ksnc_tx_list,
+                                  &sched->kss_tx_conns);
+               conn->ksnc_tx_scheduled = 1;
+               /* extra ref for scheduler */
+               ksocknal_conn_addref(conn);
+
+               wake_up (&sched->kss_waitq);
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+
+       EXIT;
+}
+
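+/*
+ * Map the magic/version in a received HELLO to a protocol table:
+ * v2.x/v3.x via LNET_PROTO_MAGIC (possibly byte-swapped), v1.x via the
+ * old LNET_PROTO_TCP_MAGIC header.  Returns NULL if unrecognised.
+ */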
+ksock_proto_t *
+ksocknal_parse_proto_version (ksock_hello_msg_t *hello)
+{
+       __u32   version = 0;
+
+       if (hello->kshm_magic == LNET_PROTO_MAGIC)
+               version = hello->kshm_version;
+       else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
+               version = __swab32(hello->kshm_version);
+
+       if (version != 0) {
+#if SOCKNAL_VERSION_DEBUG
+               if (*ksocknal_tunables.ksnd_protocol == 1)
+                       return NULL;
+
+               if (*ksocknal_tunables.ksnd_protocol == 2 &&
+                   version == KSOCK_PROTO_V3)
+                       return NULL;
+#endif
+               if (version == KSOCK_PROTO_V2)
+                       return &ksocknal_protocol_v2x;
+
+               if (version == KSOCK_PROTO_V3)
+                       return &ksocknal_protocol_v3x;
+
+               return NULL;
+       }
+
+       if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
+               lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello;
+
+               CLASSERT (sizeof (lnet_magicversion_t) ==
+                         offsetof (ksock_hello_msg_t, kshm_src_nid));
+
+               if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) &&
+                   hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR))
+                       return &ksocknal_protocol_v1x;
+       }
+
+       return NULL;
+}
+
+int
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                    lnet_nid_t peer_nid, ksock_hello_msg_t *hello)
+{
+       /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
+       ksock_net_t      *net = (ksock_net_t *)ni->ni_data;
+
+       LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES);
+
+       /* rely on the caller to hold a ref on the socket so it won't disappear */
+       LASSERT (conn->ksnc_proto != NULL);
+
+       hello->kshm_src_nid      = ni->ni_nid;
+       hello->kshm_dst_nid      = peer_nid;
+       hello->kshm_src_pid      = the_lnet.ln_pid;
+
+       hello->kshm_src_incarnation = net->ksnn_incarnation;
+       hello->kshm_ctype          = conn->ksnc_type;
+
+       return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
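+/*
+ * Return the connection type the peer should use for our 'type':
+ * ANY and CONTROL map to themselves, BULK_IN and BULK_OUT swap, and
+ * anything else is SOCKLND_CONN_NONE.
+ */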
+int
+ksocknal_invert_type(int type)
+{
+       switch (type)
+       {
+       case SOCKLND_CONN_ANY:
+       case SOCKLND_CONN_CONTROL:
+               return (type);
+       case SOCKLND_CONN_BULK_IN:
+               return SOCKLND_CONN_BULK_OUT;
+       case SOCKLND_CONN_BULK_OUT:
+               return SOCKLND_CONN_BULK_IN;
+       default:
+               return (SOCKLND_CONN_NONE);
+       }
+}
+
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                    ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+                    __u64 *incarnation)
+{
+       /* Return < 0        fatal error
+        *        0          success
+        *        EALREADY   lost connection race
+        *        EPROTO     protocol version mismatch
+        */
+       socket_t        *sock = conn->ksnc_sock;
+       int               active = (conn->ksnc_proto != NULL);
+       int               timeout;
+       int               proto_match;
+       int               rc;
+       ksock_proto_t       *proto;
+       lnet_process_id_t    recv_id;
+
+       /* socket type set on active connections - not set on passive */
+       LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+       timeout = active ? *ksocknal_tunables.ksnd_timeout :
+                           lnet_acceptor_timeout();
+
+       rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0);
+               return rc;
+       }
+
+       if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+           hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+           hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+               /* Unexpected magic! */
+               CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+                       "%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+                       LNET_PROTO_TCP_MAGIC,
+                       HIPQUAD(conn->ksnc_ipaddr));
+               return -EPROTO;
+       }
+
+       rc = libcfs_sock_read(sock, &hello->kshm_version,
+                             sizeof(hello->kshm_version), timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0);
+               return rc;
+       }
+
+       proto = ksocknal_parse_proto_version(hello);
+       if (proto == NULL) {
+               if (!active) {
+                       /* unknown protocol from peer, tell peer my protocol */
+                       conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+                       if (*ksocknal_tunables.ksnd_protocol == 2)
+                               conn->ksnc_proto = &ksocknal_protocol_v2x;
+                       else if (*ksocknal_tunables.ksnd_protocol == 1)
+                               conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+                       hello->kshm_nips = 0;
+                       ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+               }
+
+               CERROR ("Unknown protocol version (%d.x expected)"
+                       " from %u.%u.%u.%u\n",
+                       conn->ksnc_proto->pro_version,
+                       HIPQUAD(conn->ksnc_ipaddr));
+
+               return -EPROTO;
+       }
+
+       proto_match = (conn->ksnc_proto == proto);
+       conn->ksnc_proto = proto;
+
+       /* receive the rest of hello message anyway */
+       rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+       if (rc != 0) {
+               CERROR("Error %d reading or checking hello from %u.%u.%u.%u\n",
+                      rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0);
+               return rc;
+       }
+
+       *incarnation = hello->kshm_src_incarnation;
+
+       if (hello->kshm_src_nid == LNET_NID_ANY) {
+               CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY "
+                      "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+               return -EPROTO;
+       }
+
+       if (!active &&
+           conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+               /* Userspace NAL assigns peer process ID from socket */
+               recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+               recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+       } else {
+               recv_id.nid = hello->kshm_src_nid;
+               recv_id.pid = hello->kshm_src_pid;
+       }
+
+       if (!active) {
+               *peerid = recv_id;
+
+               /* peer determines type */
+               conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+               if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+                       CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+                               hello->kshm_ctype, libcfs_id2str(*peerid),
+                               HIPQUAD(conn->ksnc_ipaddr));
+                       return -EPROTO;
+               }
+
+               return 0;
+       }
+
+       if (peerid->pid != recv_id.pid ||
+           peerid->nid != recv_id.nid) {
+               LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host"
+                                  " %u.%u.%u.%u, but they claimed they were "
+                                  "%s; please check your Lustre "
+                                  "configuration.\n",
+                                  libcfs_id2str(*peerid),
+                                  HIPQUAD(conn->ksnc_ipaddr),
+                                  libcfs_id2str(recv_id));
+               return -EPROTO;
+       }
+
+       if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+               /* Possible protocol mismatch or I lost the connection race */
+               return proto_match ? EALREADY : EPROTO;
+       }
+
+       if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+               CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n",
+                       conn->ksnc_type, libcfs_id2str(*peerid),
+                       HIPQUAD(conn->ksnc_ipaddr),
+                       hello->kshm_ctype);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+
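+/*
+ * Actively connect 'route': establish each missing connection type in turn
+ * within the configured timeout.  Returns non-zero if the route should be
+ * retried later (e.g. a lost connection race); on failure the retry interval
+ * is doubled and any blocked txs on the peer are completed with an error.
+ */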
+int
+ksocknal_connect (ksock_route_t *route)
+{
+       LIST_HEAD    (zombies);
+       ksock_peer_t     *peer = route->ksnr_peer;
+       int            type;
+       int            wanted;
+       socket_t     *sock;
+       cfs_time_t      deadline;
+       int            retry_later = 0;
+       int            rc = 0;
+
+       deadline = cfs_time_add(cfs_time_current(),
+                               cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       LASSERT (route->ksnr_scheduled);
+       LASSERT (!route->ksnr_connecting);
+
+       route->ksnr_connecting = 1;
+
+       for (;;) {
+               wanted = ksocknal_route_mask() & ~route->ksnr_connected;
+
+               /* stop connecting if peer/route got closed under me, or
+                * route got connected while queued */
+               if (peer->ksnp_closing || route->ksnr_deleted ||
+                   wanted == 0) {
+                       retry_later = 0;
+                       break;
+               }
+
+               /* reschedule if peer is connecting to me */
+               if (peer->ksnp_accepting > 0) {
+                       CDEBUG(D_NET,
+                              "peer %s(%d) already connecting to me, retry later.\n",
+                              libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+                       retry_later = 1;
+               }
+
+               if (retry_later) /* needs reschedule */
+                       break;
+
+               if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) {
+                       type = SOCKLND_CONN_ANY;
+               } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) {
+                       type = SOCKLND_CONN_CONTROL;
+               } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) {
+                       type = SOCKLND_CONN_BULK_IN;
+               } else {
+                       LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0);
+                       type = SOCKLND_CONN_BULK_OUT;
+               }
+
+               write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+               if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+                       rc = -ETIMEDOUT;
+                       lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                                                  route->ksnr_ipaddr,
+                                                  route->ksnr_port);
+                       goto failed;
+               }
+
+               rc = lnet_connect(&sock, peer->ksnp_id.nid,
+                                 route->ksnr_myipaddr,
+                                 route->ksnr_ipaddr, route->ksnr_port);
+               if (rc != 0)
+                       goto failed;
+
+               rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+               if (rc < 0) {
+                       lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                                                  route->ksnr_ipaddr,
+                                                  route->ksnr_port);
+                       goto failed;
+               }
+
+               /* A +ve RC means I have to retry because I lost the connection
+                * race or I have to renegotiate protocol version */
+               retry_later = (rc != 0);
+               if (retry_later)
+                       CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
+                              libcfs_nid2str(peer->ksnp_id.nid));
+
+               write_lock_bh(&ksocknal_data.ksnd_global_lock);
+       }
+
+       route->ksnr_scheduled = 0;
+       route->ksnr_connecting = 0;
+
+       if (retry_later) {
+               /* re-queue for attention; this frees me up to handle
+                * the peer's incoming connection request */
+
+               if (rc == EALREADY ||
+                   (rc == 0 && peer->ksnp_accepting > 0)) {
+                       /* We want to introduce a delay before the next
+                        * connection attempt if we lost the conn race, but
+                        * the race is usually resolved quickly, so
+                        * min_reconnectms should be a good heuristic */
+                       route->ksnr_retry_interval =
+                               cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000;
+                       route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+                                                          route->ksnr_retry_interval);
+               }
+
+               ksocknal_launch_connection_locked(route);
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+       return retry_later;
+
+ failed:
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       route->ksnr_scheduled = 0;
+       route->ksnr_connecting = 0;
+
+       /* This is a retry rather than a new connection */
+       route->ksnr_retry_interval *= 2;
+       route->ksnr_retry_interval =
+               MAX(route->ksnr_retry_interval,
+                   cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+       route->ksnr_retry_interval =
+               MIN(route->ksnr_retry_interval,
+                   cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+
+       LASSERT (route->ksnr_retry_interval != 0);
+       route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+                                          route->ksnr_retry_interval);
+
+       if (!list_empty(&peer->ksnp_tx_queue) &&
+           peer->ksnp_accepting == 0 &&
+           ksocknal_find_connecting_route_locked(peer) == NULL) {
+               ksock_conn_t *conn;
+
+               /* ksnp_tx_queue is queued on a conn on successful
+                * connection for V1.x and V2.x */
+               if (!list_empty (&peer->ksnp_conns)) {
+                       conn = list_entry(peer->ksnp_conns.next,
+                                             ksock_conn_t, ksnc_list);
+                       LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+               }
+
+               /* take all the blocked packets while I've got the lock and
+                * complete below... */
+               list_splice_init(&peer->ksnp_tx_queue, &zombies);
+       }
+
+#if 0     /* irrelevant with only eager routes */
+       if (!route->ksnr_deleted) {
+               /* make this route least-favourite for re-selection */
+               list_del(&route->ksnr_list);
+               list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+       }
+#endif
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       ksocknal_peer_failed(peer);
+       ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+       return 0;
+}
+
+/*
+ * Check whether we need to create more connds.
+ * It will try to create a new thread if necessary; @timeout may be
+ * updated if the create fails, so the caller won't keep retrying
+ * while we are short of resources.
+ */
+static int
+ksocknal_connd_check_start(long sec, long *timeout)
+{
+       char name[16];
+       int rc;
+       int total = ksocknal_data.ksnd_connd_starting +
+                   ksocknal_data.ksnd_connd_running;
+
+       if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+               /* still initializing */
+               return 0;
+       }
+
+       if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
+           total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
+               /* can't create more connd, or still have enough
+                * threads to handle more connecting */
+               return 0;
+       }
+
+       if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
+               /* no pending connecting request */
+               return 0;
+       }
+
+       if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+               /* may have run out of resources, retry later */
+               *timeout = cfs_time_seconds(1);
+               return 0;
+       }
+
+       if (ksocknal_data.ksnd_connd_starting > 0) {
+               /* serialize starting to avoid flood */
+               return 0;
+       }
+
+       ksocknal_data.ksnd_connd_starting_stamp = sec;
+       ksocknal_data.ksnd_connd_starting++;
+       spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+       /* NB: total is the next id */
+       snprintf(name, sizeof(name), "socknal_cd%02d", total);
+       rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
+
+       spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+       if (rc == 0)
+               return 1;
+
+       /* we tried ... */
+       LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+       ksocknal_data.ksnd_connd_starting--;
+       ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec();
+
+       return 1;
+}
+
+/*
+ * Check whether the current thread can exit; it returns 1 if there are
+ * too many threads and none have been created in the past 120 seconds.
+ * This function may also update @timeout to make the caller come back
+ * later to recheck these conditions.
+ */
+static int
+ksocknal_connd_check_stop(long sec, long *timeout)
+{
+       int val;
+
+       if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+               /* still initializing */
+               return 0;
+       }
+
+       if (ksocknal_data.ksnd_connd_starting > 0) {
+               /* in progress of starting new thread */
+               return 0;
+       }
+
+       if (ksocknal_data.ksnd_connd_running <=
+           *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
+               return 0;
+       }
+
+       /* was a thread created in the past 120 seconds? */
+       val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
+                   SOCKNAL_CONND_TIMEOUT - sec);
+
+       *timeout = (val > 0) ? cfs_time_seconds(val) :
+                              cfs_time_seconds(SOCKNAL_CONND_TIMEOUT);
+       if (val > 0)
+               return 0;
+
+       /* no thread created in the past 120 seconds */
+
+       return ksocknal_data.ksnd_connd_running >
+              ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
+}
+
+/* Go through the connd_routes queue looking for a route that we can process
+ * right now; @timeout_p may be updated if we need to come back later */
+static ksock_route_t *
+ksocknal_connd_get_route_locked(signed long *timeout_p)
+{
+       ksock_route_t *route;
+       cfs_time_t     now;
+
+       now = cfs_time_current();
+
+       /* connd_routes can contain both pending and ordinary routes */
+       list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes,
+                                ksnr_connd_list) {
+
+               if (route->ksnr_retry_interval == 0 ||
+                   cfs_time_aftereq(now, route->ksnr_timeout))
+                       return route;
+
+               if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
+                   (int)*timeout_p > (int)(route->ksnr_timeout - now))
+                       *timeout_p = (int)(route->ksnr_timeout - now);
+       }
+
+       return NULL;
+}
+
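+/*
+ * Connection daemon: accepts passive connections queued by the acceptor
+ * (ksnd_connd_connreqs) and performs active connects for queued routes,
+ * dynamically starting extra connd threads or exiting surplus ones.
+ */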
+int
+ksocknal_connd (void *arg)
+{
+       spinlock_t    *connd_lock = &ksocknal_data.ksnd_connd_lock;
+       ksock_connreq_t   *cr;
+       wait_queue_t     wait;
+       int             nloops = 0;
+       int             cons_retry = 0;
+
+       cfs_block_allsigs ();
+
+       init_waitqueue_entry_current (&wait);
+
+       spin_lock_bh(connd_lock);
+
+       LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+       ksocknal_data.ksnd_connd_starting--;
+       ksocknal_data.ksnd_connd_running++;
+
+       while (!ksocknal_data.ksnd_shuttingdown) {
+               ksock_route_t *route = NULL;
+               long sec = cfs_time_current_sec();
+               long timeout = MAX_SCHEDULE_TIMEOUT;
+               int  dropped_lock = 0;
+
+               if (ksocknal_connd_check_stop(sec, &timeout)) {
+                       /* wakeup another one to check stop */
+                       wake_up(&ksocknal_data.ksnd_connd_waitq);
+                       break;
+               }
+
+               if (ksocknal_connd_check_start(sec, &timeout)) {
+                       /* created new thread */
+                       dropped_lock = 1;
+               }
+
+               if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+                       /* Connection accepted by the listener */
+                       cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next,
+                                       ksock_connreq_t, ksncr_list);
+
+                       list_del(&cr->ksncr_list);
+                       spin_unlock_bh(connd_lock);
+                       dropped_lock = 1;
+
+                       ksocknal_create_conn(cr->ksncr_ni, NULL,
+                                            cr->ksncr_sock, SOCKLND_CONN_NONE);
+                       lnet_ni_decref(cr->ksncr_ni);
+                       LIBCFS_FREE(cr, sizeof(*cr));
+
+                       spin_lock_bh(connd_lock);
+               }
+
+               /* Only handle an outgoing connection request if there
+                * is a thread left to handle incoming connections and
+                * to create a new connd */
+               if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+                   ksocknal_data.ksnd_connd_running) {
+                       route = ksocknal_connd_get_route_locked(&timeout);
+               }
+               if (route != NULL) {
+                       list_del (&route->ksnr_connd_list);
+                       ksocknal_data.ksnd_connd_connecting++;
+                       spin_unlock_bh(connd_lock);
+                       dropped_lock = 1;
+
+                       if (ksocknal_connect(route)) {
+                               /* consecutive retry */
+                               if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+                                       CWARN("massive consecutive "
+                                             "re-connecting to %u.%u.%u.%u\n",
+                                             HIPQUAD(route->ksnr_ipaddr));
+                                       cons_retry = 0;
+                               }
+                       } else {
+                               cons_retry = 0;
+                       }
+
+                       ksocknal_route_decref(route);
+
+                       spin_lock_bh(connd_lock);
+                       ksocknal_data.ksnd_connd_connecting--;
+               }
+
+               if (dropped_lock) {
+                       if (++nloops < SOCKNAL_RESCHED)
+                               continue;
+                       spin_unlock_bh(connd_lock);
+                       nloops = 0;
+                       cond_resched();
+                       spin_lock_bh(connd_lock);
+                       continue;
+               }
+
+               /* Nothing to do; wait for up to 'timeout' */
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait);
+               spin_unlock_bh(connd_lock);
+
+               nloops = 0;
+               waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+               spin_lock_bh(connd_lock);
+       }
+       ksocknal_data.ksnd_connd_running--;
+       spin_unlock_bh(connd_lock);
+
+       ksocknal_thread_fini();
+       return 0;
+}
+
+ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+{
+       /* We're called with a shared lock on ksnd_global_lock */
+       ksock_conn_t      *conn;
+       struct list_head        *ctmp;
+
+       list_for_each (ctmp, &peer->ksnp_conns) {
+               int     error;
+               conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+               /* Don't need the {get,put}connsock dance to deref ksnc_sock */
+               LASSERT (!conn->ksnc_closing);
+
+               /* SOCK_ERROR will reset error code of socket in
+                * some platform (like Darwin8.x) */
+               error = cfs_sock_error(conn->ksnc_sock);
+               if (error != 0) {
+                       ksocknal_conn_addref(conn);
+
+                       switch (error) {
+                       case ECONNRESET:
+                               CNETERR("A connection with %s "
+                                       "(%u.%u.%u.%u:%d) was reset; "
+                                       "it may have rebooted.\n",
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+                               break;
+                       case ETIMEDOUT:
+                               CNETERR("A connection with %s "
+                                       "(%u.%u.%u.%u:%d) timed out; the "
+                                       "network or node may be down.\n",
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+                               break;
+                       default:
+                               CNETERR("An unexpected network error %d "
+                                       "occurred with %s "
+                                       "(%u.%u.%u.%u:%d)\n", error,
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+                               break;
+                       }
+
+                       return (conn);
+               }
+
+               if (conn->ksnc_rx_started &&
+                   cfs_time_aftereq(cfs_time_current(),
+                                    conn->ksnc_rx_deadline)) {
+                       /* Timed out incomplete incoming message */
+                       ksocknal_conn_addref(conn);
+                       CNETERR("Timeout receiving from %s (%u.%u.%u.%u:%d), "
+                               "state %d wanted %d left %d\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               conn->ksnc_port,
+                               conn->ksnc_rx_state,
+                               conn->ksnc_rx_nob_wanted,
+                               conn->ksnc_rx_nob_left);
+                       return (conn);
+               }
+
+               if ((!list_empty(&conn->ksnc_tx_queue) ||
+                    cfs_sock_wmem_queued(conn->ksnc_sock) != 0) &&
+                   cfs_time_aftereq(cfs_time_current(),
+                                    conn->ksnc_tx_deadline)) {
+                       /* Timed out messages queued for sending or
+                        * buffered in the socket's send buffer */
+                       ksocknal_conn_addref(conn);
+                       CNETERR("Timeout sending data to %s (%u.%u.%u.%u:%d); "
+                               "the network or that node may be down.\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               conn->ksnc_port);
+                       return (conn);
+               }
+       }
+
+       return (NULL);
+}
+
+static inline void
+ksocknal_flush_stale_txs(ksock_peer_t *peer)
+{
+       ksock_tx_t      *tx;
+       LIST_HEAD      (stale_txs);
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       while (!list_empty (&peer->ksnp_tx_queue)) {
+               tx = list_entry (peer->ksnp_tx_queue.next,
+                                    ksock_tx_t, tx_list);
+
+               if (!cfs_time_aftereq(cfs_time_current(),
+                                     tx->tx_deadline))
+                       break;
+
+               list_del (&tx->tx_list);
+               list_add_tail (&tx->tx_list, &stale_txs);
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
+}
+
+int
+ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+{
+       ksock_sched_t  *sched;
+       ksock_conn_t   *conn;
+       ksock_tx_t     *tx;
+
+       if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */
+               return 0;
+
+       if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+               return 0;
+
+       if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+           cfs_time_before(cfs_time_current(),
+                           cfs_time_add(peer->ksnp_last_alive,
+                                        cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
+               return 0;
+
+       if (cfs_time_before(cfs_time_current(),
+                           peer->ksnp_send_keepalive))
+               return 0;
+
+       /* retry 10 secs later, so we won't put pressure
+        * on this peer if we fail to send a keepalive this time */
+       peer->ksnp_send_keepalive = cfs_time_shift(10);
+
+       conn = ksocknal_find_conn_locked(peer, NULL, 1);
+       if (conn != NULL) {
+               sched = conn->ksnc_scheduler;
+
+               spin_lock_bh(&sched->kss_lock);
+               if (!list_empty(&conn->ksnc_tx_queue)) {
+                       spin_unlock_bh(&sched->kss_lock);
+                       /* there is a queued ACK, no need for a keepalive */
+                       return 0;
+               }
+
+               spin_unlock_bh(&sched->kss_lock);
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       /* cookie = 1 is reserved for keepalive PING */
+       tx = ksocknal_alloc_tx_noop(1, 1);
+       if (tx == NULL) {
+               read_lock(&ksocknal_data.ksnd_global_lock);
+               return -ENOMEM;
+       }
+
+       if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) {
+               read_lock(&ksocknal_data.ksnd_global_lock);
+               return 1;
+       }
+
+       ksocknal_free_tx(tx);
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       return -EIO;
+}
+
+
+void
+ksocknal_check_peer_timeouts (int idx)
+{
+       struct list_head       *peers = &ksocknal_data.ksnd_peers[idx];
+       ksock_peer_t     *peer;
+       ksock_conn_t     *conn;
+       ksock_tx_t       *tx;
+
+ again:
+       /* NB. We expect to have a look at all the peers and not find any
+        * connections to time out, so we just use a shared lock while we
+        * take a look... */
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       list_for_each_entry(peer, peers, ksnp_list) {
+               cfs_time_t  deadline = 0;
+               int      resid = 0;
+               int      n     = 0;
+
+               if (ksocknal_send_keepalive_locked(peer) != 0) {
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+                       goto again;
+               }
+
+               conn = ksocknal_find_timed_out_conn (peer);
+
+               if (conn != NULL) {
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+                       ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+
+                       /* NB we won't find this one again, but we can't
+                        * just proceed with the next peer, since we dropped
+                        * ksnd_global_lock and it might be dead already! */
+                       ksocknal_conn_decref(conn);
+                       goto again;
+               }
+
+               /* we can't process stale txs right here because we're
+                * holding only a shared lock */
+               if (!list_empty (&peer->ksnp_tx_queue)) {
+                       ksock_tx_t *tx =
+                               list_entry (peer->ksnp_tx_queue.next,
+                                               ksock_tx_t, tx_list);
+
+                       if (cfs_time_aftereq(cfs_time_current(),
+                                            tx->tx_deadline)) {
+
+                               ksocknal_peer_addref(peer);
+                               read_unlock(&ksocknal_data.ksnd_global_lock);
+
+                               ksocknal_flush_stale_txs(peer);
+
+                               ksocknal_peer_decref(peer);
+                               goto again;
+                       }
+               }
+
+               if (list_empty(&peer->ksnp_zc_req_list))
+                       continue;
+
+               spin_lock(&peer->ksnp_lock);
+               list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
+                       if (!cfs_time_aftereq(cfs_time_current(),
+                                             tx->tx_deadline))
+                               break;
+                       /* ignore the TX if connection is being closed */
+                       if (tx->tx_conn->ksnc_closing)
+                               continue;
+                       n++;
+               }
+
+               if (n == 0) {
+                       spin_unlock(&peer->ksnp_lock);
+                       continue;
+               }
+
+               tx = list_entry(peer->ksnp_zc_req_list.next,
+                                   ksock_tx_t, tx_zc_list);
+               deadline = tx->tx_deadline;
+               resid    = tx->tx_resid;
+               conn     = tx->tx_conn;
+               ksocknal_conn_addref(conn);
+
+               spin_unlock(&peer->ksnp_lock);
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+
+               CERROR("Total %d stale ZC_REQs for peer %s detected; the "
+                      "oldest(%p) timed out %ld secs ago, "
+                      "resid: %d, wmem: %d\n",
+                      n, libcfs_nid2str(peer->ksnp_id.nid), tx,
+                      cfs_duration_sec(cfs_time_current() - deadline),
+                      resid, cfs_sock_wmem_queued(conn->ksnc_sock));
+
+               ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+               ksocknal_conn_decref(conn);
+               goto again;
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+int
+ksocknal_reaper (void *arg)
+{
+       wait_queue_t     wait;
+       ksock_conn_t      *conn;
+       ksock_sched_t     *sched;
+       struct list_head         enomem_conns;
+       int             nenomem_conns;
+       cfs_duration_t     timeout;
+       int             i;
+       int             peer_index = 0;
+       cfs_time_t       deadline = cfs_time_current();
+
+       cfs_block_allsigs ();
+
+       INIT_LIST_HEAD(&enomem_conns);
+       init_waitqueue_entry_current (&wait);
+
+       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+       while (!ksocknal_data.ksnd_shuttingdown) {
+
+               if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) {
+                       conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next,
+                                         ksock_conn_t, ksnc_list);
+                       list_del (&conn->ksnc_list);
+
+                       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+                       ksocknal_terminate_conn(conn);
+                       ksocknal_conn_decref(conn);
+
+                       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+                       continue;
+               }
+
+               if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) {
+                       conn = list_entry(ksocknal_data.ksnd_zombie_conns.next,
+                                         ksock_conn_t, ksnc_list);
+                       list_del (&conn->ksnc_list);
+
+                       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+                       ksocknal_destroy_conn(conn);
+
+                       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+                       continue;
+               }
+
+               if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) {
+                       list_add(&enomem_conns,
+                                    &ksocknal_data.ksnd_enomem_conns);
+                       list_del_init(&ksocknal_data.ksnd_enomem_conns);
+               }
+
+               spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+               /* reschedule all the connections that stalled with ENOMEM... */
+               nenomem_conns = 0;
+               while (!list_empty (&enomem_conns)) {
+                       conn = list_entry (enomem_conns.next,
+                                              ksock_conn_t, ksnc_tx_list);
+                       list_del (&conn->ksnc_tx_list);
+
+                       sched = conn->ksnc_scheduler;
+
+                       spin_lock_bh(&sched->kss_lock);
+
+                       LASSERT(conn->ksnc_tx_scheduled);
+                       conn->ksnc_tx_ready = 1;
+                       list_add_tail(&conn->ksnc_tx_list,
+                                         &sched->kss_tx_conns);
+                       wake_up(&sched->kss_waitq);
+
+                       spin_unlock_bh(&sched->kss_lock);
+                       nenomem_conns++;
+               }
+
+               /* careful with the jiffy wrap... */
+               while ((timeout = cfs_time_sub(deadline,
+                                              cfs_time_current())) <= 0) {
+                       const int n = 4;
+                       const int p = 1;
+                       int       chunk = ksocknal_data.ksnd_peer_hash_size;
+
+                       /* Time to check for timeouts on a few more peers: I do
+                        * checks every 'p' seconds on a proportion of the peer
+                        * table and I need to check every connection 'n' times
+                        * within a timeout interval, to ensure I detect a
+                        * timeout on any connection within (n+1)/n times the
+                        * timeout interval. */
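+                       /* For illustration (not in the original patch): with
+                        * the default sock_timeout of 50 seconds, n = 4 and
+                        * p = 1, a peer hash of, say, 256 lists gives
+                        * chunk = 256 * 4 * 1 / 50 = 20, i.e. about 8% of the
+                        * table is scanned each second and every list gets
+                        * visited roughly 4 times per 50-second interval. */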
+
+                       if (*ksocknal_tunables.ksnd_timeout > n * p)
+                               chunk = (chunk * n * p) /
+                                       *ksocknal_tunables.ksnd_timeout;
+                       if (chunk == 0)
+                               chunk = 1;
+
+                       for (i = 0; i < chunk; i++) {
+                               ksocknal_check_peer_timeouts (peer_index);
+                               peer_index = (peer_index + 1) %
+                                            ksocknal_data.ksnd_peer_hash_size;
+                       }
+
+                       deadline = cfs_time_add(deadline, cfs_time_seconds(p));
+               }
+
+               if (nenomem_conns != 0) {
+                       /* Reduce my timeout if I rescheduled ENOMEM conns.
+                        * This also prevents me getting woken immediately
+                        * if any go back on my enomem list. */
+                       timeout = SOCKNAL_ENOMEM_RETRY;
+               }
+               ksocknal_data.ksnd_reaper_waketime =
+                       cfs_time_add(cfs_time_current(), timeout);
+
+               set_current_state (TASK_INTERRUPTIBLE);
+               add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+               if (!ksocknal_data.ksnd_shuttingdown &&
+                   list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
+                   list_empty (&ksocknal_data.ksnd_zombie_conns))
+                       waitq_timedwait (&wait, TASK_INTERRUPTIBLE,
+                                            timeout);
+
+               set_current_state (TASK_RUNNING);
+               remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+               spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+       }
+
+       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+       ksocknal_thread_fini();
+       return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
new file mode 100644 (file)
index 0000000..3e08fe2
--- /dev/null
@@ -0,0 +1,1088 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include "socklnd.h"
+
+# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+
+enum {
+       SOCKLND_TIMEOUT = 1,
+       SOCKLND_CREDITS,
+       SOCKLND_PEER_TXCREDITS,
+       SOCKLND_PEER_RTRCREDITS,
+       SOCKLND_PEER_TIMEOUT,
+       SOCKLND_NCONNDS,
+       SOCKLND_RECONNECTS_MIN,
+       SOCKLND_RECONNECTS_MAX,
+       SOCKLND_EAGER_ACK,
+       SOCKLND_ZERO_COPY,
+       SOCKLND_TYPED,
+       SOCKLND_BULK_MIN,
+       SOCKLND_RX_BUFFER_SIZE,
+       SOCKLND_TX_BUFFER_SIZE,
+       SOCKLND_NAGLE,
+       SOCKLND_IRQ_AFFINITY,
+       SOCKLND_ROUND_ROBIN,
+       SOCKLND_KEEPALIVE,
+       SOCKLND_KEEPALIVE_IDLE,
+       SOCKLND_KEEPALIVE_COUNT,
+       SOCKLND_KEEPALIVE_INTVL,
+       SOCKLND_BACKOFF_INIT,
+       SOCKLND_BACKOFF_MAX,
+       SOCKLND_PROTOCOL,
+       SOCKLND_ZERO_COPY_RECV,
+       SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS
+};
+
+static ctl_table_t ksocknal_ctl_table[] = {
+       {
+               .ctl_name = SOCKLND_TIMEOUT,
+               .procname = "timeout",
+               .data     = &ksocknal_tunables.ksnd_timeout,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_CREDITS,
+               .procname = "credits",
+               .data     = &ksocknal_tunables.ksnd_credits,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+        {
+               .ctl_name = SOCKLND_PEER_TXCREDITS,
+               .procname = "peer_credits",
+               .data     = &ksocknal_tunables.ksnd_peertxcredits,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+        {
+               .ctl_name = SOCKLND_PEER_RTRCREDITS,
+               .procname = "peer_buffer_credits",
+               .data     = &ksocknal_tunables.ksnd_peerrtrcredits,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_PEER_TIMEOUT,
+               .procname = "peer_timeout",
+               .data     = &ksocknal_tunables.ksnd_peertimeout,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_NCONNDS,
+               .procname = "nconnds",
+               .data     = &ksocknal_tunables.ksnd_nconnds,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_RECONNECTS_MIN,
+               .procname = "min_reconnectms",
+               .data     = &ksocknal_tunables.ksnd_min_reconnectms,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_RECONNECTS_MAX,
+               .procname = "max_reconnectms",
+               .data     = &ksocknal_tunables.ksnd_max_reconnectms,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_EAGER_ACK,
+               .procname = "eager_ack",
+               .data     = &ksocknal_tunables.ksnd_eager_ack,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_ZERO_COPY,
+               .procname = "zero_copy",
+               .data     = &ksocknal_tunables.ksnd_zc_min_payload,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_ZERO_COPY_RECV,
+               .procname = "zero_copy_recv",
+               .data     = &ksocknal_tunables.ksnd_zc_recv,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS,
+               .procname = "zero_copy_recv_min_nfrags",
+               .data     = &ksocknal_tunables.ksnd_zc_recv_min_nfrags,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_TYPED,
+               .procname = "typed",
+               .data     = &ksocknal_tunables.ksnd_typed_conns,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_BULK_MIN,
+               .procname = "min_bulk",
+               .data     = &ksocknal_tunables.ksnd_min_bulk,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_RX_BUFFER_SIZE,
+               .procname = "rx_buffer_size",
+               .data     = &ksocknal_tunables.ksnd_rx_buffer_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_TX_BUFFER_SIZE,
+               .procname = "tx_buffer_size",
+               .data     = &ksocknal_tunables.ksnd_tx_buffer_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_NAGLE,
+               .procname = "nagle",
+               .data     = &ksocknal_tunables.ksnd_nagle,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_ROUND_ROBIN,
+               .procname = "round_robin",
+               .data     = &ksocknal_tunables.ksnd_round_robin,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_KEEPALIVE,
+               .procname = "keepalive",
+               .data     = &ksocknal_tunables.ksnd_keepalive,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_KEEPALIVE_IDLE,
+               .procname = "keepalive_idle",
+               .data     = &ksocknal_tunables.ksnd_keepalive_idle,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_KEEPALIVE_COUNT,
+               .procname = "keepalive_count",
+               .data     = &ksocknal_tunables.ksnd_keepalive_count,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_KEEPALIVE_INTVL,
+               .procname = "keepalive_intvl",
+               .data     = &ksocknal_tunables.ksnd_keepalive_intvl,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+#if SOCKNAL_VERSION_DEBUG
+       {
+               .ctl_name = SOCKLND_PROTOCOL,
+               .procname = "protocol",
+               .data     = &ksocknal_tunables.ksnd_protocol,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+#endif
+       {0}
+};
+
+
+ctl_table_t ksocknal_top_ctl_table[] = {
+       {
+               .ctl_name = CTL_SOCKLND,
+               .procname = "socknal",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0555,
+               .child    = ksocknal_ctl_table
+       },
+       { 0 }
+};
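+
+/* For illustration only (not part of the original patch): assuming
+ * cfs_register_sysctl_table() below attaches ksocknal_top_ctl_table at the
+ * sysctl root, the entries above would show up as files like
+ *
+ *     /proc/sys/socknal/timeout
+ *     /proc/sys/socknal/credits
+ *     /proc/sys/socknal/keepalive_idle
+ *
+ * with the 0644 entries writable at runtime and the 0444 ones read-only. */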
+
+int
+ksocknal_lib_tunables_init ()
+{
+       if (!*ksocknal_tunables.ksnd_typed_conns) {
+               int rc = -EINVAL;
+#if SOCKNAL_VERSION_DEBUG
+               if (*ksocknal_tunables.ksnd_protocol < 3)
+                       rc = 0;
+#endif
+               if (rc != 0) {
+                       CERROR("Protocol V3.x MUST have typed connections\n");
+                       return rc;
+               }
+       }
+
+       if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
+               *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
+       if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
+               *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV;
+
+       ksocknal_tunables.ksnd_sysctl =
+               cfs_register_sysctl_table(ksocknal_top_ctl_table, 0);
+
+       if (ksocknal_tunables.ksnd_sysctl == NULL)
+               CWARN("Can't set up /proc tunables\n");
+
+       return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+       if (ksocknal_tunables.ksnd_sysctl != NULL)
+               unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()
+{
+       return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+}
+#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
+
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+       int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+                                    &conn->ksnc_ipaddr,
+                                    &conn->ksnc_port);
+
+       /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+       LASSERT (!conn->ksnc_closing);
+
+       if (rc != 0) {
+               CERROR ("Error %d getting sock peer IP\n", rc);
+               return rc;
+       }
+
+       rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+                                &conn->ksnc_myipaddr, NULL);
+       if (rc != 0) {
+               CERROR ("Error %d getting sock local IP\n", rc);
+               return rc;
+       }
+
+       return 0;
+}
+
+int
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
+{
+       int  caps = conn->ksnc_sock->sk->sk_route_caps;
+
+       if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+               return 0;
+
+       /* ZC if the socket supports scatter/gather and doesn't need software
+        * checksums */
+       return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0);
+}
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       struct socket *sock = conn->ksnc_sock;
+       int         nob;
+       int         rc;
+
+       if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
+           conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
+           tx->tx_nob == tx->tx_resid           && /* first sending    */
+           tx->tx_msg.ksm_csum == 0)                /* not checksummed  */
+               ksocknal_lib_csum_tx(tx);
+
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone. */
+
+       {
+#if SOCKNAL_SINGLE_FRAG_TX
+               struct iovec    scratch;
+               struct iovec   *scratchiov = &scratch;
+               unsigned int    niov = 1;
+#else
+               struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+               unsigned int    niov = tx->tx_niov;
+#endif
+               struct msghdr msg = {
+                       .msg_name       = NULL,
+                       .msg_namelen    = 0,
+                       .msg_iov        = scratchiov,
+                       .msg_iovlen     = niov,
+                       .msg_control    = NULL,
+                       .msg_controllen = 0,
+                       .msg_flags      = MSG_DONTWAIT
+               };
+               mm_segment_t oldmm = get_fs();
+               int  i;
+
+               for (nob = i = 0; i < niov; i++) {
+                       scratchiov[i] = tx->tx_iov[i];
+                       nob += scratchiov[i].iov_len;
+               }
+
+               if (!list_empty(&conn->ksnc_tx_queue) ||
+                   nob < tx->tx_resid)
+                       msg.msg_flags |= MSG_MORE;
+
+               set_fs (KERNEL_DS);
+               rc = sock_sendmsg(sock, &msg, nob);
+               set_fs (oldmm);
+       }
+       return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       struct socket *sock = conn->ksnc_sock;
+       lnet_kiov_t   *kiov = tx->tx_kiov;
+       int         rc;
+       int         nob;
+
+       /* Not NOOP message */
+       LASSERT (tx->tx_lnetmsg != NULL);
+
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone. */
+       if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+               /* Zero copy is enabled */
+               struct sock   *sk = sock->sk;
+               struct page   *page = kiov->kiov_page;
+               int         offset = kiov->kiov_offset;
+               int         fragsize = kiov->kiov_len;
+               int         msgflg = MSG_DONTWAIT;
+
+               CDEBUG(D_NET, "page %p + offset %x for %d\n",
+                              page, offset, kiov->kiov_len);
+
+               if (!list_empty(&conn->ksnc_tx_queue) ||
+                   fragsize < tx->tx_resid)
+                       msgflg |= MSG_MORE;
+
+               if (sk->sk_prot->sendpage != NULL) {
+                       rc = sk->sk_prot->sendpage(sk, page,
+                                                  offset, fragsize, msgflg);
+               } else {
+                       rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
+                                             msgflg);
+               }
+       } else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+               struct iovec  scratch;
+               struct iovec *scratchiov = &scratch;
+               unsigned int  niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+               struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+               unsigned int  niov = tx->tx_nkiov;
+#endif
+               struct msghdr msg = {
+                       .msg_name       = NULL,
+                       .msg_namelen    = 0,
+                       .msg_iov        = scratchiov,
+                       .msg_iovlen     = niov,
+                       .msg_control    = NULL,
+                       .msg_controllen = 0,
+                       .msg_flags      = MSG_DONTWAIT
+               };
+               mm_segment_t  oldmm = get_fs();
+               int        i;
+
+               for (nob = i = 0; i < niov; i++) {
+                       scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+                                                kiov[i].kiov_offset;
+                       nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+               }
+
+               if (!list_empty(&conn->ksnc_tx_queue) ||
+                   nob < tx->tx_resid)
+                       msg.msg_flags |= MSG_MORE;
+
+               set_fs (KERNEL_DS);
+               rc = sock_sendmsg(sock, &msg, nob);
+               set_fs (oldmm);
+
+               for (i = 0; i < niov; i++)
+                       kunmap(kiov[i].kiov_page);
+       }
+       return rc;
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+       int         opt = 1;
+       mm_segment_t   oldmm = get_fs();
+       struct socket *sock = conn->ksnc_sock;
+
+       /* Remind the socket to ACK eagerly.  If I don't, the socket might
+        * think I'm about to send something it could piggy-back the ACK
+        * on, introducing delay in completing zero-copy sends in my
+        * peer. */
+
+       set_fs(KERNEL_DS);
+       sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
+                              (char *)&opt, sizeof (opt));
+       set_fs(oldmm);
+}
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+       struct iovec  scratch;
+       struct iovec *scratchiov = &scratch;
+       unsigned int  niov = 1;
+#else
+       struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+       unsigned int  niov = conn->ksnc_rx_niov;
+#endif
+       struct iovec *iov = conn->ksnc_rx_iov;
+       struct msghdr msg = {
+               .msg_name       = NULL,
+               .msg_namelen    = 0,
+               .msg_iov        = scratchiov,
+               .msg_iovlen     = niov,
+               .msg_control    = NULL,
+               .msg_controllen = 0,
+               .msg_flags      = 0
+       };
+       mm_segment_t oldmm = get_fs();
+       int       nob;
+       int       i;
+       int       rc;
+       int       fragnob;
+       int       sum;
+       __u32   saved_csum;
+
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone. */
+       LASSERT (niov > 0);
+
+       for (nob = i = 0; i < niov; i++) {
+               scratchiov[i] = iov[i];
+               nob += scratchiov[i].iov_len;
+       }
+       LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+       set_fs (KERNEL_DS);
+       rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+       /* NB this is just a boolean..........................^ */
+       set_fs (oldmm);
+
+       saved_csum = 0;
+       if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+               saved_csum = conn->ksnc_msg.ksm_csum;
+               conn->ksnc_msg.ksm_csum = 0;
+       }
+
+       if (saved_csum != 0) {
+               /* accumulate checksum */
+               for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+                       LASSERT (i < niov);
+
+                       fragnob = iov[i].iov_len;
+                       if (fragnob > sum)
+                               fragnob = sum;
+
+                       conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+                                                          iov[i].iov_base, fragnob);
+               }
+               conn->ksnc_msg.ksm_csum = saved_csum;
+       }
+
+       return rc;
+}
+
+static void
+ksocknal_lib_kiov_vunmap(void *addr)
+{
+       if (addr == NULL)
+               return;
+
+       vunmap(addr);
+}
+
+static void *
+ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
+                      struct iovec *iov, struct page **pages)
+{
+       void         *addr;
+       int            nob;
+       int            i;
+
+       if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
+               return NULL;
+
+       LASSERT (niov <= LNET_MAX_IOV);
+
+       if (niov < 2 ||
+           niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
+               return NULL;
+
+       for (nob = i = 0; i < niov; i++) {
+               if ((kiov[i].kiov_offset != 0 && i > 0) ||
+                   (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
+                       return NULL;
+
+               pages[i] = kiov[i].kiov_page;
+               nob += kiov[i].kiov_len;
+       }
+
+       addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
+       if (addr == NULL)
+               return NULL;
+
+       iov->iov_base = addr + kiov[0].kiov_offset;
+       iov->iov_len = nob;
+
+       return addr;
+}
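+
+/* For illustration (not in the original patch): vmap() is only attempted
+ * when the fragments describe one virtually contiguous byte range and
+ * niov is at least zc_recv_min_nfrags, e.g. (with that tunable <= 3):
+ *
+ *     kiov[0]: offset 512, len PAGE_CACHE_SIZE - 512  (ends on a page boundary)
+ *     kiov[1]: offset 0,   len PAGE_CACHE_SIZE        (a whole page)
+ *     kiov[2]: offset 0,   len 1024                   (last fragment, any length)
+ *
+ * Anything else returns NULL and ksocknal_lib_recv_kiov() falls back to
+ * the per-page kmap() path. */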
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+       struct iovec   scratch;
+       struct iovec  *scratchiov = &scratch;
+       struct page  **pages      = NULL;
+       unsigned int   niov       = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+       struct iovec  *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+       struct page  **pages      = conn->ksnc_scheduler->kss_rx_scratch_pgs;
+       unsigned int   niov       = conn->ksnc_rx_nkiov;
+#endif
+       lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+       struct msghdr msg = {
+               .msg_name       = NULL,
+               .msg_namelen    = 0,
+               .msg_iov        = scratchiov,
+               .msg_control    = NULL,
+               .msg_controllen = 0,
+               .msg_flags      = 0
+       };
+       mm_segment_t oldmm = get_fs();
+       int       nob;
+       int       i;
+       int       rc;
+       void    *base;
+       void    *addr;
+       int       sum;
+       int       fragnob;
+
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone. */
+       if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
+               nob = scratchiov[0].iov_len;
+               msg.msg_iovlen = 1;
+
+       } else {
+               for (nob = i = 0; i < niov; i++) {
+                       nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+                       scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+                                                kiov[i].kiov_offset;
+               }
+               msg.msg_iovlen = niov;
+       }
+
+       LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+       set_fs (KERNEL_DS);
+       rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+       /* NB this is just a boolean.......................^ */
+       set_fs (oldmm);
+
+       if (conn->ksnc_msg.ksm_csum != 0) {
+               for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+                       LASSERT (i < niov);
+
+                       /* Dang! have to kmap again because I have nowhere to stash the
+                        * mapped address.  But by doing it while the page is still
+                        * mapped, the kernel just bumps the map count and returns me
+                        * the address it stashed. */
+                       base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+                       fragnob = kiov[i].kiov_len;
+                       if (fragnob > sum)
+                               fragnob = sum;
+
+                       conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+                                                          base, fragnob);
+
+                       kunmap(kiov[i].kiov_page);
+               }
+       }
+
+       if (addr != NULL) {
+               ksocknal_lib_kiov_vunmap(addr);
+       } else {
+               for (i = 0; i < niov; i++)
+                       kunmap(kiov[i].kiov_page);
+       }
+
+       return (rc);
+}
+
+void
+ksocknal_lib_csum_tx(ksock_tx_t *tx)
+{
+       int       i;
+       __u32   csum;
+       void    *base;
+
+       LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
+       LASSERT(tx->tx_conn != NULL);
+       LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
+
+       tx->tx_msg.ksm_csum = 0;
+
+       csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
+                            tx->tx_iov[0].iov_len);
+
+       if (tx->tx_kiov != NULL) {
+               for (i = 0; i < tx->tx_nkiov; i++) {
+                       base = kmap(tx->tx_kiov[i].kiov_page) +
+                              tx->tx_kiov[i].kiov_offset;
+
+                       csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
+
+                       kunmap(tx->tx_kiov[i].kiov_page);
+               }
+       } else {
+               for (i = 1; i < tx->tx_niov; i++)
+                       csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
+                                            tx->tx_iov[i].iov_len);
+       }
+
+       if (*ksocknal_tunables.ksnd_inject_csum_error) {
+               csum++;
+               *ksocknal_tunables.ksnd_inject_csum_error = 0;
+       }
+
+       tx->tx_msg.ksm_csum = csum;
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+       mm_segment_t   oldmm = get_fs ();
+       struct socket *sock = conn->ksnc_sock;
+       int         len;
+       int         rc;
+
+       rc = ksocknal_connsock_addref(conn);
+       if (rc != 0) {
+               LASSERT (conn->ksnc_closing);
+               *txmem = *rxmem = *nagle = 0;
+               return (-ESHUTDOWN);
+       }
+
+       rc = libcfs_sock_getbuf(sock, txmem, rxmem);
+       if (rc == 0) {
+               len = sizeof(*nagle);
+               set_fs(KERNEL_DS);
+               rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
+                                          (char *)nagle, &len);
+               set_fs(oldmm);
+       }
+
+       ksocknal_connsock_decref(conn);
+
+       if (rc == 0)
+               *nagle = !*nagle;
+       else
+               *txmem = *rxmem = *nagle = 0;
+
+       return (rc);
+}
+
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+       mm_segment_t    oldmm = get_fs ();
+       int          rc;
+       int          option;
+       int          keep_idle;
+       int          keep_intvl;
+       int          keep_count;
+       int          do_keepalive;
+       struct linger   linger;
+
+       sock->sk->sk_allocation = GFP_NOFS;
+
+       /* Ensure this socket aborts active sends immediately when we close
+        * it. */
+
+       linger.l_onoff = 0;
+       linger.l_linger = 0;
+
+       set_fs (KERNEL_DS);
+       rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
+                             (char *)&linger, sizeof (linger));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set SO_LINGER: %d\n", rc);
+               return (rc);
+       }
+
+       option = -1;
+       set_fs (KERNEL_DS);
+       rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
+                                   (char *)&option, sizeof (option));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set TCP_LINGER2: %d\n", rc);
+               return (rc);
+       }
+
+       if (!*ksocknal_tunables.ksnd_nagle) {
+               option = 1;
+
+               set_fs (KERNEL_DS);
+               rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
+                                           (char *)&option, sizeof (option));
+               set_fs (oldmm);
+               if (rc != 0) {
+                       CERROR ("Can't disable nagle: %d\n", rc);
+                       return (rc);
+               }
+       }
+
+       rc = libcfs_sock_setbuf(sock,
+                               *ksocknal_tunables.ksnd_tx_buffer_size,
+                               *ksocknal_tunables.ksnd_rx_buffer_size);
+       if (rc != 0) {
+               CERROR ("Can't set socket tx buffer %d, rx buffer %d: %d\n",
+                       *ksocknal_tunables.ksnd_tx_buffer_size,
+                       *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+               return (rc);
+       }
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+
+       /* snapshot tunables */
+       keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
+       keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+       keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+       do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+       option = (do_keepalive ? 1 : 0);
+       set_fs (KERNEL_DS);
+       rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
+                             (char *)&option, sizeof (option));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+               return (rc);
+       }
+
+       if (!do_keepalive)
+               return (0);
+
+       set_fs (KERNEL_DS);
+       rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+                                   (char *)&keep_idle, sizeof (keep_idle));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+               return (rc);
+       }
+
+       set_fs (KERNEL_DS);
+       rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+                                   (char *)&keep_intvl, sizeof (keep_intvl));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+               return (rc);
+       }
+
+       set_fs (KERNEL_DS);
+       rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+                                   (char *)&keep_count, sizeof (keep_count));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
+               return (rc);
+       }
+
+       return (0);
+}
+
+void
+ksocknal_lib_push_conn (ksock_conn_t *conn)
+{
+       struct sock    *sk;
+       struct tcp_sock *tp;
+       int          nonagle;
+       int          val = 1;
+       int          rc;
+       mm_segment_t    oldmm;
+
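+       /* Temporarily force Nagle off and poke TCP_NODELAY so any data the
+        * stack is currently buffering is pushed out right away; the
+        * original nonagle setting is restored below. */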
+       rc = ksocknal_connsock_addref(conn);
+       if (rc != 0)                        /* being shut down */
+               return;
+
+       sk = conn->ksnc_sock->sk;
+       tp = tcp_sk(sk);
+
+       lock_sock (sk);
+       nonagle = tp->nonagle;
+       tp->nonagle = 1;
+       release_sock (sk);
+
+       oldmm = get_fs ();
+       set_fs (KERNEL_DS);
+
+       rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+                                     (char *)&val, sizeof (val));
+       LASSERT (rc == 0);
+
+       set_fs (oldmm);
+
+       lock_sock (sk);
+       tp->nonagle = nonagle;
+       release_sock (sk);
+
+       ksocknal_connsock_decref(conn);
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+/*
+ * socket callbacks for Linux
+ */
+static void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+       ksock_conn_t  *conn;
+       ENTRY;
+
+       /* interleave correctly with closing sockets... */
+       LASSERT(!in_irq());
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       conn = sk->sk_user_data;
+       if (conn == NULL) {          /* raced with ksocknal_terminate_conn */
+               LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
+               sk->sk_data_ready (sk, n);
+       } else
+               ksocknal_read_callback(conn);
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       EXIT;
+}
+
+static void
+ksocknal_write_space (struct sock *sk)
+{
+       ksock_conn_t  *conn;
+       int         wspace;
+       int         min_wpace;
+
+       /* interleave correctly with closing sockets... */
+       LASSERT(!in_irq());
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       conn = sk->sk_user_data;
+       wspace = SOCKNAL_WSPACE(sk);
+       min_wpace = SOCKNAL_MIN_WSPACE(sk);
+
+       CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+              sk, wspace, min_wpace, conn,
+              (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
+                                     " ready" : " blocked"),
+              (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+                                     " scheduled" : " idle"),
+              (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+                                     " empty" : " queued"));
+
+       if (conn == NULL) {          /* raced with ksocknal_terminate_conn */
+               LASSERT (sk->sk_write_space != &ksocknal_write_space);
+               sk->sk_write_space (sk);
+
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+               return;
+       }
+
+       if (wspace >= min_wpace) {            /* got enough space */
+               ksocknal_write_callback(conn);
+
+               /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
+                * ENOMEM check in ksocknal_transmit is race-free (think about
+                * it). */
+
+               clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+       conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
+       conn->ksnc_saved_write_space = sock->sk->sk_write_space;
+}
+
+void
+ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
+{
+       sock->sk->sk_user_data = conn;
+       sock->sk->sk_data_ready = ksocknal_data_ready;
+       sock->sk->sk_write_space = ksocknal_write_space;
+       return;
+}
+
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+       /* Remove conn's network callbacks.
+        * NB I _have_ to restore the callback, rather than storing a noop,
+        * since the socket could survive past this module being unloaded!! */
+       sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
+       sock->sk->sk_write_space = conn->ksnc_saved_write_space;
+
+       /* A callback could be in progress already; they hold a read lock
+        * on ksnd_global_lock (to serialise with me) and NOOP if
+        * sk_user_data is NULL. */
+       sock->sk->sk_user_data = NULL;
+
+       return ;
+}
+
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+       int         rc = 0;
+       ksock_sched_t *sched;
+
+       sched = conn->ksnc_scheduler;
+       spin_lock_bh(&sched->kss_lock);
+
+       if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
+           !conn->ksnc_tx_ready) {
+               /* SOCK_NOSPACE is set when the socket fills
+                * and cleared in the write_space callback
+                * (which also sets ksnc_tx_ready).  If
+                * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+                * zero, I didn't fill the socket and
+                * write_space won't reschedule me, so I
+                * return -ENOMEM to get my caller to retry
+                * after a timeout */
+               rc = -ENOMEM;
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+
+       return rc;
+}
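+
+/* Rough caller-side sketch (not part of the original patch): the transmit
+ * path is assumed to treat the -ENOMEM above as "try again later", roughly
+ *
+ *     if (ksocknal_lib_memory_pressure(conn) == -ENOMEM) {
+ *             // park conn on ksnd_enomem_conns; ksocknal_reaper() (see
+ *             // socklnd_cb.c above) later moves it back onto its
+ *             // scheduler's kss_tx_conns and wakes the scheduler up
+ *     }
+ */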
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
new file mode 100644 (file)
index 0000000..3c13578
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_PORTAL_ALLOC
+
+#ifndef __LINUX_SOCKNAL_LIB_H__
+#define __LINUX_SOCKNAL_LIB_H__
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/if.h>
+
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/syscalls.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/crc32.h>
+static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
+{
+#if 1
+       return crc32_le(crc, p, len);
+#else
+       while (len-- > 0)
+               crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
+       return crc;
+#endif
+}
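+
+/* Usage sketch (not part of the original patch; names are illustrative):
+ * the transmit path seeds the CRC with ~0 for the message header and then
+ * folds in each payload fragment, as ksocknal_lib_csum_tx() does:
+ *
+ *     __u32 csum = ksocknal_csum(~0, hdr, hdr_len);
+ *     for (i = 0; i < nfrags; i++)
+ *             csum = ksocknal_csum(csum, frag[i].base, frag[i].len);
+ */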
+
+#define SOCKNAL_WSPACE(sk)       sk_stream_wspace(sk)
+#define SOCKNAL_MIN_WSPACE(sk)   sk_stream_min_wspace(sk)
+
+/* assume one thread for each connection type */
+#define SOCKNAL_NSCHEDS                3
+#define SOCKNAL_NSCHEDS_HIGH   (SOCKNAL_NSCHEDS << 1)
+
+#endif
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
new file mode 100644 (file)
index 0000000..8a474f6
--- /dev/null
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+static int sock_timeout = 50;
+CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
+               "dead socket timeout (seconds)");
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+               "# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+               "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* Number of daemons in each per-CPT thread pool; if not set, we will
+ * estimate a reasonable value based on the number of CPUs. */
+static unsigned int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+               "# scheduler daemons in each pool while starting");
+
+static int nconnds = 4;
+CFS_MODULE_PARM(nconnds, "i", int, 0444,
+               "# connection daemons while starting");
+
+static int nconnds_max = 64;
+CFS_MODULE_PARM(nconnds_max, "i", int, 0444,
+               "max # connection daemons");
+
+static int min_reconnectms = 1000;
+CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
+               "min connection retry interval (mS)");
+
+static int max_reconnectms = 60000;
+CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
+               "max connection retry interval (mS)");
+
+# define DEFAULT_EAGER_ACK 0
+static int eager_ack = DEFAULT_EAGER_ACK;
+CFS_MODULE_PARM(eager_ack, "i", int, 0644,
+               "send tcp ack packets eagerly");
+
+static int typed_conns = 1;
+CFS_MODULE_PARM(typed_conns, "i", int, 0444,
+               "use different sockets for bulk");
+
+static int min_bulk = (1<<10);
+CFS_MODULE_PARM(min_bulk, "i", int, 0644,
+               "smallest 'large' message");
+
+# define DEFAULT_BUFFER_SIZE 0
+static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
+               "socket tx buffer size (0 for system default)");
+
+static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
+               "socket rx buffer size (0 for system default)");
+
+static int nagle = 0;
+CFS_MODULE_PARM(nagle, "i", int, 0644,
+               "enable NAGLE?");
+
+static int round_robin = 1;
+CFS_MODULE_PARM(round_robin, "i", int, 0644,
+               "Round robin for multiple interfaces");
+
+static int keepalive = 30;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+               "# seconds before send keepalive");
+
+static int keepalive_idle = 30;
+CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
+               "# idle seconds before probe");
+
+#define DEFAULT_KEEPALIVE_COUNT  5
+static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
+CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
+               "# missed probes == dead");
+
+static int keepalive_intvl = 5;
+CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
+               "seconds between probes");
+
+static int enable_csum = 0;
+CFS_MODULE_PARM(enable_csum, "i", int, 0644,
+               "enable check sum");
+
+static int inject_csum_error = 0;
+CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
+               "set non-zero to inject a checksum error");
+
+static int nonblk_zcack = 1;
+CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644,
+               "always send ZC-ACK on non-blocking connection");
+
+static unsigned int zc_min_payload = (16 << 10);
+CFS_MODULE_PARM(zc_min_payload, "i", int, 0644,
+               "minimum payload size to zero copy");
+
+static unsigned int zc_recv = 0;
+CFS_MODULE_PARM(zc_recv, "i", int, 0644,
+               "enable ZC recv for Chelsio driver");
+
+static unsigned int zc_recv_min_nfrags = 16;
+CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0644,
+               "minimum # of fragments to enable ZC recv");
+
+
+#if SOCKNAL_VERSION_DEBUG
+static int protocol = 3;
+CFS_MODULE_PARM(protocol, "i", int, 0644,
+               "protocol version");
+#endif
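
Note that CFS_MODULE_PARM above is a libcfs wrapper, not a stock kernel macro; for the integer parameters in this file it presumably reduces to the usual module_param()/MODULE_PARM_DESC() pair. A hedged sketch of the assumed expansion follows (the authoritative definition lives in the libcfs headers pulled in via socklnd.h):

    /* Hedged sketch of what CFS_MODULE_PARM(name, "i", int, perm, desc) is
     * assumed to expand to; illustrative only, see the libcfs headers for
     * the real macro. */
    #define CFS_MODULE_PARM(name, t, type, perm, desc)  \
            module_param(name, type, perm);             \
            MODULE_PARM_DESC(name, desc)
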
+
+ksock_tunables_t ksocknal_tunables;
+
+int ksocknal_tunables_init(void)
+{
+
+       /* initialize ksocknal_tunables structure */
+       ksocknal_tunables.ksnd_timeout      = &sock_timeout;
+       ksocknal_tunables.ksnd_nscheds            = &nscheds;
+       ksocknal_tunables.ksnd_nconnds      = &nconnds;
+       ksocknal_tunables.ksnd_nconnds_max      = &nconnds_max;
+       ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
+       ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
+       ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
+       ksocknal_tunables.ksnd_typed_conns      = &typed_conns;
+       ksocknal_tunables.ksnd_min_bulk    = &min_bulk;
+       ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
+       ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
+       ksocknal_tunables.ksnd_nagle          = &nagle;
+       ksocknal_tunables.ksnd_round_robin      = &round_robin;
+       ksocknal_tunables.ksnd_keepalive          = &keepalive;
+       ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
+       ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
+       ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
+       ksocknal_tunables.ksnd_credits      = &credits;
+       ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
+       ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
+       ksocknal_tunables.ksnd_peertimeout      = &peer_timeout;
+       ksocknal_tunables.ksnd_enable_csum      = &enable_csum;
+       ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
+       ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
+       ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
+       ksocknal_tunables.ksnd_zc_recv      = &zc_recv;
+       ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+
+#if SOCKNAL_VERSION_DEBUG
+       ksocknal_tunables.ksnd_protocol    = &protocol;
+#endif
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+       ksocknal_tunables.ksnd_sysctl        =  NULL;
+#endif
+
+       if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+               *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+
+       /* initialize platform-specific tunables */
+       return ksocknal_lib_tunables_init();
+}
+
+void ksocknal_tunables_fini(void)
+{
+       ksocknal_lib_tunables_fini();
+}
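
Because ksocknal_tunables only stores pointers to the module parameters above, the rest of socklnd reads the live values by dereferencing them, as ksocknal_match_tx() does with ksnd_min_bulk further below. A minimal illustration; the accessor name here is hypothetical:

    /* Hypothetical accessor, shown only to illustrate how the pointer-based
     * tunables above are consumed; ksnd_timeout points at sock_timeout. */
    static inline int ksocknal_sock_timeout(void)
    {
            return *ksocknal_tunables.ksnd_timeout;
    }
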
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
new file mode 100644 (file)
index 0000000..ec57179
--- /dev/null
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries :
+ *   pro_send_hello       : send hello message
+ *   pro_recv_hello       : receive hello message
+ *   pro_pack       : pack message header
+ *   pro_unpack           : unpack message header
+ *   pro_queue_tx_zcack() : Called holding BH lock: kss_lock
+ *                       return 1 if ACK is piggybacked, otherwise return 0
+ *   pro_queue_tx_msg()   : Called holding BH lock: kss_lock
+ *                       return the ACK that is piggybacked by my message, or NULL
+ *   pro_handle_zcreq()   : handler of incoming ZC-REQ
+ *   pro_handle_zcack()   : handler of incoming ZC-ACK
+ *   pro_match_tx()       : Called holding glock
+ */
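
The three ksock_proto_t tables at the end of this file bind these entries to per-version implementations, and callers dispatch through the protocol negotiated on the connection. An illustrative sketch of that dispatch (not part of the patch):

    /* Illustrative only: send-path code picks the implementation via
     * conn->ksnc_proto, the protocol negotiated for this connection. */
    static ksock_tx_t *
    example_queue_tx(ksock_conn_t *conn, ksock_tx_t *tx_msg)
    {
            /* caller must hold conn->ksnc_scheduler->kss_lock (BH lock) */
            return conn->ksnc_proto->pro_queue_tx_msg(conn, tx_msg);
    }
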
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+       /* V1.x, just enqueue it */
+       list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+       return NULL;
+}
+
+void
+ksocknal_next_tx_carrier(ksock_conn_t *conn)
+{
+       ksock_tx_t     *tx = conn->ksnc_tx_carrier;
+
+       /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+       LASSERT (!list_empty(&conn->ksnc_tx_queue));
+       LASSERT (tx != NULL);
+
+       /* Next TX that can carry ZC-ACK or LNet message */
+       if (tx->tx_list.next == &conn->ksnc_tx_queue) {
+               /* no more packets queued */
+               conn->ksnc_tx_carrier = NULL;
+       } else {
+               conn->ksnc_tx_carrier = list_entry(tx->tx_list.next,
+                                                      ksock_tx_t, tx_list);
+               LASSERT (conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
+       }
+}
+
+static int
+ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn,
+                          ksock_tx_t *tx_ack, __u64 cookie)
+{
+       ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+       LASSERT (tx_ack == NULL ||
+                tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+       /*
+        * Enqueue or piggyback tx_ack / cookie:
+        * . If no tx can piggyback the cookie of tx_ack (or cookie), just
+        *   enqueue tx_ack (if tx_ack != NULL) and return 0.
+        * . If a tx can piggyback the cookie of tx_ack (or cookie),
+        *   piggyback the cookie and return 1.
+        */
+       if (tx == NULL) {
+               if (tx_ack != NULL) {
+                       list_add_tail(&tx_ack->tx_list,
+                                         &conn->ksnc_tx_queue);
+                       conn->ksnc_tx_carrier = tx_ack;
+               }
+               return 0;
+       }
+
+       if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+               /* tx is noop zc-ack, can't piggyback zc-ack cookie */
+               if (tx_ack != NULL)
+                       list_add_tail(&tx_ack->tx_list,
+                                         &conn->ksnc_tx_queue);
+               return 0;
+       }
+
+       LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
+       LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0);
+
+       if (tx_ack != NULL)
+               cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+       /* piggyback the zc-ack cookie */
+       tx->tx_msg.ksm_zc_cookies[1] = cookie;
+       /* move on to the next TX which can carry cookie */
+       ksocknal_next_tx_carrier(conn);
+
+       return 1;
+}
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+       ksock_tx_t  *tx  = conn->ksnc_tx_carrier;
+
+       /*
+        * Enqueue tx_msg:
+        * . If there is no NOOP on the connection, just enqueue
+        *   tx_msg and return NULL
+        * . If there is NOOP on the connection, piggyback the cookie
+        *   and replace the NOOP tx, and return the NOOP tx.
+        */
+       if (tx == NULL) { /* nothing on queue */
+               list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+               conn->ksnc_tx_carrier = tx_msg;
+               return NULL;
+       }
+
+       if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */
+               list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+               return NULL;
+       }
+
+       LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+       /* There is a noop zc-ack whose cookie can be piggybacked */
+       tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1];
+       ksocknal_next_tx_carrier(conn);
+
+       /* use new_tx to replace the noop zc-ack packet */
+       list_add(&tx_msg->tx_list, &tx->tx_list);
+       list_del(&tx->tx_list);
+
+       return tx;
+}
+
+static int
+ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn,
+                          ksock_tx_t *tx_ack, __u64 cookie)
+{
+       ksock_tx_t *tx;
+
+       if (conn->ksnc_type != SOCKLND_CONN_ACK)
+               return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
+
+       /* non-blocking ZC-ACK (to router) */
+       LASSERT (tx_ack == NULL ||
+                tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+       if ((tx = conn->ksnc_tx_carrier) == NULL) {
+               if (tx_ack != NULL) {
+                       list_add_tail(&tx_ack->tx_list,
+                                         &conn->ksnc_tx_queue);
+                       conn->ksnc_tx_carrier = tx_ack;
+               }
+               return 0;
+       }
+
+       /* conn->ksnc_tx_carrier != NULL */
+
+       if (tx_ack != NULL)
+               cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+       if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
+               return 1;
+
+       if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
+               /* replace the keepalive PING with a real ACK */
+               LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+               tx->tx_msg.ksm_zc_cookies[1] = cookie;
+               return 1;
+       }
+
+       if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
+           cookie == tx->tx_msg.ksm_zc_cookies[1]) {
+               CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+                     libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+               return 1; /* XXX return error in the future */
+       }
+
+       if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+               /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */
+               if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
+                       tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
+                       tx->tx_msg.ksm_zc_cookies[1] = cookie;
+               } else {
+                       tx->tx_msg.ksm_zc_cookies[0] = cookie;
+               }
+
+               if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
+                       /* not likely to carry more ACKs, skip it to simplify logic */
+                       ksocknal_next_tx_carrier(conn);
+               }
+
+               return 1;
+       }
+
+       /* takes two or more cookies already */
+
+       if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
+               __u64   tmp = 0;
+
+               /* two separated cookies: (a+2, a) or (a+1, a) */
+               LASSERT (tx->tx_msg.ksm_zc_cookies[0] -
+                        tx->tx_msg.ksm_zc_cookies[1] <= 2);
+
+               if (tx->tx_msg.ksm_zc_cookies[0] -
+                   tx->tx_msg.ksm_zc_cookies[1] == 2) {
+                       if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
+                               tmp = cookie;
+               } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
+                       tmp = tx->tx_msg.ksm_zc_cookies[1];
+               } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
+                       tmp = tx->tx_msg.ksm_zc_cookies[0];
+               }
+
+               if (tmp != 0) {
+                       /* range of cookies */
+                       tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
+                       tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
+                       return 1;
+               }
+
+       } else {
+               /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */
+               if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
+                   cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
+                       CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+                             libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+                       return 1; /* XXX: return error in the future */
+               }
+
+               if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
+                       tx->tx_msg.ksm_zc_cookies[1] = cookie;
+                       return 1;
+               }
+
+               if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
+                       tx->tx_msg.ksm_zc_cookies[0] = cookie;
+                       return 1;
+               }
+       }
+
+       /* failed to piggyback ZC-ACK */
+       if (tx_ack != NULL) {
+               list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
+               /* the next tx can piggyback at least 1 ACK */
+               ksocknal_next_tx_carrier(conn);
+       }
+
+       return 0;
+}
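
To make the cookie bookkeeping above concrete, here is one illustrative trace (cookie values are arbitrary) of how a carrier NOOP accumulates ZC-ACK cookies and collapses them into a range:

    /*
     * Illustrative trace of ksocknal_queue_tx_zcack_v3() on one carrier tx:
     *
     *   start:   ksm_zc_cookies = {0, 7}    single cookie 7
     *   ACK  8:  ksm_zc_cookies = {8, 7}    two separated cookies ([0] > [1])
     *   ACK  9:  ksm_zc_cookies = {7, 9}    9 == [0] + 1 fills the gap, so
     *                                       the pair becomes the range 7..9
     *   ACK 10:  ksm_zc_cookies = {7, 10}   range extended at the top end
     *
     * A cookie that falls inside an existing range is warned about as a
     * duplicate, and a keepalive PING placeholder is simply overwritten.
     */
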
+
+static int
+ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+       int nob;
+
+#if SOCKNAL_VERSION_DEBUG
+       if (!*ksocknal_tunables.ksnd_typed_conns)
+               return SOCKNAL_MATCH_YES;
+#endif
+
+       if (tx == NULL || tx->tx_lnetmsg == NULL) {
+               /* noop packet */
+               nob = offsetof(ksock_msg_t, ksm_u);
+       } else {
+               nob = tx->tx_lnetmsg->msg_len +
+                     ((conn->ksnc_proto == &ksocknal_protocol_v1x) ?
+                      sizeof(lnet_hdr_t) : sizeof(ksock_msg_t));
+       }
+
+       /* default checking for typed connection */
+       switch (conn->ksnc_type) {
+       default:
+               CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+               LBUG();
+       case SOCKLND_CONN_ANY:
+               return SOCKNAL_MATCH_YES;
+
+       case SOCKLND_CONN_BULK_IN:
+               return SOCKNAL_MATCH_MAY;
+
+       case SOCKLND_CONN_BULK_OUT:
+               if (nob < *ksocknal_tunables.ksnd_min_bulk)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_YES;
+
+       case SOCKLND_CONN_CONTROL:
+               if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_YES;
+       }
+}
+
+static int
+ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+       int nob;
+
+       if (tx == NULL || tx->tx_lnetmsg == NULL)
+               nob = offsetof(ksock_msg_t, ksm_u);
+       else
+               nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t);
+
+       switch (conn->ksnc_type) {
+       default:
+               CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+               LBUG();
+       case SOCKLND_CONN_ANY:
+               return SOCKNAL_MATCH_NO;
+
+       case SOCKLND_CONN_ACK:
+               if (nonblk)
+                       return SOCKNAL_MATCH_YES;
+               else if (tx == NULL || tx->tx_lnetmsg == NULL)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_NO;
+
+       case SOCKLND_CONN_BULK_OUT:
+               if (nonblk)
+                       return SOCKNAL_MATCH_NO;
+               else if (nob < *ksocknal_tunables.ksnd_min_bulk)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_YES;
+
+       case SOCKLND_CONN_CONTROL:
+               if (nonblk)
+                       return SOCKNAL_MATCH_NO;
+               else if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_YES;
+       }
+}
+
+/* (Sink) handle incoming ZC request from sender */
+static int
+ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
+{
+       ksock_peer_t   *peer = c->ksnc_peer;
+       ksock_conn_t   *conn;
+       ksock_tx_t     *tx;
+       int          rc;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
+       if (conn != NULL) {
+               ksock_sched_t *sched = conn->ksnc_scheduler;
+
+               LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+               spin_lock_bh(&sched->kss_lock);
+
+               rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie);
+
+               spin_unlock_bh(&sched->kss_lock);
+
+               if (rc) { /* piggybacked */
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+                       return 0;
+               }
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       /* ACK connection is not ready, or can't piggyback the ACK */
+       tx = ksocknal_alloc_tx_noop(cookie, !!remote);
+       if (tx == NULL)
+               return -ENOMEM;
+
+       if ((rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) == 0)
+               return 0;
+
+       ksocknal_free_tx(tx);
+       return rc;
+}
+
+/* (Sender) handle ZC_ACK from sink */
+static int
+ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
+{
+       ksock_peer_t      *peer = conn->ksnc_peer;
+       ksock_tx_t      *tx;
+       ksock_tx_t      *tmp;
+       LIST_HEAD     (zlist);
+       int             count;
+
+       if (cookie1 == 0)
+               cookie1 = cookie2;
+
+       count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1);
+
+       if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
+           conn->ksnc_proto == &ksocknal_protocol_v3x) {
+               /* keepalive PING for V3.x, just ignore it */
+               return count == 1 ? 0 : -EPROTO;
+       }
+
+       spin_lock(&peer->ksnp_lock);
+
+       list_for_each_entry_safe(tx, tmp,
+                                    &peer->ksnp_zc_req_list, tx_zc_list) {
+               __u64 c = tx->tx_msg.ksm_zc_cookies[0];
+
+               if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) {
+                       tx->tx_msg.ksm_zc_cookies[0] = 0;
+                       list_del(&tx->tx_zc_list);
+                       list_add(&tx->tx_zc_list, &zlist);
+
+                       if (--count == 0)
+                               break;
+               }
+       }
+
+       spin_unlock(&peer->ksnp_lock);
+
+       while (!list_empty(&zlist)) {
+               tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+               list_del(&tx->tx_zc_list);
+               ksocknal_tx_decref(tx);
+       }
+
+       return count == 0 ? 0 : -EPROTO;
+}
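
The count computed above reflects the two ways a ZC-ACK encodes cookies, matching the v3 queueing code: cookie1 > cookie2 means two discrete cookies, otherwise the pair is an inclusive range. A few illustrative cases:

    /* Illustrative cases for the cookie arithmetic above:
     *   cookie1 = 0, cookie2 = 5  ->  cookie1 becomes 5, count = 1 (one ACK)
     *   cookie1 = 3, cookie2 = 5  ->  count = 3 (acks cookies 3, 4 and 5)
     *   cookie1 = 9, cookie2 = 7  ->  count = 2 (discrete cookies 9 and 7)
     * -EPROTO is returned if fewer matching ZC requests are found than
     * the count implies.
     */
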
+
+static int
+ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+       socket_t        *sock = conn->ksnc_sock;
+       lnet_hdr_t        *hdr;
+       lnet_magicversion_t *hmv;
+       int               rc;
+       int               i;
+
+       CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid));
+
+       LIBCFS_ALLOC(hdr, sizeof(*hdr));
+       if (hdr == NULL) {
+               CERROR("Can't allocate lnet_hdr_t\n");
+               return -ENOMEM;
+       }
+
+       hmv = (lnet_magicversion_t *)&hdr->dest_nid;
+
+       /* Re-organize V2.x message header to V1.x (lnet_hdr_t)
+        * header and send out */
+       hmv->magic       = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+       hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR);
+       hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR);
+
+       if (the_lnet.ln_testprotocompat != 0) {
+               /* single-shot proto check */
+               LNET_LOCK();
+               if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                       hmv->version_major++;   /* just different! */
+                       the_lnet.ln_testprotocompat &= ~1;
+               }
+               if ((the_lnet.ln_testprotocompat & 2) != 0) {
+                       hmv->magic = LNET_PROTO_MAGIC;
+                       the_lnet.ln_testprotocompat &= ~2;
+               }
+               LNET_UNLOCK();
+       }
+
+       hdr->src_nid    = cpu_to_le64 (hello->kshm_src_nid);
+       hdr->src_pid    = cpu_to_le32 (hello->kshm_src_pid);
+       hdr->type          = cpu_to_le32 (LNET_MSG_HELLO);
+       hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+       hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+       hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
+
+       rc = libcfs_sock_write(sock, hdr, sizeof(*hdr),lnet_acceptor_timeout());
+
+       if (rc != 0) {
+               CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+               goto out;
+       }
+
+       if (hello->kshm_nips == 0)
+               goto out;
+
+       for (i = 0; i < (int) hello->kshm_nips; i++) {
+               hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
+       }
+
+       rc = libcfs_sock_write(sock, hello->kshm_ips,
+                              hello->kshm_nips * sizeof(__u32),
+                              lnet_acceptor_timeout());
+       if (rc != 0) {
+               CNETERR("Error %d sending HELLO payload (%d)"
+                       " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+       }
+out:
+       LIBCFS_FREE(hdr, sizeof(*hdr));
+
+       return rc;
+}
+
+static int
+ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+       socket_t   *sock = conn->ksnc_sock;
+       int          rc;
+
+       hello->kshm_magic   = LNET_PROTO_MAGIC;
+       hello->kshm_version = conn->ksnc_proto->pro_version;
+
+       if (the_lnet.ln_testprotocompat != 0) {
+               /* single-shot proto check */
+               LNET_LOCK();
+               if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                       hello->kshm_version++;   /* just different! */
+                       the_lnet.ln_testprotocompat &= ~1;
+               }
+               LNET_UNLOCK();
+       }
+
+       rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips),
+                              lnet_acceptor_timeout());
+
+       if (rc != 0) {
+               CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+               return rc;
+       }
+
+       if (hello->kshm_nips == 0)
+               return 0;
+
+       rc = libcfs_sock_write(sock, hello->kshm_ips,
+                              hello->kshm_nips * sizeof(__u32),
+                              lnet_acceptor_timeout());
+       if (rc != 0) {
+               CNETERR("Error %d sending HELLO payload (%d)"
+                       " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+       }
+
+       return rc;
+}
+
+static int
+ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,int timeout)
+{
+       socket_t        *sock = conn->ksnc_sock;
+       lnet_hdr_t        *hdr;
+       int               rc;
+       int               i;
+
+       LIBCFS_ALLOC(hdr, sizeof(*hdr));
+       if (hdr == NULL) {
+               CERROR("Can't allocate lnet_hdr_t\n");
+               return -ENOMEM;
+       }
+
+       rc = libcfs_sock_read(sock, &hdr->src_nid,
+                             sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid),
+                             timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0 && rc != -EALREADY);
+               goto out;
+       }
+
+       /* ...and check we got what we expected */
+       if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) {
+               CERROR ("Expecting a HELLO hdr,"
+                       " but got type %d from %u.%u.%u.%u\n",
+                       le32_to_cpu (hdr->type),
+                       HIPQUAD(conn->ksnc_ipaddr));
+               rc = -EPROTO;
+               goto out;
+       }
+
+       hello->kshm_src_nid      = le64_to_cpu (hdr->src_nid);
+       hello->kshm_src_pid      = le32_to_cpu (hdr->src_pid);
+       hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation);
+       hello->kshm_ctype          = le32_to_cpu (hdr->msg.hello.type);
+       hello->kshm_nips            = le32_to_cpu (hdr->payload_length) /
+                                        sizeof (__u32);
+
+       if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+               CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+                      hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+               rc = -EPROTO;
+               goto out;
+       }
+
+       if (hello->kshm_nips == 0)
+               goto out;
+
+       rc = libcfs_sock_read(sock, hello->kshm_ips,
+                             hello->kshm_nips * sizeof(__u32), timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0 && rc != -EALREADY);
+               goto out;
+       }
+
+       for (i = 0; i < (int) hello->kshm_nips; i++) {
+               hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
+
+               if (hello->kshm_ips[i] == 0) {
+                       CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+                              i, HIPQUAD(conn->ksnc_ipaddr));
+                       rc = -EPROTO;
+                       break;
+               }
+       }
+out:
+       LIBCFS_FREE(hdr, sizeof(*hdr));
+
+       return rc;
+}
+
+static int
+ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout)
+{
+       socket_t      *sock = conn->ksnc_sock;
+       int             rc;
+       int             i;
+
+       if (hello->kshm_magic == LNET_PROTO_MAGIC)
+               conn->ksnc_flip = 0;
+       else
+               conn->ksnc_flip = 1;
+
+       rc = libcfs_sock_read(sock, &hello->kshm_src_nid,
+                             offsetof(ksock_hello_msg_t, kshm_ips) -
+                                      offsetof(ksock_hello_msg_t, kshm_src_nid),
+                             timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0 && rc != -EALREADY);
+               return rc;
+       }
+
+       if (conn->ksnc_flip) {
+               __swab32s(&hello->kshm_src_pid);
+               __swab64s(&hello->kshm_src_nid);
+               __swab32s(&hello->kshm_dst_pid);
+               __swab64s(&hello->kshm_dst_nid);
+               __swab64s(&hello->kshm_src_incarnation);
+               __swab64s(&hello->kshm_dst_incarnation);
+               __swab32s(&hello->kshm_ctype);
+               __swab32s(&hello->kshm_nips);
+       }
+
+       if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+               CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+                      hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+               return -EPROTO;
+       }
+
+       if (hello->kshm_nips == 0)
+               return 0;
+
+       rc = libcfs_sock_read(sock, hello->kshm_ips,
+                             hello->kshm_nips * sizeof(__u32), timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0 && rc != -EALREADY);
+               return rc;
+       }
+
+       for (i = 0; i < (int) hello->kshm_nips; i++) {
+               if (conn->ksnc_flip)
+                       __swab32s(&hello->kshm_ips[i]);
+
+               if (hello->kshm_ips[i] == 0) {
+                       CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+                              i, HIPQUAD(conn->ksnc_ipaddr));
+                       return -EPROTO;
+               }
+       }
+
+       return 0;
+}
+
+static void
+ksocknal_pack_msg_v1(ksock_tx_t *tx)
+{
+       /* V1.x has no KSOCK_MSG_NOOP */
+       LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+       LASSERT(tx->tx_lnetmsg != NULL);
+
+       tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr;
+       tx->tx_iov[0].iov_len  = sizeof(lnet_hdr_t);
+
+       tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+}
+
+static void
+ksocknal_pack_msg_v2(ksock_tx_t *tx)
+{
+       tx->tx_iov[0].iov_base = (void *)&tx->tx_msg;
+
+       if (tx->tx_lnetmsg != NULL) {
+               LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+
+               tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
+               tx->tx_iov[0].iov_len = sizeof(ksock_msg_t);
+               tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len;
+       } else {
+               LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+               tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+               tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t,  ksm_u.lnetmsg.ksnm_hdr);
+       }
+       /* Don't checksum before start sending, because packet can be piggybacked with ACK */
+}
+
+static void
+ksocknal_unpack_msg_v1(ksock_msg_t *msg)
+{
+       msg->ksm_csum      = 0;
+       msg->ksm_type      = KSOCK_MSG_LNET;
+       msg->ksm_zc_cookies[0]  = msg->ksm_zc_cookies[1]  = 0;
+}
+
+static void
+ksocknal_unpack_msg_v2(ksock_msg_t *msg)
+{
+       return;  /* Do nothing */
+}
+
+ksock_proto_t  ksocknal_protocol_v1x =
+{
+       .pro_version        = KSOCK_PROTO_V1,
+       .pro_send_hello  = ksocknal_send_hello_v1,
+       .pro_recv_hello  = ksocknal_recv_hello_v1,
+       .pro_pack              = ksocknal_pack_msg_v1,
+       .pro_unpack          = ksocknal_unpack_msg_v1,
+       .pro_queue_tx_msg       = ksocknal_queue_tx_msg_v1,
+       .pro_handle_zcreq       = NULL,
+       .pro_handle_zcack       = NULL,
+       .pro_queue_tx_zcack     = NULL,
+       .pro_match_tx      = ksocknal_match_tx
+};
+
+ksock_proto_t  ksocknal_protocol_v2x =
+{
+       .pro_version        = KSOCK_PROTO_V2,
+       .pro_send_hello  = ksocknal_send_hello_v2,
+       .pro_recv_hello  = ksocknal_recv_hello_v2,
+       .pro_pack              = ksocknal_pack_msg_v2,
+       .pro_unpack          = ksocknal_unpack_msg_v2,
+       .pro_queue_tx_msg       = ksocknal_queue_tx_msg_v2,
+       .pro_queue_tx_zcack     = ksocknal_queue_tx_zcack_v2,
+       .pro_handle_zcreq       = ksocknal_handle_zcreq,
+       .pro_handle_zcack       = ksocknal_handle_zcack,
+       .pro_match_tx      = ksocknal_match_tx
+};
+
+ksock_proto_t  ksocknal_protocol_v3x =
+{
+       .pro_version        = KSOCK_PROTO_V3,
+       .pro_send_hello  = ksocknal_send_hello_v2,
+       .pro_recv_hello  = ksocknal_recv_hello_v2,
+       .pro_pack              = ksocknal_pack_msg_v2,
+       .pro_unpack          = ksocknal_unpack_msg_v2,
+       .pro_queue_tx_msg       = ksocknal_queue_tx_msg_v2,
+       .pro_queue_tx_zcack     = ksocknal_queue_tx_zcack_v3,
+       .pro_handle_zcreq       = ksocknal_handle_zcreq,
+       .pro_handle_zcack       = ksocknal_handle_zcack,
+       .pro_match_tx      = ksocknal_match_tx_v3
+};
diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile
new file mode 100644 (file)
index 0000000..1bd9ef7
--- /dev/null
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LNET) += lnet.o
+
+lnet-y := api-errno.o api-ni.o config.o lib-me.o lib-msg.o lib-eq.o    \
+         lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o          \
+         router_proc.o acceptor.o peer.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c
new file mode 100644 (file)
index 0000000..81ef28b
--- /dev/null
@@ -0,0 +1,527 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+
+static int   accept_port    = 988;
+static int   accept_backlog = 127;
+static int   accept_timeout = 5;
+
+struct {
+       int                     pta_shutdown;
+       socket_t                *pta_sock;
+       struct completion       pta_signal;
+} lnet_acceptor_state;
+
+int
+lnet_acceptor_port(void)
+{
+       return accept_port;
+}
+
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+       return (magic == constant ||
+               magic == __swab32(constant));
+}
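
lnet_accept_magic() matches the constant in either byte order; lnet_accept() below uses the same comparison to set 'flip' and byte-swap the rest of the connection request when the peer's endianness differs. A minimal sketch of that check (illustrative, not part of the patch):

    /* Illustrative mirror of the flip detection in lnet_accept(): a peer of
     * opposite endianness writes __swab32(LNET_PROTO_ACCEPTOR_MAGIC), which
     * still passes lnet_accept_magic() but compares unequal here. */
    static int example_connreq_needs_flip(__u32 magic)
    {
            if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC))
                    return -EPROTO;  /* not an acceptor connection request */

            return magic != LNET_PROTO_ACCEPTOR_MAGIC;   /* 1 => byte-swap */
    }
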
+
+
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+static char *accept = "secure";
+
+CFS_MODULE_PARM(accept, "s", charp, 0444,
+               "Accept connections (secure|all|none)");
+CFS_MODULE_PARM(accept_port, "i", int, 0444,
+               "Acceptor's port (same on all nodes)");
+CFS_MODULE_PARM(accept_backlog, "i", int, 0444,
+               "Acceptor's listen backlog");
+CFS_MODULE_PARM(accept_timeout, "i", int, 0644,
+               "Acceptor's timeout (seconds)");
+
+static char *accept_type = NULL;
+
+int
+lnet_acceptor_get_tunables(void)
+{
+       /* Userland acceptor uses 'accept_type' instead of 'accept', due to
+        * conflict with 'accept(2)', but kernel acceptor still uses 'accept'
+        * for compatibility. Hence the trick. */
+       accept_type = accept;
+       return 0;
+}
+
+int
+lnet_acceptor_timeout(void)
+{
+       return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+void
+lnet_connect_console_error (int rc, lnet_nid_t peer_nid,
+                          __u32 peer_ip, int peer_port)
+{
+       switch (rc) {
+       /* "normal" errors */
+       case -ECONNREFUSED:
+               CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was "
+                       "refused: check that Lustre is running on that node.\n",
+                       libcfs_nid2str(peer_nid),
+                       HIPQUAD(peer_ip), peer_port);
+               break;
+       case -EHOSTUNREACH:
+       case -ENETUNREACH:
+               CNETERR("Connection to %s at host %u.%u.%u.%u "
+                       "was unreachable: the network or that node may "
+                       "be down, or Lustre may be misconfigured.\n",
+                       libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
+               break;
+       case -ETIMEDOUT:
+               CNETERR("Connection to %s at host %u.%u.%u.%u on "
+                       "port %d took too long: that node may be hung "
+                       "or experiencing high load.\n",
+                       libcfs_nid2str(peer_nid),
+                       HIPQUAD(peer_ip), peer_port);
+               break;
+       case -ECONNRESET:
+               LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u"
+                                  " on port %d was reset: "
+                                  "is it running a compatible version of "
+                                  "Lustre and is %s one of its NIDs?\n",
+                                  libcfs_nid2str(peer_nid),
+                                  HIPQUAD(peer_ip), peer_port,
+                                  libcfs_nid2str(peer_nid));
+               break;
+       case -EPROTO:
+               LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at "
+                                  "host %u.%u.%u.%u on port %d: is it running "
+                                  "a compatible version of Lustre?\n",
+                                  libcfs_nid2str(peer_nid),
+                                  HIPQUAD(peer_ip), peer_port);
+               break;
+       case -EADDRINUSE:
+               LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to "
+                                  "connect to %s at host %u.%u.%u.%u on port "
+                                  "%d\n", libcfs_nid2str(peer_nid),
+                                  HIPQUAD(peer_ip), peer_port);
+               break;
+       default:
+               LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s"
+                                  " at host %u.%u.%u.%u on port %d\n", rc,
+                                  libcfs_nid2str(peer_nid),
+                                  HIPQUAD(peer_ip), peer_port);
+               break;
+       }
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+int
+lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+           __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+       lnet_acceptor_connreq_t cr;
+       socket_t           *sock;
+       int                  rc;
+       int                  port;
+       int                  fatal;
+
+       CLASSERT (sizeof(cr) <= 16);        /* not too big to be on the stack */
+
+       for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
+            port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
+            --port) {
+               /* Iterate through reserved ports. */
+
+               rc = libcfs_sock_connect(&sock, &fatal,
+                                        local_ip, port,
+                                        peer_ip, peer_port);
+               if (rc != 0) {
+                       if (fatal)
+                               goto failed;
+                       continue;
+               }
+
+               CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+               cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
+               cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+               cr.acr_nid     = peer_nid;
+
+               if (the_lnet.ln_testprotocompat != 0) {
+                       /* single-shot proto check */
+                       lnet_net_lock(LNET_LOCK_EX);
+                       if ((the_lnet.ln_testprotocompat & 4) != 0) {
+                               cr.acr_version++;
+                               the_lnet.ln_testprotocompat &= ~4;
+                       }
+                       if ((the_lnet.ln_testprotocompat & 8) != 0) {
+                               cr.acr_magic = LNET_PROTO_MAGIC;
+                               the_lnet.ln_testprotocompat &= ~8;
+                       }
+                       lnet_net_unlock(LNET_LOCK_EX);
+               }
+
+               rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                      accept_timeout);
+               if (rc != 0)
+                       goto failed_sock;
+
+               *sockp = sock;
+               return 0;
+       }
+
+       rc = -EADDRINUSE;
+       goto failed;
+
+ failed_sock:
+       libcfs_sock_release(sock);
+ failed:
+       lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+       return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+
+/* Below is the code common for both kernel and MT user-space */
+
+int
+lnet_accept(socket_t *sock, __u32 magic)
+{
+       lnet_acceptor_connreq_t cr;
+       __u32              peer_ip;
+       int                  peer_port;
+       int                  rc;
+       int                  flip;
+       lnet_ni_t             *ni;
+       char               *str;
+
+       LASSERT (sizeof(cr) <= 16);          /* not too big for the stack */
+
+       rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+       LASSERT (rc == 0);                    /* we succeeded before */
+
+       if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+               if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+                       /* future version compatibility!
+                        * When LNET unifies protocols over all LNDs, the first
+                        * thing sent will be a version query.  I send back
+                        * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+                       memset (&cr, 0, sizeof(cr));
+                       cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+                       cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+                       rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                              accept_timeout);
+
+                       if (rc != 0)
+                               CERROR("Error sending magic+version in response "
+                                      "to LNET magic from %u.%u.%u.%u: %d\n",
+                                      HIPQUAD(peer_ip), rc);
+                       return -EPROTO;
+               }
+
+               if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+                       str = "'old' socknal/tcpnal";
+               else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+                       str = "'old' ranal";
+               else
+                       str = "unrecognised";
+
+               LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u"
+                                  " magic %08x: %s acceptor protocol\n",
+                                  HIPQUAD(peer_ip), magic, str);
+               return -EPROTO;
+       }
+
+       flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+       rc = libcfs_sock_read(sock, &cr.acr_version,
+                             sizeof(cr.acr_version),
+                             accept_timeout);
+       if (rc != 0) {
+               CERROR("Error %d reading connection request version from "
+                      "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+               return -EIO;
+       }
+
+       if (flip)
+               __swab32s(&cr.acr_version);
+
+       if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+               /* future version compatibility!
+                * An acceptor-specific protocol rev will first send a version
+                * query.  I send back my current version to tell her I'm
+                * "old". */
+               int peer_version = cr.acr_version;
+
+               memset (&cr, 0, sizeof(cr));
+               cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+               cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+               rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                      accept_timeout);
+
+               if (rc != 0)
+                       CERROR("Error sending magic+version in response "
+                              "to version %d from %u.%u.%u.%u: %d\n",
+                              peer_version, HIPQUAD(peer_ip), rc);
+               return -EPROTO;
+       }
+
+       rc = libcfs_sock_read(sock, &cr.acr_nid,
+                             sizeof(cr) -
+                             offsetof(lnet_acceptor_connreq_t, acr_nid),
+                             accept_timeout);
+       if (rc != 0) {
+               CERROR("Error %d reading connection request from "
+                      "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+               return -EIO;
+       }
+
+       if (flip)
+               __swab64s(&cr.acr_nid);
+
+       ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+       if (ni == NULL ||              /* no matching net */
+           ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+               if (ni != NULL)
+                       lnet_ni_decref(ni);
+               LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u"
+                                  " for %s: No matching NI\n",
+                                  HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+               return -EPERM;
+       }
+
+       if (ni->ni_lnd->lnd_accept == NULL) {
+               /* This catches a request for the loopback LND */
+               lnet_ni_decref(ni);
+               LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u"
+                                 " for %s: NI doesn't accept IP connections\n",
+                                 HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+               return -EPERM;
+       }
+
+       CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n",
+              libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
+
+       rc = ni->ni_lnd->lnd_accept(ni, sock);
+
+       lnet_ni_decref(ni);
+       return rc;
+}
+
+int
+lnet_acceptor(void *arg)
+{
+       socket_t  *newsock;
+       int         rc;
+       __u32     magic;
+       __u32     peer_ip;
+       int         peer_port;
+       int         secure = (int)((long_ptr_t)arg);
+
+       LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+       cfs_block_allsigs();
+
+       rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+                               0, accept_port, accept_backlog);
+       if (rc != 0) {
+               if (rc == -EADDRINUSE)
+                       LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+                                          " %d: port already in use\n",
+                                          accept_port);
+               else
+                       LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+                                          "%d: unexpected error %d\n",
+                                          accept_port, rc);
+
+               lnet_acceptor_state.pta_sock = NULL;
+       } else {
+               LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+       }
+
+       /* set init status and unblock parent */
+       lnet_acceptor_state.pta_shutdown = rc;
+       complete(&lnet_acceptor_state.pta_signal);
+
+       if (rc != 0)
+               return rc;
+
+       while (!lnet_acceptor_state.pta_shutdown) {
+
+               rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+               if (rc != 0) {
+                       if (rc != -EAGAIN) {
+                               CWARN("Accept error %d: pausing...\n", rc);
+                               cfs_pause(cfs_time_seconds(1));
+                       }
+                       continue;
+               }
+
+               /* maybe we were woken up by libcfs_sock_abort_accept() */
+               if (lnet_acceptor_state.pta_shutdown) {
+                       libcfs_sock_release(newsock);
+                       break;
+               }
+
+               rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+               if (rc != 0) {
+                       CERROR("Can't determine new connection's address\n");
+                       goto failed;
+               }
+
+               if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+                       CERROR("Refusing connection from %u.%u.%u.%u: "
+                              "insecure port %d\n",
+                              HIPQUAD(peer_ip), peer_port);
+                       goto failed;
+               }
+
+               rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+                                     accept_timeout);
+               if (rc != 0) {
+                       CERROR("Error %d reading connection request from "
+                              "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+                       goto failed;
+               }
+
+               rc = lnet_accept(newsock, magic);
+               if (rc != 0)
+                       goto failed;
+
+               continue;
+
+       failed:
+               libcfs_sock_release(newsock);
+       }
+
+       libcfs_sock_release(lnet_acceptor_state.pta_sock);
+       lnet_acceptor_state.pta_sock = NULL;
+
+       CDEBUG(D_NET, "Acceptor stopping\n");
+
+       /* unblock lnet_acceptor_stop() */
+       complete(&lnet_acceptor_state.pta_signal);
+       return 0;
+}
+
+static inline int
+accept2secure(const char *acc, long *sec)
+{
+       if (!strcmp(acc, "secure")) {
+               *sec = 1;
+               return 1;
+       } else if (!strcmp(acc, "all")) {
+               *sec = 0;
+               return 1;
+       } else if (!strcmp(acc, "none")) {
+               return 0;
+       } else {
+               LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n",
+                                  acc);
+               return -EINVAL;
+       }
+}
+
+int
+lnet_acceptor_start(void)
+{
+       int  rc;
+       long rc2;
+       long secure;
+
+       LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+       rc = lnet_acceptor_get_tunables();
+       if (rc != 0)
+               return rc;
+
+
+       init_completion(&lnet_acceptor_state.pta_signal);
+       rc = accept2secure(accept_type, &secure);
+       if (rc <= 0) {
+               fini_completion(&lnet_acceptor_state.pta_signal);
+               return rc;
+       }
+
+       if (lnet_count_acceptor_nis() == 0)  /* not required */
+               return 0;
+
+       rc2 = PTR_ERR(kthread_run(lnet_acceptor,
+                                 (void *)(ulong_ptr_t)secure,
+                                 "acceptor_%03ld", secure));
+       if (IS_ERR_VALUE(rc2)) {
+               CERROR("Can't start acceptor thread: %ld\n", rc2);
+               fini_completion(&lnet_acceptor_state.pta_signal);
+
+               return -ESRCH;
+       }
+
+       /* wait for acceptor to startup */
+       wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+       if (!lnet_acceptor_state.pta_shutdown) {
+               /* started OK */
+               LASSERT(lnet_acceptor_state.pta_sock != NULL);
+               return 0;
+       }
+
+       LASSERT(lnet_acceptor_state.pta_sock == NULL);
+       fini_completion(&lnet_acceptor_state.pta_signal);
+
+       return -ENETDOWN;
+}
+
+void
+lnet_acceptor_stop(void)
+{
+       if (lnet_acceptor_state.pta_sock == NULL) /* not running */
+               return;
+
+       lnet_acceptor_state.pta_shutdown = 1;
+       libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+       /* block until acceptor signals exit */
+       wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+       fini_completion(&lnet_acceptor_state.pta_signal);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c
new file mode 100644 (file)
index 0000000..695b272
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/api-errno.c
+ *
+ * Instantiate the string table of errors
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
new file mode 100644 (file)
index 0000000..e88bee3
--- /dev/null
@@ -0,0 +1,1941 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+#include <linux/log2.h>
+
+#define D_LNI D_CONSOLE
+
+lnet_t      the_lnet;                     /* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+
+static char *ip2nets = "";
+CFS_MODULE_PARM(ip2nets, "s", charp, 0444,
+               "LNET network <- IP table");
+
+static char *networks = "";
+CFS_MODULE_PARM(networks, "s", charp, 0444,
+               "local networks");
+
+static char *routes = "";
+CFS_MODULE_PARM(routes, "s", charp, 0444,
+               "routes to non-local networks");
+
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444,
+               "size of remote network hash table");
+
+char *
+lnet_get_routes(void)
+{
+       return routes;
+}
+
+char *
+lnet_get_networks(void)
+{
+       char   *nets;
+       int     rc;
+
+       if (*networks != 0 && *ip2nets != 0) {
+               LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or "
+                                  "'ip2nets' but not both at once\n");
+               return NULL;
+       }
+
+       if (*ip2nets != 0) {
+               rc = lnet_parse_ip2nets(&nets, ip2nets);
+               return (rc == 0) ? nets : NULL;
+       }
+
+       if (*networks != 0)
+               return networks;
+
+       return "tcp";
+}
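+
+/*
+ * Illustrative sketch of how the module parameters above combine; the option
+ * syntax shown is an assumption for illustration and is not defined in this
+ * file.  Either "networks" or "ip2nets" may be given, never both, and with
+ * neither set lnet_get_networks() falls back to "tcp":
+ *
+ *   modprobe lnet networks="tcp0(eth0)" rnet_htable_size=128
+ * or
+ *   modprobe lnet ip2nets="tcp0 10.10.*.*"
+ */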
+
+void
+lnet_init_locks(void)
+{
+       spin_lock_init(&the_lnet.ln_eq_wait_lock);
+       init_waitqueue_head(&the_lnet.ln_eq_waitq);
+       mutex_init(&the_lnet.ln_lnd_mutex);
+       mutex_init(&the_lnet.ln_api_mutex);
+}
+
+void
+lnet_fini_locks(void)
+{
+}
+
+
+static int
+lnet_create_remote_nets_table(void)
+{
+       int             i;
+       struct list_head        *hash;
+
+       LASSERT(the_lnet.ln_remote_nets_hash == NULL);
+       LASSERT(the_lnet.ln_remote_nets_hbits > 0);
+       LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+       if (hash == NULL) {
+               CERROR("Failed to create remote nets hash table\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+               INIT_LIST_HEAD(&hash[i]);
+       the_lnet.ln_remote_nets_hash = hash;
+       return 0;
+}
+
+static void
+lnet_destroy_remote_nets_table(void)
+{
+       int             i;
+       struct list_head        *hash;
+
+       if (the_lnet.ln_remote_nets_hash == NULL)
+               return;
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+               LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i]));
+
+       LIBCFS_FREE(the_lnet.ln_remote_nets_hash,
+                   LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+       the_lnet.ln_remote_nets_hash = NULL;
+}
+
+static void
+lnet_destroy_locks(void)
+{
+       if (the_lnet.ln_res_lock != NULL) {
+               cfs_percpt_lock_free(the_lnet.ln_res_lock);
+               the_lnet.ln_res_lock = NULL;
+       }
+
+       if (the_lnet.ln_net_lock != NULL) {
+               cfs_percpt_lock_free(the_lnet.ln_net_lock);
+               the_lnet.ln_net_lock = NULL;
+       }
+
+       lnet_fini_locks();
+}
+
+static int
+lnet_create_locks(void)
+{
+       lnet_init_locks();
+
+       the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+       if (the_lnet.ln_res_lock == NULL)
+               goto failed;
+
+       the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+       if (the_lnet.ln_net_lock == NULL)
+               goto failed;
+
+       return 0;
+
+ failed:
+       lnet_destroy_locks();
+       return -ENOMEM;
+}
+
+void lnet_assert_wire_constants (void)
+{
+       /* Wire protocol assertions generated by 'wirecheck'
+        * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+        * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+        * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+
+       /* Constants... */
+       CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+       CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1);
+       CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0);
+       CLASSERT (LNET_MSG_ACK == 0);
+       CLASSERT (LNET_MSG_PUT == 1);
+       CLASSERT (LNET_MSG_GET == 2);
+       CLASSERT (LNET_MSG_REPLY == 3);
+       CLASSERT (LNET_MSG_HELLO == 4);
+
+       /* Checks for struct lnet_handle_wire_t */
+       CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16);
+       CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+       CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+       CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+       CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+       /* Checks for struct lnet_magicversion_t */
+       CLASSERT ((int)sizeof(lnet_magicversion_t) == 8);
+       CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0);
+       CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+       CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4);
+       CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+       CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+       CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+       /* Checks for struct lnet_hdr_t */
+       CLASSERT ((int)sizeof(lnet_hdr_t) == 72);
+       CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
+
+       /* Ack */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+       /* Put */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+       /* Get */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+       /* Reply */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+       /* Hello */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
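+
+/*
+ * For reference, the on-wire layout those assertions pin down (byte offsets
+ * and sizes read straight off the CLASSERTs above; the header totals
+ * 72 bytes):
+ *
+ *   dest_nid        @  0, size 8
+ *   src_nid         @  8, size 8
+ *   dest_pid        @ 16, size 4
+ *   src_pid         @ 20, size 4
+ *   type            @ 24, size 4
+ *   payload_length  @ 28, size 4
+ *   msg union       @ 32, size 40  (ack/put/get/reply/hello variants)
+ */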
+
+lnd_t *
+lnet_find_lnd_by_type (int type)
+{
+       lnd_t         *lnd;
+       struct list_head         *tmp;
+
+       /* holding lnd mutex */
+       list_for_each (tmp, &the_lnet.ln_lnds) {
+               lnd = list_entry(tmp, lnd_t, lnd_list);
+
+               if ((int)lnd->lnd_type == type)
+                       return lnd;
+       }
+
+       return NULL;
+}
+
+void
+lnet_register_lnd (lnd_t *lnd)
+{
+       LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (libcfs_isknown_lnd(lnd->lnd_type));
+       LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+       list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds);
+       lnd->lnd_refcount = 0;
+
+       CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+void
+lnet_unregister_lnd (lnd_t *lnd)
+{
+       LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+       LASSERT (lnd->lnd_refcount == 0);
+
+       list_del (&lnd->lnd_list);
+       CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+void
+lnet_counters_get(lnet_counters_t *counters)
+{
+       lnet_counters_t *ctr;
+       int             i;
+
+       memset(counters, 0, sizeof(*counters));
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+               counters->msgs_max     += ctr->msgs_max;
+               counters->msgs_alloc   += ctr->msgs_alloc;
+               counters->errors       += ctr->errors;
+               counters->send_count   += ctr->send_count;
+               counters->recv_count   += ctr->recv_count;
+               counters->route_count  += ctr->route_count;
+               counters->drop_count   += ctr->drop_count;
+               counters->send_length  += ctr->send_length;
+               counters->recv_length  += ctr->recv_length;
+               counters->route_length += ctr->route_length;
+               counters->drop_length  += ctr->drop_length;
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get);
+
+void
+lnet_counters_reset(void)
+{
+       lnet_counters_t *counters;
+       int             i;
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       cfs_percpt_for_each(counters, i, the_lnet.ln_counters)
+               memset(counters, 0, sizeof(lnet_counters_t));
+
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_reset);
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int
+lnet_freelist_init (lnet_freelist_t *fl, int n, int size)
+{
+       char *space;
+
+       LASSERT (n > 0);
+
+       size += offsetof (lnet_freeobj_t, fo_contents);
+
+       LIBCFS_ALLOC(space, n * size);
+       if (space == NULL)
+               return (-ENOMEM);
+
+       INIT_LIST_HEAD (&fl->fl_list);
+       fl->fl_objs = space;
+       fl->fl_nobjs = n;
+       fl->fl_objsize = size;
+
+       do
+       {
+               memset (space, 0, size);
+               list_add ((struct list_head *)space, &fl->fl_list);
+               space += size;
+       } while (--n != 0);
+
+       return (0);
+}
+
+void
+lnet_freelist_fini (lnet_freelist_t *fl)
+{
+       struct list_head       *el;
+       int            count;
+
+       if (fl->fl_nobjs == 0)
+               return;
+
+       count = 0;
+       for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+               count++;
+
+       LASSERT (count == fl->fl_nobjs);
+
+       LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+       memset (fl, 0, sizeof (*fl));
+}
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+__u64
+lnet_create_interface_cookie (void)
+{
+       /* NB the interface cookie in wire handles guards against delayed
+        * replies and ACKs appearing valid after reboot. Initialisation time,
+        * even if it's only implemented to millisecond resolution, is easily
+        * good enough. */
+       struct timeval tv;
+       __u64     cookie;
+       do_gettimeofday(&tv);
+       cookie = tv.tv_sec;
+       cookie *= 1000000;
+       cookie += tv.tv_usec;
+       return cookie;
+}
+
+static char *
+lnet_res_type2str(int type)
+{
+       switch (type) {
+       default:
+               LBUG();
+       case LNET_COOKIE_TYPE_MD:
+               return "MD";
+       case LNET_COOKIE_TYPE_ME:
+               return "ME";
+       case LNET_COOKIE_TYPE_EQ:
+               return "EQ";
+       }
+}
+
+void
+lnet_res_container_cleanup(struct lnet_res_container *rec)
+{
+       int     count = 0;
+
+       if (rec->rec_type == 0) /* not set yet, it's uninitialized */
+               return;
+
+       while (!list_empty(&rec->rec_active)) {
+               struct list_head *e = rec->rec_active.next;
+
+               list_del_init(e);
+               if (rec->rec_type == LNET_COOKIE_TYPE_EQ) {
+                       lnet_eq_free(list_entry(e, lnet_eq_t, eq_list));
+
+               } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) {
+                       lnet_md_free(list_entry(e, lnet_libmd_t, md_list));
+
+               } else { /* NB: Active MEs should be attached on portals */
+                       LBUG();
+               }
+               count++;
+       }
+
+       if (count > 0) {
+               /* Found live MD/ME/EQ objects; users really should unlink/free
+                * all of them before finalizing LNet, but if someone didn't,
+                * we have to recycle the garbage for them */
+               CERROR("%d active elements on exit of %s container\n",
+                      count, lnet_res_type2str(rec->rec_type));
+       }
+
+#ifdef LNET_USE_LIB_FREELIST
+       lnet_freelist_fini(&rec->rec_freelist);
+#endif
+       if (rec->rec_lh_hash != NULL) {
+               LIBCFS_FREE(rec->rec_lh_hash,
+                           LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+               rec->rec_lh_hash = NULL;
+       }
+
+       rec->rec_type = 0; /* mark it as finalized */
+}
+
+int
+lnet_res_container_setup(struct lnet_res_container *rec,
+                        int cpt, int type, int objnum, int objsz)
+{
+       int     rc = 0;
+       int     i;
+
+       LASSERT(rec->rec_type == 0);
+
+       rec->rec_type = type;
+       INIT_LIST_HEAD(&rec->rec_active);
+
+#ifdef LNET_USE_LIB_FREELIST
+       memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist));
+       rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz);
+       if (rc != 0)
+               goto out;
+#endif
+       rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type;
+
+       /* Arbitrary choice of hash table size */
+       LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt,
+                        LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+       if (rec->rec_lh_hash == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < LNET_LH_HASH_SIZE; i++)
+               INIT_LIST_HEAD(&rec->rec_lh_hash[i]);
+
+       return 0;
+
+out:
+       CERROR("Failed to setup %s resource container\n",
+              lnet_res_type2str(type));
+       lnet_res_container_cleanup(rec);
+       return rc;
+}
+
+static void
+lnet_res_containers_destroy(struct lnet_res_container **recs)
+{
+       struct lnet_res_container       *rec;
+       int                             i;
+
+       cfs_percpt_for_each(rec, i, recs)
+               lnet_res_container_cleanup(rec);
+
+       cfs_percpt_free(recs);
+}
+
+static struct lnet_res_container **
+lnet_res_containers_create(int type, int objnum, int objsz)
+{
+       struct lnet_res_container       **recs;
+       struct lnet_res_container       *rec;
+       int                             rc;
+       int                             i;
+
+       recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec));
+       if (recs == NULL) {
+               CERROR("Failed to allocate %s resource containers\n",
+                      lnet_res_type2str(type));
+               return NULL;
+       }
+
+       cfs_percpt_for_each(rec, i, recs) {
+               rc = lnet_res_container_setup(rec, i, type, objnum, objsz);
+               if (rc != 0) {
+                       lnet_res_containers_destroy(recs);
+                       return NULL;
+               }
+       }
+
+       return recs;
+}
+
+lnet_libhandle_t *
+lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie)
+{
+       /* ALWAYS called with lnet_res_lock held */
+       struct list_head                *head;
+       lnet_libhandle_t        *lh;
+       unsigned int            hash;
+
+       if ((cookie & LNET_COOKIE_MASK) != rec->rec_type)
+               return NULL;
+
+       hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
+       head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK];
+
+       list_for_each_entry(lh, head, lh_hash_chain) {
+               if (lh->lh_cookie == cookie)
+                       return lh;
+       }
+
+       return NULL;
+}
+
+void
+lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh)
+{
+       /* ALWAYS called with lnet_res_lock held */
+       unsigned int    ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS;
+       unsigned int    hash;
+
+       lh->lh_cookie = rec->rec_lh_cookie;
+       rec->rec_lh_cookie += 1 << ibits;
+
+       hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK;
+
+       list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]);
+}
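+
+/*
+ * Sketch of the cookie layout implied by the two helpers above (the exact
+ * widths are LNET_COOKIE_TYPE_BITS and LNET_CPT_BITS, not spelled out here):
+ *
+ *   | per-container counter ...... | CPT index | type (EQ/MD/ME) |
+ *     high bits                                   low bits
+ *
+ * lnet_res_lh_lookup() rejects cookies whose type field doesn't match
+ * rec_type and hashes only the counter bits, so a cookie minted by one
+ * container cannot accidentally match a handle created by another.
+ */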
+
+
+int lnet_unprepare(void);
+
+int
+lnet_prepare(lnet_pid_t requested_pid)
+{
+       /* Prepare to bring up the network */
+       struct lnet_res_container **recs;
+       int                       rc = 0;
+
+       LASSERT (the_lnet.ln_refcount == 0);
+
+       the_lnet.ln_routing = 0;
+
+       LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0);
+       the_lnet.ln_pid = requested_pid;
+
+       INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+       INIT_LIST_HEAD(&the_lnet.ln_nis);
+       INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
+       INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+       INIT_LIST_HEAD(&the_lnet.ln_routers);
+
+       rc = lnet_create_remote_nets_table();
+       if (rc != 0)
+               goto failed;
+
+       the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+       the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(),
+                                               sizeof(lnet_counters_t));
+       if (the_lnet.ln_counters == NULL) {
+               CERROR("Failed to allocate counters for LNet\n");
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       rc = lnet_peer_tables_create();
+       if (rc != 0)
+               goto failed;
+
+       rc = lnet_msg_containers_create();
+       if (rc != 0)
+               goto failed;
+
+       rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0,
+                                     LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS,
+                                     sizeof(lnet_eq_t));
+       if (rc != 0)
+               goto failed;
+
+       recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES,
+                                         sizeof(lnet_me_t));
+       if (recs == NULL)
+               goto failed;
+
+       the_lnet.ln_me_containers = recs;
+
+       recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS,
+                                         sizeof(lnet_libmd_t));
+       if (recs == NULL)
+               goto failed;
+
+       the_lnet.ln_md_containers = recs;
+
+       rc = lnet_portals_create();
+       if (rc != 0) {
+               CERROR("Failed to create portals for LNet: %d\n", rc);
+               goto failed;
+       }
+
+       return 0;
+
+ failed:
+       lnet_unprepare();
+       return rc;
+}
+
+int
+lnet_unprepare (void)
+{
+       /* NB no LNET_LOCK since this is the last reference.  All LND instances
+        * have shut down already, so it is safe to unlink and free all
+        * descriptors, even those that appear committed to a network op (eg MD
+        * with non-zero pending count) */
+
+       lnet_fail_nid(LNET_NID_ANY, 0);
+
+       LASSERT(the_lnet.ln_refcount == 0);
+       LASSERT(list_empty(&the_lnet.ln_test_peers));
+       LASSERT(list_empty(&the_lnet.ln_nis));
+       LASSERT(list_empty(&the_lnet.ln_nis_cpt));
+       LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+       lnet_portals_destroy();
+
+       if (the_lnet.ln_md_containers != NULL) {
+               lnet_res_containers_destroy(the_lnet.ln_md_containers);
+               the_lnet.ln_md_containers = NULL;
+       }
+
+       if (the_lnet.ln_me_containers != NULL) {
+               lnet_res_containers_destroy(the_lnet.ln_me_containers);
+               the_lnet.ln_me_containers = NULL;
+       }
+
+       lnet_res_container_cleanup(&the_lnet.ln_eq_container);
+
+       lnet_msg_containers_destroy();
+       lnet_peer_tables_destroy();
+       lnet_rtrpools_free();
+
+       if (the_lnet.ln_counters != NULL) {
+               cfs_percpt_free(the_lnet.ln_counters);
+               the_lnet.ln_counters = NULL;
+       }
+       lnet_destroy_remote_nets_table();
+
+       return 0;
+}
+
+lnet_ni_t  *
+lnet_net2ni_locked(__u32 net, int cpt)
+{
+       struct list_head        *tmp;
+       lnet_ni_t       *ni;
+
+       LASSERT(cpt != LNET_LOCK_EX);
+
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               if (LNET_NIDNET(ni->ni_nid) == net) {
+                       lnet_ni_addref_locked(ni, cpt);
+                       return ni;
+               }
+       }
+
+       return NULL;
+}
+
+lnet_ni_t *
+lnet_net2ni(__u32 net)
+{
+       lnet_ni_t *ni;
+
+       lnet_net_lock(0);
+       ni = lnet_net2ni_locked(net, 0);
+       lnet_net_unlock(0);
+
+       return ni;
+}
+EXPORT_SYMBOL(lnet_net2ni);
+
+static unsigned int
+lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
+{
+       __u64           key = nid;
+       unsigned int    val;
+
+       LASSERT(number >= 1 && number <= LNET_CPT_NUMBER);
+
+       if (number == 1)
+               return 0;
+
+       val = cfs_hash_long(key, LNET_CPT_BITS);
+       /* NB: LNET_CPT_NUMBER doesn't have to be a power of 2 */
+       if (val < number)
+               return val;
+
+       return (unsigned int)(key + val + (val >> 1)) % number;
+}
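+
+/*
+ * Worked example with assumed numbers, for illustration only: if
+ * LNET_CPT_NUMBER == 3 (and LNET_CPT_BITS is therefore 2), cfs_hash_long()
+ * returns a value in 0..3.  Values 0..2 are used directly; val == 3 falls
+ * through to (key + val + (val >> 1)) % 3, so the result always stays in
+ * range even though 3 is not a power of two.
+ */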
+
+int
+lnet_cpt_of_nid_locked(lnet_nid_t nid)
+{
+       struct lnet_ni *ni;
+
+       /* must be called while holding lnet_net_lock */
+       if (LNET_CPT_NUMBER == 1)
+               return 0; /* the only one */
+
+       /* taking lnet_net_lock (on any CPT) would be OK */
+       if (!list_empty(&the_lnet.ln_nis_cpt)) {
+               list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
+                       if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid))
+                               continue;
+
+                       LASSERT(ni->ni_cpts != NULL);
+                       return ni->ni_cpts[lnet_nid_cpt_hash
+                                          (nid, ni->ni_ncpts)];
+               }
+       }
+
+       return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+
+int
+lnet_cpt_of_nid(lnet_nid_t nid)
+{
+       int     cpt;
+       int     cpt2;
+
+       if (LNET_CPT_NUMBER == 1)
+               return 0; /* the only one */
+
+       if (list_empty(&the_lnet.ln_nis_cpt))
+               return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+
+       cpt = lnet_net_lock_current();
+       cpt2 = lnet_cpt_of_nid_locked(nid);
+       lnet_net_unlock(cpt);
+
+       return cpt2;
+}
+EXPORT_SYMBOL(lnet_cpt_of_nid);
+
+int
+lnet_islocalnet(__u32 net)
+{
+       struct lnet_ni  *ni;
+       int             cpt;
+
+       cpt = lnet_net_lock_current();
+
+       ni = lnet_net2ni_locked(net, cpt);
+       if (ni != NULL)
+               lnet_ni_decref_locked(ni, cpt);
+
+       lnet_net_unlock(cpt);
+
+       return ni != NULL;
+}
+
+lnet_ni_t  *
+lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
+{
+       struct lnet_ni  *ni;
+       struct list_head        *tmp;
+
+       LASSERT(cpt != LNET_LOCK_EX);
+
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               if (ni->ni_nid == nid) {
+                       lnet_ni_addref_locked(ni, cpt);
+                       return ni;
+               }
+       }
+
+       return NULL;
+}
+
+int
+lnet_islocalnid(lnet_nid_t nid)
+{
+       struct lnet_ni  *ni;
+       int             cpt;
+
+       cpt = lnet_net_lock_current();
+       ni = lnet_nid2ni_locked(nid, cpt);
+       if (ni != NULL)
+               lnet_ni_decref_locked(ni, cpt);
+       lnet_net_unlock(cpt);
+
+       return ni != NULL;
+}
+
+int
+lnet_count_acceptor_nis (void)
+{
+       /* Return the # of NIs that need the acceptor. */
+       int             count = 0;
+       struct list_head        *tmp;
+       struct lnet_ni  *ni;
+       int             cpt;
+
+       cpt = lnet_net_lock_current();
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               if (ni->ni_lnd->lnd_accept != NULL)
+                       count++;
+       }
+
+       lnet_net_unlock(cpt);
+
+       return count;
+}
+
+static int
+lnet_ni_tq_credits(lnet_ni_t *ni)
+{
+       int     credits;
+
+       LASSERT(ni->ni_ncpts >= 1);
+
+       if (ni->ni_ncpts == 1)
+               return ni->ni_maxtxcredits;
+
+       credits = ni->ni_maxtxcredits / ni->ni_ncpts;
+       credits = max(credits, 8 * ni->ni_peertxcredits);
+       credits = min(credits, ni->ni_maxtxcredits);
+
+       return credits;
+}
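+
+/*
+ * Worked example with assumed tunables, for illustration only: with
+ * ni_maxtxcredits = 256, ni_ncpts = 4 and ni_peertxcredits = 8, each TX
+ * queue gets max(256 / 4, 8 * 8) = 64 credits.  With a small NI, say
+ * ni_maxtxcredits = 64, ni_ncpts = 8 and ni_peertxcredits = 16, the raw
+ * share of 8 is first pulled up to 8 * 16 = 128 and then capped back to 64,
+ * i.e. one queue is never granted more than the whole NI allows.
+ */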
+
+void
+lnet_shutdown_lndnis (void)
+{
+       int             i;
+       int             islo;
+       lnet_ni_t        *ni;
+
+       /* NB called holding the global mutex */
+
+       /* All quiet on the API front */
+       LASSERT(!the_lnet.ln_shutdown);
+       LASSERT(the_lnet.ln_refcount == 0);
+       LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+       lnet_net_lock(LNET_LOCK_EX);
+       the_lnet.ln_shutdown = 1;       /* flag shutdown */
+
+       /* Unlink NIs from the global table */
+       while (!list_empty(&the_lnet.ln_nis)) {
+               ni = list_entry(the_lnet.ln_nis.next,
+                                   lnet_ni_t, ni_list);
+               /* move it to the zombie list so nobody can find it anymore */
+               list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
+               lnet_ni_decref_locked(ni, 0);   /* drop ln_nis' ref */
+
+               if (!list_empty(&ni->ni_cptlist)) {
+                       list_del_init(&ni->ni_cptlist);
+                       lnet_ni_decref_locked(ni, 0);
+               }
+       }
+
+       /* Drop the cached eqwait NI. */
+       if (the_lnet.ln_eq_waitni != NULL) {
+               lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0);
+               the_lnet.ln_eq_waitni = NULL;
+       }
+
+       /* Drop the cached loopback NI. */
+       if (the_lnet.ln_loni != NULL) {
+               lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+               the_lnet.ln_loni = NULL;
+       }
+
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       /* Clear lazy portals and drop delayed messages which hold refs
+        * on their lnet_msg_t::msg_rxpeer */
+       for (i = 0; i < the_lnet.ln_nportals; i++)
+               LNetClearLazyPortal(i);
+
+       /* Clear the peer table and wait for all peers to go (they hold refs on
+        * their NIs) */
+       lnet_peer_tables_cleanup();
+
+       lnet_net_lock(LNET_LOCK_EX);
+       /* Now wait for the NIs I just nuked to show up on ln_nis_zombie
+        * and shut them down in guaranteed thread context */
+       i = 2;
+       while (!list_empty(&the_lnet.ln_nis_zombie)) {
+               int     *ref;
+               int     j;
+
+               ni = list_entry(the_lnet.ln_nis_zombie.next,
+                                   lnet_ni_t, ni_list);
+               list_del_init(&ni->ni_list);
+               cfs_percpt_for_each(ref, j, ni->ni_refs) {
+                       if (*ref == 0)
+                               continue;
+                       /* still busy, add it back to zombie list */
+                       list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+                       break;
+               }
+
+               while (!list_empty(&ni->ni_list)) {
+                       lnet_net_unlock(LNET_LOCK_EX);
+                       ++i;
+                       if ((i & (-i)) == i) {
+                               CDEBUG(D_WARNING,
+                                      "Waiting for zombie LNI %s\n",
+                                      libcfs_nid2str(ni->ni_nid));
+                       }
+                       cfs_pause(cfs_time_seconds(1));
+                       lnet_net_lock(LNET_LOCK_EX);
+                       continue;
+               }
+
+               ni->ni_lnd->lnd_refcount--;
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               islo = ni->ni_lnd->lnd_type == LOLND;
+
+               LASSERT (!in_interrupt ());
+               (ni->ni_lnd->lnd_shutdown)(ni);
+
+               /* can't deref lnd anymore now; it might have unregistered
+                * itself...  */
+
+               if (!islo)
+                       CDEBUG(D_LNI, "Removed LNI %s\n",
+                              libcfs_nid2str(ni->ni_nid));
+
+               lnet_ni_free(ni);
+               lnet_net_lock(LNET_LOCK_EX);
+       }
+
+       the_lnet.ln_shutdown = 0;
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       if (the_lnet.ln_network_tokens != NULL) {
+               LIBCFS_FREE(the_lnet.ln_network_tokens,
+                           the_lnet.ln_network_tokens_nob);
+               the_lnet.ln_network_tokens = NULL;
+       }
+}
+
+int
+lnet_startup_lndnis (void)
+{
+       lnd_t                   *lnd;
+       struct lnet_ni          *ni;
+       struct lnet_tx_queue    *tq;
+       struct list_head                nilist;
+       int                     i;
+       int             rc = 0;
+       int             lnd_type;
+       int             nicount = 0;
+       char          *nets = lnet_get_networks();
+
+       INIT_LIST_HEAD(&nilist);
+
+       if (nets == NULL)
+               goto failed;
+
+       rc = lnet_parse_networks(&nilist, nets);
+       if (rc != 0)
+               goto failed;
+
+       while (!list_empty(&nilist)) {
+               ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+               lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+               LASSERT (libcfs_isknown_lnd(lnd_type));
+
+               if (lnd_type == CIBLND    ||
+                   lnd_type == OPENIBLND ||
+                   lnd_type == IIBLND    ||
+                   lnd_type == VIBLND) {
+                       CERROR("LND %s obsoleted\n",
+                              libcfs_lnd2str(lnd_type));
+                       goto failed;
+               }
+
+               LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+               lnd = lnet_find_lnd_by_type(lnd_type);
+
+               if (lnd == NULL) {
+                       LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+                       rc = request_module("%s",
+                                               libcfs_lnd2modname(lnd_type));
+                       LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+                       lnd = lnet_find_lnd_by_type(lnd_type);
+                       if (lnd == NULL) {
+                               LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+                               CERROR("Can't load LND %s, module %s, rc=%d\n",
+                                      libcfs_lnd2str(lnd_type),
+                                      libcfs_lnd2modname(lnd_type), rc);
+                               goto failed;
+                       }
+               }
+
+               lnet_net_lock(LNET_LOCK_EX);
+               lnd->lnd_refcount++;
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               ni->ni_lnd = lnd;
+
+               rc = (lnd->lnd_startup)(ni);
+
+               LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+
+               if (rc != 0) {
+                       LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s"
+                                          "\n",
+                                          rc, libcfs_lnd2str(lnd->lnd_type));
+                       lnet_net_lock(LNET_LOCK_EX);
+                       lnd->lnd_refcount--;
+                       lnet_net_unlock(LNET_LOCK_EX);
+                       goto failed;
+               }
+
+               LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+
+               list_del(&ni->ni_list);
+
+               lnet_net_lock(LNET_LOCK_EX);
+               /* refcount for ln_nis */
+               lnet_ni_addref_locked(ni, 0);
+               list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+               if (ni->ni_cpts != NULL) {
+                       list_add_tail(&ni->ni_cptlist,
+                                         &the_lnet.ln_nis_cpt);
+                       lnet_ni_addref_locked(ni, 0);
+               }
+
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               if (lnd->lnd_type == LOLND) {
+                       lnet_ni_addref(ni);
+                       LASSERT (the_lnet.ln_loni == NULL);
+                       the_lnet.ln_loni = ni;
+                       continue;
+               }
+
+               if (ni->ni_peertxcredits == 0 ||
+                   ni->ni_maxtxcredits == 0) {
+                       LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
+                                          libcfs_lnd2str(lnd->lnd_type),
+                                          ni->ni_peertxcredits == 0 ?
+                                          "" : "per-peer ");
+                       goto failed;
+               }
+
+               cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+                       tq->tq_credits_min =
+                       tq->tq_credits_max =
+                       tq->tq_credits = lnet_ni_tq_credits(ni);
+               }
+
+               CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
+                      libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+                      lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
+                      ni->ni_peerrtrcredits, ni->ni_peertimeout);
+
+               nicount++;
+       }
+
+       if (the_lnet.ln_eq_waitni != NULL && nicount > 1) {
+               lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type;
+               LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network"
+                                  "\n",
+                                  libcfs_lnd2str(lnd_type));
+               goto failed;
+       }
+
+       return 0;
+
+ failed:
+       lnet_shutdown_lndnis();
+
+       while (!list_empty(&nilist)) {
+               ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+               list_del(&ni->ni_list);
+               lnet_ni_free(ni);
+       }
+
+       return -ENETDOWN;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Only userspace programs need to call this function - it's automatically
+ * called in the kernel at module loading time. The caller must call
+ * LNetFini() after a call to LNetInit(), if and only if the latter
+ * returned 0. It must be called exactly once.
+ *
+ * \return 0 on success, and -ve on failures.
+ */
+int
+LNetInit(void)
+{
+       int     rc;
+
+       lnet_assert_wire_constants();
+       LASSERT(!the_lnet.ln_init);
+
+       memset(&the_lnet, 0, sizeof(the_lnet));
+
+       /* refer to global cfs_cpt_table for now */
+       the_lnet.ln_cpt_table   = cfs_cpt_table;
+       the_lnet.ln_cpt_number  = cfs_cpt_number(cfs_cpt_table);
+
+       LASSERT(the_lnet.ln_cpt_number > 0);
+       if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+               /* we are at risk of consuming all lh_cookie bits */
+               CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
+                      "please change setting of CPT-table and retry\n",
+                      the_lnet.ln_cpt_number, LNET_CPT_MAX);
+               return -1;
+       }
+
+       while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+               the_lnet.ln_cpt_bits++;
+
+       rc = lnet_create_locks();
+       if (rc != 0) {
+               CERROR("Can't create LNet global locks: %d\n", rc);
+               return -1;
+       }
+
+       the_lnet.ln_refcount = 0;
+       the_lnet.ln_init = 1;
+       LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+       INIT_LIST_HEAD(&the_lnet.ln_lnds);
+       INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
+       INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
+
+       /* The hash table size is the number of bits it takes to express the
+        * set ln_num_routes, minus 1 (better to underestimate than to
+        * overestimate so we don't waste memory). */
+       if (rnet_htable_size <= 0)
+               rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+       else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX)
+               rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX;
+       the_lnet.ln_remote_nets_hbits = max_t(int, 1,
+                                          order_base_2(rnet_htable_size) - 1);
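+       /*
+        * Worked example, purely illustrative: for rnet_htable_size = 128,
+        * order_base_2(128) == 7, so ln_remote_nets_hbits becomes 6 and the
+        * remote-nets table presumably ends up with 2^6 = 64 hash buckets.
+        */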
+
+       /* All LNDs apart from the LOLND are in separate modules.  They
+        * register themselves when their module loads, and unregister
+        * themselves when their module is unloaded. */
+       lnet_register_lnd(&the_lolnd);
+       return 0;
+}
+EXPORT_SYMBOL(LNetInit);
+
+/**
+ * Finalize LNet library.
+ *
+ * Only userspace programs need to call this function. It can be called
+ * at most once.
+ *
+ * \pre LNetInit() called with success.
+ * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls.
+ */
+void
+LNetFini(void)
+{
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount == 0);
+
+       while (!list_empty(&the_lnet.ln_lnds))
+               lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next,
+                                                  lnd_t, lnd_list));
+       lnet_destroy_locks();
+
+       the_lnet.ln_init = 0;
+}
+EXPORT_SYMBOL(LNetFini);
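+
+/*
+ * Minimal usage sketch for a userspace LNet consumer (hypothetical caller,
+ * shown only to illustrate the pairing rules documented above): LNetInit()
+ * is called exactly once, and LNetFini() only if it succeeded.
+ *
+ *     rc = LNetInit();
+ *     if (rc == 0) {
+ *             ... LNetNIInit()/LNetNIFini() pairs, LNetGetId(), etc. ...
+ *             LNetFini();
+ *     }
+ */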
+
+/**
+ * Set LNet PID and start LNet interfaces, routing, and forwarding.
+ *
+ * Userspace programs should call this after a successful call to LNetInit().
+ * Users must call this function at least once before any other functions.
+ * For each successful call there must be a corresponding call to
+ * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is
+ * ignored.
+ *
+ * The PID used by LNet may be different from the one requested.
+ * See LNetGetId().
+ *
+ * \param requested_pid PID requested by the caller.
+ *
+ * \return >= 0 on success, and < 0 error code on failures.
+ */
+int
+LNetNIInit(lnet_pid_t requested_pid)
+{
+       int      im_a_router = 0;
+       int      rc;
+
+       LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+       LASSERT (the_lnet.ln_init);
+       CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+       if (the_lnet.ln_refcount > 0) {
+               rc = the_lnet.ln_refcount++;
+               goto out;
+       }
+
+       lnet_get_tunables();
+
+       if (requested_pid == LNET_PID_ANY) {
+               /* Don't instantiate LNET just for me */
+               rc = -ENETDOWN;
+               goto failed0;
+       }
+
+       rc = lnet_prepare(requested_pid);
+       if (rc != 0)
+               goto failed0;
+
+       rc = lnet_startup_lndnis();
+       if (rc != 0)
+               goto failed1;
+
+       rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+       if (rc != 0)
+               goto failed2;
+
+       rc = lnet_check_routes();
+       if (rc != 0)
+               goto failed2;
+
+       rc = lnet_rtrpools_alloc(im_a_router);
+       if (rc != 0)
+               goto failed2;
+
+       rc = lnet_acceptor_start();
+       if (rc != 0)
+               goto failed2;
+
+       the_lnet.ln_refcount = 1;
+       /* Now I may use my own API functions... */
+
+       /* NB router checker needs the_lnet.ln_ping_info in
+        * lnet_router_checker -> lnet_update_ni_status_locked */
+       rc = lnet_ping_target_init();
+       if (rc != 0)
+               goto failed3;
+
+       rc = lnet_router_checker_start();
+       if (rc != 0)
+               goto failed4;
+
+       lnet_proc_init();
+       goto out;
+
+ failed4:
+       lnet_ping_target_fini();
+ failed3:
+       the_lnet.ln_refcount = 0;
+       lnet_acceptor_stop();
+ failed2:
+       lnet_destroy_routes();
+       lnet_shutdown_lndnis();
+ failed1:
+       lnet_unprepare();
+ failed0:
+       LASSERT (rc < 0);
+ out:
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+       return rc;
+}
+EXPORT_SYMBOL(LNetNIInit);
+
+/**
+ * Stop LNet interfaces, routing, and forwarding.
+ *
+ * Users must call this function once for each successful call to LNetNIInit().
+ * Once the LNetNIFini() operation has been started, the results of pending
+ * API operations are undefined.
+ *
+ * \return always 0 for current implementation.
+ */
+int
+LNetNIFini()
+{
+       LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (the_lnet.ln_refcount != 1) {
+               the_lnet.ln_refcount--;
+       } else {
+               LASSERT (!the_lnet.ln_niinit_self);
+
+               lnet_proc_fini();
+               lnet_router_checker_stop();
+               lnet_ping_target_fini();
+
+               /* Teardown fns that use my own API functions BEFORE here */
+               the_lnet.ln_refcount = 0;
+
+               lnet_acceptor_stop();
+               lnet_destroy_routes();
+               lnet_shutdown_lndnis();
+               lnet_unprepare();
+       }
+
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+       return 0;
+}
+EXPORT_SYMBOL(LNetNIFini);
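+
+#if 0
+/*
+ * Minimal usage sketch (hypothetical caller, never built): every successful
+ * LNetNIInit() must be balanced by an LNetNIFini(), and the requested PID
+ * is only honoured by the first initialisation, as documented above.
+ */
+static int lnet_ni_usage_example(void)
+{
+       int rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+
+       if (rc < 0)
+               return rc;
+
+       /* ... use the interfaces, e.g. enumerate them with LNetGetId() ... */
+
+       return LNetNIFini();
+}
+#endif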
+
+/**
+ * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet
+ * internal ioctl handler.
+ *
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it.
+ *
+ * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer.
+ * The data will be printed to system console. Don't use it excessively.
+ * \param arg A pointer to lnet_process_id_t, process ID of the peer.
+ *
+ * \return Always return 0 when called by users directly (i.e., not via ioctl).
+ */
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+       struct libcfs_ioctl_data *data = arg;
+       lnet_process_id_t        id = {0};
+       lnet_ni_t               *ni;
+       int                    rc;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       switch (cmd) {
+       case IOC_LIBCFS_GET_NI:
+               rc = LNetGetId(data->ioc_count, &id);
+               data->ioc_nid = id.nid;
+               return rc;
+
+       case IOC_LIBCFS_FAIL_NID:
+               return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+
+       case IOC_LIBCFS_ADD_ROUTE:
+               rc = lnet_add_route(data->ioc_net, data->ioc_count,
+                                   data->ioc_nid);
+               return (rc != 0) ? rc : lnet_check_routes();
+
+       case IOC_LIBCFS_DEL_ROUTE:
+               return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+       case IOC_LIBCFS_GET_ROUTE:
+               return lnet_get_route(data->ioc_count,
+                                     &data->ioc_net, &data->ioc_count,
+                                     &data->ioc_nid, &data->ioc_flags);
+       case IOC_LIBCFS_NOTIFY_ROUTER:
+               return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+                                  cfs_time_current() -
+                                  cfs_time_seconds(cfs_time_current_sec() -
+                                                   (time_t)data->ioc_u64[0]));
+
+       case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+               /* This can be removed once lustre stops calling it */
+               return 0;
+
+       case IOC_LIBCFS_LNET_DIST:
+               rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+               if (rc < 0 && rc != -EHOSTUNREACH)
+                       return rc;
+
+               data->ioc_u32[0] = rc;
+               return 0;
+
+       case IOC_LIBCFS_TESTPROTOCOMPAT:
+               lnet_net_lock(LNET_LOCK_EX);
+               the_lnet.ln_testprotocompat = data->ioc_flags;
+               lnet_net_unlock(LNET_LOCK_EX);
+               return 0;
+
+       case IOC_LIBCFS_PING:
+               id.nid = data->ioc_nid;
+               id.pid = data->ioc_u32[0];
+               rc = lnet_ping(id, data->ioc_u32[1], /* timeout */
+                              (lnet_process_id_t *)data->ioc_pbuf1,
+                              data->ioc_plen1/sizeof(lnet_process_id_t));
+               if (rc < 0)
+                       return rc;
+               data->ioc_count = rc;
+               return 0;
+
+       case IOC_LIBCFS_DEBUG_PEER: {
+               /* CAVEAT EMPTOR: this one is designed for calling directly,
+                * not via an ioctl */
+               id = *((lnet_process_id_t *) arg);
+
+               lnet_debug_peer(id.nid);
+
+               ni = lnet_net2ni(LNET_NIDNET(id.nid));
+               if (ni == NULL) {
+                       CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id));
+               } else {
+                       if (ni->ni_lnd->lnd_ctl == NULL) {
+                               CDEBUG(D_WARNING, "No ctl for %s\n",
+                                      libcfs_id2str(id));
+                       } else {
+                               (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+                       }
+
+                       lnet_ni_decref(ni);
+               }
+               return 0;
+       }
+
+       default:
+               ni = lnet_net2ni(data->ioc_net);
+               if (ni == NULL)
+                       return -EINVAL;
+
+               if (ni->ni_lnd->lnd_ctl == NULL)
+                       rc = -EINVAL;
+               else
+                       rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+               lnet_ni_decref(ni);
+               return rc;
+       }
+       /* not reached */
+}
+EXPORT_SYMBOL(LNetCtl);
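+
+#if 0
+/*
+ * Illustrative sketch (hypothetical caller, never built):
+ * IOC_LIBCFS_DEBUG_PEER is the one command meant to be invoked directly
+ * rather than through an ioctl, passing a pointer to the peer's
+ * lnet_process_id_t as \a arg.
+ */
+static void lnet_debug_peer_example(lnet_nid_t nid, lnet_pid_t pid)
+{
+       lnet_process_id_t id = { .nid = nid, .pid = pid };
+
+       (void)LNetCtl(IOC_LIBCFS_DEBUG_PEER, &id);
+}
+#endif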
+
+/**
+ * Retrieve the lnet_process_id_t ID of the LNet interface at \a index. Note
+ * that all interfaces share the same PID, as requested by LNetNIInit().
+ *
+ * \param index Index of the interface to look up.
+ * \param id On successful return, this location will hold the
+ * lnet_process_id_t ID of the interface.
+ *
+ * \retval 0 If an interface exists at \a index.
+ * \retval -ENOENT If no interface has been found.
+ */
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id)
+{
+       struct lnet_ni  *ni;
+       struct list_head        *tmp;
+       int             cpt;
+       int             rc = -ENOENT;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       cpt = lnet_net_lock_current();
+
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               if (index-- != 0)
+                       continue;
+
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               id->nid = ni->ni_nid;
+               id->pid = the_lnet.ln_pid;
+               rc = 0;
+               break;
+       }
+
+       lnet_net_unlock(cpt);
+       return rc;
+}
+EXPORT_SYMBOL(LNetGetId);
+
+/**
+ * Print a string representation of handle \a h into buffer \a str of
+ * \a len bytes.
+ */
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
+{
+       snprintf(str, len, LPX64, h.cookie);
+}
+EXPORT_SYMBOL(LNetSnprintHandle);
+
+static int
+lnet_create_ping_info(void)
+{
+       int            i;
+       int            n;
+       int            rc;
+       unsigned int      infosz;
+       lnet_ni_t       *ni;
+       lnet_process_id_t id;
+       lnet_ping_info_t *pinfo;
+
+       for (n = 0; ; n++) {
+               rc = LNetGetId(n, &id);
+               if (rc == -ENOENT)
+                       break;
+
+               LASSERT (rc == 0);
+       }
+
+       infosz = offsetof(lnet_ping_info_t, pi_ni[n]);
+       LIBCFS_ALLOC(pinfo, infosz);
+       if (pinfo == NULL) {
+               CERROR("Can't allocate ping info[%d]\n", n);
+               return -ENOMEM;
+       }
+
+       pinfo->pi_nnis    = n;
+       pinfo->pi_pid     = the_lnet.ln_pid;
+       pinfo->pi_magic   = LNET_PROTO_PING_MAGIC;
+       pinfo->pi_features = LNET_PING_FEAT_NI_STATUS;
+
+       for (i = 0; i < n; i++) {
+               lnet_ni_status_t *ns = &pinfo->pi_ni[i];
+
+               rc = LNetGetId(i, &id);
+               LASSERT (rc == 0);
+
+               ns->ns_nid    = id.nid;
+               ns->ns_status = LNET_NI_STATUS_UP;
+
+               lnet_net_lock(0);
+
+               ni = lnet_nid2ni_locked(id.nid, 0);
+               LASSERT(ni != NULL);
+
+               lnet_ni_lock(ni);
+               LASSERT(ni->ni_status == NULL);
+               ni->ni_status = ns;
+               lnet_ni_unlock(ni);
+
+               lnet_ni_decref_locked(ni, 0);
+               lnet_net_unlock(0);
+       }
+
+       the_lnet.ln_ping_info = pinfo;
+       return 0;
+}
+
+static void
+lnet_destroy_ping_info(void)
+{
+       struct lnet_ni  *ni;
+
+       lnet_net_lock(0);
+
+       list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+               lnet_ni_lock(ni);
+               ni->ni_status = NULL;
+               lnet_ni_unlock(ni);
+       }
+
+       lnet_net_unlock(0);
+
+       LIBCFS_FREE(the_lnet.ln_ping_info,
+                   offsetof(lnet_ping_info_t,
+                            pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+       the_lnet.ln_ping_info = NULL;
+       return;
+}
+
+int
+lnet_ping_target_init(void)
+{
+       lnet_md_t        md = {0};
+       lnet_handle_me_t  meh;
+       lnet_process_id_t id;
+       int            rc;
+       int            rc2;
+       int            infosz;
+
+       rc = lnet_create_ping_info();
+       if (rc != 0)
+               return rc;
+
+       /* We can have a tiny EQ since we only need to see the unlink event on
+        * teardown, which by definition is the last one! */
+       rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+       if (rc != 0) {
+               CERROR("Can't allocate ping EQ: %d\n", rc);
+               goto failed_0;
+       }
+
+       memset(&id, 0, sizeof(lnet_process_id_t));
+       id.nid = LNET_NID_ANY;
+       id.pid = LNET_PID_ANY;
+
+       rc = LNetMEAttach(LNET_RESERVED_PORTAL, id,
+                         LNET_PROTO_PING_MATCHBITS, 0,
+                         LNET_UNLINK, LNET_INS_AFTER,
+                         &meh);
+       if (rc != 0) {
+               CERROR("Can't create ping ME: %d\n", rc);
+               goto failed_1;
+       }
+
+       /* initialize md content */
+       infosz = offsetof(lnet_ping_info_t,
+                         pi_ni[the_lnet.ln_ping_info->pi_nnis]);
+       md.start     = the_lnet.ln_ping_info;
+       md.length    = infosz;
+       md.threshold = LNET_MD_THRESH_INF;
+       md.max_size  = 0;
+       md.options   = LNET_MD_OP_GET | LNET_MD_TRUNCATE |
+                      LNET_MD_MANAGE_REMOTE;
+       md.user_ptr  = NULL;
+       md.eq_handle = the_lnet.ln_ping_target_eq;
+
+       rc = LNetMDAttach(meh, md,
+                         LNET_RETAIN,
+                         &the_lnet.ln_ping_target_md);
+       if (rc != 0) {
+               CERROR("Can't attach ping MD: %d\n", rc);
+               goto failed_2;
+       }
+
+       return 0;
+
+ failed_2:
+       rc2 = LNetMEUnlink(meh);
+       LASSERT (rc2 == 0);
+ failed_1:
+       rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+       LASSERT (rc2 == 0);
+ failed_0:
+       lnet_destroy_ping_info();
+       return rc;
+}
+
+void
+lnet_ping_target_fini(void)
+{
+       lnet_event_t    event;
+       int          rc;
+       int          which;
+       int          timeout_ms = 1000;
+       sigset_t    blocked = cfs_block_allsigs();
+
+       LNetMDUnlink(the_lnet.ln_ping_target_md);
+       /* NB md could be busy; this just starts the unlink */
+
+       for (;;) {
+               rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+                               timeout_ms, &event, &which);
+
+               /* I expect overflow... */
+               LASSERT (rc >= 0 || rc == -EOVERFLOW);
+
+               if (rc == 0) {
+                       /* timed out: provide a diagnostic */
+                       CWARN("Still waiting for ping MD to unlink\n");
+                       timeout_ms *= 2;
+                       continue;
+               }
+
+               /* Got a valid event */
+               if (event.unlinked)
+                       break;
+       }
+
+       rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+       LASSERT (rc == 0);
+       lnet_destroy_ping_info();
+       cfs_restore_sigs(blocked);
+}
+
+int
+lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids)
+{
+       lnet_handle_eq_t     eqh;
+       lnet_handle_md_t     mdh;
+       lnet_event_t     event;
+       lnet_md_t           md = {0};
+       int               which;
+       int               unlinked = 0;
+       int               replied = 0;
+       const int           a_long_time = 60000; /* ms */
+       int               infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
+       lnet_ping_info_t    *info;
+       lnet_process_id_t    tmpid;
+       int               i;
+       int               nob;
+       int               rc;
+       int               rc2;
+       sigset_t         blocked;
+
+       if (n_ids <= 0 ||
+           id.nid == LNET_NID_ANY ||
+           timeout_ms > 500000 ||            /* arbitrary limit! */
+           n_ids > 20)                  /* arbitrary limit! */
+               return -EINVAL;
+
+       if (id.pid == LNET_PID_ANY)
+               id.pid = LUSTRE_SRV_LNET_PID;
+
+       LIBCFS_ALLOC(info, infosz);
+       if (info == NULL)
+               return -ENOMEM;
+
+       /* NB 2 events max (including any unlink event) */
+       rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+       if (rc != 0) {
+               CERROR("Can't allocate EQ: %d\n", rc);
+               goto out_0;
+       }
+
+       /* initialize md content */
+       md.start     = info;
+       md.length    = infosz;
+       md.threshold = 2; /*GET/REPLY*/
+       md.max_size  = 0;
+       md.options   = LNET_MD_TRUNCATE;
+       md.user_ptr  = NULL;
+       md.eq_handle = eqh;
+
+       rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+       if (rc != 0) {
+               CERROR("Can't bind MD: %d\n", rc);
+               goto out_1;
+       }
+
+       rc = LNetGet(LNET_NID_ANY, mdh, id,
+                    LNET_RESERVED_PORTAL,
+                    LNET_PROTO_PING_MATCHBITS, 0);
+
+       if (rc != 0) {
+               /* Don't CERROR; this could be deliberate! */
+
+               rc2 = LNetMDUnlink(mdh);
+               LASSERT (rc2 == 0);
+
+               /* NB must wait for the UNLINK event below... */
+               unlinked = 1;
+               timeout_ms = a_long_time;
+       }
+
+       do {
+               /* MUST block for unlink to complete */
+               if (unlinked)
+                       blocked = cfs_block_allsigs();
+
+               rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+               if (unlinked)
+                       cfs_restore_sigs(blocked);
+
+               CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+                      (rc2 <= 0) ? -1 : event.type,
+                      (rc2 <= 0) ? -1 : event.status,
+                      (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+               LASSERT (rc2 != -EOVERFLOW);     /* can't miss anything */
+
+               if (rc2 <= 0 || event.status != 0) {
+                       /* timeout or error */
+                       if (!replied && rc == 0)
+                               rc = (rc2 < 0) ? rc2 :
+                                    (rc2 == 0) ? -ETIMEDOUT :
+                                    event.status;
+
+                       if (!unlinked) {
+                               /* Ensure completion in finite time... */
+                               LNetMDUnlink(mdh);
+                               /* No assertion (racing with network) */
+                               unlinked = 1;
+                               timeout_ms = a_long_time;
+                       } else if (rc2 == 0) {
+                               /* timed out waiting for unlink */
+                               CWARN("ping %s: late network completion\n",
+                                     libcfs_id2str(id));
+                       }
+               } else if (event.type == LNET_EVENT_REPLY) {
+                       replied = 1;
+                       rc = event.mlength;
+               }
+
+       } while (rc2 <= 0 || !event.unlinked);
+
+       if (!replied) {
+               if (rc >= 0)
+                       CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+                             libcfs_id2str(id));
+               rc = -EIO;
+               goto out_1;
+       }
+
+       nob = rc;
+       LASSERT (nob >= 0 && nob <= infosz);
+
+       rc = -EPROTO;                      /* if I can't parse... */
+
+       if (nob < 8) {
+               /* can't check magic/version */
+               CERROR("%s: ping info too short %d\n",
+                      libcfs_id2str(id), nob);
+               goto out_1;
+       }
+
+       if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+               lnet_swap_pinginfo(info);
+       } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+               CERROR("%s: Unexpected magic %08x\n",
+                      libcfs_id2str(id), info->pi_magic);
+               goto out_1;
+       }
+
+       if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) {
+               CERROR("%s: ping w/o NI status: 0x%x\n",
+                      libcfs_id2str(id), info->pi_features);
+               goto out_1;
+       }
+
+       if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
+               CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
+                      nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
+               goto out_1;
+       }
+
+       if (info->pi_nnis < n_ids)
+               n_ids = info->pi_nnis;
+
+       if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
+               CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
+                      nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
+               goto out_1;
+       }
+
+       rc = -EFAULT;                      /* If I SEGV... */
+
+       for (i = 0; i < n_ids; i++) {
+               tmpid.pid = info->pi_pid;
+               tmpid.nid = info->pi_ni[i].ns_nid;
+               if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+                       goto out_1;
+       }
+       rc = info->pi_nnis;
+
+ out_1:
+       rc2 = LNetEQFree(eqh);
+       if (rc2 != 0)
+               CERROR("rc2 %d\n", rc2);
+       LASSERT (rc2 == 0);
+
+ out_0:
+       LIBCFS_FREE(info, infosz);
+       return rc;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c
new file mode 100644 (file)
index 0000000..28711e6
--- /dev/null
@@ -0,0 +1,1264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+typedef struct {                           /* tmp struct for parsing routes */
+       struct list_head         ltb_list;      /* stash on lists */
+       int             ltb_size;       /* allocated size */
+       char           ltb_text[0];     /* text buffer */
+} lnet_text_buf_t;
+
+static int lnet_tbnob = 0;                     /* track text buf allocation */
+#define LNET_MAX_TEXTBUF_NOB     (64<<10)      /* bound allocation */
+#define LNET_SINGLE_TEXTBUF_NOB  (4<<10)
+
+void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+       static char dots[LNET_SINGLE_TEXTBUF_NOB];
+       static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+
+       memset(dots, '.', sizeof(dots));
+       dots[sizeof(dots)-1] = 0;
+       memset(dashes, '-', sizeof(dashes));
+       dashes[sizeof(dashes)-1] = 0;
+
+       LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str);
+       LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n",
+                          (int)strlen(name), dots, offset, dots,
+                           (width < 1) ? 0 : width - 1, dashes);
+}
+
+int
+lnet_issep (char c)
+{
+       switch (c) {
+       case '\n':
+       case '\r':
+       case ';':
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+       struct list_head       *tmp;
+       lnet_ni_t       *ni;
+
+       list_for_each (tmp, nilist) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               if (LNET_NIDNET(ni->ni_nid) == net)
+                       return 0;
+       }
+
+       return 1;
+}
+
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+       if (ni->ni_refs != NULL)
+               cfs_percpt_free(ni->ni_refs);
+
+       if (ni->ni_tx_queues != NULL)
+               cfs_percpt_free(ni->ni_tx_queues);
+
+       if (ni->ni_cpts != NULL)
+               cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+       LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+lnet_ni_t *
+lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+{
+       struct lnet_tx_queue    *tq;
+       struct lnet_ni          *ni;
+       int                     rc;
+       int                     i;
+
+       if (!lnet_net_unique(net, nilist)) {
+               LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
+                                  libcfs_net2str(net));
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(ni, sizeof(*ni));
+       if (ni == NULL) {
+               CERROR("Out of memory creating network %s\n",
+                      libcfs_net2str(net));
+               return NULL;
+       }
+
+       spin_lock_init(&ni->ni_lock);
+       INIT_LIST_HEAD(&ni->ni_cptlist);
+       ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
+                                      sizeof(*ni->ni_refs[0]));
+       if (ni->ni_refs == NULL)
+               goto failed;
+
+       ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(),
+                                           sizeof(*ni->ni_tx_queues[0]));
+       if (ni->ni_tx_queues == NULL)
+               goto failed;
+
+       cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
+               INIT_LIST_HEAD(&tq->tq_delayed);
+
+       if (el == NULL) {
+               ni->ni_cpts  = NULL;
+               ni->ni_ncpts = LNET_CPT_NUMBER;
+       } else {
+               rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
+               if (rc <= 0) {
+                       CERROR("Failed to set CPTs for NI %s: %d\n",
+                              libcfs_net2str(net), rc);
+                       goto failed;
+               }
+
+               LASSERT(rc <= LNET_CPT_NUMBER);
+               if (rc == LNET_CPT_NUMBER) {
+                       LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0]));
+                       ni->ni_cpts = NULL;
+               }
+
+               ni->ni_ncpts = rc;
+       }
+
+       /* LND will fill in the address part of the NID */
+       ni->ni_nid = LNET_MKNID(net, 0);
+       ni->ni_last_alive = cfs_time_current_sec();
+       list_add_tail(&ni->ni_list, nilist);
+       return ni;
+ failed:
+       lnet_ni_free(ni);
+       return NULL;
+}
+
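+/*
+ * Parse the "networks" string into a list of NIs.  The string is a
+ * comma-separated list of network specs, each with an optional "(iface,...)"
+ * interface list and an optional "[...]" CPT expression, e.g. (illustrative)
+ * "tcp0(eth0),o2ib0(ib0)[0,1]".  The loopback network is always added
+ * implicitly.
+ */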
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+       struct cfs_expr_list *el = NULL;
+       int             tokensize = strlen(networks) + 1;
+       char            *tokens;
+       char            *str;
+       char            *tmp;
+       struct lnet_ni  *ni;
+       __u32           net;
+       int             nnets = 0;
+
+       if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+               /* _WAY_ conservative */
+               LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too "
+                                  "long\n");
+               return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(tokens, tokensize);
+       if (tokens == NULL) {
+               CERROR("Can't allocate net tokens\n");
+               return -ENOMEM;
+       }
+
+       the_lnet.ln_network_tokens = tokens;
+       the_lnet.ln_network_tokens_nob = tokensize;
+       memcpy (tokens, networks, tokensize);
+       str = tmp = tokens;
+
+       /* Add in the loopback network */
+       ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist);
+       if (ni == NULL)
+               goto failed;
+
+       while (str != NULL && *str != 0) {
+               char    *comma = strchr(str, ',');
+               char    *bracket = strchr(str, '(');
+               char    *square = strchr(str, '[');
+               char    *iface;
+               int     niface;
+               int     rc;
+
+               /* NB we don't check interface conflicts here; that is the
+                * LND's responsibility (if it cares at all) */
+
+               if (square != NULL && (comma == NULL || square < comma)) {
+                       /* e.g. o2ib0(ib0)[1,2]: the numbers between the
+                        * square brackets are the CPTs this NI should be
+                        * bound to */
+                       if (bracket != NULL && bracket > square) {
+                               tmp = square;
+                               goto failed_syntax;
+                       }
+
+                       tmp = strchr(square, ']');
+                       if (tmp == NULL) {
+                               tmp = square;
+                               goto failed_syntax;
+                       }
+
+                       rc = cfs_expr_list_parse(square, tmp - square + 1,
+                                                0, LNET_CPT_NUMBER - 1, &el);
+                       if (rc != 0) {
+                               tmp = square;
+                               goto failed_syntax;
+                       }
+
+                       while (square <= tmp)
+                               *square++ = ' ';
+               }
+
+               if (bracket == NULL ||
+                   (comma != NULL && comma < bracket)) {
+
+                       /* no interface list specified */
+
+                       if (comma != NULL)
+                               *comma++ = 0;
+                       net = libcfs_str2net(cfs_trimwhite(str));
+
+                       if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                               LCONSOLE_ERROR_MSG(0x113, "Unrecognised network"
+                                                  " type\n");
+                               tmp = str;
+                               goto failed_syntax;
+                       }
+
+                       if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
+                           lnet_ni_alloc(net, el, nilist) == NULL)
+                               goto failed;
+
+                       if (el != NULL) {
+                               cfs_expr_list_free(el);
+                               el = NULL;
+                       }
+
+                       str = comma;
+                       continue;
+               }
+
+               *bracket = 0;
+               net = libcfs_str2net(cfs_trimwhite(str));
+               if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                       tmp = str;
+                       goto failed_syntax;
+               }
+
+               nnets++;
+               ni = lnet_ni_alloc(net, el, nilist);
+               if (ni == NULL)
+                       goto failed;
+
+               if (el != NULL) {
+                       cfs_expr_list_free(el);
+                       el = NULL;
+               }
+
+               niface = 0;
+               iface = bracket + 1;
+
+               bracket = strchr(iface, ')');
+               if (bracket == NULL) {
+                       tmp = iface;
+                       goto failed_syntax;
+               }
+
+               *bracket = 0;
+               do {
+                       comma = strchr(iface, ',');
+                       if (comma != NULL)
+                               *comma++ = 0;
+
+                       iface = cfs_trimwhite(iface);
+                       if (*iface == 0) {
+                               tmp = iface;
+                               goto failed_syntax;
+                       }
+
+                       if (niface == LNET_MAX_INTERFACES) {
+                               LCONSOLE_ERROR_MSG(0x115, "Too many interfaces "
+                                                  "for net %s\n",
+                                                  libcfs_net2str(net));
+                               goto failed;
+                       }
+
+                       ni->ni_interfaces[niface++] = iface;
+                       iface = comma;
+               } while (iface != NULL);
+
+               str = bracket + 1;
+               comma = strchr(bracket + 1, ',');
+               if (comma != NULL) {
+                       *comma = 0;
+                       str = cfs_trimwhite(str);
+                       if (*str != 0) {
+                               tmp = str;
+                               goto failed_syntax;
+                       }
+                       str = comma + 1;
+                       continue;
+               }
+
+               str = cfs_trimwhite(str);
+               if (*str != 0) {
+                       tmp = str;
+                       goto failed_syntax;
+               }
+       }
+
+       LASSERT(!list_empty(nilist));
+       return 0;
+
+ failed_syntax:
+       lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+ failed:
+       while (!list_empty(nilist)) {
+               ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+               list_del(&ni->ni_list);
+               lnet_ni_free(ni);
+       }
+
+       if (el != NULL)
+               cfs_expr_list_free(el);
+
+       LIBCFS_FREE(tokens, tokensize);
+       the_lnet.ln_network_tokens = NULL;
+
+       return -EINVAL;
+}
+
+lnet_text_buf_t *
+lnet_new_text_buf (int str_len)
+{
+       lnet_text_buf_t *ltb;
+       int           nob;
+
+       /* NB allocate space for the terminating 0 */
+       nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+       if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+               /* _way_ conservative for "route net gateway..." */
+               CERROR("text buffer too big\n");
+               return NULL;
+       }
+
+       if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+               CERROR("Too many text buffers\n");
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(ltb, nob);
+       if (ltb == NULL)
+               return NULL;
+
+       ltb->ltb_size = nob;
+       ltb->ltb_text[0] = 0;
+       lnet_tbnob += nob;
+       return ltb;
+}
+
+void
+lnet_free_text_buf (lnet_text_buf_t *ltb)
+{
+       lnet_tbnob -= ltb->ltb_size;
+       LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+       lnet_text_buf_t  *ltb;
+
+       while (!list_empty(tbs)) {
+               ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+               list_del(&ltb->ltb_list);
+               lnet_free_text_buf(ltb);
+       }
+}
+
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+       struct list_head        *tmp;
+       lnet_text_buf_t   *ltb;
+
+       list_for_each (tmp, tbs) {
+               ltb = list_entry(tmp, lnet_text_buf_t, ltb_list);
+
+               CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+       }
+
+       CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
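+/*
+ * Split 'str' into one text buffer per command.  Commands are separated by
+ * newlines, carriage returns or ';' (see lnet_issep()); '#' starts a comment
+ * that runs to the next separator, and whitespace characters within a
+ * command are replaced with plain spaces.
+ */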
+int
+lnet_str2tbs_sep (struct list_head *tbs, char *str)
+{
+       struct list_head        pending;
+       char         *sep;
+       int            nob;
+       int            i;
+       lnet_text_buf_t  *ltb;
+
+       INIT_LIST_HEAD(&pending);
+
+       /* Split 'str' into separate commands */
+       for (;;) {
+               /* skip leading whitespace */
+               while (cfs_iswhite(*str))
+                       str++;
+
+               /* scan for separator or comment */
+               for (sep = str; *sep != 0; sep++)
+                       if (lnet_issep(*sep) || *sep == '#')
+                               break;
+
+               nob = (int)(sep - str);
+               if (nob > 0) {
+                       ltb = lnet_new_text_buf(nob);
+                       if (ltb == NULL) {
+                               lnet_free_text_bufs(&pending);
+                               return -1;
+                       }
+
+                       for (i = 0; i < nob; i++)
+                               if (cfs_iswhite(str[i]))
+                                       ltb->ltb_text[i] = ' ';
+                               else
+                                       ltb->ltb_text[i] = str[i];
+
+                       ltb->ltb_text[nob] = 0;
+
+                       list_add_tail(&ltb->ltb_list, &pending);
+               }
+
+               if (*sep == '#') {
+                       /* scan for separator */
+                       do {
+                               sep++;
+                       } while (*sep != 0 && !lnet_issep(*sep));
+               }
+
+               if (*sep == 0)
+                       break;
+
+               str = sep + 1;
+       }
+
+       list_splice(&pending, tbs->prev);
+       return 0;
+}
+
+int
+lnet_expand1tb (struct list_head *list,
+              char *str, char *sep1, char *sep2,
+              char *item, int itemlen)
+{
+       int           len1 = (int)(sep1 - str);
+       int           len2 = strlen(sep2 + 1);
+       lnet_text_buf_t *ltb;
+
+       LASSERT (*sep1 == '[');
+       LASSERT (*sep2 == ']');
+
+       ltb = lnet_new_text_buf(len1 + itemlen + len2);
+       if (ltb == NULL)
+               return -ENOMEM;
+
+       memcpy(ltb->ltb_text, str, len1);
+       memcpy(&ltb->ltb_text[len1], item, itemlen);
+       memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+       ltb->ltb_text[len1 + itemlen + len2] = 0;
+
+       list_add_tail(&ltb->ltb_list, list);
+       return 0;
+}
+
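+/*
+ * Expand one "[...]" group in 'str' into separate text buffers.  Each item
+ * between the brackets is either a literal token, a "lo-hi" range or a
+ * "lo-hi/stride" range; e.g. (illustrative) "10.10.0.[2-5/3]@tcp" expands to
+ * "10.10.0.2@tcp" and "10.10.0.5@tcp".  Returns 1 if an expansion was made,
+ * 0 if there was nothing to expand and -1 on error.
+ */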
+int
+lnet_str2tbs_expand (struct list_head *tbs, char *str)
+{
+       char          num[16];
+       struct list_head        pending;
+       char         *sep;
+       char         *sep2;
+       char         *parsed;
+       char         *enditem;
+       int            lo;
+       int            hi;
+       int            stride;
+       int            i;
+       int            nob;
+       int            scanned;
+
+       INIT_LIST_HEAD(&pending);
+
+       sep = strchr(str, '[');
+       if (sep == NULL)                        /* nothing to expand */
+               return 0;
+
+       sep2 = strchr(sep, ']');
+       if (sep2 == NULL)
+               goto failed;
+
+       for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+               enditem = ++parsed;
+               while (enditem < sep2 && *enditem != ',')
+                       enditem++;
+
+               if (enditem == parsed)          /* no empty items */
+                       goto failed;
+
+               if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+                       if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+                               /* simple string enumeration */
+                               if (lnet_expand1tb(&pending, str, sep, sep2,
+                                                  parsed, (int)(enditem - parsed)) != 0)
+                                       goto failed;
+
+                               continue;
+                       }
+
+                       stride = 1;
+               }
+
+               /* range expansion */
+
+               if (enditem != parsed + scanned) /* no trailing junk */
+                       goto failed;
+
+               if (hi < 0 || lo < 0 || stride < 0 || hi < lo ||
+                   (hi - lo) % stride != 0)
+                       goto failed;
+
+               for (i = lo; i <= hi; i += stride) {
+
+                       snprintf(num, sizeof(num), "%d", i);
+                       nob = strlen(num);
+                       if (nob + 1 == sizeof(num))
+                               goto failed;
+
+                       if (lnet_expand1tb(&pending, str, sep, sep2,
+                                          num, nob) != 0)
+                               goto failed;
+               }
+       }
+
+       list_splice(&pending, tbs->prev);
+       return 1;
+
+ failed:
+       lnet_free_text_bufs(&pending);
+       return -1;
+}
+
+int
+lnet_parse_hops (char *str, unsigned int *hops)
+{
+       int     len = strlen(str);
+       int     nob = len;
+
+       return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+               nob == len &&
+               *hops > 0 && *hops < 256);
+}
+
+
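+/*
+ * Parse a single route command of the form
+ * "<net> [<hops>] <gateway NID> [<gateway NID> ...]"; both the net and the
+ * gateway tokens may use "[...]" expansion, e.g. (illustrative)
+ * "o2ib0 1 192.168.0.[10-12]@tcp".  A gateway that is one of this node's
+ * own NIDs marks the node as a router instead of adding a route.
+ */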
+int
+lnet_parse_route (char *str, int *im_a_router)
+{
+       /* static scratch buffer OK (single threaded) */
+       static char       cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+       struct list_head        nets;
+       struct list_head        gateways;
+       struct list_head       *tmp1;
+       struct list_head       *tmp2;
+       __u32        net;
+       lnet_nid_t      nid;
+       lnet_text_buf_t  *ltb;
+       int            rc;
+       char         *sep;
+       char         *token = str;
+       int            ntokens = 0;
+       int            myrc = -1;
+       unsigned int      hops;
+       int            got_hops = 0;
+
+       INIT_LIST_HEAD(&gateways);
+       INIT_LIST_HEAD(&nets);
+
+       /* save a copy of the string for error messages */
+       strncpy(cmd, str, sizeof(cmd) - 1);
+       cmd[sizeof(cmd) - 1] = 0;
+
+       sep = str;
+       for (;;) {
+               /* scan for token start */
+               while (cfs_iswhite(*sep))
+                       sep++;
+               if (*sep == 0) {
+                       if (ntokens < (got_hops ? 3 : 2))
+                               goto token_error;
+                       break;
+               }
+
+               ntokens++;
+               token = sep++;
+
+               /* scan for token end */
+               while (*sep != 0 && !cfs_iswhite(*sep))
+                       sep++;
+               if (*sep != 0)
+                       *sep++ = 0;
+
+               if (ntokens == 1) {
+                       tmp2 = &nets;           /* expanding nets */
+               } else if (ntokens == 2 &&
+                          lnet_parse_hops(token, &hops)) {
+                       got_hops = 1;      /* got a hop count */
+                       continue;
+               } else {
+                       tmp2 = &gateways;       /* expanding gateways */
+               }
+
+               ltb = lnet_new_text_buf(strlen(token));
+               if (ltb == NULL)
+                       goto out;
+
+               strcpy(ltb->ltb_text, token);
+               tmp1 = &ltb->ltb_list;
+               list_add_tail(tmp1, tmp2);
+
+               while (tmp1 != tmp2) {
+                       ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+
+                       rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+                       if (rc < 0)
+                               goto token_error;
+
+                       tmp1 = tmp1->next;
+
+                       if (rc > 0) {           /* expanded! */
+                               list_del(&ltb->ltb_list);
+                               lnet_free_text_buf(ltb);
+                               continue;
+                       }
+
+                       if (ntokens == 1) {
+                               net = libcfs_str2net(ltb->ltb_text);
+                               if (net == LNET_NIDNET(LNET_NID_ANY) ||
+                                   LNET_NETTYP(net) == LOLND)
+                                       goto token_error;
+                       } else {
+                               nid = libcfs_str2nid(ltb->ltb_text);
+                               if (nid == LNET_NID_ANY ||
+                                   LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+                                       goto token_error;
+                       }
+               }
+       }
+
+       if (!got_hops)
+               hops = 1;
+
+       LASSERT (!list_empty(&nets));
+       LASSERT (!list_empty(&gateways));
+
+       list_for_each (tmp1, &nets) {
+               ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+               net = libcfs_str2net(ltb->ltb_text);
+               LASSERT (net != LNET_NIDNET(LNET_NID_ANY));
+
+               list_for_each (tmp2, &gateways) {
+                       ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list);
+                       nid = libcfs_str2nid(ltb->ltb_text);
+                       LASSERT (nid != LNET_NID_ANY);
+
+                       if (lnet_islocalnid(nid)) {
+                               *im_a_router = 1;
+                               continue;
+                       }
+
+                       rc = lnet_add_route (net, hops, nid);
+                       if (rc != 0) {
+                               CERROR("Can't create route "
+                                      "to %s via %s\n",
+                                      libcfs_net2str(net),
+                                      libcfs_nid2str(nid));
+                               goto out;
+                       }
+               }
+       }
+
+       myrc = 0;
+       goto out;
+
+ token_error:
+       lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+ out:
+       lnet_free_text_bufs(&nets);
+       lnet_free_text_bufs(&gateways);
+       return myrc;
+}
+
+int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+       lnet_text_buf_t   *ltb;
+
+       while (!list_empty(tbs)) {
+               ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+               if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+                       lnet_free_text_bufs(tbs);
+                       return -EINVAL;
+               }
+
+               list_del(&ltb->ltb_list);
+               lnet_free_text_buf(ltb);
+       }
+
+       return 0;
+}
+
+int
+lnet_parse_routes (char *routes, int *im_a_router)
+{
+       struct list_head        tbs;
+       int            rc = 0;
+
+       *im_a_router = 0;
+
+       INIT_LIST_HEAD(&tbs);
+
+       if (lnet_str2tbs_sep(&tbs, routes) < 0) {
+               CERROR("Error parsing routes\n");
+               rc = -EINVAL;
+       } else {
+               rc = lnet_parse_route_tbs(&tbs, im_a_router);
+       }
+
+       LASSERT (lnet_tbnob == 0);
+       return rc;
+}
+
+int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+       LIST_HEAD       (list);
+       int             rc;
+       int             i;
+
+       rc = cfs_ip_addr_parse(token, len, &list);
+       if (rc != 0)
+               return rc;
+
+       for (rc = i = 0; !rc && i < nip; i++)
+               rc = cfs_ip_addr_match(ipaddrs[i], &list);
+
+       cfs_ip_addr_free(&list);
+
+       return rc;
+}
+
+int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+       static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+       int   matched = 0;
+       int   ntokens = 0;
+       int   len;
+       char *net = NULL;
+       char *sep;
+       char *token;
+       int   rc;
+
+       LASSERT (strlen(net_entry) < sizeof(tokens));
+
+       /* work on a copy of the string */
+       strcpy(tokens, net_entry);
+       sep = tokens;
+       for (;;) {
+               /* scan for token start */
+               while (cfs_iswhite(*sep))
+                       sep++;
+               if (*sep == 0)
+                       break;
+
+               token = sep++;
+
+               /* scan for token end */
+               while (*sep != 0 && !cfs_iswhite(*sep))
+                       sep++;
+               if (*sep != 0)
+                       *sep++ = 0;
+
+               if (ntokens++ == 0) {
+                       net = token;
+                       continue;
+               }
+
+               len = strlen(token);
+
+               rc = lnet_match_network_token(token, len, ipaddrs, nip);
+               if (rc < 0) {
+                       lnet_syntax("ip2nets", net_entry,
+                                   (int)(token - tokens), len);
+                       return rc;
+               }
+
+               matched |= (rc != 0);
+       }
+
+       if (!matched)
+               return 0;
+
+       strcpy(net_entry, net);          /* replace with matched net */
+       return 1;
+}
+
+__u32
+lnet_netspec2net(char *netspec)
+{
+       char   *bracket = strchr(netspec, '(');
+       __u32   net;
+
+       if (bracket != NULL)
+               *bracket = 0;
+
+       net = libcfs_str2net(netspec);
+
+       if (bracket != NULL)
+               *bracket = '(';
+
+       return net;
+}
+
+int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+       int            offset = 0;
+       int            offset2;
+       int            len;
+       lnet_text_buf_t  *tb;
+       lnet_text_buf_t  *tb2;
+       struct list_head       *t;
+       char         *sep;
+       char         *bracket;
+       __u32        net;
+
+       LASSERT (!list_empty(nets));
+       LASSERT (nets->next == nets->prev);     /* single entry */
+
+       tb = list_entry(nets->next, lnet_text_buf_t, ltb_list);
+
+       for (;;) {
+               sep = strchr(tb->ltb_text, ',');
+               bracket = strchr(tb->ltb_text, '(');
+
+               if (sep != NULL &&
+                   bracket != NULL &&
+                   bracket < sep) {
+                       /* netspec lists interfaces... */
+
+                       offset2 = offset + (int)(bracket - tb->ltb_text);
+                       len = strlen(bracket);
+
+                       bracket = strchr(bracket + 1, ')');
+
+                       if (bracket == NULL ||
+                           !(bracket[1] == ',' || bracket[1] == 0)) {
+                               lnet_syntax("ip2nets", source, offset2, len);
+                               return -EINVAL;
+                       }
+
+                       sep = (bracket[1] == 0) ? NULL : bracket + 1;
+               }
+
+               if (sep != NULL)
+                       *sep++ = 0;
+
+               net = lnet_netspec2net(tb->ltb_text);
+               if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                       lnet_syntax("ip2nets", source, offset,
+                                   strlen(tb->ltb_text));
+                       return -EINVAL;
+               }
+
+               list_for_each(t, nets) {
+                       tb2 = list_entry(t, lnet_text_buf_t, ltb_list);
+
+                       if (tb2 == tb)
+                               continue;
+
+                       if (net == lnet_netspec2net(tb2->ltb_text)) {
+                               /* duplicate network */
+                               lnet_syntax("ip2nets", source, offset,
+                                           strlen(tb->ltb_text));
+                               return -EINVAL;
+                       }
+               }
+
+               if (sep == NULL)
+                       return 0;
+
+               offset += (int)(sep - tb->ltb_text);
+               tb2 = lnet_new_text_buf(strlen(sep));
+               if (tb2 == NULL)
+                       return -ENOMEM;
+
+               strcpy(tb2->ltb_text, sep);
+               list_add_tail(&tb2->ltb_list, nets);
+
+               tb = tb2;
+       }
+}
+
+int
+lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+       static char     networks[LNET_SINGLE_TEXTBUF_NOB];
+       static char     source[LNET_SINGLE_TEXTBUF_NOB];
+
+       struct list_head          raw_entries;
+       struct list_head          matched_nets;
+       struct list_head          current_nets;
+       struct list_head         *t;
+       struct list_head         *t2;
+       lnet_text_buf_t    *tb;
+       lnet_text_buf_t    *tb2;
+       __u32          net1;
+       __u32          net2;
+       int              len;
+       int              count;
+       int              dup;
+       int              rc;
+
+       INIT_LIST_HEAD(&raw_entries);
+       if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+               CERROR("Error parsing ip2nets\n");
+               LASSERT (lnet_tbnob == 0);
+               return -EINVAL;
+       }
+
+       INIT_LIST_HEAD(&matched_nets);
+       INIT_LIST_HEAD(&current_nets);
+       networks[0] = 0;
+       count = 0;
+       len = 0;
+       rc = 0;
+
+       while (!list_empty(&raw_entries)) {
+               tb = list_entry(raw_entries.next, lnet_text_buf_t,
+                                   ltb_list);
+
+               strncpy(source, tb->ltb_text, sizeof(source)-1);
+               source[sizeof(source)-1] = 0;
+
+               /* on a match, replace ltb_text with the matching network(s) */
+               rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+               if (rc < 0)
+                       break;
+
+               list_del(&tb->ltb_list);
+
+               if (rc == 0) {            /* no match */
+                       lnet_free_text_buf(tb);
+                       continue;
+               }
+
+               /* split into separate networks */
+               INIT_LIST_HEAD(&current_nets);
+               list_add(&tb->ltb_list, &current_nets);
+               rc = lnet_splitnets(source, &current_nets);
+               if (rc < 0)
+                       break;
+
+               dup = 0;
+               list_for_each (t, &current_nets) {
+                       tb = list_entry(t, lnet_text_buf_t, ltb_list);
+                       net1 = lnet_netspec2net(tb->ltb_text);
+                       LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY));
+
+                       list_for_each(t2, &matched_nets) {
+                               tb2 = list_entry(t2, lnet_text_buf_t,
+                                                    ltb_list);
+                               net2 = lnet_netspec2net(tb2->ltb_text);
+                               LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY));
+
+                               if (net1 == net2) {
+                                       dup = 1;
+                                       break;
+                               }
+                       }
+
+                       if (dup)
+                               break;
+               }
+
+               if (dup) {
+                       lnet_free_text_bufs(&current_nets);
+                       continue;
+               }
+
+               list_for_each_safe(t, t2, &current_nets) {
+                       tb = list_entry(t, lnet_text_buf_t, ltb_list);
+
+                       list_del(&tb->ltb_list);
+                       list_add_tail(&tb->ltb_list, &matched_nets);
+
+                       len += snprintf(networks + len, sizeof(networks) - len,
+                                       "%s%s", (len == 0) ? "" : ",",
+                                       tb->ltb_text);
+
+                       if (len >= sizeof(networks)) {
+                               CERROR("Too many matched networks\n");
+                               rc = -E2BIG;
+                               goto out;
+                       }
+               }
+
+               count++;
+       }
+
+ out:
+       lnet_free_text_bufs(&raw_entries);
+       lnet_free_text_bufs(&matched_nets);
+       lnet_free_text_bufs(&current_nets);
+       LASSERT (lnet_tbnob == 0);
+
+       if (rc < 0)
+               return rc;
+
+       *networksp = networks;
+       return count;
+}
+
+void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{
+       LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+int
+lnet_ipaddr_enumerate (__u32 **ipaddrsp)
+{
+       int     up;
+       __u32      netmask;
+       __u32     *ipaddrs;
+       __u32     *ipaddrs2;
+       int     nip;
+       char     **ifnames;
+       int     nif = libcfs_ipif_enumerate(&ifnames);
+       int     i;
+       int     rc;
+
+       if (nif <= 0)
+               return nif;
+
+       LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+       if (ipaddrs == NULL) {
+               CERROR("Can't allocate ipaddrs[%d]\n", nif);
+               libcfs_ipif_free_enumeration(ifnames, nif);
+               return -ENOMEM;
+       }
+
+       for (i = nip = 0; i < nif; i++) {
+               if (!strcmp(ifnames[i], "lo"))
+                       continue;
+
+               rc = libcfs_ipif_query(ifnames[i], &up,
+                                      &ipaddrs[nip], &netmask);
+               if (rc != 0) {
+                       CWARN("Can't query interface %s: %d\n",
+                             ifnames[i], rc);
+                       continue;
+               }
+
+               if (!up) {
+                       CWARN("Ignoring interface %s: it's down\n",
+                             ifnames[i]);
+                       continue;
+               }
+
+               nip++;
+       }
+
+       libcfs_ipif_free_enumeration(ifnames, nif);
+
+       if (nip == nif) {
+               *ipaddrsp = ipaddrs;
+       } else {
+               if (nip > 0) {
+                       LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+                       if (ipaddrs2 == NULL) {
+                               CERROR("Can't allocate ipaddrs[%d]\n", nip);
+                               nip = -ENOMEM;
+                       } else {
+                               memcpy(ipaddrs2, ipaddrs,
+                                      nip * sizeof(*ipaddrs));
+                               *ipaddrsp = ipaddrs2;
+                               rc = nip;
+                       }
+               }
+               lnet_ipaddr_free_enumeration(ipaddrs, nif);
+       }
+       return nip;
+}
+
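+/*
+ * Build a "networks" string from 'ip2nets' by matching the IP addresses of
+ * all local interfaces that are up (excluding "lo") against each entry.
+ * Entries are separated by ';' or newlines and have the form
+ * "<net-spec> <ip-pattern> [<ip-pattern> ...]"; an illustrative (assumed)
+ * value is ip2nets="tcp(eth0) 192.168.0.*; o2ib 10.10.*.*", where the exact
+ * pattern syntax is defined by cfs_ip_addr_parse().
+ */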
+int
+lnet_parse_ip2nets (char **networksp, char *ip2nets)
+{
+       __u32     *ipaddrs;
+       int     nip = lnet_ipaddr_enumerate(&ipaddrs);
+       int     rc;
+
+       if (nip < 0) {
+               LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP "
+                                  "interfaces for ip2nets to match\n", nip);
+               return nip;
+       }
+
+       if (nip == 0) {
+               LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces "
+                                  "for ip2nets to match\n");
+               return -ENOENT;
+       }
+
+       rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip);
+       lnet_ipaddr_free_enumeration(ipaddrs, nip);
+
+       if (rc < 0) {
+               LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc);
+               return rc;
+       }
+
+       if (rc == 0) {
+               LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match "
+                                  "any local IP interfaces\n");
+               return -ENOENT;
+       }
+
+       return 0;
+}
+
+int
+lnet_set_ip_niaddr (lnet_ni_t *ni)
+{
+       __u32  net = LNET_NIDNET(ni->ni_nid);
+       char **names;
+       int    n;
+       __u32  ip;
+       __u32  netmask;
+       int    up;
+       int    i;
+       int    rc;
+
+       /* Convenience for LNDs that use the IP address of a local interface as
+        * the local address part of their NID */
+
+       if (ni->ni_interfaces[0] != NULL) {
+
+               CLASSERT (LNET_MAX_INTERFACES > 1);
+
+               if (ni->ni_interfaces[1] != NULL) {
+                       CERROR("Net %s doesn't support multiple interfaces\n",
+                              libcfs_net2str(net));
+                       return -EPERM;
+               }
+
+               rc = libcfs_ipif_query(ni->ni_interfaces[0],
+                                      &up, &ip, &netmask);
+               if (rc != 0) {
+                       CERROR("Net %s can't query interface %s: %d\n",
+                              libcfs_net2str(net), ni->ni_interfaces[0], rc);
+                       return -EPERM;
+               }
+
+               if (!up) {
+                       CERROR("Net %s can't use interface %s: it's down\n",
+                              libcfs_net2str(net), ni->ni_interfaces[0]);
+                       return -ENETDOWN;
+               }
+
+               ni->ni_nid = LNET_MKNID(net, ip);
+               return 0;
+       }
+
+       n = libcfs_ipif_enumerate(&names);
+       if (n <= 0) {
+               CERROR("Net %s can't enumerate interfaces: %d\n",
+                      libcfs_net2str(net), n);
+               return 0;
+       }
+
+       for (i = 0; i < n; i++) {
+               if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+                       continue;
+
+               rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+
+               if (rc != 0) {
+                       CWARN("Net %s can't query interface %s: %d\n",
+                             libcfs_net2str(net), names[i], rc);
+                       continue;
+               }
+
+               if (!up) {
+                       CWARN("Net %s ignoring interface %s (down)\n",
+                             libcfs_net2str(net), names[i]);
+                       continue;
+               }
+
+               libcfs_ipif_free_enumeration(names, n);
+               ni->ni_nid = LNET_MKNID(net, ip);
+               return 0;
+       }
+
+       CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+       libcfs_ipif_free_enumeration(names, n);
+       return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
new file mode 100644 (file)
index 0000000..78297a7
--- /dev/null
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-eq.c
+ *
+ * Library level Event queue management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create an event queue that has room for \a count number of events.
+ *
+ * The event queue is circular and older events will be overwritten by new
+ * ones if they are not removed in time by the user using the functions
+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
+ * determine the appropriate size of the event queue to prevent this loss
+ * of events. Note that when an EQ handler is specified in \a callback, no
+ * event loss can happen, since the handler is run for each event deposited
+ * into the EQ.
+ *
+ * \param count The number of events to be stored in the event queue. It
+ * will be rounded up to the next power of two.
+ * \param callback A handler function that runs when an event is deposited
+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
+ * indicate that no event handler is desired.
+ * \param handle On successful return, this location will hold a handle for
+ * the newly created EQ.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If a parameter is not valid.
+ * \retval -ENOMEM If memory for the EQ can't be allocated.
+ *
+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
+ */
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
+           lnet_handle_eq_t *handle)
+{
+       lnet_eq_t     *eq;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       /* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+        * overflow they don't skip entries, and the queue keeps the same
+        * apparent capacity at all times */
+
+       count = cfs_power2_roundup(count);
+
+       if (callback != LNET_EQ_HANDLER_NONE && count != 0) {
+               CWARN("An EQ callback is guaranteed to receive every event; "
+                     "setting eqcount %d for polling as well only adds "
+                     "locking overhead. Please contact the developers "
+                     "if this is really intended\n", count);
+       }
+
+       /* count may be zero if only the callback is needed; this
+        * eliminates the overhead of enqueueing events */
+       if (count == 0 && callback == LNET_EQ_HANDLER_NONE)
+               return -EINVAL;
+
+       eq = lnet_eq_alloc();
+       if (eq == NULL)
+               return -ENOMEM;
+
+       if (count != 0) {
+               LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
+               if (eq->eq_events == NULL)
+                       goto failed;
+               /* NB the allocator has set all event sequence numbers to 0,
+                * so all of them are earlier than eq_deq_seq */
+       }
+
+       eq->eq_deq_seq = 1;
+       eq->eq_enq_seq = 1;
+       eq->eq_size = count;
+       eq->eq_callback = callback;
+
+       eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
+                                      sizeof(*eq->eq_refs[0]));
+       if (eq->eq_refs == NULL)
+               goto failed;
+
+       /* MUST hold the exclusive lnet_res_lock */
+       lnet_res_lock(LNET_LOCK_EX);
+       /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+        * both EQ lookup and poll event with only lnet_eq_wait_lock */
+       lnet_eq_wait_lock();
+
+       lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
+       list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
+
+       lnet_eq_wait_unlock();
+       lnet_res_unlock(LNET_LOCK_EX);
+
+       lnet_eq2handle(handle, eq);
+       return 0;
+
+failed:
+       if (eq->eq_events != NULL)
+               LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t));
+
+       if (eq->eq_refs != NULL)
+               cfs_percpt_free(eq->eq_refs);
+
+       lnet_eq_free(eq);
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(LNetEQAlloc);
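+
+/*
+ * Usage sketch (illustrative only; error handling omitted):
+ *
+ *     lnet_handle_eq_t eqh;
+ *     lnet_event_t     ev;
+ *     int              which;
+ *
+ *     LNetEQAlloc(8, LNET_EQ_HANDLER_NONE, &eqh);
+ *     ... attach MDs to eqh via lnet_md_t::eq_handle ...
+ *     LNetEQPoll(&eqh, 1, 1000, &ev, &which);
+ *     LNetEQFree(eqh);
+ */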
+
+/**
+ * Release the resources associated with an event queue if it's idle;
+ * otherwise do nothing and it's up to the user to try again.
+ *
+ * \param eqh A handle for the event queue to be released.
+ *
+ * \retval 0 If the EQ is not in use and freed.
+ * \retval -ENOENT If \a eqh does not point to a valid EQ.
+ * \retval -EBUSY  If the EQ is still in use by some MDs.
+ */
+int
+LNetEQFree(lnet_handle_eq_t eqh)
+{
+       struct lnet_eq  *eq;
+       lnet_event_t    *events = NULL;
+       int             **refs = NULL;
+       int             *ref;
+       int             rc = 0;
+       int             size = 0;
+       int             i;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       lnet_res_lock(LNET_LOCK_EX);
+       /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+        * both EQ lookup and poll event with only lnet_eq_wait_lock */
+       lnet_eq_wait_lock();
+
+       eq = lnet_handle2eq(&eqh);
+       if (eq == NULL) {
+               rc = -ENOENT;
+               goto out;
+       }
+
+       cfs_percpt_for_each(ref, i, eq->eq_refs) {
+               LASSERT(*ref >= 0);
+               if (*ref == 0)
+                       continue;
+
+               CDEBUG(D_NET, "Event queue (%d: %d) busy on destroy.\n",
+                      i, *ref);
+               rc = -EBUSY;
+               goto out;
+       }
+
+       /* stash for free after lock dropped */
+       events  = eq->eq_events;
+       size    = eq->eq_size;
+       refs    = eq->eq_refs;
+
+       lnet_res_lh_invalidate(&eq->eq_lh);
+       list_del(&eq->eq_list);
+       lnet_eq_free_locked(eq);
+ out:
+       lnet_eq_wait_unlock();
+       lnet_res_unlock(LNET_LOCK_EX);
+
+       if (events != NULL)
+               LIBCFS_FREE(events, size * sizeof(lnet_event_t));
+       if (refs != NULL)
+               cfs_percpt_free(refs);
+
+       return rc;
+}
+EXPORT_SYMBOL(LNetEQFree);
+
+void
+lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+       /* MUST be called with the resource lock held,
+        * but w/o lnet_eq_wait_lock */
+       int index;
+
+       if (eq->eq_size == 0) {
+               LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
+               eq->eq_callback(ev);
+               return;
+       }
+
+       lnet_eq_wait_lock();
+       ev->sequence = eq->eq_enq_seq++;
+
+       LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size));
+       index = ev->sequence & (eq->eq_size - 1);
+
+       eq->eq_events[index] = *ev;
+
+       if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
+               eq->eq_callback(ev);
+
+       /* Wake anyone waiting in LNetEQPoll() */
+       if (waitqueue_active(&the_lnet.ln_eq_waitq))
+               wake_up_all(&the_lnet.ln_eq_waitq);
+       lnet_eq_wait_unlock();
+}
+
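+/*
+ * Dequeue the event at eq_deq_seq, if any.  Because eq_size is a power of
+ * two, comparing the stored event's sequence number with eq_deq_seq shows
+ * whether the slot holds the expected event (equal: return 1), no new event
+ * yet (older: return 0), or whether events have been overwritten before
+ * being consumed (newer: return -EOVERFLOW).
+ */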
+int
+lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+       int             new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+       lnet_event_t    *new_event = &eq->eq_events[new_index];
+       int             rc;
+       ENTRY;
+
+       /* must be called with lnet_eq_wait_lock held */
+       if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
+               RETURN(0);
+
+       /* We've got a new event... */
+       *ev = *new_event;
+
+       CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+              new_event, eq->eq_deq_seq, eq->eq_size);
+
+       /* ...but did it overwrite an event we've not seen yet? */
+       if (eq->eq_deq_seq == new_event->sequence) {
+               rc = 1;
+       } else {
+               /* don't complain with CERROR: some EQs are sized small
+                * anyway; if it's important, the caller should complain */
+               CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
+                      eq->eq_deq_seq, new_event->sequence);
+               rc = -EOVERFLOW;
+       }
+
+       eq->eq_deq_seq = new_event->sequence + 1;
+       RETURN(rc);
+}
+
+/**
+ * A nonblocking function that can be used to get the next event in an EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. The event is removed from the queue.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 0     No pending event in the EQ.
+ * \retval 1     Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+       int which;
+
+       return LNetEQPoll(&eventq, 1, 0,
+                        event, &which);
+}
+EXPORT_SYMBOL(LNetEQGet);
+
+/**
+ * Block the calling process until there is an event in the EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. This function returns the next event
+ * in the EQ and removes it from the EQ.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 1     Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+       int which;
+
+       return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER,
+                        event, &which);
+}
+EXPORT_SYMBOL(LNetEQWait);
+
+
+static int
+lnet_eq_wait_locked(int *timeout_ms)
+{
+       int             tms = *timeout_ms;
+       int             wait;
+       wait_queue_t  wl;
+       cfs_time_t      now;
+
+       if (tms == 0)
+               return -1; /* don't want to wait and there is no new event */
+
+       init_waitqueue_entry_current(&wl);
+       set_current_state(TASK_INTERRUPTIBLE);
+       add_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+       lnet_eq_wait_unlock();
+
+       if (tms < 0) {
+               waitq_wait(&wl, TASK_INTERRUPTIBLE);
+
+       } else {
+               struct timeval tv;
+
+               now = cfs_time_current();
+               waitq_timedwait(&wl, TASK_INTERRUPTIBLE,
+                                   cfs_time_seconds(tms) / 1000);
+               cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv);
+               tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+               if (tms < 0) /* no more wait but may have new event */
+                       tms = 0;
+       }
+
+       wait = tms != 0; /* might need to call here again */
+       *timeout_ms = tms;
+
+       lnet_eq_wait_lock();
+       remove_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+       return wait;
+}
+
+
+
+/**
+ * Block the calling process until there's an event from a set of EQs or
+ * timeout happens.
+ *
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully, in which case the corresponding event
+ * is consumed.
+ *
+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a
+ * fixed period, or block indefinitely.
+ *
+ * \param eventqs,neq An array of EQ handles, and size of the array.
+ * \param timeout_ms Time in milliseconds to wait for an event to occur on
+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an
+ * infinite timeout.
+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will
+ * hold the next event in the EQs, and \a which will contain the index of the
+ * EQ from which the event was taken.
+ *
+ * \retval 0     No pending event in the EQs after timeout.
+ * \retval 1     Indicates success.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ indicated by \a which has been dropped due to limited space in the EQ.
+ * \retval -ENOENT    If there's an invalid handle in \a eventqs.
+ */
+int
+LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+          lnet_event_t *event, int *which)
+{
+       int     wait = 1;
+       int     rc;
+       int     i;
+       ENTRY;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (neq < 1)
+               RETURN(-ENOENT);
+
+       lnet_eq_wait_lock();
+
+       for (;;) {
+               for (i = 0; i < neq; i++) {
+                       lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+                       if (eq == NULL) {
+                               lnet_eq_wait_unlock();
+                               RETURN(-ENOENT);
+                       }
+
+                       rc = lnet_eq_dequeue_event(eq, event);
+                       if (rc != 0) {
+                               lnet_eq_wait_unlock();
+                               *which = i;
+                               RETURN(rc);
+                       }
+               }
+
+               if (wait == 0)
+                       break;
+
+               /*
+                * Return value of lnet_eq_wait_locked:
+                * -1 : did not wait and there is certainly no new event
+                *  1 : slept with time remaining, so dequeue and wait again
+                *  0 : timeout expired, but a new event may have arrived,
+                *      so dequeue needs to be called one last time
+                */
+               wait = lnet_eq_wait_locked(&timeout_ms);
+               if (wait < 0) /* no new event */
+                       break;
+       }
+
+       lnet_eq_wait_unlock();
+       RETURN(0);
+}
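
The timeout and \a which semantics can be exercised as follows; this is a hedged sketch (illustrative only, not part of this patch) assuming two valid EQ handles:

/* Illustrative sketch, not part of the patch: poll two EQs with a one-second
 * timeout and report which queue produced the event. */
static int example_poll_two(lnet_handle_eq_t eq0, lnet_handle_eq_t eq1)
{
	lnet_handle_eq_t eqs[2];
	lnet_event_t ev;
	int which;
	int rc;

	eqs[0] = eq0;
	eqs[1] = eq1;

	rc = LNetEQPoll(eqs, 2, 1000, &ev, &which);
	if (rc == 0)		/* nothing happened within 1000 ms */
		return 0;
	if (rc == 1 || rc == -EOVERFLOW)
		CDEBUG(D_NET, "event arrived on EQ #%d\n", which);
	return rc;
}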
diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c
new file mode 100644 (file)
index 0000000..ae643f2
--- /dev/null
@@ -0,0 +1,451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-md.c
+ *
+ * Memory Descriptor management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_unlink(lnet_libmd_t *md)
+{
+       if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
+               /* first unlink attempt... */
+               lnet_me_t *me = md->md_me;
+
+               md->md_flags |= LNET_MD_FLAG_ZOMBIE;
+
+               /* Disassociate from ME (if any), and unlink it if it was created
+                * with LNET_UNLINK */
+               if (me != NULL) {
+                       /* detach MD from portal */
+                       lnet_ptl_detach_md(me, md);
+                       if (me->me_unlink == LNET_UNLINK)
+                               lnet_me_unlink(me);
+               }
+
+               /* ensure all future handle lookups fail */
+               lnet_res_lh_invalidate(&md->md_lh);
+       }
+
+       if (md->md_refcount != 0) {
+               CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+               return;
+       }
+
+       CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+       if (md->md_eq != NULL) {
+               int     cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+
+               LASSERT(*md->md_eq->eq_refs[cpt] > 0);
+               (*md->md_eq->eq_refs[cpt])--;
+       }
+
+       LASSERT(!list_empty(&md->md_list));
+       list_del_init(&md->md_list);
+       lnet_md_free_locked(md);
+}
+
+static int
+lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
+{
+       int       i;
+       unsigned int niov;
+       int       total_length = 0;
+
+       lmd->md_me = NULL;
+       lmd->md_start = umd->start;
+       lmd->md_offset = 0;
+       lmd->md_max_size = umd->max_size;
+       lmd->md_options = umd->options;
+       lmd->md_user_ptr = umd->user_ptr;
+       lmd->md_eq = NULL;
+       lmd->md_threshold = umd->threshold;
+       lmd->md_refcount = 0;
+       lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+       if ((umd->options & LNET_MD_IOVEC) != 0) {
+
+               if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+                       return -EINVAL;
+
+               lmd->md_niov = niov = umd->length;
+               memcpy(lmd->md_iov.iov, umd->start,
+                      niov * sizeof (lmd->md_iov.iov[0]));
+
+               for (i = 0; i < (int)niov; i++) {
+                       /* We take the base address on trust */
+                       if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
+                               return -EINVAL;
+
+                       total_length += lmd->md_iov.iov[i].iov_len;
+               }
+
+               lmd->md_length = total_length;
+
+               if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+                   (umd->max_size < 0 ||
+                    umd->max_size > total_length)) /* illegal max_size */
+                       return -EINVAL;
+
+       } else if ((umd->options & LNET_MD_KIOV) != 0) {
+               lmd->md_niov = niov = umd->length;
+               memcpy(lmd->md_iov.kiov, umd->start,
+                      niov * sizeof (lmd->md_iov.kiov[0]));
+
+               for (i = 0; i < (int)niov; i++) {
+                       /* We take the page pointer on trust */
+                       if (lmd->md_iov.kiov[i].kiov_offset +
+                           lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE)
+                               return -EINVAL; /* invalid length */
+
+                       total_length += lmd->md_iov.kiov[i].kiov_len;
+               }
+
+               lmd->md_length = total_length;
+
+               if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+                   (umd->max_size < 0 ||
+                    umd->max_size > total_length)) /* illegal max_size */
+                       return -EINVAL;
+       } else {   /* contiguous */
+               lmd->md_length = umd->length;
+               lmd->md_niov = niov = 1;
+               lmd->md_iov.iov[0].iov_base = umd->start;
+               lmd->md_iov.iov[0].iov_len = umd->length;
+
+               if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+                   (umd->max_size < 0 ||
+                    umd->max_size > (int)umd->length)) /* illegal max_size */
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* must be called with resource lock held */
+static int
+lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt)
+{
+       struct lnet_res_container *container = the_lnet.ln_md_containers[cpt];
+
+       /* NB we are passed an allocated, but inactive md.
+        * if we return success, caller may lnet_md_unlink() it.
+        * otherwise caller may only lnet_md_free() it.
+        */
+       /* This implementation doesn't know how to create START events or
+        * disable END events.  Best to LASSERT our caller is compliant so
+        * we find out quickly...  */
+       /* TODO - re-evaluate what should be here in light of
+        * the removal of the start and end events
+        * (maybe we shouldn't even allow LNET_EQ_NONE!)
+        * LASSERT (eq == NULL);
+        */
+       if (!LNetHandleIsInvalid(eq_handle)) {
+               md->md_eq = lnet_handle2eq(&eq_handle);
+
+               if (md->md_eq == NULL)
+                       return -ENOENT;
+
+               (*md->md_eq->eq_refs[cpt])++;
+       }
+
+       lnet_res_lh_initialize(container, &md->md_lh);
+
+       LASSERT(list_empty(&md->md_list));
+       list_add(&md->md_list, &container->rec_active);
+
+       return 0;
+}
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
+{
+       /* NB this doesn't copy out all the iov entries so when a
+        * discontiguous MD is copied out, the target gets to know the
+        * original iov pointer (in start) and the number of entries it had
+        * and that's all.
+        */
+       umd->start = lmd->md_start;
+       umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ?
+                     lmd->md_length : lmd->md_niov;
+       umd->threshold = lmd->md_threshold;
+       umd->max_size = lmd->md_max_size;
+       umd->options = lmd->md_options;
+       umd->user_ptr = lmd->md_user_ptr;
+       lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
+}
+
+int
+lnet_md_validate(lnet_md_t *umd)
+{
+       if (umd->start == NULL && umd->length != 0) {
+               CERROR("MD start pointer can not be NULL with length %u\n",
+                      umd->length);
+               return -EINVAL;
+       }
+
+       if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
+           umd->length > LNET_MAX_IOV) {
+               CERROR("Invalid option: too many fragments %u, %d max\n",
+                      umd->length, LNET_MAX_IOV);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/**
+ * Create a memory descriptor and attach it to a ME
+ *
+ * \param meh A handle for a ME to associate the new MD with.
+ * \param umd Provides initial values for the user-visible parts of a MD.
+ * Other than its use for initialization, there is no linkage between this
+ * structure and the MD maintained by the LNet.
+ * \param unlink A flag to indicate whether the MD is automatically unlinked
+ * when it becomes inactive, either because the operation threshold drops to
+ * zero or because the available memory becomes less than \a umd.max_size.
+ * (Note that the check for unlinking a MD only occurs after the completion
+ * of a successful operation on the MD.) The value LNET_UNLINK enables auto
+ * unlinking; the value LNET_RETAIN disables it.
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink().
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a
+ * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by
+ * calling LNetInvalidateHandle() on it.
+ * \retval -EBUSY  If the ME pointed to by \a meh is already associated with
+ * a MD.
+ */
+int
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+            lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+       LIST_HEAD               (matches);
+       LIST_HEAD               (drops);
+       struct lnet_me          *me;
+       struct lnet_libmd       *md;
+       int                     cpt;
+       int                     rc;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (lnet_md_validate(&umd) != 0)
+               return -EINVAL;
+
+       if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+               CERROR("Invalid option: no MD_OP set\n");
+               return -EINVAL;
+       }
+
+       md = lnet_md_alloc(&umd);
+       if (md == NULL)
+               return -ENOMEM;
+
+       rc = lnet_md_build(md, &umd, unlink);
+       cpt = lnet_cpt_of_cookie(meh.cookie);
+
+       lnet_res_lock(cpt);
+       if (rc != 0)
+               goto failed;
+
+       me = lnet_handle2me(&meh);
+       if (me == NULL)
+               rc = -ENOENT;
+       else if (me->me_md != NULL)
+               rc = -EBUSY;
+       else
+               rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+       if (rc != 0)
+               goto failed;
+
+       /* attach this MD to portal of ME and check if it matches any
+        * blocked msgs on this portal */
+       lnet_ptl_attach_md(me, md, &matches, &drops);
+
+       lnet_md2handle(handle, md);
+
+       lnet_res_unlock(cpt);
+
+       lnet_drop_delayed_msg_list(&drops, "Bad match");
+       lnet_recv_delayed_msg_list(&matches);
+
+       return 0;
+
+ failed:
+       lnet_md_free_locked(md);
+
+       lnet_res_unlock(cpt);
+       return rc;
+}
+EXPORT_SYMBOL(LNetMDAttach);
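
To make the ME/MD pairing concrete, here is a sketch of a passive-side setup (illustrative only, not part of this patch; the buffer, length, ME handle and EQ handle are assumed to be supplied by the caller):

/* Illustrative sketch, not part of the patch: describe a contiguous buffer
 * and attach it to an existing ME so it can receive a single PUT. */
static int example_post_recv_buffer(lnet_handle_me_t meh, lnet_handle_eq_t eqh,
				    void *buf, unsigned int len,
				    lnet_handle_md_t *mdh)
{
	lnet_md_t umd;

	memset(&umd, 0, sizeof(umd));
	umd.start     = buf;
	umd.length    = len;
	umd.threshold = 1;		/* retire after one operation */
	umd.options   = LNET_MD_OP_PUT;	/* accept incoming PUTs only */
	umd.eq_handle = eqh;		/* deliver events to this EQ */

	/* LNET_UNLINK: auto-unlink once the threshold drops to zero */
	return LNetMDAttach(meh, umd, LNET_UNLINK, mdh);
}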
+
+/**
+ * Create a "free floating" memory descriptor - a MD that is not associated
+ * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations.
+ *
+ * \param umd,unlink See the discussion for LNetMDAttach().
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(),
+ * and LNetGet() operations.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that
+ * it's OK to supply a NULL \a umd.eq_handle by calling
+ * LNetInvalidateHandle() on it.
+ */
+int
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+       lnet_libmd_t    *md;
+       int             cpt;
+       int             rc;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (lnet_md_validate(&umd) != 0)
+               return -EINVAL;
+
+       if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+               CERROR("Invalid option: GET|PUT illegal on active MDs\n");
+               return -EINVAL;
+       }
+
+       md = lnet_md_alloc(&umd);
+       if (md == NULL)
+               return -ENOMEM;
+
+       rc = lnet_md_build(md, &umd, unlink);
+
+       cpt = lnet_res_lock_current();
+       if (rc != 0)
+               goto failed;
+
+       rc = lnet_md_link(md, umd.eq_handle, cpt);
+       if (rc != 0)
+               goto failed;
+
+       lnet_md2handle(handle, md);
+
+       lnet_res_unlock(cpt);
+       return 0;
+
+ failed:
+       lnet_md_free_locked(md);
+
+       lnet_res_unlock(cpt);
+       return rc;
+}
+EXPORT_SYMBOL(LNetMDBind);
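
By contrast, an active-side MD that will back an outgoing operation is created without any OP bits set; a sketch (illustrative only, not part of this patch):

/* Illustrative sketch, not part of the patch: bind a free-floating MD around
 * a local buffer for use with a later active operation such as LNetPut() or
 * LNetGet().  LNET_MD_OP_GET/LNET_MD_OP_PUT must NOT be set here. */
static int example_bind_source_buffer(void *buf, unsigned int len,
				      lnet_handle_eq_t eqh,
				      lnet_handle_md_t *mdh)
{
	lnet_md_t umd;

	memset(&umd, 0, sizeof(umd));
	umd.start     = buf;
	umd.length    = len;
	umd.threshold = 1;
	umd.options   = 0;		/* no OP bits on an active-side MD */
	umd.eq_handle = eqh;

	return LNetMDBind(umd, LNET_UNLINK, mdh);
}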
+
+/**
+ * Unlink the memory descriptor from any ME it may be linked to and release
+ * the internal resources associated with it.
+ *
+ * This function does not free the memory region associated with the MD;
+ * i.e., the memory the user allocated for this MD. If the ME associated with
+ * this MD is not NULL and was created with auto unlink enabled, the ME is
+ * unlinked as well (see LNetMEAttach()).
+ *
+ * Explicitly unlinking a MD via this function call behaves the same as
+ * automatic unlinking of a MD, except that no LNET_EVENT_UNLINK event is
+ * generated in the latter case.
+ *
+ * An unlinked event can be reported in two ways:
+ * - If there's no pending operations on the MD, it's unlinked immediately
+ *   and an LNET_EVENT_UNLINK event is logged before this function returns.
+ * - Otherwise, the MD is only marked for deletion when this function
+ *   returns, and the unlinked event will be piggybacked on the event of
+ *   the completion of the last operation by setting the unlinked field of
+ *   the event. No dedicated LNET_EVENT_UNLINK event is generated.
+ *
+ * Note that in both cases the unlinked field of the event is always set; no
+ * further events will occur on the MD after such an event is logged.
+ *
+ * \param mdh A handle for the MD to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a mdh does not point to a valid MD object.
+ */
+int
+LNetMDUnlink (lnet_handle_md_t mdh)
+{
+       lnet_event_t    ev;
+       lnet_libmd_t    *md;
+       int             cpt;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       cpt = lnet_cpt_of_cookie(mdh.cookie);
+       lnet_res_lock(cpt);
+
+       md = lnet_handle2md(&mdh);
+       if (md == NULL) {
+               lnet_res_unlock(cpt);
+               return -ENOENT;
+       }
+
+       /* If the MD is busy, lnet_md_unlink just marks it for deletion, and
+        * when the NAL is done, the completion event flags that the MD was
+        * unlinked.  Otherwise, we enqueue an event now... */
+
+       if (md->md_eq != NULL &&
+           md->md_refcount == 0) {
+               lnet_build_unlink_event(md, &ev);
+               lnet_eq_enqueue_event(md->md_eq, &ev);
+       }
+
+       lnet_md_unlink(md);
+
+       lnet_res_unlock(cpt);
+       return 0;
+}
+EXPORT_SYMBOL(LNetMDUnlink);
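
The unlink/event interplay described above can be summarised with a teardown sketch (illustrative only, not part of this patch; it assumes the MD's events are the only ones delivered to the EQ, and uses the unlinked event field mentioned in the comment above):

/* Illustrative sketch, not part of the patch: explicitly unlink an MD and
 * drain its EQ until the final event (with the unlinked field set) is seen. */
static void example_md_teardown(lnet_handle_md_t mdh, lnet_handle_eq_t eqh)
{
	lnet_event_t ev;
	int rc;

	if (LNetMDUnlink(mdh) != 0)
		return;			/* stale or invalid handle */

	do {
		rc = LNetEQWait(eqh, &ev);
		if (rc != 1 && rc != -EOVERFLOW)
			break;
	} while (!ev.unlinked);		/* stop at the MD's last event */
}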
diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c
new file mode 100644 (file)
index 0000000..0081075
--- /dev/null
@@ -0,0 +1,297 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-me.c
+ *
+ * Match Entry management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create and attach a match entry to the match list of \a portal. The new
+ * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach()
+ * can be used to attach a MD to an empty ME.
+ *
+ * \param portal The portal table index where the ME should be attached.
+ * \param match_id Specifies the match criteria for the process ID of
+ * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be
+ * used to wildcard either of the identifiers in the lnet_process_id_t
+ * structure.
+ * \param match_bits,ignore_bits Specify the match criteria to apply
+ * to the match bits in the incoming request. The ignore bits are used
+ * to mask out insignificant bits in the incoming match bits. The resulting
+ * bits are then compared to the ME's match bits to determine if the
+ * incoming request meets the match criteria.
+ * \param unlink Indicates whether the ME should be unlinked when the memory
+ * descriptor associated with it is unlinked (Note that the check for
+ * unlinking a ME only occurs when the memory descriptor is unlinked).
+ * Valid values are LNET_RETAIN and LNET_UNLINK.
+ * \param pos Indicates whether the new ME should be prepended or
+ * appended to the match list. Allowed constants: LNET_INS_BEFORE,
+ * LNET_INS_AFTER.
+ * \param handle On successful returns, a handle to the newly created ME
+ * object is saved here. This handle can be used later in LNetMEInsert(),
+ * LNetMEUnlink(), or LNetMDAttach() functions.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is invalid.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ */
+int
+LNetMEAttach(unsigned int portal,
+            lnet_process_id_t match_id,
+            __u64 match_bits, __u64 ignore_bits,
+            lnet_unlink_t unlink, lnet_ins_pos_t pos,
+            lnet_handle_me_t *handle)
+{
+       struct lnet_match_table *mtable;
+       struct lnet_me          *me;
+       struct list_head                *head;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       if ((int)portal >= the_lnet.ln_nportals)
+               return -EINVAL;
+
+       mtable = lnet_mt_of_attach(portal, match_id,
+                                  match_bits, ignore_bits, pos);
+       if (mtable == NULL) /* can't match portal type */
+               return -EPERM;
+
+       me = lnet_me_alloc();
+       if (me == NULL)
+               return -ENOMEM;
+
+       lnet_res_lock(mtable->mt_cpt);
+
+       me->me_portal = portal;
+       me->me_match_id = match_id;
+       me->me_match_bits = match_bits;
+       me->me_ignore_bits = ignore_bits;
+       me->me_unlink = unlink;
+       me->me_md = NULL;
+
+       lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
+                              &me->me_lh);
+       if (ignore_bits != 0)
+               head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+       else
+               head = lnet_mt_match_head(mtable, match_id, match_bits);
+
+       me->me_pos = head - &mtable->mt_mhash[0];
+       if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
+               list_add_tail(&me->me_list, head);
+       else
+               list_add(&me->me_list, head);
+
+       lnet_me2handle(handle, me);
+
+       lnet_res_unlock(mtable->mt_cpt);
+       return 0;
+}
+EXPORT_SYMBOL(LNetMEAttach);
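
A sketch of posting a wildcard ME at the tail of a portal's match list (illustrative only, not part of this patch; the portal index and match bits are hypothetical example values):

/* Illustrative sketch, not part of the patch: attach an ME that matches any
 * requester on a hypothetical portal, appended to the match list. */
static int example_wildcard_me(lnet_handle_me_t *meh)
{
	lnet_process_id_t any_id;

	any_id.nid = LNET_NID_ANY;	/* match requests from any NID */
	any_id.pid = LNET_PID_ANY;	/* ... and from any PID */

	return LNetMEAttach(12 /* hypothetical portal index */, any_id,
			    0x1ULL,	/* match bits */
			    0x0ULL,	/* ignore bits: exact match required */
			    LNET_UNLINK, LNET_INS_AFTER, meh);
}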
+
+/**
+ * Create a match entry and insert it before or after the ME pointed to by
+ * \a current_meh. The new ME is empty, i.e. not associated with a memory
+ * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME.
+ *
+ * This function is identical to LNetMEAttach() except for the position
+ * where the new ME is inserted.
+ *
+ * \param current_meh A handle for a ME. The new ME will be inserted
+ * immediately before or immediately after this ME.
+ * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion
+ * for LNetMEAttach().
+ *
+ * \retval 0       On success.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ * \retval -ENOENT If \a current_meh does not point to a valid match entry.
+ */
+int
+LNetMEInsert(lnet_handle_me_t current_meh,
+            lnet_process_id_t match_id,
+            __u64 match_bits, __u64 ignore_bits,
+            lnet_unlink_t unlink, lnet_ins_pos_t pos,
+            lnet_handle_me_t *handle)
+{
+       struct lnet_me          *current_me;
+       struct lnet_me          *new_me;
+       struct lnet_portal      *ptl;
+       int                     cpt;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       if (pos == LNET_INS_LOCAL)
+               return -EPERM;
+
+       new_me = lnet_me_alloc();
+       if (new_me == NULL)
+               return -ENOMEM;
+
+       cpt = lnet_cpt_of_cookie(current_meh.cookie);
+
+       lnet_res_lock(cpt);
+
+       current_me = lnet_handle2me(&current_meh);
+       if (current_me == NULL) {
+               lnet_me_free_locked(new_me);
+
+               lnet_res_unlock(cpt);
+               return -ENOENT;
+       }
+
+       LASSERT(current_me->me_portal < the_lnet.ln_nportals);
+
+       ptl = the_lnet.ln_portals[current_me->me_portal];
+       if (lnet_ptl_is_unique(ptl)) {
+               /* makes no sense to insert on a unique portal */
+               lnet_me_free_locked(new_me);
+               lnet_res_unlock(cpt);
+               return -EPERM;
+       }
+
+       new_me->me_pos = current_me->me_pos;
+       new_me->me_portal = current_me->me_portal;
+       new_me->me_match_id = match_id;
+       new_me->me_match_bits = match_bits;
+       new_me->me_ignore_bits = ignore_bits;
+       new_me->me_unlink = unlink;
+       new_me->me_md = NULL;
+
+       lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh);
+
+       if (pos == LNET_INS_AFTER)
+               list_add(&new_me->me_list, &current_me->me_list);
+       else
+               list_add_tail(&new_me->me_list, &current_me->me_list);
+
+       lnet_me2handle(handle, new_me);
+
+       lnet_res_unlock(cpt);
+
+       return 0;
+}
+EXPORT_SYMBOL(LNetMEInsert);
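
Relative insertion is typically used to place a more specific ME ahead of an existing wildcard one; a sketch (illustrative only, not part of this patch; the match bits are hypothetical):

/* Illustrative sketch, not part of the patch: insert a peer-specific ME
 * immediately before an existing wildcard ME so it is matched first. */
static int example_insert_before(lnet_handle_me_t wildcard_meh,
				 lnet_process_id_t peer,
				 lnet_handle_me_t *new_meh)
{
	return LNetMEInsert(wildcard_meh, peer,
			    0x2ULL,	/* match bits for this peer */
			    0x0ULL,	/* no ignore bits */
			    LNET_UNLINK, LNET_INS_BEFORE, new_meh);
}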
+
+/**
+ * Unlink a match entry from its match list.
+ *
+ * This operation also releases any resources associated with the ME. If a
+ * memory descriptor is attached to the ME, then it will be unlinked as well
+ * and an unlink event will be generated. It is an error to use the ME handle
+ * after calling LNetMEUnlink().
+ *
+ * \param meh A handle for the ME to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a meh does not point to a valid ME.
+ * \see LNetMDUnlink() for the discussion on delivering unlink event.
+ */
+int
+LNetMEUnlink(lnet_handle_me_t meh)
+{
+       lnet_me_t       *me;
+       lnet_libmd_t    *md;
+       lnet_event_t    ev;
+       int             cpt;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       cpt = lnet_cpt_of_cookie(meh.cookie);
+       lnet_res_lock(cpt);
+
+       me = lnet_handle2me(&meh);
+       if (me == NULL) {
+               lnet_res_unlock(cpt);
+               return -ENOENT;
+       }
+
+       md = me->me_md;
+       if (md != NULL &&
+           md->md_eq != NULL &&
+           md->md_refcount == 0) {
+               lnet_build_unlink_event(md, &ev);
+               lnet_eq_enqueue_event(md->md_eq, &ev);
+       }
+
+       lnet_me_unlink(me);
+
+       lnet_res_unlock(cpt);
+       return 0;
+}
+EXPORT_SYMBOL(LNetMEUnlink);
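
Finally, tearing an ME down is a single call; the handle must not be reused afterwards (illustrative only, not part of this patch):

/* Illustrative sketch, not part of the patch. */
static void example_me_teardown(lnet_handle_me_t meh)
{
	if (LNetMEUnlink(meh) == -ENOENT)
		CDEBUG(D_NET, "ME handle was already invalid\n");
}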
+
+/* call with lnet_res_lock please */
+void
+lnet_me_unlink(lnet_me_t *me)
+{
+       list_del(&me->me_list);
+
+       if (me->me_md != NULL) {
+               lnet_libmd_t *md = me->me_md;
+
+               /* detach MD from portal of this ME */
+               lnet_ptl_detach_md(me, md);
+               lnet_md_unlink(md);
+       }
+
+       lnet_res_lh_invalidate(&me->me_lh);
+       lnet_me_free_locked(me);
+}
+
+#if 0
+static void
+lib_me_dump(lnet_me_t *me)
+{
+       CWARN("Match Entry %p ("LPX64")\n", me,
+             me->me_lh.lh_cookie);
+
+       CWARN("\tMatch/Ignore\t= %016lx / %016lx\n",
+             me->me_match_bits, me->me_ignore_bits);
+
+       CWARN("\tMD\t= %p\n", me->me_md);
+       CWARN("\tprev\t= %p\n",
+             list_entry(me->me_list.prev, lnet_me_t, me_list));
+       CWARN("\tnext\t= %p\n",
+             list_entry(me->me_list.next, lnet_me_t, me_list));
+}
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
new file mode 100644 (file)
index 0000000..49b0f12
--- /dev/null
@@ -0,0 +1,2441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-move.c
+ *
+ * Data movement routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+static int local_nid_dist_zero = 1;
+CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
+               "Reserved");
+
+int
+lnet_fail_nid (lnet_nid_t nid, unsigned int threshold)
+{
+       lnet_test_peer_t  *tp;
+       struct list_head        *el;
+       struct list_head        *next;
+       struct list_head         cull;
+
+       LASSERT (the_lnet.ln_init);
+
+       /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+       if (threshold != 0) {
+               /* Adding a new entry */
+               LIBCFS_ALLOC(tp, sizeof(*tp));
+               if (tp == NULL)
+                       return -ENOMEM;
+
+               tp->tp_nid = nid;
+               tp->tp_threshold = threshold;
+
+               lnet_net_lock(0);
+               list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers);
+               lnet_net_unlock(0);
+               return 0;
+       }
+
+       /* removing entries */
+       INIT_LIST_HEAD(&cull);
+
+       lnet_net_lock(0);
+
+       list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+               tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+               if (tp->tp_threshold == 0 ||    /* needs culling anyway */
+                   nid == LNET_NID_ANY ||       /* removing all entries */
+                   tp->tp_nid == nid)    /* matched this one */
+               {
+                       list_del (&tp->tp_list);
+                       list_add (&tp->tp_list, &cull);
+               }
+       }
+
+       lnet_net_unlock(0);
+
+       while (!list_empty (&cull)) {
+               tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+
+               list_del (&tp->tp_list);
+               LIBCFS_FREE(tp, sizeof (*tp));
+       }
+       return 0;
+}
+
+static int
+fail_peer (lnet_nid_t nid, int outgoing)
+{
+       lnet_test_peer_t *tp;
+       struct list_head       *el;
+       struct list_head       *next;
+       struct list_head        cull;
+       int            fail = 0;
+
+       INIT_LIST_HEAD (&cull);
+
+       /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+       lnet_net_lock(0);
+
+       list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+               tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+               if (tp->tp_threshold == 0) {
+                       /* zombie entry */
+                       if (outgoing) {
+                               /* only cull zombies on outgoing tests,
+                                * since we may be at interrupt priority on
+                                * incoming messages. */
+                               list_del (&tp->tp_list);
+                               list_add (&tp->tp_list, &cull);
+                       }
+                       continue;
+               }
+
+               if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */
+                   nid == tp->tp_nid) {        /* fail this peer */
+                       fail = 1;
+
+                       if (tp->tp_threshold != LNET_MD_THRESH_INF) {
+                               tp->tp_threshold--;
+                               if (outgoing &&
+                                   tp->tp_threshold == 0) {
+                                       /* see above */
+                                       list_del (&tp->tp_list);
+                                       list_add (&tp->tp_list, &cull);
+                               }
+                       }
+                       break;
+               }
+       }
+
+       lnet_net_unlock(0);
+
+       while (!list_empty (&cull)) {
+               tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+               list_del (&tp->tp_list);
+
+               LIBCFS_FREE(tp, sizeof (*tp));
+       }
+
+       return (fail);
+}
+
+unsigned int
+lnet_iov_nob (unsigned int niov, struct iovec *iov)
+{
+       unsigned int nob = 0;
+
+       while (niov-- > 0)
+               nob += (iov++)->iov_len;
+
+       return (nob);
+}
+EXPORT_SYMBOL(lnet_iov_nob);
+
+void
+lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+                  unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+                  unsigned int nob)
+{
+       /* NB diov, siov are READ-ONLY */
+       unsigned int  this_nob;
+
+       if (nob == 0)
+               return;
+
+       /* skip complete frags before 'doffset' */
+       LASSERT (ndiov > 0);
+       while (doffset >= diov->iov_len) {
+               doffset -= diov->iov_len;
+               diov++;
+               ndiov--;
+               LASSERT (ndiov > 0);
+       }
+
+       /* skip complete frags before 'soffset' */
+       LASSERT (nsiov > 0);
+       while (soffset >= siov->iov_len) {
+               soffset -= siov->iov_len;
+               siov++;
+               nsiov--;
+               LASSERT (nsiov > 0);
+       }
+
+       do {
+               LASSERT (ndiov > 0);
+               LASSERT (nsiov > 0);
+               this_nob = MIN(diov->iov_len - doffset,
+                              siov->iov_len - soffset);
+               this_nob = MIN(this_nob, nob);
+
+               memcpy ((char *)diov->iov_base + doffset,
+                       (char *)siov->iov_base + soffset, this_nob);
+               nob -= this_nob;
+
+               if (diov->iov_len > doffset + this_nob) {
+                       doffset += this_nob;
+               } else {
+                       diov++;
+                       ndiov--;
+                       doffset = 0;
+               }
+
+               if (siov->iov_len > soffset + this_nob) {
+                       soffset += this_nob;
+               } else {
+                       siov++;
+                       nsiov--;
+                       soffset = 0;
+               }
+       } while (nob > 0);
+}
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+
+int
+lnet_extract_iov (int dst_niov, struct iovec *dst,
+                 int src_niov, struct iovec *src,
+                 unsigned int offset, unsigned int len)
+{
+       /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+        * for exactly 'len' bytes, and return the number of entries.
+        * NB not destructive to 'src' */
+       unsigned int    frag_len;
+       unsigned int    niov;
+
+       if (len == 0)                      /* no data => */
+               return (0);                  /* no frags */
+
+       LASSERT (src_niov > 0);
+       while (offset >= src->iov_len) {      /* skip initial frags */
+               offset -= src->iov_len;
+               src_niov--;
+               src++;
+               LASSERT (src_niov > 0);
+       }
+
+       niov = 1;
+       for (;;) {
+               LASSERT (src_niov > 0);
+               LASSERT ((int)niov <= dst_niov);
+
+               frag_len = src->iov_len - offset;
+               dst->iov_base = ((char *)src->iov_base) + offset;
+
+               if (len <= frag_len) {
+                       dst->iov_len = len;
+                       return (niov);
+               }
+
+               dst->iov_len = frag_len;
+
+               len -= frag_len;
+               dst++;
+               src++;
+               niov++;
+               src_niov--;
+               offset = 0;
+       }
+}
+EXPORT_SYMBOL(lnet_extract_iov);
+
+
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
+{
+       unsigned int  nob = 0;
+
+       while (niov-- > 0)
+               nob += (kiov++)->kiov_len;
+
+       return (nob);
+}
+EXPORT_SYMBOL(lnet_kiov_nob);
+
+void
+lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+                    unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+                    unsigned int nob)
+{
+       /* NB diov, siov are READ-ONLY */
+       unsigned int    this_nob;
+       char       *daddr = NULL;
+       char       *saddr = NULL;
+
+       if (nob == 0)
+               return;
+
+       LASSERT (!in_interrupt ());
+
+       LASSERT (ndiov > 0);
+       while (doffset >= diov->kiov_len) {
+               doffset -= diov->kiov_len;
+               diov++;
+               ndiov--;
+               LASSERT (ndiov > 0);
+       }
+
+       LASSERT (nsiov > 0);
+       while (soffset >= siov->kiov_len) {
+               soffset -= siov->kiov_len;
+               siov++;
+               nsiov--;
+               LASSERT (nsiov > 0);
+       }
+
+       do {
+               LASSERT (ndiov > 0);
+               LASSERT (nsiov > 0);
+               this_nob = MIN(diov->kiov_len - doffset,
+                              siov->kiov_len - soffset);
+               this_nob = MIN(this_nob, nob);
+
+               if (daddr == NULL)
+                       daddr = ((char *)kmap(diov->kiov_page)) +
+                               diov->kiov_offset + doffset;
+               if (saddr == NULL)
+                       saddr = ((char *)kmap(siov->kiov_page)) +
+                               siov->kiov_offset + soffset;
+
+               /* Vanishing risk of kmap deadlock when mapping 2 pages.
+                * However in practice at least one of the kiovs will be mapped
+                * kernel pages and the map/unmap will be NOOPs */
+
+               memcpy (daddr, saddr, this_nob);
+               nob -= this_nob;
+
+               if (diov->kiov_len > doffset + this_nob) {
+                       daddr += this_nob;
+                       doffset += this_nob;
+               } else {
+                       kunmap(diov->kiov_page);
+                       daddr = NULL;
+                       diov++;
+                       ndiov--;
+                       doffset = 0;
+               }
+
+               if (siov->kiov_len > soffset + this_nob) {
+                       saddr += this_nob;
+                       soffset += this_nob;
+               } else {
+                       kunmap(siov->kiov_page);
+                       saddr = NULL;
+                       siov++;
+                       nsiov--;
+                       soffset = 0;
+               }
+       } while (nob > 0);
+
+       if (daddr != NULL)
+               kunmap(diov->kiov_page);
+       if (saddr != NULL)
+               kunmap(siov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+
+void
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+                   unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+                   unsigned int nob)
+{
+       /* NB iov, kiov are READ-ONLY */
+       unsigned int    this_nob;
+       char       *addr = NULL;
+
+       if (nob == 0)
+               return;
+
+       LASSERT (!in_interrupt ());
+
+       LASSERT (niov > 0);
+       while (iovoffset >= iov->iov_len) {
+               iovoffset -= iov->iov_len;
+               iov++;
+               niov--;
+               LASSERT (niov > 0);
+       }
+
+       LASSERT (nkiov > 0);
+       while (kiovoffset >= kiov->kiov_len) {
+               kiovoffset -= kiov->kiov_len;
+               kiov++;
+               nkiov--;
+               LASSERT (nkiov > 0);
+       }
+
+       do {
+               LASSERT (niov > 0);
+               LASSERT (nkiov > 0);
+               this_nob = MIN(iov->iov_len - iovoffset,
+                              kiov->kiov_len - kiovoffset);
+               this_nob = MIN(this_nob, nob);
+
+               if (addr == NULL)
+                       addr = ((char *)kmap(kiov->kiov_page)) +
+                               kiov->kiov_offset + kiovoffset;
+
+               memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
+               nob -= this_nob;
+
+               if (iov->iov_len > iovoffset + this_nob) {
+                       iovoffset += this_nob;
+               } else {
+                       iov++;
+                       niov--;
+                       iovoffset = 0;
+               }
+
+               if (kiov->kiov_len > kiovoffset + this_nob) {
+                       addr += this_nob;
+                       kiovoffset += this_nob;
+               } else {
+                       kunmap(kiov->kiov_page);
+                       addr = NULL;
+                       kiov++;
+                       nkiov--;
+                       kiovoffset = 0;
+               }
+
+       } while (nob > 0);
+
+       if (addr != NULL)
+               kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+
+void
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+                   unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+                   unsigned int nob)
+{
+       /* NB kiov, iov are READ-ONLY */
+       unsigned int    this_nob;
+       char       *addr = NULL;
+
+       if (nob == 0)
+               return;
+
+       LASSERT (!in_interrupt ());
+
+       LASSERT (nkiov > 0);
+       while (kiovoffset >= kiov->kiov_len) {
+               kiovoffset -= kiov->kiov_len;
+               kiov++;
+               nkiov--;
+               LASSERT (nkiov > 0);
+       }
+
+       LASSERT (niov > 0);
+       while (iovoffset >= iov->iov_len) {
+               iovoffset -= iov->iov_len;
+               iov++;
+               niov--;
+               LASSERT (niov > 0);
+       }
+
+       do {
+               LASSERT (nkiov > 0);
+               LASSERT (niov > 0);
+               this_nob = MIN(kiov->kiov_len - kiovoffset,
+                              iov->iov_len - iovoffset);
+               this_nob = MIN(this_nob, nob);
+
+               if (addr == NULL)
+                       addr = ((char *)kmap(kiov->kiov_page)) +
+                               kiov->kiov_offset + kiovoffset;
+
+               memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
+               nob -= this_nob;
+
+               if (kiov->kiov_len > kiovoffset + this_nob) {
+                       addr += this_nob;
+                       kiovoffset += this_nob;
+               } else {
+                       kunmap(kiov->kiov_page);
+                       addr = NULL;
+                       kiov++;
+                       nkiov--;
+                       kiovoffset = 0;
+               }
+
+               if (iov->iov_len > iovoffset + this_nob) {
+                       iovoffset += this_nob;
+               } else {
+                       iov++;
+                       niov--;
+                       iovoffset = 0;
+               }
+       } while (nob > 0);
+
+       if (addr != NULL)
+               kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+
+int
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+                  int src_niov, lnet_kiov_t *src,
+                  unsigned int offset, unsigned int len)
+{
+       /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+        * for exactly 'len' bytes, and return the number of entries.
+        * NB not destructive to 'src' */
+       unsigned int    frag_len;
+       unsigned int    niov;
+
+       if (len == 0)                      /* no data => */
+               return (0);                  /* no frags */
+
+       LASSERT (src_niov > 0);
+       while (offset >= src->kiov_len) {      /* skip initial frags */
+               offset -= src->kiov_len;
+               src_niov--;
+               src++;
+               LASSERT (src_niov > 0);
+       }
+
+       niov = 1;
+       for (;;) {
+               LASSERT (src_niov > 0);
+               LASSERT ((int)niov <= dst_niov);
+
+               frag_len = src->kiov_len - offset;
+               dst->kiov_page = src->kiov_page;
+               dst->kiov_offset = src->kiov_offset + offset;
+
+               if (len <= frag_len) {
+                       dst->kiov_len = len;
+                       LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+                       return (niov);
+               }
+
+               dst->kiov_len = frag_len;
+               LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+
+               len -= frag_len;
+               dst++;
+               src++;
+               niov++;
+               src_niov--;
+               offset = 0;
+       }
+}
+EXPORT_SYMBOL(lnet_extract_kiov);
+
+void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+            unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       unsigned int  niov = 0;
+       struct iovec *iov = NULL;
+       lnet_kiov_t  *kiov = NULL;
+       int        rc;
+
+       LASSERT (!in_interrupt ());
+       LASSERT (mlen == 0 || msg != NULL);
+
+       if (msg != NULL) {
+               LASSERT(msg->msg_receiving);
+               LASSERT(!msg->msg_sending);
+               LASSERT(rlen == msg->msg_len);
+               LASSERT(mlen <= msg->msg_len);
+               LASSERT(msg->msg_offset == offset);
+               LASSERT(msg->msg_wanted == mlen);
+
+               msg->msg_receiving = 0;
+
+               if (mlen != 0) {
+                       niov = msg->msg_niov;
+                       iov  = msg->msg_iov;
+                       kiov = msg->msg_kiov;
+
+                       LASSERT (niov > 0);
+                       LASSERT ((iov == NULL) != (kiov == NULL));
+               }
+       }
+
+       rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
+                                   niov, iov, kiov, offset, mlen, rlen);
+       if (rc < 0)
+               lnet_finalize(ni, msg, rc);
+}
+
+void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+       lnet_libmd_t *md = msg->msg_md;
+
+       LASSERT (msg->msg_len > 0);
+       LASSERT (!msg->msg_routing);
+       LASSERT (md != NULL);
+       LASSERT (msg->msg_niov == 0);
+       LASSERT (msg->msg_iov == NULL);
+       LASSERT (msg->msg_kiov == NULL);
+
+       msg->msg_niov = md->md_niov;
+       if ((md->md_options & LNET_MD_KIOV) != 0)
+               msg->msg_kiov = md->md_iov.kiov;
+       else
+               msg->msg_iov = md->md_iov.iov;
+}
+
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+              unsigned int offset, unsigned int len)
+{
+       msg->msg_type = type;
+       msg->msg_target = target;
+       msg->msg_len = len;
+       msg->msg_offset = offset;
+
+       if (len != 0)
+               lnet_setpayloadbuffer(msg);
+
+       memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+       msg->msg_hdr.type          = cpu_to_le32(type);
+       msg->msg_hdr.dest_nid       = cpu_to_le64(target.nid);
+       msg->msg_hdr.dest_pid       = cpu_to_le32(target.pid);
+       /* src_nid will be set later */
+       msg->msg_hdr.src_pid    = cpu_to_le32(the_lnet.ln_pid);
+       msg->msg_hdr.payload_length = cpu_to_le32(len);
+}
+
+void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       void   *priv = msg->msg_private;
+       int     rc;
+
+       LASSERT (!in_interrupt ());
+       LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+                (msg->msg_txcredit && msg->msg_peertxcredit));
+
+       rc = (ni->ni_lnd->lnd_send)(ni, priv, msg);
+       if (rc < 0)
+               lnet_finalize(ni, msg, rc);
+}
+
+int
+lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       int     rc;
+
+       LASSERT(!msg->msg_sending);
+       LASSERT(msg->msg_receiving);
+       LASSERT(!msg->msg_rx_ready_delay);
+       LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+
+       msg->msg_rx_ready_delay = 1;
+       rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
+                                         &msg->msg_private);
+       if (rc != 0) {
+               CERROR("recv from %s / send to %s aborted: "
+                      "eager_recv failed %d\n",
+                      libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+                      libcfs_id2str(msg->msg_target), rc);
+               LASSERT(rc < 0); /* required by my callers */
+       }
+
+       return rc;
+}
+
+/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */
+void
+lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+       cfs_time_t last_alive = 0;
+
+       LASSERT(lnet_peer_aliveness_enabled(lp));
+       LASSERT(ni->ni_lnd->lnd_query != NULL);
+
+       lnet_net_unlock(lp->lp_cpt);
+       (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+       lnet_net_lock(lp->lp_cpt);
+
+       lp->lp_last_query = cfs_time_current();
+
+       if (last_alive != 0) /* NI has updated timestamp */
+               lp->lp_last_alive = last_alive;
+}
+
+/* NB: always called with lnet_net_lock held */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+{
+       int     alive;
+       cfs_time_t deadline;
+
+       LASSERT (lnet_peer_aliveness_enabled(lp));
+
+       /* Trust lnet_notify() if it has more recent aliveness news, but
+        * ignore the initial assumed death (see lnet_peers_start_down()).
+        */
+       if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+           cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+               return 0;
+
+       deadline = cfs_time_add(lp->lp_last_alive,
+                               cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+       alive = cfs_time_after(deadline, now);
+
+       /* Update obsolete lp_alive except for routers assumed to be dead
+        * initially, because router checker would update aliveness in this
+        * case, and moreover lp_last_alive at peer creation is assumed.
+        */
+       if (alive && !lp->lp_alive &&
+           !(lnet_isrouter(lp) && lp->lp_alive_count == 0))
+               lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+       return alive;
+}
+
+
+/* NB: returns 1 when alive, 0 when dead, negative when error;
+ *     may drop the lnet_net_lock */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+       cfs_time_t now = cfs_time_current();
+
+       if (!lnet_peer_aliveness_enabled(lp))
+               return -ENODEV;
+
+       if (lnet_peer_is_alive(lp, now))
+               return 1;
+
+       /* Peer appears dead, but we should avoid frequent NI queries (at
+        * most once per lnet_queryinterval seconds). */
+       if (lp->lp_last_query != 0) {
+               static const int lnet_queryinterval = 1;
+
+               cfs_time_t next_query =
+                          cfs_time_add(lp->lp_last_query,
+                                       cfs_time_seconds(lnet_queryinterval));
+
+               if (cfs_time_before(now, next_query)) {
+                       if (lp->lp_alive)
+                               CWARN("Unexpected aliveness of peer %s: "
+                                     "%d < %d (%d/%d)\n",
+                                     libcfs_nid2str(lp->lp_nid),
+                                     (int)now, (int)next_query,
+                                     lnet_queryinterval,
+                                     lp->lp_ni->ni_peertimeout);
+                       return 0;
+               }
+       }
+
+       /* query NI for latest aliveness news */
+       lnet_ni_query_locked(lp->lp_ni, lp);
+
+       if (lnet_peer_is_alive(lp, now))
+               return 1;
+
+       lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+       return 0;
+}
+
+int
+lnet_post_send_locked(lnet_msg_t *msg, int do_send)
+{
+       /* lnet_send is going to lnet_net_unlock immediately after this,
+        * so it sets do_send FALSE and I don't do the unlock/send/lock bit.
+        * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer
+        * appears dead, and 0 if sent or OK to send */
+       struct lnet_peer        *lp = msg->msg_txpeer;
+       struct lnet_ni          *ni = lp->lp_ni;
+       struct lnet_tx_queue    *tq;
+       int                     cpt;
+
+       /* non-lnet_send() callers have checked before */
+       LASSERT(!do_send || msg->msg_tx_delayed);
+       LASSERT(!msg->msg_receiving);
+       LASSERT(msg->msg_tx_committed);
+
+       cpt = msg->msg_tx_cpt;
+       tq = ni->ni_tx_queues[cpt];
+
+       /* NB 'lp' is always the next hop */
+       if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+           lnet_peer_alive_locked(lp) == 0) {
+               the_lnet.ln_counters[cpt]->drop_count++;
+               the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+               lnet_net_unlock(cpt);
+
+               CNETERR("Dropping message for %s: peer not alive\n",
+                       libcfs_id2str(msg->msg_target));
+               if (do_send)
+                       lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+               lnet_net_lock(cpt);
+               return EHOSTUNREACH;
+       }
+
+       if (!msg->msg_peertxcredit) {
+               LASSERT ((lp->lp_txcredits < 0) ==
+                        !list_empty(&lp->lp_txq));
+
+               msg->msg_peertxcredit = 1;
+               lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+               lp->lp_txcredits--;
+
+               if (lp->lp_txcredits < lp->lp_mintxcredits)
+                       lp->lp_mintxcredits = lp->lp_txcredits;
+
+               if (lp->lp_txcredits < 0) {
+                       msg->msg_tx_delayed = 1;
+                       list_add_tail(&msg->msg_list, &lp->lp_txq);
+                       return EAGAIN;
+               }
+       }
+
+       if (!msg->msg_txcredit) {
+               LASSERT((tq->tq_credits < 0) ==
+                       !list_empty(&tq->tq_delayed));
+
+               msg->msg_txcredit = 1;
+               tq->tq_credits--;
+
+               if (tq->tq_credits < tq->tq_credits_min)
+                       tq->tq_credits_min = tq->tq_credits;
+
+               if (tq->tq_credits < 0) {
+                       msg->msg_tx_delayed = 1;
+                       list_add_tail(&msg->msg_list, &tq->tq_delayed);
+                       return EAGAIN;
+               }
+       }
+
+       if (do_send) {
+               lnet_net_unlock(cpt);
+               lnet_ni_send(ni, msg);
+               lnet_net_lock(cpt);
+       }
+       return 0;
+}
+
+
+lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg)
+{
+       lnet_rtrbufpool_t       *rbp;
+       int                     cpt;
+
+       LASSERT(msg->msg_rx_committed);
+
+       cpt = msg->msg_rx_cpt;
+       rbp = &the_lnet.ln_rtrpools[cpt][0];
+
+       LASSERT(msg->msg_len <= LNET_MTU);
+       while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) {
+               rbp++;
+               LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]);
+       }
+
+       return rbp;
+}
+
+int
+lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
+{
+       /* lnet_parse is going to lnet_net_unlock immediately after this, so it
+        * sets do_recv FALSE and I don't do the unlock/send/lock bit.  I
+        * return EAGAIN if msg blocked and 0 if received or OK to receive */
+       lnet_peer_t      *lp = msg->msg_rxpeer;
+       lnet_rtrbufpool_t   *rbp;
+       lnet_rtrbuf_t       *rb;
+
+       LASSERT (msg->msg_iov == NULL);
+       LASSERT (msg->msg_kiov == NULL);
+       LASSERT (msg->msg_niov == 0);
+       LASSERT (msg->msg_routing);
+       LASSERT (msg->msg_receiving);
+       LASSERT (!msg->msg_sending);
+
+       /* non-lnet_parse callers only receive delayed messages */
+       LASSERT(!do_recv || msg->msg_rx_delayed);
+
+       if (!msg->msg_peerrtrcredit) {
+               LASSERT ((lp->lp_rtrcredits < 0) ==
+                        !list_empty(&lp->lp_rtrq));
+
+               msg->msg_peerrtrcredit = 1;
+               lp->lp_rtrcredits--;
+               if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+                       lp->lp_minrtrcredits = lp->lp_rtrcredits;
+
+               if (lp->lp_rtrcredits < 0) {
+                       /* must have checked eager_recv before here */
+                       LASSERT(msg->msg_rx_ready_delay);
+                       msg->msg_rx_delayed = 1;
+                       list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+                       return EAGAIN;
+               }
+       }
+
+       rbp = lnet_msg2bufpool(msg);
+
+       if (!msg->msg_rtrcredit) {
+               LASSERT ((rbp->rbp_credits < 0) ==
+                        !list_empty(&rbp->rbp_msgs));
+
+               msg->msg_rtrcredit = 1;
+               rbp->rbp_credits--;
+               if (rbp->rbp_credits < rbp->rbp_mincredits)
+                       rbp->rbp_mincredits = rbp->rbp_credits;
+
+               if (rbp->rbp_credits < 0) {
+                       /* must have checked eager_recv before here */
+                       LASSERT(msg->msg_rx_ready_delay);
+                       msg->msg_rx_delayed = 1;
+                       list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+                       return EAGAIN;
+               }
+       }
+
+       LASSERT (!list_empty(&rbp->rbp_bufs));
+       rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+       list_del(&rb->rb_list);
+
+       msg->msg_niov = rbp->rbp_npages;
+       msg->msg_kiov = &rb->rb_kiov[0];
+
+       if (do_recv) {
+               int cpt = msg->msg_rx_cpt;
+
+               lnet_net_unlock(cpt);
+               lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+                            0, msg->msg_len, msg->msg_len);
+               lnet_net_lock(cpt);
+       }
+       return 0;
+}
+
+void
+lnet_return_tx_credits_locked(lnet_msg_t *msg)
+{
+       lnet_peer_t     *txpeer = msg->msg_txpeer;
+       lnet_msg_t      *msg2;
+
+       if (msg->msg_txcredit) {
+               struct lnet_ni       *ni = txpeer->lp_ni;
+               struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
+
+               /* give back NI txcredits */
+               msg->msg_txcredit = 0;
+
+               LASSERT((tq->tq_credits < 0) ==
+                       !list_empty(&tq->tq_delayed));
+
+               tq->tq_credits++;
+               if (tq->tq_credits <= 0) {
+                       msg2 = list_entry(tq->tq_delayed.next,
+                                             lnet_msg_t, msg_list);
+                       list_del(&msg2->msg_list);
+
+                       LASSERT(msg2->msg_txpeer->lp_ni == ni);
+                       LASSERT(msg2->msg_tx_delayed);
+
+                       (void) lnet_post_send_locked(msg2, 1);
+               }
+       }
+
+       if (msg->msg_peertxcredit) {
+               /* give back peer txcredits */
+               msg->msg_peertxcredit = 0;
+
+               LASSERT((txpeer->lp_txcredits < 0) ==
+                       !list_empty(&txpeer->lp_txq));
+
+               txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+               LASSERT (txpeer->lp_txqnob >= 0);
+
+               txpeer->lp_txcredits++;
+               if (txpeer->lp_txcredits <= 0) {
+                       msg2 = list_entry(txpeer->lp_txq.next,
+                                             lnet_msg_t, msg_list);
+                       list_del(&msg2->msg_list);
+
+                       LASSERT(msg2->msg_txpeer == txpeer);
+                       LASSERT(msg2->msg_tx_delayed);
+
+                       (void) lnet_post_send_locked(msg2, 1);
+               }
+       }
+
+       if (txpeer != NULL) {
+               msg->msg_txpeer = NULL;
+               lnet_peer_decref_locked(txpeer);
+       }
+}
+
+void
+lnet_return_rx_credits_locked(lnet_msg_t *msg)
+{
+       lnet_peer_t     *rxpeer = msg->msg_rxpeer;
+       lnet_msg_t      *msg2;
+
+       if (msg->msg_rtrcredit) {
+               /* give back global router credits */
+               lnet_rtrbuf_t     *rb;
+               lnet_rtrbufpool_t *rbp;
+
+               /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+                * there until it gets one allocated, or aborts the wait
+                * itself */
+               LASSERT (msg->msg_kiov != NULL);
+
+               rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+               rbp = rb->rb_pool;
+               LASSERT (rbp == lnet_msg2bufpool(msg));
+
+               msg->msg_kiov = NULL;
+               msg->msg_rtrcredit = 0;
+
+               LASSERT((rbp->rbp_credits < 0) ==
+                       !list_empty(&rbp->rbp_msgs));
+               LASSERT((rbp->rbp_credits > 0) ==
+                       !list_empty(&rbp->rbp_bufs));
+
+               list_add(&rb->rb_list, &rbp->rbp_bufs);
+               rbp->rbp_credits++;
+               if (rbp->rbp_credits <= 0) {
+                       msg2 = list_entry(rbp->rbp_msgs.next,
+                                             lnet_msg_t, msg_list);
+                       list_del(&msg2->msg_list);
+
+                       (void) lnet_post_routed_recv_locked(msg2, 1);
+               }
+       }
+
+       if (msg->msg_peerrtrcredit) {
+               /* give back peer router credits */
+               msg->msg_peerrtrcredit = 0;
+
+               LASSERT((rxpeer->lp_rtrcredits < 0) ==
+                       !list_empty(&rxpeer->lp_rtrq));
+
+               rxpeer->lp_rtrcredits++;
+               if (rxpeer->lp_rtrcredits <= 0) {
+                       msg2 = list_entry(rxpeer->lp_rtrq.next,
+                                             lnet_msg_t, msg_list);
+                       list_del(&msg2->msg_list);
+
+                       (void) lnet_post_routed_recv_locked(msg2, 1);
+               }
+       }
+       if (rxpeer != NULL) {
+               msg->msg_rxpeer = NULL;
+               lnet_peer_decref_locked(rxpeer);
+       }
+}
+
+static int
+lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
+{
+       lnet_peer_t *p1 = r1->lr_gateway;
+       lnet_peer_t *p2 = r2->lr_gateway;
+
+       if (r1->lr_hops < r2->lr_hops)
+               return 1;
+
+       if (r1->lr_hops > r2->lr_hops)
+               return -1;
+
+       if (p1->lp_txqnob < p2->lp_txqnob)
+               return 1;
+
+       if (p1->lp_txqnob > p2->lp_txqnob)
+               return -1;
+
+       if (p1->lp_txcredits > p2->lp_txcredits)
+               return 1;
+
+       if (p1->lp_txcredits < p2->lp_txcredits)
+               return -1;
+
+       if (r1->lr_seq - r2->lr_seq <= 0)
+               return 1;
+
+       return -1;
+}
+
+static lnet_peer_t *
+lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+{
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *rtr;
+       lnet_route_t            *rtr_best;
+       lnet_route_t            *rtr_last;
+       struct lnet_peer        *lp_best;
+       struct lnet_peer        *lp;
+       int                     rc;
+
+       /* If @rtr_nid is not LNET_NID_ANY, return the gateway with
+        * NID rtr_nid; otherwise find the best gateway I can use */
+
+       rnet = lnet_find_net_locked(LNET_NIDNET(target));
+       if (rnet == NULL)
+               return NULL;
+
+       lp_best = NULL;
+       rtr_best = rtr_last = NULL;
+       list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) {
+               lp = rtr->lr_gateway;
+
+               if (!lp->lp_alive || /* gateway is down */
+                   ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 &&
+                    rtr->lr_downis != 0)) /* NI to target is down */
+                       continue;
+
+               if (ni != NULL && lp->lp_ni != ni)
+                       continue;
+
+               if (lp->lp_nid == rtr_nid) /* it's pre-determined router */
+                       return lp;
+
+               if (lp_best == NULL) {
+                       rtr_best = rtr_last = rtr;
+                       lp_best = lp;
+                       continue;
+               }
+
+               /* no protection on the fields below, but it's harmless */
+               if (rtr_last->lr_seq - rtr->lr_seq < 0)
+                       rtr_last = rtr;
+
+               rc = lnet_compare_routes(rtr, rtr_best);
+               if (rc < 0)
+                       continue;
+
+               rtr_best = rtr;
+               lp_best = lp;
+       }
+
+       /* set the sequence number on the best router to the latest
+        * sequence + 1 so we can round-robin all routers; it's racy and
+        * inaccurate but harmless and functional */
+       if (rtr_best != NULL)
+               rtr_best->lr_seq = rtr_last->lr_seq + 1;
+       return lp_best;
+}
+
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+       lnet_nid_t              dst_nid = msg->msg_target.nid;
+       struct lnet_ni          *src_ni;
+       struct lnet_ni          *local_ni;
+       struct lnet_peer        *lp;
+       int                     cpt;
+       int                     cpt2;
+       int                     rc;
+
+       /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+        * but we might want to use pre-determined router for ACK/REPLY
+        * in the future */
+       /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+       LASSERT (msg->msg_txpeer == NULL);
+       LASSERT (!msg->msg_sending);
+       LASSERT (!msg->msg_target_is_router);
+       LASSERT (!msg->msg_receiving);
+
+       msg->msg_sending = 1;
+
+       LASSERT(!msg->msg_tx_committed);
+       cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
+ again:
+       lnet_net_lock(cpt);
+
+       if (the_lnet.ln_shutdown) {
+               lnet_net_unlock(cpt);
+               return -ESHUTDOWN;
+       }
+
+       if (src_nid == LNET_NID_ANY) {
+               src_ni = NULL;
+       } else {
+               src_ni = lnet_nid2ni_locked(src_nid, cpt);
+               if (src_ni == NULL) {
+                       lnet_net_unlock(cpt);
+                       LCONSOLE_WARN("Can't send to %s: src %s is not a "
+                                     "local nid\n", libcfs_nid2str(dst_nid),
+                                     libcfs_nid2str(src_nid));
+                       return -EINVAL;
+               }
+               LASSERT (!msg->msg_routing);
+       }
+
+       /* Is this for someone on a local network? */
+       local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+
+       if (local_ni != NULL) {
+               if (src_ni == NULL) {
+                       src_ni = local_ni;
+                       src_nid = src_ni->ni_nid;
+               } else if (src_ni == local_ni) {
+                       lnet_ni_decref_locked(local_ni, cpt);
+               } else {
+                       lnet_ni_decref_locked(local_ni, cpt);
+                       lnet_ni_decref_locked(src_ni, cpt);
+                       lnet_net_unlock(cpt);
+                       LCONSOLE_WARN("No route to %s from %s\n",
+                                     libcfs_nid2str(dst_nid),
+                                     libcfs_nid2str(src_nid));
+                       return -EINVAL;
+               }
+
+               LASSERT(src_nid != LNET_NID_ANY);
+               lnet_msg_commit(msg, cpt);
+
+               if (!msg->msg_routing)
+                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+
+               if (src_ni == the_lnet.ln_loni) {
+                       /* No send credit hassles with LOLND */
+                       lnet_net_unlock(cpt);
+                       lnet_ni_send(src_ni, msg);
+
+                       lnet_net_lock(cpt);
+                       lnet_ni_decref_locked(src_ni, cpt);
+                       lnet_net_unlock(cpt);
+                       return 0;
+               }
+
+               rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
+               /* lp has ref on src_ni; lose mine */
+               lnet_ni_decref_locked(src_ni, cpt);
+               if (rc != 0) {
+                       lnet_net_unlock(cpt);
+                       LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+                                     libcfs_nid2str(dst_nid));
+                       /* ENOMEM or shutting down */
+                       return rc;
+               }
+               LASSERT (lp->lp_ni == src_ni);
+       } else {
+               /* sending to a remote network */
+               lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
+               if (lp == NULL) {
+                       if (src_ni != NULL)
+                               lnet_ni_decref_locked(src_ni, cpt);
+                       lnet_net_unlock(cpt);
+
+                       LCONSOLE_WARN("No route to %s via %s "
+                                     "(all routers down)\n",
+                                     libcfs_id2str(msg->msg_target),
+                                     libcfs_nid2str(src_nid));
+                       return -EHOSTUNREACH;
+               }
+
+               /* rtr_nid is LNET_NID_ANY or the NID of a pre-determined
+                * router; it's possible that rtr_nid isn't LNET_NID_ANY and
+                * lp isn't the pre-determined router, which can happen if
+                * the routing table was changed while the lock was dropped */
+               if (rtr_nid != lp->lp_nid) {
+                       cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
+                       if (cpt2 != cpt) {
+                               if (src_ni != NULL)
+                                       lnet_ni_decref_locked(src_ni, cpt);
+                               lnet_net_unlock(cpt);
+
+                               rtr_nid = lp->lp_nid;
+                               cpt = cpt2;
+                               goto again;
+                       }
+               }
+
+               CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+                      libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+                      lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+
+               if (src_ni == NULL) {
+                       src_ni = lp->lp_ni;
+                       src_nid = src_ni->ni_nid;
+               } else {
+                       LASSERT (src_ni == lp->lp_ni);
+                       lnet_ni_decref_locked(src_ni, cpt);
+               }
+
+               lnet_peer_addref_locked(lp);
+
+               LASSERT(src_nid != LNET_NID_ANY);
+               lnet_msg_commit(msg, cpt);
+
+               if (!msg->msg_routing) {
+                       /* I'm the source and now I know which NI to send on */
+                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+               }
+
+               msg->msg_target_is_router = 1;
+               msg->msg_target.nid = lp->lp_nid;
+               msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+       }
+
+       /* 'lp' is our best choice of peer */
+
+       LASSERT (!msg->msg_peertxcredit);
+       LASSERT (!msg->msg_txcredit);
+       LASSERT (msg->msg_txpeer == NULL);
+
+       msg->msg_txpeer = lp;              /* msg takes my ref on lp */
+
+       rc = lnet_post_send_locked(msg, 0);
+       lnet_net_unlock(cpt);
+
+       if (rc == EHOSTUNREACH)
+               return -EHOSTUNREACH;
+
+       if (rc == 0)
+               lnet_ni_send(src_ni, msg);
+
+       return 0;
+}
+
+static void
+lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob)
+{
+       lnet_net_lock(cpt);
+       the_lnet.ln_counters[cpt]->drop_count++;
+       the_lnet.ln_counters[cpt]->drop_length += nob;
+       lnet_net_unlock(cpt);
+
+       lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
+}
+
+static void
+lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       lnet_hdr_t      *hdr = &msg->msg_hdr;
+
+       if (msg->msg_wanted != 0)
+               lnet_setpayloadbuffer(msg);
+
+       lnet_build_msg_event(msg, LNET_EVENT_PUT);
+
+       /* Must I ACK?  If so I'll grab the ack_wmd out of the header and put
+        * it back into the ACK during lnet_finalize() */
+       msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+                       (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0);
+
+       lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed,
+                    msg->msg_offset, msg->msg_wanted, hdr->payload_length);
+}
+
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       lnet_hdr_t              *hdr = &msg->msg_hdr;
+       struct lnet_match_info  info;
+       int                     rc;
+
+       /* Convert put fields to host byte order */
+       hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+       hdr->msg.put.ptl_index  = le32_to_cpu(hdr->msg.put.ptl_index);
+       hdr->msg.put.offset     = le32_to_cpu(hdr->msg.put.offset);
+
+       info.mi_id.nid  = hdr->src_nid;
+       info.mi_id.pid  = hdr->src_pid;
+       info.mi_opc     = LNET_MD_OP_PUT;
+       info.mi_portal  = hdr->msg.put.ptl_index;
+       info.mi_rlength = hdr->payload_length;
+       info.mi_roffset = hdr->msg.put.offset;
+       info.mi_mbits   = hdr->msg.put.match_bits;
+
+       msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL;
+
+ again:
+       rc = lnet_ptl_match_md(&info, msg);
+       switch (rc) {
+       default:
+               LBUG();
+
+       case LNET_MATCHMD_OK:
+               lnet_recv_put(ni, msg);
+               return 0;
+
+       case LNET_MATCHMD_NONE:
+               if (msg->msg_rx_delayed) /* attached on delayed list */
+                       return 0;
+
+               rc = lnet_ni_eager_recv(ni, msg);
+               if (rc == 0)
+                       goto again;
+               /* fall through */
+
+       case LNET_MATCHMD_DROP:
+               CNETERR("Dropping PUT from %s portal %d match "LPU64
+                       " offset %d length %d: %d\n",
+                       libcfs_id2str(info.mi_id), info.mi_portal,
+                       info.mi_mbits, info.mi_roffset, info.mi_rlength, rc);
+
+               return ENOENT;  /* +ve: OK but no match */
+       }
+}
+
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
+{
+       struct lnet_match_info  info;
+       lnet_hdr_t              *hdr = &msg->msg_hdr;
+       lnet_handle_wire_t      reply_wmd;
+       int                     rc;
+
+       /* Convert get fields to host byte order */
+       hdr->msg.get.match_bits   = le64_to_cpu(hdr->msg.get.match_bits);
+       hdr->msg.get.ptl_index    = le32_to_cpu(hdr->msg.get.ptl_index);
+       hdr->msg.get.sink_length  = le32_to_cpu(hdr->msg.get.sink_length);
+       hdr->msg.get.src_offset   = le32_to_cpu(hdr->msg.get.src_offset);
+
+       info.mi_id.nid  = hdr->src_nid;
+       info.mi_id.pid  = hdr->src_pid;
+       info.mi_opc     = LNET_MD_OP_GET;
+       info.mi_portal  = hdr->msg.get.ptl_index;
+       info.mi_rlength = hdr->msg.get.sink_length;
+       info.mi_roffset = hdr->msg.get.src_offset;
+       info.mi_mbits   = hdr->msg.get.match_bits;
+
+       rc = lnet_ptl_match_md(&info, msg);
+       if (rc == LNET_MATCHMD_DROP) {
+               CNETERR("Dropping GET from %s portal %d match "LPU64
+                       " offset %d length %d\n",
+                       libcfs_id2str(info.mi_id), info.mi_portal,
+                       info.mi_mbits, info.mi_roffset, info.mi_rlength);
+               return ENOENT;  /* +ve: OK but no match */
+       }
+
+       LASSERT(rc == LNET_MATCHMD_OK);
+
+       lnet_build_msg_event(msg, LNET_EVENT_GET);
+
+       reply_wmd = hdr->msg.get.return_wmd;
+
+       lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+                      msg->msg_offset, msg->msg_wanted);
+
+       msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
+
+       if (rdma_get) {
+               /* The LND completes the REPLY from her recv procedure */
+               lnet_ni_recv(ni, msg->msg_private, msg, 0,
+                            msg->msg_offset, msg->msg_len, msg->msg_len);
+               return 0;
+       }
+
+       lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+       msg->msg_receiving = 0;
+
+       rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
+       if (rc < 0) {
+               /* didn't get as far as lnet_ni_send() */
+               CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+                      libcfs_nid2str(ni->ni_nid),
+                      libcfs_id2str(info.mi_id), rc);
+
+               lnet_finalize(ni, msg, rc);
+       }
+
+       return 0;
+}
+
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       void         *private = msg->msg_private;
+       lnet_hdr_t       *hdr = &msg->msg_hdr;
+       lnet_process_id_t src = {0};
+       lnet_libmd_t     *md;
+       int            rlength;
+       int            mlength;
+       int                     cpt;
+
+       cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie);
+       lnet_res_lock(cpt);
+
+       src.nid = hdr->src_nid;
+       src.pid = hdr->src_pid;
+
+       /* NB handles only looked up by creator (no flips) */
+       md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+       if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+               CNETERR("%s: Dropping REPLY from %s for %s "
+                       "MD "LPX64"."LPX64"\n",
+                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                       (md == NULL) ? "invalid" : "inactive",
+                       hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                       hdr->msg.reply.dst_wmd.wh_object_cookie);
+               if (md != NULL && md->md_me != NULL)
+                       CERROR("REPLY MD also attached to portal %d\n",
+                              md->md_me->me_portal);
+
+               lnet_res_unlock(cpt);
+               return ENOENT;            /* +ve: OK but no match */
+       }
+
+       LASSERT (md->md_offset == 0);
+
+       rlength = hdr->payload_length;
+       mlength = MIN(rlength, (int)md->md_length);
+
+       if (mlength < rlength &&
+           (md->md_options & LNET_MD_TRUNCATE) == 0) {
+               CNETERR("%s: Dropping REPLY from %s length %d "
+                       "for MD "LPX64" would overflow (%d)\n",
+                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                       rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+                       mlength);
+               lnet_res_unlock(cpt);
+               return ENOENT;    /* +ve: OK but no match */
+       }
+
+       CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n",
+              libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+              mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+       lnet_msg_attach_md(msg, md, 0, mlength);
+
+       if (mlength != 0)
+               lnet_setpayloadbuffer(msg);
+
+       lnet_res_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+       lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+       return 0;
+}
+
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       lnet_hdr_t       *hdr = &msg->msg_hdr;
+       lnet_process_id_t src = {0};
+       lnet_libmd_t     *md;
+       int                     cpt;
+
+       src.nid = hdr->src_nid;
+       src.pid = hdr->src_pid;
+
+       /* Convert ack fields to host byte order */
+       hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+       hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
+
+       cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie);
+       lnet_res_lock(cpt);
+
+       /* NB handles only looked up by creator (no flips) */
+       md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+       if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+               /* Don't moan; this is expected */
+               CDEBUG(D_NET,
+                      "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n",
+                      libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                      (md == NULL) ? "invalid" : "inactive",
+                      hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                      hdr->msg.ack.dst_wmd.wh_object_cookie);
+               if (md != NULL && md->md_me != NULL)
+                       CERROR("Source MD also attached to portal %d\n",
+                              md->md_me->me_portal);
+
+               lnet_res_unlock(cpt);
+               return ENOENT;            /* +ve! */
+       }
+
+       CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n",
+              libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+              hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+       lnet_msg_attach_md(msg, md, 0, 0);
+
+       lnet_res_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_ACK);
+
+       lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+       return 0;
+}
+
+static int
+lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       int     rc = 0;
+
+       if (msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+           lnet_msg2bufpool(msg)->rbp_credits <= 0) {
+               if (ni->ni_lnd->lnd_eager_recv == NULL) {
+                       msg->msg_rx_ready_delay = 1;
+               } else {
+                       lnet_net_unlock(msg->msg_rx_cpt);
+                       rc = lnet_ni_eager_recv(ni, msg);
+                       lnet_net_lock(msg->msg_rx_cpt);
+               }
+       }
+
+       if (rc == 0)
+               rc = lnet_post_routed_recv_locked(msg, 0);
+       return rc;
+}
+
+char *
+lnet_msgtyp2str (int type)
+{
+       switch (type) {
+       case LNET_MSG_ACK:
+               return ("ACK");
+       case LNET_MSG_PUT:
+               return ("PUT");
+       case LNET_MSG_GET:
+               return ("GET");
+       case LNET_MSG_REPLY:
+               return ("REPLY");
+       case LNET_MSG_HELLO:
+               return ("HELLO");
+       default:
+               return ("<UNKNOWN>");
+       }
+}
+EXPORT_SYMBOL(lnet_msgtyp2str);
+
+void
+lnet_print_hdr(lnet_hdr_t *hdr)
+{
+       lnet_process_id_t src = {0};
+       lnet_process_id_t dst = {0};
+       char *type_str = lnet_msgtyp2str (hdr->type);
+
+       src.nid = hdr->src_nid;
+       src.pid = hdr->src_pid;
+
+       dst.nid = hdr->dest_nid;
+       dst.pid = hdr->dest_pid;
+
+       CWARN("P3 Header at %p of type %s\n", hdr, type_str);
+       CWARN("    From %s\n", libcfs_id2str(src));
+       CWARN("    To   %s\n", libcfs_id2str(dst));
+
+       switch (hdr->type) {
+       default:
+               break;
+
+       case LNET_MSG_PUT:
+               CWARN("    Ptl index %d, ack md "LPX64"."LPX64", "
+                     "match bits "LPU64"\n",
+                     hdr->msg.put.ptl_index,
+                     hdr->msg.put.ack_wmd.wh_interface_cookie,
+                     hdr->msg.put.ack_wmd.wh_object_cookie,
+                     hdr->msg.put.match_bits);
+               CWARN("    Length %d, offset %d, hdr data "LPX64"\n",
+                     hdr->payload_length, hdr->msg.put.offset,
+                     hdr->msg.put.hdr_data);
+               break;
+
+       case LNET_MSG_GET:
+               CWARN("    Ptl index %d, return md "LPX64"."LPX64", "
+                     "match bits "LPU64"\n", hdr->msg.get.ptl_index,
+                     hdr->msg.get.return_wmd.wh_interface_cookie,
+                     hdr->msg.get.return_wmd.wh_object_cookie,
+                     hdr->msg.get.match_bits);
+               CWARN("    Length %d, src offset %d\n",
+                     hdr->msg.get.sink_length,
+                     hdr->msg.get.src_offset);
+               break;
+
+       case LNET_MSG_ACK:
+               CWARN("    dst md "LPX64"."LPX64", "
+                     "manipulated length %d\n",
+                     hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                     hdr->msg.ack.dst_wmd.wh_object_cookie,
+                     hdr->msg.ack.mlength);
+               break;
+
+       case LNET_MSG_REPLY:
+               CWARN("    dst md "LPX64"."LPX64", "
+                     "length %d\n",
+                     hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                     hdr->msg.reply.dst_wmd.wh_object_cookie,
+                     hdr->payload_length);
+       }
+}
+
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
+          void *private, int rdma_req)
+{
+       int             rc = 0;
+       int             cpt;
+       int             for_me;
+       struct lnet_msg *msg;
+       lnet_pid_t     dest_pid;
+       lnet_nid_t     dest_nid;
+       lnet_nid_t     src_nid;
+       __u32     payload_length;
+       __u32     type;
+
+       LASSERT (!in_interrupt ());
+
+       type = le32_to_cpu(hdr->type);
+       src_nid = le64_to_cpu(hdr->src_nid);
+       dest_nid = le64_to_cpu(hdr->dest_nid);
+       dest_pid = le32_to_cpu(hdr->dest_pid);
+       payload_length = le32_to_cpu(hdr->payload_length);
+
+       for_me = (ni->ni_nid == dest_nid);
+       cpt = lnet_cpt_of_nid(from_nid);
+
+       switch (type) {
+       case LNET_MSG_ACK:
+       case LNET_MSG_GET:
+               if (payload_length > 0) {
+                       CERROR("%s, src %s: bad %s payload %d (0 expected)\n",
+                              libcfs_nid2str(from_nid),
+                              libcfs_nid2str(src_nid),
+                              lnet_msgtyp2str(type), payload_length);
+                       return -EPROTO;
+               }
+               break;
+
+       case LNET_MSG_PUT:
+       case LNET_MSG_REPLY:
+               if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+                       CERROR("%s, src %s: bad %s payload %d "
+                              "(%d max expected)\n",
+                              libcfs_nid2str(from_nid),
+                              libcfs_nid2str(src_nid),
+                              lnet_msgtyp2str(type),
+                              payload_length,
+                              for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+                       return -EPROTO;
+               }
+               break;
+
+       default:
+               CERROR("%s, src %s: Bad message type 0x%x\n",
+                      libcfs_nid2str(from_nid),
+                      libcfs_nid2str(src_nid), type);
+               return -EPROTO;
+       }
+
+       if (the_lnet.ln_routing &&
+           ni->ni_last_alive != cfs_time_current_sec()) {
+               lnet_ni_lock(ni);
+
+               /* NB: so far this is the only place to set NI status to "up" */
+               ni->ni_last_alive = cfs_time_current_sec();
+               if (ni->ni_status != NULL &&
+                   ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+                       ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+               lnet_ni_unlock(ni);
+       }
+
+       /* Regard a bad destination NID as a protocol error.  Senders should
+        * know what they're doing; if they don't they're misconfigured, buggy
+        * or malicious so we chop them off at the knees :) */
+
+       if (!for_me) {
+               if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+                       /* should have gone direct */
+                       CERROR ("%s, src %s: Bad dest nid %s "
+                               "(should have been sent direct)\n",
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid),
+                               libcfs_nid2str(dest_nid));
+                       return -EPROTO;
+               }
+
+               if (lnet_islocalnid(dest_nid)) {
+                       /* dest is another local NI; sender should have used
+                        * this node's NID on its own network */
+                       CERROR ("%s, src %s: Bad dest nid %s "
+                               "(it's my nid but on a different network)\n",
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid),
+                               libcfs_nid2str(dest_nid));
+                       return -EPROTO;
+               }
+
+               if (rdma_req && type == LNET_MSG_GET) {
+                       CERROR ("%s, src %s: Bad optimized GET for %s "
+                               "(final destination must be me)\n",
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid),
+                               libcfs_nid2str(dest_nid));
+                       return -EPROTO;
+               }
+
+               if (!the_lnet.ln_routing) {
+                       CERROR ("%s, src %s: Dropping message for %s "
+                               "(routing not enabled)\n",
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid),
+                               libcfs_nid2str(dest_nid));
+                       goto drop;
+               }
+       }
+
+       /* Message looks OK; we're not going to return an error, so we MUST
+        * call back lnd_recv() come what may... */
+
+       if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+           fail_peer (src_nid, 0))          /* shall we now? */
+       {
+               CERROR("%s, src %s: Dropping %s to simulate failure\n",
+                      libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+                      lnet_msgtyp2str(type));
+               goto drop;
+       }
+
+       msg = lnet_msg_alloc();
+       if (msg == NULL) {
+               CERROR("%s, src %s: Dropping %s (out of memory)\n",
+                      libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+                      lnet_msgtyp2str(type));
+               goto drop;
+       }
+
+       /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */
+
+       msg->msg_type = type;
+       msg->msg_private = private;
+       msg->msg_receiving = 1;
+       msg->msg_len = msg->msg_wanted = payload_length;
+       msg->msg_offset = 0;
+       msg->msg_hdr = *hdr;
+       /* for building message event */
+       msg->msg_from = from_nid;
+       if (!for_me) {
+               msg->msg_target.pid     = dest_pid;
+               msg->msg_target.nid     = dest_nid;
+               msg->msg_routing        = 1;
+
+       } else {
+               /* convert common msg->hdr fields to host byteorder */
+               msg->msg_hdr.type       = type;
+               msg->msg_hdr.src_nid    = src_nid;
+               msg->msg_hdr.src_pid    = le32_to_cpu(msg->msg_hdr.src_pid);
+               msg->msg_hdr.dest_nid   = dest_nid;
+               msg->msg_hdr.dest_pid   = dest_pid;
+               msg->msg_hdr.payload_length = payload_length;
+       }
+
+       lnet_net_lock(cpt);
+       rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
+       if (rc != 0) {
+               lnet_net_unlock(cpt);
+               CERROR("%s, src %s: Dropping %s "
+                      "(error %d looking up sender)\n",
+                      libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+                      lnet_msgtyp2str(type), rc);
+               lnet_msg_free(msg);
+               goto drop;
+       }
+
+       lnet_msg_commit(msg, cpt);
+
+       if (!for_me) {
+               rc = lnet_parse_forward_locked(ni, msg);
+               lnet_net_unlock(cpt);
+
+               if (rc < 0)
+                       goto free_drop;
+               if (rc == 0) {
+                       lnet_ni_recv(ni, msg->msg_private, msg, 0,
+                                    0, payload_length, payload_length);
+               }
+               return 0;
+       }
+
+       lnet_net_unlock(cpt);
+
+       switch (type) {
+       case LNET_MSG_ACK:
+               rc = lnet_parse_ack(ni, msg);
+               break;
+       case LNET_MSG_PUT:
+               rc = lnet_parse_put(ni, msg);
+               break;
+       case LNET_MSG_GET:
+               rc = lnet_parse_get(ni, msg, rdma_req);
+               break;
+       case LNET_MSG_REPLY:
+               rc = lnet_parse_reply(ni, msg);
+               break;
+       default:
+               LASSERT(0);
+               rc = -EPROTO;
+               goto free_drop;  /* prevent an unused label if !kernel */
+       }
+
+       if (rc == 0)
+               return 0;
+
+       LASSERT (rc == ENOENT);
+
+ free_drop:
+       LASSERT(msg->msg_md == NULL);
+       lnet_finalize(ni, msg, rc);
+
+ drop:
+       lnet_drop_message(ni, cpt, private, payload_length);
+       return 0;
+}
+EXPORT_SYMBOL(lnet_parse);
+
+void
+lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
+{
+       while (!list_empty(head)) {
+               lnet_process_id_t       id = {0};
+               lnet_msg_t              *msg;
+
+               msg = list_entry(head->next, lnet_msg_t, msg_list);
+               list_del(&msg->msg_list);
+
+               id.nid = msg->msg_hdr.src_nid;
+               id.pid = msg->msg_hdr.src_pid;
+
+               LASSERT(msg->msg_md == NULL);
+               LASSERT(msg->msg_rx_delayed);
+               LASSERT(msg->msg_rxpeer != NULL);
+               LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+               CWARN("Dropping delayed PUT from %s portal %d match "LPU64
+                     " offset %d length %d: %s\n",
+                     libcfs_id2str(id),
+                     msg->msg_hdr.msg.put.ptl_index,
+                     msg->msg_hdr.msg.put.match_bits,
+                     msg->msg_hdr.msg.put.offset,
+                     msg->msg_hdr.payload_length, reason);
+
+               /* NB I can't drop msg's ref on msg_rxpeer until after I've
+                * called lnet_drop_message(), so I just hang onto msg as well
+                * until that's done */
+
+               lnet_drop_message(msg->msg_rxpeer->lp_ni,
+                                 msg->msg_rxpeer->lp_cpt,
+                                 msg->msg_private, msg->msg_len);
+               /*
+                * NB: the message will not generate an event because it has
+                * no attached MD, but we should still give an error code so
+                * lnet_msg_decommit() can skip counter operations and other
+                * checks.
+                */
+               lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+       }
+}
+
+void
+lnet_recv_delayed_msg_list(struct list_head *head)
+{
+       while (!list_empty(head)) {
+               lnet_msg_t        *msg;
+               lnet_process_id_t  id;
+
+               msg = list_entry(head->next, lnet_msg_t, msg_list);
+               list_del(&msg->msg_list);
+
+               /* md won't disappear under me, since each msg
+                * holds a ref on it */
+
+               id.nid = msg->msg_hdr.src_nid;
+               id.pid = msg->msg_hdr.src_pid;
+
+               LASSERT(msg->msg_rx_delayed);
+               LASSERT(msg->msg_md != NULL);
+               LASSERT(msg->msg_rxpeer != NULL);
+               LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+               CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+                      "match "LPU64" offset %d length %d.\n",
+                       libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index,
+                       msg->msg_hdr.msg.put.match_bits,
+                       msg->msg_hdr.msg.put.offset,
+                       msg->msg_hdr.payload_length);
+
+               lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+       }
+}
+
+/**
+ * Initiate an asynchronous PUT operation.
+ *
+ * There are several events associated with a PUT: completion of the send on
+ * the initiator node (LNET_EVENT_SEND), and when the send completes
+ * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating
+ * that the operation was accepted by the target. The event LNET_EVENT_PUT is
+ * used at the target node to indicate the completion of incoming data
+ * delivery.
+ *
+ * The local events will be logged in the EQ associated with the MD pointed to
+ * by \a mdh handle. Using an MD without an associated EQ results in these
+ * events being discarded. In this case, the caller must have another
+ * mechanism (e.g., a higher level protocol) for determining when it is safe
+ * to modify the memory region associated with the MD.
+ *
+ * Note that LNet does not guarantee the order of LNET_EVENT_SEND and
+ * LNET_EVENT_ACK, though intuitively ACK should happen after SEND.
+ *
+ * \param self Indicates the NID of a local interface through which to send
+ * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself.
+ * \param mdh A handle for the MD that describes the memory to be sent. The MD
+ * must be "free floating" (See LNetMDBind()).
+ * \param ack Controls whether an acknowledgment is requested.
+ * Acknowledgments are only sent when they are requested by the initiating
+ * process and the target MD enables them.
+ * \param target A process identifier for the target process.
+ * \param portal The index in the \a target's portal table.
+ * \param match_bits The match bits to use for MD selection at the target
+ * process.
+ * \param offset The offset into the target MD (only used when the target
+ * MD has the LNET_MD_MANAGE_REMOTE option set).
+ * \param hdr_data 64 bits of user data that can be included in the message
+ * header. This data is written to an event queue entry at the target if an
+ * EQ is present on the matching MD.
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists).
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ *
+ * \see lnet_event_t::hdr_data and lnet_event_kind_t.
+ */
+int
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+       lnet_process_id_t target, unsigned int portal,
+       __u64 match_bits, unsigned int offset,
+       __u64 hdr_data)
+{
+       struct lnet_msg         *msg;
+       struct lnet_libmd       *md;
+       int                     cpt;
+       int                     rc;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+           fail_peer (target.nid, 1))    /* shall we now? */
+       {
+               CERROR("Dropping PUT to %s: simulated failure\n",
+                      libcfs_id2str(target));
+               return -EIO;
+       }
+
+       msg = lnet_msg_alloc();
+       if (msg == NULL) {
+               CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+                      libcfs_id2str(target));
+               return -ENOMEM;
+       }
+       msg->msg_vmflush = !!memory_pressure_get();
+
+       cpt = lnet_cpt_of_cookie(mdh.cookie);
+       lnet_res_lock(cpt);
+
+       md = lnet_handle2md(&mdh);
+       if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+               CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n",
+                      match_bits, portal, libcfs_id2str(target),
+                      md == NULL ? -1 : md->md_threshold);
+               if (md != NULL && md->md_me != NULL)
+                       CERROR("Source MD also attached to portal %d\n",
+                              md->md_me->me_portal);
+               lnet_res_unlock(cpt);
+
+               lnet_msg_free(msg);
+               return -ENOENT;
+       }
+
+       CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
+
+       lnet_msg_attach_md(msg, md, 0, 0);
+
+       lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+       msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+       msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+       msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+       msg->msg_hdr.msg.put.hdr_data = hdr_data;
+
+       /* NB handles only looked up by creator (no flips) */
+       if (ack == LNET_ACK_REQ) {
+               msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+                       the_lnet.ln_interface_cookie;
+               msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+                       md->md_lh.lh_cookie;
+       } else {
+               msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+                       LNET_WIRE_HANDLE_COOKIE_NONE;
+               msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+                       LNET_WIRE_HANDLE_COOKIE_NONE;
+       }
+
+       lnet_res_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+       rc = lnet_send(self, msg, LNET_NID_ANY);
+       if (rc != 0) {
+               CNETERR("Error sending PUT to %s: %d\n",
+                      libcfs_id2str(target), rc);
+               lnet_finalize (NULL, msg, rc);
+       }
+
+       /* completion will be signalled by an event */
+       return 0;
+}
+EXPORT_SYMBOL(LNetPut);
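
As a quick illustration of the LNetPut() API documented above, here is a hypothetical caller sketch. It is not part of this patch: the function name, portal number and match bits are made-up placeholders, and the MD handle is assumed to have been set up beforehand (e.g. via LNetMDBind(), as the documentation requires).

static int example_send_put(lnet_handle_md_t mdh, lnet_process_id_t peer)
{
        int rc;

        /* let LNet pick the local NI; ask the target for an ACK */
        rc = LNetPut(LNET_NID_ANY, mdh, LNET_ACK_REQ, peer,
                     4 /* portal */, 0x1ULL /* match bits */,
                     0 /* offset */, 0 /* hdr_data */);
        if (rc != 0)
                CERROR("PUT to %s failed: %d\n", libcfs_id2str(peer), rc);

        /* on success, completion is reported asynchronously: LNET_EVENT_SEND
         * and, if the target MD allows it, LNET_EVENT_ACK arrive via the EQ
         * attached to the MD */
        return rc;
}
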
+
+lnet_msg_t *
+lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
+{
+       /* The LND can DMA direct to the GET md (i.e. no REPLY msg).  This
+        * returns a msg for the LND to pass to lnet_finalize() when the sink
+        * data has been received.
+        *
+        * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+        * lnet_finalize() is called on it, so the LND must call this first */
+
+       struct lnet_msg         *msg = lnet_msg_alloc();
+       struct lnet_libmd       *getmd = getmsg->msg_md;
+       lnet_process_id_t       peer_id = getmsg->msg_target;
+       int                     cpt;
+
+       LASSERT(!getmsg->msg_target_is_router);
+       LASSERT(!getmsg->msg_routing);
+
+       cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie);
+       lnet_res_lock(cpt);
+
+       LASSERT (getmd->md_refcount > 0);
+
+       if (msg == NULL) {
+               CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n",
+                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
+               goto drop;
+       }
+
+       if (getmd->md_threshold == 0) {
+               CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n",
+                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id),
+                       getmd);
+               lnet_res_unlock(cpt);
+               goto drop;
+       }
+
+       LASSERT(getmd->md_offset == 0);
+
+       CDEBUG(D_NET, "%s: Reply from %s md %p\n",
+              libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
+
+       /* setup information for lnet_build_msg_event */
+       msg->msg_from = peer_id.nid;
+       msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
+       msg->msg_hdr.src_nid = peer_id.nid;
+       msg->msg_hdr.payload_length = getmd->md_length;
+       msg->msg_receiving = 1; /* required by lnet_msg_attach_md */
+
+       lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
+       lnet_res_unlock(cpt);
+
+       cpt = lnet_cpt_of_nid(peer_id.nid);
+
+       lnet_net_lock(cpt);
+       lnet_msg_commit(msg, cpt);
+       lnet_net_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+       return msg;
+
+ drop:
+       cpt = lnet_cpt_of_nid(peer_id.nid);
+
+       lnet_net_lock(cpt);
+       the_lnet.ln_counters[cpt]->drop_count++;
+       the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+       lnet_net_unlock(cpt);
+
+       if (msg != NULL)
+               lnet_msg_free(msg);
+
+       return NULL;
+}
+EXPORT_SYMBOL(lnet_create_reply_msg);
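
The caveat above, that lnet_create_reply_msg() must be called before lnet_finalize() frees the original GET, is easiest to see in a sketch. The helper below is hypothetical and not part of this patch; it only illustrates the call ordering an LND might follow once the RDMA for an optimized GET has completed, with nob being the number of bytes actually written into the GET MD.

static void example_lnd_optimized_get_done(lnet_ni_t *ni, lnet_msg_t *getmsg,
                                           unsigned int nob, int status)
{
        /* build the REPLY stand-in while getmsg is still valid */
        lnet_msg_t *reply = lnet_create_reply_msg(ni, getmsg);

        /* the original GET may be freed by this call */
        lnet_finalize(ni, getmsg, status);

        if (reply != NULL) {
                /* nob must not exceed the GET MD's length */
                lnet_set_reply_msg_len(ni, reply, status == 0 ? nob : 0);
                lnet_finalize(ni, reply, status);
        }
}
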
+
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+       /* Set the REPLY length now that the RDMA that elides the REPLY
+        * message has completed and I know it. */
+       LASSERT (reply != NULL);
+       LASSERT (reply->msg_type == LNET_MSG_GET);
+       LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY);
+
+       /* NB I trusted my peer to RDMA.  If she tells me she's written beyond
+        * the end of my buffer, I might as well be dead. */
+       LASSERT (len <= reply->msg_ev.mlength);
+
+       reply->msg_ev.mlength = len;
+}
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
+
+/**
+ * Initiate an asynchronous GET operation.
+ *
+ * On the initiator node, an LNET_EVENT_SEND is logged when the GET request
+ * is sent, and an LNET_EVENT_REPLY is logged when the data returned from
+ * the target node in the REPLY has been written to local MD.
+ *
+ * On the target node, an LNET_EVENT_GET is logged when the GET request
+ * arrives and is accepted into an MD.
+ *
+ * \param self,target,portal,match_bits,offset See the discussion in LNetPut().
+ * \param mdh A handle for the MD that describes the memory into which the
+ * requested data will be received. The MD must be "free floating" (See LNetMDBind()).
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists) of the MD.
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ */
+int
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh,
+       lnet_process_id_t target, unsigned int portal,
+       __u64 match_bits, unsigned int offset)
+{
+       struct lnet_msg         *msg;
+       struct lnet_libmd       *md;
+       int                     cpt;
+       int                     rc;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+           fail_peer (target.nid, 1))    /* shall we now? */
+       {
+               CERROR("Dropping GET to %s: simulated failure\n",
+                      libcfs_id2str(target));
+               return -EIO;
+       }
+
+       msg = lnet_msg_alloc();
+       if (msg == NULL) {
+               CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+                      libcfs_id2str(target));
+               return -ENOMEM;
+       }
+
+       cpt = lnet_cpt_of_cookie(mdh.cookie);
+       lnet_res_lock(cpt);
+
+       md = lnet_handle2md(&mdh);
+       if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+               CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n",
+                      match_bits, portal, libcfs_id2str(target),
+                      md == NULL ? -1 : md->md_threshold);
+               if (md != NULL && md->md_me != NULL)
+                       CERROR("REPLY MD also attached to portal %d\n",
+                              md->md_me->me_portal);
+
+               lnet_res_unlock(cpt);
+
+               lnet_msg_free(msg);
+
+               return -ENOENT;
+       }
+
+       CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
+
+       lnet_msg_attach_md(msg, md, 0, 0);
+
+       lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
+
+       msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+       msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+       msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+       msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
+
+       /* NB handles only looked up by creator (no flips) */
+       msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie =
+               the_lnet.ln_interface_cookie;
+       msg->msg_hdr.msg.get.return_wmd.wh_object_cookie =
+               md->md_lh.lh_cookie;
+
+       lnet_res_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+       rc = lnet_send(self, msg, LNET_NID_ANY);
+       if (rc < 0) {
+               CNETERR("Error sending GET to %s: %d\n",
+                      libcfs_id2str(target), rc);
+               lnet_finalize (NULL, msg, rc);
+       }
+
+       /* completion will be signalled by an event */
+       return 0;
+}
+EXPORT_SYMBOL(LNetGet);
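
For symmetry with the PUT sketch earlier, a hypothetical GET caller might look like the following; it is not part of this patch, the portal and match bits are placeholders, and mdh is assumed to describe the local sink buffer (again bound beforehand, e.g. via LNetMDBind()).

static int example_send_get(lnet_handle_md_t mdh, lnet_process_id_t peer)
{
        int rc;

        /* fetch from the peer's portal 4, match bits 0x1, remote offset 0 */
        rc = LNetGet(LNET_NID_ANY, mdh, peer,
                     4 /* portal */, 0x1ULL /* match bits */, 0 /* offset */);
        if (rc != 0)
                CERROR("GET from %s failed: %d\n", libcfs_id2str(peer), rc);

        /* completion is signalled by LNET_EVENT_SEND, then LNET_EVENT_REPLY
         * once the returned data has been written into the MD */
        return rc;
}
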
+
+/**
+ * Calculate distance to node at \a dstnid.
+ *
+ * \param dstnid Target NID.
+ * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid
+ * is saved here.
+ * \param orderp If not NULL, order of the route to reach \a dstnid is saved
+ * here.
+ *
+ * \retval 0 If \a dstnid belongs to a local interface, and reserved option
+ * local_nid_dist_zero is set, which is the default.
+ * \retval positives Distance to target NID, i.e. number of hops plus one.
+ * \retval -EHOSTUNREACH If \a dstnid is not reachable.
+ */
+int
+LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
+{
+       struct list_head                *e;
+       struct lnet_ni          *ni;
+       lnet_remotenet_t        *rnet;
+       __u32                   dstnet = LNET_NIDNET(dstnid);
+       int                     hops;
+       int                     cpt;
+       __u32                   order = 2;
+       struct list_head                *rn_list;
+
+       /* if !local_nid_dist_zero, I don't return a distance of 0 ever
+        * (when lustre sees a distance of 0, it substitutes 0@lo), so I
+        * keep order 0 free for 0@lo and order 1 free for a local NID
+        * match */
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       cpt = lnet_net_lock_current();
+
+       list_for_each (e, &the_lnet.ln_nis) {
+               ni = list_entry(e, lnet_ni_t, ni_list);
+
+               if (ni->ni_nid == dstnid) {
+                       if (srcnidp != NULL)
+                               *srcnidp = dstnid;
+                       if (orderp != NULL) {
+                               if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND)
+                                       *orderp = 0;
+                               else
+                                       *orderp = 1;
+                       }
+                       lnet_net_unlock(cpt);
+
+                       return local_nid_dist_zero ? 0 : 1;
+               }
+
+               if (LNET_NIDNET(ni->ni_nid) == dstnet) {
+                       if (srcnidp != NULL)
+                               *srcnidp = ni->ni_nid;
+                       if (orderp != NULL)
+                               *orderp = order;
+                       lnet_net_unlock(cpt);
+                       return 1;
+               }
+
+               order++;
+       }
+
+       rn_list = lnet_net2rnethash(dstnet);
+       list_for_each(e, rn_list) {
+               rnet = list_entry(e, lnet_remotenet_t, lrn_list);
+
+               if (rnet->lrn_net == dstnet) {
+                       lnet_route_t *route;
+                       lnet_route_t *shortest = NULL;
+
+                       LASSERT (!list_empty(&rnet->lrn_routes));
+
+                       list_for_each_entry(route, &rnet->lrn_routes,
+                                               lr_list) {
+                               if (shortest == NULL ||
+                                   route->lr_hops < shortest->lr_hops)
+                                       shortest = route;
+                       }
+
+                       LASSERT (shortest != NULL);
+                       hops = shortest->lr_hops;
+                       if (srcnidp != NULL)
+                               *srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+                       if (orderp != NULL)
+                               *orderp = order;
+                       lnet_net_unlock(cpt);
+                       return hops + 1;
+               }
+               order++;
+       }
+
+       lnet_net_unlock(cpt);
+       return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(LNetDist);
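
The return convention of LNetDist() (0 for a local NID when local_nid_dist_zero is set, hops + 1 otherwise, and -EHOSTUNREACH on failure) can be exercised with a small, hypothetical helper; it is illustrative only and not part of this patch.

static void example_report_distance(lnet_nid_t dst)
{
        lnet_nid_t src;
        __u32      order;
        int        dist;

        dist = LNetDist(dst, &src, &order);
        if (dist < 0)
                CWARN("%s is unreachable: %d\n", libcfs_nid2str(dst), dist);
        else
                CDEBUG(D_NET, "%s reachable via %s, distance %d, order %u\n",
                       libcfs_nid2str(dst), libcfs_nid2str(src), dist, order);
}
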
+
+/**
+ * Set the number of asynchronous messages expected from a target process.
+ *
+ * This function is only meaningful for userspace callers. It's a no-op when
+ * called from the kernel.
+ *
+ * Asynchronous messages are those that can come from a target when the
+ * userspace process is not waiting for IO to complete; e.g., AST callbacks
+ * from Lustre servers. Specifying the expected number of such messages
+ * allows them to be eagerly received when the user process is not running in
+ * LNet; otherwise network errors may occur.
+ *
+ * \param id Process ID of the target process.
+ * \param nasync Number of asynchronous messages expected from the target.
+ *
+ * \return 0 on success, and an error code otherwise.
+ */
+int
+LNetSetAsync(lnet_process_id_t id, int nasync)
+{
+       return 0;
+}
+EXPORT_SYMBOL(LNetSetAsync);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
new file mode 100644 (file)
index 0000000..8f3a50b
--- /dev/null
@@ -0,0 +1,650 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+void
+lnet_build_unlink_event (lnet_libmd_t *md, lnet_event_t *ev)
+{
+       ENTRY;
+
+       memset(ev, 0, sizeof(*ev));
+
+       ev->status   = 0;
+       ev->unlinked = 1;
+       ev->type     = LNET_EVENT_UNLINK;
+       lnet_md_deconstruct(md, &ev->md);
+       lnet_md2handle(&ev->md_handle, md);
+       EXIT;
+}
+
+/*
+ * Don't need any lock, must be called after lnet_commit_md
+ */
+void
+lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
+{
+       lnet_hdr_t      *hdr = &msg->msg_hdr;
+       lnet_event_t    *ev  = &msg->msg_ev;
+
+       LASSERT(!msg->msg_routing);
+
+       ev->type = ev_type;
+
+       if (ev_type == LNET_EVENT_SEND) {
+               /* event for active message */
+               ev->target.nid    = le64_to_cpu(hdr->dest_nid);
+               ev->target.pid    = le32_to_cpu(hdr->dest_pid);
+               ev->initiator.nid = LNET_NID_ANY;
+               ev->initiator.pid = the_lnet.ln_pid;
+               ev->sender        = LNET_NID_ANY;
+
+       } else {
+               /* event for passive message */
+               ev->target.pid    = hdr->dest_pid;
+               ev->target.nid    = hdr->dest_nid;
+               ev->initiator.pid = hdr->src_pid;
+               ev->initiator.nid = hdr->src_nid;
+               ev->rlength       = hdr->payload_length;
+               ev->sender        = msg->msg_from;
+               ev->mlength       = msg->msg_wanted;
+               ev->offset        = msg->msg_offset;
+       }
+
+       switch (ev_type) {
+       default:
+               LBUG();
+
+       case LNET_EVENT_PUT: /* passive PUT */
+               ev->pt_index   = hdr->msg.put.ptl_index;
+               ev->match_bits = hdr->msg.put.match_bits;
+               ev->hdr_data   = hdr->msg.put.hdr_data;
+               return;
+
+       case LNET_EVENT_GET: /* passive GET */
+               ev->pt_index   = hdr->msg.get.ptl_index;
+               ev->match_bits = hdr->msg.get.match_bits;
+               ev->hdr_data   = 0;
+               return;
+
+       case LNET_EVENT_ACK: /* ACK */
+               ev->match_bits = hdr->msg.ack.match_bits;
+               ev->mlength    = hdr->msg.ack.mlength;
+               return;
+
+       case LNET_EVENT_REPLY: /* REPLY */
+               return;
+
+       case LNET_EVENT_SEND: /* active message */
+               if (msg->msg_type == LNET_MSG_PUT) {
+                       ev->pt_index   = le32_to_cpu(hdr->msg.put.ptl_index);
+                       ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+                       ev->offset     = le32_to_cpu(hdr->msg.put.offset);
+                       ev->mlength    =
+                       ev->rlength    = le32_to_cpu(hdr->payload_length);
+                       ev->hdr_data   = le64_to_cpu(hdr->msg.put.hdr_data);
+
+               } else {
+                       LASSERT(msg->msg_type == LNET_MSG_GET);
+                       ev->pt_index   = le32_to_cpu(hdr->msg.get.ptl_index);
+                       ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+                       ev->mlength    =
+                       ev->rlength    = le32_to_cpu(hdr->msg.get.sink_length);
+                       ev->offset     = le32_to_cpu(hdr->msg.get.src_offset);
+                       ev->hdr_data   = 0;
+               }
+               return;
+       }
+}
+
+void
+lnet_msg_commit(lnet_msg_t *msg, int cpt)
+{
+       struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt];
+       lnet_counters_t           *counters  = the_lnet.ln_counters[cpt];
+
+       /* routed message can be committed for both receiving and sending */
+       LASSERT(!msg->msg_tx_committed);
+
+       if (msg->msg_sending) {
+               LASSERT(!msg->msg_receiving);
+
+               msg->msg_tx_cpt = cpt;
+               msg->msg_tx_committed = 1;
+               if (msg->msg_rx_committed) { /* routed message REPLY */
+                       LASSERT(msg->msg_onactivelist);
+                       return;
+               }
+       } else {
+               LASSERT(!msg->msg_sending);
+               msg->msg_rx_cpt = cpt;
+               msg->msg_rx_committed = 1;
+       }
+
+       LASSERT(!msg->msg_onactivelist);
+       msg->msg_onactivelist = 1;
+       list_add(&msg->msg_activelist, &container->msc_active);
+
+       counters->msgs_alloc++;
+       if (counters->msgs_alloc > counters->msgs_max)
+               counters->msgs_max = counters->msgs_alloc;
+}
+
+static void
+lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
+{
+       lnet_counters_t *counters;
+       lnet_event_t    *ev = &msg->msg_ev;
+
+       LASSERT(msg->msg_tx_committed);
+       if (status != 0)
+               goto out;
+
+       counters = the_lnet.ln_counters[msg->msg_tx_cpt];
+       switch (ev->type) {
+       default: /* routed message */
+               LASSERT(msg->msg_routing);
+               LASSERT(msg->msg_rx_committed);
+               LASSERT(ev->type == 0);
+
+               counters->route_length += msg->msg_len;
+               counters->route_count++;
+               goto out;
+
+       case LNET_EVENT_PUT:
+               /* should have been decommitted */
+               LASSERT(!msg->msg_rx_committed);
+               /* overwritten while sending ACK */
+               LASSERT(msg->msg_type == LNET_MSG_ACK);
+               msg->msg_type = LNET_MSG_PUT; /* fix type */
+               break;
+
+       case LNET_EVENT_SEND:
+               LASSERT(!msg->msg_rx_committed);
+               if (msg->msg_type == LNET_MSG_PUT)
+                       counters->send_length += msg->msg_len;
+               break;
+
+       case LNET_EVENT_GET:
+               LASSERT(msg->msg_rx_committed);
+               /* overwritten while sending reply; we should never be
+                * here for an optimized GET */
+               LASSERT(msg->msg_type == LNET_MSG_REPLY);
+               msg->msg_type = LNET_MSG_GET; /* fix type */
+               break;
+       }
+
+       counters->send_count++;
+ out:
+       lnet_return_tx_credits_locked(msg);
+       msg->msg_tx_committed = 0;
+}
+
+static void
+lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
+{
+       lnet_counters_t *counters;
+       lnet_event_t    *ev = &msg->msg_ev;
+
+       LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
+       LASSERT(msg->msg_rx_committed);
+
+       if (status != 0)
+               goto out;
+
+       counters = the_lnet.ln_counters[msg->msg_rx_cpt];
+       switch (ev->type) {
+       default:
+               LASSERT(ev->type == 0);
+               LASSERT(msg->msg_routing);
+               goto out;
+
+       case LNET_EVENT_ACK:
+               LASSERT(msg->msg_type == LNET_MSG_ACK);
+               break;
+
+       case LNET_EVENT_GET:
+               /* type is "REPLY" if it's an optimized GET on the passive
+                * side, because an optimized GET is never committed for
+                * sending, so the message type won't be changed back to
+                * "GET" by lnet_msg_decommit_tx(); see details in
+                * lnet_parse_get() */
+               LASSERT(msg->msg_type == LNET_MSG_REPLY ||
+                       msg->msg_type == LNET_MSG_GET);
+               counters->send_length += msg->msg_wanted;
+               break;
+
+       case LNET_EVENT_PUT:
+               LASSERT(msg->msg_type == LNET_MSG_PUT);
+               break;
+
+       case LNET_EVENT_REPLY:
+               /* type is "GET" if it's an optimized GET on the active side;
+                * see details in lnet_create_reply_msg() */
+               LASSERT(msg->msg_type == LNET_MSG_GET ||
+                       msg->msg_type == LNET_MSG_REPLY);
+               break;
+       }
+
+       counters->recv_count++;
+       if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
+               counters->recv_length += msg->msg_wanted;
+
+ out:
+       lnet_return_rx_credits_locked(msg);
+       msg->msg_rx_committed = 0;
+}
+
+void
+lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status)
+{
+       int     cpt2 = cpt;
+
+       LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+       LASSERT(msg->msg_onactivelist);
+
+       if (msg->msg_tx_committed) { /* always decommit for sending first */
+               LASSERT(cpt == msg->msg_tx_cpt);
+               lnet_msg_decommit_tx(msg, status);
+       }
+
+       if (msg->msg_rx_committed) {
+               /* forwarding msg committed for both receiving and sending */
+               if (cpt != msg->msg_rx_cpt) {
+                       lnet_net_unlock(cpt);
+                       cpt2 = msg->msg_rx_cpt;
+                       lnet_net_lock(cpt2);
+               }
+               lnet_msg_decommit_rx(msg, status);
+       }
+
+       list_del(&msg->msg_activelist);
+       msg->msg_onactivelist = 0;
+
+       the_lnet.ln_counters[cpt2]->msgs_alloc--;
+
+       if (cpt2 != cpt) {
+               lnet_net_unlock(cpt2);
+               lnet_net_lock(cpt);
+       }
+}
+
+void
+lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+                  unsigned int offset, unsigned int mlen)
+{
+       /* NB: @offset and @mlen are only useful for receiving */
+       /* Here, we attach the MD to the lnet_msg, mark it busy and
+        * decrement its threshold. Come what may, the lnet_msg "owns"
+        * the MD until a call to lnet_msg_detach_md() or lnet_finalize()
+        * signals completion. */
+       LASSERT(!msg->msg_routing);
+
+       msg->msg_md = md;
+       if (msg->msg_receiving) { /* committed for receiving */
+               msg->msg_offset = offset;
+               msg->msg_wanted = mlen;
+       }
+
+       md->md_refcount++;
+       if (md->md_threshold != LNET_MD_THRESH_INF) {
+               LASSERT(md->md_threshold > 0);
+               md->md_threshold--;
+       }
+
+       /* build umd in event */
+       lnet_md2handle(&msg->msg_ev.md_handle, md);
+       lnet_md_deconstruct(md, &msg->msg_ev.md);
+}
+
+void
+lnet_msg_detach_md(lnet_msg_t *msg, int status)
+{
+       lnet_libmd_t    *md = msg->msg_md;
+       int             unlink;
+
+       /* Now it's safe to drop my caller's ref */
+       md->md_refcount--;
+       LASSERT(md->md_refcount >= 0);
+
+       unlink = lnet_md_unlinkable(md);
+       if (md->md_eq != NULL) {
+               msg->msg_ev.status   = status;
+               msg->msg_ev.unlinked = unlink;
+               lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev);
+       }
+
+       if (unlink)
+               lnet_md_unlink(md);
+
+       msg->msg_md = NULL;
+}
+
+static int
+lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
+{
+       lnet_handle_wire_t ack_wmd;
+       int             rc;
+       int             status = msg->msg_ev.status;
+
+       LASSERT (msg->msg_onactivelist);
+
+       if (status == 0 && msg->msg_ack) {
+               /* Only send an ACK if the PUT completed successfully */
+
+               lnet_msg_decommit(msg, cpt, 0);
+
+               msg->msg_ack = 0;
+               lnet_net_unlock(cpt);
+
+               LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+               LASSERT(!msg->msg_routing);
+
+               ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+               lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+               msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+               msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+               msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+               /* NB: we probably want to use NID of msg::msg_from as 3rd
+                * parameter (router NID) if it's routed message */
+               rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);
+
+               lnet_net_lock(cpt);
+               /*
+                * NB: the message is committed for sending, so we should
+                * return on success because the LND will finalize it later.
+                *
+                * Also, it is possible that the message was committed for
+                * sending but failed before being delivered to the LND,
+                * e.g. with ENOMEM. In that case we can't fall through
+                * either, because the CPT for sending can differ from the
+                * CPT for receiving, so we should return to lnet_finalize()
+                * to make sure we are locking the correct partition.
+                */
+               return rc;
+
+       } else if (status == 0 &&       /* OK so far */
+                  (msg->msg_routing && !msg->msg_sending)) {
+               /* not forwarded */
+               LASSERT(!msg->msg_receiving);   /* called back recv already */
+               lnet_net_unlock(cpt);
+
+               rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);
+
+               lnet_net_lock(cpt);
+               /*
+                * NB: the message is committed for sending, so we should
+                * return on success because the LND will finalize it later.
+                *
+                * Also, it is possible that the message was committed for
+                * sending but failed before being delivered to the LND,
+                * e.g. with ENOMEM. In that case we can't fall through
+                * either:
+                * - The rule is that a message must decommit for sending
+                *   first if it's committed for both sending and receiving.
+                * - The CPT for sending can differ from the CPT for
+                *   receiving, so we should return to lnet_finalize() to
+                *   make sure we are locking the correct partition.
+                */
+               return rc;
+       }
+
+       lnet_msg_decommit(msg, cpt, status);
+       lnet_msg_free_locked(msg);
+       return 0;
+}
+
+void
+lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status)
+{
+       struct lnet_msg_container       *container;
+       int                             my_slot;
+       int                             cpt;
+       int                             rc;
+       int                             i;
+
+       LASSERT (!in_interrupt ());
+
+       if (msg == NULL)
+               return;
+#if 0
+       CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+              lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+              msg->msg_target_is_router ? "t" : "",
+              msg->msg_routing ? "X" : "",
+              msg->msg_ack ? "A" : "",
+              msg->msg_sending ? "S" : "",
+              msg->msg_receiving ? "R" : "",
+              msg->msg_delayed ? "d" : "",
+              msg->msg_txcredit ? "C" : "",
+              msg->msg_peertxcredit ? "c" : "",
+              msg->msg_rtrcredit ? "F" : "",
+              msg->msg_peerrtrcredit ? "f" : "",
+              msg->msg_onactivelist ? "!" : "",
+              msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+              msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+       msg->msg_ev.status = status;
+
+       if (msg->msg_md != NULL) {
+               cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+
+               lnet_res_lock(cpt);
+               lnet_msg_detach_md(msg, status);
+               lnet_res_unlock(cpt);
+       }
+
+ again:
+       rc = 0;
+       if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+               /* not committed to the network yet */
+               LASSERT(!msg->msg_onactivelist);
+               lnet_msg_free(msg);
+               return;
+       }
+
+       /*
+        * NB: a routed message can be committed for both receiving and
+        * sending; we should finalize in LIFO order to keep the counters
+        * correct (finalize sending first, then receiving).
+        */
+       cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+       lnet_net_lock(cpt);
+
+       container = the_lnet.ln_msg_containers[cpt];
+       list_add_tail(&msg->msg_list, &container->msc_finalizing);
+
+       /* Recursion breaker.  Don't complete the message here if I am (or
+        * enough other threads are) already completing messages */
+
+       my_slot = -1;
+       for (i = 0; i < container->msc_nfinalizers; i++) {
+               if (container->msc_finalizers[i] == current)
+                       break;
+
+               if (my_slot < 0 && container->msc_finalizers[i] == NULL)
+                       my_slot = i;
+       }
+
+       if (i < container->msc_nfinalizers || my_slot < 0) {
+               lnet_net_unlock(cpt);
+               return;
+       }
+
+       container->msc_finalizers[my_slot] = current;
+
+       while (!list_empty(&container->msc_finalizing)) {
+               msg = list_entry(container->msc_finalizing.next,
+                                    lnet_msg_t, msg_list);
+
+               list_del(&msg->msg_list);
+
+               /* NB drops and regains the lnet lock if it actually does
+                * anything, so my finalizing friends can chomp along too */
+               rc = lnet_complete_msg_locked(msg, cpt);
+               if (rc != 0)
+                       break;
+       }
+
+       container->msc_finalizers[my_slot] = NULL;
+       lnet_net_unlock(cpt);
+
+       if (rc != 0)
+               goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
+
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+       int     count = 0;
+
+       if (container->msc_init == 0)
+               return;
+
+       while (!list_empty(&container->msc_active)) {
+               lnet_msg_t *msg = list_entry(container->msc_active.next,
+                                                lnet_msg_t, msg_activelist);
+
+               LASSERT(msg->msg_onactivelist);
+               msg->msg_onactivelist = 0;
+               list_del(&msg->msg_activelist);
+               lnet_msg_free(msg);
+               count++;
+       }
+
+       if (count > 0)
+               CERROR("%d active msg on exit\n", count);
+
+       if (container->msc_finalizers != NULL) {
+               LIBCFS_FREE(container->msc_finalizers,
+                           container->msc_nfinalizers *
+                           sizeof(*container->msc_finalizers));
+               container->msc_finalizers = NULL;
+       }
+#ifdef LNET_USE_LIB_FREELIST
+       lnet_freelist_fini(&container->msc_freelist);
+#endif
+       container->msc_init = 0;
+}
+
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+       int     rc;
+
+       container->msc_init = 1;
+
+       INIT_LIST_HEAD(&container->msc_active);
+       INIT_LIST_HEAD(&container->msc_finalizing);
+
+#ifdef LNET_USE_LIB_FREELIST
+       memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t));
+
+       rc = lnet_freelist_init(&container->msc_freelist,
+                               LNET_FL_MAX_MSGS, sizeof(lnet_msg_t));
+       if (rc != 0) {
+               CERROR("Failed to init freelist for message container\n");
+               lnet_msg_container_cleanup(container);
+               return rc;
+       }
+#else
+       rc = 0;
+#endif
+       /* number of CPUs in this CPT partition */
+       container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+
+       LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+                        container->msc_nfinalizers *
+                        sizeof(*container->msc_finalizers));
+
+       if (container->msc_finalizers == NULL) {
+               CERROR("Failed to allocate message finalizers\n");
+               lnet_msg_container_cleanup(container);
+               return -ENOMEM;
+       }
+
+       return rc;
+}
+
+void
+lnet_msg_containers_destroy(void)
+{
+       struct lnet_msg_container *container;
+       int     i;
+
+       if (the_lnet.ln_msg_containers == NULL)
+               return;
+
+       cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers)
+               lnet_msg_container_cleanup(container);
+
+       cfs_percpt_free(the_lnet.ln_msg_containers);
+       the_lnet.ln_msg_containers = NULL;
+}
+
+int
+lnet_msg_containers_create(void)
+{
+       struct lnet_msg_container *container;
+       int     rc;
+       int     i;
+
+       the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
+                                                     sizeof(*container));
+
+       if (the_lnet.ln_msg_containers == NULL) {
+               CERROR("Failed to allocate cpu-partition data for network\n");
+               return -ENOMEM;
+       }
+
+       cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) {
+               rc = lnet_msg_container_setup(container, i);
+               if (rc != 0) {
+                       lnet_msg_containers_destroy();
+                       return rc;
+               }
+       }
+
+       return 0;
+}
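The finalizer-slot scheme in lnet_finalize() above is the subtle part of this file: a thread only drains the msc_finalizing queue if it can claim a free slot in msc_finalizers and is not already draining, so nested completions simply queue their message and return. The following standalone sketch (plain C, hypothetical names, not part of the patch) shows that core idea in isolation.

/* Standalone sketch of the recursion breaker used by lnet_finalize().
 * "current_thread" stands in for the kernel's current task pointer. */
#include <stddef.h>

#define NFINALIZERS 4

struct finalizer_ctx {
        void *finalizers[NFINALIZERS];  /* draining threads; NULL == free slot */
};

/* Return the claimed slot index, or -1 if the caller must only queue its
 * message: either it is already draining (a nested completion) or enough
 * other threads are draining already. */
static int claim_finalizer_slot(struct finalizer_ctx *ctx, void *current_thread)
{
        int my_slot = -1;
        int i;

        for (i = 0; i < NFINALIZERS; i++) {
                if (ctx->finalizers[i] == current_thread)
                        return -1;              /* nested call: just queue */
                if (my_slot < 0 && ctx->finalizers[i] == NULL)
                        my_slot = i;
        }
        if (my_slot >= 0)
                ctx->finalizers[my_slot] = current_thread;
        return my_slot;                         /* -1 if every slot is busy */
}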
diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
new file mode 100644 (file)
index 0000000..9b9e7d3
--- /dev/null
@@ -0,0 +1,938 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-ptl.c
+ *
+ * portal & match routines
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* NB: add /proc interfaces in upcoming patches */
+int    portal_rotor    = LNET_PTL_ROTOR_HASH_RT;
+CFS_MODULE_PARM(portal_rotor, "i", int, 0644,
+               "redirect PUTs to different cpu-partitions");
+
+static int
+lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id,
+                   __u64 mbits, __u64 ignore_bits)
+{
+       struct lnet_portal      *ptl = the_lnet.ln_portals[index];
+       int                     unique;
+
+       unique = ignore_bits == 0 &&
+                match_id.nid != LNET_NID_ANY &&
+                match_id.pid != LNET_PID_ANY;
+
+       LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl));
+
+       /* prefer to check w/o any lock */
+       if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl)))
+               goto match;
+
+       /* unset, new portal */
+       lnet_ptl_lock(ptl);
+       /* check again with lock */
+       if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) {
+               lnet_ptl_unlock(ptl);
+               goto match;
+       }
+
+       /* still not set */
+       if (unique)
+               lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE);
+       else
+               lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD);
+
+       lnet_ptl_unlock(ptl);
+
+       return 1;
+
+ match:
+       if ((lnet_ptl_is_unique(ptl) && !unique) ||
+           (lnet_ptl_is_wildcard(ptl) && unique))
+               return 0;
+       return 1;
+}
+
+static void
+lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt)
+{
+       struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+       int                     i;
+
+       /* called with both lnet_res_lock(cpt) and lnet_ptl_lock held */
+       LASSERT(lnet_ptl_is_wildcard(ptl));
+
+       mtable->mt_enabled = 1;
+
+       ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt;
+       for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) {
+               LASSERT(ptl->ptl_mt_maps[i] != cpt);
+               if (ptl->ptl_mt_maps[i] < cpt)
+                       break;
+
+               /* swap to order */
+               ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i];
+               ptl->ptl_mt_maps[i] = cpt;
+       }
+
+       ptl->ptl_mt_nmaps++;
+}
+
+static void
+lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt)
+{
+       struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+       int                     i;
+
+       /* called with both lnet_res_lock(cpt) and lnet_ptl_lock held */
+       LASSERT(lnet_ptl_is_wildcard(ptl));
+
+       if (LNET_CPT_NUMBER == 1)
+               return; /* never disable the only match-table */
+
+       mtable->mt_enabled = 0;
+
+       LASSERT(ptl->ptl_mt_nmaps > 0 &&
+               ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER);
+
+       /* remove it from mt_maps */
+       ptl->ptl_mt_nmaps--;
+       for (i = 0; i < ptl->ptl_mt_nmaps; i++) {
+               if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */
+                       ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1];
+       }
+}
+
+static int
+lnet_try_match_md(lnet_libmd_t *md,
+                 struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       /* ALWAYS called holding the lnet_res_lock, and must not call
+        * lnet_res_unlock(); lnet_match_blocked_msg() relies on this to
+        * avoid races */
+       unsigned int    offset;
+       unsigned int    mlength;
+       lnet_me_t       *me = md->md_me;
+
+       /* MD exhausted */
+       if (lnet_md_exhausted(md))
+               return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED;
+
+       /* mismatched MD op */
+       if ((md->md_options & info->mi_opc) == 0)
+               return LNET_MATCHMD_NONE;
+
+       /* mismatched ME nid/pid? */
+       if (me->me_match_id.nid != LNET_NID_ANY &&
+           me->me_match_id.nid != info->mi_id.nid)
+               return LNET_MATCHMD_NONE;
+
+       if (me->me_match_id.pid != LNET_PID_ANY &&
+           me->me_match_id.pid != info->mi_id.pid)
+               return LNET_MATCHMD_NONE;
+
+       /* mismatched ME matchbits? */
+       if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0)
+               return LNET_MATCHMD_NONE;
+
+       /* Hurrah! This _is_ a match; check it out... */
+
+       if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+               offset = md->md_offset;
+       else
+               offset = info->mi_roffset;
+
+       if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+               mlength = md->md_max_size;
+               LASSERT(md->md_offset + mlength <= md->md_length);
+       } else {
+               mlength = md->md_length - offset;
+       }
+
+       if (info->mi_rlength <= mlength) {      /* fits in allowed space */
+               mlength = info->mi_rlength;
+       } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+               /* this packet _really_ is too big */
+               CERROR("Matching packet from %s, match "LPU64
+                      " length %d too big: %d left, %d allowed\n",
+                      libcfs_id2str(info->mi_id), info->mi_mbits,
+                      info->mi_rlength, md->md_length - offset, mlength);
+
+               return LNET_MATCHMD_DROP;
+       }
+
+       /* Commit to this ME/MD */
+       CDEBUG(D_NET, "Incoming %s index %x from %s of "
+              "length %d/%d into md "LPX64" [%d] + %d\n",
+              (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get",
+              info->mi_portal, libcfs_id2str(info->mi_id), mlength,
+              info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset);
+
+       lnet_msg_attach_md(msg, md, offset, mlength);
+       md->md_offset = offset + mlength;
+
+       if (!lnet_md_exhausted(md))
+               return LNET_MATCHMD_OK;
+
+       /* Auto-unlink NOW, so the ME gets unlinked if required.
+        * We bumped md->md_refcount above so the MD just gets flagged
+        * for unlink when it is finalized. */
+       if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0)
+               lnet_md_unlink(md);
+
+       return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED;
+}
+
+static struct lnet_match_table *
+lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
+{
+       if (LNET_CPT_NUMBER == 1)
+               return ptl->ptl_mtables[0]; /* the only one */
+
+       /* if it's a unique portal, return match-table hashed by NID */
+       return lnet_ptl_is_unique(ptl) ?
+              ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL;
+}
+
+struct lnet_match_table *
+lnet_mt_of_attach(unsigned int index, lnet_process_id_t id,
+                 __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos)
+{
+       struct lnet_portal      *ptl;
+       struct lnet_match_table *mtable;
+
+       /* NB: called w/o lock */
+       LASSERT(index < the_lnet.ln_nportals);
+
+       if (!lnet_ptl_match_type(index, id, mbits, ignore_bits))
+               return NULL;
+
+       ptl = the_lnet.ln_portals[index];
+
+       mtable = lnet_match2mt(ptl, id, mbits);
+       if (mtable != NULL) /* unique portal or only one match-table */
+               return mtable;
+
+       /* it's a wildcard portal */
+       switch (pos) {
+       default:
+               return NULL;
+       case LNET_INS_BEFORE:
+       case LNET_INS_AFTER:
+               /* posted by a non-affinity thread; always hash to a specific
+                * match-table to avoid buffer stealing, which is heavy */
+               return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER];
+       case LNET_INS_LOCAL:
+               /* posted by cpu-affinity thread */
+               return ptl->ptl_mtables[lnet_cpt_current()];
+       }
+}
+
+static struct lnet_match_table *
+lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       struct lnet_match_table *mtable;
+       struct lnet_portal      *ptl;
+       int                     nmaps;
+       int                     rotor;
+       int                     routed;
+       int                     cpt;
+
+       /* NB: called w/o lock */
+       LASSERT(info->mi_portal < the_lnet.ln_nportals);
+       ptl = the_lnet.ln_portals[info->mi_portal];
+
+       LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl));
+
+       mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits);
+       if (mtable != NULL)
+               return mtable;
+
+       /* it's a wildcard portal */
+       routed = LNET_NIDNET(msg->msg_hdr.src_nid) !=
+                LNET_NIDNET(msg->msg_hdr.dest_nid);
+
+       if (portal_rotor == LNET_PTL_ROTOR_OFF ||
+           (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) {
+               cpt = lnet_cpt_current();
+               if (ptl->ptl_mtables[cpt]->mt_enabled)
+                       return ptl->ptl_mtables[cpt];
+       }
+
+       rotor = ptl->ptl_rotor++; /* get round-robin factor */
+       if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
+               cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+       else
+               cpt = rotor % LNET_CPT_NUMBER;
+
+       if (!ptl->ptl_mtables[cpt]->mt_enabled) {
+               /* is there any active entry for this portal? */
+               nmaps = ptl->ptl_mt_nmaps;
+               /* map to an active mtable to avoid heavy "stealing" */
+               if (nmaps != 0) {
+                       /* NB: it is possible that ptl_mt_maps is being
+                        * changed because we are not under the protection
+                        * of lnet_ptl_lock, but it shouldn't hurt anything */
+                       cpt = ptl->ptl_mt_maps[rotor % nmaps];
+               }
+       }
+
+       return ptl->ptl_mtables[cpt];
+}
+
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+       __u64   *bmap;
+       int     i;
+
+       if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+               return 0;
+
+       if (pos < 0) { /* check all bits */
+               for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) {
+                       if (mtable->mt_exhausted[i] != (__u64)(-1))
+                               return 0;
+               }
+               return 1;
+       }
+
+       LASSERT(pos <= LNET_MT_HASH_IGNORE);
+       /* mtable::mt_mhash[pos] is marked as exhausted or not */
+       bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+       pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+       return ((*bmap) & (1ULL << pos)) != 0;
+}
+
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+       __u64   *bmap;
+
+       LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+       LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+       /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */
+       bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+       pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+       if (!exhausted)
+               *bmap &= ~(1ULL << pos);
+       else
+               *bmap |= 1ULL << pos;
+}
+
+struct list_head *
+lnet_mt_match_head(struct lnet_match_table *mtable,
+                  lnet_process_id_t id, __u64 mbits)
+{
+       struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal];
+
+       if (lnet_ptl_is_wildcard(ptl)) {
+               return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
+       } else {
+               unsigned long hash = mbits + id.nid + id.pid;
+
+               LASSERT(lnet_ptl_is_unique(ptl));
+               hash = cfs_hash_long(hash, LNET_MT_HASH_BITS);
+               return &mtable->mt_mhash[hash];
+       }
+}
+
+int
+lnet_mt_match_md(struct lnet_match_table *mtable,
+                struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       struct list_head                *head;
+       lnet_me_t               *me;
+       lnet_me_t               *tmp;
+       int                     exhausted = 0;
+       int                     rc;
+
+       /* any ME with ignore bits? */
+       if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+               head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+       else
+               head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+       /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
+       if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+               exhausted = LNET_MATCHMD_EXHAUSTED;
+
+       list_for_each_entry_safe(me, tmp, head, me_list) {
+               /* ME attached but MD not attached yet */
+               if (me->me_md == NULL)
+                       continue;
+
+               LASSERT(me == me->me_md->md_me);
+
+               rc = lnet_try_match_md(me->me_md, info, msg);
+               if ((rc & LNET_MATCHMD_EXHAUSTED) == 0)
+                       exhausted = 0; /* mlist is not empty */
+
+               if ((rc & LNET_MATCHMD_FINISH) != 0) {
+                       /* don't return EXHAUSTED bit because we don't know
+                        * whether the mlist is empty or not */
+                       return rc & ~LNET_MATCHMD_EXHAUSTED;
+               }
+       }
+
+       if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+               lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+               if (!lnet_mt_test_exhausted(mtable, -1))
+                       exhausted = 0;
+       }
+
+       if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+               head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+               goto again; /* re-check MEs w/o ignore-bits */
+       }
+
+       if (info->mi_opc == LNET_MD_OP_GET ||
+           !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
+               return LNET_MATCHMD_DROP | exhausted;
+
+       return LNET_MATCHMD_NONE | exhausted;
+}
+
+static int
+lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg)
+{
+       int     rc;
+
+       /* the message arrived before any buffer was posted on this portal;
+        * simply delay or drop it */
+       if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)))
+               return 0;
+
+       lnet_ptl_lock(ptl);
+       /* check it again with hold of lock */
+       if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) {
+               lnet_ptl_unlock(ptl);
+               return 0;
+       }
+
+       if (lnet_ptl_is_lazy(ptl)) {
+               if (msg->msg_rx_ready_delay) {
+                       msg->msg_rx_delayed = 1;
+                       list_add_tail(&msg->msg_list,
+                                         &ptl->ptl_msg_delayed);
+               }
+               rc = LNET_MATCHMD_NONE;
+       } else {
+               rc = LNET_MATCHMD_DROP;
+       }
+
+       lnet_ptl_unlock(ptl);
+       return rc;
+}
+
+static int
+lnet_ptl_match_delay(struct lnet_portal *ptl,
+                    struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       int     first = ptl->ptl_mt_maps[0]; /* read w/o lock */
+       int     rc = 0;
+       int     i;
+
+       /* steal a buffer from other CPTs, and delay the message if there is
+        * nothing to steal; this function is more expensive than a regular
+        * match, but we don't expect it to happen often */
+       LASSERT(lnet_ptl_is_wildcard(ptl));
+
+       for (i = 0; i < LNET_CPT_NUMBER; i++) {
+               struct lnet_match_table *mtable;
+               int                     cpt;
+
+               cpt = (first + i) % LNET_CPT_NUMBER;
+               mtable = ptl->ptl_mtables[cpt];
+               if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled)
+                       continue;
+
+               lnet_res_lock(cpt);
+               lnet_ptl_lock(ptl);
+
+               if (i == 0) { /* the first try, attach on stealing list */
+                       list_add_tail(&msg->msg_list,
+                                         &ptl->ptl_msg_stealing);
+               }
+
+               if (!list_empty(&msg->msg_list)) { /* on stealing list */
+                       rc = lnet_mt_match_md(mtable, info, msg);
+
+                       if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 &&
+                           mtable->mt_enabled)
+                               lnet_ptl_disable_mt(ptl, cpt);
+
+                       if ((rc & LNET_MATCHMD_FINISH) != 0)
+                               list_del_init(&msg->msg_list);
+
+               } else {
+                       /* could be matched by lnet_ptl_attach_md()
+                        * which is called by another thread */
+                       rc = msg->msg_md == NULL ?
+                            LNET_MATCHMD_DROP : LNET_MATCHMD_OK;
+               }
+
+               if (!list_empty(&msg->msg_list) && /* not matched yet */
+                   (i == LNET_CPT_NUMBER - 1 || /* the last CPT */
+                    ptl->ptl_mt_nmaps == 0 ||   /* no active CPT */
+                    (ptl->ptl_mt_nmaps == 1 &&  /* the only active CPT */
+                     ptl->ptl_mt_maps[0] == cpt))) {
+                       /* nothing to steal, delay or drop */
+                       list_del_init(&msg->msg_list);
+
+                       if (lnet_ptl_is_lazy(ptl)) {
+                               msg->msg_rx_delayed = 1;
+                               list_add_tail(&msg->msg_list,
+                                                 &ptl->ptl_msg_delayed);
+                               rc = LNET_MATCHMD_NONE;
+                       } else {
+                               rc = LNET_MATCHMD_DROP;
+                       }
+               }
+
+               lnet_ptl_unlock(ptl);
+               lnet_res_unlock(cpt);
+
+               if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed)
+                       break;
+       }
+
+       return rc;
+}
+
+int
+lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       struct lnet_match_table *mtable;
+       struct lnet_portal      *ptl;
+       int                     rc;
+
+       CDEBUG(D_NET, "Request from %s of length %d into portal %d "
+              "MB="LPX64"\n", libcfs_id2str(info->mi_id),
+              info->mi_rlength, info->mi_portal, info->mi_mbits);
+
+       if (info->mi_portal >= the_lnet.ln_nportals) {
+               CERROR("Invalid portal %d not in [0-%d]\n",
+                      info->mi_portal, the_lnet.ln_nportals);
+               return LNET_MATCHMD_DROP;
+       }
+
+       ptl = the_lnet.ln_portals[info->mi_portal];
+       rc = lnet_ptl_match_early(ptl, msg);
+       if (rc != 0) /* matched or delayed early message */
+               return rc;
+
+       mtable = lnet_mt_of_match(info, msg);
+       lnet_res_lock(mtable->mt_cpt);
+
+       if (the_lnet.ln_shutdown) {
+               rc = LNET_MATCHMD_DROP;
+               goto out1;
+       }
+
+       rc = lnet_mt_match_md(mtable, info, msg);
+       if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) {
+               lnet_ptl_lock(ptl);
+               lnet_ptl_disable_mt(ptl, mtable->mt_cpt);
+               lnet_ptl_unlock(ptl);
+       }
+
+       if ((rc & LNET_MATCHMD_FINISH) != 0)    /* matched or dropping */
+               goto out1;
+
+       if (!msg->msg_rx_ready_delay)
+               goto out1;
+
+       LASSERT(lnet_ptl_is_lazy(ptl));
+       LASSERT(!msg->msg_rx_delayed);
+
+       /* NB: we don't expect "delay" to happen often */
+       if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) {
+               lnet_ptl_lock(ptl);
+
+               msg->msg_rx_delayed = 1;
+               list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed);
+
+               lnet_ptl_unlock(ptl);
+               lnet_res_unlock(mtable->mt_cpt);
+
+       } else  {
+               lnet_res_unlock(mtable->mt_cpt);
+               rc = lnet_ptl_match_delay(ptl, info, msg);
+       }
+
+       if (msg->msg_rx_delayed) {
+               CDEBUG(D_NET,
+                      "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n",
+                      info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET",
+                      libcfs_id2str(info->mi_id), info->mi_portal,
+                      info->mi_mbits, info->mi_roffset, info->mi_rlength);
+       }
+       goto out0;
+ out1:
+       lnet_res_unlock(mtable->mt_cpt);
+ out0:
+       /* EXHAUSTED bit is only meaningful for internal functions */
+       return rc & ~LNET_MATCHMD_EXHAUSTED;
+}
+
+void
+lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md)
+{
+       LASSERT(me->me_md == md && md->md_me == me);
+
+       me->me_md = NULL;
+       md->md_me = NULL;
+}
+
+/* called with lnet_res_lock held */
+void
+lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+                  struct list_head *matches, struct list_head *drops)
+{
+       struct lnet_portal      *ptl = the_lnet.ln_portals[me->me_portal];
+       struct lnet_match_table *mtable;
+       struct list_head                *head;
+       lnet_msg_t              *tmp;
+       lnet_msg_t              *msg;
+       int                     exhausted = 0;
+       int                     cpt;
+
+       LASSERT(md->md_refcount == 0); /* a brand new MD */
+
+       me->me_md = md;
+       md->md_me = me;
+
+       cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+       mtable = ptl->ptl_mtables[cpt];
+
+       if (list_empty(&ptl->ptl_msg_stealing) &&
+           list_empty(&ptl->ptl_msg_delayed) &&
+           !lnet_mt_test_exhausted(mtable, me->me_pos))
+               return;
+
+       lnet_ptl_lock(ptl);
+       head = &ptl->ptl_msg_stealing;
+ again:
+       list_for_each_entry_safe(msg, tmp, head, msg_list) {
+               struct lnet_match_info  info;
+               lnet_hdr_t              *hdr;
+               int                     rc;
+
+               LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
+
+               hdr   = &msg->msg_hdr;
+               info.mi_id.nid  = hdr->src_nid;
+               info.mi_id.pid  = hdr->src_pid;
+               info.mi_opc     = LNET_MD_OP_PUT;
+               info.mi_portal  = hdr->msg.put.ptl_index;
+               info.mi_rlength = hdr->payload_length;
+               info.mi_roffset = hdr->msg.put.offset;
+               info.mi_mbits   = hdr->msg.put.match_bits;
+
+               rc = lnet_try_match_md(md, &info, msg);
+
+               exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0;
+               if ((rc & LNET_MATCHMD_NONE) != 0) {
+                       if (exhausted)
+                               break;
+                       continue;
+               }
+
+               /* Hurrah! This _is_ a match */
+               LASSERT((rc & LNET_MATCHMD_FINISH) != 0);
+               list_del_init(&msg->msg_list);
+
+               if (head == &ptl->ptl_msg_stealing) {
+                       if (exhausted)
+                               break;
+                       /* stealing thread will handle the message */
+                       continue;
+               }
+
+               if ((rc & LNET_MATCHMD_OK) != 0) {
+                       list_add_tail(&msg->msg_list, matches);
+
+                       CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+                              "match "LPU64" offset %d length %d.\n",
+                              libcfs_id2str(info.mi_id),
+                              info.mi_portal, info.mi_mbits,
+                              info.mi_roffset, info.mi_rlength);
+               } else {
+                       list_add_tail(&msg->msg_list, drops);
+               }
+
+               if (exhausted)
+                       break;
+       }
+
+       if (!exhausted && head == &ptl->ptl_msg_stealing) {
+               head = &ptl->ptl_msg_delayed;
+               goto again;
+       }
+
+       if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+               lnet_mt_set_exhausted(mtable, me->me_pos, 0);
+               if (!mtable->mt_enabled)
+                       lnet_ptl_enable_mt(ptl, cpt);
+       }
+
+       lnet_ptl_unlock(ptl);
+}
+
+void
+lnet_ptl_cleanup(struct lnet_portal *ptl)
+{
+       struct lnet_match_table *mtable;
+       int                     i;
+
+       if (ptl->ptl_mtables == NULL) /* uninitialized portal */
+               return;
+
+       LASSERT(list_empty(&ptl->ptl_msg_delayed));
+       LASSERT(list_empty(&ptl->ptl_msg_stealing));
+       cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+               struct list_head        *mhash;
+               lnet_me_t       *me;
+               int             j;
+
+               if (mtable->mt_mhash == NULL) /* uninitialized match-table */
+                       continue;
+
+               mhash = mtable->mt_mhash;
+               /* cleanup ME */
+               for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) {
+                       while (!list_empty(&mhash[j])) {
+                               me = list_entry(mhash[j].next,
+                                                   lnet_me_t, me_list);
+                               CERROR("Active ME %p on exit\n", me);
+                               list_del(&me->me_list);
+                               lnet_me_free(me);
+                       }
+               }
+               /* the extra entry is for MEs with ignore bits */
+               LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+       }
+
+       cfs_percpt_free(ptl->ptl_mtables);
+       ptl->ptl_mtables = NULL;
+}
+
+int
+lnet_ptl_setup(struct lnet_portal *ptl, int index)
+{
+       struct lnet_match_table *mtable;
+       struct list_head                *mhash;
+       int                     i;
+       int                     j;
+
+       ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(),
+                                           sizeof(struct lnet_match_table));
+       if (ptl->ptl_mtables == NULL) {
+               CERROR("Failed to create match table for portal %d\n", index);
+               return -ENOMEM;
+       }
+
+       ptl->ptl_index = index;
+       INIT_LIST_HEAD(&ptl->ptl_msg_delayed);
+       INIT_LIST_HEAD(&ptl->ptl_msg_stealing);
+       spin_lock_init(&ptl->ptl_lock);
+       cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+               /* the extra entry is for MEs with ignore bits */
+               LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i,
+                                sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+               if (mhash == NULL) {
+                       CERROR("Failed to create match hash for portal %d\n",
+                              index);
+                       goto failed;
+               }
+
+               memset(&mtable->mt_exhausted[0], -1,
+                      sizeof(mtable->mt_exhausted[0]) *
+                      LNET_MT_EXHAUSTED_BMAP);
+               mtable->mt_mhash = mhash;
+               for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++)
+                       INIT_LIST_HEAD(&mhash[j]);
+
+               mtable->mt_portal = index;
+               mtable->mt_cpt = i;
+       }
+
+       return 0;
+ failed:
+       lnet_ptl_cleanup(ptl);
+       return -ENOMEM;
+}
+
+void
+lnet_portals_destroy(void)
+{
+       int     i;
+
+       if (the_lnet.ln_portals == NULL)
+               return;
+
+       for (i = 0; i < the_lnet.ln_nportals; i++)
+               lnet_ptl_cleanup(the_lnet.ln_portals[i]);
+
+       cfs_array_free(the_lnet.ln_portals);
+       the_lnet.ln_portals = NULL;
+}
+
+int
+lnet_portals_create(void)
+{
+       int     size;
+       int     i;
+
+       size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]);
+
+       the_lnet.ln_nportals = MAX_PORTALS;
+       the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size);
+       if (the_lnet.ln_portals == NULL) {
+               CERROR("Failed to allocate portals table\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < the_lnet.ln_nportals; i++) {
+               if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) {
+                       lnet_portals_destroy();
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * Turn on the lazy portal attribute. Use with caution!
+ *
+ * This portal attribute only affects incoming PUT requests to the portal,
+ * and is off by default. By default, if there's no matching MD for an
+ * incoming PUT request, it is simply dropped. With the lazy attribute on,
+ * such requests are queued indefinitely until either a matching MD is
+ * posted to the portal or the lazy attribute is turned off.
+ *
+ * This prevents requests from being dropped, but it should be regarded as
+ * the last line of defense: users must keep a close watch on the number of
+ * active buffers posted on a lazy portal and, once it becomes too low, post
+ * more buffers as soon as possible. This is because delayed requests usually
+ * have detrimental effects on the underlying network connections. A few
+ * delayed requests often suffice to bring an underlying connection to a
+ * complete halt, due to flow control mechanisms.
+ *
+ * There's also a DOS attack risk. If users don't post match-all MDs on a
+ * lazy portal, a malicious peer can easily stop a service by sending some
+ * PUT requests with match bits that won't match any MD. A routed server is
+ * especially vulnerable since the connections to its neighbor routers are
+ * shared among all clients.
+ *
+ * \param portal Index of the portal to enable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetSetLazyPortal(int portal)
+{
+       struct lnet_portal *ptl;
+
+       if (portal < 0 || portal >= the_lnet.ln_nportals)
+               return -EINVAL;
+
+       CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+       ptl = the_lnet.ln_portals[portal];
+
+       lnet_res_lock(LNET_LOCK_EX);
+       lnet_ptl_lock(ptl);
+
+       lnet_ptl_setopt(ptl, LNET_PTL_LAZY);
+
+       lnet_ptl_unlock(ptl);
+       lnet_res_unlock(LNET_LOCK_EX);
+
+       return 0;
+}
+EXPORT_SYMBOL(LNetSetLazyPortal);
+
+/**
+ * Turn off the lazy portal attribute. Delayed requests on the portal,
+ * if any, will all be dropped when this function returns.
+ *
+ * \param portal Index of the portal to disable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetClearLazyPortal(int portal)
+{
+       struct lnet_portal      *ptl;
+       LIST_HEAD               (zombies);
+
+       if (portal < 0 || portal >= the_lnet.ln_nportals)
+               return -EINVAL;
+
+       ptl = the_lnet.ln_portals[portal];
+
+       lnet_res_lock(LNET_LOCK_EX);
+       lnet_ptl_lock(ptl);
+
+       if (!lnet_ptl_is_lazy(ptl)) {
+               lnet_ptl_unlock(ptl);
+               lnet_res_unlock(LNET_LOCK_EX);
+               return 0;
+       }
+
+       if (the_lnet.ln_shutdown)
+               CWARN("Active lazy portal %d on exit\n", portal);
+       else
+               CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
+
+       /* grab all the blocked messages atomically */
+       list_splice_init(&ptl->ptl_msg_delayed, &zombies);
+
+       lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY);
+
+       lnet_ptl_unlock(ptl);
+       lnet_res_unlock(LNET_LOCK_EX);
+
+       lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr");
+
+       return 0;
+}
+EXPORT_SYMBOL(LNetClearLazyPortal);
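A usage sketch for the pair of calls above follows. It is illustrative only and not part of the patch: SRV_REQUEST_PORTAL and the start/stop hooks are hypothetical, and error handling is minimal. A service would typically enable the lazy attribute around the time it posts its request buffers and clear it on shutdown, accepting that any still-delayed requests are dropped at that point.

/* Illustrative sketch only; the portal index and hooks are hypothetical. */
#define SRV_REQUEST_PORTAL      30      /* hypothetical portal index */

static int my_service_start(void)
{
        int rc;

        rc = LNetSetLazyPortal(SRV_REQUEST_PORTAL);
        if (rc != 0)
                return rc;      /* -EINVAL if the index is out of range */

        /* ... post plenty of match-all request buffers (MDs) here; a lazy
         * portal without them queues unmatched PUTs indefinitely ... */
        return 0;
}

static void my_service_stop(void)
{
        /* Delayed requests, if any, are dropped once this returns. */
        LNetClearLazyPortal(SRV_REQUEST_PORTAL);
}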
diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c
new file mode 100644 (file)
index 0000000..670dae3
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+int
+lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+       LASSERT (!lntmsg->msg_routing);
+       LASSERT (!lntmsg->msg_target_is_router);
+
+       return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+}
+
+int
+lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+           int delayed, unsigned int niov,
+           struct iovec *iov, lnet_kiov_t *kiov,
+           unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       lnet_msg_t *sendmsg = private;
+
+       if (lntmsg != NULL) {              /* not discarding */
+               if (sendmsg->msg_iov != NULL) {
+                       if (iov != NULL)
+                               lnet_copy_iov2iov(niov, iov, offset,
+                                                 sendmsg->msg_niov,
+                                                 sendmsg->msg_iov,
+                                                 sendmsg->msg_offset, mlen);
+                       else
+                               lnet_copy_iov2kiov(niov, kiov, offset,
+                                                  sendmsg->msg_niov,
+                                                  sendmsg->msg_iov,
+                                                  sendmsg->msg_offset, mlen);
+               } else {
+                       if (iov != NULL)
+                               lnet_copy_kiov2iov(niov, iov, offset,
+                                                  sendmsg->msg_niov,
+                                                  sendmsg->msg_kiov,
+                                                  sendmsg->msg_offset, mlen);
+                       else
+                               lnet_copy_kiov2kiov(niov, kiov, offset,
+                                                   sendmsg->msg_niov,
+                                                   sendmsg->msg_kiov,
+                                                   sendmsg->msg_offset, mlen);
+               }
+
+               lnet_finalize(ni, lntmsg, 0);
+       }
+
+       lnet_finalize(ni, sendmsg, 0);
+       return 0;
+}
+
+static int lolnd_instanced;
+
+void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+       CDEBUG (D_NET, "shutdown\n");
+       LASSERT (lolnd_instanced);
+
+       lolnd_instanced = 0;
+}
+
+int
+lolnd_startup (lnet_ni_t *ni)
+{
+       LASSERT (ni->ni_lnd == &the_lolnd);
+       LASSERT (!lolnd_instanced);
+       lolnd_instanced = 1;
+
+       return (0);
+}
+
+lnd_t the_lolnd = {
+       /* .lnd_list       = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+       /* .lnd_refcount   = */ 0,
+       /* .lnd_type       = */ LOLND,
+       /* .lnd_startup    = */ lolnd_startup,
+       /* .lnd_shutdown   = */ lolnd_shutdown,
+       /* .lnd_ctl       = */ NULL,
+       /* .lnd_send       = */ lolnd_send,
+       /* .lnd_recv       = */ lolnd_recv,
+       /* .lnd_eager_recv = */ NULL,
+       /* .lnd_notify     = */ NULL,
+       /* .lnd_accept     = */ NULL
+};
diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c
new file mode 100644 (file)
index 0000000..c832385
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+static int config_on_load = 0;
+CFS_MODULE_PARM(config_on_load, "i", int, 0444,
+               "configure network at module load");
+
+static struct mutex lnet_config_mutex;
+
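+/*
+ * Bring the network up under lnet_config_mutex; ln_niinit_self records
+ * whether this module did the initialization, so lnet_unconfigure()
+ * only tears down what was set up here.
+ */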
+int
+lnet_configure (void *arg)
+{
+       /* 'arg' is only there so this can be passed to kthread_run() */
+       int    rc = 0;
+
+       LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+       if (!the_lnet.ln_niinit_self) {
+               rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+               if (rc >= 0) {
+                       the_lnet.ln_niinit_self = 1;
+                       rc = 0;
+               }
+       }
+
+       LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+       return rc;
+}
+
+int
+lnet_unconfigure (void)
+{
+       int   refcount;
+
+       LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+       if (the_lnet.ln_niinit_self) {
+               the_lnet.ln_niinit_self = 0;
+               LNetNIFini();
+       }
+
+       LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+       refcount = the_lnet.ln_refcount;
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+
+       LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+       return (refcount == 0) ? 0 : -EBUSY;
+}
+
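+/*
+ * ioctl dispatcher registered with libcfs: CONFIGURE/UNCONFIGURE are
+ * handled locally, anything else takes a temporary ref on the network
+ * via LNetNIInit() and is passed through to LNetCtl().
+ */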
+int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+       int   rc;
+
+       switch (cmd) {
+       case IOC_LIBCFS_CONFIGURE:
+               return lnet_configure(NULL);
+
+       case IOC_LIBCFS_UNCONFIGURE:
+               return lnet_unconfigure();
+
+       default:
+               /* Passing LNET_PID_ANY only gives me a ref if the net is up
+                * already; I'll need it to ensure the net can't go down while
+                * I'm called into it */
+               rc = LNetNIInit(LNET_PID_ANY);
+               if (rc >= 0) {
+                       rc = LNetCtl(cmd, data);
+                       LNetNIFini();
+               }
+               return rc;
+       }
+}
+
+DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
+
+int
+init_lnet(void)
+{
+       int               rc;
+       ENTRY;
+
+       mutex_init(&lnet_config_mutex);
+
+       rc = LNetInit();
+       if (rc != 0) {
+               CERROR("LNetInit: error %d\n", rc);
+               RETURN(rc);
+       }
+
+       rc = libcfs_register_ioctl(&lnet_ioctl_handler);
+       LASSERT (rc == 0);
+
+       if (config_on_load) {
+               /* Have to schedule a separate thread to avoid deadlocking
+                * in modload */
+               (void) kthread_run(lnet_configure, NULL, "lnet_initd");
+       }
+
+       RETURN(0);
+}
+
+void
+fini_lnet(void)
+{
+       int rc;
+
+       rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
+       LASSERT (rc == 0);
+
+       LNetFini();
+}
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "1.0.0", init_lnet, fini_lnet);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
new file mode 100644 (file)
index 0000000..2869776
--- /dev/null
@@ -0,0 +1,337 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/peer.c
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
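+/*
+ * Create one peer table per CPT, each with a hash of
+ * LNET_PEER_HASH_SIZE list heads plus a deathrow list used to recycle
+ * freed lnet_peer_t structures.
+ */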
+int
+lnet_peer_tables_create(void)
+{
+       struct lnet_peer_table  *ptable;
+       struct list_head                *hash;
+       int                     i;
+       int                     j;
+
+       the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(),
+                                                  sizeof(*ptable));
+       if (the_lnet.ln_peer_tables == NULL) {
+               CERROR("Failed to allocate cpu-partition peer tables\n");
+               return -ENOMEM;
+       }
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               INIT_LIST_HEAD(&ptable->pt_deathrow);
+
+               LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i,
+                                LNET_PEER_HASH_SIZE * sizeof(*hash));
+               if (hash == NULL) {
+                       CERROR("Failed to create peer hash table\n");
+                       lnet_peer_tables_destroy();
+                       return -ENOMEM;
+               }
+
+               for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+                       INIT_LIST_HEAD(&hash[j]);
+               ptable->pt_hash = hash; /* sign of initialization */
+       }
+
+       return 0;
+}
+
+void
+lnet_peer_tables_destroy(void)
+{
+       struct lnet_peer_table  *ptable;
+       struct list_head                *hash;
+       int                     i;
+       int                     j;
+
+       if (the_lnet.ln_peer_tables == NULL)
+               return;
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               hash = ptable->pt_hash;
+               if (hash == NULL) /* not initialized */
+                       break;
+
+               LASSERT(list_empty(&ptable->pt_deathrow));
+
+               ptable->pt_hash = NULL;
+               for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+                       LASSERT(list_empty(&hash[j]));
+
+               LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+       }
+
+       cfs_percpt_free(the_lnet.ln_peer_tables);
+       the_lnet.ln_peer_tables = NULL;
+}
+
+void
+lnet_peer_tables_cleanup(void)
+{
+       struct lnet_peer_table  *ptable;
+       int                     i;
+       int                     j;
+
+       LASSERT(the_lnet.ln_shutdown);  /* i.e. no new peers */
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               lnet_net_lock(i);
+
+               for (j = 0; j < LNET_PEER_HASH_SIZE; j++) {
+                       struct list_head *peers = &ptable->pt_hash[j];
+
+                       while (!list_empty(peers)) {
+                               lnet_peer_t *lp = list_entry(peers->next,
+                                                                lnet_peer_t,
+                                                                lp_hashlist);
+                               list_del_init(&lp->lp_hashlist);
+                               /* lose hash table's ref */
+                               lnet_peer_decref_locked(lp);
+                       }
+               }
+
+               lnet_net_unlock(i);
+       }
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               LIST_HEAD       (deathrow);
+               lnet_peer_t     *lp;
+
+               lnet_net_lock(i);
+
+               for (j = 3; ptable->pt_number != 0; j++) {
+                       lnet_net_unlock(i);
+
+                       if ((j & (j - 1)) == 0) {
+                               CDEBUG(D_WARNING,
+                                      "Waiting for %d peers on peer table\n",
+                                      ptable->pt_number);
+                       }
+                       cfs_pause(cfs_time_seconds(1) / 2);
+                       lnet_net_lock(i);
+               }
+               list_splice_init(&ptable->pt_deathrow, &deathrow);
+
+               lnet_net_unlock(i);
+
+               while (!list_empty(&deathrow)) {
+                       lp = list_entry(deathrow.next,
+                                           lnet_peer_t, lp_hashlist);
+                       list_del(&lp->lp_hashlist);
+                       LIBCFS_FREE(lp, sizeof(*lp));
+               }
+       }
+}
+
+void
+lnet_destroy_peer_locked(lnet_peer_t *lp)
+{
+       struct lnet_peer_table *ptable;
+
+       LASSERT(lp->lp_refcount == 0);
+       LASSERT(lp->lp_rtr_refcount == 0);
+       LASSERT(list_empty(&lp->lp_txq));
+       LASSERT(list_empty(&lp->lp_hashlist));
+       LASSERT(lp->lp_txqnob == 0);
+
+       ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
+       LASSERT(ptable->pt_number > 0);
+       ptable->pt_number--;
+
+       lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);
+       lp->lp_ni = NULL;
+
+       list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+}
+
+lnet_peer_t *
+lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+       struct list_head        *peers;
+       lnet_peer_t     *lp;
+
+       LASSERT(!the_lnet.ln_shutdown);
+
+       peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+       list_for_each_entry(lp, peers, lp_hashlist) {
+               if (lp->lp_nid == nid) {
+                       lnet_peer_addref_locked(lp);
+                       return lp;
+               }
+       }
+
+       return NULL;
+}
+
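+/*
+ * Find the peer for 'nid' in its CPT's table, creating it if necessary:
+ * a struct is reused from the deathrow list when possible, the net lock
+ * is dropped around allocation, and the lookup is retried afterwards in
+ * case another thread added the peer first.
+ */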
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+{
+       struct lnet_peer_table  *ptable;
+       lnet_peer_t             *lp = NULL;
+       lnet_peer_t             *lp2;
+       int                     cpt2;
+       int                     rc = 0;
+
+       *lpp = NULL;
+       if (the_lnet.ln_shutdown) /* it's shutting down */
+               return -ESHUTDOWN;
+
+       /* cpt can be LNET_LOCK_EX if it's called from router functions */
+       cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+
+       ptable = the_lnet.ln_peer_tables[cpt2];
+       lp = lnet_find_peer_locked(ptable, nid);
+       if (lp != NULL) {
+               *lpp = lp;
+               return 0;
+       }
+
+       if (!list_empty(&ptable->pt_deathrow)) {
+               lp = list_entry(ptable->pt_deathrow.next,
+                                   lnet_peer_t, lp_hashlist);
+               list_del(&lp->lp_hashlist);
+       }
+
+       /*
+        * take an extra refcount in case another thread has shut down LNet
+        * and destroyed the locks and peer table before I finish the allocation
+        */
+       ptable->pt_number++;
+       lnet_net_unlock(cpt);
+
+       if (lp != NULL)
+               memset(lp, 0, sizeof(*lp));
+       else
+               LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
+
+       if (lp == NULL) {
+               rc = -ENOMEM;
+               lnet_net_lock(cpt);
+               goto out;
+       }
+
+       INIT_LIST_HEAD(&lp->lp_txq);
+       INIT_LIST_HEAD(&lp->lp_rtrq);
+       INIT_LIST_HEAD(&lp->lp_routes);
+
+       lp->lp_notify = 0;
+       lp->lp_notifylnd = 0;
+       lp->lp_notifying = 0;
+       lp->lp_alive_count = 0;
+       lp->lp_timestamp = 0;
+       lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+       lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+       lp->lp_last_query = 0; /* haven't asked NI yet */
+       lp->lp_ping_timestamp = 0;
+       lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
+       lp->lp_nid = nid;
+       lp->lp_cpt = cpt2;
+       lp->lp_refcount = 2;    /* 1 for caller; 1 for hash */
+       lp->lp_rtr_refcount = 0;
+
+       lnet_net_lock(cpt);
+
+       if (the_lnet.ln_shutdown) {
+               rc = -ESHUTDOWN;
+               goto out;
+       }
+
+       lp2 = lnet_find_peer_locked(ptable, nid);
+       if (lp2 != NULL) {
+               *lpp = lp2;
+               goto out;
+       }
+
+       lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
+       if (lp->lp_ni == NULL) {
+               rc = -EHOSTUNREACH;
+               goto out;
+       }
+
+       lp->lp_txcredits    =
+       lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+       lp->lp_rtrcredits    =
+       lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+
+       list_add_tail(&lp->lp_hashlist,
+                         &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+       ptable->pt_version++;
+       *lpp = lp;
+
+       return 0;
+out:
+       if (lp != NULL)
+               list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+       ptable->pt_number--;
+       return rc;
+}
+
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+       char            *aliveness = "NA";
+       lnet_peer_t     *lp;
+       int             rc;
+       int             cpt;
+
+       cpt = lnet_cpt_of_nid(nid);
+       lnet_net_lock(cpt);
+
+       rc = lnet_nid2peer_locked(&lp, nid, cpt);
+       if (rc != 0) {
+               lnet_net_unlock(cpt);
+               CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+               return;
+       }
+
+       if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
+               aliveness = lp->lp_alive ? "up" : "down";
+
+       CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+              libcfs_nid2str(lp->lp_nid), lp->lp_refcount,
+              aliveness, lp->lp_ni->ni_peertxcredits,
+              lp->lp_rtrcredits, lp->lp_minrtrcredits,
+              lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+
+       lnet_peer_decref_locked(lp);
+
+       lnet_net_unlock(cpt);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
new file mode 100644 (file)
index 0000000..c5ff97a
--- /dev/null
@@ -0,0 +1,1693 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+#define LNET_NRB_TINY_MIN      512     /* min value for each CPT */
+#define LNET_NRB_TINY          (LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN     4096    /* min value for each CPT */
+#define LNET_NRB_SMALL         (LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_LARGE_MIN     256     /* min value for each CPT */
+#define LNET_NRB_LARGE         (LNET_NRB_LARGE_MIN * 4)
+
+static char *forwarding = "";
+CFS_MODULE_PARM(forwarding, "s", charp, 0444,
+               "Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444,
+               "# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+CFS_MODULE_PARM(small_router_buffers, "i", int, 0444,
+               "# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+CFS_MODULE_PARM(large_router_buffers, "i", int, 0444,
+               "# of large messages to buffer in the router");
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+               "# router buffer credits per peer");
+
+static int auto_down = 1;
+CFS_MODULE_PARM(auto_down, "i", int, 0444,
+               "Automatically mark peers down on comms error");
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+       /* NI option overrides LNet default */
+       if (ni->ni_peerrtrcredits > 0)
+               return ni->ni_peerrtrcredits;
+       if (peer_buffer_credits > 0)
+               return peer_buffer_credits;
+
+       /* As an approximation, allow this peer the same number of router
+        * buffers as it is allowed outstanding sends */
+       return ni->ni_peertxcredits;
+}
+
+/* forward ref's */
+static int lnet_router_checker(void *);
+#else
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+       return 0;
+}
+
+#endif
+
+static int check_routers_before_use = 0;
+CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
+               "Assume routers are down and ping them before use");
+
+static int avoid_asym_router_failure = 1;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
+               "Avoid asymmetrical router failures (0 to disable)");
+
+static int dead_router_check_interval = 60;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644,
+               "Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 60;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644,
+               "Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644,
+               "Seconds to wait for the reply to a router health query");
+
+int
+lnet_peers_start_down(void)
+{
+       return check_routers_before_use;
+}
+
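+/*
+ * Record an aliveness report for a peer: stale reports (older than
+ * lp_timestamp) and repeats of already-known state are ignored,
+ * otherwise the new state is latched and a pending notification is
+ * flagged for lnet_ni_notify_locked() to deliver.
+ */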
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
+{
+       if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
+               CDEBUG(D_NET, "Out of date\n");
+               return;
+       }
+
+       lp->lp_timestamp = when;                /* update timestamp */
+       lp->lp_ping_deadline = 0;              /* disable ping timeout */
+
+       if (lp->lp_alive_count != 0 &&    /* got old news */
+           (!lp->lp_alive) == (!alive)) {      /* new date for old news */
+               CDEBUG(D_NET, "Old news\n");
+               return;
+       }
+
+       /* Flag that notification is outstanding */
+
+       lp->lp_alive_count++;
+       lp->lp_alive = !(!alive);              /* 1 bit! */
+       lp->lp_notify = 1;
+       lp->lp_notifylnd |= notifylnd;
+       if (lp->lp_alive)
+               lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+       CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+void
+lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+       int     alive;
+       int     notifylnd;
+
+       /* Notify only in 1 thread at any time to ensure ordered notification.
+        * NB individual events can be missed; the only guarantee is that you
+        * always get the most recent news */
+
+       if (lp->lp_notifying)
+               return;
+
+       lp->lp_notifying = 1;
+
+       while (lp->lp_notify) {
+               alive     = lp->lp_alive;
+               notifylnd = lp->lp_notifylnd;
+
+               lp->lp_notifylnd = 0;
+               lp->lp_notify    = 0;
+
+               if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
+                       lnet_net_unlock(lp->lp_cpt);
+
+                       /* A new notification could happen now; I'll handle it
+                        * when control returns to me */
+
+                       (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+
+                       lnet_net_lock(lp->lp_cpt);
+               }
+       }
+
+       lp->lp_notifying = 0;
+}
+
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+       LASSERT(lp->lp_refcount > 0);
+       LASSERT(lp->lp_rtr_refcount >= 0);
+
+       /* lnet_net_lock must be exclusively locked */
+       lp->lp_rtr_refcount++;
+       if (lp->lp_rtr_refcount == 1) {
+               struct list_head *pos;
+
+               /* a simple insertion sort */
+               list_for_each_prev(pos, &the_lnet.ln_routers) {
+                       lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
+                                                         lp_rtr_list);
+
+                       if (rtr->lp_nid < lp->lp_nid)
+                               break;
+               }
+
+               list_add(&lp->lp_rtr_list, pos);
+               /* addref for the_lnet.ln_routers */
+               lnet_peer_addref_locked(lp);
+               the_lnet.ln_routers_version++;
+       }
+}
+
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+       LASSERT(lp->lp_refcount > 0);
+       LASSERT(lp->lp_rtr_refcount > 0);
+
+       /* lnet_net_lock must be exclusively locked */
+       lp->lp_rtr_refcount--;
+       if (lp->lp_rtr_refcount == 0) {
+               LASSERT(list_empty(&lp->lp_routes));
+
+               if (lp->lp_rcd != NULL) {
+                       list_add(&lp->lp_rcd->rcd_list,
+                                    &the_lnet.ln_rcd_deathrow);
+                       lp->lp_rcd = NULL;
+               }
+
+               list_del(&lp->lp_rtr_list);
+               /* decref for the_lnet.ln_routers */
+               lnet_peer_decref_locked(lp);
+               the_lnet.ln_routers_version++;
+       }
+}
+
+lnet_remotenet_t *
+lnet_find_net_locked (__u32 net)
+{
+       lnet_remotenet_t        *rnet;
+       struct list_head                *tmp;
+       struct list_head                *rn_list;
+
+       LASSERT(!the_lnet.ln_shutdown);
+
+       rn_list = lnet_net2rnethash(net);
+       list_for_each(tmp, rn_list) {
+               rnet = list_entry(tmp, lnet_remotenet_t, lrn_list);
+
+               if (rnet->lrn_net == net)
+                       return rnet;
+       }
+       return NULL;
+}
+
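+/*
+ * Seed the PRNG once, mixing random bytes with the local non-loopback
+ * NIDs and the current time so that the random offsets used when
+ * inserting routes differ between nodes.
+ */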
+static void lnet_shuffle_seed(void)
+{
+       static int seeded = 0;
+       int lnd_type, seed[2];
+       struct timeval tv;
+       lnet_ni_t *ni;
+       struct list_head *tmp;
+
+       if (seeded)
+               return;
+
+       cfs_get_random_bytes(seed, sizeof(seed));
+
+       /* Nodes with small feet have little entropy;
+        * the NID for this node gives the most entropy in the low bits */
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+               lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+               if (lnd_type != LOLND)
+                       seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
+       }
+
+       do_gettimeofday(&tv);
+       cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+       seeded = 1;
+       return;
+}
+
+/* NB expects LNET_LOCK held */
+void
+lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route)
+{
+       unsigned int      len = 0;
+       unsigned int      offset = 0;
+       struct list_head       *e;
+
+       lnet_shuffle_seed();
+
+       list_for_each (e, &rnet->lrn_routes) {
+               len++;
+       }
+
+       /* len+1 positions to add a new entry, also prevents division by 0 */
+       offset = cfs_rand() % (len + 1);
+       list_for_each (e, &rnet->lrn_routes) {
+               if (offset == 0)
+                       break;
+               offset--;
+       }
+       list_add(&route->lr_list, e);
+       list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+
+       the_lnet.ln_remote_nets_version++;
+       lnet_rtr_addref_locked(route->lr_gateway);
+}
+
+int
+lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
+{
+       struct list_head          *e;
+       lnet_remotenet_t    *rnet;
+       lnet_remotenet_t    *rnet2;
+       lnet_route_t    *route;
+       lnet_ni_t          *ni;
+       int               add_route;
+       int               rc;
+
+       CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+              libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
+       if (gateway == LNET_NID_ANY ||
+           LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+           net == LNET_NIDNET(LNET_NID_ANY) ||
+           LNET_NETTYP(net) == LOLND ||
+           LNET_NIDNET(gateway) == net ||
+           hops < 1 || hops > 255)
+               return (-EINVAL);
+
+       if (lnet_islocalnet(net))              /* it's a local network */
+               return 0;                      /* ignore the route entry */
+
+       /* Assume net, route, all new */
+       LIBCFS_ALLOC(route, sizeof(*route));
+       LIBCFS_ALLOC(rnet, sizeof(*rnet));
+       if (route == NULL || rnet == NULL) {
+               CERROR("Out of memory creating route %s %d %s\n",
+                      libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+               if (route != NULL)
+                       LIBCFS_FREE(route, sizeof(*route));
+               if (rnet != NULL)
+                       LIBCFS_FREE(rnet, sizeof(*rnet));
+               return -ENOMEM;
+       }
+
+       INIT_LIST_HEAD(&rnet->lrn_routes);
+       rnet->lrn_net = net;
+       route->lr_hops = hops;
+       route->lr_net = net;
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
+       if (rc != 0) {
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               LIBCFS_FREE(route, sizeof(*route));
+               LIBCFS_FREE(rnet, sizeof(*rnet));
+
+               if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */
+                       return 0;       /* ignore the route entry */
+               } else {
+                       CERROR("Error %d creating route %s %d %s\n", rc,
+                              libcfs_net2str(net), hops,
+                              libcfs_nid2str(gateway));
+               }
+               return rc;
+       }
+
+       LASSERT (!the_lnet.ln_shutdown);
+
+       rnet2 = lnet_find_net_locked(net);
+       if (rnet2 == NULL) {
+               /* new network */
+               list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
+               rnet2 = rnet;
+       }
+
+       /* Search for a duplicate route (it's a NOOP if it is) */
+       add_route = 1;
+       list_for_each (e, &rnet2->lrn_routes) {
+               lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
+
+               if (route2->lr_gateway == route->lr_gateway) {
+                       add_route = 0;
+                       break;
+               }
+
+               /* our lookups must be true */
+               LASSERT (route2->lr_gateway->lp_nid != gateway);
+       }
+
+       if (add_route) {
+               lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+               lnet_add_route_to_rnet(rnet2, route);
+
+               ni = route->lr_gateway->lp_ni;
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               /* XXX Assume alive */
+               if (ni->ni_lnd->lnd_notify != NULL)
+                       (ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+               lnet_net_lock(LNET_LOCK_EX);
+       }
+
+       /* -1 for notify or !add_route */
+       lnet_peer_decref_locked(route->lr_gateway);
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       if (!add_route)
+               LIBCFS_FREE(route, sizeof(*route));
+
+       if (rnet != rnet2)
+               LIBCFS_FREE(rnet, sizeof(*rnet));
+
+       return 0;
+}
+
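+/*
+ * Sanity-check the route table: all routes to the same remote net must
+ * go through gateways on the same local NI, otherwise -EINVAL is
+ * returned.
+ */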
+int
+lnet_check_routes(void)
+{
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *route;
+       lnet_route_t            *route2;
+       struct list_head                *e1;
+       struct list_head                *e2;
+       int                     cpt;
+       struct list_head                *rn_list;
+       int                     i;
+
+       cpt = lnet_net_lock_current();
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+               rn_list = &the_lnet.ln_remote_nets_hash[i];
+               list_for_each(e1, rn_list) {
+                       rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+                       route2 = NULL;
+                       list_for_each(e2, &rnet->lrn_routes) {
+                               lnet_nid_t      nid1;
+                               lnet_nid_t      nid2;
+                               int             net;
+
+                               route = list_entry(e2, lnet_route_t,
+                                                      lr_list);
+
+                               if (route2 == NULL) {
+                                       route2 = route;
+                                       continue;
+                               }
+
+                               if (route->lr_gateway->lp_ni ==
+                                   route2->lr_gateway->lp_ni)
+                                       continue;
+
+                               nid1 = route->lr_gateway->lp_nid;
+                               nid2 = route2->lr_gateway->lp_nid;
+                               net = rnet->lrn_net;
+
+                               lnet_net_unlock(cpt);
+
+                               CERROR("Routes to %s via %s and %s not supported\n",
+                                      libcfs_net2str(net),
+                                      libcfs_nid2str(nid1),
+                                      libcfs_nid2str(nid2));
+                               return -EINVAL;
+                       }
+               }
+       }
+
+       lnet_net_unlock(cpt);
+       return 0;
+}
+
+int
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+{
+       struct lnet_peer        *gateway;
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *route;
+       struct list_head                *e1;
+       struct list_head                *e2;
+       int                     rc = -ENOENT;
+       struct list_head                *rn_list;
+       int                     idx = 0;
+
+       CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+              libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+       /* NB Caller may specify either all routes via the given gateway
+        * or a specific route entry (actual NIDs) */
+
+       lnet_net_lock(LNET_LOCK_EX);
+       if (net == LNET_NIDNET(LNET_NID_ANY))
+               rn_list = &the_lnet.ln_remote_nets_hash[0];
+       else
+               rn_list = lnet_net2rnethash(net);
+
+ again:
+       list_for_each(e1, rn_list) {
+               rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+               if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+                       net == rnet->lrn_net))
+                       continue;
+
+               list_for_each(e2, &rnet->lrn_routes) {
+                       route = list_entry(e2, lnet_route_t, lr_list);
+
+                       gateway = route->lr_gateway;
+                       if (!(gw_nid == LNET_NID_ANY ||
+                             gw_nid == gateway->lp_nid))
+                               continue;
+
+                       list_del(&route->lr_list);
+                       list_del(&route->lr_gwlist);
+                       the_lnet.ln_remote_nets_version++;
+
+                       if (list_empty(&rnet->lrn_routes))
+                               list_del(&rnet->lrn_list);
+                       else
+                               rnet = NULL;
+
+                       lnet_rtr_decref_locked(gateway);
+                       lnet_peer_decref_locked(gateway);
+
+                       lnet_net_unlock(LNET_LOCK_EX);
+
+                       LIBCFS_FREE(route, sizeof(*route));
+
+                       if (rnet != NULL)
+                               LIBCFS_FREE(rnet, sizeof(*rnet));
+
+                       rc = 0;
+                       lnet_net_lock(LNET_LOCK_EX);
+                       goto again;
+               }
+       }
+
+       if (net == LNET_NIDNET(LNET_NID_ANY) &&
+           ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+               rn_list = &the_lnet.ln_remote_nets_hash[idx];
+               goto again;
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       return rc;
+}
+
+void
+lnet_destroy_routes (void)
+{
+       lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops,
+              lnet_nid_t *gateway, __u32 *alive)
+{
+       struct list_head                *e1;
+       struct list_head                *e2;
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *route;
+       int                     cpt;
+       int                     i;
+       struct list_head                *rn_list;
+
+       cpt = lnet_net_lock_current();
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+               rn_list = &the_lnet.ln_remote_nets_hash[i];
+               list_for_each(e1, rn_list) {
+                       rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+                       list_for_each(e2, &rnet->lrn_routes) {
+                               route = list_entry(e2, lnet_route_t,
+                                                      lr_list);
+
+                               if (idx-- == 0) {
+                                       *net     = rnet->lrn_net;
+                                       *hops    = route->lr_hops;
+                                       *gateway = route->lr_gateway->lp_nid;
+                                       *alive   = route->lr_gateway->lp_alive;
+                                       lnet_net_unlock(cpt);
+                                       return 0;
+                               }
+                       }
+               }
+       }
+
+       lnet_net_unlock(cpt);
+       return -ENOENT;
+}
+
+void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+       int            i;
+       lnet_ni_status_t *stat;
+
+       __swab32s(&info->pi_magic);
+       __swab32s(&info->pi_features);
+       __swab32s(&info->pi_pid);
+       __swab32s(&info->pi_nnis);
+       for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+               stat = &info->pi_ni[i];
+               __swab64s(&stat->ns_nid);
+               __swab32s(&stat->ns_status);
+       }
+       return;
+}
+
+/**
+ * parse router-checker pinginfo, record number of down NIs for remote
+ * networks on that router.
+ */
+static void
+lnet_parse_rc_info(lnet_rc_data_t *rcd)
+{
+       lnet_ping_info_t        *info = rcd->rcd_pinginfo;
+       struct lnet_peer        *gw   = rcd->rcd_gateway;
+       lnet_route_t            *rtr;
+
+       if (!gw->lp_alive)
+               return;
+
+       if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
+               lnet_swap_pinginfo(info);
+
+       /* NB always racing with network! */
+       if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+               CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
+                      libcfs_nid2str(gw->lp_nid), info->pi_magic);
+               gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+               return;
+       }
+
+       gw->lp_ping_feats = info->pi_features;
+       if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+               CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+                      libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+               return; /* nothing I can understand */
+       }
+
+       if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+               return; /* can't carry NI status info */
+
+       list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
+               int     ptl_status = LNET_NI_STATUS_INVALID;
+               int     down = 0;
+               int     up = 0;
+               int     i;
+
+               for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+                       lnet_ni_status_t *stat = &info->pi_ni[i];
+                       lnet_nid_t       nid = stat->ns_nid;
+
+                       if (nid == LNET_NID_ANY) {
+                               CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
+                                      libcfs_nid2str(gw->lp_nid));
+                               gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+                               return;
+                       }
+
+                       if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+                               continue;
+
+                       if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+                               if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
+                                       down++;
+                               else if (ptl_status != LNET_NI_STATUS_UP)
+                                       ptl_status = LNET_NI_STATUS_DOWN;
+                               continue;
+                       }
+
+                       if (stat->ns_status == LNET_NI_STATUS_UP) {
+                               if (LNET_NIDNET(nid) == rtr->lr_net) {
+                                       up = 1;
+                                       break;
+                               }
+                               /* ptl NIs are considered down only when
+                                * they're all down */
+                               if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+                                       ptl_status = LNET_NI_STATUS_UP;
+                               continue;
+                       }
+
+                       CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
+                              libcfs_nid2str(gw->lp_nid), stat->ns_status);
+                       gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+                       return;
+               }
+
+               if (up) { /* ignore downed NIs if NI for dest network is up */
+                       rtr->lr_downis = 0;
+                       continue;
+               }
+               rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
+       }
+}
+
+static void
+lnet_router_checker_event(lnet_event_t *event)
+{
+       lnet_rc_data_t          *rcd = event->md.user_ptr;
+       struct lnet_peer        *lp;
+
+       LASSERT(rcd != NULL);
+
+       if (event->unlinked) {
+               LNetInvalidateHandle(&rcd->rcd_mdh);
+               return;
+       }
+
+       LASSERT(event->type == LNET_EVENT_SEND ||
+               event->type == LNET_EVENT_REPLY);
+
+       lp = rcd->rcd_gateway;
+       LASSERT(lp != NULL);
+
+       /* NB: this is called while holding lnet_res_lock; a few places
+        * need to hold both locks at the same time, so take care of the
+        * lock ordering */
+       lnet_net_lock(lp->lp_cpt);
+       if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+               /* ignore if no longer a router or rcd is replaced */
+               goto out;
+       }
+
+       if (event->type == LNET_EVENT_SEND) {
+               lp->lp_ping_notsent = 0;
+               if (event->status == 0)
+                       goto out;
+       }
+
+       /* LNET_EVENT_REPLY */
+       /* A successful REPLY means the router is up.  If _any_ comms
+        * to the router fail I assume it's down (this will happen if
+        * we ping alive routers to try to detect router death before
+        * apps get burned). */
+
+       lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
+       /* The router checker will wake up very shortly and do the
+        * actual notification.
+        * XXX If 'lp' stops being a router before then, it will still
+        * have the notification pending!!! */
+
+       if (avoid_asym_router_failure && event->status == 0)
+               lnet_parse_rc_info(rcd);
+
+ out:
+       lnet_net_unlock(lp->lp_cpt);
+}
+
+void
+lnet_wait_known_routerstate(void)
+{
+       lnet_peer_t      *rtr;
+       struct list_head          *entry;
+       int               all_known;
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+       for (;;) {
+               int     cpt = lnet_net_lock_current();
+
+               all_known = 1;
+               list_for_each (entry, &the_lnet.ln_routers) {
+                       rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+                       if (rtr->lp_alive_count == 0) {
+                               all_known = 0;
+                               break;
+                       }
+               }
+
+               lnet_net_unlock(cpt);
+
+               if (all_known)
+                       return;
+
+               cfs_pause(cfs_time_seconds(1));
+       }
+}
+
+void
+lnet_update_ni_status_locked(void)
+{
+       lnet_ni_t       *ni;
+       long            now;
+       int             timeout;
+
+       LASSERT(the_lnet.ln_routing);
+
+       timeout = router_ping_timeout +
+                 MAX(live_router_check_interval, dead_router_check_interval);
+
+       now = cfs_time_current_sec();
+       list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+               if (ni->ni_lnd->lnd_type == LOLND)
+                       continue;
+
+               if (now < ni->ni_last_alive + timeout)
+                       continue;
+
+               lnet_ni_lock(ni);
+               /* re-check with lock */
+               if (now < ni->ni_last_alive + timeout) {
+                       lnet_ni_unlock(ni);
+                       continue;
+               }
+
+               LASSERT(ni->ni_status != NULL);
+
+               if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
+                       CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
+                              libcfs_nid2str(ni->ni_nid), timeout);
+                       /* NB: so far, this is the only place to set
+                        * NI status to "down" */
+                       ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+               }
+               lnet_ni_unlock(ni);
+       }
+}
+
+void
+lnet_destroy_rc_data(lnet_rc_data_t *rcd)
+{
+       LASSERT(list_empty(&rcd->rcd_list));
+       /* detached from network */
+       LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
+
+       if (rcd->rcd_gateway != NULL) {
+               int cpt = rcd->rcd_gateway->lp_cpt;
+
+               lnet_net_lock(cpt);
+               lnet_peer_decref_locked(rcd->rcd_gateway);
+               lnet_net_unlock(cpt);
+       }
+
+       if (rcd->rcd_pinginfo != NULL)
+               LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+
+       LIBCFS_FREE(rcd, sizeof(*rcd));
+}
+
+lnet_rc_data_t *
+lnet_create_rc_data_locked(lnet_peer_t *gateway)
+{
+       lnet_rc_data_t          *rcd = NULL;
+       lnet_ping_info_t        *pi;
+       int                     rc;
+       int                     i;
+
+       lnet_net_unlock(gateway->lp_cpt);
+
+       LIBCFS_ALLOC(rcd, sizeof(*rcd));
+       if (rcd == NULL)
+               goto out;
+
+       LNetInvalidateHandle(&rcd->rcd_mdh);
+       INIT_LIST_HEAD(&rcd->rcd_list);
+
+       LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+       if (pi == NULL)
+               goto out;
+
+       memset(pi, 0, LNET_PINGINFO_SIZE);
+       for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+               pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+               pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+       }
+       rcd->rcd_pinginfo = pi;
+
+       LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+       rc = LNetMDBind((lnet_md_t){.start     = pi,
+                                   .user_ptr  = rcd,
+                                   .length    = LNET_PINGINFO_SIZE,
+                                   .threshold = LNET_MD_THRESH_INF,
+                                   .options   = LNET_MD_TRUNCATE,
+                                   .eq_handle = the_lnet.ln_rc_eqh},
+                       LNET_UNLINK,
+                       &rcd->rcd_mdh);
+       if (rc < 0) {
+               CERROR("Can't bind MD: %d\n", rc);
+               goto out;
+       }
+       LASSERT(rc == 0);
+
+       lnet_net_lock(gateway->lp_cpt);
+       /* router table changed or someone has created rcd for this gateway */
+       if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
+               lnet_net_unlock(gateway->lp_cpt);
+               goto out;
+       }
+
+       lnet_peer_addref_locked(gateway);
+       rcd->rcd_gateway = gateway;
+       gateway->lp_rcd = rcd;
+       gateway->lp_ping_notsent = 0;
+
+       return rcd;
+
+ out:
+       if (rcd != NULL) {
+               if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
+                       rc = LNetMDUnlink(rcd->rcd_mdh);
+                       LASSERT(rc == 0);
+               }
+               lnet_destroy_rc_data(rcd);
+       }
+
+       lnet_net_lock(gateway->lp_cpt);
+       return gateway->lp_rcd;
+}
+
+static int
+lnet_router_check_interval (lnet_peer_t *rtr)
+{
+       int secs;
+
+       secs = rtr->lp_alive ? live_router_check_interval :
+                              dead_router_check_interval;
+       if (secs < 0)
+               secs = 0;
+
+       return secs;
+}
+
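+/*
+ * Called with the per-CPT net lock held: mark the router down if an
+ * outstanding ping has passed its deadline, run pending notifications,
+ * and issue a new ping (LNetGet on the reserved portal) once the
+ * router's check interval has expired.
+ */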
+static void
+lnet_ping_router_locked (lnet_peer_t *rtr)
+{
+       lnet_rc_data_t *rcd = NULL;
+       cfs_time_t      now = cfs_time_current();
+       int          secs;
+
+       lnet_peer_addref_locked(rtr);
+
+       if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+           cfs_time_after(now, rtr->lp_ping_deadline))
+               lnet_notify_locked(rtr, 1, 0, now);
+
+       /* Run any outstanding notifications */
+       lnet_ni_notify_locked(rtr->lp_ni, rtr);
+
+       if (!lnet_isrouter(rtr) ||
+           the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+               /* router table changed or router checker is shutting down */
+               lnet_peer_decref_locked(rtr);
+               return;
+       }
+
+       rcd = rtr->lp_rcd != NULL ?
+             rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+
+       if (rcd == NULL)
+               return;
+
+       secs = lnet_router_check_interval(rtr);
+
+       CDEBUG(D_NET,
+              "rtr %s %d: deadline %lu ping_notsent %d alive %d "
+              "alive_count %d lp_ping_timestamp %lu\n",
+              libcfs_nid2str(rtr->lp_nid), secs,
+              rtr->lp_ping_deadline, rtr->lp_ping_notsent,
+              rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+
+       if (secs != 0 && !rtr->lp_ping_notsent &&
+           cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+                                            cfs_time_seconds(secs)))) {
+               int            rc;
+               lnet_process_id_t id;
+               lnet_handle_md_t  mdh;
+
+               id.nid = rtr->lp_nid;
+               id.pid = LUSTRE_SRV_LNET_PID;
+               CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
+
+               rtr->lp_ping_notsent   = 1;
+               rtr->lp_ping_timestamp = now;
+
+               mdh = rcd->rcd_mdh;
+
+               if (rtr->lp_ping_deadline == 0) {
+                       rtr->lp_ping_deadline =
+                               cfs_time_shift(router_ping_timeout);
+               }
+
+               lnet_net_unlock(rtr->lp_cpt);
+
+               rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+                            LNET_PROTO_PING_MATCHBITS, 0);
+
+               lnet_net_lock(rtr->lp_cpt);
+               if (rc != 0)
+                       rtr->lp_ping_notsent = 0; /* no event pending */
+       }
+
+       lnet_peer_decref_locked(rtr);
+       return;
+}
+
+int
+lnet_router_checker_start(void)
+{
+       int       rc;
+       int       eqsz;
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+       if (check_routers_before_use &&
+           dead_router_check_interval <= 0) {
+               LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be set if 'check_routers_before_use' is set\n");
+               return -EINVAL;
+       }
+
+       if (!the_lnet.ln_routing &&
+           live_router_check_interval <= 0 &&
+           dead_router_check_interval <= 0)
+               return 0;
+
+       sema_init(&the_lnet.ln_rc_signal, 0);
+       /* EQ size doesn't matter; the callback is guaranteed to get every
+        * event */
+       eqsz = 0;
+       rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
+                        &the_lnet.ln_rc_eqh);
+       if (rc != 0) {
+               CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
+               return -ENOMEM;
+       }
+
+       the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+       rc = PTR_ERR(kthread_run(lnet_router_checker,
+                                NULL, "router_checker"));
+       if (IS_ERR_VALUE(rc)) {
+               CERROR("Can't start router checker thread: %d\n", rc);
+               /* block until event callback signals exit */
+               down(&the_lnet.ln_rc_signal);
+               rc = LNetEQFree(the_lnet.ln_rc_eqh);
+               LASSERT(rc == 0);
+               the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+               return -ENOMEM;
+       }
+
+       if (check_routers_before_use) {
+               /* Note that a helpful side-effect of pinging all known routers
+                * at startup is that it makes them drop stale connections they
+                * may have to a previous instance of me. */
+               lnet_wait_known_routerstate();
+       }
+
+       return 0;
+}
+
+void
+lnet_router_checker_stop (void)
+{
+       int rc;
+
+       if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
+               return;
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+       the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
+
+       /* block until event callback signals exit */
+       down(&the_lnet.ln_rc_signal);
+       LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+       rc = LNetEQFree(the_lnet.ln_rc_eqh);
+       LASSERT (rc == 0);
+       return;
+}
+
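+/*
+ * Unlink and free router-checker data blocks: RCDs on the deathrow list
+ * have their MDs unlinked, then zombies are freed once the unlink event
+ * has invalidated rcd_mdh; with wait_unlink set this loops until every
+ * zombie is gone.
+ */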
+static void
+lnet_prune_rc_data(int wait_unlink)
+{
+       lnet_rc_data_t          *rcd;
+       lnet_rc_data_t          *tmp;
+       lnet_peer_t             *lp;
+       struct list_head                head;
+       int                     i = 2;
+
+       if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
+                  list_empty(&the_lnet.ln_rcd_deathrow) &&
+                  list_empty(&the_lnet.ln_rcd_zombie)))
+               return;
+
+       INIT_LIST_HEAD(&head);
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+               /* router checker is stopping, prune all */
+               list_for_each_entry(lp, &the_lnet.ln_routers,
+                                       lp_rtr_list) {
+                       if (lp->lp_rcd == NULL)
+                               continue;
+
+                       LASSERT(list_empty(&lp->lp_rcd->rcd_list));
+                       list_add(&lp->lp_rcd->rcd_list,
+                                    &the_lnet.ln_rcd_deathrow);
+                       lp->lp_rcd = NULL;
+               }
+       }
+
+       /* unlink all RCDs on deathrow list */
+       list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
+
+       if (!list_empty(&head)) {
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               list_for_each_entry(rcd, &head, rcd_list)
+                       LNetMDUnlink(rcd->rcd_mdh);
+
+               lnet_net_lock(LNET_LOCK_EX);
+       }
+
+       list_splice_init(&head, &the_lnet.ln_rcd_zombie);
+
+       /* release all zombie RCDs */
+       while (!list_empty(&the_lnet.ln_rcd_zombie)) {
+               list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
+                                            rcd_list) {
+                       if (LNetHandleIsInvalid(rcd->rcd_mdh))
+                               list_move(&rcd->rcd_list, &head);
+               }
+
+               wait_unlink = wait_unlink &&
+                             !list_empty(&the_lnet.ln_rcd_zombie);
+
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               while (!list_empty(&head)) {
+                       rcd = list_entry(head.next,
+                                            lnet_rc_data_t, rcd_list);
+                       list_del_init(&rcd->rcd_list);
+                       lnet_destroy_rc_data(rcd);
+               }
+
+               if (!wait_unlink)
+                       return;
+
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                      "Waiting for rc buffers to unlink\n");
+               cfs_pause(cfs_time_seconds(1) / 4);
+
+               lnet_net_lock(LNET_LOCK_EX);
+       }
+
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+
+#if  defined(LNET_ROUTER)
+
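+/*
+ * Router checker thread: once a second walk the router list (switching
+ * CPT locks as needed), ping each router at its configured interval,
+ * refresh local NI status when routing is enabled, and prune stale RCDs
+ * until asked to stop.
+ */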
+static int
+lnet_router_checker(void *arg)
+{
+       lnet_peer_t       *rtr;
+       struct list_head        *entry;
+
+       cfs_block_allsigs();
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+       while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+               __u64   version;
+               int     cpt;
+               int     cpt2;
+
+               cpt = lnet_net_lock_current();
+rescan:
+               version = the_lnet.ln_routers_version;
+
+               list_for_each(entry, &the_lnet.ln_routers) {
+                       rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+                       cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+                       if (cpt != cpt2) {
+                               lnet_net_unlock(cpt);
+                               cpt = cpt2;
+                               lnet_net_lock(cpt);
+                               /* the routers list has changed */
+                               if (version != the_lnet.ln_routers_version)
+                                       goto rescan;
+                       }
+
+                       lnet_ping_router_locked(rtr);
+
+                       /* NB dropped lock */
+                       if (version != the_lnet.ln_routers_version) {
+                               /* the routers list has changed */
+                               goto rescan;
+                       }
+               }
+
+               if (the_lnet.ln_routing)
+                       lnet_update_ni_status_locked();
+
+               lnet_net_unlock(cpt);
+
+               lnet_prune_rc_data(0); /* don't wait for UNLINK */
+
+               /* Calling cfs_pause() here would always add 1 to the load
+                * average, because the kernel counts # active tasks as
+                * nr_running + nr_uninterruptible. */
+               schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+                                                  cfs_time_seconds(1));
+       }
+
+       LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
+
+       lnet_prune_rc_data(1); /* wait for UNLINK */
+
+       the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+       up(&the_lnet.ln_rc_signal);
+       /* The unlink event callback will signal final completion */
+       return 0;
+}
+
+void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+       int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+
+       while (--npages >= 0)
+               __free_page(rb->rb_kiov[npages].kiov_page);
+
+       LIBCFS_FREE(rb, sz);
+}
+
+lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
+{
+       int         npages = rbp->rbp_npages;
+       int         sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+       struct page   *page;
+       lnet_rtrbuf_t *rb;
+       int         i;
+
+       LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
+       if (rb == NULL)
+               return NULL;
+
+       rb->rb_pool = rbp;
+
+       for (i = 0; i < npages; i++) {
+               page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
+                                         __GFP_ZERO | GFP_IOFS);
+               if (page == NULL) {
+                       while (--i >= 0)
+                               __free_page(rb->rb_kiov[i].kiov_page);
+
+                       LIBCFS_FREE(rb, sz);
+                       return NULL;
+               }
+
+               rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
+               rb->rb_kiov[i].kiov_offset = 0;
+               rb->rb_kiov[i].kiov_page = page;
+       }
+
+       return rb;
+}
+
+void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+       int             npages = rbp->rbp_npages;
+       int             nbuffers = 0;
+       lnet_rtrbuf_t   *rb;
+
+       if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
+               return;
+
+       LASSERT (list_empty(&rbp->rbp_msgs));
+       LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers);
+
+       while (!list_empty(&rbp->rbp_bufs)) {
+               LASSERT (rbp->rbp_credits > 0);
+
+               rb = list_entry(rbp->rbp_bufs.next,
+                                   lnet_rtrbuf_t, rb_list);
+               list_del(&rb->rb_list);
+               lnet_destroy_rtrbuf(rb, npages);
+               nbuffers++;
+       }
+
+       LASSERT (rbp->rbp_nbuffers == nbuffers);
+       LASSERT (rbp->rbp_credits == nbuffers);
+
+       rbp->rbp_nbuffers = rbp->rbp_credits = 0;
+}
+
+int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
+{
+       lnet_rtrbuf_t *rb;
+       int         i;
+
+       if (rbp->rbp_nbuffers != 0) {
+               LASSERT (rbp->rbp_nbuffers == nbufs);
+               return 0;
+       }
+
+       for (i = 0; i < nbufs; i++) {
+               rb = lnet_new_rtrbuf(rbp, cpt);
+
+               if (rb == NULL) {
+                       CERROR("Failed to allocate %d router bufs of %d pages\n",
+                              nbufs, rbp->rbp_npages);
+                       return -ENOMEM;
+               }
+
+               rbp->rbp_nbuffers++;
+               rbp->rbp_credits++;
+               rbp->rbp_mincredits++;
+               list_add(&rb->rb_list, &rbp->rbp_bufs);
+
+               /* No allocation "under fire" */
+               /* Otherwise we'd need code to schedule blocked msgs etc */
+               LASSERT (!the_lnet.ln_routing);
+       }
+
+       LASSERT (rbp->rbp_credits == nbufs);
+       return 0;
+}
+
+void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+       INIT_LIST_HEAD(&rbp->rbp_msgs);
+       INIT_LIST_HEAD(&rbp->rbp_bufs);
+
+       rbp->rbp_npages = npages;
+       rbp->rbp_credits = 0;
+       rbp->rbp_mincredits = 0;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+       lnet_rtrbufpool_t *rtrp;
+       int               i;
+
+       if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
+               return;
+
+       cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+               lnet_rtrpool_free_bufs(&rtrp[0]);
+               lnet_rtrpool_free_bufs(&rtrp[1]);
+               lnet_rtrpool_free_bufs(&rtrp[2]);
+       }
+
+       cfs_percpt_free(the_lnet.ln_rtrpools);
+       the_lnet.ln_rtrpools = NULL;
+}
+
+static int
+lnet_nrb_tiny_calculate(int npages)
+{
+       int     nrbs = LNET_NRB_TINY;
+
+       if (tiny_router_buffers < 0) {
+               LCONSOLE_ERROR_MSG(0x10c,
+                                  "tiny_router_buffers=%d invalid when "
+                                  "routing enabled\n", tiny_router_buffers);
+               return -1;
+       }
+
+       if (tiny_router_buffers > 0)
+               nrbs = tiny_router_buffers;
+
+       nrbs /= LNET_CPT_NUMBER;
+       return max(nrbs, LNET_NRB_TINY_MIN);
+}
+
+static int
+lnet_nrb_small_calculate(int npages)
+{
+       int     nrbs = LNET_NRB_SMALL;
+
+       if (small_router_buffers < 0) {
+               LCONSOLE_ERROR_MSG(0x10c,
+                                  "small_router_buffers=%d invalid when "
+                                  "routing enabled\n", small_router_buffers);
+               return -1;
+       }
+
+       if (small_router_buffers > 0)
+               nrbs = small_router_buffers;
+
+       nrbs /= LNET_CPT_NUMBER;
+       return max(nrbs, LNET_NRB_SMALL_MIN);
+}
+
+static int
+lnet_nrb_large_calculate(int npages)
+{
+       int     nrbs = LNET_NRB_LARGE;
+
+       if (large_router_buffers < 0) {
+               LCONSOLE_ERROR_MSG(0x10c,
+                                  "large_router_buffers=%d invalid when "
+                                  "routing enabled\n", large_router_buffers);
+               return -1;
+       }
+
+       if (large_router_buffers > 0)
+               nrbs = large_router_buffers;
+
+       nrbs /= LNET_CPT_NUMBER;
+       return max(nrbs, LNET_NRB_LARGE_MIN);
+}
+
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+       lnet_rtrbufpool_t *rtrp;
+       int     large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       int     small_pages = 1;
+       int     nrb_tiny;
+       int     nrb_small;
+       int     nrb_large;
+       int     rc;
+       int     i;
+
+       if (!strcmp(forwarding, "")) {
+               /* not set either way */
+               if (!im_a_router)
+                       return 0;
+       } else if (!strcmp(forwarding, "disabled")) {
+               /* explicitly disabled */
+               return 0;
+       } else if (!strcmp(forwarding, "enabled")) {
+               /* explicitly enabled */
+       } else {
+               LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either "
+                                  "'enabled' or 'disabled'\n");
+               return -EINVAL;
+       }
+
+       nrb_tiny = lnet_nrb_tiny_calculate(0);
+       if (nrb_tiny < 0)
+               return -EINVAL;
+
+       nrb_small = lnet_nrb_small_calculate(small_pages);
+       if (nrb_small < 0)
+               return -EINVAL;
+
+       nrb_large = lnet_nrb_large_calculate(large_pages);
+       if (nrb_large < 0)
+               return -EINVAL;
+
+       the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
+                                               LNET_NRBPOOLS *
+                                               sizeof(lnet_rtrbufpool_t));
+       if (the_lnet.ln_rtrpools == NULL) {
+               LCONSOLE_ERROR_MSG(0x10c,
+                                  "Failed to initialize router buffer pool\n");
+               return -ENOMEM;
+       }
+
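+       /* each CPT gets three buffer pools: [0] tiny (0 pages), [1] small
+        * (1 page) and [2] large (enough pages to hold an LNET_MTU) */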
+       cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+               lnet_rtrpool_init(&rtrp[0], 0);
+               rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
+               if (rc != 0)
+                       goto failed;
+
+               lnet_rtrpool_init(&rtrp[1], small_pages);
+               rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
+               if (rc != 0)
+                       goto failed;
+
+               lnet_rtrpool_init(&rtrp[2], large_pages);
+               rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
+               if (rc != 0)
+                       goto failed;
+       }
+
+       lnet_net_lock(LNET_LOCK_EX);
+       the_lnet.ln_routing = 1;
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       return 0;
+
+ failed:
+       lnet_rtrpools_free();
+       return rc;
+}
+
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+       struct lnet_peer        *lp = NULL;
+       cfs_time_t              now = cfs_time_current();
+       int                     cpt = lnet_cpt_of_nid(nid);
+
+       LASSERT (!in_interrupt ());
+
+       CDEBUG (D_NET, "%s notifying %s: %s\n",
+               (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+               libcfs_nid2str(nid),
+               alive ? "up" : "down");
+
+       if (ni != NULL &&
+           LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+               CWARN ("Ignoring notification of %s %s by %s (different net)\n",
+                       libcfs_nid2str(nid), alive ? "birth" : "death",
+                       libcfs_nid2str(ni->ni_nid));
+               return -EINVAL;
+       }
+
+       /* can't do predictions... */
+       if (cfs_time_after(when, now)) {
+               CWARN ("Ignoring prediction from %s of %s %s "
+                      "%ld seconds in the future\n",
+                      (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+                      libcfs_nid2str(nid), alive ? "up" : "down",
+                      cfs_duration_sec(cfs_time_sub(when, now)));
+               return -EINVAL;
+       }
+
+       if (ni != NULL && !alive &&          /* LND telling me the peer is down */
+           !auto_down) {                      /* auto-down disabled */
+               CDEBUG(D_NET, "Auto-down disabled\n");
+               return 0;
+       }
+
+       lnet_net_lock(cpt);
+
+       if (the_lnet.ln_shutdown) {
+               lnet_net_unlock(cpt);
+               return -ESHUTDOWN;
+       }
+
+       lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+       if (lp == NULL) {
+               /* nid not found */
+               lnet_net_unlock(cpt);
+               CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+               return 0;
+       }
+
+       /* We can't fully trust the LND to report an exact peer last_alive
+        * when it notifies us about a dead peer. For example, ksocklnd can
+        * call us with when == _time_when_the_node_was_booted_ if
+        * no connections were successfully established */
+       if (ni != NULL && !alive && when < lp->lp_last_alive)
+               when = lp->lp_last_alive;
+
+       lnet_notify_locked(lp, ni == NULL, alive, when);
+
+       lnet_ni_notify_locked(ni, lp);
+
+       lnet_peer_decref_locked(lp);
+
+       lnet_net_unlock(cpt);
+       return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+void
+lnet_get_tunables (void)
+{
+       return;
+}
+
+#else
+
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+       return -EOPNOTSUPP;
+}
+
+void
+lnet_router_checker (void)
+{
+       static time_t last = 0;
+       static int    running = 0;
+
+       time_t      now = cfs_time_current_sec();
+       int            interval = now - last;
+       int            rc;
+       __u64        version;
+       lnet_peer_t      *rtr;
+
+       /* There is no point in calling me again within a second - all
+        * intervals and timeouts are measured in seconds */
+       if (last != 0 && interval < 2)
+               return;
+
+       if (last != 0 &&
+           interval > MAX(live_router_check_interval,
+                          dead_router_check_interval))
+               CNETERR("Checker(%d/%d) not called for %d seconds\n",
+                       live_router_check_interval, dead_router_check_interval,
+                       interval);
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_net_lock(0);
+       LASSERT(!running); /* recursion check */
+       running = 1;
+       lnet_net_unlock(0);
+
+       last = now;
+
+       if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
+               lnet_prune_rc_data(0); /* unlink all rcd and nowait */
+
+       /* consume all pending events */
+       while (1) {
+               int       i;
+               lnet_event_t ev;
+
+               /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
+                * recursion breaker in LNetEQPoll would fail */
+               rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
+               if (rc == 0)   /* no event pending */
+                       break;
+
+               /* NB a lost SENT prevents me from pinging a router again */
+               if (rc == -EOVERFLOW) {
+                       CERROR("Dropped an event!!!\n");
+                       abort();
+               }
+
+               LASSERT (rc == 1);
+
+               lnet_router_checker_event(&ev);
+       }
+
+       if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
+               lnet_prune_rc_data(1); /* release rcd */
+               the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+               running = 0;
+               return;
+       }
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+       lnet_net_lock(0);
+
+       version = the_lnet.ln_routers_version;
+       list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) {
+               lnet_ping_router_locked(rtr);
+               LASSERT (version == the_lnet.ln_routers_version);
+       }
+
+       lnet_net_unlock(0);
+
+       running = 0; /* lock only needed for the recursion check */
+       return;
+}
+
+/* NB lnet_peers_start_down depends on me,
+ * so must be called before any peer creation */
+void
+lnet_get_tunables (void)
+{
+       char *s;
+
+       s = getenv("LNET_ROUTER_PING_TIMEOUT");
+       if (s != NULL) router_ping_timeout = atoi(s);
+
+       s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL");
+       if (s != NULL) live_router_check_interval = atoi(s);
+
+       s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL");
+       if (s != NULL) dead_router_check_interval = atoi(s);
+
+       /* This replaces old lnd_notify mechanism */
+       check_routers_before_use = 1;
+       if (dead_router_check_interval <= 0)
+               dead_router_check_interval = 30;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+}
+
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+       return 0;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
new file mode 100644 (file)
index 0000000..3084b0c
--- /dev/null
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+static ctl_table_header_t *lnet_table_header = NULL;
+
+#define CTL_LNET        (0x100)
+enum {
+       PSDEV_LNET_STATS = 100,
+       PSDEV_LNET_ROUTES,
+       PSDEV_LNET_ROUTERS,
+       PSDEV_LNET_PEERS,
+       PSDEV_LNET_BUFFERS,
+       PSDEV_LNET_NIS,
+       PSDEV_LNET_PTL_ROTOR,
+};
+
+#define LNET_LOFFT_BITS                (sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS     (LNET_CPT_BITS + 1)
+/* change version, 16 bits or 8 bits */
+#define LNET_PROC_VER_BITS     MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8)
+
+#define LNET_PROC_HASH_BITS    LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS    (LNET_LOFFT_BITS -       \
+                                LNET_PROC_CPT_BITS -    \
+                                LNET_PROC_VER_BITS -    \
+                                LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS    (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS    (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+#define LNET_PROC_CPT_MASK     ((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK     ((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK    ((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK    ((1ULL << LNET_PROC_HOFF_BITS) - 1)
+
+#define LNET_PROC_CPT_GET(pos)                         \
+       (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+#define LNET_PROC_VER_GET(pos)                         \
+       (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+#define LNET_PROC_HASH_GET(pos)                                \
+       (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+#define LNET_PROC_HOFF_GET(pos)                                \
+       (int)((pos) & LNET_PROC_HOFF_MASK)
+
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off)                \
+       (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) |   \
+       ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) |   \
+       ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+       ((off) & LNET_PROC_HOFF_MASK))
+
+#define LNET_PROC_VERSION(v)   ((unsigned int)((v) & LNET_PROC_VER_MASK))
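+/* *ppos is packed, from the high bits down, as | cpt | version | hash | offset |;
+ * the top (sign) bit is left unused so the position always stays non-negative */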
+
+static int __proc_lnet_stats(void *data, int write,
+                            loff_t pos, void *buffer, int nob)
+{
+       int           rc;
+       lnet_counters_t *ctrs;
+       int           len;
+       char        *tmpstr;
+       const int       tmpsiz = 256; /* 7 %u and 4 LPU64 */
+
+       if (write) {
+               lnet_counters_reset();
+               return 0;
+       }
+
+       /* read */
+
+       LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+       if (ctrs == NULL)
+               return -ENOMEM;
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL) {
+               LIBCFS_FREE(ctrs, sizeof(*ctrs));
+               return -ENOMEM;
+       }
+
+       lnet_counters_get(ctrs);
+
+       len = snprintf(tmpstr, tmpsiz,
+                      "%u %u %u %u %u %u %u "LPU64" "LPU64" "
+                      LPU64" "LPU64,
+                      ctrs->msgs_alloc, ctrs->msgs_max,
+                      ctrs->errors,
+                      ctrs->send_count, ctrs->recv_count,
+                      ctrs->route_count, ctrs->drop_count,
+                      ctrs->send_length, ctrs->recv_length,
+                      ctrs->route_length, ctrs->drop_length);
+
+       if (pos >= min_t(int, len, strlen(tmpstr)))
+               rc = 0;
+       else
+               rc = cfs_trace_copyout_string(buffer, nob,
+                                             tmpstr + pos, "\n");
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+       LIBCFS_FREE(ctrs, sizeof(*ctrs));
+       return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_stats);
+
+int LL_PROC_PROTO(proc_lnet_routes)
+{
+       const int       tmpsiz = 256;
+       char            *tmpstr;
+       char            *s;
+       int             rc = 0;
+       int             len;
+       int             ver;
+       int             off;
+
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       CLASSERT(sizeof(loff_t) >= 4);
+
+       off = LNET_PROC_HOFF_GET(*ppos);
+       ver = LNET_PROC_VER_GET(*ppos);
+
+       LASSERT (!write);
+
+       if (*lenp == 0)
+               return 0;
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       if (*ppos == 0) {
+               s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n",
+                             the_lnet.ln_routing ? "enabled" : "disabled");
+               LASSERT (tmpstr + tmpsiz - s > 0);
+
+               s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n",
+                             "net", "hops", "state", "router");
+               LASSERT (tmpstr + tmpsiz - s > 0);
+
+               lnet_net_lock(0);
+               ver = (unsigned int)the_lnet.ln_remote_nets_version;
+               lnet_net_unlock(0);
+               *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+       } else {
+               struct list_head                *n;
+               struct list_head                *r;
+               lnet_route_t            *route = NULL;
+               lnet_remotenet_t        *rnet  = NULL;
+               int                     skip  = off - 1;
+               struct list_head                *rn_list;
+               int                     i;
+
+               lnet_net_lock(0);
+
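+               /* the remote nets table has changed since the header was
+                * written out; return -ESTALE so the reader knows to restart */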
+               if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) {
+                       lnet_net_unlock(0);
+                       LIBCFS_FREE(tmpstr, tmpsiz);
+                       return -ESTALE;
+               }
+
+               for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL;
+                    i++) {
+                       rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+                       n = rn_list->next;
+
+                       while (n != rn_list && route == NULL) {
+                               rnet = list_entry(n, lnet_remotenet_t,
+                                                     lrn_list);
+
+                               r = rnet->lrn_routes.next;
+
+                               while (r != &rnet->lrn_routes) {
+                                       lnet_route_t *re =
+                                               list_entry(r, lnet_route_t,
+                                                              lr_list);
+                                       if (skip == 0) {
+                                               route = re;
+                                               break;
+                                       }
+
+                                       skip--;
+                                       r = r->next;
+                               }
+
+                               n = n->next;
+                       }
+               }
+
+               if (route != NULL) {
+                       __u32   net   = rnet->lrn_net;
+                       unsigned int hops  = route->lr_hops;
+                       lnet_nid_t   nid   = route->lr_gateway->lp_nid;
+                       int       alive = route->lr_gateway->lp_alive;
+
+                       s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%-8s %4u %7s %s\n",
+                                     libcfs_net2str(net), hops,
+                                     alive ? "up" : "down",
+                                     libcfs_nid2str(nid));
+                       LASSERT(tmpstr + tmpsiz - s > 0);
+               }
+
+               lnet_net_unlock(0);
+       }
+
+       len = s - tmpstr;     /* how many bytes were written */
+
+       if (len > *lenp) {    /* linux-supplied buffer is too small */
+               rc = -EINVAL;
+       } else if (len > 0) { /* wrote something */
+               if (copy_to_user(buffer, tmpstr, len))
+                       rc = -EFAULT;
+               else {
+                       off += 1;
+                       *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+               }
+       }
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+
+       if (rc == 0)
+               *lenp = len;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_routers)
+{
+       int     rc = 0;
+       char      *tmpstr;
+       char      *s;
+       const int  tmpsiz = 256;
+       int     len;
+       int     ver;
+       int     off;
+
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       off = LNET_PROC_HOFF_GET(*ppos);
+       ver = LNET_PROC_VER_GET(*ppos);
+
+       LASSERT (!write);
+
+       if (*lenp == 0)
+               return 0;
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       if (*ppos == 0) {
+               s += snprintf(s, tmpstr + tmpsiz - s,
+                             "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+                             "ref", "rtr_ref", "alive_cnt", "state",
+                             "last_ping", "ping_sent", "deadline",
+                             "down_ni", "router");
+               LASSERT(tmpstr + tmpsiz - s > 0);
+
+               lnet_net_lock(0);
+               ver = (unsigned int)the_lnet.ln_routers_version;
+               lnet_net_unlock(0);
+               *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+       } else {
+               struct list_head                *r;
+               struct lnet_peer        *peer = NULL;
+               int                     skip = off - 1;
+
+               lnet_net_lock(0);
+
+               if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) {
+                       lnet_net_unlock(0);
+
+                       LIBCFS_FREE(tmpstr, tmpsiz);
+                       return -ESTALE;
+               }
+
+               r = the_lnet.ln_routers.next;
+
+               while (r != &the_lnet.ln_routers) {
+                       lnet_peer_t *lp = list_entry(r, lnet_peer_t,
+                                                        lp_rtr_list);
+
+                       if (skip == 0) {
+                               peer = lp;
+                               break;
+                       }
+
+                       skip--;
+                       r = r->next;
+               }
+
+               if (peer != NULL) {
+                       lnet_nid_t nid = peer->lp_nid;
+                       cfs_time_t now = cfs_time_current();
+                       cfs_time_t deadline = peer->lp_ping_deadline;
+                       int nrefs     = peer->lp_refcount;
+                       int nrtrrefs  = peer->lp_rtr_refcount;
+                       int alive_cnt = peer->lp_alive_count;
+                       int alive     = peer->lp_alive;
+                       int pingsent  = !peer->lp_ping_notsent;
+                       int last_ping = cfs_duration_sec(cfs_time_sub(now,
+                                                    peer->lp_ping_timestamp));
+                       int down_ni   = 0;
+                       lnet_route_t *rtr;
+
+                       if ((peer->lp_ping_feats &
+                            LNET_PING_FEAT_NI_STATUS) != 0) {
+                               list_for_each_entry(rtr, &peer->lp_routes,
+                                                       lr_gwlist) {
+                                       /* the down-NI count on any route
+                                        * should equal the number of down
+                                        * NIs on the gateway */
+                                       if (rtr->lr_downis != 0) {
+                                               down_ni = rtr->lr_downis;
+                                               break;
+                                       }
+                               }
+                       }
+
+                       if (deadline == 0)
+                               s += snprintf(s, tmpstr + tmpsiz - s,
+                                             "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+                                             nrefs, nrtrrefs, alive_cnt,
+                                             alive ? "up" : "down", last_ping,
+                                             pingsent, "NA", down_ni,
+                                             libcfs_nid2str(nid));
+                       else
+                               s += snprintf(s, tmpstr + tmpsiz - s,
+                                             "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+                                             nrefs, nrtrrefs, alive_cnt,
+                                             alive ? "up" : "down", last_ping,
+                                             pingsent,
+                                             cfs_duration_sec(cfs_time_sub(deadline, now)),
+                                             down_ni, libcfs_nid2str(nid));
+                       LASSERT (tmpstr + tmpsiz - s > 0);
+               }
+
+               lnet_net_unlock(0);
+       }
+
+       len = s - tmpstr;     /* how many bytes were written */
+
+       if (len > *lenp) {    /* linux-supplied buffer is too small */
+               rc = -EINVAL;
+       } else if (len > 0) { /* wrote something */
+               if (copy_to_user(buffer, tmpstr, len))
+                       rc = -EFAULT;
+               else {
+                       off += 1;
+                       *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+               }
+       }
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+
+       if (rc == 0)
+               *lenp = len;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_peers)
+{
+       const int               tmpsiz  = 256;
+       struct lnet_peer_table  *ptable;
+       char                    *tmpstr;
+       char                    *s;
+       int                     cpt  = LNET_PROC_CPT_GET(*ppos);
+       int                     ver  = LNET_PROC_VER_GET(*ppos);
+       int                     hash = LNET_PROC_HASH_GET(*ppos);
+       int                     hoff = LNET_PROC_HOFF_GET(*ppos);
+       int                     rc = 0;
+       int                     len;
+
+       CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS);
+       LASSERT(!write);
+
+       if (*lenp == 0)
+               return 0;
+
+       if (cpt >= LNET_CPT_NUMBER) {
+               *lenp = 0;
+               return 0;
+       }
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       if (*ppos == 0) {
+               s += snprintf(s, tmpstr + tmpsiz - s,
+                             "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n",
+                             "nid", "refs", "state", "last", "max",
+                             "rtr", "min", "tx", "min", "queue");
+               LASSERT (tmpstr + tmpsiz - s > 0);
+
+               hoff++;
+       } else {
+               struct lnet_peer        *peer;
+               struct list_head                *p;
+               int                     skip;
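+                       /* resume from the (cpt, hash, offset) encoded in
+                        * *ppos; when this CPT's hash table is exhausted we
+                        * jump back to 'again' for the next CPT */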
+ again:
+               p = NULL;
+               peer = NULL;
+               skip = hoff - 1;
+
+               lnet_net_lock(cpt);
+               ptable = the_lnet.ln_peer_tables[cpt];
+               if (hoff == 1)
+                       ver = LNET_PROC_VERSION(ptable->pt_version);
+
+               if (ver != LNET_PROC_VERSION(ptable->pt_version)) {
+                       lnet_net_unlock(cpt);
+                       LIBCFS_FREE(tmpstr, tmpsiz);
+                       return -ESTALE;
+               }
+
+               while (hash < LNET_PEER_HASH_SIZE) {
+                       if (p == NULL)
+                               p = ptable->pt_hash[hash].next;
+
+                       while (p != &ptable->pt_hash[hash]) {
+                               lnet_peer_t *lp = list_entry(p, lnet_peer_t,
+                                                                lp_hashlist);
+                               if (skip == 0) {
+                                       peer = lp;
+
+                                       /* minor optimization: start from idx+1
+                                        * on next iteration if we've just
+                                        * drained lp_hashlist */
+                                       if (lp->lp_hashlist.next ==
+                                           &ptable->pt_hash[hash]) {
+                                               hoff = 1;
+                                               hash++;
+                                       } else {
+                                               hoff++;
+                                       }
+
+                                       break;
+                               }
+
+                               skip--;
+                               p = lp->lp_hashlist.next;
+                       }
+
+                       if (peer != NULL)
+                               break;
+
+                       p = NULL;
+                       hoff = 1;
+                       hash++;
+               }
+
+               if (peer != NULL) {
+                       lnet_nid_t nid       = peer->lp_nid;
+                       int     nrefs     = peer->lp_refcount;
+                       int     lastalive = -1;
+                       char      *aliveness = "NA";
+                       int     maxcr     = peer->lp_ni->ni_peertxcredits;
+                       int     txcr      = peer->lp_txcredits;
+                       int     mintxcr   = peer->lp_mintxcredits;
+                       int     rtrcr     = peer->lp_rtrcredits;
+                       int     minrtrcr  = peer->lp_minrtrcredits;
+                       int     txqnob    = peer->lp_txqnob;
+
+                       if (lnet_isrouter(peer) ||
+                           lnet_peer_aliveness_enabled(peer))
+                               aliveness = peer->lp_alive ? "up" : "down";
+
+                       if (lnet_peer_aliveness_enabled(peer)) {
+                               cfs_time_t     now = cfs_time_current();
+                               cfs_duration_t delta;
+
+                               delta = cfs_time_sub(now, peer->lp_last_alive);
+                               lastalive = cfs_duration_sec(delta);
+
+                               /* No need to mess up the peers output with
+                                * arbitrarily large integers - it suffices to
+                                * know that lastalive is more than 10000s old
+                                */
+                               if (lastalive >= 10000)
+                                       lastalive = 9999;
+                       }
+
+                       lnet_net_unlock(cpt);
+
+                       s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n",
+                                     libcfs_nid2str(nid), nrefs, aliveness,
+                                     lastalive, maxcr, rtrcr, minrtrcr, txcr,
+                                     mintxcr, txqnob);
+                       LASSERT (tmpstr + tmpsiz - s > 0);
+
+               } else { /* peer is NULL */
+                       lnet_net_unlock(cpt);
+               }
+
+               if (hash == LNET_PEER_HASH_SIZE) {
+                       cpt++;
+                       hash = 0;
+                       hoff = 1;
+                       if (peer == NULL && cpt < LNET_CPT_NUMBER)
+                               goto again;
+               }
+       }
+
+       len = s - tmpstr;     /* how many bytes were written */
+
+       if (len > *lenp) {    /* linux-supplied buffer is too small */
+               rc = -EINVAL;
+       } else if (len > 0) { /* wrote something */
+               if (copy_to_user(buffer, tmpstr, len))
+                       rc = -EFAULT;
+               else
+                       *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff);
+       }
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+
+       if (rc == 0)
+               *lenp = len;
+
+       return rc;
+}
+
+static int __proc_lnet_buffers(void *data, int write,
+                              loff_t pos, void *buffer, int nob)
+{
+       char        *s;
+       char        *tmpstr;
+       int             tmpsiz;
+       int             idx;
+       int             len;
+       int             rc;
+       int             i;
+
+       LASSERT(!write);
+
+       /* (4 %d) * 4 * LNET_CPT_NUMBER */
+       tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER;
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       s += snprintf(s, tmpstr + tmpsiz - s,
+                     "%5s %5s %7s %7s\n",
+                     "pages", "count", "credits", "min");
+       LASSERT (tmpstr + tmpsiz - s > 0);
+
+       if (the_lnet.ln_rtrpools == NULL)
+               goto out; /* I'm not a router */
+
+       for (idx = 0; idx < LNET_NRBPOOLS; idx++) {
+               lnet_rtrbufpool_t *rbp;
+
+               lnet_net_lock(LNET_LOCK_EX);
+               cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+                       s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%5d %5d %7d %7d\n",
+                                     rbp[idx].rbp_npages,
+                                     rbp[idx].rbp_nbuffers,
+                                     rbp[idx].rbp_credits,
+                                     rbp[idx].rbp_mincredits);
+                       LASSERT(tmpstr + tmpsiz - s > 0);
+               }
+               lnet_net_unlock(LNET_LOCK_EX);
+       }
+
+ out:
+       len = s - tmpstr;
+
+       if (pos >= min_t(int, len, strlen(tmpstr)))
+               rc = 0;
+       else
+               rc = cfs_trace_copyout_string(buffer, nob,
+                                             tmpstr + pos, NULL);
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+       return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_buffers);
+
+int LL_PROC_PROTO(proc_lnet_nis)
+{
+       int     tmpsiz = 128 * LNET_CPT_NUMBER;
+       int     rc = 0;
+       char      *tmpstr;
+       char      *s;
+       int     len;
+
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       LASSERT (!write);
+
+       if (*lenp == 0)
+               return 0;
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       if (*ppos == 0) {
+               s += snprintf(s, tmpstr + tmpsiz - s,
+                             "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+                             "nid", "status", "alive", "refs", "peer",
+                             "rtr", "max", "tx", "min");
+               LASSERT (tmpstr + tmpsiz - s > 0);
+       } else {
+               struct list_head        *n;
+               lnet_ni_t        *ni   = NULL;
+               int             skip = *ppos - 1;
+
+               lnet_net_lock(0);
+
+               n = the_lnet.ln_nis.next;
+
+               while (n != &the_lnet.ln_nis) {
+                       lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
+
+                       if (skip == 0) {
+                               ni = a_ni;
+                               break;
+                       }
+
+                       skip--;
+                       n = n->next;
+               }
+
+               if (ni != NULL) {
+                       struct lnet_tx_queue    *tq;
+                       char    *stat;
+                       long    now = cfs_time_current_sec();
+                       int     last_alive = -1;
+                       int     i;
+                       int     j;
+
+                       if (the_lnet.ln_routing)
+                               last_alive = now - ni->ni_last_alive;
+
+                       /* the loopback NI (@lo) is forever alive */
+                       if (ni->ni_lnd->lnd_type == LOLND)
+                               last_alive = 0;
+
+                       lnet_ni_lock(ni);
+                       LASSERT(ni->ni_status != NULL);
+                       stat = (ni->ni_status->ns_status ==
+                               LNET_NI_STATUS_UP) ? "up" : "down";
+                       lnet_ni_unlock(ni);
+
+                       /* we actually output credits information for
+                        * TX queue of each partition */
+                       cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+                               for (j = 0; ni->ni_cpts != NULL &&
+                                    j < ni->ni_ncpts; j++) {
+                                       if (i == ni->ni_cpts[j])
+                                               break;
+                               }
+
+                               if (j == ni->ni_ncpts)
+                                       continue;
+
+                               if (i != 0)
+                                       lnet_net_lock(i);
+
+                               s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+                                     libcfs_nid2str(ni->ni_nid), stat,
+                                     last_alive, *ni->ni_refs[i],
+                                     ni->ni_peertxcredits,
+                                     ni->ni_peerrtrcredits,
+                                     tq->tq_credits_max,
+                                     tq->tq_credits, tq->tq_credits_min);
+                               if (i != 0)
+                                       lnet_net_unlock(i);
+                       }
+                       LASSERT(tmpstr + tmpsiz - s > 0);
+               }
+
+               lnet_net_unlock(0);
+       }
+
+       len = s - tmpstr;     /* how many bytes were written */
+
+       if (len > *lenp) {    /* linux-supplied buffer is too small */
+               rc = -EINVAL;
+       } else if (len > 0) { /* wrote something */
+               if (copy_to_user(buffer, tmpstr, len))
+                       rc = -EFAULT;
+               else
+                       *ppos += 1;
+       }
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+
+       if (rc == 0)
+               *lenp = len;
+
+       return rc;
+}
+
+struct lnet_portal_rotors {
+       int          pr_value;
+       const char      *pr_name;
+       const char      *pr_desc;
+};
+
+static struct lnet_portal_rotors       portal_rotors[] = {
+       {
+               .pr_value = LNET_PTL_ROTOR_OFF,
+               .pr_name  = "OFF",
+               .pr_desc  = "Turn off message rotor for wildcard portals"
+       },
+       {
+               .pr_value = LNET_PTL_ROTOR_ON,
+               .pr_name  = "ON",
+               .pr_desc  = "round-robin dispatch all PUT messages for "
+                           "wildcard portals"
+       },
+       {
+               .pr_value = LNET_PTL_ROTOR_RR_RT,
+               .pr_name  = "RR_RT",
+               .pr_desc  = "round-robin dispatch routed PUT message for "
+                           "wildcard portals"
+       },
+       {
+               .pr_value = LNET_PTL_ROTOR_HASH_RT,
+               .pr_name  = "HASH_RT",
+               .pr_desc  = "dispatch routed PUT message by hashing source "
+                           "NID for wildcard portals"
+       },
+       {
+               .pr_value = -1,
+               .pr_name  = NULL,
+               .pr_desc  = NULL
+       },
+};
+
+extern int portal_rotor;
+
+static int __proc_lnet_portal_rotor(void *data, int write,
+                                   loff_t pos, void *buffer, int nob)
+{
+       const int       buf_len = 128;
+       char            *buf;
+       char            *tmp;
+       int             rc;
+       int             i;
+
+       LIBCFS_ALLOC(buf, buf_len);
+       if (buf == NULL)
+               return -ENOMEM;
+
+       if (!write) {
+               lnet_res_lock(0);
+
+               for (i = 0; portal_rotors[i].pr_value >= 0; i++) {
+                       if (portal_rotors[i].pr_value == portal_rotor)
+                               break;
+               }
+
+               LASSERT(portal_rotors[i].pr_value == portal_rotor);
+               lnet_res_unlock(0);
+
+               rc = snprintf(buf, buf_len,
+                             "{\n\tportals: all\n"
+                             "\trotor: %s\n\tdescription: %s\n}",
+                             portal_rotors[i].pr_name,
+                             portal_rotors[i].pr_desc);
+
+               if (pos >= min_t(int, rc, buf_len)) {
+                       rc = 0;
+               } else {
+                       rc = cfs_trace_copyout_string(buffer, nob,
+                                       buf + pos, "\n");
+               }
+               goto out;
+       }
+
+       rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob);
+       if (rc < 0)
+               goto out;
+
+       tmp = cfs_trimwhite(buf);
+
+       rc = -EINVAL;
+       lnet_res_lock(0);
+       for (i = 0; portal_rotors[i].pr_name != NULL; i++) {
+               if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp,
+                                   strlen(portal_rotors[i].pr_name)) == 0) {
+                       portal_rotor = portal_rotors[i].pr_value;
+                       rc = 0;
+                       break;
+               }
+       }
+       lnet_res_unlock(0);
+out:
+       LIBCFS_FREE(buf, buf_len);
+       return rc;
+}
+DECLARE_PROC_HANDLER(proc_lnet_portal_rotor);
+
+static ctl_table_t lnet_table[] = {
+       /*
+        * NB No .strategy entries have been provided since sysctl(8) prefers
+        * to go via /proc for portability.
+        */
+       {
+               INIT_CTL_NAME(PSDEV_LNET_STATS)
+               .procname = "stats",
+               .mode     = 0644,
+               .proc_handler = &proc_lnet_stats,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_ROUTES)
+               .procname = "routes",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_routes,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_ROUTERS)
+               .procname = "routers",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_routers,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_PEERS)
+               .procname = "peers",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_peers,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_BUFFERS)
+               .procname = "buffers",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_buffers,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_NIS)
+               .procname = "nis",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_nis,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR)
+               .procname = "portal_rotor",
+               .mode     = 0644,
+               .proc_handler = &proc_lnet_portal_rotor,
+       },
+       {
+               INIT_CTL_NAME(0)
+       }
+};
+
+static ctl_table_t top_table[] = {
+       {
+               INIT_CTL_NAME(CTL_LNET)
+               .procname = "lnet",
+               .mode     = 0555,
+               .data     = NULL,
+               .maxlen   = 0,
+               .child    = lnet_table,
+       },
+       {
+               INIT_CTL_NAME(0)
+       }
+};
+
+void
+lnet_proc_init(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (lnet_table_header == NULL)
+               lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+}
+
+void
+lnet_proc_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (lnet_table_header != NULL)
+               unregister_sysctl_table(lnet_table_header);
+
+       lnet_table_header = NULL;
+#endif
+}
+
+#else
+
+void
+lnet_proc_init(void)
+{
+}
+
+void
+lnet_proc_fini(void)
+{
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/Makefile b/drivers/staging/lustre/lnet/selftest/Makefile
new file mode 100644 (file)
index 0000000..1e40aee
--- /dev/null
@@ -0,0 +1,6 @@
+obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o
+
+lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \
+                  module.o ping_test.o brw_test.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c
new file mode 100644 (file)
index 0000000..3bb6fbe
--- /dev/null
@@ -0,0 +1,499 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/brw_test.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+static int brw_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(brw_srv_workitems, "i", int, 0644, "# BRW server workitems");
+
+static int brw_inject_errors;
+CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644,
+               "# data errors to inject randomly, zero by default");
+
+static void
+brw_client_fini (sfw_test_instance_t *tsi)
+{
+       srpc_bulk_t     *bulk;
+       sfw_test_unit_t *tsu;
+
+       LASSERT (tsi->tsi_is_client);
+
+       list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+               bulk = tsu->tsu_private;
+               if (bulk == NULL) continue;
+
+               srpc_free_bulk(bulk);
+               tsu->tsu_private = NULL;
+       }
+}
+
+int
+brw_client_init (sfw_test_instance_t *tsi)
+{
+       sfw_session_t    *sn = tsi->tsi_batch->bat_session;
+       int               flags;
+       int               npg;
+       int               len;
+       int               opc;
+       srpc_bulk_t      *bulk;
+       sfw_test_unit_t  *tsu;
+
+       LASSERT(sn != NULL);
+       LASSERT(tsi->tsi_is_client);
+
+       if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+               test_bulk_req_t  *breq = &tsi->tsi_u.bulk_v0;
+
+               opc   = breq->blk_opc;
+               flags = breq->blk_flags;
+               npg   = breq->blk_npg;
+               /* NB: this is not going to work for variable page size,
+                * but we have to keep it for compatibility */
+               len   = npg * PAGE_CACHE_SIZE;
+
+       } else {
+               test_bulk_req_v1_t  *breq = &tsi->tsi_u.bulk_v1;
+
+               /* I should never get to this step with an unknown feature
+                * because make_session will reject unknown features */
+               LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+               opc   = breq->blk_opc;
+               flags = breq->blk_flags;
+               len   = breq->blk_len;
+               npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       }
+
+       if (npg > LNET_MAX_IOV || npg <= 0)
+               return -EINVAL;
+
+       if (opc != LST_BRW_READ && opc != LST_BRW_WRITE)
+               return -EINVAL;
+
+       if (flags != LST_BRW_CHECK_NONE &&
+           flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE)
+               return -EINVAL;
+
+       list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+               bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid),
+                                      npg, len, opc == LST_BRW_READ);
+               if (bulk == NULL) {
+                       brw_client_fini(tsi);
+                       return -ENOMEM;
+               }
+
+               tsu->tsu_private = bulk;
+       }
+
+       return 0;
+}
+
+#define BRW_POISON      0xbeefbeefbeefbeefULL
+#define BRW_MAGIC       0xeeb0eeb1eeb2eeb3ULL
+#define BRW_MSIZE       sizeof(__u64)
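+/* write tests fill the bulk pages with BRW_MAGIC before sending; read tests
+ * pre-fill the local bulk with BRW_POISON and then verify that the data
+ * returned by the server carries BRW_MAGIC */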
+
+int
+brw_inject_one_error (void)
+{
+       struct timeval tv;
+
+       if (brw_inject_errors <= 0) return 0;
+
+       do_gettimeofday(&tv);
+
+       if ((tv.tv_usec & 1) == 0) return 0;
+
+       return brw_inject_errors--;
+}
+
+void
+brw_fill_page (struct page *pg, int pattern, __u64 magic)
+{
+       char *addr = page_address(pg);
+       int   i;
+
+       LASSERT (addr != NULL);
+
+       if (pattern == LST_BRW_CHECK_NONE) return;
+
+       if (magic == BRW_MAGIC)
+               magic += brw_inject_one_error();
+
+       if (pattern == LST_BRW_CHECK_SIMPLE) {
+               memcpy(addr, &magic, BRW_MSIZE);
+               addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+               memcpy(addr, &magic, BRW_MSIZE);
+               return;
+       }
+
+       if (pattern == LST_BRW_CHECK_FULL) {
+               for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++)
+                       memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE);
+               return;
+       }
+
+       LBUG ();
+       return;
+}
+
+int
+brw_check_page (struct page *pg, int pattern, __u64 magic)
+{
+       char  *addr = page_address(pg);
+       __u64  data = 0; /* make compiler happy */
+       int    i;
+
+       LASSERT (addr != NULL);
+
+       if (pattern == LST_BRW_CHECK_NONE)
+               return 0;
+
+       if (pattern == LST_BRW_CHECK_SIMPLE) {
+               data = *((__u64 *) addr);
+               if (data != magic) goto bad_data;
+
+               addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+               data = *((__u64 *) addr);
+               if (data != magic) goto bad_data;
+
+               return 0;
+       }
+
+       if (pattern == LST_BRW_CHECK_FULL) {
+               for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) {
+                       data = *(((__u64 *) addr) + i);
+                       if (data != magic) goto bad_data;
+               }
+
+               return 0;
+       }
+
+       LBUG ();
+
+bad_data:
+       CERROR ("Bad data in page %p: "LPX64", "LPX64" expected\n",
+               pg, data, magic);
+       return 1;
+}
+
+void
+brw_fill_bulk (srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+       int      i;
+       struct page *pg;
+
+       for (i = 0; i < bk->bk_niov; i++) {
+               pg = bk->bk_iovs[i].kiov_page;
+               brw_fill_page(pg, pattern, magic);
+       }
+}
+
+int
+brw_check_bulk (srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+       int      i;
+       struct page *pg;
+
+       for (i = 0; i < bk->bk_niov; i++) {
+               pg = bk->bk_iovs[i].kiov_page;
+               if (brw_check_page(pg, pattern, magic) != 0) {
+                       CERROR ("Bulk page %p (%d/%d) is corrupted!\n",
+                               pg, i, bk->bk_niov);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+static int
+brw_client_prep_rpc (sfw_test_unit_t *tsu,
+                    lnet_process_id_t dest, srpc_client_rpc_t **rpcpp)
+{
+       srpc_bulk_t      *bulk = tsu->tsu_private;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+       srpc_client_rpc_t   *rpc;
+       srpc_brw_reqst_t    *req;
+       int                  flags;
+       int                  npg;
+       int                  len;
+       int                  opc;
+       int                  rc;
+
+       LASSERT(sn != NULL);
+       LASSERT(bulk != NULL);
+
+       if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+               test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0;
+
+               opc   = breq->blk_opc;
+               flags = breq->blk_flags;
+               npg   = breq->blk_npg;
+               len   = npg * PAGE_CACHE_SIZE;
+
+       } else {
+               test_bulk_req_v1_t  *breq = &tsi->tsi_u.bulk_v1;
+
+               /* this should never be reached with an unknown feature,
+                * because make_session rejects unknown features */
+               LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+               opc   = breq->blk_opc;
+               flags = breq->blk_flags;
+               len   = breq->blk_len;
+               npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       }
+
+       rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc);
+       if (rc != 0)
+               return rc;
+
+       memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg]));
+       if (opc == LST_BRW_WRITE)
+               brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC);
+       else
+               brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON);
+
+       req = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+       req->brw_flags = flags;
+       req->brw_rw    = opc;
+       req->brw_len   = len;
+
+       *rpcpp = rpc;
+       return 0;
+}
+
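+/* Completion handler for a client BRW RPC: account failures in the session
+ * error counters, byte-swap the reply if the peer has a different
+ * endianness, and for reads verify that the returned bulk data carries the
+ * expected magic pattern. */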
+static void
+brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+       __u64           magic = BRW_MAGIC;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+       srpc_msg_t        *msg = &rpc->crpc_replymsg;
+       srpc_brw_reply_t    *reply = &msg->msg_body.brw_reply;
+       srpc_brw_reqst_t    *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+
+       LASSERT (sn != NULL);
+
+       if (rpc->crpc_status != 0) {
+               CERROR ("BRW RPC to %s failed with %d\n",
+                       libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);
+               if (!tsi->tsi_stopping) /* rpc could have been aborted */
+                       atomic_inc(&sn->sn_brw_errors);
+               goto out;
+       }
+
+       if (msg->msg_magic != SRPC_MSG_MAGIC) {
+               __swab64s(&magic);
+               __swab32s(&reply->brw_status);
+       }
+
+       CDEBUG (reply->brw_status ? D_WARNING : D_NET,
+               "BRW RPC to %s finished with brw_status: %d\n",
+               libcfs_id2str(rpc->crpc_dest), reply->brw_status);
+
+       if (reply->brw_status != 0) {
+               atomic_inc(&sn->sn_brw_errors);
+               rpc->crpc_status = -(int)reply->brw_status;
+               goto out;
+       }
+
+       if (reqst->brw_rw == LST_BRW_WRITE) goto out;
+
+       if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) {
+               CERROR ("Bulk data from %s is corrupted!\n",
+                       libcfs_id2str(rpc->crpc_dest));
+               atomic_inc(&sn->sn_brw_errors);
+               rpc->crpc_status = -EBADMSG;
+       }
+
+out:
+       return;
+}
+
+void
+brw_server_rpc_done (srpc_server_rpc_t *rpc)
+{
+       srpc_bulk_t *blk = rpc->srpc_bulk;
+
+       if (blk == NULL) return;
+
+       if (rpc->srpc_status != 0)
+               CERROR ("Bulk transfer %s %s has failed: %d\n",
+                       blk->bk_sink ? "from" : "to",
+                       libcfs_id2str(rpc->srpc_peer), rpc->srpc_status);
+       else
+               CDEBUG (D_NET, "Transferred %d pages of bulk data %s %s\n",
+                       blk->bk_niov, blk->bk_sink ? "from" : "to",
+                       libcfs_id2str(rpc->srpc_peer));
+
+       sfw_free_pages(rpc);
+}
+
+int
+brw_bulk_ready (srpc_server_rpc_t *rpc, int status)
+{
+       __u64        magic = BRW_MAGIC;
+       srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply;
+       srpc_brw_reqst_t *reqst;
+       srpc_msg_t       *reqstmsg;
+
+       LASSERT (rpc->srpc_bulk != NULL);
+       LASSERT (rpc->srpc_reqstbuf != NULL);
+
+       reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+       reqst = &reqstmsg->msg_body.brw_reqst;
+
+       if (status != 0) {
+               CERROR ("BRW bulk %s failed for RPC from %s: %d\n",
+                       reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE",
+                       libcfs_id2str(rpc->srpc_peer), status);
+               return -EIO;
+       }
+
+       if (reqst->brw_rw == LST_BRW_READ)
+               return 0;
+
+       if (reqstmsg->msg_magic != SRPC_MSG_MAGIC)
+               __swab64s(&magic);
+
+       if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) {
+               CERROR ("Bulk data from %s is corrupted!\n",
+                       libcfs_id2str(rpc->srpc_peer));
+               reply->brw_status = EBADMSG;
+       }
+
+       return 0;
+}
+
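+/* Server-side handler for an incoming BRW request: byte-swap the request
+ * from a cross-endian peer, validate opcode, check flags and length,
+ * allocate the bulk pages, and pre-fill them with BRW_MAGIC for reads
+ * (verified by the client) or BRW_POISON for writes (overwritten by the
+ * incoming data, which brw_bulk_ready() then checks). */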
+int
+brw_server_handle(struct srpc_server_rpc *rpc)
+{
+       struct srpc_service     *sv = rpc->srpc_scd->scd_svc;
+       srpc_msg_t       *replymsg = &rpc->srpc_replymsg;
+       srpc_msg_t       *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+       srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply;
+       srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst;
+       int               npg;
+       int            rc;
+
+       LASSERT (sv->sv_id == SRPC_SERVICE_BRW);
+
+       if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+               LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+               __swab32s(&reqst->brw_rw);
+               __swab32s(&reqst->brw_len);
+               __swab32s(&reqst->brw_flags);
+               __swab64s(&reqst->brw_rpyid);
+               __swab64s(&reqst->brw_bulkid);
+       }
+       LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id));
+
+       reply->brw_status = 0;
+       rpc->srpc_done = brw_server_rpc_done;
+
+       if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) ||
+           (reqst->brw_flags != LST_BRW_CHECK_NONE &&
+            reqst->brw_flags != LST_BRW_CHECK_FULL &&
+            reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) {
+               reply->brw_status = EINVAL;
+               return 0;
+       }
+
+       if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               replymsg->msg_ses_feats = LST_FEATS_MASK;
+               reply->brw_status = EPROTO;
+               return 0;
+       }
+
+       if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+               /* compat with old version */
+               if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) {
+                       reply->brw_status = EINVAL;
+                       return 0;
+               }
+               npg = reqst->brw_len >> PAGE_CACHE_SHIFT;
+
+       } else {
+               npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       }
+
+       replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+       if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) {
+               reply->brw_status = EINVAL;
+               return 0;
+       }
+
+       rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg,
+                            reqst->brw_len,
+                            reqst->brw_rw == LST_BRW_WRITE);
+       if (rc != 0)
+               return rc;
+
+       if (reqst->brw_rw == LST_BRW_READ)
+               brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC);
+       else
+               brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON);
+
+       return 0;
+}
+
+sfw_test_client_ops_t brw_test_client;
+void brw_init_test_client(void)
+{
+       brw_test_client.tso_init       = brw_client_init;
+       brw_test_client.tso_fini       = brw_client_fini;
+       brw_test_client.tso_prep_rpc   = brw_client_prep_rpc;
+       brw_test_client.tso_done_rpc   = brw_client_done_rpc;
+}
+
+srpc_service_t brw_test_service;
+void brw_init_test_service(void)
+{
+       brw_test_service.sv_id         = SRPC_SERVICE_BRW;
+       brw_test_service.sv_name       = "brw_test";
+       brw_test_service.sv_handler    = brw_server_handle;
+       brw_test_service.sv_bulk_ready = brw_bulk_ready;
+       brw_test_service.sv_wi_total   = brw_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c
new file mode 100644 (file)
index 0000000..bce3d3b
--- /dev/null
@@ -0,0 +1,931 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * IOC handle in kernel
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnetst.h>
+#include "console.h"
+
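+/* Each lst_*_ioctl() helper below validates its argument block, copies any
+ * user-space strings into NUL-terminated kernel buffers, and forwards the
+ * request to the corresponding lstcon_*() console routine. */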
+int
+lst_session_new_ioctl(lstio_session_new_args_t *args)
+{
+       char      *name;
+       int     rc;
+
+       if (args->lstio_ses_idp   == NULL || /* address for output sid */
+           args->lstio_ses_key   == 0 || /* no key is specified */
+           args->lstio_ses_namep == NULL || /* session name */
+           args->lstio_ses_nmlen <= 0 ||
+           args->lstio_ses_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_ses_namep,
+                              args->lstio_ses_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_ses_nmlen] = 0;
+
+       rc = lstcon_session_new(name,
+                               args->lstio_ses_key,
+                               args->lstio_ses_feats,
+                               args->lstio_ses_force,
+                               args->lstio_ses_timeout,
+                               args->lstio_ses_idp);
+
+       LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+       return rc;
+}
+
+int
+lst_session_end_ioctl(lstio_session_end_args_t *args)
+{
+       if (args->lstio_ses_key != console_session.ses_key)
+               return -EACCES;
+
+       return lstcon_session_end();
+}
+
+int
+lst_session_info_ioctl(lstio_session_info_args_t *args)
+{
+       /* no checking of key */
+
+       if (args->lstio_ses_idp   == NULL || /* address for output sid */
+           args->lstio_ses_keyp  == NULL || /* address for output key */
+           args->lstio_ses_featp  == NULL || /* address for output features */
+           args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */
+           args->lstio_ses_namep == NULL || /* address for output name */
+           args->lstio_ses_nmlen <= 0 ||
+           args->lstio_ses_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       return lstcon_session_info(args->lstio_ses_idp,
+                                  args->lstio_ses_keyp,
+                                  args->lstio_ses_featp,
+                                  args->lstio_ses_ndinfo,
+                                  args->lstio_ses_namep,
+                                  args->lstio_ses_nmlen);
+}
+
+int
+lst_debug_ioctl(lstio_debug_args_t *args)
+{
+       char   *name   = NULL;
+       int     client = 1;
+       int     rc;
+
+       if (args->lstio_dbg_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_dbg_resultp == NULL)
+               return -EINVAL;
+
+       if (args->lstio_dbg_namep != NULL && /* name of batch/group */
+           (args->lstio_dbg_nmlen <= 0 ||
+            args->lstio_dbg_nmlen > LST_NAME_SIZE))
+               return -EINVAL;
+
+       if (args->lstio_dbg_namep != NULL) {
+               LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1);
+               if (name == NULL)
+                       return -ENOMEM;
+
+               if (copy_from_user(name, args->lstio_dbg_namep,
+                                      args->lstio_dbg_nmlen)) {
+                       LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+                       return -EFAULT;
+               }
+
+               name[args->lstio_dbg_nmlen] = 0;
+       }
+
+       rc = -EINVAL;
+
+       switch (args->lstio_dbg_type) {
+       case LST_OPC_SESSION:
+               rc = lstcon_session_debug(args->lstio_dbg_timeout,
+                                         args->lstio_dbg_resultp);
+               break;
+
+       case LST_OPC_BATCHSRV:
+               client = 0;
+               /* fall through */
+       case LST_OPC_BATCHCLI:
+               if (name == NULL)
+                       goto out;
+
+               rc = lstcon_batch_debug(args->lstio_dbg_timeout,
+                                       name, client, args->lstio_dbg_resultp);
+               break;
+
+       case LST_OPC_GROUP:
+               if (name == NULL)
+                       goto out;
+
+               rc = lstcon_group_debug(args->lstio_dbg_timeout,
+                                       name, args->lstio_dbg_resultp);
+               break;
+
+       case LST_OPC_NODES:
+               if (args->lstio_dbg_count <= 0 ||
+                   args->lstio_dbg_idsp == NULL)
+                       goto out;
+
+               rc = lstcon_nodes_debug(args->lstio_dbg_timeout,
+                                       args->lstio_dbg_count,
+                                       args->lstio_dbg_idsp,
+                                       args->lstio_dbg_resultp);
+               break;
+
+       default:
+               break;
+       }
+
+out:
+       if (name != NULL)
+               LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_group_add_ioctl(lstio_group_add_args_t *args)
+{
+       char       *name;
+       int          rc;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_grp_namep,
+                              args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       rc = lstcon_group_add(name);
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_group_del_ioctl(lstio_group_del_args_t *args)
+{
+       int     rc;
+       char   *name;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_grp_namep,
+                              args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       rc = lstcon_group_del(name);
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_group_update_ioctl(lstio_group_update_args_t *args)
+{
+       int     rc;
+       char   *name;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_resultp == NULL ||
+           args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                          args->lstio_grp_namep,
+                          args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       switch (args->lstio_grp_opc) {
+       case LST_GROUP_CLEAN:
+               rc = lstcon_group_clean(name, args->lstio_grp_args);
+               break;
+
+       case LST_GROUP_REFRESH:
+               rc = lstcon_group_refresh(name, args->lstio_grp_resultp);
+               break;
+
+       case LST_GROUP_RMND:
+               if (args->lstio_grp_count  <= 0 ||
+                   args->lstio_grp_idsp == NULL) {
+                       rc = -EINVAL;
+                       break;
+               }
+               rc = lstcon_nodes_remove(name, args->lstio_grp_count,
+                                        args->lstio_grp_idsp,
+                                        args->lstio_grp_resultp);
+               break;
+
+       default:
+               rc = -EINVAL;
+               break;
+       }
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_nodes_add_ioctl(lstio_group_nodes_args_t *args)
+{
+       unsigned feats;
+       int     rc;
+       char   *name;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_idsp == NULL || /* array of ids */
+           args->lstio_grp_count <= 0 ||
+           args->lstio_grp_resultp == NULL ||
+           args->lstio_grp_featp == NULL ||
+           args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name, args->lstio_grp_namep,
+                              args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       rc = lstcon_nodes_add(name, args->lstio_grp_count,
+                             args->lstio_grp_idsp, &feats,
+                             args->lstio_grp_resultp);
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+       if (rc == 0 &&
+           copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) {
+               return -EINVAL;
+       }
+
+       return rc;
+}
+
+int
+lst_group_list_ioctl(lstio_group_list_args_t *args)
+{
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_idx   < 0 ||
+           args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       return lstcon_group_list(args->lstio_grp_idx,
+                             args->lstio_grp_nmlen,
+                             args->lstio_grp_namep);
+}
+
+int
+lst_group_info_ioctl(lstio_group_info_args_t *args)
+{
+       char       *name;
+       int          ndent;
+       int          index;
+       int          rc;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_grp_entp  == NULL && /* output: group entry */
+           args->lstio_grp_dentsp == NULL)  /* output: node entry */
+               return -EINVAL;
+
+       if (args->lstio_grp_dentsp != NULL) { /* have node entry */
+               if (args->lstio_grp_idxp == NULL || /* node index */
+                   args->lstio_grp_ndentp == NULL) /* # of node entry */
+                       return -EINVAL;
+
+               if (copy_from_user(&ndent, args->lstio_grp_ndentp,
+                                      sizeof(ndent)) ||
+                   copy_from_user(&index, args->lstio_grp_idxp,
+                                      sizeof(index)))
+                       return -EFAULT;
+
+               if (ndent <= 0 || index < 0)
+                       return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_grp_namep,
+                              args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       rc = lstcon_group_info(name, args->lstio_grp_entp,
+                              &index, &ndent, args->lstio_grp_dentsp);
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+       if (rc != 0)
+               return rc;
+
+       if (args->lstio_grp_dentsp != NULL &&
+           (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) ||
+            copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent))))
+               rc = -EFAULT;
+
+       return rc;
+}
+
+int
+lst_batch_add_ioctl(lstio_batch_add_args_t *args)
+{
+       int          rc;
+       char       *name;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep,
+                              args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_batch_add(name);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_batch_run_ioctl(lstio_batch_run_args_t *args)
+{
+       int          rc;
+       char       *name;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep,
+                              args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_batch_run(name, args->lstio_bat_timeout,
+                             args->lstio_bat_resultp);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_batch_stop_ioctl(lstio_batch_stop_args_t *args)
+{
+       int          rc;
+       char       *name;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_resultp == NULL ||
+           args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep,
+                              args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_batch_stop(name, args->lstio_bat_force,
+                              args->lstio_bat_resultp);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_batch_query_ioctl(lstio_batch_query_args_t *args)
+{
+       char   *name;
+       int     rc;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_resultp == NULL ||
+           args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_bat_testidx < 0)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep,
+                              args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_test_batch_query(name,
+                                    args->lstio_bat_testidx,
+                                    args->lstio_bat_client,
+                                    args->lstio_bat_timeout,
+                                    args->lstio_bat_resultp);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_batch_list_ioctl(lstio_batch_list_args_t *args)
+{
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_idx   < 0 ||
+           args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       return lstcon_batch_list(args->lstio_bat_idx,
+                             args->lstio_bat_nmlen,
+                             args->lstio_bat_namep);
+}
+
+int
+lst_batch_info_ioctl(lstio_batch_info_args_t *args)
+{
+       char       *name;
+       int          rc;
+       int          index;
+       int          ndent;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_namep == NULL || /* batch name */
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_bat_entp == NULL && /* output: batch entry */
+           args->lstio_bat_dentsp == NULL) /* output: node entry */
+               return -EINVAL;
+
+       if (args->lstio_bat_dentsp != NULL) { /* have node entry */
+               if (args->lstio_bat_idxp == NULL || /* node index */
+                   args->lstio_bat_ndentp == NULL) /* # of node entry */
+                       return -EINVAL;
+
+               if (copy_from_user(&index, args->lstio_bat_idxp,
+                                      sizeof(index)) ||
+                   copy_from_user(&ndent, args->lstio_bat_ndentp,
+                                      sizeof(ndent)))
+                       return -EFAULT;
+
+               if (ndent <= 0 || index < 0)
+                       return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep, args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_batch_info(name,
+                           args->lstio_bat_entp, args->lstio_bat_server,
+                           args->lstio_bat_testidx, &index, &ndent,
+                           args->lstio_bat_dentsp);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       if (rc != 0)
+               return rc;
+
+       if (args->lstio_bat_dentsp != NULL &&
+           (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) ||
+            copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent))))
+               rc = -EFAULT;
+
+       return rc;
+}
+
+int
+lst_stat_query_ioctl(lstio_stat_args_t *args)
+{
+       int          rc;
+       char       *name;
+
+       /* TODO: not finished */
+       if (args->lstio_sta_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_sta_resultp == NULL ||
+           (args->lstio_sta_namep  == NULL &&
+            args->lstio_sta_idsp   == NULL) ||
+           args->lstio_sta_nmlen <= 0 ||
+           args->lstio_sta_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_sta_idsp != NULL &&
+           args->lstio_sta_count <= 0)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       /* lstio_sta_namep may be NULL when stats are queried by node ids */
+       if (args->lstio_sta_namep != NULL) {
+               if (copy_from_user(name, args->lstio_sta_namep,
+                                      args->lstio_sta_nmlen)) {
+                       LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+                       return -EFAULT;
+               }
+               name[args->lstio_sta_nmlen] = 0;
+       }
+
+       if (args->lstio_sta_idsp == NULL) {
+               rc = lstcon_group_stat(name, args->lstio_sta_timeout,
+                                      args->lstio_sta_resultp);
+       } else {
+               rc = lstcon_nodes_stat(args->lstio_sta_count,
+                                      args->lstio_sta_idsp,
+                                      args->lstio_sta_timeout,
+                                      args->lstio_sta_resultp);
+       }
+
+       LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+
+       return rc;
+}
+
+int lst_test_add_ioctl(lstio_test_args_t *args)
+{
+       char       *name;
+       char       *srcgrp = NULL;
+       char       *dstgrp = NULL;
+       void       *param = NULL;
+       int          ret = 0;
+       int          rc = -ENOMEM;
+
+       if (args->lstio_tes_resultp == NULL ||
+           args->lstio_tes_retp == NULL ||
+           args->lstio_tes_bat_name == NULL || /* no specified batch */
+           args->lstio_tes_bat_nmlen <= 0 ||
+           args->lstio_tes_bat_nmlen > LST_NAME_SIZE ||
+           args->lstio_tes_sgrp_name == NULL || /* no source group */
+           args->lstio_tes_sgrp_nmlen <= 0 ||
+           args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE ||
+           args->lstio_tes_dgrp_name == NULL || /* no target group */
+           args->lstio_tes_dgrp_nmlen <= 0 ||
+           args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_tes_loop == 0 || /* negative is infinite */
+           args->lstio_tes_concur <= 0 ||
+           args->lstio_tes_dist <= 0 ||
+           args->lstio_tes_span <= 0)
+               return -EINVAL;
+
+       /* if a parameter is given, check that its length is valid */
+       if (args->lstio_tes_param != NULL &&
+           (args->lstio_tes_param_len <= 0 ||
+            args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t)))
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_tes_bat_nmlen + 1);
+       if (name == NULL)
+               return rc;
+
+       LIBCFS_ALLOC(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+       if (srcgrp == NULL)
+               goto out;
+
+       LIBCFS_ALLOC(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+       if (dstgrp == NULL)
+               goto out;
+
+       if (args->lstio_tes_param != NULL) {
+               LIBCFS_ALLOC(param, args->lstio_tes_param_len);
+               if (param == NULL)
+                       goto out;
+       }
+
+       rc = -EFAULT;
+       if (copy_from_user(name,
+                             args->lstio_tes_bat_name,
+                             args->lstio_tes_bat_nmlen) ||
+           copy_from_user(srcgrp,
+                             args->lstio_tes_sgrp_name,
+                             args->lstio_tes_sgrp_nmlen) ||
+           copy_from_user(dstgrp,
+                             args->lstio_tes_dgrp_name,
+                             args->lstio_tes_dgrp_nmlen) ||
+           (args->lstio_tes_param != NULL &&
+            copy_from_user(param, args->lstio_tes_param,
+                              args->lstio_tes_param_len)))
+               goto out;
+
+       rc = lstcon_test_add(name,
+                           args->lstio_tes_type,
+                           args->lstio_tes_loop,
+                           args->lstio_tes_concur,
+                           args->lstio_tes_dist, args->lstio_tes_span,
+                           srcgrp, dstgrp, param, args->lstio_tes_param_len,
+                           &ret, args->lstio_tes_resultp);
+
+       if (ret != 0)
+               rc = (copy_to_user(args->lstio_tes_retp, &ret,
+                                      sizeof(ret))) ? -EFAULT : 0;
+out:
+       if (name != NULL)
+               LIBCFS_FREE(name, args->lstio_tes_bat_nmlen + 1);
+
+       if (srcgrp != NULL)
+               LIBCFS_FREE(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+
+       if (dstgrp != NULL)
+               LIBCFS_FREE(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+
+       if (param != NULL)
+               LIBCFS_FREE(param, args->lstio_tes_param_len);
+
+       return rc;
+}
+
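+/* Entry point for IOC_LIBCFS_LNETST: copy the opcode-specific argument
+ * block from user space, serialize against other console operations with
+ * ses_mutex, dispatch on the opcode, and copy the per-transaction
+ * statistics back to user space. */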
+int
+lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+       char   *buf;
+       int     opc = data->ioc_u32[0];
+       int     rc;
+
+       if (cmd != IOC_LIBCFS_LNETST)
+               return -EINVAL;
+
+       if (data->ioc_plen1 > PAGE_CACHE_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(buf, data->ioc_plen1);
+       if (buf == NULL)
+               return -ENOMEM;
+
+       /* copy in parameter */
+       if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) {
+               LIBCFS_FREE(buf, data->ioc_plen1);
+               return -EFAULT;
+       }
+
+       mutex_lock(&console_session.ses_mutex);
+
+       console_session.ses_laststamp = cfs_time_current_sec();
+
+       if (console_session.ses_shutdown) {
+               rc = -ESHUTDOWN;
+               goto out;
+       }
+
+       if (console_session.ses_expired)
+               lstcon_session_end();
+
+       if (opc != LSTIO_SESSION_NEW &&
+           console_session.ses_state == LST_SESSION_NONE) {
+               CDEBUG(D_NET, "LST no active session\n");
+               rc = -ESRCH;
+               goto out;
+       }
+
+       memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t));
+
+       switch (opc) {
+               case LSTIO_SESSION_NEW:
+                       rc = lst_session_new_ioctl((lstio_session_new_args_t *)buf);
+                       break;
+               case LSTIO_SESSION_END:
+                       rc = lst_session_end_ioctl((lstio_session_end_args_t *)buf);
+                       break;
+               case LSTIO_SESSION_INFO:
+                       rc = lst_session_info_ioctl((lstio_session_info_args_t *)buf);
+                       break;
+               case LSTIO_DEBUG:
+                       rc = lst_debug_ioctl((lstio_debug_args_t *)buf);
+                       break;
+               case LSTIO_GROUP_ADD:
+                       rc = lst_group_add_ioctl((lstio_group_add_args_t *)buf);
+                       break;
+               case LSTIO_GROUP_DEL:
+                       rc = lst_group_del_ioctl((lstio_group_del_args_t *)buf);
+                       break;
+               case LSTIO_GROUP_UPDATE:
+                       rc = lst_group_update_ioctl((lstio_group_update_args_t *)buf);
+                       break;
+               case LSTIO_NODES_ADD:
+                       rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)buf);
+                       break;
+               case LSTIO_GROUP_LIST:
+                       rc = lst_group_list_ioctl((lstio_group_list_args_t *)buf);
+                       break;
+               case LSTIO_GROUP_INFO:
+                       rc = lst_group_info_ioctl((lstio_group_info_args_t *)buf);
+                       break;
+               case LSTIO_BATCH_ADD:
+                       rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)buf);
+                       break;
+               case LSTIO_BATCH_START:
+                       rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)buf);
+                       break;
+               case LSTIO_BATCH_STOP:
+                       rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)buf);
+                       break;
+               case LSTIO_BATCH_QUERY:
+                       rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)buf);
+                       break;
+               case LSTIO_BATCH_LIST:
+                       rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)buf);
+                       break;
+               case LSTIO_BATCH_INFO:
+                       rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)buf);
+                       break;
+               case LSTIO_TEST_ADD:
+                       rc = lst_test_add_ioctl((lstio_test_args_t *)buf);
+                       break;
+               case LSTIO_STAT_QUERY:
+                       rc = lst_stat_query_ioctl((lstio_stat_args_t *)buf);
+                       break;
+               default:
+                       rc = -EINVAL;
+       }
+
+       if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat,
+                            sizeof(lstcon_trans_stat_t)))
+               rc = -EFAULT;
+out:
+       mutex_unlock(&console_session.ses_mutex);
+
+       LIBCFS_FREE(buf, data->ioc_plen1);
+
+       return rc;
+}
+
+EXPORT_SYMBOL(lstcon_ioctl_entry);
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c
new file mode 100644 (file)
index 0000000..446de0e
--- /dev/null
@@ -0,0 +1,1397 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conrpc.c
+ *
+ * Console framework rpcs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "timer.h"
+#include "conrpc.h"
+#include "console.h"
+
+void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *,
+                          lstcon_node_t *, lstcon_trans_stat_t *);
+
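+/* Completion callback of a console RPC: an orphaned RPC (one detached from
+ * its transaction) is simply released; otherwise the RPC is marked finished
+ * and the transaction thread is woken up once the last RPC of the
+ * transaction has completed. */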
+static void
+lstcon_rpc_done(srpc_client_rpc_t *rpc)
+{
+       lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv;
+
+       LASSERT(crpc != NULL && rpc == crpc->crp_rpc);
+       LASSERT(crpc->crp_posted && !crpc->crp_finished);
+
+       spin_lock(&rpc->crpc_lock);
+
+       if (crpc->crp_trans == NULL) {
+               /* Orphan RPC is not in any transaction,
+                * I'm just a poor body and nobody loves me */
+               spin_unlock(&rpc->crpc_lock);
+
+               /* release it */
+               lstcon_rpc_put(crpc);
+               return;
+       }
+
+       /* not an orphan RPC */
+       crpc->crp_finished = 1;
+
+       if (crpc->crp_stamp == 0) {
+               /* not aborted */
+               LASSERT (crpc->crp_status == 0);
+
+               crpc->crp_stamp  = cfs_time_current();
+               crpc->crp_status = rpc->crpc_status;
+       }
+
+       /* wake up the transaction thread if this is its last RPC */
+       if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining))
+               wake_up(&crpc->crp_trans->tas_waitq);
+
+       spin_unlock(&rpc->crpc_lock);
+}
+
+int
+lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats,
+               int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc)
+{
+       crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service,
+                                      feats, bulk_npg, bulk_len,
+                                      lstcon_rpc_done, (void *)crpc);
+       if (crpc->crp_rpc == NULL)
+               return -ENOMEM;
+
+       crpc->crp_trans    = NULL;
+       crpc->crp_node     = nd;
+       crpc->crp_posted   = 0;
+       crpc->crp_finished = 0;
+       crpc->crp_unpacked = 0;
+       crpc->crp_status   = 0;
+       crpc->crp_stamp    = 0;
+       crpc->crp_embedded = embedded;
+       INIT_LIST_HEAD(&crpc->crp_link);
+
+       atomic_inc(&console_session.ses_rpc_counter);
+
+       return 0;
+}
+
+int
+lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats,
+               int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp)
+{
+       lstcon_rpc_t  *crpc = NULL;
+       int         rc;
+
+       spin_lock(&console_session.ses_rpc_lock);
+
+       if (!list_empty(&console_session.ses_rpc_freelist)) {
+               crpc = list_entry(console_session.ses_rpc_freelist.next,
+                                     lstcon_rpc_t, crp_link);
+               list_del_init(&crpc->crp_link);
+       }
+
+       spin_unlock(&console_session.ses_rpc_lock);
+
+       if (crpc == NULL) {
+               LIBCFS_ALLOC(crpc, sizeof(*crpc));
+               if (crpc == NULL)
+                       return -ENOMEM;
+       }
+
+       rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc);
+       if (rc == 0) {
+               *crpcpp = crpc;
+               return 0;
+       }
+
+       LIBCFS_FREE(crpc, sizeof(*crpc));
+
+       return rc;
+}
+
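+/* Release a console RPC: free its bulk pages and drop the underlying client
+ * RPC reference; embedded RPCs are just cleared, while dynamically allocated
+ * ones are returned to the session free list for reuse. */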
+void
+lstcon_rpc_put(lstcon_rpc_t *crpc)
+{
+       srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk;
+       int       i;
+
+       LASSERT (list_empty(&crpc->crp_link));
+
+       for (i = 0; i < bulk->bk_niov; i++) {
+               if (bulk->bk_iovs[i].kiov_page == NULL)
+                       continue;
+
+               __free_page(bulk->bk_iovs[i].kiov_page);
+       }
+
+       srpc_client_rpc_decref(crpc->crp_rpc);
+
+       if (crpc->crp_embedded) {
+               /* embedded RPC, don't recycle it */
+               memset(crpc, 0, sizeof(*crpc));
+               crpc->crp_embedded = 1;
+
+       } else {
+               spin_lock(&console_session.ses_rpc_lock);
+
+               list_add(&crpc->crp_link,
+                            &console_session.ses_rpc_freelist);
+
+               spin_unlock(&console_session.ses_rpc_lock);
+       }
+
+       /* RPC is not alive now */
+       atomic_dec(&console_session.ses_rpc_counter);
+}
+
+void
+lstcon_rpc_post(lstcon_rpc_t *crpc)
+{
+       lstcon_rpc_trans_t *trans = crpc->crp_trans;
+
+       LASSERT (trans != NULL);
+
+       atomic_inc(&trans->tas_remaining);
+       crpc->crp_posted = 1;
+
+       sfw_post_rpc(crpc->crp_rpc);
+}
+
+static char *
+lstcon_rpc_trans_name(int transop)
+{
+       if (transop == LST_TRANS_SESNEW)
+               return "SESNEW";
+
+       if (transop == LST_TRANS_SESEND)
+               return "SESEND";
+
+       if (transop == LST_TRANS_SESQRY)
+               return "SESQRY";
+
+       if (transop == LST_TRANS_SESPING)
+               return "SESPING";
+
+       if (transop == LST_TRANS_TSBCLIADD)
+               return "TSBCLIADD";
+
+       if (transop == LST_TRANS_TSBSRVADD)
+               return "TSBSRVADD";
+
+       if (transop == LST_TRANS_TSBRUN)
+               return "TSBRUN";
+
+       if (transop == LST_TRANS_TSBSTOP)
+               return "TSBSTOP";
+
+       if (transop == LST_TRANS_TSBCLIQRY)
+               return "TSBCLIQRY";
+
+       if (transop == LST_TRANS_TSBSRVQRY)
+               return "TSBSRVQRY";
+
+       if (transop == LST_TRANS_STATQRY)
+               return "STATQRY";
+
+       return "Unknown";
+}
+
+int
+lstcon_rpc_trans_prep(struct list_head *translist,
+                     int transop, lstcon_rpc_trans_t **transpp)
+{
+       lstcon_rpc_trans_t *trans;
+
+       if (translist != NULL) {
+               list_for_each_entry(trans, translist, tas_link) {
+                       /* Can't enqueue two private transactions on
+                        * the same object */
+                       if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE)
+                               return -EPERM;
+               }
+       }
+
+       /* create a trans group */
+       LIBCFS_ALLOC(trans, sizeof(*trans));
+       if (trans == NULL)
+               return -ENOMEM;
+
+       trans->tas_opc = transop;
+
+       if (translist == NULL)
+               INIT_LIST_HEAD(&trans->tas_olink);
+       else
+               list_add_tail(&trans->tas_olink, translist);
+
+       list_add_tail(&trans->tas_link, &console_session.ses_trans_list);
+
+       INIT_LIST_HEAD(&trans->tas_rpcs_list);
+       atomic_set(&trans->tas_remaining, 0);
+       init_waitqueue_head(&trans->tas_waitq);
+
+       spin_lock(&console_session.ses_rpc_lock);
+       trans->tas_features = console_session.ses_features;
+       spin_unlock(&console_session.ses_rpc_lock);
+
+       *transpp = trans;
+       return 0;
+}
+
+void
+lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc)
+{
+       list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list);
+       crpc->crp_trans = trans;
+}
+
+void
+lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error)
+{
+       srpc_client_rpc_t *rpc;
+       lstcon_rpc_t      *crpc;
+       lstcon_node_t     *nd;
+
+       list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+               rpc = crpc->crp_rpc;
+
+               spin_lock(&rpc->crpc_lock);
+
+               if (!crpc->crp_posted || /* not posted */
+                   crpc->crp_stamp != 0) { /* rpc done or aborted already */
+                       if (crpc->crp_stamp == 0) {
+                               crpc->crp_stamp = cfs_time_current();
+                               crpc->crp_status = -EINTR;
+                       }
+                       spin_unlock(&rpc->crpc_lock);
+                       continue;
+               }
+
+               crpc->crp_stamp  = cfs_time_current();
+               crpc->crp_status = error;
+
+               spin_unlock(&rpc->crpc_lock);
+
+               sfw_abort_rpc(rpc);
+
+               if (error != -ETIMEDOUT)
+                       continue;
+
+               nd = crpc->crp_node;
+               if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+                       continue;
+
+               nd->nd_stamp = crpc->crp_stamp;
+               nd->nd_state = LST_NODE_DOWN;
+       }
+}
+
+static int
+lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans)
+{
+       if (console_session.ses_shutdown &&
+           !list_empty(&trans->tas_olink)) /* Not an end session RPC */
+               return 1;
+
+       return (atomic_read(&trans->tas_remaining) == 0) ? 1 : 0;
+}
+
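+/* Post every RPC of the transaction and wait (interruptibly, up to the
+ * given timeout) for all of them to complete.  The session mutex is
+ * dropped while waiting; on error or timeout the remaining RPCs are
+ * aborted. */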
+int
+lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout)
+{
+       lstcon_rpc_t  *crpc;
+       int         rc;
+
+       if (list_empty(&trans->tas_rpcs_list))
+               return 0;
+
+       if (timeout < LST_TRANS_MIN_TIMEOUT)
+               timeout = LST_TRANS_MIN_TIMEOUT;
+
+       CDEBUG(D_NET, "Transaction %s started\n",
+              lstcon_rpc_trans_name(trans->tas_opc));
+
+       /* post all requests */
+       list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+               LASSERT (!crpc->crp_posted);
+
+               lstcon_rpc_post(crpc);
+       }
+
+       mutex_unlock(&console_session.ses_mutex);
+
+       rc = wait_event_interruptible_timeout(trans->tas_waitq,
+                                             lstcon_rpc_trans_check(trans),
+                                             cfs_time_seconds(timeout));
+       rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT);
+
+       mutex_lock(&console_session.ses_mutex);
+
+       if (console_session.ses_shutdown)
+               rc = -ESHUTDOWN;
+
+       if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) {
+               /* treat short timeout as canceled */
+               if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2)
+                       rc = -EINTR;
+
+               lstcon_rpc_trans_abort(trans, rc);
+       }
+
+       CDEBUG(D_NET, "Transaction %s stopped: %d\n",
+              lstcon_rpc_trans_name(trans->tas_opc), rc);
+
+       lstcon_rpc_trans_stat(trans, lstcon_trans_stat());
+
+       return rc;
+}
+
+int
+lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp)
+{
+       lstcon_node_t   *nd  = crpc->crp_node;
+       srpc_client_rpc_t    *rpc = crpc->crp_rpc;
+       srpc_generic_reply_t *rep;
+
+       LASSERT (nd != NULL && rpc != NULL);
+       LASSERT (crpc->crp_stamp != 0);
+
+       if (crpc->crp_status != 0) {
+               *msgpp = NULL;
+               return crpc->crp_status;
+       }
+
+       *msgpp = &rpc->crpc_replymsg;
+       if (!crpc->crp_unpacked) {
+               sfw_unpack_message(*msgpp);
+               crpc->crp_unpacked = 1;
+       }
+
+       if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+               return 0;
+
+       nd->nd_stamp = crpc->crp_stamp;
+       rep = &(*msgpp)->msg_body.reply;
+
+       if (rep->sid.ses_nid == LNET_NID_ANY)
+               nd->nd_state = LST_NODE_UNKNOWN;
+       else if (lstcon_session_match(rep->sid))
+               nd->nd_state = LST_NODE_ACTIVE;
+       else
+               nd->nd_state = LST_NODE_BUSY;
+
+       return 0;
+}
+
+void
+lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat)
+{
+       lstcon_rpc_t      *crpc;
+       srpc_msg_t      *rep;
+       int             error;
+
+       LASSERT (stat != NULL);
+
+       memset(stat, 0, sizeof(*stat));
+
+       list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+               lstcon_rpc_stat_total(stat, 1);
+
+               LASSERT (crpc->crp_stamp != 0);
+
+               error = lstcon_rpc_get_reply(crpc, &rep);
+               if (error != 0) {
+                       lstcon_rpc_stat_failure(stat, 1);
+                       if (stat->trs_rpc_errno == 0)
+                               stat->trs_rpc_errno = -error;
+
+                       continue;
+               }
+
+               lstcon_rpc_stat_success(stat, 1);
+
+               lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat);
+       }
+
+       if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) {
+               stat->trs_fwk_errno =
+                     lstcon_session_feats_check(trans->tas_features);
+       }
+
+       CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, "
+                     "RPC error(%d), Framework error(%d)\n",
+              lstcon_rpc_trans_name(trans->tas_opc),
+              lstcon_rpc_stat_success(stat, 0),
+              lstcon_rpc_stat_failure(stat, 0),
+              lstcon_rpc_stat_total(stat, 0),
+              stat->trs_rpc_errno, stat->trs_fwk_errno);
+
+       return;
+}
+
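+/* Walk the user-space result list in step with the transaction's RPC list
+ * and copy per-node status (peer id, timestamp, node state, RPC and
+ * framework errors) out to user space; the optional readent callback
+ * decodes the operation-specific part of each reply. */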
+int
+lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+                            struct list_head *head_up,
+                            lstcon_rpc_readent_func_t readent)
+{
+       struct list_head            tmp;
+       struct list_head           *next;
+       lstcon_rpc_ent_t     *ent;
+       srpc_generic_reply_t *rep;
+       lstcon_rpc_t     *crpc;
+       srpc_msg_t         *msg;
+       lstcon_node_t   *nd;
+       cfs_duration_t  dur;
+       struct timeval  tv;
+       int                error;
+
+       LASSERT (head_up != NULL);
+
+       next = head_up;
+
+       list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+               if (copy_from_user(&tmp, next,
+                                      sizeof(struct list_head)))
+                       return -EFAULT;
+
+               if (tmp.next == head_up)
+                       return 0;
+
+               next = tmp.next;
+
+               ent = list_entry(next, lstcon_rpc_ent_t, rpe_link);
+
+               LASSERT (crpc->crp_stamp != 0);
+
+               error = lstcon_rpc_get_reply(crpc, &msg);
+
+               nd = crpc->crp_node;
+
+               dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp,
+                     (cfs_time_t)console_session.ses_id.ses_stamp);
+               cfs_duration_usec(dur, &tv);
+
+               if (copy_to_user(&ent->rpe_peer,
+                                    &nd->nd_id, sizeof(lnet_process_id_t)) ||
+                   copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) ||
+                   copy_to_user(&ent->rpe_state,
+                                    &nd->nd_state, sizeof(nd->nd_state)) ||
+                   copy_to_user(&ent->rpe_rpc_errno, &error,
+                                    sizeof(error)))
+                       return -EFAULT;
+
+               if (error != 0)
+                       continue;
+
+               /* RPC is done */
+               rep = (srpc_generic_reply_t *)&msg->msg_body.reply;
+
+               if (copy_to_user(&ent->rpe_sid,
+                                    &rep->sid, sizeof(lst_sid_t)) ||
+                   copy_to_user(&ent->rpe_fwk_errno,
+                                    &rep->status, sizeof(rep->status)))
+                       return -EFAULT;
+
+               if (readent == NULL)
+                       continue;
+
+               error = readent(trans->tas_opc, msg, ent);
+               if (error != 0)
+                       return error;
+       }
+
+       return 0;
+}
+
+void
+lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans)
+{
+       srpc_client_rpc_t *rpc;
+       lstcon_rpc_t      *crpc;
+       lstcon_rpc_t      *tmp;
+       int             count = 0;
+
+       list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list,
+                                crp_link) {
+               rpc = crpc->crp_rpc;
+
+               spin_lock(&rpc->crpc_lock);
+
+               /* free it if not posted or finished already */
+               if (!crpc->crp_posted || crpc->crp_finished) {
+                       spin_unlock(&rpc->crpc_lock);
+
+                       list_del_init(&crpc->crp_link);
+                       lstcon_rpc_put(crpc);
+
+                       continue;
+               }
+
+               /* RPCs may still not have been called back (even after
+                * LNetMDUnlink) because of the huge timeout on an
+                * inaccessible network; don't make the user wait for them,
+                * just abandon them, they will be recycled in the callback */
+
+               LASSERT (crpc->crp_status != 0);
+
+               crpc->crp_node  = NULL;
+               crpc->crp_trans = NULL;
+               list_del_init(&crpc->crp_link);
+               count++;
+
+               spin_unlock(&rpc->crpc_lock);
+
+               atomic_dec(&trans->tas_remaining);
+       }
+
+       LASSERT (atomic_read(&trans->tas_remaining) == 0);
+
+       list_del(&trans->tas_link);
+       if (!list_empty(&trans->tas_olink))
+               list_del(&trans->tas_olink);
+
+       CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n",
+              lstcon_rpc_trans_name(trans->tas_opc), count);
+
+       LIBCFS_FREE(trans, sizeof(*trans));
+
+       return;
+}
+
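+/* Build a session RPC for a node: a MAKE_SESSION request for
+ * LST_TRANS_SESNEW, or a REMOVE_SESSION request for LST_TRANS_SESEND. */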
+int
+lstcon_sesrpc_prep(lstcon_node_t *nd, int transop,
+                  unsigned feats, lstcon_rpc_t **crpc)
+{
+       srpc_mksn_reqst_t *msrq;
+       srpc_rmsn_reqst_t *rsrq;
+       int             rc;
+
+       switch (transop) {
+       case LST_TRANS_SESNEW:
+               rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION,
+                                    feats, 0, 0, crpc);
+               if (rc != 0)
+                       return rc;
+
+               msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst;
+               msrq->mksn_sid     = console_session.ses_id;
+               msrq->mksn_force   = console_session.ses_force;
+               strncpy(msrq->mksn_name, console_session.ses_name,
+                       strlen(console_session.ses_name));
+               break;
+
+       case LST_TRANS_SESEND:
+               rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION,
+                                    feats, 0, 0, crpc);
+               if (rc != 0)
+                       return rc;
+
+               rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst;
+               rsrq->rmsn_sid = console_session.ses_id;
+               break;
+
+       default:
+               LBUG();
+       }
+
+       return 0;
+}
+
+int
+lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+       srpc_debug_reqst_t *drq;
+       int                 rc;
+
+       rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc);
+       if (rc != 0)
+               return rc;
+
+       drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+       drq->dbg_sid   = console_session.ses_id;
+       drq->dbg_flags = 0;
+
+       return rc;
+}
+
+int
+lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+                  lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc)
+{
+       lstcon_batch_t     *batch;
+       srpc_batch_reqst_t *brq;
+       int                 rc;
+
+       rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc);
+       if (rc != 0)
+               return rc;
+
+       brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst;
+
+       brq->bar_sid     = console_session.ses_id;
+       brq->bar_bid     = tsb->tsb_id;
+       brq->bar_testidx = tsb->tsb_index;
+       brq->bar_opc     = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN :
+                          (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP:
+                           SRPC_BATCH_OPC_QUERY);
+
+       if (transop != LST_TRANS_TSBRUN &&
+           transop != LST_TRANS_TSBSTOP)
+               return 0;
+
+       LASSERT (tsb->tsb_index == 0);
+
+       batch = (lstcon_batch_t *)tsb;
+       brq->bar_arg = batch->bat_arg;
+
+       return 0;
+}
+
+int
+lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+       srpc_stat_reqst_t *srq;
+       int                rc;
+
+       rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc);
+       if (rc != 0)
+               return rc;
+
+       srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst;
+
+       srq->str_sid  = console_session.ses_id;
+       srq->str_type = 0; /* XXX remove it */
+
+       return 0;
+}
+
+lnet_process_id_packed_t *
+lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov)
+{
+       lnet_process_id_packed_t *pid;
+       int                    i;
+
+       i = idx / SFW_ID_PER_PAGE;
+
+       LASSERT (i < nkiov);
+
+       pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page);
+
+       return &pid[idx % SFW_ID_PER_PAGE];
+}
+
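+/* Fill the bulk pages with packed destination IDs for one test client:
+ * client index "idx" selects a window of "span" nodes in the group, the
+ * window advancing by one span for every "dist" clients and wrapping
+ * around the group when it runs past the end. */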
+int
+lstcon_dstnodes_prep(lstcon_group_t *grp, int idx,
+                    int dist, int span, int nkiov, lnet_kiov_t *kiov)
+{
+       lnet_process_id_packed_t *pid;
+       lstcon_ndlink_t   *ndl;
+       lstcon_node_t       *nd;
+       int                    start;
+       int                    end;
+       int                    i = 0;
+
+       LASSERT (dist >= 1);
+       LASSERT (span >= 1);
+       LASSERT (grp->grp_nnode >= 1);
+
+       if (span > grp->grp_nnode)
+               return -EINVAL;
+
+       start = ((idx / dist) * span) % grp->grp_nnode;
+       end   = ((idx / dist) * span + span - 1) % grp->grp_nnode;
+
+       list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+               nd = ndl->ndl_node;
+               if (i < start) {
+                       i++;
+                       continue;
+               }
+
+               if (i > (end >= start ? end : grp->grp_nnode))
+                       break;
+
+               pid = lstcon_next_id((i - start), nkiov, kiov);
+               pid->nid = nd->nd_id.nid;
+               pid->pid = nd->nd_id.pid;
+               i++;
+       }
+
+       if (start <= end) /* done */
+               return 0;
+
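+       /* the window wrapped past the end of the group list; fill the
+        * remaining destination IDs from the head of the list */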
+       list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+               if (i > grp->grp_nnode + end)
+                       break;
+
+               nd = ndl->ndl_node;
+               pid = lstcon_next_id((i - start), nkiov, kiov);
+               pid->nid = nd->nd_id.nid;
+               pid->pid = nd->nd_id.pid;
+               i++;
+       }
+
+       return 0;
+}
+
+int
+lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req)
+{
+       test_ping_req_t *prq = &req->tsr_u.ping;
+
+       prq->png_size   = param->png_size;
+       prq->png_flags  = param->png_flags;
+       /* TODO dest */
+       return 0;
+}
+
+int
+lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+       test_bulk_req_t *brq = &req->tsr_u.bulk_v0;
+
+       brq->blk_opc    = param->blk_opc;
+       brq->blk_npg    = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE;
+       brq->blk_flags  = param->blk_flags;
+
+       return 0;
+}
+
+int
+lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+       test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1;
+
+       brq->blk_opc    = param->blk_opc;
+       brq->blk_flags  = param->blk_flags;
+       brq->blk_len    = param->blk_size;
+       brq->blk_offset = 0; /* reserved */
+
+       return 0;
+}
+
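+/* Build a test RPC for a node. For the client side (LST_TRANS_TSBCLIADD)
+ * the RPC also carries bulk pages holding the packed destination IDs;
+ * for the server side only the expected loop count is computed. */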
+int
+lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+                   lstcon_test_t *test, lstcon_rpc_t **crpc)
+{
+       lstcon_group_t    *sgrp = test->tes_src_grp;
+       lstcon_group_t    *dgrp = test->tes_dst_grp;
+       srpc_test_reqst_t *trq;
+       srpc_bulk_t       *bulk;
+       int             i;
+       int                npg = 0;
+       int                nob = 0;
+       int                rc  = 0;
+
+       if (transop == LST_TRANS_TSBCLIADD) {
+               npg = sfw_id_pages(test->tes_span);
+               nob = (feats & LST_FEAT_BULK_LEN) == 0 ?
+                     npg * PAGE_CACHE_SIZE :
+                     sizeof(lnet_process_id_packed_t) * test->tes_span;
+       }
+
+       rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc);
+       if (rc != 0)
+               return rc;
+
+       trq  = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst;
+
+       if (transop == LST_TRANS_TSBSRVADD) {
+               int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist;
+               int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span;
+               int nmax = (ndist + nspan - 1) / nspan;
+
+               trq->tsr_ndest = 0;
+               trq->tsr_loop  = nmax * test->tes_dist * test->tes_concur;
+
+       } else {
+               bulk = &(*crpc)->crp_rpc->crpc_bulk;
+
+               for (i = 0; i < npg; i++) {
+                       int     len;
+
+                       LASSERT(nob > 0);
+
+                       len = (feats & LST_FEAT_BULK_LEN) == 0 ?
+                             PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE);
+                       nob -= len;
+
+                       bulk->bk_iovs[i].kiov_offset = 0;
+                       bulk->bk_iovs[i].kiov_len    = len;
+                       bulk->bk_iovs[i].kiov_page   =
+                               alloc_page(GFP_IOFS);
+
+                       if (bulk->bk_iovs[i].kiov_page == NULL) {
+                               lstcon_rpc_put(*crpc);
+                               return -ENOMEM;
+                       }
+               }
+
+               bulk->bk_sink = 0;
+
+               LASSERT (transop == LST_TRANS_TSBCLIADD);
+
+               rc = lstcon_dstnodes_prep(test->tes_dst_grp,
+                                         test->tes_cliidx++,
+                                         test->tes_dist,
+                                         test->tes_span,
+                                         npg, &bulk->bk_iovs[0]);
+               if (rc != 0) {
+                       lstcon_rpc_put(*crpc);
+                       return rc;
+               }
+
+               trq->tsr_ndest = test->tes_span;
+               trq->tsr_loop  = test->tes_loop;
+       }
+
+       trq->tsr_sid    = console_session.ses_id;
+       trq->tsr_bid    = test->tes_hdr.tsb_id;
+       trq->tsr_concur     = test->tes_concur;
+       trq->tsr_is_client  = (transop == LST_TRANS_TSBCLIADD) ? 1 : 0;
+       trq->tsr_stop_onerr = !!test->tes_stop_onerr;
+
+       switch (test->tes_type) {
+       case LST_TEST_PING:
+               trq->tsr_service = SRPC_SERVICE_PING;
+               rc = lstcon_pingrpc_prep((lst_test_ping_param_t *)
+                                        &test->tes_param[0], trq);
+               break;
+
+       case LST_TEST_BULK:
+               trq->tsr_service = SRPC_SERVICE_BRW;
+               if ((feats & LST_FEAT_BULK_LEN) == 0) {
+                       rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *)
+                                                   &test->tes_param[0], trq);
+               } else {
+                       rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *)
+                                                   &test->tes_param[0], trq);
+               }
+
+               break;
+       default:
+               LBUG();
+               break;
+       }
+
+       return rc;
+}
+
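+/* Check a MAKE_SESSION reply: reject unknown feature bits, record the
+ * feature mask from the first reply and make sure later replies match it,
+ * then save the remote session timeout on the node. */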
+int
+lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans,
+                        lstcon_node_t *nd, srpc_msg_t *reply)
+{
+       srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply;
+       int                status   = mksn_rep->mksn_status;
+
+       if (status == 0 &&
+           (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               mksn_rep->mksn_status = EPROTO;
+               status = EPROTO;
+       }
+
+       if (status == EPROTO) {
+               CNETERR("session protocol error from %s: %u\n",
+                       libcfs_nid2str(nd->nd_id.nid),
+                       reply->msg_ses_feats);
+       }
+
+       if (status != 0)
+               return status;
+
+       if (!trans->tas_feats_updated) {
+               trans->tas_feats_updated = 1;
+               trans->tas_features = reply->msg_ses_feats;
+       }
+
+       if (reply->msg_ses_feats != trans->tas_features) {
+               CNETERR("Framework features %x from %s do not match "
+                       "features on this transaction: %x\n",
+                       reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid),
+                       trans->tas_features);
+               status = mksn_rep->mksn_status = EPROTO;
+       }
+
+       if (status == 0) {
+               /* session timeout on remote node */
+               nd->nd_timeout = mksn_rep->mksn_timeout;
+       }
+
+       return status;
+}
+
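+/* Fold a single RPC reply into the per-transaction statistics, keyed by
+ * the transaction opcode; the first framework error seen is recorded in
+ * trs_fwk_errno. */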
+void
+lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg,
+                     lstcon_node_t *nd, lstcon_trans_stat_t *stat)
+{
+       srpc_rmsn_reply_t  *rmsn_rep;
+       srpc_debug_reply_t *dbg_rep;
+       srpc_batch_reply_t *bat_rep;
+       srpc_test_reply_t  *test_rep;
+       srpc_stat_reply_t  *stat_rep;
+       int              rc = 0;
+
+       switch (trans->tas_opc) {
+       case LST_TRANS_SESNEW:
+               rc = lstcon_sesnew_stat_reply(trans, nd, msg);
+               if (rc == 0) {
+                       lstcon_sesop_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_sesop_stat_failure(stat, 1);
+               break;
+
+       case LST_TRANS_SESEND:
+               rmsn_rep = &msg->msg_body.rmsn_reply;
+               /* ESRCH is not an error for end session */
+               if (rmsn_rep->rmsn_status == 0 ||
+                   rmsn_rep->rmsn_status == ESRCH) {
+                       lstcon_sesop_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_sesop_stat_failure(stat, 1);
+               rc = rmsn_rep->rmsn_status;
+               break;
+
+       case LST_TRANS_SESQRY:
+       case LST_TRANS_SESPING:
+               dbg_rep = &msg->msg_body.dbg_reply;
+
+               if (dbg_rep->dbg_status == ESRCH) {
+                       lstcon_sesqry_stat_unknown(stat, 1);
+                       return;
+               }
+
+               if (lstcon_session_match(dbg_rep->dbg_sid))
+                       lstcon_sesqry_stat_active(stat, 1);
+               else
+                       lstcon_sesqry_stat_busy(stat, 1);
+               return;
+
+       case LST_TRANS_TSBRUN:
+       case LST_TRANS_TSBSTOP:
+               bat_rep = &msg->msg_body.bat_reply;
+
+               if (bat_rep->bar_status == 0) {
+                       lstcon_tsbop_stat_success(stat, 1);
+                       return;
+               }
+
+               if (bat_rep->bar_status == EPERM &&
+                   trans->tas_opc == LST_TRANS_TSBSTOP) {
+                       lstcon_tsbop_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_tsbop_stat_failure(stat, 1);
+               rc = bat_rep->bar_status;
+               break;
+
+       case LST_TRANS_TSBCLIQRY:
+       case LST_TRANS_TSBSRVQRY:
+               bat_rep = &msg->msg_body.bat_reply;
+
+               if (bat_rep->bar_active != 0)
+                       lstcon_tsbqry_stat_run(stat, 1);
+               else
+                       lstcon_tsbqry_stat_idle(stat, 1);
+
+               if (bat_rep->bar_status == 0)
+                       return;
+
+               lstcon_tsbqry_stat_failure(stat, 1);
+               rc = bat_rep->bar_status;
+               break;
+
+       case LST_TRANS_TSBCLIADD:
+       case LST_TRANS_TSBSRVADD:
+               test_rep = &msg->msg_body.tes_reply;
+
+               if (test_rep->tsr_status == 0) {
+                       lstcon_tsbop_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_tsbop_stat_failure(stat, 1);
+               rc = test_rep->tsr_status;
+               break;
+
+       case LST_TRANS_STATQRY:
+               stat_rep = &msg->msg_body.stat_reply;
+
+               if (stat_rep->str_status == 0) {
+                       lstcon_statqry_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_statqry_stat_failure(stat, 1);
+               rc = stat_rep->str_status;
+               break;
+
+       default:
+               LBUG();
+       }
+
+       if (stat->trs_fwk_errno == 0)
+               stat->trs_fwk_errno = rc;
+
+       return;
+}
+
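+/* Create a transaction over a list of nodes: for every node that passes
+ * the optional condition callback, prepare an RPC of the requested type
+ * and add it to the transaction. */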
+int
+lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+                       struct list_head *translist, int transop,
+                       void *arg, lstcon_rpc_cond_func_t condition,
+                       lstcon_rpc_trans_t **transpp)
+{
+       lstcon_rpc_trans_t *trans;
+       lstcon_ndlink_t    *ndl;
+       lstcon_node_t      *nd;
+       lstcon_rpc_t       *rpc;
+       unsigned            feats;
+       int              rc;
+
+       /* Create session RPCs for the list of nodes */
+
+       rc = lstcon_rpc_trans_prep(translist, transop, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction %d: %d\n", transop, rc);
+               return rc;
+       }
+
+       feats = trans->tas_features;
+       list_for_each_entry(ndl, ndlist, ndl_link) {
+               rc = condition == NULL ? 1 :
+                    condition(transop, ndl->ndl_node, arg);
+
+               if (rc == 0)
+                       continue;
+
+               if (rc < 0) {
+                       CDEBUG(D_NET, "Condition error while creating RPC "
+                                     "for transaction %d: %d\n", transop, rc);
+                       break;
+               }
+
+               nd = ndl->ndl_node;
+
+               switch (transop) {
+               case LST_TRANS_SESNEW:
+               case LST_TRANS_SESEND:
+                       rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc);
+                       break;
+               case LST_TRANS_SESQRY:
+               case LST_TRANS_SESPING:
+                       rc = lstcon_dbgrpc_prep(nd, feats, &rpc);
+                       break;
+               case LST_TRANS_TSBCLIADD:
+               case LST_TRANS_TSBSRVADD:
+                       rc = lstcon_testrpc_prep(nd, transop, feats,
+                                                (lstcon_test_t *)arg, &rpc);
+                       break;
+               case LST_TRANS_TSBRUN:
+               case LST_TRANS_TSBSTOP:
+               case LST_TRANS_TSBCLIQRY:
+               case LST_TRANS_TSBSRVQRY:
+                       rc = lstcon_batrpc_prep(nd, transop, feats,
+                                               (lstcon_tsb_hdr_t *)arg, &rpc);
+                       break;
+               case LST_TRANS_STATQRY:
+                       rc = lstcon_statrpc_prep(nd, feats, &rpc);
+                       break;
+               default:
+                       rc = -EINVAL;
+                       break;
+               }
+
+               if (rc != 0) {
+                       CERROR("Failed to create RPC for transaction %s: %d\n",
+                              lstcon_rpc_trans_name(transop), rc);
+                       break;
+               }
+
+               lstcon_rpc_trans_addreq(trans, rpc);
+       }
+
+       if (rc == 0) {
+               *transpp = trans;
+               return 0;
+       }
+
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
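+/* Timer callback driving the console pinger: if the console session has
+ * expired, ask every active node to end the session; otherwise send a
+ * debug ping to nodes not heard from for half their timeout. */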
+void
+lstcon_rpc_pinger(void *arg)
+{
+       stt_timer_t     *ptimer = (stt_timer_t *)arg;
+       lstcon_rpc_trans_t *trans;
+       lstcon_rpc_t       *crpc;
+       srpc_msg_t       *rep;
+       srpc_debug_reqst_t *drq;
+       lstcon_ndlink_t    *ndl;
+       lstcon_node_t      *nd;
+       time_t        intv;
+       int              count = 0;
+       int              rc;
+
+       /* The RPC pinger is a special case of transaction:
+        * it is called by a timer at an 8-second interval.
+        */
+       mutex_lock(&console_session.ses_mutex);
+
+       if (console_session.ses_shutdown || console_session.ses_expired) {
+               mutex_unlock(&console_session.ses_mutex);
+               return;
+       }
+
+       if (!console_session.ses_expired &&
+           cfs_time_current_sec() - console_session.ses_laststamp >
+           (time_t)console_session.ses_timeout)
+               console_session.ses_expired = 1;
+
+       trans = console_session.ses_ping;
+
+       LASSERT (trans != NULL);
+
+       list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) {
+               nd = ndl->ndl_node;
+
+               if (console_session.ses_expired) {
+                       /* idle console, end session on all nodes */
+                       if (nd->nd_state != LST_NODE_ACTIVE)
+                               continue;
+
+                       rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND,
+                                               trans->tas_features, &crpc);
+                       if (rc != 0) {
+                               CERROR("Out of memory\n");
+                               break;
+                       }
+
+                       lstcon_rpc_trans_addreq(trans, crpc);
+                       lstcon_rpc_post(crpc);
+
+                       continue;
+               }
+
+               crpc = &nd->nd_ping;
+
+               if (crpc->crp_rpc != NULL) {
+                       LASSERT (crpc->crp_trans == trans);
+                       LASSERT (!list_empty(&crpc->crp_link));
+
+                       spin_lock(&crpc->crp_rpc->crpc_lock);
+
+                       LASSERT(crpc->crp_posted);
+
+                       if (!crpc->crp_finished) {
+                               /* in flight */
+                               spin_unlock(&crpc->crp_rpc->crpc_lock);
+                               continue;
+                       }
+
+                       spin_unlock(&crpc->crp_rpc->crpc_lock);
+
+                       lstcon_rpc_get_reply(crpc, &rep);
+
+                       list_del_init(&crpc->crp_link);
+
+                       lstcon_rpc_put(crpc);
+               }
+
+               if (nd->nd_state != LST_NODE_ACTIVE)
+                       continue;
+
+               intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+                                                    nd->nd_stamp));
+               if (intv < (time_t)nd->nd_timeout / 2)
+                       continue;
+
+               rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG,
+                                    trans->tas_features, 0, 0, 1, crpc);
+               if (rc != 0) {
+                       CERROR("Out of memory\n");
+                       break;
+               }
+
+               drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+               drq->dbg_sid   = console_session.ses_id;
+               drq->dbg_flags = 0;
+
+               lstcon_rpc_trans_addreq(trans, crpc);
+               lstcon_rpc_post(crpc);
+
+               count++;
+       }
+
+       if (console_session.ses_expired) {
+               mutex_unlock(&console_session.ses_mutex);
+               return;
+       }
+
+       CDEBUG(D_NET, "Ping %d nodes in session\n", count);
+
+       ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+       stt_add_timer(ptimer);
+
+       mutex_unlock(&console_session.ses_mutex);
+}
+
+int
+lstcon_rpc_pinger_start(void)
+{
+       stt_timer_t    *ptimer;
+       int          rc;
+
+       LASSERT (list_empty(&console_session.ses_rpc_freelist));
+       LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+
+       rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING,
+                                  &console_session.ses_ping);
+       if (rc != 0) {
+               CERROR("Failed to create console pinger\n");
+               return rc;
+       }
+
+       ptimer = &console_session.ses_ping_timer;
+       ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+
+       stt_add_timer(ptimer);
+
+       return 0;
+}
+
+void
+lstcon_rpc_pinger_stop(void)
+{
+       LASSERT (console_session.ses_shutdown);
+
+       stt_del_timer(&console_session.ses_ping_timer);
+
+       lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN);
+       lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat());
+       lstcon_rpc_trans_destroy(console_session.ses_ping);
+
+       memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t));
+
+       console_session.ses_ping = NULL;
+}
+
+void
+lstcon_rpc_cleanup_wait(void)
+{
+       lstcon_rpc_trans_t *trans;
+       lstcon_rpc_t       *crpc;
+       struct list_head         *pacer;
+       struct list_head          zlist;
+
+       /* Called with hold of global mutex */
+
+       LASSERT (console_session.ses_shutdown);
+
+       while (!list_empty(&console_session.ses_trans_list)) {
+               list_for_each(pacer, &console_session.ses_trans_list) {
+                       trans = list_entry(pacer, lstcon_rpc_trans_t,
+                                              tas_link);
+
+                       CDEBUG(D_NET, "Session closed, wakeup transaction %s\n",
+                              lstcon_rpc_trans_name(trans->tas_opc));
+
+                       wake_up(&trans->tas_waitq);
+               }
+
+               mutex_unlock(&console_session.ses_mutex);
+
+               CWARN("Session is shutting down, "
+                     "waiting for termination of transactions\n");
+               cfs_pause(cfs_time_seconds(1));
+
+               mutex_lock(&console_session.ses_mutex);
+       }
+
+       spin_lock(&console_session.ses_rpc_lock);
+
+       lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0),
+                      console_session.ses_rpc_lock,
+                      "Network is not accessible or target is down, "
+                      "waiting for %d console RPCs to be recycled\n",
+                      atomic_read(&console_session.ses_rpc_counter));
+
+       list_add(&zlist, &console_session.ses_rpc_freelist);
+       list_del_init(&console_session.ses_rpc_freelist);
+
+       spin_unlock(&console_session.ses_rpc_lock);
+
+       while (!list_empty(&zlist)) {
+               crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link);
+
+               list_del(&crpc->crp_link);
+               LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t));
+       }
+}
+
+int
+lstcon_rpc_module_init(void)
+{
+       INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list);
+       console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger;
+       console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer;
+
+       console_session.ses_ping = NULL;
+
+       spin_lock_init(&console_session.ses_rpc_lock);
+       atomic_set(&console_session.ses_rpc_counter, 0);
+       INIT_LIST_HEAD(&console_session.ses_rpc_freelist);
+
+       return 0;
+}
+
+void
+lstcon_rpc_module_fini(void)
+{
+       LASSERT (list_empty(&console_session.ses_rpc_freelist));
+       LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.h b/drivers/staging/lustre/lnet/selftest/conrpc.h
new file mode 100644 (file)
index 0000000..9aba24a
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conrpc.h
+ *
+ * Console rpc
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+#ifndef __LST_CONRPC_H__
+#define __LST_CONRPC_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "rpc.h"
+#include "selftest.h"
+
+/* Console rpc and rpc transaction */
+#define LST_TRANS_TIMEOUT       30
+#define LST_TRANS_MIN_TIMEOUT   3
+
+#define LST_VALIDATE_TIMEOUT(t) MIN(MAX(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT)
+
+#define LST_PING_INTERVAL       8
+
+struct lstcon_rpc_trans;
+struct lstcon_tsb_hdr;
+struct lstcon_test;
+struct lstcon_node;
+
+typedef struct lstcon_rpc {
+       struct list_head               crp_link;       /* chain on rpc transaction */
+       srpc_client_rpc_t       *crp_rpc;       /* client rpc */
+       struct lstcon_node      *crp_node;       /* destination node */
+       struct lstcon_rpc_trans *crp_trans;     /* conrpc transaction */
+
+       unsigned int             crp_posted:1;   /* rpc is posted */
+       unsigned int             crp_finished:1; /* rpc is finished */
+       unsigned int             crp_unpacked:1; /* reply is unpacked */
+       /** RPC is embedded in another structure and can't be freed */
+       unsigned int             crp_embedded:1;
+       int                   crp_status;     /* console rpc errors */
+       cfs_time_t             crp_stamp;      /* replied time stamp */
+} lstcon_rpc_t;
+
+typedef struct lstcon_rpc_trans {
+       struct list_head            tas_olink;     /* link chain on owner list */
+       struct list_head            tas_link;      /* link chain on global list */
+       int                tas_opc;       /* operation code of transaction */
+       /* features mask is uptodate */
+       unsigned              tas_feats_updated;
+       /* test features mask */
+       unsigned              tas_features;
+       wait_queue_head_t          tas_waitq;     /* wait queue head */
+       atomic_t          tas_remaining; /* # of un-scheduled rpcs */
+       struct list_head            tas_rpcs_list; /* queued requests */
+} lstcon_rpc_trans_t;
+
+#define LST_TRANS_PRIVATE       0x1000
+
+#define LST_TRANS_SESNEW       (LST_TRANS_PRIVATE | 0x01)
+#define LST_TRANS_SESEND       (LST_TRANS_PRIVATE | 0x02)
+#define LST_TRANS_SESQRY       0x03
+#define LST_TRANS_SESPING       0x04
+
+#define LST_TRANS_TSBCLIADD     (LST_TRANS_PRIVATE | 0x11)
+#define LST_TRANS_TSBSRVADD     (LST_TRANS_PRIVATE | 0x12)
+#define LST_TRANS_TSBRUN       (LST_TRANS_PRIVATE | 0x13)
+#define LST_TRANS_TSBSTOP       (LST_TRANS_PRIVATE | 0x14)
+#define LST_TRANS_TSBCLIQRY     0x15
+#define LST_TRANS_TSBSRVQRY     0x16
+
+#define LST_TRANS_STATQRY       0x21
+
+typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *);
+typedef int (*lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *);
+
+int  lstcon_sesrpc_prep(struct lstcon_node *nd, int transop,
+                       unsigned version, lstcon_rpc_t **crpc);
+int  lstcon_dbgrpc_prep(struct lstcon_node *nd,
+                       unsigned version, lstcon_rpc_t **crpc);
+int  lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+                       struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc);
+int  lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+                        struct lstcon_test *test, lstcon_rpc_t **crpc);
+int  lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version,
+                        lstcon_rpc_t **crpc);
+void lstcon_rpc_put(lstcon_rpc_t *crpc);
+int  lstcon_rpc_trans_prep(struct list_head *translist,
+                          int transop, lstcon_rpc_trans_t **transpp);
+int  lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+                            struct list_head *translist, int transop,
+                            void *arg, lstcon_rpc_cond_func_t condition,
+                            lstcon_rpc_trans_t **transpp);
+void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans,
+                          lstcon_trans_stat_t *stat);
+int  lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+                                 struct list_head *head_up,
+                                 lstcon_rpc_readent_func_t readent);
+void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error);
+void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans);
+void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req);
+int  lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout);
+int  lstcon_rpc_pinger_start(void);
+void lstcon_rpc_pinger_stop(void);
+void lstcon_rpc_cleanup_wait(void);
+int  lstcon_rpc_module_init(void);
+void lstcon_rpc_module_fini(void);
+
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c
new file mode 100644 (file)
index 0000000..78e8d04
--- /dev/null
@@ -0,0 +1,2071 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.c
+ *
+ * Infrastructure of LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "console.h"
+#include "conrpc.h"
+
+#define LST_NODE_STATE_COUNTER(nd, p)                    \
+do {                                                     \
+       if ((nd)->nd_state == LST_NODE_ACTIVE)            \
+               (p)->nle_nactive++;                       \
+       else if ((nd)->nd_state == LST_NODE_BUSY)         \
+               (p)->nle_nbusy++;                         \
+       else if ((nd)->nd_state == LST_NODE_DOWN)         \
+               (p)->nle_ndown++;                         \
+       else                                              \
+               (p)->nle_nunknown++;                      \
+       (p)->nle_nnode++;                                 \
+} while (0)
+
+lstcon_session_t       console_session;
+
+void
+lstcon_node_get(lstcon_node_t *nd)
+{
+       LASSERT (nd->nd_ref >= 1);
+
+       nd->nd_ref++;
+}
+
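+/* Look up a node by LNet process id in the session hash; if "create" is
+ * set and the node doesn't exist, allocate it together with an embedded
+ * ndlink and queue it on the session hash and list. */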
+static int
+lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create)
+{
+       lstcon_ndlink_t *ndl;
+       unsigned int     idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE;
+
+       LASSERT (id.nid != LNET_NID_ANY);
+
+       list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) {
+               if (ndl->ndl_node->nd_id.nid != id.nid ||
+                   ndl->ndl_node->nd_id.pid != id.pid)
+                       continue;
+
+               lstcon_node_get(ndl->ndl_node);
+               *ndpp = ndl->ndl_node;
+               return 0;
+       }
+
+       if (!create)
+               return -ENOENT;
+
+       LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+       if (*ndpp == NULL)
+               return -ENOMEM;
+
+       ndl = (lstcon_ndlink_t *)(*ndpp + 1);
+
+       ndl->ndl_node = *ndpp;
+
+       ndl->ndl_node->nd_ref   = 1;
+       ndl->ndl_node->nd_id    = id;
+       ndl->ndl_node->nd_stamp = cfs_time_current();
+       ndl->ndl_node->nd_state = LST_NODE_UNKNOWN;
+       ndl->ndl_node->nd_timeout = 0;
+       memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t));
+
+       /* queued in the global hash & list; no refcount is taken by the
+        * global hash & list, so when the caller releases its refcount
+        * the node will be freed */
+       list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]);
+       list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list);
+
+       return 0;
+}
+
+void
+lstcon_node_put(lstcon_node_t *nd)
+{
+       lstcon_ndlink_t  *ndl;
+
+       LASSERT (nd->nd_ref > 0);
+
+       if (--nd->nd_ref > 0)
+               return;
+
+       ndl = (lstcon_ndlink_t *)(nd + 1);
+
+       LASSERT (!list_empty(&ndl->ndl_link));
+       LASSERT (!list_empty(&ndl->ndl_hlink));
+
+       /* remove from session */
+       list_del(&ndl->ndl_link);
+       list_del(&ndl->ndl_hlink);
+
+       LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+}
+
+static int
+lstcon_ndlink_find(struct list_head *hash,
+                  lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create)
+{
+       unsigned int     idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+       lstcon_ndlink_t *ndl;
+       lstcon_node_t   *nd;
+       int           rc;
+
+       if (id.nid == LNET_NID_ANY)
+               return -EINVAL;
+
+       /* search in hash */
+       list_for_each_entry(ndl, &hash[idx], ndl_hlink) {
+               if (ndl->ndl_node->nd_id.nid != id.nid ||
+                   ndl->ndl_node->nd_id.pid != id.pid)
+                       continue;
+
+               *ndlpp = ndl;
+               return 0;
+       }
+
+       if (create == 0)
+               return -ENOENT;
+
+       /* find or create in session hash */
+       rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0);
+       if (rc != 0)
+               return rc;
+
+       LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t));
+       if (ndl == NULL) {
+               lstcon_node_put(nd);
+               return -ENOMEM;
+       }
+
+       *ndlpp = ndl;
+
+       ndl->ndl_node = nd;
+       INIT_LIST_HEAD(&ndl->ndl_link);
+       list_add_tail(&ndl->ndl_hlink, &hash[idx]);
+
+       return  0;
+}
+
+static void
+lstcon_ndlink_release(lstcon_ndlink_t *ndl)
+{
+       LASSERT (list_empty(&ndl->ndl_link));
+       LASSERT (!list_empty(&ndl->ndl_hlink));
+
+       list_del(&ndl->ndl_hlink); /* delete from hash */
+       lstcon_node_put(ndl->ndl_node);
+
+       LIBCFS_FREE(ndl, sizeof(*ndl));
+}
+
+static int
+lstcon_group_alloc(char *name, lstcon_group_t **grpp)
+{
+       lstcon_group_t *grp;
+       int          i;
+
+       LIBCFS_ALLOC(grp, offsetof(lstcon_group_t,
+                                  grp_ndl_hash[LST_NODE_HASHSIZE]));
+       if (grp == NULL)
+               return -ENOMEM;
+
+       memset(grp, 0, offsetof(lstcon_group_t,
+                               grp_ndl_hash[LST_NODE_HASHSIZE]));
+
+       grp->grp_ref = 1;
+       if (name != NULL)
+               strcpy(grp->grp_name, name);
+
+       INIT_LIST_HEAD(&grp->grp_link);
+       INIT_LIST_HEAD(&grp->grp_ndl_list);
+       INIT_LIST_HEAD(&grp->grp_trans_list);
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++)
+               INIT_LIST_HEAD(&grp->grp_ndl_hash[i]);
+
+       *grpp = grp;
+
+       return 0;
+}
+
+static void
+lstcon_group_addref(lstcon_group_t *grp)
+{
+       grp->grp_ref++;
+}
+
+static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *);
+
+static void
+lstcon_group_drain(lstcon_group_t *grp, int keep)
+{
+       lstcon_ndlink_t *ndl;
+       lstcon_ndlink_t *tmp;
+
+       list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) {
+               if ((ndl->ndl_node->nd_state & keep) == 0)
+                       lstcon_group_ndlink_release(grp, ndl);
+       }
+}
+
+static void
+lstcon_group_decref(lstcon_group_t *grp)
+{
+       int     i;
+
+       if (--grp->grp_ref > 0)
+               return;
+
+       if (!list_empty(&grp->grp_link))
+               list_del(&grp->grp_link);
+
+       lstcon_group_drain(grp, 0);
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+               LASSERT (list_empty(&grp->grp_ndl_hash[i]));
+       }
+
+       LIBCFS_FREE(grp, offsetof(lstcon_group_t,
+                                 grp_ndl_hash[LST_NODE_HASHSIZE]));
+}
+
+static int
+lstcon_group_find(char *name, lstcon_group_t **grpp)
+{
+       lstcon_group_t   *grp;
+
+       list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+               if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0)
+                       continue;
+
+               lstcon_group_addref(grp);  /* +1 ref for caller */
+               *grpp = grp;
+               return 0;
+       }
+
+       return -ENOENT;
+}
+
+static void
+lstcon_group_put(lstcon_group_t *grp)
+{
+       lstcon_group_decref(grp);
+}
+
+static int
+lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id,
+                        lstcon_ndlink_t **ndlpp, int create)
+{
+       int     rc;
+
+       rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create);
+       if (rc != 0)
+               return rc;
+
+       if (!list_empty(&(*ndlpp)->ndl_link))
+               return 0;
+
+       list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list);
+       grp->grp_nnode++;
+
+       return 0;
+}
+
+static void
+lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl)
+{
+       list_del_init(&ndl->ndl_link);
+       lstcon_ndlink_release(ndl);
+       grp->grp_nnode--;
+}
+
+static void
+lstcon_group_ndlink_move(lstcon_group_t *old,
+                        lstcon_group_t *new, lstcon_ndlink_t *ndl)
+{
+       unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) %
+                          LST_NODE_HASHSIZE;
+
+       list_del(&ndl->ndl_hlink);
+       list_del(&ndl->ndl_link);
+       old->grp_nnode--;
+
+       list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]);
+       list_add_tail(&ndl->ndl_link, &new->grp_ndl_list);
+       new->grp_nnode++;
+
+       return;
+}
+
+static void
+lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new)
+{
+       lstcon_ndlink_t *ndl;
+
+       while (!list_empty(&old->grp_ndl_list)) {
+               ndl = list_entry(old->grp_ndl_list.next,
+                                    lstcon_ndlink_t, ndl_link);
+               lstcon_group_ndlink_move(old, new, ndl);
+       }
+}
+
+int
+lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+       lstcon_group_t *grp = (lstcon_group_t *)arg;
+
+       switch (transop) {
+       case LST_TRANS_SESNEW:
+               if (nd->nd_state == LST_NODE_ACTIVE)
+                       return 0;
+               break;
+
+       case LST_TRANS_SESEND:
+               if (nd->nd_state != LST_NODE_ACTIVE)
+                       return 0;
+
+               if (grp != NULL && nd->nd_ref > 1)
+                       return 0;
+               break;
+
+       case LST_TRANS_SESQRY:
+               break;
+
+       default:
+               LBUG();
+       }
+
+       return 1;
+}
+
+int
+lstcon_sesrpc_readent(int transop, srpc_msg_t *msg,
+                     lstcon_rpc_ent_t *ent_up)
+{
+       srpc_debug_reply_t *rep;
+
+       switch (transop) {
+       case LST_TRANS_SESNEW:
+       case LST_TRANS_SESEND:
+               return 0;
+
+       case LST_TRANS_SESQRY:
+               rep = &msg->msg_body.dbg_reply;
+
+               if (copy_to_user(&ent_up->rpe_priv[0],
+                                    &rep->dbg_timeout, sizeof(int)) ||
+                   copy_to_user(&ent_up->rpe_payload[0],
+                                    &rep->dbg_name, LST_NAME_SIZE))
+                       return -EFAULT;
+
+               return 0;
+
+       default:
+               LBUG();
+       }
+
+       return 0;
+}
+
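+/* Add nodes to a group: new nodes are first collected in a temporary
+ * group, a SESNEW transaction is run against them, and the temporary
+ * group is then merged into the target group. */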
+static int
+lstcon_group_nodes_add(lstcon_group_t *grp,
+                      int count, lnet_process_id_t *ids_up,
+                      unsigned *featp, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t      *trans;
+       lstcon_ndlink_t  *ndl;
+       lstcon_group_t    *tmp;
+       lnet_process_id_t       id;
+       int                   i;
+       int                   rc;
+
+       rc = lstcon_group_alloc(NULL, &tmp);
+       if (rc != 0) {
+               CERROR("Out of memory\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < count; i++) {
+               if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+                       rc = -EFAULT;
+                       break;
+               }
+
+               /* skip if it's in this group already */
+               rc = lstcon_group_ndlink_find(grp, id, &ndl, 0);
+               if (rc == 0)
+                       continue;
+
+               /* add to tmp group */
+               rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1);
+               if (rc != 0) {
+                       CERROR("Can't create ndlink, out of memory\n");
+                       break;
+               }
+       }
+
+       if (rc != 0) {
+               lstcon_group_put(tmp);
+               return rc;
+       }
+
+       rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+                                    &tmp->grp_trans_list, LST_TRANS_SESNEW,
+                                    tmp, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               lstcon_group_put(tmp);
+               return rc;
+       }
+
+       /* post all RPCs */
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up,
+                                         lstcon_sesrpc_readent);
+       *featp = trans->tas_features;
+
+       /* destroy all RPCs */
+       lstcon_rpc_trans_destroy(trans);
+
+       lstcon_group_move(tmp, grp);
+       lstcon_group_put(tmp);
+
+       return rc;
+}
+
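+/* Remove nodes from a group: the nodes are moved to a temporary group,
+ * a SESEND transaction is run against them, and the temporary group is
+ * then released regardless of the RPC results. */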
+static int
+lstcon_group_nodes_remove(lstcon_group_t *grp,
+                         int count, lnet_process_id_t *ids_up,
+                         struct list_head *result_up)
+{
+       lstcon_rpc_trans_t     *trans;
+       lstcon_ndlink_t *ndl;
+       lstcon_group_t   *tmp;
+       lnet_process_id_t       id;
+       int                  rc;
+       int                  i;
+
+       /* End session and remove node from the group */
+
+       rc = lstcon_group_alloc(NULL, &tmp);
+       if (rc != 0) {
+               CERROR("Out of memory\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < count; i++) {
+               if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+                       rc = -EFAULT;
+                       goto error;
+               }
+
+               /* move node to tmp group */
+               if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0)
+                       lstcon_group_ndlink_move(grp, tmp, ndl);
+       }
+
+       rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+                                    &tmp->grp_trans_list, LST_TRANS_SESEND,
+                                    tmp, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               goto error;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+       lstcon_rpc_trans_destroy(trans);
+       /* release nodes anyway, because we can't roll back status */
+       lstcon_group_put(tmp);
+
+       return rc;
+error:
+       lstcon_group_move(tmp, grp);
+       lstcon_group_put(tmp);
+
+       return rc;
+}
+
+int
+lstcon_group_add(char *name)
+{
+       lstcon_group_t *grp;
+       int          rc;
+
+       rc = (lstcon_group_find(name, &grp) == 0) ? -EEXIST : 0;
+       if (rc != 0) {
+               /* found a group with the same name */
+               lstcon_group_put(grp);
+               return rc;
+       }
+
+       rc = lstcon_group_alloc(name, &grp);
+       if (rc != 0) {
+               CERROR("Can't allocate descriptor for group %s\n", name);
+               return -ENOMEM;
+       }
+
+       list_add_tail(&grp->grp_link, &console_session.ses_grp_list);
+
+       return rc;
+}
+
+int
+lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up,
+                unsigned *featp, struct list_head *result_up)
+{
+       lstcon_group_t   *grp;
+       int                  rc;
+
+       LASSERT (count > 0);
+       LASSERT (ids_up != NULL);
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by other threads or test */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+
+               return -EBUSY;
+       }
+
+       rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up);
+
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_group_del(char *name)
+{
+       lstcon_rpc_trans_t *trans;
+       lstcon_group_t     *grp;
+       int              rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group: %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by other threads or tests */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+               return -EBUSY;
+       }
+
+       rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+                                    &grp->grp_trans_list, LST_TRANS_SESEND,
+                                    grp, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               lstcon_group_put(grp);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       lstcon_rpc_trans_destroy(trans);
+
+       lstcon_group_put(grp);
+       /* -ref for session; the session is destroyed and its status can't
+        * be rolled back, so destroy the group anyway */
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_group_clean(char *name, int args)
+{
+       lstcon_group_t *grp = NULL;
+       int          rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by test */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+               return -EBUSY;
+       }
+
+       args = (LST_NODE_ACTIVE | LST_NODE_BUSY |
+               LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args;
+
+       lstcon_group_drain(grp, args);
+
+       lstcon_group_put(grp);
+       /* release empty group */
+       if (list_empty(&grp->grp_ndl_list))
+               lstcon_group_put(grp);
+
+       return 0;
+}
+
+int
+lstcon_nodes_remove(char *name, int count,
+                   lnet_process_id_t *ids_up, struct list_head *result_up)
+{
+       lstcon_group_t *grp = NULL;
+       int          rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group: %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by test */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+               return -EBUSY;
+       }
+
+       rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up);
+
+       lstcon_group_put(grp);
+       /* release empty group */
+       if (list_empty(&grp->grp_ndl_list))
+               lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_group_refresh(char *name, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t      *trans;
+       lstcon_group_t    *grp;
+       int                   rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group: %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by test */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+               return -EBUSY;
+       }
+
+       /* re-invite all inactive nodes in the group */
+       rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+                                    &grp->grp_trans_list, LST_TRANS_SESNEW,
+                                    grp, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               /* local error, return */
+               CDEBUG(D_NET, "Can't create transaction: %d\n", rc);
+               lstcon_group_put(grp);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+       lstcon_rpc_trans_destroy(trans);
+       /* -ref for me */
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_group_list(int index, int len, char *name_up)
+{
+       lstcon_group_t *grp;
+
+       LASSERT (index >= 0);
+       LASSERT (name_up != NULL);
+
+       list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+               if (index-- == 0) {
+                       return copy_to_user(name_up, grp->grp_name, len) ?
+                              -EFAULT : 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
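+/* Copy up to *count_p node entries, starting at *index_p, from a node
+ * list to the userspace array dents_up; *count_p and *index_p are
+ * updated to reflect what was actually copied. */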
+static int
+lstcon_nodes_getent(struct list_head *head, int *index_p,
+                   int *count_p, lstcon_node_ent_t *dents_up)
+{
+       lstcon_ndlink_t  *ndl;
+       lstcon_node_t    *nd;
+       int            count = 0;
+       int            index = 0;
+
+       LASSERT (index_p != NULL && count_p != NULL);
+       LASSERT (dents_up != NULL);
+       LASSERT (*index_p >= 0);
+       LASSERT (*count_p > 0);
+
+       list_for_each_entry(ndl, head, ndl_link) {
+               if (index++ < *index_p)
+                       continue;
+
+               if (count >= *count_p)
+                       break;
+
+               nd = ndl->ndl_node;
+               if (copy_to_user(&dents_up[count].nde_id,
+                                    &nd->nd_id, sizeof(nd->nd_id)) ||
+                   copy_to_user(&dents_up[count].nde_state,
+                                    &nd->nd_state, sizeof(nd->nd_state)))
+                       return -EFAULT;
+
+               count++;
+       }
+
+       if (index <= *index_p)
+               return -ENOENT;
+
+       *count_p = count;
+       *index_p = index;
+
+       return 0;
+}
+
+int
+lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p,
+                 int *index_p, int *count_p, lstcon_node_ent_t *dents_up)
+{
+       lstcon_ndlist_ent_t *gentp;
+       lstcon_group_t      *grp;
+       lstcon_ndlink_t     *ndl;
+       int               rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", name);
+               return rc;
+       }
+
+       if (dents_up != NULL) {
+               /* verbose query */
+               rc = lstcon_nodes_getent(&grp->grp_ndl_list,
+                                        index_p, count_p, dents_up);
+               lstcon_group_put(grp);
+
+               return rc;
+       }
+
+       /* non-verbose query */
+       LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t));
+       if (gentp == NULL) {
+               CERROR("Can't allocate ndlist_ent\n");
+               lstcon_group_put(grp);
+
+               return -ENOMEM;
+       }
+
+       memset(gentp, 0, sizeof(lstcon_ndlist_ent_t));
+
+       list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link)
+               LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp);
+
+       rc = copy_to_user(gents_p, gentp,
+                             sizeof(lstcon_ndlist_ent_t)) ? -EFAULT: 0;
+
+       LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t));
+
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_batch_find(char *name, lstcon_batch_t **batpp)
+{
+       lstcon_batch_t   *bat;
+
+       list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+               if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) {
+                       *batpp = bat;
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
+int
+lstcon_batch_add(char *name)
+{
+       lstcon_batch_t   *bat;
+       int            i;
+       int            rc;
+
+       rc = (lstcon_batch_find(name, &bat) == 0) ? -EEXIST : 0;
+       if (rc != 0) {
+               CDEBUG(D_NET, "Batch %s already exists\n", name);
+               return rc;
+       }
+
+       LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t));
+       if (bat == NULL) {
+               CERROR("Can't allocate descriptor for batch %s\n", name);
+               return -ENOMEM;
+       }
+
+       LIBCFS_ALLOC(bat->bat_cli_hash,
+                    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+       if (bat->bat_cli_hash == NULL) {
+               CERROR("Can't allocate hash for batch %s\n", name);
+               LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+               return -ENOMEM;
+       }
+
+       LIBCFS_ALLOC(bat->bat_srv_hash,
+                    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+       if (bat->bat_srv_hash == NULL) {
+               CERROR("Can't allocate hash for batch %s\n", name);
+               LIBCFS_FREE(bat->bat_cli_hash,
+                           sizeof(struct list_head) * LST_NODE_HASHSIZE);
+               LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+               return -ENOMEM;
+       }
+
+       strcpy(bat->bat_name, name);
+       bat->bat_hdr.tsb_index = 0;
+       bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie;
+
+       bat->bat_ntest = 0;
+       bat->bat_state = LST_BATCH_IDLE;
+
+       INIT_LIST_HEAD(&bat->bat_cli_list);
+       INIT_LIST_HEAD(&bat->bat_srv_list);
+       INIT_LIST_HEAD(&bat->bat_test_list);
+       INIT_LIST_HEAD(&bat->bat_trans_list);
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+               INIT_LIST_HEAD(&bat->bat_cli_hash[i]);
+               INIT_LIST_HEAD(&bat->bat_srv_hash[i]);
+       }
+
+       list_add_tail(&bat->bat_link, &console_session.ses_bat_list);
+
+       return rc;
+}
+
+int
+lstcon_batch_list(int index, int len, char *name_up)
+{
+       lstcon_batch_t    *bat;
+
+       LASSERT (name_up != NULL);
+       LASSERT (index >= 0);
+
+       list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+               if (index-- == 0) {
+                       return copy_to_user(name_up, bat->bat_name, len) ?
+                              -EFAULT : 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
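+/*
+ * Return information about a batch (or about one of its tests if
+ * testidx > 0): either a summary of node states (non-verbose) or the
+ * individual node entries copied to dents_up (verbose).
+ */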
+int
+lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server,
+                 int testidx, int *index_p, int *ndent_p,
+                 lstcon_node_ent_t *dents_up)
+{
+       lstcon_test_batch_ent_t *entp;
+       struct list_head              *clilst;
+       struct list_head              *srvlst;
+       lstcon_test_t      *test = NULL;
+       lstcon_batch_t    *bat;
+       lstcon_ndlink_t  *ndl;
+       int                   rc;
+
+       rc = lstcon_batch_find(name, &bat);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find batch %s\n", name);
+               return -ENOENT;
+       }
+
+       if (testidx > 0) {
+               /* query a test; test index starts from 1 */
+               list_for_each_entry(test, &bat->bat_test_list, tes_link) {
+                       if (testidx-- == 1)
+                               break;
+               }
+
+               if (testidx > 0) {
+                       CDEBUG(D_NET, "Can't find specified test in batch\n");
+                       return -ENOENT;
+               }
+       }
+
+       clilst = (test == NULL) ? &bat->bat_cli_list :
+                                 &test->tes_src_grp->grp_ndl_list;
+       srvlst = (test == NULL) ? &bat->bat_srv_list :
+                                 &test->tes_dst_grp->grp_ndl_list;
+
+       if (dents_up != NULL) {
+               rc = lstcon_nodes_getent((server ? srvlst : clilst),
+                                        index_p, ndent_p, dents_up);
+               return rc;
+       }
+
+       /* non-verbose query */
+       LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t));
+       if (entp == NULL)
+               return -ENOMEM;
+
+       memset(entp, 0, sizeof(lstcon_test_batch_ent_t));
+
+       if (test == NULL) {
+               entp->u.tbe_batch.bae_ntest = bat->bat_ntest;
+               entp->u.tbe_batch.bae_state = bat->bat_state;
+
+       } else {
+
+               entp->u.tbe_test.tse_type   = test->tes_type;
+               entp->u.tbe_test.tse_loop   = test->tes_loop;
+               entp->u.tbe_test.tse_concur = test->tes_concur;
+       }
+
+       list_for_each_entry(ndl, clilst, ndl_link)
+               LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle);
+
+       list_for_each_entry(ndl, srvlst, ndl_link)
+               LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle);
+
+       rc = copy_to_user(ent_up, entp,
+                             sizeof(lstcon_test_batch_ent_t)) ? -EFAULT : 0;
+
+       LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t));
+
+       return rc;
+}
+
+int
+lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+       switch (transop) {
+       case LST_TRANS_TSBRUN:
+               if (nd->nd_state != LST_NODE_ACTIVE)
+                       return -ENETDOWN;
+               break;
+
+       case LST_TRANS_TSBSTOP:
+               if (nd->nd_state != LST_NODE_ACTIVE)
+                       return 0;
+               break;
+
+       case LST_TRANS_TSBCLIQRY:
+       case LST_TRANS_TSBSRVQRY:
+               break;
+       }
+
+       return 1;
+}
+
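+/*
+ * Run a transaction (e.g. run or stop) over all client nodes of the batch,
+ * wait for it to complete and copy the per-node results back to userspace.
+ */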
+static int
+lstcon_batch_op(lstcon_batch_t *bat, int transop,
+               struct list_head *result_up)
+{
+       lstcon_rpc_trans_t *trans;
+       int              rc;
+
+       rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list,
+                                    &bat->bat_trans_list, transop,
+                                    bat, lstcon_batrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+int
+lstcon_batch_run(char *name, int timeout, struct list_head *result_up)
+{
+       lstcon_batch_t *bat;
+       int          rc;
+
+       if (lstcon_batch_find(name, &bat) != 0) {
+               CDEBUG(D_NET, "Can't find batch %s\n", name);
+               return -ENOENT;
+       }
+
+       bat->bat_arg = timeout;
+
+       rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up);
+
+       /* mark batch as running if it has started on any node */
+       if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0)
+               bat->bat_state = LST_BATCH_RUNNING;
+
+       return rc;
+}
+
+int
+lstcon_batch_stop(char *name, int force, struct list_head *result_up)
+{
+       lstcon_batch_t *bat;
+       int          rc;
+
+       if (lstcon_batch_find(name, &bat) != 0) {
+               CDEBUG(D_NET, "Can't find batch %s\n", name);
+               return -ENOENT;
+       }
+
+       bat->bat_arg = force;
+
+       rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up);
+
+       /* mark batch as stopped if all RPCs finished */
+       if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0)
+               bat->bat_state = LST_BATCH_IDLE;
+
+       return rc;
+}
+
+static void
+lstcon_batch_destroy(lstcon_batch_t *bat)
+{
+       lstcon_ndlink_t    *ndl;
+       lstcon_test_t      *test;
+       int              i;
+
+       list_del(&bat->bat_link);
+
+       while (!list_empty(&bat->bat_test_list)) {
+               test = list_entry(bat->bat_test_list.next,
+                                     lstcon_test_t, tes_link);
+               LASSERT (list_empty(&test->tes_trans_list));
+
+               list_del(&test->tes_link);
+
+               lstcon_group_put(test->tes_src_grp);
+               lstcon_group_put(test->tes_dst_grp);
+
+               LIBCFS_FREE(test, offsetof(lstcon_test_t,
+                                          tes_param[test->tes_paramlen]));
+       }
+
+       LASSERT (list_empty(&bat->bat_trans_list));
+
+       while (!list_empty(&bat->bat_cli_list)) {
+               ndl = list_entry(bat->bat_cli_list.next,
+                                    lstcon_ndlink_t, ndl_link);
+               list_del_init(&ndl->ndl_link);
+
+               lstcon_ndlink_release(ndl);
+       }
+
+       while (!list_empty(&bat->bat_srv_list)) {
+               ndl = list_entry(bat->bat_srv_list.next,
+                                    lstcon_ndlink_t, ndl_link);
+               list_del_init(&ndl->ndl_link);
+
+               lstcon_ndlink_release(ndl);
+       }
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+               LASSERT (list_empty(&bat->bat_cli_hash[i]));
+               LASSERT (list_empty(&bat->bat_srv_hash[i]));
+       }
+
+       LIBCFS_FREE(bat->bat_cli_hash,
+                   sizeof(struct list_head) * LST_NODE_HASHSIZE);
+       LIBCFS_FREE(bat->bat_srv_hash,
+                   sizeof(struct list_head) * LST_NODE_HASHSIZE);
+       LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+}
+
+int
+lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+       lstcon_test_t    *test;
+       lstcon_batch_t   *batch;
+       lstcon_ndlink_t  *ndl;
+       struct list_head       *hash;
+       struct list_head       *head;
+
+       test = (lstcon_test_t *)arg;
+       LASSERT (test != NULL);
+
+       batch = test->tes_batch;
+       LASSERT (batch != NULL);
+
+       if (test->tes_oneside &&
+           transop == LST_TRANS_TSBSRVADD)
+               return 0;
+
+       if (nd->nd_state != LST_NODE_ACTIVE)
+               return -ENETDOWN;
+
+       if (transop == LST_TRANS_TSBCLIADD) {
+               hash = batch->bat_cli_hash;
+               head = &batch->bat_cli_list;
+
+       } else {
+               LASSERT (transop == LST_TRANS_TSBSRVADD);
+
+               hash = batch->bat_srv_hash;
+               head = &batch->bat_srv_list;
+       }
+
+       LASSERT (nd->nd_id.nid != LNET_NID_ANY);
+
+       if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0)
+               return -ENOMEM;
+
+       if (list_empty(&ndl->ndl_link))
+               list_add_tail(&ndl->ndl_link, head);
+
+       return 1;
+}
+
+static int
+lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t     *trans;
+       lstcon_group_t   *grp;
+       int                  transop;
+       int                  rc;
+
+       LASSERT (test->tes_src_grp != NULL);
+       LASSERT (test->tes_dst_grp != NULL);
+
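+       /* add the server (destination) group nodes first; once that
+        * transaction has completed, loop back to "again" for the client
+        * (source) group */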
+       transop = LST_TRANS_TSBSRVADD;
+       grp  = test->tes_dst_grp;
+again:
+       rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+                                    &test->tes_trans_list, transop,
+                                    test, lstcon_testrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+           lstcon_trans_stat()->trs_fwk_errno != 0) {
+               lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+               lstcon_rpc_trans_destroy(trans);
+               /* return if any error */
+               CDEBUG(D_NET, "Failed to add test %s, "
+                             "RPC error %d, framework error %d\n",
+                      transop == LST_TRANS_TSBCLIADD ? "client" : "server",
+                      lstcon_trans_stat()->trs_rpc_errno,
+                      lstcon_trans_stat()->trs_fwk_errno);
+
+               return rc;
+       }
+
+       lstcon_rpc_trans_destroy(trans);
+
+       if (transop == LST_TRANS_TSBCLIADD)
+               return rc;
+
+       transop = LST_TRANS_TSBCLIADD;
+       grp = test->tes_src_grp;
+       test->tes_cliidx = 0;
+
+       /* requests to test clients */
+       goto again;
+}
+
+int
+lstcon_test_add(char *name, int type, int loop, int concur,
+               int dist, int span, char *src_name, char *dst_name,
+               void *param, int paramlen, int *retp,
+               struct list_head *result_up)
+{
+       lstcon_group_t  *src_grp = NULL;
+       lstcon_group_t  *dst_grp = NULL;
+       lstcon_test_t   *test    = NULL;
+       lstcon_batch_t  *batch;
+       int           rc;
+
+       rc = lstcon_batch_find(name, &batch);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find batch %s\n", name);
+               return rc;
+       }
+
+       if (batch->bat_state != LST_BATCH_IDLE) {
+               CDEBUG(D_NET, "Can't change running batch %s\n", name);
+               return -EINVAL;
+       }
+
+       rc = lstcon_group_find(src_name, &src_grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", src_name);
+               goto out;
+       }
+
+       rc = lstcon_group_find(dst_name, &dst_grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", dst_name);
+               goto out;
+       }
+
+       if (dst_grp->grp_userland)
+               *retp = 1;
+
+       LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+       if (!test) {
+               CERROR("Can't allocate test descriptor\n");
+               rc = -ENOMEM;
+
+               goto out;
+       }
+
+       memset(test, 0, offsetof(lstcon_test_t, tes_param[paramlen]));
+       test->tes_hdr.tsb_id    = batch->bat_hdr.tsb_id;
+       test->tes_batch  = batch;
+       test->tes_type    = type;
+       test->tes_oneside       = 0; /* TODO */
+       test->tes_loop    = loop;
+       test->tes_concur        = concur;
+       test->tes_stop_onerr    = 1; /* TODO */
+       test->tes_span    = span;
+       test->tes_dist    = dist;
+       test->tes_cliidx        = 0; /* just used for creating RPC */
+       test->tes_src_grp       = src_grp;
+       test->tes_dst_grp       = dst_grp;
+       INIT_LIST_HEAD(&test->tes_trans_list);
+
+       if (param != NULL) {
+               test->tes_paramlen = paramlen;
+               memcpy(&test->tes_param[0], param, paramlen);
+       }
+
+       rc = lstcon_test_nodes_add(test, result_up);
+
+       if (rc != 0)
+               goto out;
+
+       if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+           lstcon_trans_stat()->trs_fwk_errno != 0)
+               CDEBUG(D_NET, "Failed to add test %d to batch %s\n",
+                      type, name);
+
+       /* add to test list anyway, so user can check what's going on */
+       list_add_tail(&test->tes_link, &batch->bat_test_list);
+
+       batch->bat_ntest++;
+       test->tes_hdr.tsb_index = batch->bat_ntest;
+
+       /*  hold groups so nobody can change them */
+       return rc;
+out:
+       if (test != NULL)
+               LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+
+       if (dst_grp != NULL)
+               lstcon_group_put(dst_grp);
+
+       if (src_grp != NULL)
+               lstcon_group_put(src_grp);
+
+       return rc;
+}
+
+int
+lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp)
+{
+       lstcon_test_t *test;
+
+       list_for_each_entry(test, &batch->bat_test_list, tes_link) {
+               if (idx == test->tes_hdr.tsb_index) {
+                       *testpp = test;
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
+int
+lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg,
+                     lstcon_rpc_ent_t *ent_up)
+{
+       srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+       LASSERT (transop == LST_TRANS_TSBCLIQRY ||
+                transop == LST_TRANS_TSBSRVQRY);
+
+       /* positive errno, framework error code */
+       if (copy_to_user(&ent_up->rpe_priv[0],
+                            &rep->bar_active, sizeof(rep->bar_active)))
+               return -EFAULT;
+
+       return 0;
+}
+
+int
+lstcon_test_batch_query(char *name, int testidx, int client,
+                       int timeout, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t *trans;
+       struct list_head         *translist;
+       struct list_head         *ndlist;
+       lstcon_tsb_hdr_t   *hdr;
+       lstcon_batch_t     *batch;
+       lstcon_test_t      *test = NULL;
+       int              transop;
+       int              rc;
+
+       rc = lstcon_batch_find(name, &batch);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find batch: %s\n", name);
+               return rc;
+       }
+
+       if (testidx == 0) {
+               translist = &batch->bat_trans_list;
+               ndlist    = &batch->bat_cli_list;
+               hdr       = &batch->bat_hdr;
+
+       } else {
+               /* query specified test only */
+               rc = lstcon_test_find(batch, testidx, &test);
+               if (rc != 0) {
+                       CDEBUG(D_NET, "Can't find test: %d\n", testidx);
+                       return rc;
+               }
+
+               translist = &test->tes_trans_list;
+               ndlist    = &test->tes_src_grp->grp_ndl_list;
+               hdr       = &test->tes_hdr;
+       }
+
+       transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY;
+
+       rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr,
+                                    lstcon_batrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, timeout);
+
+       if (testidx == 0 && /* query a batch, not a test */
+           lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 &&
+           lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) {
+               /* all RPCs finished, and no active test */
+               batch->bat_state = LST_BATCH_IDLE;
+       }
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up,
+                                         lstcon_tsbrpc_readent);
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+int
+lstcon_statrpc_readent(int transop, srpc_msg_t *msg,
+                      lstcon_rpc_ent_t *ent_up)
+{
+       srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+       sfw_counters_t    *sfwk_stat;
+       srpc_counters_t   *srpc_stat;
+       lnet_counters_t   *lnet_stat;
+
+       if (rep->str_status != 0)
+               return 0;
+
+       sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0];
+       srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat));
+       lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat));
+
+       if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) ||
+           copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) ||
+           copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat)))
+               return -EFAULT;
+
+       return 0;
+}
+
+int
+lstcon_ndlist_stat(struct list_head *ndlist,
+                  int timeout, struct list_head *result_up)
+{
+       struct list_head          head;
+       lstcon_rpc_trans_t *trans;
+       int              rc;
+
+       INIT_LIST_HEAD(&head);
+
+       rc = lstcon_rpc_trans_ndlist(ndlist, &head,
+                                    LST_TRANS_STATQRY, NULL, NULL, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up,
+                                         lstcon_statrpc_readent);
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+int
+lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up)
+{
+       lstcon_group_t     *grp;
+       int              rc;
+
+       rc = lstcon_group_find(grp_name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", grp_name);
+               return rc;
+       }
+
+       rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up);
+
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
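+/*
+ * Gather statistics from an explicit list of node ids: the ids are copied
+ * from userspace into a temporary group, which is then queried like a
+ * regular group.
+ */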
+int
+lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+                 int timeout, struct list_head *result_up)
+{
+       lstcon_ndlink_t  *ndl;
+       lstcon_group_t    *tmp;
+       lnet_process_id_t       id;
+       int                   i;
+       int                   rc;
+
+       rc = lstcon_group_alloc(NULL, &tmp);
+       if (rc != 0) {
+               CERROR("Out of memory\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < count; i++) {
+               if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+                       rc = -EFAULT;
+                       break;
+               }
+
+               /* add to tmp group */
+               rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2);
+               if (rc != 0) {
+                       CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET,
+                              "Failed to find or create %s: %d\n",
+                              libcfs_id2str(id), rc);
+                       break;
+               }
+       }
+
+       if (rc != 0) {
+               lstcon_group_put(tmp);
+               return rc;
+       }
+
+       rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up);
+
+       lstcon_group_put(tmp);
+
+       return rc;
+}
+
+int
+lstcon_debug_ndlist(struct list_head *ndlist,
+                   struct list_head *translist,
+                   int timeout, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t *trans;
+       int              rc;
+
+       rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY,
+                                    NULL, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up,
+                                         lstcon_sesrpc_readent);
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+int
+lstcon_session_debug(int timeout, struct list_head *result_up)
+{
+       return lstcon_debug_ndlist(&console_session.ses_ndl_list,
+                                  NULL, timeout, result_up);
+}
+
+int
+lstcon_batch_debug(int timeout, char *name,
+                  int client, struct list_head *result_up)
+{
+       lstcon_batch_t *bat;
+       int          rc;
+
+       rc = lstcon_batch_find(name, &bat);
+       if (rc != 0)
+               return -ENOENT;
+
+       rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list :
+                                         &bat->bat_srv_list,
+                                NULL, timeout, result_up);
+
+       return rc;
+}
+
+int
+lstcon_group_debug(int timeout, char *name,
+                  struct list_head *result_up)
+{
+       lstcon_group_t *grp;
+       int          rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0)
+               return -ENOENT;
+
+       rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+                                timeout, result_up);
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_nodes_debug(int timeout,
+                  int count, lnet_process_id_t *ids_up,
+                  struct list_head *result_up)
+{
+       lnet_process_id_t  id;
+       lstcon_ndlink_t   *ndl;
+       lstcon_group_t    *grp;
+       int             i;
+       int             rc;
+
+       rc = lstcon_group_alloc(NULL, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Out of memory\n");
+               return rc;
+       }
+
+       for (i = 0; i < count; i++) {
+               if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+                       rc = -EFAULT;
+                       break;
+               }
+
+               /* node is added to tmp group */
+               rc = lstcon_group_ndlink_find(grp, id, &ndl, 1);
+               if (rc != 0) {
+                       CERROR("Can't create node link\n");
+                       break;
+               }
+       }
+
+       if (rc != 0) {
+               lstcon_group_put(grp);
+               return rc;
+       }
+
+       rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+                                timeout, result_up);
+
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_session_match(lst_sid_t sid)
+{
+       return (console_session.ses_id.ses_nid   == sid.ses_nid &&
+               console_session.ses_id.ses_stamp == sid.ses_stamp) ? 1 : 0;
+}
+
+static void
+lstcon_new_session_id(lst_sid_t *sid)
+{
+       lnet_process_id_t      id;
+
+       LASSERT (console_session.ses_state == LST_SESSION_NONE);
+
+       LNetGetId(1, &id);
+       sid->ses_nid   = id.nid;
+       sid->ses_stamp = cfs_time_current();
+}
+
+extern srpc_service_t lstcon_acceptor_service;
+
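+/*
+ * Create the console session: allocate a new session id, create the default
+ * batch and start the session pinger; if 'force' is set, any existing
+ * session is ended first. The new session id is copied back to userspace.
+ */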
+int
+lstcon_session_new(char *name, int key, unsigned feats,
+                  int timeout, int force, lst_sid_t *sid_up)
+{
+       int     rc = 0;
+       int     i;
+
+       if (console_session.ses_state != LST_SESSION_NONE) {
+               /* session exists */
+               if (!force) {
+                       CNETERR("Session %s already exists\n",
+                               console_session.ses_name);
+                       return -EEXIST;
+               }
+
+               rc = lstcon_session_end();
+
+               /* lstcon_session_end() only returns local errors */
+               if (rc != 0)
+                       return rc;
+       }
+
+       if ((feats & ~LST_FEATS_MASK) != 0) {
+               CNETERR("Unknown session features %x\n",
+                       (feats & ~LST_FEATS_MASK));
+               return -EINVAL;
+       }
+
+       for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+               LASSERT(list_empty(&console_session.ses_ndl_hash[i]));
+
+       lstcon_new_session_id(&console_session.ses_id);
+
+       console_session.ses_key     = key;
+       console_session.ses_state   = LST_SESSION_ACTIVE;
+       console_session.ses_force   = !!force;
+       console_session.ses_features = feats;
+       console_session.ses_feats_updated = 0;
+       console_session.ses_timeout = (timeout <= 0) ?
+                                     LST_CONSOLE_TIMEOUT : timeout;
+       strcpy(console_session.ses_name, name);
+
+       rc = lstcon_batch_add(LST_DEFAULT_BATCH);
+       if (rc != 0)
+               return rc;
+
+       rc = lstcon_rpc_pinger_start();
+       if (rc != 0) {
+               lstcon_batch_t *bat = NULL;
+
+               lstcon_batch_find(LST_DEFAULT_BATCH, &bat);
+               lstcon_batch_destroy(bat);
+
+               return rc;
+       }
+
+       if (copy_to_user(sid_up, &console_session.ses_id,
+                            sizeof(lst_sid_t)) == 0)
+               return rc;
+
+       lstcon_session_end();
+
+       return -EFAULT;
+}
+
+int
+lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp,
+                   lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len)
+{
+       lstcon_ndlist_ent_t *entp;
+       lstcon_ndlink_t     *ndl;
+       int               rc = 0;
+
+       if (console_session.ses_state != LST_SESSION_ACTIVE)
+               return -ESRCH;
+
+       LIBCFS_ALLOC(entp, sizeof(*entp));
+       if (entp == NULL)
+               return -ENOMEM;
+
+       memset(entp, 0, sizeof(*entp));
+
+       list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link)
+               LST_NODE_STATE_COUNTER(ndl->ndl_node, entp);
+
+       if (copy_to_user(sid_up, &console_session.ses_id,
+                            sizeof(lst_sid_t)) ||
+           copy_to_user(key_up, &console_session.ses_key,
+                            sizeof(*key_up)) ||
+           copy_to_user(featp, &console_session.ses_features,
+                            sizeof(*featp)) ||
+           copy_to_user(ndinfo_up, entp, sizeof(*entp)) ||
+           copy_to_user(name_up, console_session.ses_name, len))
+               rc = -EFAULT;
+
+       LIBCFS_FREE(entp, sizeof(*entp));
+
+       return rc;
+}
+
+int
+lstcon_session_end(void)
+{
+       lstcon_rpc_trans_t *trans;
+       lstcon_group_t     *grp;
+       lstcon_batch_t     *bat;
+       int              rc = 0;
+
+       LASSERT (console_session.ses_state == LST_SESSION_ACTIVE);
+
+       rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list,
+                                    NULL, LST_TRANS_SESEND, NULL,
+                                    lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       console_session.ses_shutdown = 1;
+
+       lstcon_rpc_pinger_stop();
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       lstcon_rpc_trans_destroy(trans);
+       /* the user can do nothing even if the RPC failed, so carry on */
+
+       /* wait for orphan RPCs to die */
+       lstcon_rpc_cleanup_wait();
+
+       console_session.ses_id    = LST_INVALID_SID;
+       console_session.ses_state = LST_SESSION_NONE;
+       console_session.ses_key   = 0;
+       console_session.ses_force = 0;
+       console_session.ses_feats_updated = 0;
+
+       /* destroy all batches */
+       while (!list_empty(&console_session.ses_bat_list)) {
+               bat = list_entry(console_session.ses_bat_list.next,
+                                    lstcon_batch_t, bat_link);
+
+               lstcon_batch_destroy(bat);
+       }
+
+       /* destroy all groups */
+       while (!list_empty(&console_session.ses_grp_list)) {
+               grp = list_entry(console_session.ses_grp_list.next,
+                                    lstcon_group_t, grp_link);
+               LASSERT (grp->grp_ref == 1);
+
+               lstcon_group_put(grp);
+       }
+
+       /* all nodes should be released */
+       LASSERT (list_empty(&console_session.ses_ndl_list));
+
+       console_session.ses_shutdown = 0;
+       console_session.ses_expired  = 0;
+
+       return rc;
+}
+
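+/*
+ * Check the feature bits reported by a test node: the first node to report
+ * latches the session features, and every later node must match them.
+ */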
+int
+lstcon_session_feats_check(unsigned feats)
+{
+       int rc = 0;
+
+       if ((feats & ~LST_FEATS_MASK) != 0) {
+               CERROR("Can't support these features: %x\n",
+                      (feats & ~LST_FEATS_MASK));
+               return -EPROTO;
+       }
+
+       spin_lock(&console_session.ses_rpc_lock);
+
+       if (!console_session.ses_feats_updated) {
+               console_session.ses_feats_updated = 1;
+               console_session.ses_features = feats;
+       }
+
+       if (console_session.ses_features != feats)
+               rc = -EPROTO;
+
+       spin_unlock(&console_session.ses_rpc_lock);
+
+       if (rc != 0) {
+               CERROR("remote features %x do not match "
+                      "console session features %x\n",
+                      feats, console_session.ses_features);
+       }
+
+       return rc;
+}
+
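+/*
+ * Handle a JOIN request from a test node: validate the session and feature
+ * bits, find or create the requested group and add the joining node to it
+ * as an active userland node.
+ */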
+static int
+lstcon_acceptor_handle (srpc_server_rpc_t *rpc)
+{
+       srpc_msg_t      *rep  = &rpc->srpc_replymsg;
+       srpc_msg_t      *req  = &rpc->srpc_reqstbuf->buf_msg;
+       srpc_join_reqst_t *jreq = &req->msg_body.join_reqst;
+       srpc_join_reply_t *jrep = &rep->msg_body.join_reply;
+       lstcon_group_t    *grp  = NULL;
+       lstcon_ndlink_t   *ndl;
+       int             rc   = 0;
+
+       sfw_unpack_message(req);
+
+       mutex_lock(&console_session.ses_mutex);
+
+       jrep->join_sid = console_session.ses_id;
+
+       if (console_session.ses_id.ses_nid == LNET_NID_ANY) {
+               jrep->join_status = ESRCH;
+               goto out;
+       }
+
+       if (lstcon_session_feats_check(req->msg_ses_feats) != 0) {
+               jrep->join_status = EPROTO;
+               goto out;
+       }
+
+       if (jreq->join_sid.ses_nid != LNET_NID_ANY &&
+            !lstcon_session_match(jreq->join_sid)) {
+               jrep->join_status = EBUSY;
+               goto out;
+       }
+
+       if (lstcon_group_find(jreq->join_group, &grp) != 0) {
+               rc = lstcon_group_alloc(jreq->join_group, &grp);
+               if (rc != 0) {
+                       CERROR("Out of memory\n");
+                       goto out;
+               }
+
+               list_add_tail(&grp->grp_link,
+                                 &console_session.ses_grp_list);
+               lstcon_group_addref(grp);
+       }
+
+       if (grp->grp_ref > 2) {
+               /* Group is in use */
+               jrep->join_status = EBUSY;
+               goto out;
+       }
+
+       rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0);
+       if (rc == 0) {
+               jrep->join_status = EEXIST;
+               goto out;
+       }
+
+       rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1);
+       if (rc != 0) {
+               CERROR("Out of memory\n");
+               goto out;
+       }
+
+       ndl->ndl_node->nd_state   = LST_NODE_ACTIVE;
+       ndl->ndl_node->nd_timeout = console_session.ses_timeout;
+
+       if (grp->grp_userland == 0)
+               grp->grp_userland = 1;
+
+       strcpy(jrep->join_session, console_session.ses_name);
+       jrep->join_timeout = console_session.ses_timeout;
+       jrep->join_status  = 0;
+
+out:
+       rep->msg_ses_feats = console_session.ses_features;
+       if (grp != NULL)
+               lstcon_group_put(grp);
+
+       mutex_unlock(&console_session.ses_mutex);
+
+       return rc;
+}
+
+srpc_service_t lstcon_acceptor_service;
+void lstcon_init_acceptor_service(void)
+{
+       /* initialize selftest console acceptor service table */
+       lstcon_acceptor_service.sv_name    = "join session";
+       lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle;
+       lstcon_acceptor_service.sv_id      = SRPC_SERVICE_JOIN;
+       lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX;
+}
+
+extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data);
+
+DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry);
+
+/* initialize console */
+int
+lstcon_console_init(void)
+{
+       int     i;
+       int     rc;
+
+       memset(&console_session, 0, sizeof(lstcon_session_t));
+
+       console_session.ses_id              = LST_INVALID_SID;
+       console_session.ses_state           = LST_SESSION_NONE;
+       console_session.ses_timeout         = 0;
+       console_session.ses_force           = 0;
+       console_session.ses_expired         = 0;
+       console_session.ses_feats_updated   = 0;
+       console_session.ses_features        = LST_FEATS_MASK;
+       console_session.ses_laststamp       = cfs_time_current_sec();
+
+       mutex_init(&console_session.ses_mutex);
+
+       INIT_LIST_HEAD(&console_session.ses_ndl_list);
+       INIT_LIST_HEAD(&console_session.ses_grp_list);
+       INIT_LIST_HEAD(&console_session.ses_bat_list);
+       INIT_LIST_HEAD(&console_session.ses_trans_list);
+
+       LIBCFS_ALLOC(console_session.ses_ndl_hash,
+                    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+       if (console_session.ses_ndl_hash == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+               INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]);
+
+
+       /* initialize acceptor service table */
+       lstcon_init_acceptor_service();
+
+       rc = srpc_add_service(&lstcon_acceptor_service);
+       LASSERT (rc != -EBUSY);
+       if (rc != 0) {
+               LIBCFS_FREE(console_session.ses_ndl_hash,
+                           sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+               return rc;
+       }
+
+       rc = srpc_service_add_buffers(&lstcon_acceptor_service,
+                                     lstcon_acceptor_service.sv_wi_total);
+       if (rc != 0) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       rc = libcfs_register_ioctl(&lstcon_ioctl_handler);
+
+       if (rc == 0) {
+               lstcon_rpc_module_init();
+               return 0;
+       }
+
+out:
+       srpc_shutdown_service(&lstcon_acceptor_service);
+       srpc_remove_service(&lstcon_acceptor_service);
+
+       LIBCFS_FREE(console_session.ses_ndl_hash,
+                   sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+       srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+       return rc;
+}
+
+int
+lstcon_console_fini(void)
+{
+       int     i;
+
+       libcfs_deregister_ioctl(&lstcon_ioctl_handler);
+
+       mutex_lock(&console_session.ses_mutex);
+
+       srpc_shutdown_service(&lstcon_acceptor_service);
+       srpc_remove_service(&lstcon_acceptor_service);
+
+       if (console_session.ses_state != LST_SESSION_NONE)
+               lstcon_session_end();
+
+       lstcon_rpc_module_fini();
+
+       mutex_unlock(&console_session.ses_mutex);
+
+       LASSERT (list_empty(&console_session.ses_ndl_list));
+       LASSERT (list_empty(&console_session.ses_grp_list));
+       LASSERT (list_empty(&console_session.ses_bat_list));
+       LASSERT (list_empty(&console_session.ses_trans_list));
+
+       for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) {
+               LASSERT (list_empty(&console_session.ses_ndl_hash[i]));
+       }
+
+       LIBCFS_FREE(console_session.ses_ndl_hash,
+                   sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+       srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+       return 0;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h
new file mode 100644 (file)
index 0000000..e61b266
--- /dev/null
@@ -0,0 +1,232 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.h
+ *
+ * kernel structure for LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LST_CONSOLE_H__
+#define __LST_CONSOLE_H__
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "selftest.h"
+#include "conrpc.h"
+
+typedef struct lstcon_node {
+       lnet_process_id_t    nd_id;       /* id of the node */
+       int               nd_ref;        /* reference count */
+       int               nd_state;       /* state of the node */
+       int               nd_timeout;     /* session timeout */
+       cfs_time_t         nd_stamp;       /* timestamp of last replied RPC */
+       struct lstcon_rpc    nd_ping;   /* ping rpc */
+} lstcon_node_t;                               /*** node descriptor */
+
+typedef struct {
+       struct list_head           ndl_link;       /* chain on list */
+       struct list_head           ndl_hlink;      /* chain on hash */
+       lstcon_node_t       *ndl_node;       /* pointer to node */
+} lstcon_ndlink_t;                           /*** node link descriptor */
+
+typedef struct {
+       struct list_head           grp_link;       /* chain on global group list */
+       int               grp_ref;      /* reference count */
+       int               grp_userland;   /* has userland nodes */
+       int               grp_nnode;      /* # of nodes */
+       char             grp_name[LST_NAME_SIZE]; /* group name */
+
+       struct list_head           grp_trans_list; /* transaction list */
+       struct list_head           grp_ndl_list;   /* nodes list */
+       struct list_head           grp_ndl_hash[0];/* hash table for nodes */
+} lstcon_group_t;                  /*** group descriptor (a named set of nodes) */
+
+#define LST_BATCH_IDLE   0xB0      /* idle batch */
+#define LST_BATCH_RUNNING       0xB1       /* running batch */
+
+typedef struct lstcon_tsb_hdr {
+       lst_bid_t              tsb_id;   /* batch ID */
+       int                  tsb_index;      /* test index */
+} lstcon_tsb_hdr_t;
+
+typedef struct {
+       lstcon_tsb_hdr_t        bat_hdr;        /* test_batch header */
+       struct list_head              bat_link;       /* chain on session's batches list */
+       int                  bat_ntest;      /* # of test */
+       int                  bat_state;      /* state of the batch */
+       int                  bat_arg;   /* parameter for run|stop, timeout for run, force for stop */
+       char                bat_name[LST_NAME_SIZE]; /* name of batch */
+
+       struct list_head              bat_test_list;  /* list head of tests (lstcon_test_t) */
+       struct list_head              bat_trans_list; /* list head of transaction */
+       struct list_head              bat_cli_list;   /* list head of client nodes (lstcon_node_t) */
+       struct list_head             *bat_cli_hash;   /* hash table of client nodes */
+       struct list_head              bat_srv_list;   /* list head of server nodes */
+       struct list_head             *bat_srv_hash;   /* hash table of server nodes */
+} lstcon_batch_t;                           /*** (tests) batch descriptor */
+
+typedef struct lstcon_test {
+       lstcon_tsb_hdr_t      tes_hdr;  /* test batch header */
+       struct list_head            tes_link;       /* chain on batch's tests list */
+       lstcon_batch_t       *tes_batch;      /* pointer to batch */
+
+       int                tes_type;       /* type of the test, e.g. bulk, ping */
+       int                tes_stop_onerr; /* stop on error */
+       int                tes_oneside;    /* one-sided test */
+       int                tes_concur;     /* concurrency */
+       int                tes_loop;       /* loop count */
+       int                tes_dist;       /* nodes distribution of target group */
+       int                tes_span;       /* nodes span of target group */
+       int                tes_cliidx;     /* client index, used for RPC creating */
+
+       struct list_head  tes_trans_list; /* transaction list */
+       lstcon_group_t       *tes_src_grp;    /* group that runs the test */
+       lstcon_group_t       *tes_dst_grp;    /* target group */
+
+       int                tes_paramlen;   /* test parameter length */
+       char              tes_param[0];   /* test parameter */
+} lstcon_test_t;                               /*** a single test descriptor */
+
+#define LST_GLOBAL_HASHSIZE     503         /* global nodes hash table size */
+#define LST_NODE_HASHSIZE       239         /* node hash table (for batch or group) */
+
+#define LST_SESSION_NONE       0x0          /* no session */
+#define LST_SESSION_ACTIVE      0x1         /* working session */
+
+#define LST_CONSOLE_TIMEOUT     300         /* default console timeout */
+
+typedef struct {
+       struct mutex            ses_mutex;      /* only 1 thread in session */
+       lst_sid_t              ses_id;   /* global session id */
+       int                  ses_key;   /* local session key */
+       int                  ses_state;      /* state of session */
+       int                  ses_timeout;    /* timeout in seconds */
+       time_t            ses_laststamp;  /* last operation stamp (seconds) */
+       /** tests features of the session */
+       unsigned                ses_features;
+       /** features are synced with remote test nodes */
+       unsigned                ses_feats_updated:1;
+       /** force creating */
+       unsigned                ses_force:1;
+       /** session is shutting down */
+       unsigned                ses_shutdown:1;
+       /** console has timed out */
+       unsigned                ses_expired:1;
+       __u64              ses_id_cookie;  /* batch id cookie */
+       char                ses_name[LST_NAME_SIZE];  /* session name */
+       lstcon_rpc_trans_t     *ses_ping;       /* session pinger */
+       stt_timer_t          ses_ping_timer; /* timer for pinger */
+       lstcon_trans_stat_t     ses_trans_stat; /* transaction stats */
+
+       struct list_head              ses_trans_list; /* global list of transaction */
+       struct list_head              ses_grp_list;   /* global list of groups */
+       struct list_head              ses_bat_list;   /* global list of batches */
+       struct list_head              ses_ndl_list;   /* global list of nodes */
+       struct list_head             *ses_ndl_hash;   /* hash table of nodes */
+
+       spinlock_t        ses_rpc_lock;   /* serialize */
+       atomic_t            ses_rpc_counter;/* # of initialized RPCs */
+       struct list_head              ses_rpc_freelist; /* idle console rpc */
+} lstcon_session_t;                         /*** session descriptor */
+
+extern lstcon_session_t         console_session;
+
+static inline lstcon_trans_stat_t *
+lstcon_trans_stat(void)
+{
+       return &console_session.ses_trans_stat;
+}
+
+static inline struct list_head *
+lstcon_id2hash (lnet_process_id_t id, struct list_head *hash)
+{
+       unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+
+       return &hash[idx];
+}
+
+extern int lstcon_session_match(lst_sid_t sid);
+extern int lstcon_session_new(char *name, int key, unsigned version,
+                             int timeout, int flags, lst_sid_t *sid_up);
+extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp,
+                              lstcon_ndlist_ent_t *entp, char *name_up, int len);
+extern int lstcon_session_end(void);
+extern int lstcon_session_debug(int timeout, struct list_head *result_up);
+extern int lstcon_session_feats_check(unsigned feats);
+extern int lstcon_batch_debug(int timeout, char *name,
+                             int client, struct list_head *result_up);
+extern int lstcon_group_debug(int timeout, char *name,
+                             struct list_head *result_up);
+extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up,
+                             struct list_head *result_up);
+extern int lstcon_group_add(char *name);
+extern int lstcon_group_del(char *name);
+extern int lstcon_group_clean(char *name, int args);
+extern int lstcon_group_refresh(char *name, struct list_head *result_up);
+extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up,
+                           unsigned *featp, struct list_head *result_up);
+extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up,
+                              struct list_head *result_up);
+extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up,
+                            int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up);
+extern int lstcon_group_list(int idx, int len, char *name_up);
+extern int lstcon_batch_add(char *name);
+extern int lstcon_batch_run(char *name, int timeout,
+                           struct list_head *result_up);
+extern int lstcon_batch_stop(char *name, int force,
+                            struct list_head *result_up);
+extern int lstcon_test_batch_query(char *name, int testidx,
+                                  int client, int timeout,
+                                  struct list_head *result_up);
+extern int lstcon_batch_del(char *name);
+extern int lstcon_batch_list(int idx, int namelen, char *name_up);
+extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up,
+                            int server, int testidx, int *index_p,
+                            int *ndent_p, lstcon_node_ent_t *dents_up);
+extern int lstcon_group_stat(char *grp_name, int timeout,
+                            struct list_head *result_up);
+extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+                            int timeout, struct list_head *result_up);
+extern int lstcon_test_add(char *name, int type, int loop, int concur,
+                          int dist, int span, char *src_name, char * dst_name,
+                          void *param, int paramlen, int *retp,
+                          struct list_head *result_up);
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c
new file mode 100644 (file)
index 0000000..483c785
--- /dev/null
@@ -0,0 +1,1814 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/framework.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ * Author: Liang Zhen  <liangzhen@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1};
+
+static int session_timeout = 100;
+CFS_MODULE_PARM(session_timeout, "i", int, 0444,
+               "test session timeout in seconds (100 by default, 0 == never)");
+
+static int rpc_timeout = 64;
+CFS_MODULE_PARM(rpc_timeout, "i", int, 0644,
+               "rpc timeout in seconds (64 by default, 0 == never)");
+
+#define sfw_unpack_id(id)             \
+do {                               \
+       __swab64s(&(id).nid);      \
+       __swab32s(&(id).pid);      \
+} while (0)
+
+#define sfw_unpack_sid(sid)         \
+do {                               \
+       __swab64s(&(sid).ses_nid);      \
+       __swab64s(&(sid).ses_stamp);    \
+} while (0)
+
+#define sfw_unpack_fw_counters(fc)     \
+do {                                 \
+       __swab32s(&(fc).running_ms);      \
+       __swab32s(&(fc).active_batches);  \
+       __swab32s(&(fc).zombie_sessions); \
+       __swab32s(&(fc).brw_errors);      \
+       __swab32s(&(fc).ping_errors);     \
+} while (0)
+
+#define sfw_unpack_rpc_counters(rc)     \
+do {                               \
+       __swab32s(&(rc).errors);        \
+       __swab32s(&(rc).rpcs_sent);     \
+       __swab32s(&(rc).rpcs_rcvd);     \
+       __swab32s(&(rc).rpcs_dropped);  \
+       __swab32s(&(rc).rpcs_expired);  \
+       __swab64s(&(rc).bulk_get);      \
+       __swab64s(&(rc).bulk_put);      \
+} while (0)
+
+#define sfw_unpack_lnet_counters(lc)    \
+do {                               \
+       __swab32s(&(lc).errors);        \
+       __swab32s(&(lc).msgs_max);      \
+       __swab32s(&(lc).msgs_alloc);    \
+       __swab32s(&(lc).send_count);    \
+       __swab32s(&(lc).recv_count);    \
+       __swab32s(&(lc).drop_count);    \
+       __swab32s(&(lc).route_count);   \
+       __swab64s(&(lc).send_length);   \
+       __swab64s(&(lc).recv_length);   \
+       __swab64s(&(lc).drop_length);   \
+       __swab64s(&(lc).route_length);  \
+} while (0)
+
+#define sfw_test_active(t)      (atomic_read(&(t)->tsi_nactive) != 0)
+#define sfw_batch_active(b)     (atomic_read(&(b)->bat_nactive) != 0)
+
+struct smoketest_framework {
+       struct list_head         fw_zombie_rpcs;     /* RPCs to be recycled */
+       struct list_head         fw_zombie_sessions; /* stopping sessions */
+       struct list_head         fw_tests;         /* registered test cases */
+       atomic_t       fw_nzombies;     /* # zombie sessions */
+       spinlock_t         fw_lock;             /* serialise */
+       sfw_session_t     *fw_session;          /* _the_ session */
+       int                fw_shuttingdown;     /* shutdown in progress */
+       srpc_server_rpc_t *fw_active_srpc;      /* running RPC */
+} sfw_data;
+
+/* forward ref's */
+int sfw_stop_batch (sfw_batch_t *tsb, int force);
+void sfw_destroy_session (sfw_session_t *sn);
+
+static inline sfw_test_case_t *
+sfw_find_test_case(int id)
+{
+       sfw_test_case_t *tsc;
+
+       LASSERT (id <= SRPC_SERVICE_MAX_ID);
+       LASSERT (id > SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+       list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+               if (tsc->tsc_srv_service->sv_id == id)
+                       return tsc;
+       }
+
+       return NULL;
+}
+
+static int
+sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops)
+{
+       sfw_test_case_t *tsc;
+
+       if (sfw_find_test_case(service->sv_id) != NULL) {
+               CERROR ("Failed to register test %s (%d)\n",
+                       service->sv_name, service->sv_id);
+               return -EEXIST;
+       }
+
+       LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t));
+       if (tsc == NULL)
+               return -ENOMEM;
+
+       memset(tsc, 0, sizeof(sfw_test_case_t));
+       tsc->tsc_cli_ops     = cliops;
+       tsc->tsc_srv_service = service;
+
+       list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests);
+       return 0;
+}
+
+void
+sfw_add_session_timer (void)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       stt_timer_t   *timer = &sn->sn_timer;
+
+       LASSERT (!sfw_data.fw_shuttingdown);
+
+       if (sn == NULL || sn->sn_timeout == 0)
+               return;
+
+       LASSERT (!sn->sn_timer_active);
+
+       sn->sn_timer_active = 1;
+       timer->stt_expires = cfs_time_add(sn->sn_timeout,
+                                         cfs_time_current_sec());
+       stt_add_timer(timer);
+       return;
+}
+
+int
+sfw_del_session_timer (void)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+
+       if (sn == NULL || !sn->sn_timer_active)
+               return 0;
+
+       LASSERT (sn->sn_timeout != 0);
+
+       if (stt_del_timer(&sn->sn_timer)) { /* timer defused */
+               sn->sn_timer_active = 0;
+               return 0;
+       }
+
+       return EBUSY; /* racing with sfw_session_expired() */
+}
+
+/* called with sfw_data.fw_lock held */
+static void
+sfw_deactivate_session (void)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       int         nactive = 0;
+       sfw_batch_t   *tsb;
+       sfw_test_case_t *tsc;
+
+       if (sn == NULL) return;
+
+       LASSERT (!sn->sn_timer_active);
+
+       sfw_data.fw_session = NULL;
+       atomic_inc(&sfw_data.fw_nzombies);
+       list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions);
+
+       spin_unlock(&sfw_data.fw_lock);
+
+       list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+               srpc_abort_service(tsc->tsc_srv_service);
+       }
+
+       spin_lock(&sfw_data.fw_lock);
+
+       list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+               if (sfw_batch_active(tsb)) {
+                       nactive++;
+                       sfw_stop_batch(tsb, 1);
+               }
+       }
+
+       if (nactive != 0)
+               return;   /* wait for active batches to stop */
+
+       list_del_init(&sn->sn_list);
+       spin_unlock(&sfw_data.fw_lock);
+
+       sfw_destroy_session(sn);
+
+       spin_lock(&sfw_data.fw_lock);
+}
+
+
+void
+sfw_session_expired (void *data)
+{
+       sfw_session_t *sn = data;
+
+       spin_lock(&sfw_data.fw_lock);
+
+       LASSERT (sn->sn_timer_active);
+       LASSERT (sn == sfw_data.fw_session);
+
+       CWARN ("Session expired! sid: %s-"LPU64", name: %s\n",
+              libcfs_nid2str(sn->sn_id.ses_nid),
+              sn->sn_id.ses_stamp, &sn->sn_name[0]);
+
+       sn->sn_timer_active = 0;
+       sfw_deactivate_session();
+
+       spin_unlock(&sfw_data.fw_lock);
+}
+
+static inline void
+sfw_init_session(sfw_session_t *sn, lst_sid_t sid,
+                unsigned features, const char *name)
+{
+       stt_timer_t *timer = &sn->sn_timer;
+
+       memset(sn, 0, sizeof(sfw_session_t));
+       INIT_LIST_HEAD(&sn->sn_list);
+       INIT_LIST_HEAD(&sn->sn_batches);
+       atomic_set(&sn->sn_refcount, 1);        /* +1 for caller */
+       atomic_set(&sn->sn_brw_errors, 0);
+       atomic_set(&sn->sn_ping_errors, 0);
+       strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name));
+
+       sn->sn_timer_active = 0;
+       sn->sn_id          = sid;
+       sn->sn_features     = features;
+       sn->sn_timeout      = session_timeout;
+       sn->sn_started      = cfs_time_current();
+
+       timer->stt_data = sn;
+       timer->stt_func = sfw_session_expired;
+       INIT_LIST_HEAD(&timer->stt_list);
+}
+
+/* completion handler for incoming framework RPCs */
+void
+sfw_server_rpc_done(struct srpc_server_rpc *rpc)
+{
+       struct srpc_service     *sv     = rpc->srpc_scd->scd_svc;
+       int                     status  = rpc->srpc_status;
+
+       CDEBUG (D_NET,
+               "Incoming framework RPC done: "
+               "service %s, peer %s, status %s:%d\n",
+               sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+               swi_state2str(rpc->srpc_wi.swi_state),
+               status);
+
+       if (rpc->srpc_bulk != NULL)
+               sfw_free_pages(rpc);
+       return;
+}
+
+void
+sfw_client_rpc_fini (srpc_client_rpc_t *rpc)
+{
+       LASSERT (rpc->crpc_bulk.bk_niov == 0);
+       LASSERT (list_empty(&rpc->crpc_list));
+       LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+       CDEBUG (D_NET,
+               "Outgoing framework RPC done: "
+               "service %d, peer %s, status %s:%d:%d\n",
+               rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+               swi_state2str(rpc->crpc_wi.swi_state),
+               rpc->crpc_aborted, rpc->crpc_status);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       /* my callers must finish all RPCs before shutting me down */
+       LASSERT(!sfw_data.fw_shuttingdown);
+       list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs);
+
+       spin_unlock(&sfw_data.fw_lock);
+}
+
+sfw_batch_t *
+sfw_find_batch (lst_bid_t bid)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       sfw_batch_t   *bat;
+
+       LASSERT (sn != NULL);
+
+       list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+               if (bat->bat_id.bat_id == bid.bat_id)
+                       return bat;
+       }
+
+       return NULL;
+}
+
+sfw_batch_t *
+sfw_bid2batch (lst_bid_t bid)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       sfw_batch_t   *bat;
+
+       LASSERT (sn != NULL);
+
+       bat = sfw_find_batch(bid);
+       if (bat != NULL)
+               return bat;
+
+       LIBCFS_ALLOC(bat, sizeof(sfw_batch_t));
+       if (bat == NULL)
+               return NULL;
+
+       bat->bat_error    = 0;
+       bat->bat_session  = sn;
+       bat->bat_id       = bid;
+       atomic_set(&bat->bat_nactive, 0);
+       INIT_LIST_HEAD(&bat->bat_tests);
+
+       list_add_tail(&bat->bat_list, &sn->sn_batches);
+       return bat;
+}
+
+int
+sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply)
+{
+       sfw_session_t  *sn = sfw_data.fw_session;
+       sfw_counters_t *cnt = &reply->str_fw;
+       sfw_batch_t    *bat;
+       struct timeval  tv;
+
+       reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+       if (request->str_sid.ses_nid == LNET_NID_ANY) {
+               reply->str_status = EINVAL;
+               return 0;
+       }
+
+       if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) {
+               reply->str_status = ESRCH;
+               return 0;
+       }
+
+       lnet_counters_get(&reply->str_lnet);
+       srpc_get_counters(&reply->str_rpc);
+
+       /* send over the msecs since the session was started
+        * - with 32 bits to send, this is ~49 days */
+       cfs_duration_usec(cfs_time_sub(cfs_time_current(),
+                                      sn->sn_started), &tv);
+
+       cnt->running_ms      = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+       cnt->brw_errors      = atomic_read(&sn->sn_brw_errors);
+       cnt->ping_errors     = atomic_read(&sn->sn_ping_errors);
+       cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies);
+
+       cnt->active_batches = 0;
+       list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+               if (atomic_read(&bat->bat_nactive) > 0)
+                       cnt->active_batches++;
+       }
+
+       reply->str_status = 0;
+       return 0;
+}
+
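+/*
+ * Handle a MAKE_SESSION request.  If a session already exists, the request
+ * is accepted when it names the same session id (the refcount is bumped),
+ * rejected with EBUSY when it does not and mksn_force is clear, and
+ * otherwise the old session is deactivated and a new one installed.
+ * Requests asking for features outside LST_FEATS_MASK are rejected with
+ * EPROTO.
+ */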
+int
+sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       srpc_msg_t    *msg = container_of(request, srpc_msg_t,
+                                         msg_body.mksn_reqst);
+       int            cplen = 0;
+
+       if (request->mksn_sid.ses_nid == LNET_NID_ANY) {
+               reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+               reply->mksn_status = EINVAL;
+               return 0;
+       }
+
+       if (sn != NULL) {
+               reply->mksn_status  = 0;
+               reply->mksn_sid     = sn->sn_id;
+               reply->mksn_timeout = sn->sn_timeout;
+
+               if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) {
+                       atomic_inc(&sn->sn_refcount);
+                       return 0;
+               }
+
+               if (!request->mksn_force) {
+                       reply->mksn_status = EBUSY;
+                       cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0],
+                                       sizeof(reply->mksn_name));
+                       if (cplen >= sizeof(reply->mksn_name))
+                               return -E2BIG;
+                       return 0;
+               }
+       }
+
+       /* Reject the request if it asks for unknown features.
+        * NB: an old version will always accept all features because it is
+        * not aware of srpc_msg_t::msg_ses_feats; this is a defect, but it is
+        * harmless because it returns a zero feature mask to the console, and
+        * it is the console's responsibility to make sure all nodes in a
+        * session have the same feature mask. */
+       if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               reply->mksn_status = EPROTO;
+               return 0;
+       }
+
+       /* brand new or create by force */
+       LIBCFS_ALLOC(sn, sizeof(sfw_session_t));
+       if (sn == NULL) {
+               CERROR ("Dropping RPC (mksn) under memory pressure.\n");
+               return -ENOMEM;
+       }
+
+       sfw_init_session(sn, request->mksn_sid,
+                        msg->msg_ses_feats, &request->mksn_name[0]);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       sfw_deactivate_session();
+       LASSERT(sfw_data.fw_session == NULL);
+       sfw_data.fw_session = sn;
+
+       spin_unlock(&sfw_data.fw_lock);
+
+       reply->mksn_status  = 0;
+       reply->mksn_sid     = sn->sn_id;
+       reply->mksn_timeout = sn->sn_timeout;
+       return 0;
+}
+
+int
+sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+
+       reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+       if (request->rmsn_sid.ses_nid == LNET_NID_ANY) {
+               reply->rmsn_status = EINVAL;
+               return 0;
+       }
+
+       if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) {
+               reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY;
+               return 0;
+       }
+
+       if (!atomic_dec_and_test(&sn->sn_refcount)) {
+               reply->rmsn_status = 0;
+               return 0;
+       }
+
+       spin_lock(&sfw_data.fw_lock);
+       sfw_deactivate_session();
+       spin_unlock(&sfw_data.fw_lock);
+
+       reply->rmsn_status = 0;
+       reply->rmsn_sid    = LST_INVALID_SID;
+       LASSERT(sfw_data.fw_session == NULL);
+       return 0;
+}
+
+int
+sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+
+       if (sn == NULL) {
+               reply->dbg_status = ESRCH;
+               reply->dbg_sid    = LST_INVALID_SID;
+               return 0;
+       }
+
+       reply->dbg_status  = 0;
+       reply->dbg_sid     = sn->sn_id;
+       reply->dbg_timeout = sn->sn_timeout;
+       if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name))
+           >= sizeof(reply->dbg_name))
+               return -E2BIG;
+
+       return 0;
+}
+
+void
+sfw_test_rpc_fini (srpc_client_rpc_t *rpc)
+{
+       sfw_test_unit_t     *tsu = rpc->crpc_priv;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+       /* Called with tsi->tsi_lock held */
+       LASSERT (list_empty(&rpc->crpc_list));
+       list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+}
+
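+/*
+ * Number of buffers to post (per CPT) for the server side of a test:
+ * bounded by the service's total workitems and the requested loop count,
+ * with SFW_TEST_WI_EXTRA spares and a floor of SFW_TEST_WI_MIN.
+ */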
+static inline int
+sfw_test_buffers(sfw_test_instance_t *tsi)
+{
+       struct sfw_test_case    *tsc = sfw_find_test_case(tsi->tsi_service);
+       struct srpc_service     *svc = tsc->tsc_srv_service;
+       int                     nbuf;
+
+       nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts;
+       return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA);
+}
+
+int
+sfw_load_test(struct sfw_test_instance *tsi)
+{
+       struct sfw_test_case    *tsc;
+       struct srpc_service     *svc;
+       int                     nbuf;
+       int                     rc;
+
+       LASSERT(tsi != NULL);
+       tsc = sfw_find_test_case(tsi->tsi_service);
+       nbuf = sfw_test_buffers(tsi);
+       LASSERT(tsc != NULL);
+       svc = tsc->tsc_srv_service;
+
+       if (tsi->tsi_is_client) {
+               tsi->tsi_ops = tsc->tsc_cli_ops;
+               return 0;
+       }
+
+       rc = srpc_service_add_buffers(svc, nbuf);
+       if (rc != 0) {
+               CWARN("Failed to reserve enough buffers: "
+                     "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc);
+               /* NB: this error path is not strictly correct because it may
+                * release more buffers than were actually allocated, but that
+                * does not matter: the request portal should be a lazy portal
+                * and will grow its buffers again if necessary. */
+               srpc_service_remove_buffers(svc, nbuf);
+               return -ENOMEM;
+       }
+
+       CDEBUG(D_NET, "Reserved %d buffers for test %s\n",
+              nbuf * (srpc_serv_is_framework(svc) ?
+                      1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name);
+       return 0;
+}
+
+void
+sfw_unload_test(struct sfw_test_instance *tsi)
+{
+       struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+
+       LASSERT(tsc != NULL);
+
+       if (tsi->tsi_is_client)
+               return;
+
+       /* Shrink buffers: the request portal is a lazy portal that can grow
+        * buffers at runtime, so we may leave some buffers behind, but
+        * never mind... */
+       srpc_service_remove_buffers(tsc->tsc_srv_service,
+                                   sfw_test_buffers(tsi));
+       return;
+}
+
+void
+sfw_destroy_test_instance (sfw_test_instance_t *tsi)
+{
+       srpc_client_rpc_t *rpc;
+       sfw_test_unit_t   *tsu;
+
+       if (!tsi->tsi_is_client) goto clean;
+
+       tsi->tsi_ops->tso_fini(tsi);
+
+       LASSERT (!tsi->tsi_stopping);
+       LASSERT (list_empty(&tsi->tsi_active_rpcs));
+       LASSERT (!sfw_test_active(tsi));
+
+       while (!list_empty(&tsi->tsi_units)) {
+               tsu = list_entry(tsi->tsi_units.next,
+                                    sfw_test_unit_t, tsu_list);
+               list_del(&tsu->tsu_list);
+               LIBCFS_FREE(tsu, sizeof(*tsu));
+       }
+
+       while (!list_empty(&tsi->tsi_free_rpcs)) {
+               rpc = list_entry(tsi->tsi_free_rpcs.next,
+                                    srpc_client_rpc_t, crpc_list);
+               list_del(&rpc->crpc_list);
+               LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+       }
+
+clean:
+       sfw_unload_test(tsi);
+       LIBCFS_FREE(tsi, sizeof(*tsi));
+       return;
+}
+
+void
+sfw_destroy_batch (sfw_batch_t *tsb)
+{
+       sfw_test_instance_t *tsi;
+
+       LASSERT (!sfw_batch_active(tsb));
+       LASSERT (list_empty(&tsb->bat_list));
+
+       while (!list_empty(&tsb->bat_tests)) {
+               tsi = list_entry(tsb->bat_tests.next,
+                                    sfw_test_instance_t, tsi_list);
+               list_del_init(&tsi->tsi_list);
+               sfw_destroy_test_instance(tsi);
+       }
+
+       LIBCFS_FREE(tsb, sizeof(sfw_batch_t));
+       return;
+}
+
+void
+sfw_destroy_session (sfw_session_t *sn)
+{
+       sfw_batch_t *batch;
+
+       LASSERT (list_empty(&sn->sn_list));
+       LASSERT (sn != sfw_data.fw_session);
+
+       while (!list_empty(&sn->sn_batches)) {
+               batch = list_entry(sn->sn_batches.next,
+                                      sfw_batch_t, bat_list);
+               list_del_init(&batch->bat_list);
+               sfw_destroy_batch(batch);
+       }
+
+       LIBCFS_FREE(sn, sizeof(*sn));
+       atomic_dec(&sfw_data.fw_nzombies);
+       return;
+}
+
+void
+sfw_unpack_addtest_req(srpc_msg_t *msg)
+{
+       srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+       LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST);
+       LASSERT (req->tsr_is_client);
+
+       if (msg->msg_magic == SRPC_MSG_MAGIC)
+               return; /* no flipping needed */
+
+       LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+       if (req->tsr_service == SRPC_SERVICE_BRW) {
+               if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+                       test_bulk_req_t *bulk = &req->tsr_u.bulk_v0;
+
+                       __swab32s(&bulk->blk_opc);
+                       __swab32s(&bulk->blk_npg);
+                       __swab32s(&bulk->blk_flags);
+
+               } else {
+                       test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1;
+
+                       __swab16s(&bulk->blk_opc);
+                       __swab16s(&bulk->blk_flags);
+                       __swab32s(&bulk->blk_offset);
+                       __swab32s(&bulk->blk_len);
+               }
+
+               return;
+       }
+
+       if (req->tsr_service == SRPC_SERVICE_PING) {
+               test_ping_req_t *ping = &req->tsr_u.ping;
+
+               __swab32s(&ping->png_size);
+               __swab32s(&ping->png_flags);
+               return;
+       }
+
+       LBUG ();
+       return;
+}
+
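+/*
+ * Create a test instance from an incoming TEST request and attach it to
+ * batch @tsb.  For a server-side test the instance is simply queued; for a
+ * client-side test the destination ids are read from the bulk pages and
+ * tsr_concur test units are created for each destination.
+ */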
+int
+sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc)
+{
+       srpc_msg_t        *msg = &rpc->srpc_reqstbuf->buf_msg;
+       srpc_test_reqst_t   *req = &msg->msg_body.tes_reqst;
+       srpc_bulk_t      *bk = rpc->srpc_bulk;
+       int               ndest = req->tsr_ndest;
+       sfw_test_unit_t     *tsu;
+       sfw_test_instance_t *tsi;
+       int               i;
+       int               rc;
+
+       LIBCFS_ALLOC(tsi, sizeof(*tsi));
+       if (tsi == NULL) {
+               CERROR ("Can't allocate test instance for batch: "LPU64"\n",
+                       tsb->bat_id.bat_id);
+               return -ENOMEM;
+       }
+
+       memset(tsi, 0, sizeof(*tsi));
+       spin_lock_init(&tsi->tsi_lock);
+       atomic_set(&tsi->tsi_nactive, 0);
+       INIT_LIST_HEAD(&tsi->tsi_units);
+       INIT_LIST_HEAD(&tsi->tsi_free_rpcs);
+       INIT_LIST_HEAD(&tsi->tsi_active_rpcs);
+
+       tsi->tsi_stopping      = 0;
+       tsi->tsi_batch   = tsb;
+       tsi->tsi_loop     = req->tsr_loop;
+       tsi->tsi_concur = req->tsr_concur;
+       tsi->tsi_service       = req->tsr_service;
+       tsi->tsi_is_client     = !!(req->tsr_is_client);
+       tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr);
+
+       rc = sfw_load_test(tsi);
+       if (rc != 0) {
+               LIBCFS_FREE(tsi, sizeof(*tsi));
+               return rc;
+       }
+
+       LASSERT (!sfw_batch_active(tsb));
+
+       if (!tsi->tsi_is_client) {
+               /* it's a test server; just add it to the batch */
+               list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+               return 0;
+       }
+
+       LASSERT (bk != NULL);
+       LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest);
+       LASSERT((unsigned int)bk->bk_len >=
+               sizeof(lnet_process_id_packed_t) * ndest);
+
+       sfw_unpack_addtest_req(msg);
+       memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u));
+
+       for (i = 0; i < ndest; i++) {
+               lnet_process_id_packed_t *dests;
+               lnet_process_id_packed_t  id;
+               int                    j;
+
+               dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page);
+               LASSERT (dests != NULL);  /* my pages are within KVM always */
+               id = dests[i % SFW_ID_PER_PAGE];
+               if (msg->msg_magic != SRPC_MSG_MAGIC)
+                       sfw_unpack_id(id);
+
+               for (j = 0; j < tsi->tsi_concur; j++) {
+                       LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t));
+                       if (tsu == NULL) {
+                               rc = -ENOMEM;
+                               CERROR ("Can't allocate tsu for %d\n",
+                                       tsi->tsi_service);
+                               goto error;
+                       }
+
+                       tsu->tsu_dest.nid = id.nid;
+                       tsu->tsu_dest.pid = id.pid;
+                       tsu->tsu_instance = tsi;
+                       tsu->tsu_private  = NULL;
+                       list_add_tail(&tsu->tsu_list, &tsi->tsi_units);
+               }
+       }
+
+       rc = tsi->tsi_ops->tso_init(tsi);
+       if (rc == 0) {
+               list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+               return 0;
+       }
+
+error:
+       LASSERT (rc != 0);
+       sfw_destroy_test_instance(tsi);
+       return rc;
+}
+
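+/*
+ * Called when a test unit has finished its last RPC.  When the last unit
+ * of an instance completes, the enclosing batch's active count is dropped;
+ * if the owning session is already a zombie (no longer the current session)
+ * and none of its batches are still active, the session is destroyed here.
+ */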
+static void
+sfw_test_unit_done (sfw_test_unit_t *tsu)
+{
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_batch_t      *tsb = tsi->tsi_batch;
+       sfw_session_t       *sn = tsb->bat_session;
+
+       LASSERT (sfw_test_active(tsi));
+
+       if (!atomic_dec_and_test(&tsi->tsi_nactive))
+               return;
+
+       /* the test instance is done */
+       spin_lock(&tsi->tsi_lock);
+
+       tsi->tsi_stopping = 0;
+
+       spin_unlock(&tsi->tsi_lock);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */
+           sn == sfw_data.fw_session) {                  /* sn also active */
+               spin_unlock(&sfw_data.fw_lock);
+               return;
+       }
+
+       LASSERT (!list_empty(&sn->sn_list)); /* I'm a zombie! */
+
+       list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+               if (sfw_batch_active(tsb)) {
+                       spin_unlock(&sfw_data.fw_lock);
+                       return;
+               }
+       }
+
+       list_del_init(&sn->sn_list);
+       spin_unlock(&sfw_data.fw_lock);
+
+       sfw_destroy_session(sn);
+       return;
+}
+
+void
+sfw_test_rpc_done (srpc_client_rpc_t *rpc)
+{
+       sfw_test_unit_t     *tsu = rpc->crpc_priv;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       int               done = 0;
+
+       tsi->tsi_ops->tso_done_rpc(tsu, rpc);
+
+       spin_lock(&tsi->tsi_lock);
+
+       LASSERT (sfw_test_active(tsi));
+       LASSERT (!list_empty(&rpc->crpc_list));
+
+       list_del_init(&rpc->crpc_list);
+
+       /* batch is stopping, the loop is done, or an error occurred */
+       if (tsi->tsi_stopping ||
+           tsu->tsu_loop == 0 ||
+           (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr))
+               done = 1;
+
+       /* dec ref for poster */
+       srpc_client_rpc_decref(rpc);
+
+       spin_unlock(&tsi->tsi_lock);
+
+       if (!done) {
+               swi_schedule_workitem(&tsu->tsu_worker);
+               return;
+       }
+
+       sfw_test_unit_done(tsu);
+       return;
+}
+
+int
+sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer,
+                   unsigned features, int nblk, int blklen,
+                   srpc_client_rpc_t **rpcpp)
+{
+       srpc_client_rpc_t   *rpc = NULL;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+       spin_lock(&tsi->tsi_lock);
+
+       LASSERT (sfw_test_active(tsi));
+
+       if (!list_empty(&tsi->tsi_free_rpcs)) {
+               /* reuse an idle RPC from the free list */
+               rpc = list_entry(tsi->tsi_free_rpcs.next,
+                                    srpc_client_rpc_t, crpc_list);
+               LASSERT (nblk == rpc->crpc_bulk.bk_niov);
+               list_del_init(&rpc->crpc_list);
+       }
+
+       spin_unlock(&tsi->tsi_lock);
+
+       if (rpc == NULL) {
+               rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk,
+                                            blklen, sfw_test_rpc_done,
+                                            sfw_test_rpc_fini, tsu);
+       } else {
+               srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk,
+                                    blklen, sfw_test_rpc_done,
+                                    sfw_test_rpc_fini, tsu);
+       }
+
+       if (rpc == NULL) {
+               CERROR("Can't create rpc for test %d\n", tsi->tsi_service);
+               return -ENOMEM;
+       }
+
+       rpc->crpc_reqstmsg.msg_ses_feats = features;
+       *rpcpp = rpc;
+
+       return 0;
+}
+
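+/*
+ * Workitem body for a test unit: ask the test-specific ops to prepare the
+ * next RPC and post it.  If the instance is stopping or RPC preparation
+ * fails, the workitem exits and the unit is retired through
+ * sfw_test_unit_done(); otherwise completion is driven by
+ * sfw_test_rpc_done(), which reschedules this workitem for the next loop.
+ */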
+int
+sfw_run_test (swi_workitem_t *wi)
+{
+       sfw_test_unit_t     *tsu = wi->swi_workitem.wi_data;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       srpc_client_rpc_t   *rpc = NULL;
+
+       LASSERT (wi == &tsu->tsu_worker);
+
+       if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) {
+               LASSERT (rpc == NULL);
+               goto test_done;
+       }
+
+       LASSERT (rpc != NULL);
+
+       spin_lock(&tsi->tsi_lock);
+
+       if (tsi->tsi_stopping) {
+               list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+               spin_unlock(&tsi->tsi_lock);
+               goto test_done;
+       }
+
+       if (tsu->tsu_loop > 0)
+               tsu->tsu_loop--;
+
+       list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs);
+       spin_unlock(&tsi->tsi_lock);
+
+       rpc->crpc_timeout = rpc_timeout;
+
+       spin_lock(&rpc->crpc_lock);
+       srpc_post_rpc(rpc);
+       spin_unlock(&rpc->crpc_lock);
+       return 0;
+
+test_done:
+       /*
+        * No one can schedule me now since:
+        * - the previous RPC, if any, has completed, and
+        * - no new RPC has been initiated, and
+        * - my batch is still active, so no one can run it again now.
+        * Cancel pending schedules and prevent future schedule attempts:
+        */
+       swi_exit_workitem(wi);
+       sfw_test_unit_done(tsu);
+       return 1;
+}
+
+int
+sfw_run_batch (sfw_batch_t *tsb)
+{
+       swi_workitem_t      *wi;
+       sfw_test_unit_t     *tsu;
+       sfw_test_instance_t *tsi;
+
+       if (sfw_batch_active(tsb)) {
+               CDEBUG(D_NET, "Batch already active: "LPU64" (%d)\n",
+                      tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive));
+               return 0;
+       }
+
+       list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+               if (!tsi->tsi_is_client) /* skip server instances */
+                       continue;
+
+               LASSERT (!tsi->tsi_stopping);
+               LASSERT (!sfw_test_active(tsi));
+
+               atomic_inc(&tsb->bat_nactive);
+
+               list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+                       atomic_inc(&tsi->tsi_nactive);
+                       tsu->tsu_loop = tsi->tsi_loop;
+                       wi = &tsu->tsu_worker;
+                       swi_init_workitem(wi, tsu, sfw_run_test,
+                                         lst_sched_test[
+                                         lnet_cpt_of_nid(tsu->tsu_dest.nid)]);
+                       swi_schedule_workitem(wi);
+               }
+       }
+
+       return 0;
+}
+
+int
+sfw_stop_batch (sfw_batch_t *tsb, int force)
+{
+       sfw_test_instance_t *tsi;
+       srpc_client_rpc_t   *rpc;
+
+       if (!sfw_batch_active(tsb)) {
+               CDEBUG(D_NET, "Batch "LPU64" inactive\n", tsb->bat_id.bat_id);
+               return 0;
+       }
+
+       list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+               spin_lock(&tsi->tsi_lock);
+
+               if (!tsi->tsi_is_client ||
+                   !sfw_test_active(tsi) || tsi->tsi_stopping) {
+                       spin_unlock(&tsi->tsi_lock);
+                       continue;
+               }
+
+               tsi->tsi_stopping = 1;
+
+               if (!force) {
+                       spin_unlock(&tsi->tsi_lock);
+                       continue;
+               }
+
+               /* abort launched rpcs in the test */
+               list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) {
+                       spin_lock(&rpc->crpc_lock);
+
+                       srpc_abort_rpc(rpc, -EINTR);
+
+                       spin_unlock(&rpc->crpc_lock);
+               }
+
+               spin_unlock(&tsi->tsi_lock);
+       }
+
+       return 0;
+}
+
+int
+sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply)
+{
+       sfw_test_instance_t *tsi;
+
+       if (testidx < 0)
+               return -EINVAL;
+
+       if (testidx == 0) {
+               reply->bar_active = atomic_read(&tsb->bat_nactive);
+               return 0;
+       }
+
+       list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+               if (testidx-- > 1)
+                       continue;
+
+               reply->bar_active = atomic_read(&tsi->tsi_nactive);
+               return 0;
+       }
+
+       return -ENOENT;
+}
+
+void
+sfw_free_pages (srpc_server_rpc_t *rpc)
+{
+       srpc_free_bulk(rpc->srpc_bulk);
+       rpc->srpc_bulk = NULL;
+}
+
+int
+sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len,
+               int sink)
+{
+       LASSERT(rpc->srpc_bulk == NULL);
+       LASSERT(npages > 0 && npages <= LNET_MAX_IOV);
+
+       rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink);
+       if (rpc->srpc_bulk == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
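+/*
+ * Handle an incoming TEST request: validate the parameters, look up or
+ * create the target batch, and add a test instance to it.  For a
+ * client-side test whose destination id list has not arrived yet
+ * (srpc_bulk == NULL), bulk pages are allocated here and the RPC is
+ * resumed later from sfw_bulk_ready().
+ */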
+int
+sfw_add_test (srpc_server_rpc_t *rpc)
+{
+       sfw_session_t     *sn = sfw_data.fw_session;
+       srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply;
+       srpc_test_reqst_t *request;
+       int             rc;
+       sfw_batch_t       *bat;
+
+       request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst;
+       reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+       if (request->tsr_loop == 0 ||
+           request->tsr_concur == 0 ||
+           request->tsr_sid.ses_nid == LNET_NID_ANY ||
+           request->tsr_ndest > SFW_MAX_NDESTS ||
+           (request->tsr_is_client && request->tsr_ndest == 0) ||
+           request->tsr_concur > SFW_MAX_CONCUR ||
+           request->tsr_service > SRPC_SERVICE_MAX_ID ||
+           request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) {
+               reply->tsr_status = EINVAL;
+               return 0;
+       }
+
+       if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) ||
+           sfw_find_test_case(request->tsr_service) == NULL) {
+               reply->tsr_status = ENOENT;
+               return 0;
+       }
+
+       bat = sfw_bid2batch(request->tsr_bid);
+       if (bat == NULL) {
+               CERROR ("Dropping RPC (%s) from %s under memory pressure.\n",
+                       rpc->srpc_scd->scd_svc->sv_name,
+                       libcfs_id2str(rpc->srpc_peer));
+               return -ENOMEM;
+       }
+
+       if (sfw_batch_active(bat)) {
+               reply->tsr_status = EBUSY;
+               return 0;
+       }
+
+       if (request->tsr_is_client && rpc->srpc_bulk == NULL) {
+               /* rpc will be resumed later in sfw_bulk_ready */
+               int     npg = sfw_id_pages(request->tsr_ndest);
+               int     len;
+
+               if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+                       len = npg * PAGE_CACHE_SIZE;
+
+               } else  {
+                       len = sizeof(lnet_process_id_packed_t) *
+                             request->tsr_ndest;
+               }
+
+               return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1);
+       }
+
+       rc = sfw_add_test_instance(bat, rpc);
+       CDEBUG (rc == 0 ? D_NET : D_WARNING,
+               "%s test: sv %d %s, loop %d, concur %d, ndest %d\n",
+               rc == 0 ? "Added" : "Failed to add", request->tsr_service,
+               request->tsr_is_client ? "client" : "server",
+               request->tsr_loop, request->tsr_concur, request->tsr_ndest);
+
+       reply->tsr_status = (rc < 0) ? -rc : rc;
+       return 0;
+}
+
+int
+sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       int         rc = 0;
+       sfw_batch_t   *bat;
+
+       reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+       if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) {
+               reply->bar_status = ESRCH;
+               return 0;
+       }
+
+       bat = sfw_find_batch(request->bar_bid);
+       if (bat == NULL) {
+               reply->bar_status = ENOENT;
+               return 0;
+       }
+
+       switch (request->bar_opc) {
+       case SRPC_BATCH_OPC_RUN:
+               rc = sfw_run_batch(bat);
+               break;
+
+       case SRPC_BATCH_OPC_STOP:
+               rc = sfw_stop_batch(bat, request->bar_arg);
+               break;
+
+       case SRPC_BATCH_OPC_QUERY:
+               rc = sfw_query_batch(bat, request->bar_testidx, reply);
+               break;
+
+       default:
+               return -EINVAL; /* drop it */
+       }
+
+       reply->bar_status = (rc < 0) ? -rc : rc;
+       return 0;
+}
+
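+/*
+ * Common handler for all framework services: defuse the session timer to
+ * avoid racing with session expiry, mark this RPC active, check the
+ * session feature mask, dispatch on the service id, then re-arm the timer
+ * (unless the framework is shutting down).
+ */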
+int
+sfw_handle_server_rpc(struct srpc_server_rpc *rpc)
+{
+       struct srpc_service     *sv = rpc->srpc_scd->scd_svc;
+       srpc_msg_t     *reply   = &rpc->srpc_replymsg;
+       srpc_msg_t     *request = &rpc->srpc_reqstbuf->buf_msg;
+       unsigned        features = LST_FEATS_MASK;
+       int             rc = 0;
+
+       LASSERT(sfw_data.fw_active_srpc == NULL);
+       LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       if (sfw_data.fw_shuttingdown) {
+               spin_unlock(&sfw_data.fw_lock);
+               return -ESHUTDOWN;
+       }
+
+       /* Remove timer to avoid racing with it or expiring active session */
+       if (sfw_del_session_timer() != 0) {
+               CERROR("Dropping RPC (%s) from %s: racing with expiry timer\n",
+                      sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+               spin_unlock(&sfw_data.fw_lock);
+               return -EAGAIN;
+       }
+
+       sfw_data.fw_active_srpc = rpc;
+       spin_unlock(&sfw_data.fw_lock);
+
+       sfw_unpack_message(request);
+       LASSERT(request->msg_type == srpc_service2request(sv->sv_id));
+
+       /* rpc module should have checked this */
+       LASSERT(request->msg_version == SRPC_MSG_VERSION);
+
+       if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION &&
+           sv->sv_id != SRPC_SERVICE_DEBUG) {
+               sfw_session_t *sn = sfw_data.fw_session;
+
+               if (sn != NULL &&
+                   sn->sn_features != request->msg_ses_feats) {
+                       CNETERR("Features of framework RPC don't match "
+                               "features of current session: %x/%x\n",
+                               request->msg_ses_feats, sn->sn_features);
+                       reply->msg_body.reply.status = EPROTO;
+                       reply->msg_body.reply.sid    = sn->sn_id;
+                       goto out;
+               }
+
+       } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               /* NB: at this point an old version would ignore the features
+                * and create a new session anyway, so the console should be
+                * able to handle this */
+               reply->msg_body.reply.status = EPROTO;
+               goto out;
+       }
+
+       switch(sv->sv_id) {
+       default:
+               LBUG ();
+       case SRPC_SERVICE_TEST:
+               rc = sfw_add_test(rpc);
+               break;
+
+       case SRPC_SERVICE_BATCH:
+               rc = sfw_control_batch(&request->msg_body.bat_reqst,
+                                      &reply->msg_body.bat_reply);
+               break;
+
+       case SRPC_SERVICE_QUERY_STAT:
+               rc = sfw_get_stats(&request->msg_body.stat_reqst,
+                                  &reply->msg_body.stat_reply);
+               break;
+
+       case SRPC_SERVICE_DEBUG:
+               rc = sfw_debug_session(&request->msg_body.dbg_reqst,
+                                      &reply->msg_body.dbg_reply);
+               break;
+
+       case SRPC_SERVICE_MAKE_SESSION:
+               rc = sfw_make_session(&request->msg_body.mksn_reqst,
+                                     &reply->msg_body.mksn_reply);
+               break;
+
+       case SRPC_SERVICE_REMOVE_SESSION:
+               rc = sfw_remove_session(&request->msg_body.rmsn_reqst,
+                                       &reply->msg_body.rmsn_reply);
+               break;
+       }
+
+       if (sfw_data.fw_session != NULL)
+               features = sfw_data.fw_session->sn_features;
+ out:
+       reply->msg_ses_feats = features;
+       rpc->srpc_done = sfw_server_rpc_done;
+       spin_lock(&sfw_data.fw_lock);
+
+       if (!sfw_data.fw_shuttingdown)
+               sfw_add_session_timer();
+
+       sfw_data.fw_active_srpc = NULL;
+       spin_unlock(&sfw_data.fw_lock);
+       return rc;
+}
+
+int
+sfw_bulk_ready(struct srpc_server_rpc *rpc, int status)
+{
+       struct srpc_service     *sv = rpc->srpc_scd->scd_svc;
+       int                     rc;
+
+       LASSERT(rpc->srpc_bulk != NULL);
+       LASSERT(sv->sv_id == SRPC_SERVICE_TEST);
+       LASSERT(sfw_data.fw_active_srpc == NULL);
+       LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       if (status != 0) {
+               CERROR("Bulk transfer failed for RPC: "
+                      "service %s, peer %s, status %d\n",
+                      sv->sv_name, libcfs_id2str(rpc->srpc_peer), status);
+               spin_unlock(&sfw_data.fw_lock);
+               return -EIO;
+       }
+
+       if (sfw_data.fw_shuttingdown) {
+               spin_unlock(&sfw_data.fw_lock);
+               return -ESHUTDOWN;
+       }
+
+       if (sfw_del_session_timer() != 0) {
+               CERROR("Dropping RPC (%s) from %s: racing with expiry timer\n",
+                      sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+               spin_unlock(&sfw_data.fw_lock);
+               return -EAGAIN;
+       }
+
+       sfw_data.fw_active_srpc = rpc;
+       spin_unlock(&sfw_data.fw_lock);
+
+       rc = sfw_add_test(rpc);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       if (!sfw_data.fw_shuttingdown)
+               sfw_add_session_timer();
+
+       sfw_data.fw_active_srpc = NULL;
+       spin_unlock(&sfw_data.fw_lock);
+       return rc;
+}
+
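+/*
+ * Allocate a framework client RPC.  RPCs without bulk are recycled from
+ * the zombie list when possible; otherwise a fresh one is created.  The
+ * session feature mask is stamped into the request message here.
+ */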
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+              unsigned features, int nbulkiov, int bulklen,
+              void (*done)(srpc_client_rpc_t *), void *priv)
+{
+       srpc_client_rpc_t *rpc = NULL;
+
+       spin_lock(&sfw_data.fw_lock);
+
+       LASSERT (!sfw_data.fw_shuttingdown);
+       LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+       if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) {
+               rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+                                    srpc_client_rpc_t, crpc_list);
+               list_del(&rpc->crpc_list);
+
+               srpc_init_client_rpc(rpc, peer, service, 0, 0,
+                                    done, sfw_client_rpc_fini, priv);
+       }
+
+       spin_unlock(&sfw_data.fw_lock);
+
+       if (rpc == NULL) {
+               rpc = srpc_create_client_rpc(peer, service,
+                                            nbulkiov, bulklen, done,
+                                            nbulkiov != 0 ?  NULL :
+                                            sfw_client_rpc_fini,
+                                            priv);
+       }
+
+       if (rpc != NULL) /* "session" is a framework-level concept */
+               rpc->crpc_reqstmsg.msg_ses_feats = features;
+
+       return rpc;
+}
+
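+/*
+ * Byte-swap an incoming framework message in place when its magic shows it
+ * came from a peer with opposite endianness; messages with a matching
+ * magic are left untouched.
+ */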
+void
+sfw_unpack_message (srpc_msg_t *msg)
+{
+       if (msg->msg_magic == SRPC_MSG_MAGIC)
+               return; /* no flipping needed */
+
+       /* srpc module should guarantee I wouldn't get crap */
+       LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+       if (msg->msg_type == SRPC_MSG_STAT_REQST) {
+               srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst;
+
+               __swab32s(&req->str_type);
+               __swab64s(&req->str_rpyid);
+               sfw_unpack_sid(req->str_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_STAT_REPLY) {
+               srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+
+               __swab32s(&rep->str_status);
+               sfw_unpack_sid(rep->str_sid);
+               sfw_unpack_fw_counters(rep->str_fw);
+               sfw_unpack_rpc_counters(rep->str_rpc);
+               sfw_unpack_lnet_counters(rep->str_lnet);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_MKSN_REQST) {
+               srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst;
+
+               __swab64s(&req->mksn_rpyid);
+               __swab32s(&req->mksn_force);
+               sfw_unpack_sid(req->mksn_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_MKSN_REPLY) {
+               srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply;
+
+               __swab32s(&rep->mksn_status);
+               __swab32s(&rep->mksn_timeout);
+               sfw_unpack_sid(rep->mksn_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_RMSN_REQST) {
+               srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst;
+
+               __swab64s(&req->rmsn_rpyid);
+               sfw_unpack_sid(req->rmsn_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_RMSN_REPLY) {
+               srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply;
+
+               __swab32s(&rep->rmsn_status);
+               sfw_unpack_sid(rep->rmsn_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_DEBUG_REQST) {
+               srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst;
+
+               __swab64s(&req->dbg_rpyid);
+               __swab32s(&req->dbg_flags);
+               sfw_unpack_sid(req->dbg_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) {
+               srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply;
+
+               __swab32s(&rep->dbg_nbatch);
+               __swab32s(&rep->dbg_timeout);
+               sfw_unpack_sid(rep->dbg_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_BATCH_REQST) {
+               srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst;
+
+               __swab32s(&req->bar_opc);
+               __swab64s(&req->bar_rpyid);
+               __swab32s(&req->bar_testidx);
+               __swab32s(&req->bar_arg);
+               sfw_unpack_sid(req->bar_sid);
+               __swab64s(&req->bar_bid.bat_id);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_BATCH_REPLY) {
+               srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+               __swab32s(&rep->bar_status);
+               sfw_unpack_sid(rep->bar_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_TEST_REQST) {
+               srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+               __swab64s(&req->tsr_rpyid);
+               __swab64s(&req->tsr_bulkid);
+               __swab32s(&req->tsr_loop);
+               __swab32s(&req->tsr_ndest);
+               __swab32s(&req->tsr_concur);
+               __swab32s(&req->tsr_service);
+               sfw_unpack_sid(req->tsr_sid);
+               __swab64s(&req->tsr_bid.bat_id);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_TEST_REPLY) {
+               srpc_test_reply_t *rep = &msg->msg_body.tes_reply;
+
+               __swab32s(&rep->tsr_status);
+               sfw_unpack_sid(rep->tsr_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_JOIN_REQST) {
+               srpc_join_reqst_t *req = &msg->msg_body.join_reqst;
+
+               __swab64s(&req->join_rpyid);
+               sfw_unpack_sid(req->join_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_JOIN_REPLY) {
+               srpc_join_reply_t *rep = &msg->msg_body.join_reply;
+
+               __swab32s(&rep->join_status);
+               __swab32s(&rep->join_timeout);
+               sfw_unpack_sid(rep->join_sid);
+               return;
+       }
+
+       LBUG ();
+       return;
+}
+
+void
+sfw_abort_rpc (srpc_client_rpc_t *rpc)
+{
+       LASSERT(atomic_read(&rpc->crpc_refcount) > 0);
+       LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+       spin_lock(&rpc->crpc_lock);
+       srpc_abort_rpc(rpc, -EINTR);
+       spin_unlock(&rpc->crpc_lock);
+       return;
+}
+
+void
+sfw_post_rpc (srpc_client_rpc_t *rpc)
+{
+       spin_lock(&rpc->crpc_lock);
+
+       LASSERT (!rpc->crpc_closed);
+       LASSERT (!rpc->crpc_aborted);
+       LASSERT (list_empty(&rpc->crpc_list));
+       LASSERT (!sfw_data.fw_shuttingdown);
+
+       rpc->crpc_timeout = rpc_timeout;
+       srpc_post_rpc(rpc);
+
+       spin_unlock(&rpc->crpc_lock);
+       return;
+}
+
+static srpc_service_t sfw_services[] =
+{
+       {
+               /* sv_id */    SRPC_SERVICE_DEBUG,
+               /* sv_name */  "debug",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_QUERY_STAT,
+               /* sv_name */  "query stats",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_MAKE_SESSION,
+               /* sv_name */  "make session",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_REMOVE_SESSION,
+               /* sv_name */  "remove session",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_BATCH,
+               /* sv_name */  "batch service",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_TEST,
+               /* sv_name */  "test service",
+               0
+       },
+       {
+               /* sv_id */    0,
+               /* sv_name */  NULL,
+               0
+       }
+};
+
+extern sfw_test_client_ops_t ping_test_client;
+extern srpc_service_t  ping_test_service;
+extern void ping_init_test_client(void);
+extern void ping_init_test_service(void);
+
+extern sfw_test_client_ops_t brw_test_client;
+extern srpc_service_t  brw_test_service;
+extern void brw_init_test_client(void);
+extern void brw_init_test_service(void);
+
+
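+/*
+ * Bring up the framework: register the built-in brw and ping tests, add
+ * their server-side services, then add the framework services listed in
+ * sfw_services[] with sfw_handle_server_rpc() as handler (and
+ * sfw_bulk_ready() for the test service) and post their buffers.  Any
+ * failure triggers sfw_shutdown().
+ */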
+int
+sfw_startup (void)
+{
+       int           i;
+       int           rc;
+       int           error;
+       srpc_service_t  *sv;
+       sfw_test_case_t *tsc;
+
+
+       if (session_timeout < 0) {
+               CERROR ("Session timeout must be non-negative: %d\n",
+                       session_timeout);
+               return -EINVAL;
+       }
+
+       if (rpc_timeout < 0) {
+               CERROR ("RPC timeout must be non-negative: %d\n",
+                       rpc_timeout);
+               return -EINVAL;
+       }
+
+       if (session_timeout == 0)
+               CWARN ("Zero session_timeout specified "
+                      "- test sessions never expire.\n");
+
+       if (rpc_timeout == 0)
+               CWARN ("Zero rpc_timeout specified "
+                      "- test RPCs never expire.\n");
+
+       memset(&sfw_data, 0, sizeof(struct smoketest_framework));
+
+       sfw_data.fw_session     = NULL;
+       sfw_data.fw_active_srpc = NULL;
+       spin_lock_init(&sfw_data.fw_lock);
+       atomic_set(&sfw_data.fw_nzombies, 0);
+       INIT_LIST_HEAD(&sfw_data.fw_tests);
+       INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs);
+       INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions);
+
+       brw_init_test_client();
+       brw_init_test_service();
+       rc = sfw_register_test(&brw_test_service, &brw_test_client);
+       LASSERT (rc == 0);
+
+       ping_init_test_client();
+       ping_init_test_service();
+       rc = sfw_register_test(&ping_test_service, &ping_test_client);
+       LASSERT (rc == 0);
+
+       error = 0;
+       list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+               sv = tsc->tsc_srv_service;
+
+               rc = srpc_add_service(sv);
+               LASSERT (rc != -EBUSY);
+               if (rc != 0) {
+                       CWARN ("Failed to add %s service: %d\n",
+                              sv->sv_name, rc);
+                       error = rc;
+               }
+       }
+
+       for (i = 0; ; i++) {
+               sv = &sfw_services[i];
+               if (sv->sv_name == NULL) break;
+
+               sv->sv_bulk_ready = NULL;
+               sv->sv_handler    = sfw_handle_server_rpc;
+               sv->sv_wi_total   = SFW_FRWK_WI_MAX;
+               if (sv->sv_id == SRPC_SERVICE_TEST)
+                       sv->sv_bulk_ready = sfw_bulk_ready;
+
+               rc = srpc_add_service(sv);
+               LASSERT (rc != -EBUSY);
+               if (rc != 0) {
+                       CWARN ("Failed to add %s service: %d\n",
+                              sv->sv_name, rc);
+                       error = rc;
+               }
+
+               /* about to call sfw_shutdown(), no need to add buffers */
+               if (error) continue;
+
+               rc = srpc_service_add_buffers(sv, sv->sv_wi_total);
+               if (rc != 0) {
+                       CWARN("Failed to reserve enough buffers: "
+                             "service %s, %d needed: %d\n",
+                             sv->sv_name, sv->sv_wi_total, rc);
+                       error = -ENOMEM;
+               }
+       }
+
+       if (error != 0)
+               sfw_shutdown();
+       return error;
+}
+
+void
+sfw_shutdown (void)
+{
+       srpc_service_t  *sv;
+       sfw_test_case_t *tsc;
+       int              i;
+
+       spin_lock(&sfw_data.fw_lock);
+
+       sfw_data.fw_shuttingdown = 1;
+       lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock,
+                      "waiting for active RPC to finish.\n");
+
+       if (sfw_del_session_timer() != 0)
+               lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock,
+                              "waiting for session timer to explode.\n");
+
+       sfw_deactivate_session();
+       lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0,
+                      sfw_data.fw_lock,
+                      "waiting for %d zombie sessions to die.\n",
+                      atomic_read(&sfw_data.fw_nzombies));
+
+       spin_unlock(&sfw_data.fw_lock);
+
+       for (i = 0; ; i++) {
+               sv = &sfw_services[i];
+               if (sv->sv_name == NULL)
+                       break;
+
+               srpc_shutdown_service(sv);
+               srpc_remove_service(sv);
+       }
+
+       list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+               sv = tsc->tsc_srv_service;
+               srpc_shutdown_service(sv);
+               srpc_remove_service(sv);
+       }
+
+       while (!list_empty(&sfw_data.fw_zombie_rpcs)) {
+               srpc_client_rpc_t *rpc;
+
+               rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+                                    srpc_client_rpc_t, crpc_list);
+               list_del(&rpc->crpc_list);
+
+               LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+       }
+
+       for (i = 0; ; i++) {
+               sv = &sfw_services[i];
+               if (sv->sv_name == NULL)
+                       break;
+
+               srpc_wait_service_shutdown(sv);
+       }
+
+       while (!list_empty(&sfw_data.fw_tests)) {
+               tsc = list_entry(sfw_data.fw_tests.next,
+                                    sfw_test_case_t, tsc_list);
+
+               srpc_wait_service_shutdown(tsc->tsc_srv_service);
+
+               list_del(&tsc->tsc_list);
+               LIBCFS_FREE(tsc, sizeof(*tsc));
+       }
+
+       return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c
new file mode 100644 (file)
index 0000000..5257e56
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+enum {
+       LST_INIT_NONE           = 0,
+       LST_INIT_WI_SERIAL,
+       LST_INIT_WI_TEST,
+       LST_INIT_RPC,
+       LST_INIT_FW,
+       LST_INIT_CONSOLE
+};
+
+extern int lstcon_console_init(void);
+extern int lstcon_console_fini(void);
+
+static int lst_init_step = LST_INIT_NONE;
+
+struct cfs_wi_sched *lst_sched_serial;
+struct cfs_wi_sched **lst_sched_test;
+
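+/*
+ * Tear down in the reverse order of lnet_selftest_init(); the switch falls
+ * through deliberately so that a partial initialisation (tracked by
+ * lst_init_step) is unwound from wherever it stopped.
+ */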
+void
+lnet_selftest_fini(void)
+{
+       int     i;
+
+       switch (lst_init_step) {
+               case LST_INIT_CONSOLE:
+                       lstcon_console_fini();
+               case LST_INIT_FW:
+                       sfw_shutdown();
+               case LST_INIT_RPC:
+                       srpc_shutdown();
+               case LST_INIT_WI_TEST:
+                       for (i = 0;
+                            i < cfs_cpt_number(lnet_cpt_table()); i++) {
+                               if (lst_sched_test[i] == NULL)
+                                       continue;
+                               cfs_wi_sched_destroy(lst_sched_test[i]);
+                       }
+                       LIBCFS_FREE(lst_sched_test,
+                                   sizeof(lst_sched_test[0]) *
+                                   cfs_cpt_number(lnet_cpt_table()));
+                       lst_sched_test = NULL;
+
+               case LST_INIT_WI_SERIAL:
+                       cfs_wi_sched_destroy(lst_sched_serial);
+                       lst_sched_serial = NULL;
+               case LST_INIT_NONE:
+                       break;
+               default:
+                       LBUG();
+       }
+       return;
+}
+
+void
+lnet_selftest_structure_assertion(void)
+{
+       CLASSERT(sizeof(srpc_msg_t) == 160);
+       CLASSERT(sizeof(srpc_test_reqst_t) == 70);
+       CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72);
+       CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78);
+       CLASSERT(sizeof(srpc_stat_reply_t) == 136);
+       CLASSERT(sizeof(srpc_stat_reqst_t) == 28);
+}
+
+int
+lnet_selftest_init(void)
+{
+       int     nscheds;
+       int     rc;
+       int     i;
+
+       rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY,
+                                1, &lst_sched_serial);
+       if (rc != 0) {
+               CERROR("Failed to create serial WI scheduler for LST\n");
+               return rc;
+       }
+       lst_init_step = LST_INIT_WI_SERIAL;
+
+       nscheds = cfs_cpt_number(lnet_cpt_table());
+       LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds);
+       if (lst_sched_test == NULL)
+               goto error;
+
+       lst_init_step = LST_INIT_WI_TEST;
+       for (i = 0; i < nscheds; i++) {
+               int nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+
+               /* reserve at least one CPU for LND */
+               nthrs = max(nthrs - 1, 1);
+               rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i,
+                                        nthrs, &lst_sched_test[i]);
+               if (rc != 0) {
+                       CERROR("Failed to create CPT affinity WI scheduler "
+                              "%d for LST\n", i);
+                       goto error;
+               }
+       }
+
+       rc = srpc_startup();
+       if (rc != 0) {
+               CERROR("LST can't start up RPC\n");
+               goto error;
+       }
+       lst_init_step = LST_INIT_RPC;
+
+       rc = sfw_startup();
+       if (rc != 0) {
+               CERROR("LST can't start up framework\n");
+               goto error;
+       }
+       lst_init_step = LST_INIT_FW;
+
+       rc = lstcon_console_init();
+       if (rc != 0) {
+               CERROR("LST can't start up console\n");
+               goto error;
+       }
+       lst_init_step = LST_INIT_CONSOLE;
+       return 0;
+error:
+       lnet_selftest_fini();
+       return rc;
+}
+
+
+MODULE_DESCRIPTION("LNet Selftest");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "0.9.0", lnet_selftest_init, lnet_selftest_fini);
diff --git a/drivers/staging/lustre/lnet/selftest/ping_test.c b/drivers/staging/lustre/lnet/selftest/ping_test.c
new file mode 100644 (file)
index 0000000..f0f9194
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/ping_test.c
+ *
+ * Ping test client & server
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+#define LST_PING_TEST_MAGIC     0xbabeface
+
+int ping_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(ping_srv_workitems, "i", int, 0644, "# PING server workitems");
+
+typedef struct {
+       spinlock_t      pnd_lock;       /* serialize */
+       int             pnd_counter;    /* sequence counter */
+} lst_ping_data_t;
+
+static lst_ping_data_t  lst_ping_data;
+
+static int
+ping_client_init(sfw_test_instance_t *tsi)
+{
+       sfw_session_t *sn = tsi->tsi_batch->bat_session;
+
+       LASSERT(tsi->tsi_is_client);
+       LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+       spin_lock_init(&lst_ping_data.pnd_lock);
+       lst_ping_data.pnd_counter = 0;
+
+       return 0;
+}
+
+static void
+ping_client_fini (sfw_test_instance_t *tsi)
+{
+       sfw_session_t *sn = tsi->tsi_batch->bat_session;
+       int         errors;
+
+       LASSERT (sn != NULL);
+       LASSERT (tsi->tsi_is_client);
+
+       errors = atomic_read(&sn->sn_ping_errors);
+       if (errors)
+               CWARN ("%d pings have failed.\n", errors);
+       else
+               CDEBUG (D_NET, "Ping test finished OK.\n");
+}
+
+static int
+ping_client_prep_rpc(sfw_test_unit_t *tsu,
+                    lnet_process_id_t dest, srpc_client_rpc_t **rpc)
+{
+       srpc_ping_reqst_t   *req;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_session_t       *sn  = tsi->tsi_batch->bat_session;
+       struct timeval       tv;
+       int                  rc;
+
+       LASSERT(sn != NULL);
+       LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+       rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc);
+       if (rc != 0)
+               return rc;
+
+       req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst;
+
+       req->pnr_magic = LST_PING_TEST_MAGIC;
+
+       spin_lock(&lst_ping_data.pnd_lock);
+       req->pnr_seq = lst_ping_data.pnd_counter++;
+       spin_unlock(&lst_ping_data.pnd_lock);
+
+       cfs_fs_timeval(&tv);
+       req->pnr_time_sec  = tv.tv_sec;
+       req->pnr_time_usec = tv.tv_usec;
+
+       return rc;
+}
+
+static void
+ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+       srpc_ping_reqst_t   *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
+       srpc_ping_reply_t   *reply = &rpc->crpc_replymsg.msg_body.ping_reply;
+       struct timeval       tv;
+
+       LASSERT (sn != NULL);
+
+       if (rpc->crpc_status != 0) {
+               if (!tsi->tsi_stopping) /* rpc could have been aborted */
+                       atomic_inc(&sn->sn_ping_errors);
+               CERROR ("Unable to ping %s (%d): %d\n",
+                       libcfs_id2str(rpc->crpc_dest),
+                       reqst->pnr_seq, rpc->crpc_status);
+               return;
+       }
+
+       if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
+               __swab32s(&reply->pnr_seq);
+               __swab32s(&reply->pnr_magic);
+               __swab32s(&reply->pnr_status);
+       }
+
+       if (reply->pnr_magic != LST_PING_TEST_MAGIC) {
+               rpc->crpc_status = -EBADMSG;
+               atomic_inc(&sn->sn_ping_errors);
+               CERROR ("Bad magic %u from %s, %u expected.\n",
+                       reply->pnr_magic, libcfs_id2str(rpc->crpc_dest),
+                       LST_PING_TEST_MAGIC);
+               return;
+       }
+
+       if (reply->pnr_seq != reqst->pnr_seq) {
+               rpc->crpc_status = -EBADMSG;
+               atomic_inc(&sn->sn_ping_errors);
+               CERROR ("Bad seq %u from %s, %u expected.\n",
+                       reply->pnr_seq, libcfs_id2str(rpc->crpc_dest),
+                       reqst->pnr_seq);
+               return;
+       }
+
+       cfs_fs_timeval(&tv);
+       CDEBUG (D_NET, "%d reply in %u usec\n", reply->pnr_seq,
+               (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000
+                          + (tv.tv_usec - reqst->pnr_time_usec)));
+       return;
+}
+
+static int
+ping_server_handle(struct srpc_server_rpc *rpc)
+{
+       struct srpc_service     *sv  = rpc->srpc_scd->scd_svc;
+       srpc_msg_t      *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+       srpc_msg_t        *replymsg = &rpc->srpc_replymsg;
+       srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst;
+       srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply;
+
+       LASSERT (sv->sv_id == SRPC_SERVICE_PING);
+
+       if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+               LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+               __swab32s(&req->pnr_seq);
+               __swab32s(&req->pnr_magic);
+               __swab64s(&req->pnr_time_sec);
+               __swab64s(&req->pnr_time_usec);
+       }
+       LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id));
+
+       if (req->pnr_magic != LST_PING_TEST_MAGIC) {
+               CERROR ("Unexpected magic %08x from %s\n",
+                       req->pnr_magic, libcfs_id2str(rpc->srpc_peer));
+               return -EINVAL;
+       }
+
+       rep->pnr_seq   = req->pnr_seq;
+       rep->pnr_magic = LST_PING_TEST_MAGIC;
+
+       if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               replymsg->msg_ses_feats = LST_FEATS_MASK;
+               rep->pnr_status = EPROTO;
+               return 0;
+       }
+
+       replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+       CDEBUG(D_NET, "Get ping %d from %s\n",
+              req->pnr_seq, libcfs_id2str(rpc->srpc_peer));
+       return 0;
+}
+
+sfw_test_client_ops_t ping_test_client;
+void ping_init_test_client(void)
+{
+       ping_test_client.tso_init     = ping_client_init;
+       ping_test_client.tso_fini     = ping_client_fini;
+       ping_test_client.tso_prep_rpc = ping_client_prep_rpc;
+       ping_test_client.tso_done_rpc = ping_client_done_rpc;
+}
+
+srpc_service_t ping_test_service;
+void ping_init_test_service(void)
+{
+       ping_test_service.sv_id       = SRPC_SERVICE_PING;
+       ping_test_service.sv_name     = "ping_test";
+       ping_test_service.sv_handler  = ping_server_handle;
+       ping_test_service.sv_wi_total = ping_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c
new file mode 100644 (file)
index 0000000..91d83f4
--- /dev/null
@@ -0,0 +1,1665 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/rpc.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *
+ * 2012-05-13: Liang Zhen <liang@whamcloud.com>
+ * - percpt data for service to improve smp performance
+ * - code cleanup
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+typedef enum {
+       SRPC_STATE_NONE,
+       SRPC_STATE_NI_INIT,
+       SRPC_STATE_EQ_INIT,
+       SRPC_STATE_RUNNING,
+       SRPC_STATE_STOPPING,
+} srpc_state_t;
+
+struct smoketest_rpc {
+       spinlock_t       rpc_glock;     /* global lock */
+       srpc_service_t  *rpc_services[SRPC_SERVICE_MAX_ID + 1];
+       lnet_handle_eq_t rpc_lnet_eq;   /* _the_ LNet event queue */
+       srpc_state_t     rpc_state;
+       srpc_counters_t  rpc_counters;
+       __u64            rpc_matchbits; /* matchbits counter */
+} srpc_data;
+
+static inline int
+srpc_serv_portal(int svc_id)
+{
+       return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ?
+              SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL;
+}
+
+/* forward ref's */
+int srpc_handle_rpc (swi_workitem_t *wi);
+
+void srpc_get_counters (srpc_counters_t *cnt)
+{
+       spin_lock(&srpc_data.rpc_glock);
+       *cnt = srpc_data.rpc_counters;
+       spin_unlock(&srpc_data.rpc_glock);
+}
+
+void srpc_set_counters (const srpc_counters_t *cnt)
+{
+       spin_lock(&srpc_data.rpc_glock);
+       srpc_data.rpc_counters = *cnt;
+       spin_unlock(&srpc_data.rpc_glock);
+}
+
+int
+srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob)
+{
+       nob = min(nob, (int)PAGE_CACHE_SIZE);
+
+       LASSERT(nob > 0);
+       LASSERT(i >= 0 && i < bk->bk_niov);
+
+       bk->bk_iovs[i].kiov_offset = 0;
+       bk->bk_iovs[i].kiov_page   = pg;
+       bk->bk_iovs[i].kiov_len    = nob;
+       return nob;
+}
+
+void
+srpc_free_bulk (srpc_bulk_t *bk)
+{
+       int      i;
+       struct page *pg;
+
+       LASSERT (bk != NULL);
+
+       for (i = 0; i < bk->bk_niov; i++) {
+               pg = bk->bk_iovs[i].kiov_page;
+               if (pg == NULL) break;
+
+               __free_page(pg);
+       }
+
+       LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov]));
+       return;
+}
+
+srpc_bulk_t *
+srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink)
+{
+       srpc_bulk_t  *bk;
+       int           i;
+
+       LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV);
+
+       LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt,
+                        offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+       if (bk == NULL) {
+               CERROR("Can't allocate descriptor for %d pages\n", bulk_npg);
+               return NULL;
+       }
+
+       memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+       bk->bk_sink   = sink;
+       bk->bk_len    = bulk_len;
+       bk->bk_niov   = bulk_npg;
+
+       for (i = 0; i < bulk_npg; i++) {
+               struct page *pg;
+               int         nob;
+
+               pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_IOFS);
+               if (pg == NULL) {
+                       CERROR("Can't allocate page %d of %d\n", i, bulk_npg);
+                       srpc_free_bulk(bk);
+                       return NULL;
+               }
+
+               nob = srpc_add_bulk_page(bk, pg, i, bulk_len);
+               bulk_len -= nob;
+       }
+
+       return bk;
+}
+
+static inline __u64
+srpc_next_id (void)
+{
+       __u64 id;
+
+       spin_lock(&srpc_data.rpc_glock);
+       id = srpc_data.rpc_matchbits++;
+       spin_unlock(&srpc_data.rpc_glock);
+       return id;
+}
+
+void
+srpc_init_server_rpc(struct srpc_server_rpc *rpc,
+                    struct srpc_service_cd *scd,
+                    struct srpc_buffer *buffer)
+{
+       memset(rpc, 0, sizeof(*rpc));
+       swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc,
+                         srpc_serv_is_framework(scd->scd_svc) ?
+                         lst_sched_serial : lst_sched_test[scd->scd_cpt]);
+
+       rpc->srpc_ev.ev_fired = 1; /* no event expected now */
+
+       rpc->srpc_scd      = scd;
+       rpc->srpc_reqstbuf = buffer;
+       rpc->srpc_peer     = buffer->buf_peer;
+       rpc->srpc_self     = buffer->buf_self;
+       LNetInvalidateHandle(&rpc->srpc_replymdh);
+}
+
+static void
+srpc_service_fini(struct srpc_service *svc)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       struct srpc_buffer      *buf;
+       struct list_head                *q;
+       int                     i;
+
+       if (svc->sv_cpt_data == NULL)
+               return;
+
+       cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+               while (1) {
+                       if (!list_empty(&scd->scd_buf_posted))
+                               q = &scd->scd_buf_posted;
+                       else if (!list_empty(&scd->scd_buf_blocked))
+                               q = &scd->scd_buf_blocked;
+                       else
+                               break;
+
+                       while (!list_empty(q)) {
+                               buf = list_entry(q->next,
+                                                    struct srpc_buffer,
+                                                    buf_list);
+                               list_del(&buf->buf_list);
+                               LIBCFS_FREE(buf, sizeof(*buf));
+                       }
+               }
+
+               LASSERT(list_empty(&scd->scd_rpc_active));
+
+               while (!list_empty(&scd->scd_rpc_free)) {
+                       rpc = list_entry(scd->scd_rpc_free.next,
+                                            struct srpc_server_rpc,
+                                            srpc_list);
+                       list_del(&rpc->srpc_list);
+                       LIBCFS_FREE(rpc, sizeof(*rpc));
+               }
+       }
+
+       cfs_percpt_free(svc->sv_cpt_data);
+       svc->sv_cpt_data = NULL;
+}
+
+static int
+srpc_service_nrpcs(struct srpc_service *svc)
+{
+       int nrpcs = svc->sv_wi_total / svc->sv_ncpts;
+
+       return srpc_serv_is_framework(svc) ?
+              max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN);
+}
+
+int srpc_add_buffer(struct swi_workitem *wi);
+
+static int
+srpc_service_init(struct srpc_service *svc)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       int                     nrpcs;
+       int                     i;
+       int                     j;
+
+       svc->sv_shuttingdown = 0;
+
+       svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(),
+                                           sizeof(struct srpc_service_cd));
+       if (svc->sv_cpt_data == NULL)
+               return -ENOMEM;
+
+       svc->sv_ncpts = srpc_serv_is_framework(svc) ?
+                       1 : cfs_cpt_number(lnet_cpt_table());
+       nrpcs = srpc_service_nrpcs(svc);
+
+       cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+               scd->scd_cpt = i;
+               scd->scd_svc = svc;
+               spin_lock_init(&scd->scd_lock);
+               INIT_LIST_HEAD(&scd->scd_rpc_free);
+               INIT_LIST_HEAD(&scd->scd_rpc_active);
+               INIT_LIST_HEAD(&scd->scd_buf_posted);
+               INIT_LIST_HEAD(&scd->scd_buf_blocked);
+
+               scd->scd_ev.ev_data = scd;
+               scd->scd_ev.ev_type = SRPC_REQUEST_RCVD;
+
+               /* NB: don't use lst_sched_serial for adding buffer,
+                * see details in srpc_service_add_buffers() */
+               swi_init_workitem(&scd->scd_buf_wi, scd,
+                                 srpc_add_buffer, lst_sched_test[i]);
+
+               if (i != 0 && srpc_serv_is_framework(svc)) {
+                       /* NB: framework service only needs srpc_service_cd for
+                        * one partition, but we allocate for all partitions
+                        * to simplify the implementation; it wastes a little
+                        * memory but nobody should care about this */
+                       continue;
+               }
+
+               for (j = 0; j < nrpcs; j++) {
+                       LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(),
+                                        i, sizeof(*rpc));
+                       if (rpc == NULL) {
+                               srpc_service_fini(svc);
+                               return -ENOMEM;
+                       }
+                       list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+               }
+       }
+
+       return 0;
+}
+
+int
+srpc_add_service(struct srpc_service *sv)
+{
+       int id = sv->sv_id;
+
+       LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID);
+
+       if (srpc_service_init(sv) != 0)
+               return -ENOMEM;
+
+       spin_lock(&srpc_data.rpc_glock);
+
+       LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+       if (srpc_data.rpc_services[id] != NULL) {
+               spin_unlock(&srpc_data.rpc_glock);
+               goto failed;
+       }
+
+       srpc_data.rpc_services[id] = sv;
+       spin_unlock(&srpc_data.rpc_glock);
+
+       CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name);
+       return 0;
+
+ failed:
+       srpc_service_fini(sv);
+       return -EBUSY;
+}
+
+int
+srpc_remove_service (srpc_service_t *sv)
+{
+       int id = sv->sv_id;
+
+       spin_lock(&srpc_data.rpc_glock);
+
+       if (srpc_data.rpc_services[id] != sv) {
+               spin_unlock(&srpc_data.rpc_glock);
+               return -ENOENT;
+       }
+
+       srpc_data.rpc_services[id] = NULL;
+       spin_unlock(&srpc_data.rpc_glock);
+       return 0;
+}
+
+int
+srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf,
+                      int len, int options, lnet_process_id_t peer,
+                      lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+       int              rc;
+       lnet_md_t        md;
+       lnet_handle_me_t meh;
+
+       rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK,
+                         local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh);
+       if (rc != 0) {
+               CERROR ("LNetMEAttach failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+               return -ENOMEM;
+       }
+
+       md.threshold = 1;
+       md.user_ptr  = ev;
+       md.start     = buf;
+       md.length    = len;
+       md.options   = options;
+       md.eq_handle = srpc_data.rpc_lnet_eq;
+
+       rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
+       if (rc != 0) {
+               CERROR ("LNetMDAttach failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+
+               rc = LNetMEUnlink(meh);
+               LASSERT (rc == 0);
+               return -ENOMEM;
+       }
+
+       CDEBUG (D_NET,
+               "Posted passive RDMA: peer %s, portal %d, matchbits "LPX64"\n",
+               libcfs_id2str(peer), portal, matchbits);
+       return 0;
+}
+
+int
+srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len,
+                     int options, lnet_process_id_t peer, lnet_nid_t self,
+                     lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+       int       rc;
+       lnet_md_t md;
+
+       md.user_ptr  = ev;
+       md.start     = buf;
+       md.length    = len;
+       md.eq_handle = srpc_data.rpc_lnet_eq;
+       md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1;
+       md.options   = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET);
+
+       rc = LNetMDBind(md, LNET_UNLINK, mdh);
+       if (rc != 0) {
+               CERROR ("LNetMDBind failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+               return -ENOMEM;
+       }
+
+       /* This is kind of an abuse of the LNET_MD_OP_{PUT,GET} options.
+        * They're only meaningful for MDs attached to an ME (i.e. passive
+        * buffers). */
+       if ((options & LNET_MD_OP_PUT) != 0) {
+               rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer,
+                            portal, matchbits, 0, 0);
+       } else {
+               LASSERT ((options & LNET_MD_OP_GET) != 0);
+
+               rc = LNetGet(self, *mdh, peer, portal, matchbits, 0);
+       }
+
+       if (rc != 0) {
+               CERROR ("LNet%s(%s, %d, "LPD64") failed: %d\n",
+                       ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get",
+                       libcfs_id2str(peer), portal, matchbits, rc);
+
+               /* The forthcoming unlink event will complete this operation
+                * with failure, so fall through and return success here.
+                */
+               rc = LNetMDUnlink(*mdh);
+               LASSERT (rc == 0);
+       } else {
+               CDEBUG (D_NET,
+                       "Posted active RDMA: peer %s, portal %u, matchbits "LPX64"\n",
+                       libcfs_id2str(peer), portal, matchbits);
+       }
+       return 0;
+}
+
+int
+srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf,
+                       int len, lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+       return srpc_post_active_rdma(srpc_serv_portal(service), service,
+                                    buf, len, LNET_MD_OP_PUT, peer,
+                                    LNET_NID_ANY, mdh, ev);
+}
+
+int
+srpc_post_passive_rqtbuf(int service, int local, void *buf, int len,
+                        lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+       lnet_process_id_t any = {0};
+
+       any.nid = LNET_NID_ANY;
+       any.pid = LNET_PID_ANY;
+
+       return srpc_post_passive_rdma(srpc_serv_portal(service),
+                                     local, service, buf, len,
+                                     LNET_MD_OP_PUT, any, mdh, ev);
+}
+
+int
+srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf)
+{
+       struct srpc_service     *sv = scd->scd_svc;
+       struct srpc_msg         *msg = &buf->buf_msg;
+       int                     rc;
+
+       LNetInvalidateHandle(&buf->buf_mdh);
+       list_add(&buf->buf_list, &scd->scd_buf_posted);
+       scd->scd_buf_nposted++;
+       spin_unlock(&scd->scd_lock);
+
+       rc = srpc_post_passive_rqtbuf(sv->sv_id,
+                                     !srpc_serv_is_framework(sv),
+                                     msg, sizeof(*msg), &buf->buf_mdh,
+                                     &scd->scd_ev);
+
+       /* At this point, an RPC (new or delayed) may have arrived in
+        * msg and its event handler has been called. So we must add
+        * buf to scd_buf_posted _before_ dropping scd_lock */
+
+       spin_lock(&scd->scd_lock);
+
+       if (rc == 0) {
+               if (!sv->sv_shuttingdown)
+                       return 0;
+
+               spin_unlock(&scd->scd_lock);
+               /* srpc_shutdown_service might have tried to unlink me
+                * when my buf_mdh was still invalid */
+               LNetMDUnlink(buf->buf_mdh);
+               spin_lock(&scd->scd_lock);
+               return 0;
+       }
+
+       scd->scd_buf_nposted--;
+       if (sv->sv_shuttingdown)
+               return rc; /* don't allow to change scd_buf_posted */
+
+       list_del(&buf->buf_list);
+       spin_unlock(&scd->scd_lock);
+
+       LIBCFS_FREE(buf, sizeof(*buf));
+
+       spin_lock(&scd->scd_lock);
+       return rc;
+}
+
+int
+srpc_add_buffer(struct swi_workitem *wi)
+{
+       struct srpc_service_cd  *scd = wi->swi_workitem.wi_data;
+       struct srpc_buffer      *buf;
+       int                     rc = 0;
+
+       /* this is called by workitem scheduler threads; these threads
+        * have been set up with CPT affinity, so buffers will be posted
+        * on the CPT-local list of the portal */
+       spin_lock(&scd->scd_lock);
+
+       while (scd->scd_buf_adjust > 0 &&
+              !scd->scd_svc->sv_shuttingdown) {
+               scd->scd_buf_adjust--; /* consume it */
+               scd->scd_buf_posting++;
+
+               spin_unlock(&scd->scd_lock);
+
+               LIBCFS_ALLOC(buf, sizeof(*buf));
+               if (buf == NULL) {
+                       CERROR("Failed to add new buf to service: %s\n",
+                              scd->scd_svc->sv_name);
+                       spin_lock(&scd->scd_lock);
+                       rc = -ENOMEM;
+                       break;
+               }
+
+               spin_lock(&scd->scd_lock);
+               if (scd->scd_svc->sv_shuttingdown) {
+                       spin_unlock(&scd->scd_lock);
+                       LIBCFS_FREE(buf, sizeof(*buf));
+
+                       spin_lock(&scd->scd_lock);
+                       rc = -ESHUTDOWN;
+                       break;
+               }
+
+               rc = srpc_service_post_buffer(scd, buf);
+               if (rc != 0)
+                       break; /* buf has been freed inside */
+
+               LASSERT(scd->scd_buf_posting > 0);
+               scd->scd_buf_posting--;
+               scd->scd_buf_total++;
+               scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4);
+       }
+
+       if (rc != 0) {
+               scd->scd_buf_err_stamp = cfs_time_current_sec();
+               scd->scd_buf_err = rc;
+
+               LASSERT(scd->scd_buf_posting > 0);
+               scd->scd_buf_posting--;
+       }
+
+       spin_unlock(&scd->scd_lock);
+       return 0;
+}
+
+int
+srpc_service_add_buffers(struct srpc_service *sv, int nbuffer)
+{
+       struct srpc_service_cd  *scd;
+       int                     rc = 0;
+       int                     i;
+
+       LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+
+               scd->scd_buf_err = 0;
+               scd->scd_buf_err_stamp = 0;
+               scd->scd_buf_posting = 0;
+               scd->scd_buf_adjust = nbuffer;
+               /* start to post buffers */
+               swi_schedule_workitem(&scd->scd_buf_wi);
+               spin_unlock(&scd->scd_lock);
+
+               /* framework service only posts buffers for one partition */
+               if (srpc_serv_is_framework(sv))
+                       break;
+       }
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+               /*
+                * NB: srpc_service_add_buffers() can be called in the
+                * thread context of lst_sched_serial, and we don't normally
+                * allow sleeping in the thread context of a WI scheduler
+                * because it blocks the current scheduler thread from doing
+                * anything else; even worse, it could deadlock if it's
+                * waiting on the result of another WI of the same scheduler.
+                * However, it's safe here because scd_buf_wi is scheduled
+                * by a thread in a different WI scheduler (lst_sched_test),
+                * so there is no risk of deadlock, though this could
+                * block all WIs pending on lst_sched_serial for a moment,
+                * which is not good but not fatal.
+                */
+               lst_wait_until(scd->scd_buf_err != 0 ||
+                              (scd->scd_buf_adjust == 0 &&
+                               scd->scd_buf_posting == 0),
+                              scd->scd_lock, "waiting for adding buffer\n");
+
+               if (scd->scd_buf_err != 0 && rc == 0)
+                       rc = scd->scd_buf_err;
+
+               spin_unlock(&scd->scd_lock);
+       }
+
+       return rc;
+}
+
+void
+srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer)
+{
+       struct srpc_service_cd  *scd;
+       int                     num;
+       int                     i;
+
+       LASSERT(!sv->sv_shuttingdown);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+
+               num = scd->scd_buf_total + scd->scd_buf_posting;
+               scd->scd_buf_adjust -= min(nbuffer, num);
+
+               spin_unlock(&scd->scd_lock);
+       }
+}
+
+/* returns 1 if sv has finished, otherwise 0 */
+int
+srpc_finish_service(struct srpc_service *sv)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       int                     i;
+
+       LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+               if (!swi_deschedule_workitem(&scd->scd_buf_wi)) {
+                       spin_unlock(&scd->scd_lock);
+                       return 0;
+               }
+
+               if (scd->scd_buf_nposted > 0) {
+                       CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n",
+                              scd->scd_buf_nposted);
+                       spin_unlock(&scd->scd_lock);
+                       return 0;
+               }
+
+               if (list_empty(&scd->scd_rpc_active)) {
+                       spin_unlock(&scd->scd_lock);
+                       continue;
+               }
+
+               rpc = list_entry(scd->scd_rpc_active.next,
+                                    struct srpc_server_rpc, srpc_list);
+               CNETERR("Active RPC %p on shutdown: sv %s, peer %s, "
+                       "wi %s scheduled %d running %d, "
+                       "ev fired %d type %d status %d lnet %d\n",
+                       rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+                       swi_state2str(rpc->srpc_wi.swi_state),
+                       rpc->srpc_wi.swi_workitem.wi_scheduled,
+                       rpc->srpc_wi.swi_workitem.wi_running,
+                       rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type,
+                       rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet);
+               spin_unlock(&scd->scd_lock);
+               return 0;
+       }
+
+       /* no lock needed from now on */
+       srpc_service_fini(sv);
+       return 1;
+}
+
+/* called with scd->scd_lock held */
+void
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf)
+{
+       if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) {
+               if (srpc_service_post_buffer(scd, buf) != 0) {
+                       CWARN("Failed to post %s buffer\n",
+                             scd->scd_svc->sv_name);
+               }
+               return;
+       }
+
+       /* service is shutting down, or we want to recycle some buffers */
+       scd->scd_buf_total--;
+
+       if (scd->scd_buf_adjust < 0) {
+               scd->scd_buf_adjust++;
+               if (scd->scd_buf_adjust < 0 &&
+                   scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) {
+                       CDEBUG(D_INFO,
+                              "Try to recycle %d buffers but nothing left\n",
+                              scd->scd_buf_adjust);
+                       scd->scd_buf_adjust = 0;
+               }
+       }
+
+       spin_unlock(&scd->scd_lock);
+       LIBCFS_FREE(buf, sizeof(*buf));
+       spin_lock(&scd->scd_lock);
+}
+
+void
+srpc_abort_service(struct srpc_service *sv)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       int                     i;
+
+       CDEBUG(D_NET, "Aborting service: id %d, name %s\n",
+              sv->sv_id, sv->sv_name);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+
+               /* schedule in-flight RPCs to notice the abort, NB:
+                * racing with incoming RPCs; a complete fix should make test
+                * RPCs carry the session ID in their headers */
+               list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) {
+                       rpc->srpc_aborted = 1;
+                       swi_schedule_workitem(&rpc->srpc_wi);
+               }
+
+               spin_unlock(&scd->scd_lock);
+       }
+}
+
+void
+srpc_shutdown_service(srpc_service_t *sv)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       srpc_buffer_t           *buf;
+       int                     i;
+
+       CDEBUG(D_NET, "Shutting down service: id %d, name %s\n",
+              sv->sv_id, sv->sv_name);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+               spin_lock(&scd->scd_lock);
+
+       sv->sv_shuttingdown = 1; /* i.e. no new active RPC */
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+               spin_unlock(&scd->scd_lock);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+
+               /* schedule in-flight RPCs to notice the shutdown */
+               list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list)
+                       swi_schedule_workitem(&rpc->srpc_wi);
+
+               spin_unlock(&scd->scd_lock);
+
+               /* OK to traverse scd_buf_posted without lock, since no one
+                * touches scd_buf_posted now */
+               list_for_each_entry(buf, &scd->scd_buf_posted, buf_list)
+                       LNetMDUnlink(buf->buf_mdh);
+       }
+}
+
+int
+srpc_send_request (srpc_client_rpc_t *rpc)
+{
+       srpc_event_t *ev = &rpc->crpc_reqstev;
+       int        rc;
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = SRPC_REQUEST_SENT;
+
+       rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service,
+                                    &rpc->crpc_reqstmsg, sizeof(srpc_msg_t),
+                                    &rpc->crpc_reqstmdh, ev);
+       if (rc != 0) {
+               LASSERT (rc == -ENOMEM);
+               ev->ev_fired = 1;  /* no more event expected */
+       }
+       return rc;
+}
+
+int
+srpc_prepare_reply (srpc_client_rpc_t *rpc)
+{
+       srpc_event_t *ev = &rpc->crpc_replyev;
+       __u64   *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid;
+       int        rc;
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = SRPC_REPLY_RCVD;
+
+       *id = srpc_next_id();
+
+       rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+                                   &rpc->crpc_replymsg, sizeof(srpc_msg_t),
+                                   LNET_MD_OP_PUT, rpc->crpc_dest,
+                                   &rpc->crpc_replymdh, ev);
+       if (rc != 0) {
+               LASSERT (rc == -ENOMEM);
+               ev->ev_fired = 1;  /* no more event expected */
+       }
+       return rc;
+}
+
+int
+srpc_prepare_bulk (srpc_client_rpc_t *rpc)
+{
+       srpc_bulk_t  *bk = &rpc->crpc_bulk;
+       srpc_event_t *ev = &rpc->crpc_bulkev;
+       __u64   *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid;
+       int        rc;
+       int        opt;
+
+       LASSERT (bk->bk_niov <= LNET_MAX_IOV);
+
+       if (bk->bk_niov == 0) return 0; /* nothing to do */
+
+       opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET;
+       opt |= LNET_MD_KIOV;
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = SRPC_BULK_REQ_RCVD;
+
+       *id = srpc_next_id();
+
+       rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+                                   &bk->bk_iovs[0], bk->bk_niov, opt,
+                                   rpc->crpc_dest, &bk->bk_mdh, ev);
+       if (rc != 0) {
+               LASSERT (rc == -ENOMEM);
+               ev->ev_fired = 1;  /* no more event expected */
+       }
+       return rc;
+}
+
+int
+srpc_do_bulk (srpc_server_rpc_t *rpc)
+{
+       srpc_event_t  *ev = &rpc->srpc_ev;
+       srpc_bulk_t   *bk = rpc->srpc_bulk;
+       __u64     id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid;
+       int         rc;
+       int         opt;
+
+       LASSERT (bk != NULL);
+
+       opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT;
+       opt |= LNET_MD_KIOV;
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT;
+
+       rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id,
+                                  &bk->bk_iovs[0], bk->bk_niov, opt,
+                                  rpc->srpc_peer, rpc->srpc_self,
+                                  &bk->bk_mdh, ev);
+       if (rc != 0)
+               ev->ev_fired = 1;  /* no more event expected */
+       return rc;
+}
+
+/* only called from srpc_handle_rpc */
+void
+srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status)
+{
+       struct srpc_service_cd  *scd = rpc->srpc_scd;
+       struct srpc_service     *sv  = scd->scd_svc;
+       srpc_buffer_t           *buffer;
+
+       LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE);
+
+       rpc->srpc_status = status;
+
+       CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR,
+               "Server RPC %p done: service %s, peer %s, status %s:%d\n",
+               rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+               swi_state2str(rpc->srpc_wi.swi_state), status);
+
+       if (status != 0) {
+               spin_lock(&srpc_data.rpc_glock);
+               srpc_data.rpc_counters.rpcs_dropped++;
+               spin_unlock(&srpc_data.rpc_glock);
+       }
+
+       if (rpc->srpc_done != NULL)
+               (*rpc->srpc_done) (rpc);
+       LASSERT(rpc->srpc_bulk == NULL);
+
+       spin_lock(&scd->scd_lock);
+
+       if (rpc->srpc_reqstbuf != NULL) {
+               /* NB: srpc_service_recycle_buffer might drop scd_lock, but
+                * sv won't go away because scd_rpc_active must not be empty */
+               srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf);
+               rpc->srpc_reqstbuf = NULL;
+       }
+
+       list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */
+
+       /*
+        * No one can schedule me now since:
+        * - I'm not on scd_rpc_active.
+        * - all LNet events have been fired.
+        * Cancel pending schedules and prevent future schedule attempts:
+        */
+       LASSERT(rpc->srpc_ev.ev_fired);
+       swi_exit_workitem(&rpc->srpc_wi);
+
+       if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) {
+               buffer = list_entry(scd->scd_buf_blocked.next,
+                                       srpc_buffer_t, buf_list);
+               list_del(&buffer->buf_list);
+
+               srpc_init_server_rpc(rpc, scd, buffer);
+               list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active);
+               swi_schedule_workitem(&rpc->srpc_wi);
+       } else {
+               list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+       }
+
+       spin_unlock(&scd->scd_lock);
+       return;
+}
+
+/* handles an incoming RPC */
+int
+srpc_handle_rpc(swi_workitem_t *wi)
+{
+       struct srpc_server_rpc  *rpc = wi->swi_workitem.wi_data;
+       struct srpc_service_cd  *scd = rpc->srpc_scd;
+       struct srpc_service     *sv = scd->scd_svc;
+       srpc_event_t            *ev = &rpc->srpc_ev;
+       int                     rc = 0;
+
+       LASSERT(wi == &rpc->srpc_wi);
+
+       spin_lock(&scd->scd_lock);
+
+       if (sv->sv_shuttingdown || rpc->srpc_aborted) {
+               spin_unlock(&scd->scd_lock);
+
+               if (rpc->srpc_bulk != NULL)
+                       LNetMDUnlink(rpc->srpc_bulk->bk_mdh);
+               LNetMDUnlink(rpc->srpc_replymdh);
+
+               if (ev->ev_fired) { /* no more event, OK to finish */
+                       srpc_server_rpc_done(rpc, -ESHUTDOWN);
+                       return 1;
+               }
+               return 0;
+       }
+
+       spin_unlock(&scd->scd_lock);
+
+       switch (wi->swi_state) {
+       default:
+               LBUG ();
+       case SWI_STATE_NEWBORN: {
+               srpc_msg_t         *msg;
+               srpc_generic_reply_t *reply;
+
+               msg = &rpc->srpc_reqstbuf->buf_msg;
+               reply = &rpc->srpc_replymsg.msg_body.reply;
+
+               if (msg->msg_magic == 0) {
+                       /* moaned already in srpc_lnet_ev_handler */
+                       srpc_server_rpc_done(rpc, EBADMSG);
+                       return 1;
+               }
+
+               srpc_unpack_msg_hdr(msg);
+               if (msg->msg_version != SRPC_MSG_VERSION) {
+                       CWARN("Version mismatch: %u, %u expected, from %s\n",
+                             msg->msg_version, SRPC_MSG_VERSION,
+                             libcfs_id2str(rpc->srpc_peer));
+                       reply->status = EPROTO;
+                       /* drop through and send reply */
+               } else {
+                       reply->status = 0;
+                       rc = (*sv->sv_handler)(rpc);
+                       LASSERT(reply->status == 0 || !rpc->srpc_bulk);
+                       if (rc != 0) {
+                               srpc_server_rpc_done(rpc, rc);
+                               return 1;
+                       }
+               }
+
+               wi->swi_state = SWI_STATE_BULK_STARTED;
+
+               if (rpc->srpc_bulk != NULL) {
+                       rc = srpc_do_bulk(rpc);
+                       if (rc == 0)
+                               return 0; /* wait for bulk */
+
+                       LASSERT (ev->ev_fired);
+                       ev->ev_status = rc;
+               }
+       }
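+               /* fall through: bulk is either absent or already complete */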
+       case SWI_STATE_BULK_STARTED:
+               LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired);
+
+               if (rpc->srpc_bulk != NULL) {
+                       rc = ev->ev_status;
+
+                       if (sv->sv_bulk_ready != NULL)
+                               rc = (*sv->sv_bulk_ready) (rpc, rc);
+
+                       if (rc != 0) {
+                               srpc_server_rpc_done(rpc, rc);
+                               return 1;
+                       }
+               }
+
+               wi->swi_state = SWI_STATE_REPLY_SUBMITTED;
+               rc = srpc_send_reply(rpc);
+               if (rc == 0)
+                       return 0; /* wait for reply */
+               srpc_server_rpc_done(rpc, rc);
+               return 1;
+
+       case SWI_STATE_REPLY_SUBMITTED:
+               if (!ev->ev_fired) {
+                       CERROR("RPC %p: bulk %p, service %d\n",
+                              rpc, rpc->srpc_bulk, sv->sv_id);
+                       CERROR("Event: status %d, type %d, lnet %d\n",
+                              ev->ev_status, ev->ev_type, ev->ev_lnet);
+                       LASSERT (ev->ev_fired);
+               }
+
+               wi->swi_state = SWI_STATE_DONE;
+               srpc_server_rpc_done(rpc, ev->ev_status);
+               return 1;
+       }
+
+       return 0;
+}
+
+void
+srpc_client_rpc_expired (void *data)
+{
+       srpc_client_rpc_t *rpc = data;
+
+       CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n",
+              rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+              rpc->crpc_timeout);
+
+       spin_lock(&rpc->crpc_lock);
+
+       rpc->crpc_timeout = 0;
+       srpc_abort_rpc(rpc, -ETIMEDOUT);
+
+       spin_unlock(&rpc->crpc_lock);
+
+       spin_lock(&srpc_data.rpc_glock);
+       srpc_data.rpc_counters.rpcs_expired++;
+       spin_unlock(&srpc_data.rpc_glock);
+}
+
+inline void
+srpc_add_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+       stt_timer_t *timer = &rpc->crpc_timer;
+
+       if (rpc->crpc_timeout == 0) return;
+
+       INIT_LIST_HEAD(&timer->stt_list);
+       timer->stt_data    = rpc;
+       timer->stt_func    = srpc_client_rpc_expired;
+       timer->stt_expires = cfs_time_add(rpc->crpc_timeout,
+                                         cfs_time_current_sec());
+       stt_add_timer(timer);
+       return;
+}
+
+/*
+ * Called with rpc->crpc_lock held.
+ *
+ * Upon exit the RPC expiry timer is not queued and the handler is not
+ * running on any CPU. */
+void
+srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+       /* timer not planted or already exploded */
+       if (rpc->crpc_timeout == 0)
+               return;
+
+       /* timer successfully defused */
+       if (stt_del_timer(&rpc->crpc_timer))
+               return;
+
+       /* timer detonated, wait for it to explode */
+       while (rpc->crpc_timeout != 0) {
+               spin_unlock(&rpc->crpc_lock);
+
+               schedule();
+
+               spin_lock(&rpc->crpc_lock);
+       }
+}
+
+void
+srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status)
+{
+       swi_workitem_t *wi = &rpc->crpc_wi;
+
+       LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE);
+
+       spin_lock(&rpc->crpc_lock);
+
+       rpc->crpc_closed = 1;
+       if (rpc->crpc_status == 0)
+               rpc->crpc_status = status;
+
+       srpc_del_client_rpc_timer(rpc);
+
+       CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR,
+               "Client RPC done: service %d, peer %s, status %s:%d:%d\n",
+               rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+               swi_state2str(wi->swi_state), rpc->crpc_aborted, status);
+
+       /*
+        * No one can schedule me now since:
+        * - RPC timer has been defused.
+        * - all LNet events have been fired.
+        * - crpc_closed has been set, preventing srpc_abort_rpc from
+        *   scheduling me.
+        * Cancel pending schedules and prevent future schedule attempts:
+        */
+       LASSERT (!srpc_event_pending(rpc));
+       swi_exit_workitem(wi);
+
+       spin_unlock(&rpc->crpc_lock);
+
+       (*rpc->crpc_done)(rpc);
+       return;
+}
+
+/* sends an outgoing RPC */
+int
+srpc_send_rpc (swi_workitem_t *wi)
+{
+       int             rc = 0;
+       srpc_client_rpc_t *rpc;
+       srpc_msg_t      *reply;
+       int             do_bulk;
+
+       LASSERT(wi != NULL);
+
+       rpc = wi->swi_workitem.wi_data;
+
+       LASSERT (rpc != NULL);
+       LASSERT (wi == &rpc->crpc_wi);
+
+       reply = &rpc->crpc_replymsg;
+       do_bulk = rpc->crpc_bulk.bk_niov > 0;
+
+       spin_lock(&rpc->crpc_lock);
+
+       if (rpc->crpc_aborted) {
+               spin_unlock(&rpc->crpc_lock);
+               goto abort;
+       }
+
+       spin_unlock(&rpc->crpc_lock);
+
+       switch (wi->swi_state) {
+       default:
+               LBUG ();
+       case SWI_STATE_NEWBORN:
+               LASSERT (!srpc_event_pending(rpc));
+
+               rc = srpc_prepare_reply(rpc);
+               if (rc != 0) {
+                       srpc_client_rpc_done(rpc, rc);
+                       return 1;
+               }
+
+               rc = srpc_prepare_bulk(rpc);
+               if (rc != 0) break;
+
+               wi->swi_state = SWI_STATE_REQUEST_SUBMITTED;
+               rc = srpc_send_request(rpc);
+               break;
+
+       case SWI_STATE_REQUEST_SUBMITTED:
+               /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any
+                * order; however, they're processed in a strict order:
+                * rqt, rpy, and bulk. */
+               if (!rpc->crpc_reqstev.ev_fired) break;
+
+               rc = rpc->crpc_reqstev.ev_status;
+               if (rc != 0) break;
+
+               wi->swi_state = SWI_STATE_REQUEST_SENT;
+               /* perhaps more events, fall thru */
+       case SWI_STATE_REQUEST_SENT: {
+               srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service);
+
+               if (!rpc->crpc_replyev.ev_fired) break;
+
+               rc = rpc->crpc_replyev.ev_status;
+               if (rc != 0) break;
+
+               srpc_unpack_msg_hdr(reply);
+               if (reply->msg_type != type ||
+                   (reply->msg_magic != SRPC_MSG_MAGIC &&
+                    reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+                       CWARN ("Bad message from %s: type %u (%d expected),"
+                              " magic %u (%d expected).\n",
+                              libcfs_id2str(rpc->crpc_dest),
+                              reply->msg_type, type,
+                              reply->msg_magic, SRPC_MSG_MAGIC);
+                       rc = -EBADMSG;
+                       break;
+               }
+
+               if (do_bulk && reply->msg_body.reply.status != 0) {
+                       CWARN ("Remote error %d at %s, unlink bulk buffer in "
+                              "case peer didn't initiate bulk transfer\n",
+                              reply->msg_body.reply.status,
+                              libcfs_id2str(rpc->crpc_dest));
+                       LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+               }
+
+               wi->swi_state = SWI_STATE_REPLY_RECEIVED;
+       }
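+               /* fall through to SWI_STATE_REPLY_RECEIVED */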
+       case SWI_STATE_REPLY_RECEIVED:
+               if (do_bulk && !rpc->crpc_bulkev.ev_fired) break;
+
+               rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0;
+
+               /* Bulk buffer was unlinked due to remote error. Clear error
+                * since reply buffer still contains valid data.
+                * NB rpc->crpc_done shouldn't look into bulk data in case of
+                * remote error. */
+               if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK &&
+                   rpc->crpc_status == 0 && reply->msg_body.reply.status != 0)
+                       rc = 0;
+
+               wi->swi_state = SWI_STATE_DONE;
+               srpc_client_rpc_done(rpc, rc);
+               return 1;
+       }
+
+       if (rc != 0) {
+               spin_lock(&rpc->crpc_lock);
+               srpc_abort_rpc(rpc, rc);
+               spin_unlock(&rpc->crpc_lock);
+       }
+
+abort:
+       if (rpc->crpc_aborted) {
+               LNetMDUnlink(rpc->crpc_reqstmdh);
+               LNetMDUnlink(rpc->crpc_replymdh);
+               LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+
+               if (!srpc_event_pending(rpc)) {
+                       srpc_client_rpc_done(rpc, -EINTR);
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+srpc_client_rpc_t *
+srpc_create_client_rpc (lnet_process_id_t peer, int service,
+                       int nbulkiov, int bulklen,
+                       void (*rpc_done)(srpc_client_rpc_t *),
+                       void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+       srpc_client_rpc_t *rpc;
+
+       LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t,
+                                  crpc_bulk.bk_iovs[nbulkiov]));
+       if (rpc == NULL)
+               return NULL;
+
+       srpc_init_client_rpc(rpc, peer, service, nbulkiov,
+                            bulklen, rpc_done, rpc_fini, priv);
+       return rpc;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_abort_rpc (srpc_client_rpc_t *rpc, int why)
+{
+       LASSERT (why != 0);
+
+       if (rpc->crpc_aborted || /* already aborted */
+           rpc->crpc_closed)    /* callback imminent */
+               return;
+
+       CDEBUG (D_NET,
+               "Aborting RPC: service %d, peer %s, state %s, why %d\n",
+               rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+               swi_state2str(rpc->crpc_wi.swi_state), why);
+
+       rpc->crpc_aborted = 1;
+       rpc->crpc_status  = why;
+       swi_schedule_workitem(&rpc->crpc_wi);
+       return;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_post_rpc (srpc_client_rpc_t *rpc)
+{
+       LASSERT (!rpc->crpc_aborted);
+       LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+       CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n",
+               libcfs_id2str(rpc->crpc_dest), rpc->crpc_service,
+               rpc->crpc_timeout);
+
+       srpc_add_client_rpc_timer(rpc);
+       swi_schedule_workitem(&rpc->crpc_wi);
+       return;
+}
+
+
+int
+srpc_send_reply(struct srpc_server_rpc *rpc)
+{
+       srpc_event_t            *ev = &rpc->srpc_ev;
+       struct srpc_msg         *msg = &rpc->srpc_replymsg;
+       struct srpc_buffer      *buffer = rpc->srpc_reqstbuf;
+       struct srpc_service_cd  *scd = rpc->srpc_scd;
+       struct srpc_service     *sv = scd->scd_svc;
+       __u64                   rpyid;
+       int                     rc;
+
+       LASSERT(buffer != NULL);
+       rpyid = buffer->buf_msg.msg_body.reqst.rpyid;
+
+       spin_lock(&scd->scd_lock);
+
+       if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) {
+               /* Repost buffer before replying since test client
+                * might send me another RPC once it gets the reply */
+               if (srpc_service_post_buffer(scd, buffer) != 0)
+                       CWARN("Failed to repost %s buffer\n", sv->sv_name);
+               rpc->srpc_reqstbuf = NULL;
+       }
+
+       spin_unlock(&scd->scd_lock);
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = SRPC_REPLY_SENT;
+
+       msg->msg_magic   = SRPC_MSG_MAGIC;
+       msg->msg_version = SRPC_MSG_VERSION;
+       msg->msg_type    = srpc_service2reply(sv->sv_id);
+
+       rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg,
+                                  sizeof(*msg), LNET_MD_OP_PUT,
+                                  rpc->srpc_peer, rpc->srpc_self,
+                                  &rpc->srpc_replymdh, ev);
+       if (rc != 0)
+               ev->ev_fired = 1;  /* no more event expected */
+       return rc;
+}
+
+/* in the kernel, always called with LNET_LOCK() held and in thread context */
+void
+srpc_lnet_ev_handler(lnet_event_t *ev)
+{
+       struct srpc_service_cd  *scd;
+       srpc_event_t      *rpcev = ev->md.user_ptr;
+       srpc_client_rpc_t *crpc;
+       srpc_server_rpc_t *srpc;
+       srpc_buffer_t     *buffer;
+       srpc_service_t    *sv;
+       srpc_msg_t      *msg;
+       srpc_msg_type_t    type;
+
+       LASSERT (!in_interrupt());
+
+       if (ev->status != 0) {
+               spin_lock(&srpc_data.rpc_glock);
+               srpc_data.rpc_counters.errors++;
+               spin_unlock(&srpc_data.rpc_glock);
+       }
+
+       rpcev->ev_lnet = ev->type;
+
+       switch (rpcev->ev_type) {
+       default:
+               CERROR("Unknown event: status %d, type %d, lnet %d\n",
+                      rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+               LBUG ();
+       case SRPC_REQUEST_SENT:
+               if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+                       spin_lock(&srpc_data.rpc_glock);
+                       srpc_data.rpc_counters.rpcs_sent++;
+                       spin_unlock(&srpc_data.rpc_glock);
+               }
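+               /* fall through: all client RPC events are handled below */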
+       case SRPC_REPLY_RCVD:
+       case SRPC_BULK_REQ_RCVD:
+               crpc = rpcev->ev_data;
+
+               if (rpcev != &crpc->crpc_reqstev &&
+                   rpcev != &crpc->crpc_replyev &&
+                   rpcev != &crpc->crpc_bulkev) {
+                       CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n",
+                              rpcev, crpc, &crpc->crpc_reqstev,
+                              &crpc->crpc_replyev, &crpc->crpc_bulkev);
+                       CERROR("Bad event: status %d, type %d, lnet %d\n",
+                              rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+                       LBUG ();
+               }
+
+               spin_lock(&crpc->crpc_lock);
+
+               LASSERT(rpcev->ev_fired == 0);
+               rpcev->ev_fired  = 1;
+               rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+                                               -EINTR : ev->status;
+               swi_schedule_workitem(&crpc->crpc_wi);
+
+               spin_unlock(&crpc->crpc_lock);
+               break;
+
+       case SRPC_REQUEST_RCVD:
+               scd = rpcev->ev_data;
+               sv = scd->scd_svc;
+
+               LASSERT(rpcev == &scd->scd_ev);
+
+               spin_lock(&scd->scd_lock);
+
+               LASSERT (ev->unlinked);
+               LASSERT (ev->type == LNET_EVENT_PUT ||
+                        ev->type == LNET_EVENT_UNLINK);
+               LASSERT (ev->type != LNET_EVENT_UNLINK ||
+                        sv->sv_shuttingdown);
+
+               buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg);
+               buffer->buf_peer = ev->initiator;
+               buffer->buf_self = ev->target.nid;
+
+               LASSERT(scd->scd_buf_nposted > 0);
+               scd->scd_buf_nposted--;
+
+               if (sv->sv_shuttingdown) {
+                       /* Leave buffer on scd->scd_buf_posted since
+                        * srpc_finish_service() needs to traverse it. */
+                       spin_unlock(&scd->scd_lock);
+                       break;
+               }
+
+               if (scd->scd_buf_err_stamp != 0 &&
+                   scd->scd_buf_err_stamp < cfs_time_current_sec()) {
+                       /* re-enable adding buffer */
+                       scd->scd_buf_err_stamp = 0;
+                       scd->scd_buf_err = 0;
+               }
+
+               if (scd->scd_buf_err == 0 && /* adding buffer is enabled */
+                   scd->scd_buf_adjust == 0 &&
+                   scd->scd_buf_nposted < scd->scd_buf_low) {
+                       scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2,
+                                                 SFW_TEST_WI_MIN);
+                       swi_schedule_workitem(&scd->scd_buf_wi);
+               }
+
+               list_del(&buffer->buf_list); /* from scd->scd_buf_posted */
+               msg = &buffer->buf_msg;
+               type = srpc_service2request(sv->sv_id);
+
+               if (ev->status != 0 || ev->mlength != sizeof(*msg) ||
+                   (msg->msg_type != type &&
+                    msg->msg_type != __swab32(type)) ||
+                   (msg->msg_magic != SRPC_MSG_MAGIC &&
+                    msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+                       CERROR ("Dropping RPC (%s) from %s: "
+                               "status %d mlength %d type %u magic %u.\n",
+                               sv->sv_name, libcfs_id2str(ev->initiator),
+                               ev->status, ev->mlength,
+                               msg->msg_type, msg->msg_magic);
+
+                       /* NB can't call srpc_service_recycle_buffer here since
+                        * it may call LNetM[DE]Attach. The invalid magic tells
+                        * srpc_handle_rpc to drop this RPC */
+                       msg->msg_magic = 0;
+               }
+
+               if (!list_empty(&scd->scd_rpc_free)) {
+                       srpc = list_entry(scd->scd_rpc_free.next,
+                                             struct srpc_server_rpc,
+                                             srpc_list);
+                       list_del(&srpc->srpc_list);
+
+                       srpc_init_server_rpc(srpc, scd, buffer);
+                       list_add_tail(&srpc->srpc_list,
+                                         &scd->scd_rpc_active);
+                       swi_schedule_workitem(&srpc->srpc_wi);
+               } else {
+                       list_add_tail(&buffer->buf_list,
+                                         &scd->scd_buf_blocked);
+               }
+
+               spin_unlock(&scd->scd_lock);
+
+               spin_lock(&srpc_data.rpc_glock);
+               srpc_data.rpc_counters.rpcs_rcvd++;
+               spin_unlock(&srpc_data.rpc_glock);
+               break;
+
+       case SRPC_BULK_GET_RPLD:
+               LASSERT (ev->type == LNET_EVENT_SEND ||
+                        ev->type == LNET_EVENT_REPLY ||
+                        ev->type == LNET_EVENT_UNLINK);
+
+               if (!ev->unlinked)
+                       break; /* wait for final event */
+
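+               /* event is final (unlinked): fall through and account the
+                * bulk GET reply like a bulk PUT below */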
+       case SRPC_BULK_PUT_SENT:
+               if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+                       spin_lock(&srpc_data.rpc_glock);
+
+                       if (rpcev->ev_type == SRPC_BULK_GET_RPLD)
+                               srpc_data.rpc_counters.bulk_get += ev->mlength;
+                       else
+                               srpc_data.rpc_counters.bulk_put += ev->mlength;
+
+                       spin_unlock(&srpc_data.rpc_glock);
+               }
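+               /* fallthrough: completion handling shared with SRPC_REPLY_SENT */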
+       case SRPC_REPLY_SENT:
+               srpc = rpcev->ev_data;
+               scd  = srpc->srpc_scd;
+
+               LASSERT(rpcev == &srpc->srpc_ev);
+
+               spin_lock(&scd->scd_lock);
+
+               rpcev->ev_fired  = 1;
+               rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+                                  -EINTR : ev->status;
+               swi_schedule_workitem(&srpc->srpc_wi);
+
+               spin_unlock(&scd->scd_lock);
+               break;
+       }
+}
+
+
+int
+srpc_startup (void)
+{
+       int rc;
+
+       memset(&srpc_data, 0, sizeof(struct smoketest_rpc));
+       spin_lock_init(&srpc_data.rpc_glock);
+
+       /* 1 second pause to avoid timestamp reuse */
+       cfs_pause(cfs_time_seconds(1));
+       srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48;
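+       /* NB (illustrative): the shift keeps only the low 16 bits of the
+        * timestamp, in bits 48..63, leaving the lower 48 bits free as a
+        * per-RPC counter space; the 1 second pause above keeps a quick
+        * reload from reusing the same top bits. */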
+
+       srpc_data.rpc_state = SRPC_STATE_NONE;
+
+       rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+       if (rc < 0) {
+               CERROR ("LNetNIInit() has failed: %d\n", rc);
+               return rc;
+       }
+
+       srpc_data.rpc_state = SRPC_STATE_NI_INIT;
+
+       LNetInvalidateHandle(&srpc_data.rpc_lnet_eq);
+       rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq);
+       if (rc != 0) {
+               CERROR("LNetEQAlloc() has failed: %d\n", rc);
+               goto bail;
+       }
+
+       rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+       LASSERT(rc == 0);
+       rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL);
+       LASSERT(rc == 0);
+
+       srpc_data.rpc_state = SRPC_STATE_EQ_INIT;
+
+       rc = stt_startup();
+
+bail:
+       if (rc != 0)
+               srpc_shutdown();
+       else
+               srpc_data.rpc_state = SRPC_STATE_RUNNING;
+
+       return rc;
+}
+
+void
+srpc_shutdown (void)
+{
+       int i;
+       int rc;
+       int state;
+
+       state = srpc_data.rpc_state;
+       srpc_data.rpc_state = SRPC_STATE_STOPPING;
+
+       switch (state) {
+       default:
+               LBUG ();
+       case SRPC_STATE_RUNNING:
+               spin_lock(&srpc_data.rpc_glock);
+
+               for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) {
+                       srpc_service_t *sv = srpc_data.rpc_services[i];
+
+                       LASSERTF (sv == NULL,
+                                 "service not empty: id %d, name %s\n",
+                                 i, sv->sv_name);
+               }
+
+               spin_unlock(&srpc_data.rpc_glock);
+
+               stt_shutdown();
+
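+               /* fallthrough: tear down the remaining init stages as well */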
+       case SRPC_STATE_EQ_INIT:
+               rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+               LASSERT(rc == 0);
+               rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL);
+               LASSERT(rc == 0);
+               rc = LNetEQFree(srpc_data.rpc_lnet_eq);
+               LASSERT (rc == 0); /* the EQ should have no user by now */
+
+       case SRPC_STATE_NI_INIT:
+               LNetNIFini();
+       }
+
+       return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.h b/drivers/staging/lustre/lnet/selftest/rpc.h
new file mode 100644 (file)
index 0000000..b905d49
--- /dev/null
@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __SELFTEST_RPC_H__
+#define __SELFTEST_RPC_H__
+
+#include <linux/lnet/lnetst.h>
+
+/*
+ * LST wired structures
+ *
+ * XXX: *REPLY == *REQST + 1
+ */
+typedef enum {
+       SRPC_MSG_MKSN_REQST     = 0,
+       SRPC_MSG_MKSN_REPLY     = 1,
+       SRPC_MSG_RMSN_REQST     = 2,
+       SRPC_MSG_RMSN_REPLY     = 3,
+       SRPC_MSG_BATCH_REQST    = 4,
+       SRPC_MSG_BATCH_REPLY    = 5,
+       SRPC_MSG_STAT_REQST     = 6,
+       SRPC_MSG_STAT_REPLY     = 7,
+       SRPC_MSG_TEST_REQST     = 8,
+       SRPC_MSG_TEST_REPLY     = 9,
+       SRPC_MSG_DEBUG_REQST    = 10,
+       SRPC_MSG_DEBUG_REPLY    = 11,
+       SRPC_MSG_BRW_REQST      = 12,
+       SRPC_MSG_BRW_REPLY      = 13,
+       SRPC_MSG_PING_REQST     = 14,
+       SRPC_MSG_PING_REPLY     = 15,
+       SRPC_MSG_JOIN_REQST     = 16,
+       SRPC_MSG_JOIN_REPLY     = 17,
+} srpc_msg_type_t;
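+
+/* srpc_service2reply() in selftest.h relies on the *REPLY == *REQST + 1 rule
+ * noted above, e.g. SRPC_MSG_PING_REQST (14) + 1 == SRPC_MSG_PING_REPLY (15). */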
+
+
+/* CAVEAT EMPTOR:
+ * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer,
+ * and 2nd field matchbits of bulk buffer if any.
+ *
+ * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field
+ * session id if needed.
+ */
+typedef struct {
+       __u64                   rpyid;          /* reply buffer matchbits */
+       __u64                   bulkid;         /* bulk buffer matchbits */
+} WIRE_ATTR srpc_generic_reqst_t;
+
+typedef struct {
+       __u32              status;
+       lst_sid_t              sid;
+} WIRE_ATTR srpc_generic_reply_t;
+
+/* FRAMEWORK RPCs */
+typedef struct {
+       __u64                   mksn_rpyid;      /* reply buffer matchbits */
+       lst_sid_t              mksn_sid;        /* session id */
+       __u32                   mksn_force;      /* use brute force */
+       char                    mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reqst_t;                 /* make session request */
+
+typedef struct {
+       __u32              mksn_status;      /* session status */
+       lst_sid_t              mksn_sid;         /* session id */
+       __u32              mksn_timeout;     /* session timeout */
+       char                    mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */
+
+typedef struct {
+       __u64                   rmsn_rpyid;      /* reply buffer matchbits */
+       lst_sid_t               rmsn_sid;       /* session id */
+} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */
+
+typedef struct {
+       __u32                   rmsn_status;
+       lst_sid_t               rmsn_sid;       /* session id */
+} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */
+
+typedef struct {
+       __u64                   join_rpyid;     /* reply buffer matchbits */
+       lst_sid_t              join_sid;       /* session id to join */
+       char                join_group[LST_NAME_SIZE]; /* group name */
+} WIRE_ATTR srpc_join_reqst_t;
+
+typedef struct {
+       __u32              join_status;    /* returned status */
+       lst_sid_t              join_sid;       /* session id */
+       __u32                   join_timeout;   /* # seconds' inactivity to expire */
+       char                join_session[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_join_reply_t;
+
+typedef struct {
+       __u64              dbg_rpyid;      /* reply buffer matchbits */
+       lst_sid_t              dbg_sid; /* session id */
+       __u32              dbg_flags;      /* bitmap of debug */
+} WIRE_ATTR srpc_debug_reqst_t;
+
+typedef struct {
+       __u32              dbg_status;     /* returned code */
+       lst_sid_t              dbg_sid; /* session id */
+       __u32              dbg_timeout;    /* session timeout */
+       __u32              dbg_nbatch;     /* # of batches in the node */
+       char                dbg_name[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_debug_reply_t;
+
+#define SRPC_BATCH_OPC_RUN      1
+#define SRPC_BATCH_OPC_STOP     2
+#define SRPC_BATCH_OPC_QUERY    3
+
+typedef struct {
+       __u64              bar_rpyid;      /* reply buffer matchbits */
+       lst_sid_t              bar_sid; /* session id */
+       lst_bid_t              bar_bid; /* batch id */
+       __u32              bar_opc;     /* create/start/stop batch */
+       __u32              bar_testidx;    /* index of test */
+       __u32              bar_arg;     /* parameters */
+} WIRE_ATTR srpc_batch_reqst_t;
+
+typedef struct {
+       __u32              bar_status;     /* status of request */
+       lst_sid_t              bar_sid; /* session id */
+       __u32              bar_active;     /* # of active tests in batch/test */
+       __u32              bar_time;       /* remained time */
+} WIRE_ATTR srpc_batch_reply_t;
+
+typedef struct {
+       __u64              str_rpyid;      /* reply buffer matchbits */
+       lst_sid_t              str_sid; /* session id */
+       __u32              str_type;       /* type of stat */
+} WIRE_ATTR srpc_stat_reqst_t;
+
+typedef struct {
+       __u32              str_status;
+       lst_sid_t              str_sid;
+       sfw_counters_t    str_fw;
+       srpc_counters_t  str_rpc;
+       lnet_counters_t  str_lnet;
+} WIRE_ATTR srpc_stat_reply_t;
+
+typedef struct {
+       __u32              blk_opc;     /* bulk operation code */
+       __u32              blk_npg;     /* # of pages */
+       __u32              blk_flags;      /* reserved flags */
+} WIRE_ATTR test_bulk_req_t;
+
+typedef struct {
+       /** bulk operation code */
+       __u16                   blk_opc;
+       /** data check flags */
+       __u16                   blk_flags;
+       /** data length */
+       __u32                   blk_len;
+       /** reserved: offset */
+       __u32              blk_offset;
+} WIRE_ATTR test_bulk_req_v1_t;
+
+typedef struct {
+       __u32                   png_size;       /* size of ping message */
+       __u32                   png_flags;      /* reserved flags */
+} WIRE_ATTR test_ping_req_t;
+
+typedef struct {
+       __u64                   tsr_rpyid;      /* reply buffer matchbits */
+       __u64                   tsr_bulkid;     /* bulk buffer matchbits */
+       lst_sid_t               tsr_sid;        /* session id */
+       lst_bid_t               tsr_bid;        /* batch id */
+       __u32                   tsr_service;    /* test type: bulk|ping|... */
+       /* test client loop count or # server buffers needed */
+       __u32                   tsr_loop;
+       __u32                   tsr_concur;     /* concurrency of test */
+       __u8                    tsr_is_client;  /* is test client or not */
+       __u8                    tsr_stop_onerr; /* stop on error */
+       __u32                   tsr_ndest;      /* # of dest nodes */
+
+       union {
+               test_ping_req_t         ping;
+               test_bulk_req_t         bulk_v0;
+               test_bulk_req_v1_t      bulk_v1;
+       }               tsr_u;
+} WIRE_ATTR srpc_test_reqst_t;
+
+typedef struct {
+       __u32                   tsr_status;     /* returned code */
+       lst_sid_t               tsr_sid;
+} WIRE_ATTR srpc_test_reply_t;
+
+/* TEST RPCs */
+typedef struct {
+       __u64              pnr_rpyid;
+       __u32              pnr_magic;
+       __u32              pnr_seq;
+       __u64              pnr_time_sec;
+       __u64              pnr_time_usec;
+} WIRE_ATTR srpc_ping_reqst_t;
+
+typedef struct {
+       __u32              pnr_status;
+       __u32              pnr_magic;
+       __u32              pnr_seq;
+} WIRE_ATTR srpc_ping_reply_t;
+
+typedef struct {
+       __u64              brw_rpyid;      /* reply buffer matchbits */
+       __u64              brw_bulkid;     /* bulk buffer matchbits */
+       __u32              brw_rw;       /* read or write */
+       __u32              brw_len;     /* bulk data len */
+       __u32              brw_flags;      /* bulk data patterns */
+} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */
+
+typedef struct {
+       __u32              brw_status;
+} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */
+
+#define SRPC_MSG_MAGIC           0xeeb0f00d
+#define SRPC_MSG_VERSION               1
+
+typedef struct srpc_msg {
+       /** magic number */
+       __u32   msg_magic;
+       /** message version number */
+       __u32   msg_version;
+       /** type of message body: srpc_msg_type_t */
+       __u32   msg_type;
+       __u32   msg_reserved0;
+       __u32   msg_reserved1;
+       /** test session features */
+       __u32   msg_ses_feats;
+       union {
+               srpc_generic_reqst_t reqst;
+               srpc_generic_reply_t reply;
+
+               srpc_mksn_reqst_t    mksn_reqst;
+               srpc_mksn_reply_t    mksn_reply;
+               srpc_rmsn_reqst_t    rmsn_reqst;
+               srpc_rmsn_reply_t    rmsn_reply;
+               srpc_debug_reqst_t   dbg_reqst;
+               srpc_debug_reply_t   dbg_reply;
+               srpc_batch_reqst_t   bat_reqst;
+               srpc_batch_reply_t   bat_reply;
+               srpc_stat_reqst_t    stat_reqst;
+               srpc_stat_reply_t    stat_reply;
+               srpc_test_reqst_t    tes_reqst;
+               srpc_test_reply_t    tes_reply;
+               srpc_join_reqst_t    join_reqst;
+               srpc_join_reply_t    join_reply;
+
+               srpc_ping_reqst_t    ping_reqst;
+               srpc_ping_reply_t    ping_reply;
+               srpc_brw_reqst_t     brw_reqst;
+               srpc_brw_reply_t     brw_reply;
+       }     msg_body;
+} WIRE_ATTR srpc_msg_t;
+
+static inline void
+srpc_unpack_msg_hdr(srpc_msg_t *msg)
+{
+       if (msg->msg_magic == SRPC_MSG_MAGIC)
+               return; /* no flipping needed */
+
+       /* We do not swap the magic number here as it is needed to
+          determine whether the body needs to be swapped. */
+       /* __swab32s(&msg->msg_magic); */
+       __swab32s(&msg->msg_type);
+       __swab32s(&msg->msg_version);
+       __swab32s(&msg->msg_ses_feats);
+       __swab32s(&msg->msg_reserved0);
+       __swab32s(&msg->msg_reserved1);
+}
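+
+/*
+ * Typical receiver flow (illustrative sketch, cf. the magic checks in
+ * srpc_lnet_ev_handler()):
+ *
+ *   msg_magic == SRPC_MSG_MAGIC:           native byte order, nothing to do;
+ *   msg_magic == __swab32(SRPC_MSG_MAGIC): call srpc_unpack_msg_hdr() and
+ *                                          also swab the message body;
+ *   anything else:                         drop the message.
+ */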
+
+#endif /* __SELFTEST_RPC_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h
new file mode 100644 (file)
index 0000000..8053b05
--- /dev/null
@@ -0,0 +1,611 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/selftest.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_SELFTEST_H__
+#define __SELFTEST_SELFTEST_H__
+
+#define LNET_ONLY
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+
+#include "rpc.h"
+#include "timer.h"
+
+#ifndef MADE_WITHOUT_COMPROMISE
+#define MADE_WITHOUT_COMPROMISE
+#endif
+
+
+#define SWI_STATE_NEWBORN                0
+#define SWI_STATE_REPLY_SUBMITTED        1
+#define SWI_STATE_REPLY_SENT          2
+#define SWI_STATE_REQUEST_SUBMITTED    3
+#define SWI_STATE_REQUEST_SENT      4
+#define SWI_STATE_REPLY_RECEIVED          5
+#define SWI_STATE_BULK_STARTED      6
+#define SWI_STATE_DONE              10
+
+/* forward refs */
+struct srpc_service;
+struct srpc_service_cd;
+struct sfw_test_unit;
+struct sfw_test_instance;
+
+/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework
+ * services, e.g. create/modify session.
+ */
+#define SRPC_SERVICE_DEBUG           0
+#define SRPC_SERVICE_MAKE_SESSION       1
+#define SRPC_SERVICE_REMOVE_SESSION     2
+#define SRPC_SERVICE_BATCH           3
+#define SRPC_SERVICE_TEST             4
+#define SRPC_SERVICE_QUERY_STAT         5
+#define SRPC_SERVICE_JOIN             6
+#define SRPC_FRAMEWORK_SERVICE_MAX_ID   10
+/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */
+#define SRPC_SERVICE_BRW               11
+#define SRPC_SERVICE_PING             12
+#define SRPC_SERVICE_MAX_ID         12
+
+#define SRPC_REQUEST_PORTAL         50
+/* a lazy portal for framework RPC requests */
+#define SRPC_FRAMEWORK_REQUEST_PORTAL   51
+/* all reply/bulk RDMAs go to this portal */
+#define SRPC_RDMA_PORTAL               52
+
+static inline srpc_msg_type_t
+srpc_service2request (int service)
+{
+       switch (service) {
+       default:
+               LBUG ();
+       case SRPC_SERVICE_DEBUG:
+               return SRPC_MSG_DEBUG_REQST;
+
+       case SRPC_SERVICE_MAKE_SESSION:
+               return SRPC_MSG_MKSN_REQST;
+
+       case SRPC_SERVICE_REMOVE_SESSION:
+               return SRPC_MSG_RMSN_REQST;
+
+       case SRPC_SERVICE_BATCH:
+               return SRPC_MSG_BATCH_REQST;
+
+       case SRPC_SERVICE_TEST:
+               return SRPC_MSG_TEST_REQST;
+
+       case SRPC_SERVICE_QUERY_STAT:
+               return SRPC_MSG_STAT_REQST;
+
+       case SRPC_SERVICE_BRW:
+               return SRPC_MSG_BRW_REQST;
+
+       case SRPC_SERVICE_PING:
+               return SRPC_MSG_PING_REQST;
+
+       case SRPC_SERVICE_JOIN:
+               return SRPC_MSG_JOIN_REQST;
+       }
+}
+
+static inline srpc_msg_type_t
+srpc_service2reply (int service)
+{
+       return srpc_service2request(service) + 1;
+}
+
+typedef enum {
+       SRPC_BULK_REQ_RCVD   = 1, /* passive bulk request (PUT sink/GET source) received */
+       SRPC_BULK_PUT_SENT   = 2, /* active bulk PUT sent (source) */
+       SRPC_BULK_GET_RPLD   = 3, /* active bulk GET replied (sink) */
+       SRPC_REPLY_RCVD      = 4, /* incoming reply received */
+       SRPC_REPLY_SENT      = 5, /* outgoing reply sent */
+       SRPC_REQUEST_RCVD    = 6, /* incoming request received */
+       SRPC_REQUEST_SENT    = 7, /* outgoing request sent */
+} srpc_event_type_t;
+
+/* RPC event */
+typedef struct {
+       srpc_event_type_t ev_type;   /* what's up */
+       lnet_event_kind_t ev_lnet;   /* LNet event type */
+       int            ev_fired;  /* LNet event fired? */
+       int            ev_status; /* LNet event status */
+       void         *ev_data;   /* owning server/client RPC */
+} srpc_event_t;
+
+typedef struct {
+       int           bk_len;  /* len of bulk data */
+       lnet_handle_md_t bk_mdh;
+       int           bk_sink; /* sink/source */
+       int           bk_niov; /* # iov in bk_iovs */
+       lnet_kiov_t      bk_iovs[0];
+} srpc_bulk_t; /* bulk descriptor */
+
+/* message buffer descriptor */
+typedef struct srpc_buffer {
+       struct list_head           buf_list; /* chain on srpc_service::*_msgq */
+       srpc_msg_t         buf_msg;
+       lnet_handle_md_t     buf_mdh;
+       lnet_nid_t         buf_self;
+       lnet_process_id_t    buf_peer;
+} srpc_buffer_t;
+
+struct swi_workitem;
+typedef int (*swi_action_t) (struct swi_workitem *);
+
+typedef struct swi_workitem {
+       struct cfs_wi_sched     *swi_sched;
+       cfs_workitem_t       swi_workitem;
+       swi_action_t     swi_action;
+       int               swi_state;
+} swi_workitem_t;
+
+/* server-side state of a RPC */
+typedef struct srpc_server_rpc {
+       /* chain on srpc_service::*_rpcq */
+       struct list_head                srpc_list;
+       struct srpc_service_cd *srpc_scd;
+       swi_workitem_t       srpc_wi;
+       srpc_event_t     srpc_ev;      /* bulk/reply event */
+       lnet_nid_t         srpc_self;
+       lnet_process_id_t    srpc_peer;
+       srpc_msg_t         srpc_replymsg;
+       lnet_handle_md_t     srpc_replymdh;
+       srpc_buffer_t       *srpc_reqstbuf;
+       srpc_bulk_t      *srpc_bulk;
+
+       unsigned int     srpc_aborted; /* being given up */
+       int               srpc_status;
+       void           (*srpc_done)(struct srpc_server_rpc *);
+} srpc_server_rpc_t;
+
+/* client-side state of a RPC */
+typedef struct srpc_client_rpc {
+       struct list_head                crpc_list;      /* chain on user's lists */
+       spinlock_t              crpc_lock;      /* serialize */
+       int               crpc_service;
+       atomic_t         crpc_refcount;
+       int               crpc_timeout; /* # seconds to wait for reply */
+       stt_timer_t       crpc_timer;
+       swi_workitem_t       crpc_wi;
+       lnet_process_id_t    crpc_dest;
+
+       void           (*crpc_done)(struct srpc_client_rpc *);
+       void           (*crpc_fini)(struct srpc_client_rpc *);
+       int               crpc_status;    /* completion status */
+       void            *crpc_priv;      /* caller data */
+
+       /* state flags */
+       unsigned int     crpc_aborted:1; /* being given up */
+       unsigned int     crpc_closed:1;  /* completed */
+
+       /* RPC events */
+       srpc_event_t     crpc_bulkev;    /* bulk event */
+       srpc_event_t     crpc_reqstev;   /* request event */
+       srpc_event_t     crpc_replyev;   /* reply event */
+
+       /* bulk, request(reqst), and reply exchanged on wire */
+       srpc_msg_t         crpc_reqstmsg;
+       srpc_msg_t         crpc_replymsg;
+       lnet_handle_md_t     crpc_reqstmdh;
+       lnet_handle_md_t     crpc_replymdh;
+       srpc_bulk_t       crpc_bulk;
+} srpc_client_rpc_t;
+
+#define srpc_client_rpc_size(rpc)                                     \
+offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov])
+
+#define srpc_client_rpc_addref(rpc)                                 \
+do {                                                               \
+       CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n",                  \
+              (rpc), libcfs_id2str((rpc)->crpc_dest),            \
+              atomic_read(&(rpc)->crpc_refcount));              \
+       LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);            \
+       atomic_inc(&(rpc)->crpc_refcount);                        \
+} while (0)
+
+#define srpc_client_rpc_decref(rpc)                                 \
+do {                                                               \
+       CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n",                  \
+              (rpc), libcfs_id2str((rpc)->crpc_dest),            \
+              atomic_read(&(rpc)->crpc_refcount));              \
+       LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);            \
+       if (atomic_dec_and_test(&(rpc)->crpc_refcount))      \
+               srpc_destroy_client_rpc(rpc);                      \
+} while (0)
+
+#define srpc_event_pending(rpc)   ((rpc)->crpc_bulkev.ev_fired == 0 ||  \
+                                  (rpc)->crpc_reqstev.ev_fired == 0 || \
+                                  (rpc)->crpc_replyev.ev_fired == 0)
+
+/* CPU partition data of srpc service */
+struct srpc_service_cd {
+       /** serialize */
+       spinlock_t              scd_lock;
+       /** backref to service */
+       struct srpc_service     *scd_svc;
+       /** event buffer */
+       srpc_event_t            scd_ev;
+       /** free RPC descriptors */
+       struct list_head                scd_rpc_free;
+       /** in-flight RPCs */
+       struct list_head                scd_rpc_active;
+       /** workitem for posting buffer */
+       swi_workitem_t          scd_buf_wi;
+       /** CPT id */
+       int                     scd_cpt;
+       /** error code for scd_buf_wi */
+       int                     scd_buf_err;
+       /** timestamp for scd_buf_err */
+       unsigned long      scd_buf_err_stamp;
+       /** total # request buffers */
+       int                     scd_buf_total;
+       /** # posted request buffers */
+       int                     scd_buf_nposted;
+       /** in progress of buffer posting */
+       int                     scd_buf_posting;
+       /** allocate more buffers if scd_buf_nposted < scd_buf_low */
+       int                     scd_buf_low;
+       /** increase/decrease some buffers */
+       int                     scd_buf_adjust;
+       /** posted message buffers */
+       struct list_head                scd_buf_posted;
+       /** blocked for RPC descriptor */
+       struct list_head                scd_buf_blocked;
+};
+
+/* number of server workitems (mini-thread) for testing service */
+#define SFW_TEST_WI_MIN                256
+#define SFW_TEST_WI_MAX                2048
+/* extra buffers for tolerating buggy peers, or unbalanced number
+ * of peers between partitions  */
+#define SFW_TEST_WI_EXTRA      64
+
+/* number of server workitems (mini-thread) for framework service */
+#define SFW_FRWK_WI_MIN                16
+#define SFW_FRWK_WI_MAX                256
+
+typedef struct srpc_service {
+       int                     sv_id;          /* service id */
+       const char              *sv_name;       /* human readable name */
+       int                     sv_wi_total;    /* total server workitems */
+       int                     sv_shuttingdown;
+       int                     sv_ncpts;
+       /* percpt data for srpc_service */
+       struct srpc_service_cd  **sv_cpt_data;
+       /* Service callbacks:
+        * - sv_handler: process incoming RPC request
+        * - sv_bulk_ready: notify bulk data
+        */
+       int           (*sv_handler) (srpc_server_rpc_t *);
+       int           (*sv_bulk_ready) (srpc_server_rpc_t *, int);
+} srpc_service_t;
+
+typedef struct {
+       struct list_head        sn_list;    /* chain on fw_zombie_sessions */
+       lst_sid_t        sn_id;      /* unique identifier */
+       unsigned int      sn_timeout; /* # seconds' inactivity to expire */
+       int            sn_timer_active;
+       unsigned int      sn_features;
+       stt_timer_t       sn_timer;
+       struct list_head        sn_batches; /* list of batches */
+       char          sn_name[LST_NAME_SIZE];
+       atomic_t      sn_refcount;
+       atomic_t      sn_brw_errors;
+       atomic_t      sn_ping_errors;
+       cfs_time_t      sn_started;
+} sfw_session_t;
+
+#define sfw_sid_equal(sid0, sid1)     ((sid0).ses_nid == (sid1).ses_nid && \
+                                      (sid0).ses_stamp == (sid1).ses_stamp)
+
+typedef struct {
+       struct list_head        bat_list;      /* chain on sn_batches */
+       lst_bid_t        bat_id;        /* batch id */
+       int            bat_error;     /* error code of batch */
+       sfw_session_t    *bat_session;   /* batch's session */
+       atomic_t      bat_nactive;   /* # of active tests */
+       struct list_head        bat_tests;     /* test instances */
+} sfw_batch_t;
+
+typedef struct {
+       int  (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */
+       void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */
+       int  (*tso_prep_rpc)(struct sfw_test_unit *tsu,
+                            lnet_process_id_t dest,
+                            srpc_client_rpc_t **rpc);   /* prepare a test RPC */
+       void (*tso_done_rpc)(struct sfw_test_unit *tsu,
+                            srpc_client_rpc_t *rpc);    /* finish a test RPC */
+} sfw_test_client_ops_t;
+
+typedef struct sfw_test_instance {
+       struct list_head              tsi_list;  /* chain on batch */
+       int                  tsi_service;      /* test type */
+       sfw_batch_t         *tsi_batch; /* batch */
+       sfw_test_client_ops_t  *tsi_ops;          /* test client operations */
+
+       /* public parameter for all test units */
+       unsigned int            tsi_is_client:1;     /* is test client */
+       unsigned int            tsi_stoptsu_onerr:1; /* stop tsu on error */
+       int                  tsi_concur;          /* concurrency */
+       int                  tsi_loop;      /* loop count */
+
+       /* status of test instance */
+       spinlock_t              tsi_lock;         /* serialize */
+       unsigned int            tsi_stopping:1;   /* test is stopping */
+       atomic_t            tsi_nactive;      /* # of active test unit */
+       struct list_head              tsi_units;        /* test units */
+       struct list_head              tsi_free_rpcs;    /* free rpcs */
+       struct list_head              tsi_active_rpcs;  /* active rpcs */
+
+       union {
+               test_ping_req_t         ping;     /* ping parameter */
+               test_bulk_req_t         bulk_v0;  /* bulk parameter */
+               test_bulk_req_v1_t      bulk_v1;  /* bulk v1 parameter */
+       } tsi_u;
+} sfw_test_instance_t;
+
+/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_t)) bytes at
+ * the end of pages are not used */
+#define SFW_MAX_CONCUR     LST_MAX_CONCUR
+#define SFW_ID_PER_PAGE    (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t))
+#define SFW_MAX_NDESTS     (LNET_MAX_IOV * SFW_ID_PER_PAGE)
+#define sfw_id_pages(n)    (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE)
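+
+/* Illustrative numbers (assuming 4 KiB pages and a 12-byte packed id):
+ * SFW_ID_PER_PAGE == 341, so sfw_id_pages(1000) == 3 pages are enough to
+ * carry 1000 destination ids. */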
+
+typedef struct sfw_test_unit {
+       struct list_head            tsu_list;    /* chain on lst_test_instance */
+       lnet_process_id_t     tsu_dest;  /* id of dest node */
+       int                tsu_loop;     /* loop count of the test */
+       sfw_test_instance_t  *tsu_instance;     /* pointer to test instance */
+       void             *tsu_private;      /* private data */
+       swi_workitem_t  tsu_worker;       /* workitem of the test unit */
+} sfw_test_unit_t;
+
+typedef struct sfw_test_case {
+       struct list_head              tsc_list;  /* chain on fw_tests */
+       srpc_service_t   *tsc_srv_service;  /* test service */
+       sfw_test_client_ops_t  *tsc_cli_ops;      /* ops of test client */
+} sfw_test_case_t;
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+              unsigned features, int nbulkiov, int bulklen,
+              void (*done) (srpc_client_rpc_t *), void *priv);
+int sfw_create_test_rpc(sfw_test_unit_t *tsu,
+                       lnet_process_id_t peer, unsigned features,
+                       int nblk, int blklen, srpc_client_rpc_t **rpc);
+void sfw_abort_rpc(srpc_client_rpc_t *rpc);
+void sfw_post_rpc(srpc_client_rpc_t *rpc);
+void sfw_client_rpc_done(srpc_client_rpc_t *rpc);
+void sfw_unpack_message(srpc_msg_t *msg);
+void sfw_free_pages(srpc_server_rpc_t *rpc);
+void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i);
+int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len,
+                   int sink);
+int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply);
+
+srpc_client_rpc_t *
+srpc_create_client_rpc(lnet_process_id_t peer, int service,
+                      int nbulkiov, int bulklen,
+                      void (*rpc_done)(srpc_client_rpc_t *),
+                      void (*rpc_fini)(srpc_client_rpc_t *), void *priv);
+void srpc_post_rpc(srpc_client_rpc_t *rpc);
+void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why);
+void srpc_free_bulk(srpc_bulk_t *bk);
+srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len,
+                            int sink);
+int srpc_send_rpc(swi_workitem_t *wi);
+int srpc_send_reply(srpc_server_rpc_t *rpc);
+int srpc_add_service(srpc_service_t *sv);
+int srpc_remove_service(srpc_service_t *sv);
+void srpc_shutdown_service(srpc_service_t *sv);
+void srpc_abort_service(srpc_service_t *sv);
+int srpc_finish_service(srpc_service_t *sv);
+int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_get_counters(srpc_counters_t *cnt);
+void srpc_set_counters(const srpc_counters_t *cnt);
+
+extern struct cfs_wi_sched *lst_sched_serial;
+extern struct cfs_wi_sched **lst_sched_test;
+
+static inline int
+srpc_serv_is_framework(struct srpc_service *svc)
+{
+       return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID;
+}
+
+static inline int
+swi_wi_action(cfs_workitem_t *wi)
+{
+       swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem);
+
+       return swi->swi_action(swi);
+}
+
+static inline void
+swi_init_workitem(swi_workitem_t *swi, void *data,
+                 swi_action_t action, struct cfs_wi_sched *sched)
+{
+       swi->swi_sched  = sched;
+       swi->swi_action = action;
+       swi->swi_state  = SWI_STATE_NEWBORN;
+       cfs_wi_init(&swi->swi_workitem, data, swi_wi_action);
+}
+
+static inline void
+swi_schedule_workitem(swi_workitem_t *wi)
+{
+       cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem);
+}
+
+static inline void
+swi_exit_workitem(swi_workitem_t *swi)
+{
+       cfs_wi_exit(swi->swi_sched, &swi->swi_workitem);
+}
+
+static inline int
+swi_deschedule_workitem(swi_workitem_t *swi)
+{
+       return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem);
+}
+
+
+int sfw_startup(void);
+int srpc_startup(void);
+void sfw_shutdown(void);
+void srpc_shutdown(void);
+
+static inline void
+srpc_destroy_client_rpc (srpc_client_rpc_t *rpc)
+{
+       LASSERT (rpc != NULL);
+       LASSERT (!srpc_event_pending(rpc));
+       LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+       if (rpc->crpc_fini == NULL) {
+               LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+       } else {
+               (*rpc->crpc_fini) (rpc);
+       }
+
+       return;
+}
+
+static inline void
+srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer,
+                     int service, int nbulkiov, int bulklen,
+                     void (*rpc_done)(srpc_client_rpc_t *),
+                     void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+       LASSERT (nbulkiov <= LNET_MAX_IOV);
+
+       memset(rpc, 0, offsetof(srpc_client_rpc_t,
+                               crpc_bulk.bk_iovs[nbulkiov]));
+
+       INIT_LIST_HEAD(&rpc->crpc_list);
+       swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc,
+                         lst_sched_test[lnet_cpt_of_nid(peer.nid)]);
+       spin_lock_init(&rpc->crpc_lock);
+       atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */
+
+       rpc->crpc_dest   = peer;
+       rpc->crpc_priv   = priv;
+       rpc->crpc_service      = service;
+       rpc->crpc_bulk.bk_len  = bulklen;
+       rpc->crpc_bulk.bk_niov = nbulkiov;
+       rpc->crpc_done   = rpc_done;
+       rpc->crpc_fini   = rpc_fini;
+       LNetInvalidateHandle(&rpc->crpc_reqstmdh);
+       LNetInvalidateHandle(&rpc->crpc_replymdh);
+       LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh);
+
+       /* no event is expected at this point */
+       rpc->crpc_bulkev.ev_fired  =
+       rpc->crpc_reqstev.ev_fired =
+       rpc->crpc_replyev.ev_fired = 1;
+
+       rpc->crpc_reqstmsg.msg_magic   = SRPC_MSG_MAGIC;
+       rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION;
+       rpc->crpc_reqstmsg.msg_type    = srpc_service2request(service);
+       return;
+}
+
+static inline const char *
+swi_state2str (int state)
+{
+#define STATE2STR(x) case x: return #x
+       switch(state) {
+               default:
+                       LBUG();
+               STATE2STR(SWI_STATE_NEWBORN);
+               STATE2STR(SWI_STATE_REPLY_SUBMITTED);
+               STATE2STR(SWI_STATE_REPLY_SENT);
+               STATE2STR(SWI_STATE_REQUEST_SUBMITTED);
+               STATE2STR(SWI_STATE_REQUEST_SENT);
+               STATE2STR(SWI_STATE_REPLY_RECEIVED);
+               STATE2STR(SWI_STATE_BULK_STARTED);
+               STATE2STR(SWI_STATE_DONE);
+       }
+#undef STATE2STR
+}
+
+#define UNUSED(x)       ( (void)(x) )
+
+
+#define selftest_wait_events() cfs_pause(cfs_time_seconds(1) / 10)
+
+
+#define lst_wait_until(cond, lock, fmt, ...)                           \
+do {                                                                   \
+       int __I = 2;                                                    \
+       while (!(cond)) {                                               \
+               CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET,               \
+                      fmt, ## __VA_ARGS__);                            \
+               spin_unlock(&(lock));                                   \
+                                                                       \
+               selftest_wait_events();                                 \
+                                                                       \
+               spin_lock(&(lock));                                     \
+       }                                                               \
+} while (0)
+
+static inline void
+srpc_wait_service_shutdown(srpc_service_t *sv)
+{
+       int i = 2;
+
+       LASSERT(sv->sv_shuttingdown);
+
+       while (srpc_finish_service(sv) == 0) {
+               i++;
+               CDEBUG (((i & -i) == i) ? D_WARNING : D_NET,
+                       "Waiting for %s service to shutdown...\n",
+                       sv->sv_name);
+               selftest_wait_events();
+       }
+}
+
+#endif /* __SELFTEST_SELFTEST_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/timer.c b/drivers/staging/lustre/lnet/selftest/timer.c
new file mode 100644 (file)
index 0000000..2c07855
--- /dev/null
@@ -0,0 +1,253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+
+/*
+ * Timers are implemented as a sorted queue of expiry times. The queue
+ * is slotted, with each slot holding timers which expire in a
+ * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are
+ * sorted by increasing expiry time. The number of slots is 2**7 (128),
+ * to cover a time period of 1024 seconds into the future before wrapping.
+ */
+#define STTIMER_MINPOLL        3   /* log2 min poll interval (8 s) */
+#define STTIMER_SLOTTIME       (1 << STTIMER_MINPOLL)
+#define STTIMER_SLOTTIMEMASK   (~(STTIMER_SLOTTIME - 1))
+#define STTIMER_NSLOTS        (1 << 7)
+#define STTIMER_SLOT(t)               (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \
+                                                   (STTIMER_NSLOTS - 1))])
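+
+/* Worked example (illustrative): a timer expiring at t = 1000003 s hashes to
+ * slot ((1000003 >> 3) & 127) == 72; a timer due 1024 s later hashes to the
+ * same slot, which is why stt_expire_list() also compares expiries with now. */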
+
+struct st_timer_data {
+       spinlock_t       stt_lock;
+       /* start time of the slot processed previously */
+       cfs_time_t       stt_prev_slot;
+       struct list_head       stt_hash[STTIMER_NSLOTS];
+       int           stt_shuttingdown;
+       wait_queue_head_t      stt_waitq;
+       int           stt_nthreads;
+} stt_data;
+
+void
+stt_add_timer(stt_timer_t *timer)
+{
+       struct list_head *pos;
+
+       spin_lock(&stt_data.stt_lock);
+
+       LASSERT (stt_data.stt_nthreads > 0);
+       LASSERT (!stt_data.stt_shuttingdown);
+       LASSERT (timer->stt_func != NULL);
+       LASSERT (list_empty(&timer->stt_list));
+       LASSERT (cfs_time_after(timer->stt_expires, cfs_time_current_sec()));
+
+       /* a simple insertion sort */
+       list_for_each_prev (pos, STTIMER_SLOT(timer->stt_expires)) {
+               stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list);
+
+               if (cfs_time_aftereq(timer->stt_expires, old->stt_expires))
+                       break;
+       }
+       list_add(&timer->stt_list, pos);
+
+       spin_unlock(&stt_data.stt_lock);
+}
+
+/*
+ * The function returns whether it has deactivated a pending timer or not.
+ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ *
+ * CAVEAT EMPTOR:
+ * When 0 is returned, it is possible that timer->stt_func _is_ running on
+ * another CPU.
+ */
+int
+stt_del_timer (stt_timer_t *timer)
+{
+       int ret = 0;
+
+       spin_lock(&stt_data.stt_lock);
+
+       LASSERT (stt_data.stt_nthreads > 0);
+       LASSERT (!stt_data.stt_shuttingdown);
+
+       if (!list_empty(&timer->stt_list)) {
+               ret = 1;
+               list_del_init(&timer->stt_list);
+       }
+
+       spin_unlock(&stt_data.stt_lock);
+       return ret;
+}
+
+/* called with stt_data.stt_lock held */
+int
+stt_expire_list (struct list_head *slot, cfs_time_t now)
+{
+       int       expired = 0;
+       stt_timer_t *timer;
+
+       while (!list_empty(slot)) {
+               timer = list_entry(slot->next, stt_timer_t, stt_list);
+
+               if (cfs_time_after(timer->stt_expires, now))
+                       break;
+
+               list_del_init(&timer->stt_list);
+               spin_unlock(&stt_data.stt_lock);
+
+               expired++;
+               (*timer->stt_func) (timer->stt_data);
+
+               spin_lock(&stt_data.stt_lock);
+       }
+
+       return expired;
+}
+
+int
+stt_check_timers (cfs_time_t *last)
+{
+       int     expired = 0;
+       cfs_time_t now;
+       cfs_time_t this_slot;
+
+       now = cfs_time_current_sec();
+       this_slot = now & STTIMER_SLOTTIMEMASK;
+
+       spin_lock(&stt_data.stt_lock);
+
+       while (cfs_time_aftereq(this_slot, *last)) {
+               expired += stt_expire_list(STTIMER_SLOT(this_slot), now);
+               this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME);
+       }
+
+       *last = now & STTIMER_SLOTTIMEMASK;
+       spin_unlock(&stt_data.stt_lock);
+       return expired;
+}
+
+
+int
+stt_timer_main (void *arg)
+{
+       int rc = 0;
+       UNUSED(arg);
+
+       SET_BUT_UNUSED(rc);
+
+       cfs_block_allsigs();
+
+       while (!stt_data.stt_shuttingdown) {
+               stt_check_timers(&stt_data.stt_prev_slot);
+
+               rc = wait_event_timeout(stt_data.stt_waitq,
+                                       stt_data.stt_shuttingdown,
+                                       cfs_time_seconds(STTIMER_SLOTTIME));
+       }
+
+       spin_lock(&stt_data.stt_lock);
+       stt_data.stt_nthreads--;
+       spin_unlock(&stt_data.stt_lock);
+       return 0;
+}
+
+int
+stt_start_timer_thread (void)
+{
+       task_t *task;
+
+       LASSERT(!stt_data.stt_shuttingdown);
+
+       task = kthread_run(stt_timer_main, NULL, "st_timer");
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       spin_lock(&stt_data.stt_lock);
+       stt_data.stt_nthreads++;
+       spin_unlock(&stt_data.stt_lock);
+       return 0;
+}
+
+
+int
+stt_startup (void)
+{
+       int rc = 0;
+       int i;
+
+       stt_data.stt_shuttingdown = 0;
+       stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK;
+
+       spin_lock_init(&stt_data.stt_lock);
+       for (i = 0; i < STTIMER_NSLOTS; i++)
+               INIT_LIST_HEAD(&stt_data.stt_hash[i]);
+
+       stt_data.stt_nthreads = 0;
+       init_waitqueue_head(&stt_data.stt_waitq);
+       rc = stt_start_timer_thread();
+       if (rc != 0)
+               CERROR ("Can't spawn timer thread: %d\n", rc);
+
+       return rc;
+}
+
+void
+stt_shutdown (void)
+{
+       int i;
+
+       spin_lock(&stt_data.stt_lock);
+
+       for (i = 0; i < STTIMER_NSLOTS; i++)
+               LASSERT (list_empty(&stt_data.stt_hash[i]));
+
+       stt_data.stt_shuttingdown = 1;
+
+       wake_up(&stt_data.stt_waitq);
+       lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock,
+                      "waiting for %d threads to terminate\n",
+                      stt_data.stt_nthreads);
+
+       spin_unlock(&stt_data.stt_lock);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/timer.h b/drivers/staging/lustre/lnet/selftest/timer.h
new file mode 100644 (file)
index 0000000..56dbfe5
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_TIMER_H__
+#define __SELFTEST_TIMER_H__
+
+typedef struct {
+       struct list_head        stt_list;
+       cfs_time_t      stt_expires;
+       void        (*stt_func) (void *);
+       void         *stt_data;
+} stt_timer_t;
+
+void stt_add_timer (stt_timer_t *timer);
+int stt_del_timer (stt_timer_t *timer);
+int stt_startup (void);
+void stt_shutdown (void);
+
+#endif /* __SELFTEST_TIMER_H__ */
diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig
new file mode 100644 (file)
index 0000000..d0a0e08
--- /dev/null
@@ -0,0 +1,33 @@
+config LUSTRE_FS
+       tristate "Lustre file system client support"
+       depends on STAGING && INET && BROKEN
+       select LNET
+       help
+         This option enables Lustre file system client support. Choose Y
+         here if you want to access a Lustre file system cluster. To compile
+         this file system support as a module, choose M here: the module will
+         be called lustre.
+
+         To mount Lustre file systems, you also need to install the user space
+         mount.lustre utility and other user space commands, found in the
+         lustre-client package, available from
+         http://downloads.whamcloud.com/public/lustre/
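+
+         A typical client mount then looks like (illustrative placeholders):
+           mount -t lustre mgsnode@tcp0:/fsname /mnt/lustre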
+
+         The Lustre file system is the most widely used cluster file system in
+         high performance computing. Source code for both the kernel space and
+         user space Lustre components can also be found at
+         http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary
+
+         If unsure, say N.
+
+         See also http://wiki.lustre.org/
+
+config LUSTRE_OBD_MAX_IOCTL_BUFFER
+       int "Lustre obd max ioctl buffer bytes (default 8KB)"
+       depends on LUSTRE_FS
+       default 8192
+       help
+         This option defines the maximum size, in bytes, of the buffer that user
+         space applications can pass to the Lustre kernel module through the
+         ioctl interface.
+
+         If unsure, use default.
diff --git a/drivers/staging/lustre/lustre/Makefile b/drivers/staging/lustre/lustre/Makefile
new file mode 100644 (file)
index 0000000..3fb94fc
--- /dev/null
@@ -0,0 +1,2 @@
+obj-$(CONFIG_LUSTRE_FS) := fid/ lvfs/ obdclass/ ptlrpc/ obdecho/ mgc/ lov/ \
+                          osc/ mdc/ lmv/ llite/ fld/ libcfs/
diff --git a/drivers/staging/lustre/lustre/fid/Makefile b/drivers/staging/lustre/lustre/fid/Makefile
new file mode 100644 (file)
index 0000000..b8d6d21
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += fid.o
+fid-y := fid_handler.o fid_store.o fid_request.o lproc_fid.o fid_lib.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/fid/fid_handler.c b/drivers/staging/lustre/lustre/fid/fid_handler.c
new file mode 100644 (file)
index 0000000..bbbb3cf
--- /dev/null
@@ -0,0 +1,661 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_handler.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+int client_fid_init(struct obd_device *obd,
+                   struct obd_export *exp, enum lu_cli_type type)
+{
+       struct client_obd *cli = &obd->u.cli;
+       char *prefix;
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(cli->cl_seq);
+       if (cli->cl_seq == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
+       if (prefix == NULL)
+               GOTO(out_free_seq, rc = -ENOMEM);
+
+       snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name);
+
+       /* Init client side sequence-manager */
+       rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL);
+       OBD_FREE(prefix, MAX_OBD_NAME + 5);
+       if (rc)
+               GOTO(out_free_seq, rc);
+
+       RETURN(rc);
+out_free_seq:
+       OBD_FREE_PTR(cli->cl_seq);
+       cli->cl_seq = NULL;
+       return rc;
+}
+EXPORT_SYMBOL(client_fid_init);
+
+int client_fid_fini(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       ENTRY;
+
+       if (cli->cl_seq != NULL) {
+               seq_client_fini(cli->cl_seq);
+               OBD_FREE_PTR(cli->cl_seq);
+               cli->cl_seq = NULL;
+       }
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(client_fid_fini);
+
+static void seq_server_proc_fini(struct lu_server_seq *seq);
+
+/* Assigns client to sequence controller node. */
+int seq_server_set_cli(struct lu_server_seq *seq,
+                      struct lu_client_seq *cli,
+                      const struct lu_env *env)
+{
+       int rc = 0;
+       ENTRY;
+
+       /*
+        * Asking the client for a new range, assigning that range to
+        * ->seq_space and writing the seq state to the backing store
+        * must be done atomically.
+        */
+       mutex_lock(&seq->lss_mutex);
+
+       if (cli == NULL) {
+               CDEBUG(D_INFO, "%s: Detached sequence client\n",
+                      seq->lss_name);
+               seq->lss_cli = cli;
+               GOTO(out_up, rc = 0);
+       }
+
+       if (seq->lss_cli != NULL) {
+               CDEBUG(D_HA, "%s: Sequence controller is already "
+                      "assigned\n", seq->lss_name);
+               GOTO(out_up, rc = -EEXIST);
+       }
+
+       CDEBUG(D_INFO, "%s: Attached sequence controller %s\n",
+              seq->lss_name, cli->lcs_name);
+
+       seq->lss_cli = cli;
+       cli->lcs_space.lsr_index = seq->lss_site->ss_node_id;
+       EXIT;
+out_up:
+       mutex_unlock(&seq->lss_mutex);
+       return rc;
+}
+EXPORT_SYMBOL(seq_server_set_cli);
+/*
+ * Allocate \a width units of sequence from range \a from into \a to.
+ */
+static inline void range_alloc(struct lu_seq_range *to,
+                              struct lu_seq_range *from,
+                              __u64 width)
+{
+       width = min(range_space(from), width);
+       to->lsr_start = from->lsr_start;
+       to->lsr_end = from->lsr_start + width;
+       from->lsr_start += width;
+}
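+
+/*
+ * Worked example (illustrative): with *from == [0x1000, 0x2000) and
+ * width == 0x100, range_alloc() produces *to == [0x1000, 0x1100) and
+ * leaves *from == [0x1100, 0x2000).
+ */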
+
+/**
+ * On the controller node, allocate a new super sequence for a regular
+ * sequence server. As the super sequence controller, this node is supposed
+ * to maintain the FLD and update the index.
+ * \a out range always carries the correct MDS node number of the requester.
+ */
+
+static int __seq_server_alloc_super(struct lu_server_seq *seq,
+                                   struct lu_seq_range *out,
+                                   const struct lu_env *env)
+{
+       struct lu_seq_range *space = &seq->lss_space;
+       int rc;
+       ENTRY;
+
+       LASSERT(range_is_sane(space));
+
+       if (range_is_exhausted(space)) {
+               CERROR("%s: Sequences space is exhausted\n",
+                      seq->lss_name);
+               RETURN(-ENOSPC);
+       } else {
+               range_alloc(out, space, seq->lss_width);
+       }
+
+       rc = seq_store_update(env, seq, out, 1 /* sync */);
+
+       LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n",
+                     seq->lss_name, rc, PRANGE(out));
+
+       RETURN(rc);
+}
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+                          struct lu_seq_range *out,
+                          const struct lu_env *env)
+{
+       int rc;
+       ENTRY;
+
+       mutex_lock(&seq->lss_mutex);
+       rc = __seq_server_alloc_super(seq, out, env);
+       mutex_unlock(&seq->lss_mutex);
+
+       RETURN(rc);
+}
+
+static int __seq_set_init(const struct lu_env *env,
+                           struct lu_server_seq *seq)
+{
+       struct lu_seq_range *space = &seq->lss_space;
+       int rc;
+
+       range_alloc(&seq->lss_lowater_set, space, seq->lss_set_width);
+       range_alloc(&seq->lss_hiwater_set, space, seq->lss_set_width);
+
+       rc = seq_store_update(env, seq, NULL, 1);
+
+       return rc;
+}
+
+/*
+ * This function implements the new seq allocation algorithm using async
+ * updates to the seq file on disk. See bug 18857 for details.
+ * The following variables keep track of this process:
+ *
+ * lss_space       - available sequence space
+ * lss_lowater_set - lu_seq_range of all seqs before the barrier, i.e. safe
+ *                   to use
+ * lss_hiwater_set - lu_seq_range after the barrier, i.e. allocated but
+ *                   possibly not yet committed
+ *
+ * When lss_lowater_set is exhausted it is replaced with the hiwater one and
+ * a write operation is initiated to allocate a new hiwater range.
+ * If the last seq write operation is still not committed, the current
+ * operation is flagged as a sync write op.
+ */
+static int range_alloc_set(const struct lu_env *env,
+                           struct lu_seq_range *out,
+                           struct lu_server_seq *seq)
+{
+       struct lu_seq_range *space = &seq->lss_space;
+       struct lu_seq_range *loset = &seq->lss_lowater_set;
+       struct lu_seq_range *hiset = &seq->lss_hiwater_set;
+       int rc = 0;
+
+       if (range_is_zero(loset))
+               __seq_set_init(env, seq);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */
+               loset->lsr_start = loset->lsr_end;
+
+       if (range_is_exhausted(loset)) {
+               /* reached high water mark. */
+               struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev;
+               int obd_num_clients = dev->ld_obd->obd_num_exports;
+               __u64 set_sz;
+
+               /* calculate new seq width based on number of clients */
+               set_sz = max(seq->lss_set_width,
+                            obd_num_clients * seq->lss_width);
+               set_sz = min(range_space(space), set_sz);
+
+               /* Switch to hiwater range now */
+               *loset = *hiset;
+               /* allocate new hiwater range */
+               range_alloc(hiset, space, set_sz);
+
+               /* update ondisk seq with new *space */
+               rc = seq_store_update(env, seq, NULL, seq->lss_need_sync);
+       }
+
+       LASSERTF(!range_is_exhausted(loset) || range_is_sane(loset),
+                DRANGE"\n", PRANGE(loset));
+
+       if (rc == 0)
+               range_alloc(out, loset, seq->lss_width);
+
+       RETURN(rc);
+}
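
Editor's illustration (not part of the patch): a stand-alone user-space sketch of the two-range scheme implemented by range_alloc() and range_alloc_set() above. The struct and widths below are simplifications chosen for the example, not the Lustre definitions.

    /* carve() models range_alloc(): take up to `width` sequences off the
     * front of `from` and hand them out as `to`. */
    #include <stdio.h>

    struct r { unsigned long long s, e; };              /* [s, e) */

    static void carve(struct r *to, struct r *from, unsigned long long width)
    {
            if (width > from->e - from->s)
                    width = from->e - from->s;          /* clamp to what is left */
            to->s = from->s;
            to->e = from->s + width;
            from->s += width;
    }

    int main(void)
    {
            struct r space = { 0x1000, 0x9000 };        /* models lss_space      */
            struct r lo = { 0, 0 }, hi = { 0, 0 }, out;
            unsigned long long set_width = 0x1000, seq_width = 0x400;
            int i;

            carve(&lo, &space, set_width);              /* models __seq_set_init */
            carve(&hi, &space, set_width);

            for (i = 0; i < 6; i++) {
                    if (lo.s == lo.e) {                 /* low water exhausted   */
                            lo = hi;                    /* switch to hiwater     */
                            carve(&hi, &space, set_width);
                            /* the real code calls seq_store_update() here,
                             * sync if the previous update is uncommitted */
                    }
                    carve(&out, &lo, seq_width);        /* models range_alloc()  */
                    printf("meta range [%llx, %llx)\n", out.s, out.e);
            }
            return 0;
    }
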
+
+static int __seq_server_alloc_meta(struct lu_server_seq *seq,
+                                  struct lu_seq_range *out,
+                                  const struct lu_env *env)
+{
+       struct lu_seq_range *space = &seq->lss_space;
+       int rc = 0;
+
+       ENTRY;
+
+       LASSERT(range_is_sane(space));
+
+       /* Check if available space ends and allocate new super seq */
+       if (range_is_exhausted(space)) {
+               if (!seq->lss_cli) {
+                       CERROR("%s: No sequence controller is attached.\n",
+                              seq->lss_name);
+                       RETURN(-ENODEV);
+               }
+
+               rc = seq_client_alloc_super(seq->lss_cli, env);
+               if (rc) {
+                       CERROR("%s: Can't allocate super-sequence, rc %d\n",
+                              seq->lss_name, rc);
+                       RETURN(rc);
+               }
+
+               /* Saving new range to allocation space. */
+               *space = seq->lss_cli->lcs_space;
+               LASSERT(range_is_sane(space));
+       }
+
+       rc = range_alloc_set(env, out, seq);
+       if (rc != 0) {
+               CERROR("%s: Allocating meta-sequence failed: rc = %d\n",
+                       seq->lss_name, rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n",
+               seq->lss_name, PRANGE(out));
+
+       RETURN(rc);
+}
+
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+                         struct lu_seq_range *out,
+                         const struct lu_env *env)
+{
+       int rc;
+       ENTRY;
+
+       mutex_lock(&seq->lss_mutex);
+       rc = __seq_server_alloc_meta(seq, out, env);
+       mutex_unlock(&seq->lss_mutex);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(seq_server_alloc_meta);
+
+static int seq_server_handle(struct lu_site *site,
+                            const struct lu_env *env,
+                            __u32 opc, struct lu_seq_range *out)
+{
+       int rc;
+       struct seq_server_site *ss_site;
+       ENTRY;
+
+       ss_site = lu_site2seq(site);
+
+       switch (opc) {
+       case SEQ_ALLOC_META:
+               if (!ss_site->ss_server_seq) {
+                       CERROR("Sequence server is not "
+                              "initialized\n");
+                       RETURN(-EINVAL);
+               }
+               rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env);
+               break;
+       case SEQ_ALLOC_SUPER:
+               if (!ss_site->ss_control_seq) {
+                       CERROR("Sequence controller is not "
+                              "initialized\n");
+                       RETURN(-EINVAL);
+               }
+               rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env);
+               break;
+       default:
+               rc = -EINVAL;
+               break;
+       }
+
+       RETURN(rc);
+}
+
+static int seq_req_handle(struct ptlrpc_request *req,
+                         const struct lu_env *env,
+                         struct seq_thread_info *info)
+{
+       struct lu_seq_range *out, *tmp;
+       struct lu_site *site;
+       int rc = -EPROTO;
+       __u32 *opc;
+       ENTRY;
+
+       LASSERT(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY));
+       site = req->rq_export->exp_obd->obd_lu_dev->ld_site;
+       LASSERT(site != NULL);
+
+       rc = req_capsule_server_pack(info->sti_pill);
+       if (rc)
+               RETURN(err_serious(rc));
+
+       opc = req_capsule_client_get(info->sti_pill, &RMF_SEQ_OPC);
+       if (opc != NULL) {
+               out = req_capsule_server_get(info->sti_pill, &RMF_SEQ_RANGE);
+               if (out == NULL)
+                       RETURN(err_serious(-EPROTO));
+
+               tmp = req_capsule_client_get(info->sti_pill, &RMF_SEQ_RANGE);
+
+               /* The seq client passed the MDT id; we need to pass it back
+                * using the out range parameter. */
+
+               out->lsr_index = tmp->lsr_index;
+               out->lsr_flags = tmp->lsr_flags;
+               rc = seq_server_handle(site, env, *opc, out);
+       } else
+               rc = err_serious(-EPROTO);
+
+       RETURN(rc);
+}
+
+/* context key constructor/destructor: seq_key_init, seq_key_fini */
+LU_KEY_INIT_FINI(seq, struct seq_thread_info);
+
+/* context key: seq_thread_key */
+LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD);
+
+static void seq_thread_info_init(struct ptlrpc_request *req,
+                                struct seq_thread_info *info)
+{
+       info->sti_pill = &req->rq_pill;
+       /* Init request capsule */
+       req_capsule_init(info->sti_pill, req, RCL_SERVER);
+       req_capsule_set(info->sti_pill, &RQF_SEQ_QUERY);
+}
+
+static void seq_thread_info_fini(struct seq_thread_info *info)
+{
+       req_capsule_fini(info->sti_pill);
+}
+
+int seq_handle(struct ptlrpc_request *req)
+{
+       const struct lu_env *env;
+       struct seq_thread_info *info;
+       int rc;
+
+       env = req->rq_svc_thread->t_env;
+       LASSERT(env != NULL);
+
+       info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+       LASSERT(info != NULL);
+
+       seq_thread_info_init(req, info);
+       rc = seq_req_handle(req, env, info);
+       /* XXX: we don't need replay, but the MDT assigns a transno in any
+        * case, so remove it manually before replying. */
+       lustre_msg_set_transno(req->rq_repmsg, 0);
+       seq_thread_info_fini(info);
+
+       return rc;
+}
+EXPORT_SYMBOL(seq_handle);
+
+/*
+ * Entry point for handling SEQ RPCs called from the MDT.
+ */
+int seq_query(struct com_thread_info *info)
+{
+       return seq_handle(info->cti_pill->rc_req);
+}
+EXPORT_SYMBOL(seq_query);
+
+
+#ifdef LPROCFS
+static int seq_server_proc_init(struct lu_server_seq *seq)
+{
+       int rc;
+       ENTRY;
+
+       seq->lss_proc_dir = lprocfs_register(seq->lss_name,
+                                            seq_type_proc_dir,
+                                            NULL, NULL);
+       if (IS_ERR(seq->lss_proc_dir)) {
+               rc = PTR_ERR(seq->lss_proc_dir);
+               RETURN(rc);
+       }
+
+       rc = lprocfs_add_vars(seq->lss_proc_dir,
+                             seq_server_proc_list, seq);
+       if (rc) {
+               CERROR("%s: Can't init sequence manager "
+                      "proc, rc %d\n", seq->lss_name, rc);
+               GOTO(out_cleanup, rc);
+       }
+
+       RETURN(0);
+
+out_cleanup:
+       seq_server_proc_fini(seq);
+       return rc;
+}
+
+static void seq_server_proc_fini(struct lu_server_seq *seq)
+{
+       ENTRY;
+       if (seq->lss_proc_dir != NULL) {
+               if (!IS_ERR(seq->lss_proc_dir))
+                       lprocfs_remove(&seq->lss_proc_dir);
+               seq->lss_proc_dir = NULL;
+       }
+       EXIT;
+}
+#else
+static int seq_server_proc_init(struct lu_server_seq *seq)
+{
+       return 0;
+}
+
+static void seq_server_proc_fini(struct lu_server_seq *seq)
+{
+       return;
+}
+#endif
+
+
+int seq_server_init(struct lu_server_seq *seq,
+                   struct dt_device *dev,
+                   const char *prefix,
+                   enum lu_mgr_type type,
+                   struct seq_server_site *ss,
+                   const struct lu_env *env)
+{
+       int rc, is_srv = (type == LUSTRE_SEQ_SERVER);
+       ENTRY;
+
+       LASSERT(dev != NULL);
+       LASSERT(prefix != NULL);
+       LASSERT(ss != NULL);
+       LASSERT(ss->ss_lu != NULL);
+
+       seq->lss_cli = NULL;
+       seq->lss_type = type;
+       seq->lss_site = ss;
+       range_init(&seq->lss_space);
+
+       range_init(&seq->lss_lowater_set);
+       range_init(&seq->lss_hiwater_set);
+       seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH;
+
+       mutex_init(&seq->lss_mutex);
+
+       seq->lss_width = is_srv ?
+               LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH;
+
+       snprintf(seq->lss_name, sizeof(seq->lss_name),
+                "%s-%s", (is_srv ? "srv" : "ctl"), prefix);
+
+       rc = seq_store_init(seq, env, dev);
+       if (rc)
+               GOTO(out, rc);
+       /* Read saved sequence info from the backing store. */
+       rc = seq_store_read(seq, env);
+       if (rc == -ENODATA) {
+               /* Nothing was read; initialize with default values. */
+               seq->lss_space = is_srv ?
+                       LUSTRE_SEQ_ZERO_RANGE:
+                       LUSTRE_SEQ_SPACE_RANGE;
+
+               LASSERT(ss != NULL);
+               seq->lss_space.lsr_index = ss->ss_node_id;
+               LCONSOLE_INFO("%s: No data found on store, initializing space\n",
+                             seq->lss_name);
+
+               rc = seq_store_update(env, seq, NULL, 0);
+               if (rc) {
+                       CERROR("%s: Can't write space data, "
+                              "rc %d\n", seq->lss_name, rc);
+               }
+       } else if (rc) {
+               CERROR("%s: Can't read space data, rc %d\n",
+                      seq->lss_name, rc);
+               GOTO(out, rc);
+       }
+
+       if (is_srv) {
+               LASSERT(range_is_sane(&seq->lss_space));
+       } else {
+               LASSERT(!range_is_zero(&seq->lss_space) &&
+                       range_is_sane(&seq->lss_space));
+       }
+
+       rc  = seq_server_proc_init(seq);
+       if (rc)
+               GOTO(out, rc);
+
+       EXIT;
+out:
+       if (rc)
+               seq_server_fini(seq, env);
+       return rc;
+}
+EXPORT_SYMBOL(seq_server_init);
+
+void seq_server_fini(struct lu_server_seq *seq,
+                    const struct lu_env *env)
+{
+       ENTRY;
+
+       seq_server_proc_fini(seq);
+       seq_store_fini(seq, env);
+
+       EXIT;
+}
+EXPORT_SYMBOL(seq_server_fini);
+
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss)
+{
+       if (ss == NULL)
+               RETURN(0);
+
+       if (ss->ss_server_seq) {
+               seq_server_fini(ss->ss_server_seq, env);
+               OBD_FREE_PTR(ss->ss_server_seq);
+               ss->ss_server_seq = NULL;
+       }
+
+       if (ss->ss_control_seq) {
+               seq_server_fini(ss->ss_control_seq, env);
+               OBD_FREE_PTR(ss->ss_control_seq);
+               ss->ss_control_seq = NULL;
+       }
+
+       if (ss->ss_client_seq) {
+               seq_client_fini(ss->ss_client_seq);
+               OBD_FREE_PTR(ss->ss_client_seq);
+               ss->ss_client_seq = NULL;
+       }
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(seq_site_fini);
+
+proc_dir_entry_t *seq_type_proc_dir = NULL;
+
+static int __init fid_mod_init(void)
+{
+       seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME,
+                                            proc_lustre_root,
+                                            NULL, NULL);
+       if (IS_ERR(seq_type_proc_dir))
+               return PTR_ERR(seq_type_proc_dir);
+
+       LU_CONTEXT_KEY_INIT(&seq_thread_key);
+       lu_context_key_register(&seq_thread_key);
+       return 0;
+}
+
+static void __exit fid_mod_exit(void)
+{
+       lu_context_key_degister(&seq_thread_key);
+       if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) {
+               lprocfs_remove(&seq_type_proc_dir);
+               seq_type_proc_dir = NULL;
+       }
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FID Module");
+MODULE_LICENSE("GPL");
+
+cfs_module(fid, "0.1.0", fid_mod_init, fid_mod_exit);
diff --git a/drivers/staging/lustre/lustre/fid/fid_internal.h b/drivers/staging/lustre/lustre/fid/fid_internal.h
new file mode 100644 (file)
index 0000000..c3a94f4
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+#ifndef __FID_INTERNAL_H
+#define __FID_INTERNAL_H
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct seq_thread_info {
+       struct req_capsule     *sti_pill;
+       struct lu_seq_range     sti_space;
+       struct lu_buf      sti_buf;
+};
+
+enum {
+       SEQ_TXN_STORE_CREDITS = 20
+};
+
+extern struct lu_context_key seq_thread_key;
+
+/* Functions used internally in module. */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+                          const struct lu_env *env);
+
+/* Store API functions. */
+int seq_store_init(struct lu_server_seq *seq,
+                  const struct lu_env *env,
+                  struct dt_device *dt);
+
+void seq_store_fini(struct lu_server_seq *seq,
+                   const struct lu_env *env);
+
+int seq_store_read(struct lu_server_seq *seq,
+                  const struct lu_env *env);
+
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+                    struct lu_seq_range *out, int sync);
+
+#ifdef LPROCFS
+extern struct lprocfs_vars seq_server_proc_list[];
+extern struct lprocfs_vars seq_client_proc_list[];
+#endif
+
+
+extern proc_dir_entry_t *seq_type_proc_dir;
+
+#endif /* __FID_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/fid/fid_lib.c b/drivers/staging/lustre/lustre/fid/fid_lib.c
new file mode 100644 (file)
index 0000000..eaff51a
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_lib.c
+ *
+ * Miscellaneous fid functions.
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <lu_object.h>
+#include <lustre_fid.h>
+
+/**
+ * A cluster-wide range from which fid-sequences are granted to servers and
+ * then clients.
+ *
+ * Fid namespace:
+ * <pre>
+ * Normal FID: seq:64 [2^33,2^64-1]      oid:32          ver:32
+ * IGIF      : 0:32, ino:32          gen:32      0:32
+ * IDIF      : 0:31, 1:1, ost-index:16,  objid:48        0:32
+ * </pre>
+ *
+ * The first 0x400 sequences of normal FID are reserved for special purposes.
+ * FID_SEQ_START + 1 is for local file id generation.
+ * FID_SEQ_START + 2 is for the .lustre directory and its objects.
+ */
+const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = {
+       FID_SEQ_NORMAL,
+       (__u64)~0ULL
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE);
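
Editor's illustration (not part of the patch): a user-space model of the normal FID layout described in the comment above (seq:64, oid:32, ver:32). The FID_SEQ_NORMAL value used below reflects the Lustre headers as best understood and should be treated as an assumption of this sketch.

    #include <stdio.h>
    #include <stdint.h>

    /* simplified stand-in for struct lu_fid */
    struct fid_sketch {
            uint64_t seq;       /* sequence granted by a server  */
            uint32_t oid;       /* object id within the sequence */
            uint32_t ver;       /* version, currently always 0   */
    };

    int main(void)
    {
            /* assumed value of FID_SEQ_NORMAL (FID_SEQ_START + 0x400) */
            const uint64_t seq_normal = 0x200000400ULL;
            struct fid_sketch fid = { seq_normal + 7, 42, 0 };

            /* sequences below seq_normal are reserved: IGIF, IDIF and
             * special objects such as the .lustre directory above */
            printf("[0x%llx:0x%x:0x%x] %s\n",
                   (unsigned long long)fid.seq, fid.oid, fid.ver,
                   fid.seq >= seq_normal ? "normal" : "reserved");
            return 0;
    }
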
+
+/* Zero range, used for init and other purposes. */
+const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = {
+       0,
+       0
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE);
+
+/* Lustre Big Fs Lock fid. */
+const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL,
+                                      .f_oid = FID_OID_SPECIAL_BFL,
+                                      .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LUSTRE_BFL_FID);
+
+/** Special fid for ".lustre" directory */
+const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+                                         .f_oid = FID_OID_DOT_LUSTRE,
+                                         .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_DOT_LUSTRE_FID);
+
+/** Special fid for "fid" special object in .lustre */
+const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+                                  .f_oid = FID_OID_DOT_LUSTRE_OBF,
+                                  .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_OBF_FID);
diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c
new file mode 100644 (file)
index 0000000..fcaaca7
--- /dev/null
@@ -0,0 +1,522 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_request.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+/* mdc RPC locks */
+#include <lustre_mdc.h>
+#include "fid_internal.h"
+
+static int seq_client_rpc(struct lu_client_seq *seq,
+                         struct lu_seq_range *output, __u32 opc,
+                         const char *opcname)
+{
+       struct obd_export     *exp = seq->lcs_exp;
+       struct ptlrpc_request *req;
+       struct lu_seq_range   *out, *in;
+       __u32            *op;
+       unsigned int       debug_mask;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY,
+                                       LUSTRE_MDS_VERSION, SEQ_QUERY);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       /* Init operation code */
+       op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC);
+       *op = opc;
+
+       /* Zero out input range, this is not recovery yet. */
+       in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE);
+       range_init(in);
+
+       ptlrpc_request_set_replen(req);
+
+       in->lsr_index = seq->lcs_space.lsr_index;
+       if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+               fld_range_set_mdt(in);
+       else
+               fld_range_set_ost(in);
+
+       if (opc == SEQ_ALLOC_SUPER) {
+               req->rq_request_portal = SEQ_CONTROLLER_PORTAL;
+               req->rq_reply_portal = MDC_REPLY_PORTAL;
+               /* While allocating a super sequence for a data object,
+                * the current thread might hold the export of MDT0 (MDT0
+                * is precreating objects on this OST) and will send the
+                * request to MDT0 here, so we cannot keep resending the
+                * request; otherwise, if MDT0 fails (is umounted), it
+                * cannot release the export of MDT0. */
+               if (seq->lcs_type == LUSTRE_SEQ_DATA)
+                       req->rq_no_delay = req->rq_no_resend = 1;
+               debug_mask = D_CONSOLE;
+       } else {
+               if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+                       req->rq_request_portal = SEQ_METADATA_PORTAL;
+               else
+                       req->rq_request_portal = SEQ_DATA_PORTAL;
+               debug_mask = D_INFO;
+       }
+
+       ptlrpc_at_set_req_timeout(req);
+
+       if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+               mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+               mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       if (rc)
+               GOTO(out_req, rc);
+
+       out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE);
+       *output = *out;
+
+       if (!range_is_sane(output)) {
+               CERROR("%s: Invalid range received from server: "
+                      DRANGE"\n", seq->lcs_name, PRANGE(output));
+               GOTO(out_req, rc = -EINVAL);
+       }
+
+       if (range_is_exhausted(output)) {
+               CERROR("%s: Range received from server is exhausted: "
+                      DRANGE"\n", seq->lcs_name, PRANGE(output));
+               GOTO(out_req, rc = -EINVAL);
+       }
+
+       CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"\n",
+                    seq->lcs_name, opcname, PRANGE(output));
+
+       EXIT;
+out_req:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+/* Request sequence-controller node to allocate new super-sequence. */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+                          const struct lu_env *env)
+{
+       int rc;
+       ENTRY;
+
+       mutex_lock(&seq->lcs_mutex);
+
+       if (seq->lcs_srv) {
+               LASSERT(env != NULL);
+               rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space,
+                                           env);
+       } else {
+               /* Check whether the connection to the seq controller has been
+                * set up (lcs_exp != NULL). */
+               if (seq->lcs_exp == NULL) {
+                       mutex_unlock(&seq->lcs_mutex);
+                       RETURN(-EINPROGRESS);
+               }
+
+               rc = seq_client_rpc(seq, &seq->lcs_space,
+                                   SEQ_ALLOC_SUPER, "super");
+       }
+       mutex_unlock(&seq->lcs_mutex);
+       RETURN(rc);
+}
+
+/* Request sequence-controller node to allocate new meta-sequence. */
+static int seq_client_alloc_meta(const struct lu_env *env,
+                                struct lu_client_seq *seq)
+{
+       int rc;
+       ENTRY;
+
+       if (seq->lcs_srv) {
+               LASSERT(env != NULL);
+               rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env);
+       } else {
+               do {
+                       /* If the meta server returns -EINPROGRESS or -EAGAIN,
+                        * it might not yet be ready to allocate a super
+                        * sequence from the sequence controller (MDT0). */
+                       rc = seq_client_rpc(seq, &seq->lcs_space,
+                                           SEQ_ALLOC_META, "meta");
+               } while (rc == -EINPROGRESS || rc == -EAGAIN);
+       }
+       RETURN(rc);
+}
+
+/* Allocate new sequence for client. */
+static int seq_client_alloc_seq(const struct lu_env *env,
+                               struct lu_client_seq *seq, seqno_t *seqnr)
+{
+       int rc;
+       ENTRY;
+
+       LASSERT(range_is_sane(&seq->lcs_space));
+
+       if (range_is_exhausted(&seq->lcs_space)) {
+               rc = seq_client_alloc_meta(env, seq);
+               if (rc) {
+                       CERROR("%s: Can't allocate new meta-sequence, "
+                              "rc %d\n", seq->lcs_name, rc);
+                       RETURN(rc);
+               } else {
+                       CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
+                              seq->lcs_name, PRANGE(&seq->lcs_space));
+               }
+       } else {
+               rc = 0;
+       }
+
+       LASSERT(!range_is_exhausted(&seq->lcs_space));
+       *seqnr = seq->lcs_space.lsr_start;
+       seq->lcs_space.lsr_start += 1;
+
+       CDEBUG(D_INFO, "%s: Allocated sequence ["LPX64"]\n", seq->lcs_name,
+              *seqnr);
+
+       RETURN(rc);
+}
+
+static int seq_fid_alloc_prep(struct lu_client_seq *seq,
+                             wait_queue_t *link)
+{
+       if (seq->lcs_update) {
+               add_wait_queue(&seq->lcs_waitq, link);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               mutex_unlock(&seq->lcs_mutex);
+
+               waitq_wait(link, TASK_UNINTERRUPTIBLE);
+
+               mutex_lock(&seq->lcs_mutex);
+               remove_wait_queue(&seq->lcs_waitq, link);
+               set_current_state(TASK_RUNNING);
+               return -EAGAIN;
+       }
+       ++seq->lcs_update;
+       mutex_unlock(&seq->lcs_mutex);
+       return 0;
+}
+
+static void seq_fid_alloc_fini(struct lu_client_seq *seq)
+{
+       LASSERT(seq->lcs_update == 1);
+       mutex_lock(&seq->lcs_mutex);
+       --seq->lcs_update;
+       wake_up(&seq->lcs_waitq);
+}
+
+/**
+ * Allocate the whole seq to the caller.
+ **/
+int seq_client_get_seq(const struct lu_env *env,
+                      struct lu_client_seq *seq, seqno_t *seqnr)
+{
+       wait_queue_t link;
+       int rc;
+
+       LASSERT(seqnr != NULL);
+       mutex_lock(&seq->lcs_mutex);
+       init_waitqueue_entry_current(&link);
+
+       while (1) {
+               rc = seq_fid_alloc_prep(seq, &link);
+               if (rc == 0)
+                       break;
+       }
+
+       rc = seq_client_alloc_seq(env, seq, seqnr);
+       if (rc) {
+               CERROR("%s: Can't allocate new sequence, "
+                      "rc %d\n", seq->lcs_name, rc);
+               seq_fid_alloc_fini(seq);
+               mutex_unlock(&seq->lcs_mutex);
+               return rc;
+       }
+
+       CDEBUG(D_INFO, "%s: allocate sequence "
+              "[0x%16.16"LPF64"x]\n", seq->lcs_name, *seqnr);
+
+       /* Since the caller requires the whole seq,
+        * mark this seq as used. */
+       if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+               seq->lcs_fid.f_oid = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+       else
+               seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+       seq->lcs_fid.f_seq = *seqnr;
+       seq->lcs_fid.f_ver = 0;
+       /*
+        * Inform caller that sequence switch is performed to allow it
+        * to setup FLD for it.
+        */
+       seq_fid_alloc_fini(seq);
+       mutex_unlock(&seq->lcs_mutex);
+
+       return rc;
+}
+EXPORT_SYMBOL(seq_client_get_seq);
+
+/* Allocate new fid on passed client @seq and save it to @fid. */
+int seq_client_alloc_fid(const struct lu_env *env,
+                        struct lu_client_seq *seq, struct lu_fid *fid)
+{
+       wait_queue_t link;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+       LASSERT(fid != NULL);
+
+       init_waitqueue_entry_current(&link);
+       mutex_lock(&seq->lcs_mutex);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST))
+               seq->lcs_fid.f_oid = seq->lcs_width;
+
+       while (1) {
+               seqno_t seqnr;
+
+               if (!fid_is_zero(&seq->lcs_fid) &&
+                   fid_oid(&seq->lcs_fid) < seq->lcs_width) {
+                       /* Just bump last allocated fid and return to caller. */
+                       seq->lcs_fid.f_oid += 1;
+                       rc = 0;
+                       break;
+               }
+
+               rc = seq_fid_alloc_prep(seq, &link);
+               if (rc)
+                       continue;
+
+               rc = seq_client_alloc_seq(env, seq, &seqnr);
+               if (rc) {
+                       CERROR("%s: Can't allocate new sequence, "
+                              "rc %d\n", seq->lcs_name, rc);
+                       seq_fid_alloc_fini(seq);
+                       mutex_unlock(&seq->lcs_mutex);
+                       RETURN(rc);
+               }
+
+               CDEBUG(D_INFO, "%s: Switch to sequence "
+                      "[0x%16.16"LPF64"x]\n", seq->lcs_name, seqnr);
+
+               seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID;
+               seq->lcs_fid.f_seq = seqnr;
+               seq->lcs_fid.f_ver = 0;
+
+               /*
+                * Inform caller that sequence switch is performed to allow it
+                * to setup FLD for it.
+                */
+               rc = 1;
+
+               seq_fid_alloc_fini(seq);
+               break;
+       }
+
+       *fid = seq->lcs_fid;
+       mutex_unlock(&seq->lcs_mutex);
+
+       CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name,  PFID(fid));
+       RETURN(rc);
+}
+EXPORT_SYMBOL(seq_client_alloc_fid);
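
Editor's illustration (not part of the patch): a stand-alone model of the allocation loop in seq_client_alloc_fid() above. Oids are bumped within the current sequence, and once the per-sequence width is used up a new sequence is requested; a return value of 1 signals the switch so the caller can set up the FLD mapping. The initial oid value of 1 mirrors LUSTRE_FID_INIT_OID as best understood.

    #include <stdio.h>
    #include <stdint.h>

    struct cli_sketch {
            uint64_t seq;       /* current sequence (lcs_fid.f_seq)   */
            uint32_t oid;       /* last allocated oid (lcs_fid.f_oid) */
            uint32_t width;     /* oids per sequence (lcs_width)      */
            uint64_t next_seq;  /* stand-in for the sequence server   */
    };

    /* returns 1 when a new sequence was started, 0 otherwise */
    static int alloc_fid_sketch(struct cli_sketch *c, uint64_t *seq,
                                uint32_t *oid)
    {
            int switched = 0;

            if (c->seq == 0 || c->oid >= c->width) {
                    c->seq = c->next_seq++;     /* models seq_client_alloc_seq */
                    c->oid = 1;                 /* models LUSTRE_FID_INIT_OID  */
                    switched = 1;
            } else {
                    c->oid++;
            }
            *seq = c->seq;
            *oid = c->oid;
            return switched;
    }

    int main(void)
    {
            struct cli_sketch c = { 0, 0, 3, 0x200000401ULL };
            uint64_t seq;
            uint32_t oid;
            int i;

            for (i = 0; i < 7; i++) {
                    int sw = alloc_fid_sketch(&c, &seq, &oid);
                    printf("[0x%llx:0x%x:0x0]%s\n", (unsigned long long)seq,
                           oid, sw ? "  <- new sequence, update FLD" : "");
            }
            return 0;
    }
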
+
+/*
+ * Finish the current sequence due to disconnect.
+ * See mdc_import_event()
+ */
+void seq_client_flush(struct lu_client_seq *seq)
+{
+       wait_queue_t link;
+
+       LASSERT(seq != NULL);
+       init_waitqueue_entry_current(&link);
+       mutex_lock(&seq->lcs_mutex);
+
+       while (seq->lcs_update) {
+               add_wait_queue(&seq->lcs_waitq, &link);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               mutex_unlock(&seq->lcs_mutex);
+
+               waitq_wait(&link, TASK_UNINTERRUPTIBLE);
+
+               mutex_lock(&seq->lcs_mutex);
+               remove_wait_queue(&seq->lcs_waitq, &link);
+               set_current_state(TASK_RUNNING);
+       }
+
+       fid_zero(&seq->lcs_fid);
+       /**
+        * This index should not be used for seq range allocation;
+        * set it to -1 for debug checks.
+        */
+
+       seq->lcs_space.lsr_index = -1;
+
+       range_init(&seq->lcs_space);
+       mutex_unlock(&seq->lcs_mutex);
+}
+EXPORT_SYMBOL(seq_client_flush);
+
+static void seq_client_proc_fini(struct lu_client_seq *seq);
+
+#ifdef LPROCFS
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+       int rc;
+       ENTRY;
+
+       seq->lcs_proc_dir = lprocfs_register(seq->lcs_name,
+                                            seq_type_proc_dir,
+                                            NULL, NULL);
+
+       if (IS_ERR(seq->lcs_proc_dir)) {
+               CERROR("%s: LProcFS failed in seq-init\n",
+                      seq->lcs_name);
+               rc = PTR_ERR(seq->lcs_proc_dir);
+               RETURN(rc);
+       }
+
+       rc = lprocfs_add_vars(seq->lcs_proc_dir,
+                             seq_client_proc_list, seq);
+       if (rc) {
+               CERROR("%s: Can't init sequence manager "
+                      "proc, rc %d\n", seq->lcs_name, rc);
+               GOTO(out_cleanup, rc);
+       }
+
+       RETURN(0);
+
+out_cleanup:
+       seq_client_proc_fini(seq);
+       return rc;
+}
+
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+       ENTRY;
+       if (seq->lcs_proc_dir) {
+               if (!IS_ERR(seq->lcs_proc_dir))
+                       lprocfs_remove(&seq->lcs_proc_dir);
+               seq->lcs_proc_dir = NULL;
+       }
+       EXIT;
+}
+#else
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+       return 0;
+}
+
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+       return;
+}
+#endif
+
+int seq_client_init(struct lu_client_seq *seq,
+                   struct obd_export *exp,
+                   enum lu_cli_type type,
+                   const char *prefix,
+                   struct lu_server_seq *srv)
+{
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+       LASSERT(prefix != NULL);
+
+       seq->lcs_srv = srv;
+       seq->lcs_type = type;
+
+       mutex_init(&seq->lcs_mutex);
+       if (type == LUSTRE_SEQ_METADATA)
+               seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+       else
+               seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+       init_waitqueue_head(&seq->lcs_waitq);
+       /* Make sure that things are clear before work is started. */
+       seq_client_flush(seq);
+
+       if (exp != NULL)
+               seq->lcs_exp = class_export_get(exp);
+       else if (type == LUSTRE_SEQ_METADATA)
+               LASSERT(seq->lcs_srv != NULL);
+
+       snprintf(seq->lcs_name, sizeof(seq->lcs_name),
+                "cli-%s", prefix);
+
+       rc = seq_client_proc_init(seq);
+       if (rc)
+               seq_client_fini(seq);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(seq_client_init);
+
+void seq_client_fini(struct lu_client_seq *seq)
+{
+       ENTRY;
+
+       seq_client_proc_fini(seq);
+
+       if (seq->lcs_exp != NULL) {
+               class_export_put(seq->lcs_exp);
+               seq->lcs_exp = NULL;
+       }
+
+       seq->lcs_srv = NULL;
+       EXIT;
+}
+EXPORT_SYMBOL(seq_client_fini);
diff --git a/drivers/staging/lustre/lustre/fid/fid_store.c b/drivers/staging/lustre/lustre/fid/fid_store.c
new file mode 100644 (file)
index 0000000..a90e6e3
--- /dev/null
@@ -0,0 +1,259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_store.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+
+static struct lu_buf *seq_store_buf(struct seq_thread_info *info)
+{
+       struct lu_buf *buf;
+
+       buf = &info->sti_buf;
+       buf->lb_buf = &info->sti_space;
+       buf->lb_len = sizeof(info->sti_space);
+       return buf;
+}
+
+struct seq_update_callback {
+       struct dt_txn_commit_cb suc_cb;
+       struct lu_server_seq   *suc_seq;
+};
+
+void seq_update_cb(struct lu_env *env, struct thandle *th,
+                  struct dt_txn_commit_cb *cb, int err)
+{
+       struct seq_update_callback *ccb;
+
+       ccb = container_of0(cb, struct seq_update_callback, suc_cb);
+
+       LASSERT(ccb->suc_seq != NULL);
+
+       ccb->suc_seq->lss_need_sync = 0;
+       OBD_FREE_PTR(ccb);
+}
+
+int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq)
+{
+       struct seq_update_callback *ccb;
+       struct dt_txn_commit_cb    *dcb;
+       int                        rc;
+
+       OBD_ALLOC_PTR(ccb);
+       if (ccb == NULL)
+               return -ENOMEM;
+
+       ccb->suc_seq       = seq;
+       seq->lss_need_sync = 1;
+
+       dcb            = &ccb->suc_cb;
+       dcb->dcb_func  = seq_update_cb;
+       INIT_LIST_HEAD(&dcb->dcb_linkage);
+       strncpy(dcb->dcb_name, "seq_update_cb", MAX_COMMIT_CB_STR_LEN);
+       dcb->dcb_name[MAX_COMMIT_CB_STR_LEN - 1] = '\0';
+
+       rc = dt_trans_cb_add(th, dcb);
+       if (rc)
+               OBD_FREE_PTR(ccb);
+       return rc;
+}
+
+/* This function implies that caller takes care about locking. */
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+                    struct lu_seq_range *out, int sync)
+{
+       struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev);
+       struct seq_thread_info *info;
+       struct thandle *th;
+       loff_t pos = 0;
+       int rc;
+
+       info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+       LASSERT(info != NULL);
+
+       th = dt_trans_create(env, dt_dev);
+       if (IS_ERR(th))
+               RETURN(PTR_ERR(th));
+
+       rc = dt_declare_record_write(env, seq->lss_obj,
+                                    sizeof(struct lu_seq_range), 0, th);
+       if (rc)
+               GOTO(exit, rc);
+
+       if (out != NULL) {
+               rc = fld_declare_server_create(env,
+                                              seq->lss_site->ss_server_fld,
+                                              out, th);
+               if (rc)
+                       GOTO(exit, rc);
+       }
+
+       rc = dt_trans_start_local(env, dt_dev, th);
+       if (rc)
+               GOTO(exit, rc);
+
+       /* Store ranges in le format. */
+       range_cpu_to_le(&info->sti_space, &seq->lss_space);
+
+       rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th);
+       if (rc) {
+               CERROR("%s: Can't write space data, rc %d\n",
+                      seq->lss_name, rc);
+               GOTO(exit, rc);
+       } else if (out != NULL) {
+               rc = fld_server_create(env, seq->lss_site->ss_server_fld, out,
+                                      th);
+               if (rc) {
+                       CERROR("%s: Can't update fld database, rc %d\n",
+                               seq->lss_name, rc);
+                       GOTO(exit, rc);
+               }
+       }
+       /* The next sequence update will need to be sync until this update is
+        * committed; in the case of a sync operation this is obviously not
+        * needed. */
+       if (!sync)
+               /* if callback can't be added then sync always */
+               sync = !!seq_update_cb_add(th, seq);
+
+       th->th_sync |= sync;
+exit:
+       dt_trans_stop(env, dt_dev, th);
+       return rc;
+}
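
Editor's illustration (not part of the patch): a minimal model of the lss_need_sync handshake between seq_store_update() and seq_update_cb() above — writes stay synchronous until the previous asynchronous update is known to have committed.

    #include <stdio.h>

    static int need_sync;               /* models seq->lss_need_sync        */

    static void commit_cb(void)         /* models seq_update_cb()           */
    {
            need_sync = 0;
    }

    static void store_update(int commit_now)
    {
            int sync = need_sync;       /* caller passes lss_need_sync in   */

            need_sync = 1;              /* models seq_update_cb_add()       */
            printf("write seq state (%s)\n", sync ? "sync" : "async");
            if (commit_now)
                    commit_cb();        /* transaction commit fires the cb  */
    }

    int main(void)
    {
            store_update(0);            /* async; commit still pending      */
            store_update(0);            /* sync: previous not yet committed */
            store_update(1);            /* sync; then the commit cb fires   */
            store_update(0);            /* async again                      */
            return 0;
    }
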
+
+/*
+ * This function implies that caller takes care about locking or locking is not
+ * needed (init time).
+ */
+int seq_store_read(struct lu_server_seq *seq,
+                  const struct lu_env *env)
+{
+       struct seq_thread_info *info;
+       loff_t pos = 0;
+       int rc;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+       LASSERT(info != NULL);
+
+       rc = seq->lss_obj->do_body_ops->dbo_read(env, seq->lss_obj,
+                                                seq_store_buf(info),
+                                                &pos, BYPASS_CAPA);
+
+       if (rc == sizeof(info->sti_space)) {
+               range_le_to_cpu(&seq->lss_space, &info->sti_space);
+               CDEBUG(D_INFO, "%s: Space - "DRANGE"\n",
+                      seq->lss_name, PRANGE(&seq->lss_space));
+               rc = 0;
+       } else if (rc == 0) {
+               rc = -ENODATA;
+       } else if (rc > 0) {
+               CERROR("%s: Read only %d bytes of %d\n", seq->lss_name,
+                      rc, (int)sizeof(info->sti_space));
+               rc = -EIO;
+       }
+
+       RETURN(rc);
+}
+
+int seq_store_init(struct lu_server_seq *seq,
+                  const struct lu_env *env,
+                  struct dt_device *dt)
+{
+       struct dt_object *dt_obj;
+       struct lu_fid fid;
+       struct lu_attr attr;
+       struct dt_object_format dof;
+       const char *name;
+       int rc;
+       ENTRY;
+
+       name = seq->lss_type == LUSTRE_SEQ_SERVER ?
+               LUSTRE_SEQ_SRV_NAME : LUSTRE_SEQ_CTL_NAME;
+
+       if (seq->lss_type == LUSTRE_SEQ_SERVER)
+               lu_local_obj_fid(&fid, FID_SEQ_SRV_OID);
+       else
+               lu_local_obj_fid(&fid, FID_SEQ_CTL_OID);
+
+       memset(&attr, 0, sizeof(attr));
+       attr.la_valid = LA_MODE;
+       attr.la_mode = S_IFREG | 0666;
+       dof.dof_type = DFT_REGULAR;
+
+       dt_obj = dt_find_or_create(env, dt, &fid, &dof, &attr);
+       if (!IS_ERR(dt_obj)) {
+               seq->lss_obj = dt_obj;
+               rc = 0;
+       } else {
+               CERROR("%s: Can't find \"%s\" obj %d\n",
+                      seq->lss_name, name, (int)PTR_ERR(dt_obj));
+               rc = PTR_ERR(dt_obj);
+       }
+
+       RETURN(rc);
+}
+
+void seq_store_fini(struct lu_server_seq *seq,
+                   const struct lu_env *env)
+{
+       ENTRY;
+
+       if (seq->lss_obj != NULL) {
+               if (!IS_ERR(seq->lss_obj))
+                       lu_object_put(env, &seq->lss_obj->do_lu);
+               seq->lss_obj = NULL;
+       }
+
+       EXIT;
+}
diff --git a/drivers/staging/lustre/lustre/fid/lproc_fid.c b/drivers/staging/lustre/lustre/fid/lproc_fid.c
new file mode 100644 (file)
index 0000000..49ea357
--- /dev/null
@@ -0,0 +1,360 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/lproc_fid.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+#ifdef LPROCFS
+/*
+ * Note: this function is only used for testing; it is not safe for
+ * production use.
+ */
+static int
+seq_proc_write_common(struct file *file, const char *buffer,
+                     unsigned long count, void *data,
+                     struct lu_seq_range *range)
+{
+       struct lu_seq_range tmp;
+       int rc;
+       ENTRY;
+
+       LASSERT(range != NULL);
+
+       rc = sscanf(buffer, "[%llx - %llx]\n",
+                   (long long unsigned *)&tmp.lsr_start,
+                   (long long unsigned *)&tmp.lsr_end);
+       if (rc != 2 || !range_is_sane(&tmp) || range_is_zero(&tmp))
+               RETURN(-EINVAL);
+       *range = tmp;
+       RETURN(0);
+}
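
Editor's illustration (not part of the patch): the text format that seq_proc_write_common() above accepts, parsed the same way with sscanf() in user space. The example range values are arbitrary.

    #include <stdio.h>

    int main(void)
    {
            /* example input, e.g. written into the proc "space" file */
            const char *buffer = "[0x200000400 - 0x200000bff]\n";
            unsigned long long start, end;

            if (sscanf(buffer, "[%llx - %llx]", &start, &end) == 2 &&
                end > start)
                    printf("new range: [0x%llx - 0x%llx]\n", start, end);
            else
                    printf("rejected\n");
            return 0;
    }
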
+
+static int
+seq_proc_read_common(char *page, char **start, off_t off,
+                    int count, int *eof, void *data,
+                    struct lu_seq_range *range)
+{
+       int rc;
+       ENTRY;
+
+       *eof = 1;
+       rc = snprintf(page, count, "["LPX64" - "LPX64"]:%x:%s\n",
+                       PRANGE(range));
+       RETURN(rc);
+}
+
+/*
+ * Server side procfs stuff.
+ */
+static int
+seq_server_proc_write_space(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct lu_server_seq *seq = (struct lu_server_seq *)data;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lss_mutex);
+       rc = seq_proc_write_common(file, buffer, count,
+                                  data, &seq->lss_space);
+       if (rc == 0) {
+               CDEBUG(D_INFO, "%s: Space: "DRANGE"\n",
+                      seq->lss_name, PRANGE(&seq->lss_space));
+       }
+
+       mutex_unlock(&seq->lss_mutex);
+
+       RETURN(count);
+}
+
+static int
+seq_server_proc_read_space(char *page, char **start, off_t off,
+                          int count, int *eof, void *data)
+{
+       struct lu_server_seq *seq = (struct lu_server_seq *)data;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lss_mutex);
+       rc = seq_proc_read_common(page, start, off, count, eof,
+                                 data, &seq->lss_space);
+       mutex_unlock(&seq->lss_mutex);
+
+       RETURN(rc);
+}
+
+static int
+seq_server_proc_read_server(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct lu_server_seq *seq = (struct lu_server_seq *)data;
+       struct client_obd *cli;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       *eof = 1;
+       if (seq->lss_cli) {
+               if (seq->lss_cli->lcs_exp != NULL) {
+                       cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli;
+                       rc = snprintf(page, count, "%s\n",
+                                     cli->cl_target_uuid.uuid);
+               } else {
+                       rc = snprintf(page, count, "%s\n",
+                                     seq->lss_cli->lcs_srv->lss_name);
+               }
+       } else {
+               rc = snprintf(page, count, "<none>\n");
+       }
+
+       RETURN(rc);
+}
+
+static int
+seq_server_proc_write_width(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct lu_server_seq *seq = (struct lu_server_seq *)data;
+       int rc, val;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lss_mutex);
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc != 0) {
+               CERROR("%s: invalid width.\n", seq->lss_name);
+               GOTO(out_unlock, rc);
+       }
+
+       seq->lss_width = val;
+
+       CDEBUG(D_INFO, "%s: Width: "LPU64"\n",
+              seq->lss_name, seq->lss_width);
+out_unlock:
+       mutex_unlock(&seq->lss_mutex);
+
+       RETURN(count);
+}
+
+static int
+seq_server_proc_read_width(char *page, char **start, off_t off,
+                          int count, int *eof, void *data)
+{
+       struct lu_server_seq *seq = (struct lu_server_seq *)data;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lss_mutex);
+       rc = snprintf(page, count, LPU64"\n", seq->lss_width);
+       mutex_unlock(&seq->lss_mutex);
+
+       RETURN(rc);
+}
+
+/* Client side procfs stuff */
+static int
+seq_client_proc_write_space(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)data;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+       rc = seq_proc_write_common(file, buffer, count,
+                                  data, &seq->lcs_space);
+
+       if (rc == 0) {
+               CDEBUG(D_INFO, "%s: Space: "DRANGE"\n",
+                      seq->lcs_name, PRANGE(&seq->lcs_space));
+       }
+
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(count);
+}
+
+static int
+seq_client_proc_read_space(char *page, char **start, off_t off,
+                          int count, int *eof, void *data)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)data;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+       rc = seq_proc_read_common(page, start, off, count, eof,
+                                 data, &seq->lcs_space);
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(rc);
+}
+
+static int
+seq_client_proc_write_width(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)data;
+       __u64  max;
+       int rc, val;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc) {
+               mutex_unlock(&seq->lcs_mutex);
+               RETURN(rc);
+       }
+
+       if (seq->lcs_type == LUSTRE_SEQ_DATA)
+               max = LUSTRE_DATA_SEQ_MAX_WIDTH;
+       else
+               max = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+
+       if (val <= max && val > 0) {
+               seq->lcs_width = val;
+
+               if (rc == 0) {
+                       CDEBUG(D_INFO, "%s: Sequence size: "LPU64"\n",
+                              seq->lcs_name, seq->lcs_width);
+               }
+       }
+
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(count);
+}
+
+static int
+seq_client_proc_read_width(char *page, char **start, off_t off,
+                          int count, int *eof, void *data)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)data;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+       rc = snprintf(page, count, LPU64"\n", seq->lcs_width);
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(rc);
+}
+
+static int
+seq_client_proc_read_fid(char *page, char **start, off_t off,
+                        int count, int *eof, void *data)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)data;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+       rc = snprintf(page, count, DFID"\n", PFID(&seq->lcs_fid));
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(rc);
+}
+
+static int
+seq_client_proc_read_server(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)data;
+       struct client_obd *cli;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       if (seq->lcs_exp != NULL) {
+               cli = &seq->lcs_exp->exp_obd->u.cli;
+               rc = snprintf(page, count, "%s\n", cli->cl_target_uuid.uuid);
+       } else {
+               rc = snprintf(page, count, "%s\n", seq->lcs_srv->lss_name);
+       }
+       RETURN(rc);
+}
+
+struct lprocfs_vars seq_server_proc_list[] = {
+       { "space",    seq_server_proc_read_space, seq_server_proc_write_space, NULL },
+       { "width",    seq_server_proc_read_width, seq_server_proc_write_width, NULL },
+       { "server",   seq_server_proc_read_server, NULL, NULL },
+       { NULL }};
+
+struct lprocfs_vars seq_client_proc_list[] = {
+       { "space",    seq_client_proc_read_space, seq_client_proc_write_space, NULL },
+       { "width",    seq_client_proc_read_width, seq_client_proc_write_width, NULL },
+       { "server",   seq_client_proc_read_server, NULL, NULL },
+       { "fid",      seq_client_proc_read_fid, NULL, NULL },
+       { NULL }};
+#endif
diff --git a/drivers/staging/lustre/lustre/fld/Makefile b/drivers/staging/lustre/lustre/fld/Makefile
new file mode 100644 (file)
index 0000000..e7f2881
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += fld.o
+fld-y := fld_handler.o fld_request.o fld_cache.o fld_index.o lproc_fld.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/fld/fld_cache.c b/drivers/staging/lustre/lustre/fld/fld_cache.c
new file mode 100644 (file)
index 0000000..347f2ae
--- /dev/null
@@ -0,0 +1,566 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_cache.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include "fld_internal.h"
+
+/**
+ * create fld cache.
+ */
+struct fld_cache *fld_cache_init(const char *name,
+                                int cache_size, int cache_threshold)
+{
+       struct fld_cache *cache;
+       ENTRY;
+
+       LASSERT(name != NULL);
+       LASSERT(cache_threshold < cache_size);
+
+       OBD_ALLOC_PTR(cache);
+       if (cache == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       INIT_LIST_HEAD(&cache->fci_entries_head);
+       INIT_LIST_HEAD(&cache->fci_lru);
+
+       cache->fci_cache_count = 0;
+       rwlock_init(&cache->fci_lock);
+
+       strlcpy(cache->fci_name, name,
+               sizeof(cache->fci_name));
+
+       cache->fci_cache_size = cache_size;
+       cache->fci_threshold = cache_threshold;
+
+       /* Init fld cache info. */
+       memset(&cache->fci_stat, 0, sizeof(cache->fci_stat));
+
+       CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n",
+              cache->fci_name, cache_size, cache_threshold);
+
+       RETURN(cache);
+}
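+
+/*
+ * Illustrative cache lifecycle (hypothetical caller; the name and sizes are
+ * examples only and are not used anywhere in this file):
+ *
+ *     cache = fld_cache_init("srv-example", 1024, 102);
+ *     if (IS_ERR(cache))
+ *             return PTR_ERR(cache);
+ *     fld_cache_insert(cache, range);         insert a known range
+ *     fld_cache_lookup(cache, seq, &found);   find the range holding seq
+ *     fld_cache_fini(cache);                  flush, dump stats, free
+ */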
+
+/**
+ * destroy fld cache.
+ */
+void fld_cache_fini(struct fld_cache *cache)
+{
+       __u64 pct;
+       ENTRY;
+
+       LASSERT(cache != NULL);
+       fld_cache_flush(cache);
+
+       if (cache->fci_stat.fst_count > 0) {
+               pct = cache->fci_stat.fst_cache * 100;
+               do_div(pct, cache->fci_stat.fst_count);
+       } else {
+               pct = 0;
+       }
+
+       CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name);
+       CDEBUG(D_INFO, "  Total reqs: "LPU64"\n", cache->fci_stat.fst_count);
+       CDEBUG(D_INFO, "  Cache reqs: "LPU64"\n", cache->fci_stat.fst_cache);
+       CDEBUG(D_INFO, "  Cache hits: "LPU64"%%\n", pct);
+
+       OBD_FREE_PTR(cache);
+
+       EXIT;
+}
+
+/**
+ * delete given node from list.
+ */
+void fld_cache_entry_delete(struct fld_cache *cache,
+                           struct fld_cache_entry *node)
+{
+       list_del(&node->fce_list);
+       list_del(&node->fce_lru);
+       cache->fci_cache_count--;
+       OBD_FREE_PTR(node);
+}
+
+/**
+ * Fix the list by checking each new entry against the NEXT entry in order.
+ */
+static void fld_fix_new_list(struct fld_cache *cache)
+{
+       struct fld_cache_entry *f_curr;
+       struct fld_cache_entry *f_next;
+       struct lu_seq_range *c_range;
+       struct lu_seq_range *n_range;
+       struct list_head *head = &cache->fci_entries_head;
+       ENTRY;
+
+restart_fixup:
+
+       list_for_each_entry_safe(f_curr, f_next, head, fce_list) {
+               c_range = &f_curr->fce_range;
+               n_range = &f_next->fce_range;
+
+               LASSERT(range_is_sane(c_range));
+               if (&f_next->fce_list == head)
+                       break;
+
+               if (c_range->lsr_flags != n_range->lsr_flags)
+                       continue;
+
+               LASSERTF(c_range->lsr_start <= n_range->lsr_start,
+                        "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n",
+                        PRANGE(c_range), PRANGE(n_range));
+
+               /* check merge possibility with next range */
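+               /* e.g. [0x100, 0x200) followed by [0x200, 0x300) on the same
+                * index collapses into a single [0x100, 0x300) entry */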
+               if (c_range->lsr_end == n_range->lsr_start) {
+                       if (c_range->lsr_index != n_range->lsr_index)
+                               continue;
+                       n_range->lsr_start = c_range->lsr_start;
+                       fld_cache_entry_delete(cache, f_curr);
+                       continue;
+               }
+
+               /* check if current range overlaps with next range. */
+               if (n_range->lsr_start < c_range->lsr_end) {
+                       if (c_range->lsr_index == n_range->lsr_index) {
+                               n_range->lsr_start = c_range->lsr_start;
+                               n_range->lsr_end = max(c_range->lsr_end,
+                                                      n_range->lsr_end);
+                               fld_cache_entry_delete(cache, f_curr);
+                       } else {
+                               if (n_range->lsr_end <= c_range->lsr_end) {
+                                       *n_range = *c_range;
+                                       fld_cache_entry_delete(cache, f_curr);
+                               } else
+                                       n_range->lsr_start = c_range->lsr_end;
+                       }
+
+                       /* the merged range could overlap the next
+                        * range as well, so restart the scan. */
+                       goto restart_fixup;
+               }
+
+               /* kill duplicates */
+               if (c_range->lsr_start == n_range->lsr_start &&
+                   c_range->lsr_end == n_range->lsr_end)
+                       fld_cache_entry_delete(cache, f_curr);
+       }
+
+       EXIT;
+}
+
+/**
+ * add node to fld cache
+ */
+static inline void fld_cache_entry_add(struct fld_cache *cache,
+                                      struct fld_cache_entry *f_new,
+                                      struct list_head *pos)
+{
+       list_add(&f_new->fce_list, pos);
+       list_add(&f_new->fce_lru, &cache->fci_lru);
+
+       cache->fci_cache_count++;
+       fld_fix_new_list(cache);
+}
+
+/**
+ * Check if the cache needs to be shrunk. If so, do it:
+ * remove entries from the LRU list one at a time until the cache is small enough.
+ */
+static int fld_cache_shrink(struct fld_cache *cache)
+{
+       struct fld_cache_entry *flde;
+       struct list_head *curr;
+       int num = 0;
+       ENTRY;
+
+       LASSERT(cache != NULL);
+
+       if (cache->fci_cache_count < cache->fci_cache_size)
+               RETURN(0);
+
+       curr = cache->fci_lru.prev;
+
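+       /* evict from the LRU tail until fci_cache_count + fci_threshold
+        * fits within fci_cache_size */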
+       while (cache->fci_cache_count + cache->fci_threshold >
+              cache->fci_cache_size && curr != &cache->fci_lru) {
+
+               flde = list_entry(curr, struct fld_cache_entry, fce_lru);
+               curr = curr->prev;
+               fld_cache_entry_delete(cache, flde);
+               num++;
+       }
+
+       CDEBUG(D_INFO, "%s: FLD cache - Shrunk by "
+              "%d entries\n", cache->fci_name, num);
+
+       RETURN(0);
+}
+
+/**
+ * kill all fld cache entries.
+ */
+void fld_cache_flush(struct fld_cache *cache)
+{
+       ENTRY;
+
+       write_lock(&cache->fci_lock);
+       cache->fci_cache_size = 0;
+       fld_cache_shrink(cache);
+       write_unlock(&cache->fci_lock);
+
+       EXIT;
+}
+
+/**
+ * Punch a hole in an existing range: split that range and add the new
+ * entry in between.
+ */
+
+void fld_cache_punch_hole(struct fld_cache *cache,
+                         struct fld_cache_entry *f_curr,
+                         struct fld_cache_entry *f_new)
+{
+       const struct lu_seq_range *range = &f_new->fce_range;
+       const seqno_t new_start  = range->lsr_start;
+       const seqno_t new_end  = range->lsr_end;
+       struct fld_cache_entry *fldt;
+
+       ENTRY;
+       OBD_ALLOC_GFP(fldt, sizeof *fldt, GFP_ATOMIC);
+       if (!fldt) {
+               OBD_FREE_PTR(f_new);
+               EXIT;
+               /* overlap is not allowed, so don't mess up the list. */
+               return;
+       }
+       /*  break f_curr RANGE into three RANGES:
+        *      f_curr, f_new, fldt
+        */
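+       /*  e.g. punching f_new = [0x150, 0x180) out of f_curr = [0x100, 0x200)
+        *  leaves f_curr = [0x100, 0x150) and fldt = [0x180, 0x200)
+        */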
+
+       /* f_new = *range */
+
+       /* fldt */
+       fldt->fce_range.lsr_start = new_end;
+       fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end;
+       fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index;
+
+       /* f_curr */
+       f_curr->fce_range.lsr_end = new_start;
+
+       /* add these two entries to list */
+       fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+       fld_cache_entry_add(cache, fldt, &f_new->fce_list);
+
+       /* no need to fixup */
+       EXIT;
+}
+
+/**
+ * handle range overlap in fld cache.
+ */
+static void fld_cache_overlap_handle(struct fld_cache *cache,
+                               struct fld_cache_entry *f_curr,
+                               struct fld_cache_entry *f_new)
+{
+       const struct lu_seq_range *range = &f_new->fce_range;
+       const seqno_t new_start  = range->lsr_start;
+       const seqno_t new_end  = range->lsr_end;
+       const mdsno_t mdt = range->lsr_index;
+
+       /* This is the overlap case. These checks only handle overlap with the
+        * previous range; fld_fix_new_list() handles overlap with the next range. */
+
+       if (f_curr->fce_range.lsr_index == mdt) {
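+               /* both ranges point at the same index: merge them into one span */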
+               f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start,
+                                                 new_start);
+
+               f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end,
+                                               new_end);
+
+               OBD_FREE_PTR(f_new);
+               fld_fix_new_list(cache);
+
+       } else if (new_start <= f_curr->fce_range.lsr_start &&
+                       f_curr->fce_range.lsr_end <= new_end) {
+               /* case 1: the new range completely overshadows the existing
+                *       range, e.g. the whole range migrated. Update the entry. */
+
+               f_curr->fce_range = *range;
+               OBD_FREE_PTR(f_new);
+               fld_fix_new_list(cache);
+
+       } else if (f_curr->fce_range.lsr_start < new_start &&
+                       new_end < f_curr->fce_range.lsr_end) {
+               /* case 2: the new range fits within the existing range. */
+
+               fld_cache_punch_hole(cache, f_curr, f_new);
+
+       } else  if (new_end <= f_curr->fce_range.lsr_end) {
+               /* case 3: overlap:
+                *       [new_start [c_start  new_end)  c_end)
+                */
+
+               LASSERT(new_start <= f_curr->fce_range.lsr_start);
+
+               f_curr->fce_range.lsr_start = new_end;
+               fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev);
+
+       } else if (f_curr->fce_range.lsr_start <= new_start) {
+               /* case 4: overlap:
+                *       [c_start [new_start c_end) new_end)
+                */
+
+               LASSERT(f_curr->fce_range.lsr_end <= new_end);
+
+               f_curr->fce_range.lsr_end = new_start;
+               fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+       } else
+               CERROR("NEW range = "DRANGE" curr = "DRANGE"\n",
+                      PRANGE(range), PRANGE(&f_curr->fce_range));
+}
+
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range)
+{
+       struct fld_cache_entry *f_new;
+
+       LASSERT(range_is_sane(range));
+
+       OBD_ALLOC_PTR(f_new);
+       if (!f_new)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       f_new->fce_range = *range;
+       RETURN(f_new);
+}
+
+/**
+ * Insert FLD entry in FLD cache.
+ *
+ * This function handles all cases of merging and breaking up of
+ * ranges.
+ */
+int fld_cache_insert_nolock(struct fld_cache *cache,
+                           struct fld_cache_entry *f_new)
+{
+       struct fld_cache_entry *f_curr;
+       struct fld_cache_entry *n;
+       struct list_head *head;
+       struct list_head *prev = NULL;
+       const seqno_t new_start  = f_new->fce_range.lsr_start;
+       const seqno_t new_end  = f_new->fce_range.lsr_end;
+       __u32 new_flags  = f_new->fce_range.lsr_flags;
+       ENTRY;
+
+       /*
+        * Duplicate entries are eliminated during insertion,
+        * so we don't need to search for the new entry before
+        * starting the insertion loop.
+        */
+
+       if (!cache->fci_no_shrink)
+               fld_cache_shrink(cache);
+
+       head = &cache->fci_entries_head;
+
+       list_for_each_entry_safe(f_curr, n, head, fce_list) {
+               /* the new range ends before this entry: insert in front of it */
+               if (new_end < f_curr->fce_range.lsr_start ||
+                  (new_end == f_curr->fce_range.lsr_start &&
+                   new_flags != f_curr->fce_range.lsr_flags))
+                       break;
+
+               prev = &f_curr->fce_list;
+               /* this entry overlaps the new range: handle the overlap */
+               if (new_start < f_curr->fce_range.lsr_end &&
+                   new_flags == f_curr->fce_range.lsr_flags) {
+                       fld_cache_overlap_handle(cache, f_curr, f_new);
+                       goto out;
+               }
+       }
+
+       if (prev == NULL)
+               prev = head;
+
+       CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range));
+       /* Add new entry to cache and lru list. */
+       fld_cache_entry_add(cache, f_new, prev);
+out:
+       RETURN(0);
+}
+
+int fld_cache_insert(struct fld_cache *cache,
+                    const struct lu_seq_range *range)
+{
+       struct fld_cache_entry  *flde;
+       int rc;
+
+       flde = fld_cache_entry_create(range);
+       if (IS_ERR(flde))
+               RETURN(PTR_ERR(flde));
+
+       write_lock(&cache->fci_lock);
+       rc = fld_cache_insert_nolock(cache, flde);
+       write_unlock(&cache->fci_lock);
+       if (rc)
+               OBD_FREE_PTR(flde);
+
+       RETURN(rc);
+}
+
+void fld_cache_delete_nolock(struct fld_cache *cache,
+                     const struct lu_seq_range *range)
+{
+       struct fld_cache_entry *flde;
+       struct fld_cache_entry *tmp;
+       struct list_head *head;
+
+       head = &cache->fci_entries_head;
+       list_for_each_entry_safe(flde, tmp, head, fce_list) {
+               /* delete the entry that matches the given range */
+               if (range->lsr_start == flde->fce_range.lsr_start ||
+                  (range->lsr_end == flde->fce_range.lsr_end &&
+                   range->lsr_flags == flde->fce_range.lsr_flags)) {
+                       fld_cache_entry_delete(cache, flde);
+                       break;
+               }
+       }
+}
+
+/**
+ * Delete FLD entry in FLD cache.
+ *
+ */
+void fld_cache_delete(struct fld_cache *cache,
+                     const struct lu_seq_range *range)
+{
+       write_lock(&cache->fci_lock);
+       fld_cache_delete_nolock(cache, range);
+       write_unlock(&cache->fci_lock);
+}
+
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+                             struct lu_seq_range *range)
+{
+       struct fld_cache_entry *flde;
+       struct fld_cache_entry *got = NULL;
+       struct list_head *head;
+
+       head = &cache->fci_entries_head;
+       list_for_each_entry(flde, head, fce_list) {
+               if (range->lsr_start == flde->fce_range.lsr_start ||
+                  (range->lsr_end == flde->fce_range.lsr_end &&
+                   range->lsr_flags == flde->fce_range.lsr_flags)) {
+                       got = flde;
+                       break;
+               }
+       }
+
+       RETURN(got);
+}
+
+/**
+ * Look up the cache entry matching \a range in the fld cache.
+ */
+struct fld_cache_entry
+*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range)
+{
+       struct fld_cache_entry *got = NULL;
+       ENTRY;
+
+       read_lock(&cache->fci_lock);
+       got = fld_cache_entry_lookup_nolock(cache, range);
+       read_unlock(&cache->fci_lock);
+       RETURN(got);
+}
+
+/**
+ * Look up the range containing \a seq in the fld cache.
+ */
+int fld_cache_lookup(struct fld_cache *cache,
+                    const seqno_t seq, struct lu_seq_range *range)
+{
+       struct fld_cache_entry *flde;
+       struct fld_cache_entry *prev = NULL;
+       struct list_head *head;
+       ENTRY;
+
+       read_lock(&cache->fci_lock);
+       head = &cache->fci_entries_head;
+
+       cache->fci_stat.fst_count++;
+       list_for_each_entry(flde, head, fce_list) {
+               if (flde->fce_range.lsr_start > seq) {
+                       if (prev != NULL)
+                               *range = prev->fce_range;
+                       break;
+               }
+
+               prev = flde;
+               if (range_within(&flde->fce_range, seq)) {
+                       *range = flde->fce_range;
+
+                       cache->fci_stat.fst_cache++;
+                       read_unlock(&cache->fci_lock);
+                       RETURN(0);
+               }
+       }
+       read_unlock(&cache->fci_lock);
+       RETURN(-ENOENT);
+}
diff --git a/drivers/staging/lustre/lustre/fld/fld_handler.c b/drivers/staging/lustre/lustre/fld/fld_handler.c
new file mode 100644 (file)
index 0000000..d2707ae
--- /dev/null
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_handler.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: WangDi <wangdi@clusterfs.com>
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <md_object.h>
+#include <lustre_fid.h>
+#include <lustre_req_layout.h>
+#include "fld_internal.h"
+
+/* context key constructor/destructor: fld_key_init, fld_key_fini */
+LU_KEY_INIT_FINI(fld, struct fld_thread_info);
+
+/* context key: fld_thread_key */
+LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD);
+
+proc_dir_entry_t *fld_type_proc_dir = NULL;
+
+static int __init fld_mod_init(void)
+{
+       fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME,
+                                            proc_lustre_root,
+                                            NULL, NULL);
+       if (IS_ERR(fld_type_proc_dir))
+               return PTR_ERR(fld_type_proc_dir);
+
+       LU_CONTEXT_KEY_INIT(&fld_thread_key);
+       lu_context_key_register(&fld_thread_key);
+       return 0;
+}
+
+static void __exit fld_mod_exit(void)
+{
+       lu_context_key_degister(&fld_thread_key);
+       if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) {
+               lprocfs_remove(&fld_type_proc_dir);
+               fld_type_proc_dir = NULL;
+       }
+}
+
+int fld_declare_server_create(const struct lu_env *env,
+                             struct lu_server_fld *fld,
+                             struct lu_seq_range *range,
+                             struct thandle *th)
+{
+       int rc;
+
+       rc = fld_declare_index_create(env, fld, range, th);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_declare_server_create);
+
+/**
+ * Insert FLD index entry and update FLD cache.
+ *
+ * This function is called from the sequence allocator when a super-sequence
+ * is granted to a server.
+ */
+int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld,
+                     struct lu_seq_range *range, struct thandle *th)
+{
+       int rc;
+
+       mutex_lock(&fld->lsf_lock);
+       rc = fld_index_create(env, fld, range, th);
+       mutex_unlock(&fld->lsf_lock);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_server_create);
+
+/**
+ *  Look up the MDT by \a seq and return the range for the given seq.
+ *
+ *  If the entry is not in the fld cache, a request is sent to the super
+ *  sequence controller node (MDT0). All other MDTs [1...N] and clients
+ *  cache fld entries, but this cache is not persistent.
+ */
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+                     seqno_t seq, struct lu_seq_range *range)
+{
+       struct lu_seq_range *erange;
+       struct fld_thread_info *info;
+       int rc;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       LASSERT(info != NULL);
+       erange = &info->fti_lrange;
+
+       /* Lookup it in the cache. */
+       rc = fld_cache_lookup(fld->lsf_cache, seq, erange);
+       if (rc == 0) {
+               if (unlikely(fld_range_type(erange) != fld_range_type(range) &&
+                            !fld_range_is_any(range))) {
+                       CERROR("%s: FLD cache range "DRANGE" does not match "
+                              "requested flag %x: rc = %d\n", fld->lsf_name,
+                              PRANGE(erange), range->lsr_flags, -EIO);
+                       RETURN(-EIO);
+               }
+               *range = *erange;
+               RETURN(0);
+       }
+
+       if (fld->lsf_obj) {
+               /* On the server side, all entries should be in the cache.
+                * If we cannot find it in the cache, just return an error. */
+               CERROR("%s: Cannot find sequence "LPX64": rc = %d\n",
+                       fld->lsf_name, seq, -EIO);
+               RETURN(-EIO);
+       } else {
+               LASSERT(fld->lsf_control_exp);
+               /* Send the request to MDT0, i.e. the super sequence controller.
+                * This is a temporary solution; the long-term solution is fld
+                * replication on all MDT servers.
+                */
+               range->lsr_start = seq;
+               rc = fld_client_rpc(fld->lsf_control_exp,
+                                   range, FLD_LOOKUP);
+               if (rc == 0)
+                       fld_cache_insert(fld->lsf_cache, range);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_server_lookup);
+
+/**
+ * All MDT servers handle the fld lookup operation, but only MDT0 has the fld
+ * index. If an entry is not found in the cache, the lookup is forwarded to MDT0.
+ */
+
+static int fld_server_handle(struct lu_server_fld *fld,
+                            const struct lu_env *env,
+                            __u32 opc, struct lu_seq_range *range,
+                            struct fld_thread_info *info)
+{
+       int rc;
+       ENTRY;
+
+       switch (opc) {
+       case FLD_LOOKUP:
+               rc = fld_server_lookup(env, fld, range->lsr_start, range);
+               break;
+       default:
+               rc = -EINVAL;
+               break;
+       }
+
+       CDEBUG(D_INFO, "%s: FLD req handle: error %d (opc: %d, range: "
+              DRANGE")\n", fld->lsf_name, rc, opc, PRANGE(range));
+
+       RETURN(rc);
+
+}
+
+static int fld_req_handle(struct ptlrpc_request *req,
+                         struct fld_thread_info *info)
+{
+       struct obd_export *exp = req->rq_export;
+       struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site;
+       struct lu_seq_range *in;
+       struct lu_seq_range *out;
+       int rc;
+       __u32 *opc;
+       ENTRY;
+
+       rc = req_capsule_server_pack(info->fti_pill);
+       if (rc)
+               RETURN(err_serious(rc));
+
+       opc = req_capsule_client_get(info->fti_pill, &RMF_FLD_OPC);
+       if (opc != NULL) {
+               in = req_capsule_client_get(info->fti_pill, &RMF_FLD_MDFLD);
+               if (in == NULL)
+                       RETURN(err_serious(-EPROTO));
+               out = req_capsule_server_get(info->fti_pill, &RMF_FLD_MDFLD);
+               if (out == NULL)
+                       RETURN(err_serious(-EPROTO));
+               *out = *in;
+
+               /* For old 2.0 clients, 'lsr_flags' is uninitialized.
+                * Set it to 'LU_SEQ_RANGE_MDT' by default. */
+               if (!(exp_connect_flags(exp) & OBD_CONNECT_64BITHASH) &&
+                   !(exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) &&
+                   !(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) &&
+                   !exp->exp_libclient)
+                       fld_range_set_mdt(out);
+
+               rc = fld_server_handle(lu_site2seq(site)->ss_server_fld,
+                                      req->rq_svc_thread->t_env,
+                                      *opc, out, info);
+       } else {
+               rc = err_serious(-EPROTO);
+       }
+
+       RETURN(rc);
+}
+
+static void fld_thread_info_init(struct ptlrpc_request *req,
+                                struct fld_thread_info *info)
+{
+       info->fti_pill = &req->rq_pill;
+       /* Init request capsule. */
+       req_capsule_init(info->fti_pill, req, RCL_SERVER);
+       req_capsule_set(info->fti_pill, &RQF_FLD_QUERY);
+}
+
+static void fld_thread_info_fini(struct fld_thread_info *info)
+{
+       req_capsule_fini(info->fti_pill);
+}
+
+static int fld_handle(struct ptlrpc_request *req)
+{
+       struct fld_thread_info *info;
+       const struct lu_env *env;
+       int rc;
+
+       env = req->rq_svc_thread->t_env;
+       LASSERT(env != NULL);
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       LASSERT(info != NULL);
+
+       fld_thread_info_init(req, info);
+       rc = fld_req_handle(req, info);
+       fld_thread_info_fini(info);
+
+       return rc;
+}
+
+/*
+ * Entry point for handling FLD RPCs called from MDT.
+ */
+int fld_query(struct com_thread_info *info)
+{
+       return fld_handle(info->cti_pill->rc_req);
+}
+EXPORT_SYMBOL(fld_query);
+
+/*
+ * Returns true if fid is local to this server node.
+ *
+ * WARNING: this function is *not* guaranteed to return false if fid is
+ * remote: it makes an educated conservative guess only.
+ *
+ * fid_is_local() is supposed to be used in assertion checks only.
+ */
+int fid_is_local(const struct lu_env *env,
+                struct lu_site *site, const struct lu_fid *fid)
+{
+       int result;
+       struct seq_server_site *ss_site;
+       struct lu_seq_range *range;
+       struct fld_thread_info *info;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       range = &info->fti_lrange;
+
+       result = 1; /* conservatively assume fid is local */
+       ss_site = lu_site2seq(site);
+       if (ss_site->ss_client_fld != NULL) {
+               int rc;
+
+               rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache,
+                                     fid_seq(fid), range);
+               if (rc == 0)
+                       result = (range->lsr_index == ss_site->ss_node_id);
+       }
+       return result;
+}
+EXPORT_SYMBOL(fid_is_local);
+
+static void fld_server_proc_fini(struct lu_server_fld *fld);
+
+#ifdef LPROCFS
+static int fld_server_proc_init(struct lu_server_fld *fld)
+{
+       int rc = 0;
+       ENTRY;
+
+       fld->lsf_proc_dir = lprocfs_register(fld->lsf_name,
+                                            fld_type_proc_dir,
+                                            fld_server_proc_list, fld);
+       if (IS_ERR(fld->lsf_proc_dir)) {
+               rc = PTR_ERR(fld->lsf_proc_dir);
+               RETURN(rc);
+       }
+
+       rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444,
+                               &fld_proc_seq_fops, fld);
+       if (rc) {
+               lprocfs_remove(&fld->lsf_proc_dir);
+               fld->lsf_proc_dir = NULL;
+       }
+
+       RETURN(rc);
+}
+
+static void fld_server_proc_fini(struct lu_server_fld *fld)
+{
+       ENTRY;
+       if (fld->lsf_proc_dir != NULL) {
+               if (!IS_ERR(fld->lsf_proc_dir))
+                       lprocfs_remove(&fld->lsf_proc_dir);
+               fld->lsf_proc_dir = NULL;
+       }
+       EXIT;
+}
+#else
+static int fld_server_proc_init(struct lu_server_fld *fld)
+{
+       return 0;
+}
+
+static void fld_server_proc_fini(struct lu_server_fld *fld)
+{
+       return;
+}
+#endif
+
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+                   struct dt_device *dt, const char *prefix, int mds_node_id,
+                   int type)
+{
+       int cache_size, cache_threshold;
+       int rc;
+       ENTRY;
+
+       snprintf(fld->lsf_name, sizeof(fld->lsf_name),
+                "srv-%s", prefix);
+
+       cache_size = FLD_SERVER_CACHE_SIZE /
+               sizeof(struct fld_cache_entry);
+
+       cache_threshold = cache_size *
+               FLD_SERVER_CACHE_THRESHOLD / 100;
+
+       mutex_init(&fld->lsf_lock);
+       fld->lsf_cache = fld_cache_init(fld->lsf_name,
+                                       cache_size, cache_threshold);
+       if (IS_ERR(fld->lsf_cache)) {
+               rc = PTR_ERR(fld->lsf_cache);
+               fld->lsf_cache = NULL;
+               GOTO(out, rc);
+       }
+
+       if (!mds_node_id && type == LU_SEQ_RANGE_MDT) {
+               rc = fld_index_init(env, fld, dt);
+               if (rc)
+                       GOTO(out, rc);
+       } else {
+               fld->lsf_obj = NULL;
+       }
+
+       rc = fld_server_proc_init(fld);
+       if (rc)
+               GOTO(out, rc);
+
+       fld->lsf_control_exp = NULL;
+
+       GOTO(out, rc);
+
+out:
+       if (rc)
+               fld_server_fini(env, fld);
+       return rc;
+}
+EXPORT_SYMBOL(fld_server_init);
+
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+       ENTRY;
+
+       fld_server_proc_fini(fld);
+       fld_index_fini(env, fld);
+
+       if (fld->lsf_cache != NULL) {
+               if (!IS_ERR(fld->lsf_cache))
+                       fld_cache_fini(fld->lsf_cache);
+               fld->lsf_cache = NULL;
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(fld_server_fini);
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FLD");
+MODULE_LICENSE("GPL");
+
+cfs_module(fld, "0.1.0", fld_mod_init, fld_mod_exit);
diff --git a/drivers/staging/lustre/lustre/fld/fld_index.c b/drivers/staging/lustre/lustre/fld/fld_index.c
new file mode 100644 (file)
index 0000000..ec68a54
--- /dev/null
@@ -0,0 +1,426 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_index.c
+ *
+ * Author: WangDi <wangdi@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_mdc.h>
+#include <lustre_fid.h>
+#include <lustre_fld.h>
+#include "fld_internal.h"
+
+const char fld_index_name[] = "fld";
+
+static const struct lu_seq_range IGIF_FLD_RANGE = {
+       .lsr_start = FID_SEQ_IGIF,
+       .lsr_end   = FID_SEQ_IGIF_MAX + 1,
+       .lsr_index = 0,
+       .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = {
+       .lsr_start = FID_SEQ_DOT_LUSTRE,
+       .lsr_end   = FID_SEQ_DOT_LUSTRE + 1,
+       .lsr_index = 0,
+       .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct lu_seq_range ROOT_FLD_RANGE = {
+       .lsr_start = FID_SEQ_ROOT,
+       .lsr_end   = FID_SEQ_ROOT + 1,
+       .lsr_index = 0,
+       .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
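+/* on-disk FLD index layout: the key is the 64-bit start sequence of a range
+ * and the record is the corresponding lu_seq_range */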
+const struct dt_index_features fld_index_features = {
+       .dif_flags       = DT_IND_UPDATE,
+       .dif_keysize_min = sizeof(seqno_t),
+       .dif_keysize_max = sizeof(seqno_t),
+       .dif_recsize_min = sizeof(struct lu_seq_range),
+       .dif_recsize_max = sizeof(struct lu_seq_range),
+       .dif_ptrsize     = 4
+};
+
+extern struct lu_context_key fld_thread_key;
+
+int fld_declare_index_create(const struct lu_env *env,
+                            struct lu_server_fld *fld,
+                            const struct lu_seq_range *new_range,
+                            struct thandle *th)
+{
+       struct lu_seq_range     *tmp;
+       struct lu_seq_range     *range;
+       struct fld_thread_info  *info;
+       int                     rc = 0;
+
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       range = &info->fti_lrange;
+       tmp = &info->fti_irange;
+       memset(range, 0, sizeof(*range));
+
+       rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+       if (rc == 0) {
+               /* In case of a duplicate entry, the location must be the same */
+               LASSERT((range_compare_loc(new_range, range) == 0));
+               GOTO(out, rc = -EEXIST);
+       }
+
+       if (rc != -ENOENT) {
+               CERROR("%s: lookup range "DRANGE" error: rc = %d\n",
+                       fld->lsf_name, PRANGE(range), rc);
+               GOTO(out, rc);
+       }
+
+       /* Check for the merge case: since fld entries can only grow
+        * incrementally, we only check whether the new range can be merged
+        * from the left. */
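+       /* e.g. an existing [0x100, 0x200) record on the same index grows to
+        * [0x100, 0x300) when [0x200, 0x300) is inserted */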
+       if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+           range_compare_loc(new_range, range) == 0) {
+               range_cpu_to_be(tmp, range);
+               rc = dt_declare_delete(env, fld->lsf_obj,
+                                      (struct dt_key *)&tmp->lsr_start, th);
+               if (rc) {
+                       CERROR("%s: declare record "DRANGE" failed: rc = %d\n",
+                              fld->lsf_name, PRANGE(range), rc);
+                       GOTO(out, rc);
+               }
+               memcpy(tmp, new_range, sizeof(*new_range));
+               tmp->lsr_start = range->lsr_start;
+       } else {
+               memcpy(tmp, new_range, sizeof(*new_range));
+       }
+
+       range_cpu_to_be(tmp, tmp);
+       rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+                              (struct dt_key *)&tmp->lsr_start, th);
+out:
+       RETURN(rc);
+}
+
+/**
+ * Insert a range into the fld store.
+ *
+ *      \param  range  range to be inserted
+ *      \param  th     transaction for this operation, as it may be part of a
+ *                  compound transaction.
+ *
+ *      \retval  0  success
+ *      \retval  -ve error
+ *
+ * The whole fld index insertion is protected by seq->lss_mutex (see
+ * seq_server_alloc_super), i.e. only one thread accesses the fldb at a
+ * time, so we do not need to worry about the fld file and cache being
+ * changed between declare and create.
+ * Because fld entries can only grow incrementally, we only check whether
+ * the new range can be merged from the left.
+ **/
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+                    const struct lu_seq_range *new_range, struct thandle *th)
+{
+       struct lu_seq_range     *range;
+       struct lu_seq_range     *tmp;
+       struct fld_thread_info  *info;
+       int                     rc = 0;
+       int                     deleted = 0;
+       struct fld_cache_entry  *flde;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+
+       LASSERT(mutex_is_locked(&fld->lsf_lock));
+
+       range = &info->fti_lrange;
+       memset(range, 0, sizeof(*range));
+       tmp = &info->fti_irange;
+       rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+       if (rc != -ENOENT) {
+               rc = rc == 0 ? -EEXIST : rc;
+               GOTO(out, rc);
+       }
+
+       if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+           range_compare_loc(new_range, range) == 0) {
+               range_cpu_to_be(tmp, range);
+               rc = dt_delete(env, fld->lsf_obj,
+                              (struct dt_key *)&tmp->lsr_start, th,
+                               BYPASS_CAPA);
+               if (rc != 0)
+                       GOTO(out, rc);
+               memcpy(tmp, new_range, sizeof(*new_range));
+               tmp->lsr_start = range->lsr_start;
+               deleted = 1;
+       } else {
+               memcpy(tmp, new_range, sizeof(*new_range));
+       }
+
+       range_cpu_to_be(tmp, tmp);
+       rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+                      (struct dt_key *)&tmp->lsr_start, th, BYPASS_CAPA, 1);
+       if (rc != 0) {
+               CERROR("%s: insert range "DRANGE" failed: rc = %d\n",
+                      fld->lsf_name, PRANGE(new_range), rc);
+               GOTO(out, rc);
+       }
+
+       flde = fld_cache_entry_create(new_range);
+       if (IS_ERR(flde))
+               GOTO(out, rc = PTR_ERR(flde));
+
+       write_lock(&fld->lsf_cache->fci_lock);
+       if (deleted)
+               fld_cache_delete_nolock(fld->lsf_cache, new_range);
+       rc = fld_cache_insert_nolock(fld->lsf_cache, flde);
+       write_unlock(&fld->lsf_cache->fci_lock);
+       if (rc)
+               OBD_FREE_PTR(flde);
+out:
+       RETURN(rc);
+}
+
+/**
+ * Look up the range for the passed \a seq. Note that only start/end matter
+ * here; the caller should handle the attached location data (flags, index).
+ *
+ * \param  seq     seq for lookup.
+ * \param  range   result of lookup.
+ *
+ * \retval  0     found, \a range is the matched range;
+ * \retval -ENOENT      not found, \a range is the left-side range;
+ * \retval  -ve         other error;
+ */
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+                    seqno_t seq, struct lu_seq_range *range)
+{
+       struct lu_seq_range     *fld_rec;
+       struct fld_thread_info  *info;
+       int rc;
+
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       fld_rec = &info->fti_rec;
+
+       rc = fld_cache_lookup(fld->lsf_cache, seq, fld_rec);
+       if (rc == 0) {
+               *range = *fld_rec;
+               if (range_within(range, seq))
+                       rc = 0;
+               else
+                       rc = -ENOENT;
+       }
+
+       CDEBUG(D_INFO, "%s: lookup seq = "LPX64" range : "DRANGE" rc = %d\n",
+              fld->lsf_name, seq, PRANGE(range), rc);
+
+       RETURN(rc);
+}
+
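+/**
+ * Insert \a range into the FLD index inside its own local transaction.
+ * An entry that already exists (-EEXIST) is not treated as an error.
+ */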
+int fld_insert_entry(const struct lu_env *env,
+                    struct lu_server_fld *fld,
+                    const struct lu_seq_range *range)
+{
+       struct thandle *th;
+       int rc;
+       ENTRY;
+
+       th = dt_trans_create(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev));
+       if (IS_ERR(th))
+               RETURN(PTR_ERR(th));
+
+       rc = fld_declare_index_create(env, fld, range, th);
+       if (rc != 0) {
+               if (rc == -EEXIST)
+                       rc = 0;
+               GOTO(out, rc);
+       }
+
+       rc = dt_trans_start_local(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev),
+                                 th);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = fld_index_create(env, fld, range, th);
+       if (rc == -EEXIST)
+               rc = 0;
+out:
+       dt_trans_stop(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev), th);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_insert_entry);
+
+static int fld_insert_special_entries(const struct lu_env *env,
+                                     struct lu_server_fld *fld)
+{
+       int rc;
+
+       rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE);
+       if (rc != 0)
+               RETURN(rc);
+
+       rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE);
+       if (rc != 0)
+               RETURN(rc);
+
+       rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE);
+
+       RETURN(rc);
+}
+
+int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld,
+                  struct dt_device *dt)
+{
+       struct dt_object        *dt_obj = NULL;
+       struct lu_fid           fid;
+       struct lu_attr          *attr = NULL;
+       struct lu_seq_range     *range = NULL;
+       struct fld_thread_info  *info;
+       struct dt_object_format dof;
+       struct dt_it            *it;
+       const struct dt_it_ops  *iops;
+       int                     rc;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       LASSERT(info != NULL);
+
+       lu_local_obj_fid(&fid, FLD_INDEX_OID);
+       OBD_ALLOC_PTR(attr);
+       if (attr == NULL)
+               RETURN(-ENOMEM);
+
+       memset(attr, 0, sizeof(*attr));
+       attr->la_valid = LA_MODE;
+       attr->la_mode = S_IFREG | 0666;
+       dof.dof_type = DFT_INDEX;
+       dof.u.dof_idx.di_feat = &fld_index_features;
+
+       dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr);
+       if (IS_ERR(dt_obj)) {
+               rc = PTR_ERR(dt_obj);
+               CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name,
+                       fld_index_name, rc);
+               dt_obj = NULL;
+               GOTO(out, rc);
+       }
+
+       fld->lsf_obj = dt_obj;
+       rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features);
+       if (rc != 0) {
+               CERROR("%s: File \"%s\" is not an index: rc = %d!\n",
+                      fld->lsf_name, fld_index_name, rc);
+               GOTO(out, rc);
+       }
+
+       range = &info->fti_rec;
+       /* Load fld entry to cache */
+       iops = &dt_obj->do_index_ops->dio_it;
+       it = iops->init(env, dt_obj, 0, NULL);
+       if (IS_ERR(it))
+               GOTO(out, rc = PTR_ERR(it));
+
+       rc = iops->load(env, it, 0);
+       if (rc < 0)
+               GOTO(out_it_fini, rc);
+
+       if (rc > 0) {
+               /* Load FLD entry into server cache */
+               do {
+                       rc = iops->rec(env, it, (struct dt_rec *)range, 0);
+                       if (rc != 0)
+                               GOTO(out_it_put, rc);
+                       LASSERT(range != NULL);
+                       range_be_to_cpu(range, range);
+                       rc = fld_cache_insert(fld->lsf_cache, range);
+                       if (rc != 0)
+                               GOTO(out_it_put, rc);
+                       rc = iops->next(env, it);
+               } while (rc == 0);
+       }
+
+       /* Note: fld_insert_entry will detect whether these
+        * special entries already exist inside FLDB */
+       mutex_lock(&fld->lsf_lock);
+       rc = fld_insert_special_entries(env, fld);
+       mutex_unlock(&fld->lsf_lock);
+       if (rc != 0) {
+               CERROR("%s: insert special entries failed!: rc = %d\n",
+                      fld->lsf_name, rc);
+               GOTO(out_it_put, rc);
+       }
+
+out_it_put:
+       iops->put(env, it);
+out_it_fini:
+       iops->fini(env, it);
+out:
+       if (attr != NULL)
+               OBD_FREE_PTR(attr);
+
+       if (rc != 0) {
+               if (dt_obj != NULL)
+                       lu_object_put(env, &dt_obj->do_lu);
+               fld->lsf_obj = NULL;
+       }
+       RETURN(rc);
+}
+
+void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+       ENTRY;
+       if (fld->lsf_obj != NULL) {
+               if (!IS_ERR(fld->lsf_obj))
+                       lu_object_put(env, &fld->lsf_obj->do_lu);
+               fld->lsf_obj = NULL;
+       }
+       EXIT;
+}
diff --git a/drivers/staging/lustre/lustre/fld/fld_internal.h b/drivers/staging/lustre/lustre/fld/fld_internal.h
new file mode 100644 (file)
index 0000000..9fa9e01
--- /dev/null
@@ -0,0 +1,223 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Tom WangDi <wangdi@clusterfs.com>
+ */
+#ifndef __FLD_INTERNAL_H
+#define __FLD_INTERNAL_H
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+
+enum {
+       LUSTRE_FLD_INIT = 1 << 0,
+       LUSTRE_FLD_RUN  = 1 << 1
+};
+
+struct fld_stats {
+       __u64   fst_count;
+       __u64   fst_cache;
+       __u64   fst_inflight;
+};
+
+typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64);
+
+typedef struct lu_fld_target *
+(*fld_scan_func_t) (struct lu_client_fld *, __u64);
+
+struct lu_fld_hash {
+       const char            *fh_name;
+       fld_hash_func_t   fh_hash_func;
+       fld_scan_func_t   fh_scan_func;
+};
+
+struct fld_cache_entry {
+       struct list_head               fce_lru;
+       struct list_head               fce_list;
+       /**
+        * fld cache entries are sorted on range->lsr_start field. */
+       struct lu_seq_range      fce_range;
+};
+
+struct fld_cache {
+       /**
+        * Cache guard: mostly protects the entry lists, since the other
+        * fields are immutable after init is finished.
+        */
+       rwlock_t                 fci_lock;
+
+       /**
+        * Cache shrink threshold */
+       int                   fci_threshold;
+
+       /**
+        * Preferred number of cached entries */
+       int                   fci_cache_size;
+
+       /**
+        * Current number of cached entries. Protected by \a fci_lock */
+       int                   fci_cache_count;
+
+       /**
+        * LRU list of fld entries. */
+       struct list_head               fci_lru;
+
+       /**
+        * Sorted fld entries. */
+       struct list_head               fci_entries_head;
+
+       /**
+        * Cache statistics. */
+       struct fld_stats         fci_stat;
+
+       /**
+        * Cache name used for debug and messages. */
+       char                 fci_name[80];
+       unsigned int             fci_no_shrink:1;
+};
+
+enum fld_op {
+       FLD_CREATE = 0,
+       FLD_DELETE = 1,
+       FLD_LOOKUP = 2
+};
+
+enum {
+       /* 4M of FLD cache will not hurt the server a lot. */
+       FLD_SERVER_CACHE_SIZE      = (4 * 0x100000),
+
+       /* 1M of FLD cache will not hurt the client a lot. */
+       FLD_CLIENT_CACHE_SIZE      = (1 * 0x100000)
+};
+
+enum {
+       /* Cache threshold is 10 percent of size. */
+       FLD_SERVER_CACHE_THRESHOLD = 10,
+
+       /* Cache threshold is 10 percent of size. */
+       FLD_CLIENT_CACHE_THRESHOLD = 10
+};
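+
+/* fld_server_init(), for example, converts these byte sizes into entry
+ * counts: cache_size = FLD_SERVER_CACHE_SIZE / sizeof(struct fld_cache_entry),
+ * cache_threshold = cache_size * FLD_SERVER_CACHE_THRESHOLD / 100. */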
+
+extern struct lu_fld_hash fld_hash[];
+
+
+struct fld_thread_info {
+       struct req_capsule *fti_pill;
+       __u64          fti_key;
+       struct lu_seq_range fti_rec;
+       struct lu_seq_range fti_lrange;
+       struct lu_seq_range fti_irange;
+};
+
+extern struct lu_context_key fld_thread_key;
+
+int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld,
+                  struct dt_device *dt);
+
+void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_index_create(const struct lu_env *env,
+                            struct lu_server_fld *fld,
+                            const struct lu_seq_range *new,
+                            struct thandle *th);
+
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+                    const struct lu_seq_range *new, struct thandle *th);
+
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+                    seqno_t seq, struct lu_seq_range *range);
+
+int fld_client_rpc(struct obd_export *exp,
+                  struct lu_seq_range *range, __u32 fld_op);
+
+#ifdef LPROCFS
+extern struct lprocfs_vars fld_server_proc_list[];
+extern struct lprocfs_vars fld_client_proc_list[];
+#endif
+
+
+struct fld_cache *fld_cache_init(const char *name,
+                                int cache_size, int cache_threshold);
+
+void fld_cache_fini(struct fld_cache *cache);
+
+void fld_cache_flush(struct fld_cache *cache);
+
+int fld_cache_insert(struct fld_cache *cache,
+                    const struct lu_seq_range *range);
+
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range);
+
+int fld_cache_insert_nolock(struct fld_cache *cache,
+                           struct fld_cache_entry *f_new);
+void fld_cache_delete(struct fld_cache *cache,
+                     const struct lu_seq_range *range);
+void fld_cache_delete_nolock(struct fld_cache *cache,
+                            const struct lu_seq_range *range);
+int fld_cache_lookup(struct fld_cache *cache,
+                    const seqno_t seq, struct lu_seq_range *range);
+
+struct fld_cache_entry*
+fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range);
+void fld_cache_entry_delete(struct fld_cache *cache,
+                           struct fld_cache_entry *node);
+void fld_dump_cache_entries(struct fld_cache *cache);
+
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+                             struct lu_seq_range *range);
+int fld_write_range(const struct lu_env *env, struct dt_object *dt,
+                   const struct lu_seq_range *range, struct thandle *th);
+
+static inline const char *
+fld_target_name(struct lu_fld_target *tar)
+{
+       if (tar->ft_srv != NULL)
+               return tar->ft_srv->lsf_name;
+
+       return (const char *)tar->ft_exp->exp_obd->obd_name;
+}
+
+extern proc_dir_entry_t *fld_type_proc_dir;
+extern struct file_operations fld_proc_seq_fops;
+#endif /* __FLD_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c
new file mode 100644 (file)
index 0000000..e9f0739
--- /dev/null
@@ -0,0 +1,519 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_request.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include <lustre_mdc.h>
+#include "fld_internal.h"
+
+/* TODO: these 3 functions are copies of the flow-control code from mdc_lib.c.
+ * They should be made common, and the same goes for the mdc RPC lock. */
+static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+       int rc;
+       ENTRY;
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = list_empty(&mcw->mcw_entry);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       RETURN(rc);
+};
+
+static void fld_enter_request(struct client_obd *cli)
+{
+       struct mdc_cache_waiter mcw;
+       struct l_wait_info lwi = { 0 };
+
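+       /* wait on the cache-waiters queue until one of the
+        * cl_max_rpcs_in_flight RPC slots becomes free */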
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+               list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+               init_waitqueue_head(&mcw.mcw_waitq);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               l_wait_event(mcw.mcw_waitq, fld_req_avail(cli, &mcw), &lwi);
+       } else {
+               cli->cl_r_in_flight++;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+       }
+}
+
+static void fld_exit_request(struct client_obd *cli)
+{
+       struct list_head *l, *tmp;
+       struct mdc_cache_waiter *mcw;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_r_in_flight--;
+       list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+
+               if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+                       /* No free request slots anymore */
+                       break;
+               }
+
+               mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry);
+               list_del_init(&mcw->mcw_entry);
+               cli->cl_r_in_flight++;
+               wake_up(&mcw->mcw_waitq);
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+static int fld_rrb_hash(struct lu_client_fld *fld,
+                       seqno_t seq)
+{
+       LASSERT(fld->lcf_count > 0);
+       return do_div(seq, fld->lcf_count);
+}
+
+static struct lu_fld_target *
+fld_rrb_scan(struct lu_client_fld *fld, seqno_t seq)
+{
+       struct lu_fld_target *target;
+       int hash;
+       ENTRY;
+
+       /* Because almost all special sequences are located on MDT0,
+        * they should go to index 0 directly, instead of being hashed
+        * again; this also ensures that if other MDTs are not yet
+        * connected, fld lookup requests (for seqs on MDT0) are not
+        * blocked waiting on those other MDTs. */
+       if (fid_seq_is_norm(seq))
+               hash = fld_rrb_hash(fld, seq);
+       else
+               hash = 0;
+
+       list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+               if (target->ft_idx == hash)
+                       RETURN(target);
+       }
+
+       CERROR("%s: Can't find target by hash %d (seq "LPX64"). "
+              "Targets (%d):\n", fld->lcf_name, hash, seq,
+              fld->lcf_count);
+
+       list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+               const char *srv_name = target->ft_srv != NULL  ?
+                       target->ft_srv->lsf_name : "<null>";
+               const char *exp_name = target->ft_exp != NULL ?
+                       (char *)target->ft_exp->exp_obd->obd_uuid.uuid :
+                       "<null>";
+
+               CERROR("  exp: 0x%p (%s), srv: 0x%p (%s), idx: "LPU64"\n",
+                      target->ft_exp, exp_name, target->ft_srv,
+                      srv_name, target->ft_idx);
+       }
+
+       /*
+        * If the target is not found, there is a logical error anyway, so
+        * LBUG() is used here to catch this situation.
+        */
+       LBUG();
+       RETURN(NULL);
+}
+
+struct lu_fld_hash fld_hash[] = {
+       {
+               .fh_name = "RRB",
+               .fh_hash_func = fld_rrb_hash,
+               .fh_scan_func = fld_rrb_scan
+       },
+       {
+               0,
+       }
+};
+
+static struct lu_fld_target *
+fld_client_get_target(struct lu_client_fld *fld, seqno_t seq)
+{
+       struct lu_fld_target *target;
+       ENTRY;
+
+       LASSERT(fld->lcf_hash != NULL);
+
+       spin_lock(&fld->lcf_lock);
+       target = fld->lcf_hash->fh_scan_func(fld, seq);
+       spin_unlock(&fld->lcf_lock);
+
+       if (target != NULL) {
+               CDEBUG(D_INFO, "%s: Found target (idx "LPU64
+                      ") by seq "LPX64"\n", fld->lcf_name,
+                      target->ft_idx, seq);
+       }
+
+       RETURN(target);
+}
+
+/*
+ * Add an export to the FLD. This is usually done by CMM and LMV, as they are
+ * the main users of the FLD module.
+ */
+int fld_client_add_target(struct lu_client_fld *fld,
+                         struct lu_fld_target *tar)
+{
+       const char *name;
+       struct lu_fld_target *target, *tmp;
+       ENTRY;
+
+       LASSERT(tar != NULL);
+       name = fld_target_name(tar);
+       LASSERT(name != NULL);
+       LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL);
+
+       if (fld->lcf_flags != LUSTRE_FLD_INIT) {
+               CERROR("%s: Attempt to add target %s (idx "LPU64") "
+                      "on the fly - skip it\n", fld->lcf_name, name,
+                      tar->ft_idx);
+               RETURN(0);
+       } else {
+               CDEBUG(D_INFO, "%s: Adding target %s (idx "
+                      LPU64")\n", fld->lcf_name, name, tar->ft_idx);
+       }
+
+       OBD_ALLOC_PTR(target);
+       if (target == NULL)
+               RETURN(-ENOMEM);
+
+       spin_lock(&fld->lcf_lock);
+       list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) {
+               if (tmp->ft_idx == tar->ft_idx) {
+                       spin_unlock(&fld->lcf_lock);
+                       OBD_FREE_PTR(target);
+                       CERROR("Target %s exists in FLD and known as %s:#"LPU64"\n",
+                              name, fld_target_name(tmp), tmp->ft_idx);
+                       RETURN(-EEXIST);
+               }
+       }
+
+       target->ft_exp = tar->ft_exp;
+       if (target->ft_exp != NULL)
+               class_export_get(target->ft_exp);
+       target->ft_srv = tar->ft_srv;
+       target->ft_idx = tar->ft_idx;
+
+       list_add_tail(&target->ft_chain,
+                         &fld->lcf_targets);
+
+       fld->lcf_count++;
+       spin_unlock(&fld->lcf_lock);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(fld_client_add_target);
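+
+/*
+ * Illustrative caller sketch (an assumption, not part of this commit): an
+ * LMV-like user would fill a lu_fld_target on the stack and register it,
+ * one target per MDT index. The names "exp" and "mdt_index" below are
+ * hypothetical placeholders supplied by the caller.
+ *
+ *     struct lu_fld_target tar = {
+ *             .ft_srv = NULL,
+ *             .ft_exp = exp,
+ *             .ft_idx = mdt_index,
+ *     };
+ *
+ *     rc = fld_client_add_target(fld, &tar);
+ *     if (rc != 0)
+ *             CERROR("%s: cannot add FLD target: rc = %d\n",
+ *                    fld->lcf_name, rc);
+ */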
+
+/* Remove export from FLD */
+int fld_client_del_target(struct lu_client_fld *fld, __u64 idx)
+{
+       struct lu_fld_target *target, *tmp;
+       ENTRY;
+
+       spin_lock(&fld->lcf_lock);
+       list_for_each_entry_safe(target, tmp,
+                                    &fld->lcf_targets, ft_chain) {
+               if (target->ft_idx == idx) {
+                       fld->lcf_count--;
+                       list_del(&target->ft_chain);
+                       spin_unlock(&fld->lcf_lock);
+
+                       if (target->ft_exp != NULL)
+                               class_export_put(target->ft_exp);
+
+                       OBD_FREE_PTR(target);
+                       RETURN(0);
+               }
+       }
+       spin_unlock(&fld->lcf_lock);
+       RETURN(-ENOENT);
+}
+EXPORT_SYMBOL(fld_client_del_target);
+
+#ifdef LPROCFS
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+       int rc;
+       ENTRY;
+
+       fld->lcf_proc_dir = lprocfs_register(fld->lcf_name,
+                                            fld_type_proc_dir,
+                                            NULL, NULL);
+
+       if (IS_ERR(fld->lcf_proc_dir)) {
+               CERROR("%s: LProcFS failed in fld-init\n",
+                      fld->lcf_name);
+               rc = PTR_ERR(fld->lcf_proc_dir);
+               RETURN(rc);
+       }
+
+       rc = lprocfs_add_vars(fld->lcf_proc_dir,
+                             fld_client_proc_list, fld);
+       if (rc) {
+               CERROR("%s: Can't init FLD proc, rc %d\n",
+                      fld->lcf_name, rc);
+               GOTO(out_cleanup, rc);
+       }
+
+       RETURN(0);
+
+out_cleanup:
+       fld_client_proc_fini(fld);
+       return rc;
+}
+
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+       ENTRY;
+       if (fld->lcf_proc_dir) {
+               if (!IS_ERR(fld->lcf_proc_dir))
+                       lprocfs_remove(&fld->lcf_proc_dir);
+               fld->lcf_proc_dir = NULL;
+       }
+       EXIT;
+}
+#else
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+       return 0;
+}
+
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+       return;
+}
+#endif
+
+EXPORT_SYMBOL(fld_client_proc_fini);
+
+static inline int hash_is_sane(int hash)
+{
+       return (hash >= 0 && hash < ARRAY_SIZE(fld_hash));
+}
+
+int fld_client_init(struct lu_client_fld *fld,
+                   const char *prefix, int hash)
+{
+       int cache_size, cache_threshold;
+       int rc;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       snprintf(fld->lcf_name, sizeof(fld->lcf_name),
+                "cli-%s", prefix);
+
+       if (!hash_is_sane(hash)) {
+               CERROR("%s: Wrong hash function %#x\n",
+                      fld->lcf_name, hash);
+               RETURN(-EINVAL);
+       }
+
+       fld->lcf_count = 0;
+       spin_lock_init(&fld->lcf_lock);
+       fld->lcf_hash = &fld_hash[hash];
+       fld->lcf_flags = LUSTRE_FLD_INIT;
+       INIT_LIST_HEAD(&fld->lcf_targets);
+
+       cache_size = FLD_CLIENT_CACHE_SIZE /
+               sizeof(struct fld_cache_entry);
+
+       cache_threshold = cache_size *
+               FLD_CLIENT_CACHE_THRESHOLD / 100;
+
+       fld->lcf_cache = fld_cache_init(fld->lcf_name,
+                                       cache_size, cache_threshold);
+       if (IS_ERR(fld->lcf_cache)) {
+               rc = PTR_ERR(fld->lcf_cache);
+               fld->lcf_cache = NULL;
+               GOTO(out, rc);
+       }
+
+       rc = fld_client_proc_init(fld);
+       if (rc)
+               GOTO(out, rc);
+       EXIT;
+out:
+       if (rc)
+               fld_client_fini(fld);
+       else
+               CDEBUG(D_INFO, "%s: Using \"%s\" hash\n",
+                      fld->lcf_name, fld->lcf_hash->fh_name);
+       return rc;
+}
+EXPORT_SYMBOL(fld_client_init);
+
+void fld_client_fini(struct lu_client_fld *fld)
+{
+       struct lu_fld_target *target, *tmp;
+       ENTRY;
+
+       spin_lock(&fld->lcf_lock);
+       list_for_each_entry_safe(target, tmp,
+                                    &fld->lcf_targets, ft_chain) {
+               fld->lcf_count--;
+               list_del(&target->ft_chain);
+               if (target->ft_exp != NULL)
+                       class_export_put(target->ft_exp);
+               OBD_FREE_PTR(target);
+       }
+       spin_unlock(&fld->lcf_lock);
+
+       if (fld->lcf_cache != NULL) {
+               if (!IS_ERR(fld->lcf_cache))
+                       fld_cache_fini(fld->lcf_cache);
+               fld->lcf_cache = NULL;
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(fld_client_fini);
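+
+/*
+ * A minimal setup/teardown sketch (illustrative only, not part of this
+ * commit): a client FLD instance is initialized once with a hash index
+ * into fld_hash[] above (0 selects "RRB") and torn down with
+ * fld_client_fini(). "fld" and "prefix" are supplied by the caller.
+ *
+ *     rc = fld_client_init(fld, prefix, 0);
+ *     if (rc != 0)
+ *             return rc;
+ *
+ *     ... add targets, perform lookups ...
+ *
+ *     fld_client_fini(fld);
+ */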
+
+int fld_client_rpc(struct obd_export *exp,
+                  struct lu_seq_range *range, __u32 fld_op)
+{
+       struct ptlrpc_request *req;
+       struct lu_seq_range   *prange;
+       __u32            *op;
+       int                 rc;
+       struct obd_import     *imp;
+       ENTRY;
+
+       LASSERT(exp != NULL);
+
+       imp = class_exp2cliimp(exp);
+       req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION,
+                                       FLD_QUERY);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC);
+       *op = fld_op;
+
+       prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD);
+       *prange = *range;
+
+       ptlrpc_request_set_replen(req);
+       req->rq_request_portal = FLD_REQUEST_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       if (fld_op == FLD_LOOKUP &&
+           imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS)
+               req->rq_allow_replay = 1;
+
+       if (fld_op != FLD_LOOKUP)
+               mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       fld_enter_request(&exp->exp_obd->u.cli);
+       rc = ptlrpc_queue_wait(req);
+       fld_exit_request(&exp->exp_obd->u.cli);
+       if (fld_op != FLD_LOOKUP)
+               mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       if (rc)
+               GOTO(out_req, rc);
+
+       prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD);
+       if (prange == NULL)
+               GOTO(out_req, rc = -EFAULT);
+       *range = *prange;
+       EXIT;
+out_req:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+                     __u32 flags, const struct lu_env *env)
+{
+       struct lu_seq_range res = { 0 };
+       struct lu_fld_target *target;
+       int rc;
+       ENTRY;
+
+       fld->lcf_flags |= LUSTRE_FLD_RUN;
+
+       rc = fld_cache_lookup(fld->lcf_cache, seq, &res);
+       if (rc == 0) {
+               *mds = res.lsr_index;
+               RETURN(0);
+       }
+
+       /* Cannot find it in the cache */
+       target = fld_client_get_target(fld, seq);
+       LASSERT(target != NULL);
+
+       CDEBUG(D_INFO, "%s: Lookup fld entry (seq: "LPX64") on "
+              "target %s (idx "LPU64")\n", fld->lcf_name, seq,
+              fld_target_name(target), target->ft_idx);
+
+       res.lsr_start = seq;
+       fld_range_set_type(&res, flags);
+       if (target->ft_srv != NULL) {
+               LASSERT(env != NULL);
+               rc = fld_server_lookup(env, target->ft_srv, seq, &res);
+       } else {
+               rc = fld_client_rpc(target->ft_exp, &res, FLD_LOOKUP);
+       }
+
+       if (rc == 0) {
+               *mds = res.lsr_index;
+
+               fld_cache_insert(fld->lcf_cache, &res);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_client_lookup);
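+
+/*
+ * Illustrative lookup sketch (an assumption, not part of this commit):
+ * resolve which MDT serves a given sequence, hitting the cache first and
+ * falling back to fld_client_rpc() as implemented above. LU_SEQ_RANGE_MDT
+ * is assumed to be the range type defined in the FID/FLD headers.
+ *
+ *     mdsno_t mds;
+ *
+ *     rc = fld_client_lookup(fld, seq, &mds, LU_SEQ_RANGE_MDT, env);
+ *     if (rc == 0)
+ *             CDEBUG(D_INFO, "seq "LPX64" is served by MDT%u\n",
+ *                    seq, (unsigned int)mds);
+ */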
+
+void fld_client_flush(struct lu_client_fld *fld)
+{
+       fld_cache_flush(fld->lcf_cache);
+}
+EXPORT_SYMBOL(fld_client_flush);
diff --git a/drivers/staging/lustre/lustre/fld/lproc_fld.c b/drivers/staging/lustre/lustre/fld/lproc_fld.c
new file mode 100644 (file)
index 0000000..00fe31e
--- /dev/null
@@ -0,0 +1,365 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/lproc_fld.c
+ *
+ * FLD (FIDs Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ *     Di Wang <di.wang@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include <lustre_fid.h>
+#include "fld_internal.h"
+
+#ifdef LPROCFS
+static int
+fld_proc_read_targets(char *page, char **start, off_t off,
+                     int count, int *eof, void *data)
+{
+       struct lu_client_fld *fld = (struct lu_client_fld *)data;
+       struct lu_fld_target *target;
+       int total = 0, rc;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       spin_lock(&fld->lcf_lock);
+       list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+               rc = snprintf(page, count, "%s\n",
+                             fld_target_name(target));
+               page += rc;
+               count -= rc;
+               total += rc;
+               if (count == 0)
+                       break;
+       }
+       spin_unlock(&fld->lcf_lock);
+       RETURN(total);
+}
+
+static int
+fld_proc_read_hash(char *page, char **start, off_t off,
+                  int count, int *eof, void *data)
+{
+       struct lu_client_fld *fld = (struct lu_client_fld *)data;
+       int rc;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       spin_lock(&fld->lcf_lock);
+       rc = snprintf(page, count, "%s\n", fld->lcf_hash->fh_name);
+       spin_unlock(&fld->lcf_lock);
+
+       RETURN(rc);
+}
+
+static int
+fld_proc_write_hash(struct file *file, const char *buffer,
+                   unsigned long count, void *data)
+{
+       struct lu_client_fld *fld = (struct lu_client_fld *)data;
+       struct lu_fld_hash *hash = NULL;
+       int i;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       for (i = 0; fld_hash[i].fh_name != NULL; i++) {
+               if (count != strlen(fld_hash[i].fh_name))
+                       continue;
+
+               if (!strncmp(fld_hash[i].fh_name, buffer, count)) {
+                       hash = &fld_hash[i];
+                       break;
+               }
+       }
+
+       if (hash != NULL) {
+               spin_lock(&fld->lcf_lock);
+               fld->lcf_hash = hash;
+               spin_unlock(&fld->lcf_lock);
+
+               CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n",
+                      fld->lcf_name, hash->fh_name);
+       }
+
+       RETURN(count);
+}
+
+static int
+fld_proc_write_cache_flush(struct file *file, const char *buffer,
+                          unsigned long count, void *data)
+{
+       struct lu_client_fld *fld = (struct lu_client_fld *)data;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       fld_cache_flush(fld->lcf_cache);
+
+       CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name);
+
+       RETURN(count);
+}
+
+struct fld_seq_param {
+       struct lu_env           fsp_env;
+       struct dt_it            *fsp_it;
+       struct lu_server_fld    *fsp_fld;
+       unsigned int            fsp_stop:1;
+};
+
+static void *fldb_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct fld_seq_param    *param = p->private;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+
+       if (param == NULL || param->fsp_stop)
+               return NULL;
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       iops->load(&param->fsp_env, param->fsp_it, *pos);
+
+       *pos = be64_to_cpu(*(__u64 *)iops->key(&param->fsp_env, param->fsp_it));
+       return param;
+}
+
+static void fldb_seq_stop(struct seq_file *p, void *v)
+{
+       struct fld_seq_param    *param = p->private;
+       const struct dt_it_ops  *iops;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+
+       if (param == NULL)
+               return;
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       iops->put(&param->fsp_env, param->fsp_it);
+}
+
+static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct fld_seq_param    *param = p->private;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+       int                     rc;
+
+       if (param == NULL || param->fsp_stop)
+               return NULL;
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       rc = iops->next(&param->fsp_env, param->fsp_it);
+       if (rc > 0) {
+               param->fsp_stop = 1;
+               return NULL;
+       }
+
+       *pos = be64_to_cpu(*(__u64 *)iops->key(&param->fsp_env, param->fsp_it));
+       return param;
+}
+
+static int fldb_seq_show(struct seq_file *p, void *v)
+{
+       struct fld_seq_param    *param = p->private;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+       struct fld_thread_info  *info;
+       struct lu_seq_range     *fld_rec;
+       int                     rc;
+
+       if (param == NULL || param->fsp_stop)
+               return 0;
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       info = lu_context_key_get(&param->fsp_env.le_ctx,
+                                 &fld_thread_key);
+       fld_rec = &info->fti_rec;
+       rc = iops->rec(&param->fsp_env, param->fsp_it,
+                      (struct dt_rec *)fld_rec, 0);
+       if (rc != 0) {
+               CERROR("%s:read record error: rc %d\n",
+                      fld->lsf_name, rc);
+       } else if (fld_rec->lsr_start != 0) {
+               range_be_to_cpu(fld_rec, fld_rec);
+               rc = seq_printf(p, DRANGE"\n", PRANGE(fld_rec));
+       }
+
+       return rc;
+}
+
+struct seq_operations fldb_sops = {
+       .start = fldb_seq_start,
+       .stop = fldb_seq_stop,
+       .next = fldb_seq_next,
+       .show = fldb_seq_show,
+};
+
+static int fldb_seq_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry   *dp = PDE(inode);
+       struct seq_file         *seq;
+       struct lu_server_fld    *fld = (struct lu_server_fld *)dp->data;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+       struct fld_seq_param    *param = NULL;
+       int                     env_init = 0;
+       int                     rc;
+
+       LPROCFS_ENTRY_AND_CHECK(dp);
+       rc = seq_open(file, &fldb_sops);
+       if (rc)
+               GOTO(out, rc);
+
+       obj = fld->lsf_obj;
+       if (obj == NULL) {
+               seq = file->private_data;
+               seq->private = NULL;
+               return 0;
+       }
+
+       OBD_ALLOC_PTR(param);
+       if (param == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = lu_env_init(&param->fsp_env, LCT_MD_THREAD);
+       if (rc != 0)
+               GOTO(out, rc);
+
+       env_init = 1;
+       iops = &obj->do_index_ops->dio_it;
+       param->fsp_it = iops->init(&param->fsp_env, obj, 0, NULL);
+       if (IS_ERR(param->fsp_it))
+               GOTO(out, rc = PTR_ERR(param->fsp_it));
+
+       param->fsp_fld = fld;
+       param->fsp_stop = 0;
+
+       seq = file->private_data;
+       seq->private = param;
+out:
+       if (rc != 0) {
+               if (env_init == 1)
+                       lu_env_fini(&param->fsp_env);
+               if (param != NULL)
+                       OBD_FREE_PTR(param);
+               LPROCFS_EXIT();
+       }
+       return rc;
+}
+
+static int fldb_seq_release(struct inode *inode, struct file *file)
+{
+       struct seq_file         *seq = file->private_data;
+       struct fld_seq_param    *param;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+
+       param = seq->private;
+       if (param == NULL) {
+               lprocfs_seq_release(inode, file);
+               return 0;
+       }
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       LASSERT(iops != NULL);
+       LASSERT(obj != NULL);
+       LASSERT(param->fsp_it != NULL);
+       iops->fini(&param->fsp_env, param->fsp_it);
+       lu_env_fini(&param->fsp_env);
+       OBD_FREE_PTR(param);
+       lprocfs_seq_release(inode, file);
+
+       return 0;
+}
+
+struct lprocfs_vars fld_server_proc_list[] = {
+       { NULL }};
+
+struct lprocfs_vars fld_client_proc_list[] = {
+       { "targets",     fld_proc_read_targets, NULL, NULL },
+       { "hash",       fld_proc_read_hash, fld_proc_write_hash, NULL },
+       { "cache_flush", NULL, fld_proc_write_cache_flush, NULL },
+       { NULL }};
+
+struct file_operations fld_proc_seq_fops = {
+       .owner   = THIS_MODULE,
+       .open    = fldb_seq_open,
+       .read    = seq_read,
+       .release = fldb_seq_release,
+};
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h
new file mode 100644 (file)
index 0000000..61c4635
--- /dev/null
@@ -0,0 +1,3281 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LUSTRE_CL_OBJECT_H
+#define _LUSTRE_CL_OBJECT_H
+
+/** \defgroup clio clio
+ *
+ * Client objects implement io operations and cache pages.
+ *
+ * Examples: lov and osc are implementations of cl interface.
+ *
+ * Big Theory Statement.
+ *
+ * Layered objects.
+ *
+ * Client implementation is based on the following data-types:
+ *
+ *   - cl_object
+ *
+ *   - cl_page
+ *
+ *   - cl_lock     represents an extent lock on an object.
+ *
+ *   - cl_io       represents a high-level i/o activity such as a whole
+ *              read/write system call, or a write-out of pages from under a
+ *              lock being canceled. A cl_io has sub-ios that can be stopped
+ *              and resumed independently, thus achieving a high degree of
+ *              transfer parallelism. A single cl_io can be advanced forward
+ *              by multiple threads (although in the most usual case of a
+ *              read/write system call it is associated with the single user
+ *              thread that issued the system call).
+ *
+ *   - cl_req      represents a collection of pages for a transfer. A cl_req is
+ *              constructed by the req-forming engine that tries to saturate
+ *              the transport with large and continuous transfers.
+ *
+ * Terminology
+ *
+ *     - to avoid confusion, a high-level I/O operation like a read or write
+ *     system call is referred to as "an io", whereas a low-level I/O
+ *     operation, like an RPC, is referred to as "a transfer"
+ *
+ *     - "generic code" means generic (not file system specific) code in the
+ *     hosting environment. "cl-code" means code (mostly in cl_*.c files) that
+ *     is not layer specific.
+ *
+ * Locking.
+ *
+ *  - i_mutex
+ *      - PG_locked
+ *       - cl_object_header::coh_page_guard
+ *       - cl_object_header::coh_lock_guard
+ *       - lu_site::ls_guard
+ *
+ * See the top comment in cl_object.c for the description of overall locking and
+ * reference-counting design.
+ *
+ * See comments below for the description of i/o, page, and dlm-locking
+ * design.
+ *
+ * @{
+ */
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+#include <lvfs.h>
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+
+struct inode;
+
+struct cl_device;
+struct cl_device_operations;
+
+struct cl_object;
+struct cl_object_page_operations;
+struct cl_object_lock_operations;
+
+struct cl_page;
+struct cl_page_slice;
+struct cl_lock;
+struct cl_lock_slice;
+
+struct cl_lock_operations;
+struct cl_page_operations;
+
+struct cl_io;
+struct cl_io_slice;
+
+struct cl_req;
+struct cl_req_slice;
+
+/**
+ * Operations for each data device in the client stack.
+ *
+ * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
+ */
+struct cl_device_operations {
+       /**
+        * Initialize cl_req. This method is called top-to-bottom on all
+        * devices in the stack to give them a chance to allocate layer-private
+        * data, and to attach them to the cl_req by calling
+        * cl_req_slice_add().
+        *
+        * \see osc_req_init(), lov_req_init(), lovsub_req_init()
+        * \see ccc_req_init()
+        */
+       int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
+                           struct cl_req *req);
+};
+
+/**
+ * Device in the client stack.
+ *
+ * \see ccc_device, lov_device, lovsub_device, osc_device
+ */
+struct cl_device {
+       /** Super-class. */
+       struct lu_device                   cd_lu_dev;
+       /** Per-layer operation vector. */
+       const struct cl_device_operations *cd_ops;
+};
+
+/** \addtogroup cl_object cl_object
+ * @{ */
+/**
+ * "Data attributes" of cl_object. Data attributes can be updated
+ * independently for a sub-object, and top-object's attributes are calculated
+ * from sub-objects' ones.
+ */
+struct cl_attr {
+       /** Object size, in bytes */
+       loff_t cat_size;
+       /**
+        * Known minimal size, in bytes.
+        *
+        * This is only valid when at least one DLM lock is held.
+        */
+       loff_t cat_kms;
+       /** Modification time. Measured in seconds since epoch. */
+       time_t cat_mtime;
+       /** Access time. Measured in seconds since epoch. */
+       time_t cat_atime;
+       /** Change time. Measured in seconds since epoch. */
+       time_t cat_ctime;
+       /**
+        * Blocks allocated to this cl_object on the server file system.
+        *
+        * \todo XXX An interface for block size is needed.
+        */
+       __u64  cat_blocks;
+       /**
+        * User identifier for quota purposes.
+        */
+       uid_t  cat_uid;
+       /**
+        * Group identifier for quota purposes.
+        */
+       gid_t  cat_gid;
+};
+
+/**
+ * Fields in cl_attr that are being set.
+ */
+enum cl_attr_valid {
+       CAT_SIZE   = 1 << 0,
+       CAT_KMS    = 1 << 1,
+       CAT_MTIME  = 1 << 3,
+       CAT_ATIME  = 1 << 4,
+       CAT_CTIME  = 1 << 5,
+       CAT_BLOCKS = 1 << 6,
+       CAT_UID    = 1 << 7,
+       CAT_GID    = 1 << 8
+};
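+
+/*
+ * Illustrative example (an assumption, not part of the original header):
+ * a caller updating only the size and modification time would combine the
+ * corresponding bits and pass the mask to the attr-set path, e.g. through
+ * the cl_object_attr_set() wrapper declared later in this header:
+ *
+ *     attr->cat_size  = new_size;
+ *     attr->cat_mtime = now;
+ *     rc = cl_object_attr_set(env, obj, attr, CAT_SIZE | CAT_MTIME);
+ */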
+
+/**
+ * Sub-class of lu_object with methods common for objects on the client
+ * stacks.
+ *
+ * cl_object: represents a regular file system object, both a file and a
+ *    stripe. cl_object is based on lu_object: it is identified by a fid,
+ *    layered, cached, hashed, and lrued. An important distinction from the
+ *    server side, where md_object and dt_object are used, is that cl_object
+ *    "fans out" at the lov/sns level: depending on the file layout, a single
+ *    file is
+ *    represented as a set of "sub-objects" (stripes). At the implementation
+ *    level, struct lov_object contains an array of cl_objects. Each sub-object
+ *    is a full-fledged cl_object, having its fid, living in the lru and hash
+ *    table.
+ *
+ *    This leads to the next important difference from the server side: on the
+ *    client, it is quite usual to have objects with different sequences of
+ *    layers. For example, a typical top-object is composed of the following
+ *    layers:
+ *
+ *     - vvp
+ *     - lov
+ *
+ *    whereas its sub-objects are composed of
+ *
+ *     - lovsub
+ *     - osc
+ *
+ *    layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
+ *    track of the object-subobject relationship.
+ *
+ *    Sub-objects are not cached independently: when the top-object is about to
+ *    be discarded from memory, all its sub-objects are torn down and
+ *    destroyed too.
+ *
+ * \see ccc_object, lov_object, lovsub_object, osc_object
+ */
+struct cl_object {
+       /** super class */
+       struct lu_object                   co_lu;
+       /** per-object-layer operations */
+       const struct cl_object_operations *co_ops;
+       /** offset of page slice in cl_page buffer */
+       int                                co_slice_off;
+};
+
+/**
+ * Description of the client object configuration. This is used for the
+ * creation of a new client object that is identified by a more state than
+ * fid.
+ */
+struct cl_object_conf {
+       /** Super-class. */
+       struct lu_object_conf     coc_lu;
+       union {
+               /**
+                * Object layout. This is consumed by lov.
+                */
+               struct lustre_md *coc_md;
+               /**
+                * Description of particular stripe location in the
+                * cluster. This is consumed by osc.
+                */
+               struct lov_oinfo *coc_oinfo;
+       } u;
+       /**
+        * VFS inode. This is consumed by vvp.
+        */
+       struct inode         *coc_inode;
+       /**
+        * Layout lock handle.
+        */
+       struct ldlm_lock         *coc_lock;
+       /**
+        * Operation to handle layout, OBJECT_CONF_XYZ.
+        */
+       int                       coc_opc;
+};
+
+enum {
+       /** configure layout, set up a new stripe, must be called while
+        * holding layout lock. */
+       OBJECT_CONF_SET = 0,
+       /** invalidate the current stripe configuration due to losing
+        * layout lock. */
+       OBJECT_CONF_INVALIDATE = 1,
+       /** wait for old layout to go away so that new layout can be
+        * set up. */
+       OBJECT_CONF_WAIT = 2
+};
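+
+/*
+ * Illustrative sketch (an assumption, not part of the original header):
+ * invalidating the current layout when the layout lock is being lost would
+ * fill a cl_object_conf with OBJECT_CONF_INVALIDATE and push it down the
+ * stack, e.g. via the cl_conf_set() entry point declared later in this
+ * header:
+ *
+ *     struct cl_object_conf conf = {
+ *             .coc_inode = inode,
+ *             .coc_opc   = OBJECT_CONF_INVALIDATE,
+ *     };
+ *
+ *     rc = cl_conf_set(env, obj, &conf);
+ */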
+
+/**
+ * Operations implemented for each cl object layer.
+ *
+ * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
+ */
+struct cl_object_operations {
+       /**
+        * Initialize page slice for this layer. Called top-to-bottom through
+        * every object layer when a new cl_page is instantiated. A layer
+        * keeping private per-page data, or requiring its own page operations
+        * vector, should allocate these data here and attach them to the page
+        * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
+        * sense). Optional.
+        *
+        * \retval NULL success.
+        *
+        * \retval ERR_PTR(errno) failure code.
+        *
+        * \retval valid-pointer pointer to already existing referenced page
+        *       to be used instead of newly created.
+        */
+       int  (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
+                               struct cl_page *page, struct page *vmpage);
+       /**
+        * Initialize lock slice for this layer. Called top-to-bottom through
+        * every object layer when a new cl_lock is instantiated. A layer
+        * keeping private per-lock data, or requiring its own lock operations
+        * vector, should allocate these data here and attach them to the lock
+        * by calling cl_lock_slice_add(). Mandatory.
+        */
+       int  (*coo_lock_init)(const struct lu_env *env,
+                             struct cl_object *obj, struct cl_lock *lock,
+                             const struct cl_io *io);
+       /**
+        * Initialize io state for a given layer.
+        *
+        * Called top-to-bottom once per io existence to initialize io
+        * state. If a layer wants to keep some state for this type of io, it
+        * has to embed a struct cl_io_slice in lu_env::le_ses and register the
+        * slice with cl_io_slice_add(). It is guaranteed that all threads
+        * participating in this io share the same session.
+        */
+       int  (*coo_io_init)(const struct lu_env *env,
+                           struct cl_object *obj, struct cl_io *io);
+       /**
+        * Fill portion of \a attr that this layer controls. This method is
+        * called top-to-bottom through all object layers.
+        *
+        * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+        *
+        * \return   0: to continue
+        * \return +ve: to stop iterating through layers (but 0 is returned
+        * from enclosing cl_object_attr_get())
+        * \return -ve: to signal error
+        */
+       int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
+                           struct cl_attr *attr);
+       /**
+        * Update attributes.
+        *
+        * \a valid is a bitmask composed from enum #cl_attr_valid, and
+        * indicating what attributes are to be set.
+        *
+        * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+        *
+        * \return the same convention as for
+        * cl_object_operations::coo_attr_get() is used.
+        */
+       int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
+                           const struct cl_attr *attr, unsigned valid);
+       /**
+        * Update object configuration. Called top-to-bottom to modify object
+        * configuration.
+        *
+        * XXX error conditions and handling.
+        */
+       int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
+                           const struct cl_object_conf *conf);
+       /**
+        * Glimpse ast. Executed when glimpse ast arrives for a lock on this
+        * object. Layers are supposed to fill parts of \a lvb that will be
+        * shipped to the glimpse originator as a glimpse result.
+        *
+        * \see ccc_object_glimpse(), lovsub_object_glimpse(),
+        * \see osc_object_glimpse()
+        */
+       int (*coo_glimpse)(const struct lu_env *env,
+                          const struct cl_object *obj, struct ost_lvb *lvb);
+};
+
+/**
+ * Extended header for client object.
+ */
+struct cl_object_header {
+       /** Standard lu_object_header. cl_object::co_lu::lo_header points
+        * here. */
+       struct lu_object_header  coh_lu;
+       /** \name locks
+        * \todo XXX move locks below to the separate cache-lines, they are
+        * mostly useless otherwise.
+        */
+       /** @{ */
+       /** Lock protecting page tree. */
+       spinlock_t               coh_page_guard;
+       /** Lock protecting lock list. */
+       spinlock_t               coh_lock_guard;
+       /** @} locks */
+       /** Radix tree of cl_page's, cached for this object. */
+       struct radix_tree_root   coh_tree;
+       /** # of pages in radix tree. */
+       unsigned long       coh_pages;
+       /** List of cl_lock's granted for this object. */
+       struct list_head               coh_locks;
+
+       /**
+        * Parent object. It is assumed that an object has a well-defined
+        * parent, but not a well-defined child (there may be multiple
+        * sub-objects, for the same top-object). cl_object_header::coh_parent
+        * field allows certain code to be written generically, without
+        * limiting possible cl_object layouts unduly.
+        */
+       struct cl_object_header *coh_parent;
+       /**
+        * Protects consistency between cl_attr of parent object and
+        * attributes of sub-objects, that the former is calculated ("merged")
+        * from.
+        *
+        * \todo XXX this can be read/write lock if needed.
+        */
+       spinlock_t               coh_attr_guard;
+       /**
+        * Size of cl_page + page slices
+        */
+       unsigned short           coh_page_bufsize;
+       /**
+        * Number of objects above this one: 0 for a top-object, 1 for its
+        * sub-object, etc.
+        */
+       unsigned char            coh_nesting;
+};
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer top-to-bottom to \a slice.
+ */
+#define cl_object_for_each(slice, obj)                               \
+       list_for_each_entry((slice),                                \
+                               &(obj)->co_lu.lo_header->loh_layers,    \
+                               co_lu.lo_linkage)
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer bottom-to-top to \a slice.
+ */
+#define cl_object_for_each_reverse(slice, obj)                        \
+       list_for_each_entry_reverse((slice),                         \
+                                       &(obj)->co_lu.lo_header->loh_layers, \
+                                       co_lu.lo_linkage)
+/** @} cl_object */
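+
+/*
+ * A minimal sketch (illustrative only) of how the iteration helpers above
+ * are meant to be used: walk the layers top-to-bottom and call a per-layer
+ * method until one of them fails, much as the generic cl_object code does
+ * for attribute methods.
+ *
+ *     struct cl_object *scan;
+ *     int result = 0;
+ *
+ *     cl_object_for_each(scan, obj) {
+ *             if (scan->co_ops->coo_attr_get != NULL) {
+ *                     result = scan->co_ops->coo_attr_get(env, scan, attr);
+ *                     if (result != 0)
+ *                             break;
+ *             }
+ *     }
+ */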
+
+#ifndef pgoff_t
+#define pgoff_t unsigned long
+#endif
+
+#define CL_PAGE_EOF ((pgoff_t)~0ull)
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+ *
+ * cl_page: represents a portion of a file, cached in the memory. All pages
+ *    of the given file are of the same size, and are kept in the radix tree
+ *    hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ *    of the top-level file object are first class cl_objects, they have their
+ *    own radix trees of pages and hence a page is implemented as a sequence of
+ *    struct cl_page's, linked into a doubly-linked list through
+ *    cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ *    corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with the VM page of the hosting environment (struct
+ *    page in the Linux kernel, for example). It is assumed that this
+ *    association is implemented by one of cl_page layers (top layer in the
+ *    current design) that
+ *
+ *     - intercepts per-VM-page call-backs made by the environment (e.g.,
+ *       memory pressure),
+ *
+ *     - translates state (page flag bits) and locking between lustre and
+ *       environment.
+ *
+ *    The association between cl_page and struct page is immutable and
+ *    established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ *    this io an exclusive access to this page w.r.t. other io attempts and
+ *    various events changing page state (such as transfer completion, or
+ *    eviction of the page from the memory). Note, that in general cl_io
+ *    cannot be identified with a particular thread, and page ownership is not
+ *    exactly equal to the current thread holding a lock on the page. Layer
+ *    implementing association between cl_page and struct page has to implement
+ *    ownership on top of available synchronization mechanisms.
+ *
+ *    While the lustre client maintains the notion of page ownership by an io,
+ *    the hosting MM/VM usually has its own page concurrency control
+ *    mechanisms. For example, in Linux, page access is synchronized by the
+ *    per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ *    takes care to acquire and release such locks as necessary around the
+ *    calls to the file system methods (->readpage(), ->prepare_write(),
+ *    ->commit_write(), etc.). This leads to the situation when there are two
+ *    different ways to own a page in the client:
+ *
+ *     - client code explicitly and voluntary owns the page (cl_page_own());
+ *
+ *     - VM locks a page and then calls the client, that has "to assume"
+ *       the ownership from the VM (cl_page_assume()).
+ *
+ *    Dual methods to release ownership are cl_page_disown() and
+ *    cl_page_unassume().
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When reference counter
+ *    drops to 0, the page is returned to the cache, unless it is in
+ *    cl_page_state::CPS_FREEING state, in which case it is immediately
+ *    destroyed.
+ *
+ *    The general logic guaranteeing the absence of "existential races" for
+ *    pages is the following:
+ *
+ *     - there are fixed known ways for a thread to obtain a new reference
+ *       to a page:
+ *
+ *         - by doing a lookup in the cl_object radix tree, protected by the
+ *           spin-lock;
+ *
+ *         - by starting from VM-locked struct page and following some
+ *           hosting environment method (e.g., following ->private pointer in
+ *           the case of Linux kernel), see cl_vmpage_page();
+ *
+ *     - when the page enters cl_page_state::CPS_FREEING state, all these
+ *       ways are severed with the proper synchronization
+ *       (cl_page_delete());
+ *
+ *     - entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ *       lock;
+ *
+ *     - no new references to the page in cl_page_state::CPS_FREEING state
+ *       are allowed (checked in cl_page_get()).
+ *
+ *    Together this guarantees that when the last reference to a
+ *    cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ *    page, as no new references to it can be acquired at that point and
+ *    none exist.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ *    cl_page_state. Possible state transitions are enumerated in
+ *    cl_page_state_set(). State transition process (i.e., actual changing of
+ *    cl_page::cp_state field) is protected by the lock on the underlying VM
+ *    page.
+ *
+ * Linux Kernel implementation.
+ *
+ *    Binding between cl_page and struct page is implemented in the vvp layer.
+ *    cl_page is attached to the
+ *    ->private pointer of the struct page, together with the setting of
+ *    PG_private bit in page->flags, and acquiring additional reference on the
+ *    struct page (much like struct buffer_head, or any similar file system
+ *    private data structures).
+ *
+ *    PG_locked lock is used to implement both ownership and transfer
+ *    synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ *    states. No additional references are acquired for the duration of the
+ *    transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ *       write-out is "protected" by the special PG_writeback bit.
+ */
+
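+/*
+ * Illustrative ownership sketch (an assumption, not part of the original
+ * header): an io that wants exclusive access to a cached page typically
+ * takes and releases ownership around its work, using the cl_page_own()
+ * and cl_page_disown() entry points referred to above:
+ *
+ *     if (cl_page_own(env, io, page) == 0) {
+ *             ... operate on the page while it is owned by this io ...
+ *             cl_page_disown(env, io, page);
+ *     }
+ */
+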
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, that hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+       /**
+        * Page is in the cache, un-owned. Page leaves cached state in the
+        * following cases:
+        *
+        *     - [cl_page_state::CPS_OWNED] io comes across the page and
+        *     owns it;
+        *
+        *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+        *     req-formation engine decides that it wants to include this page
+        *     into an cl_req being constructed, and yanks it from the cache;
+        *
+        *     - [cl_page_state::CPS_FREEING] VM callback is executed to
+        *     evict the page form the memory;
+        *
+        * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+        */
+       CPS_CACHED,
+       /**
+        * Page is exclusively owned by some cl_io. Page may end up in this
+        * state as a result of
+        *
+        *     - io creating new page and immediately owning it;
+        *
+        *     - [cl_page_state::CPS_CACHED] io finding existing cached page
+        *     and owning it;
+        *
+        *     - [cl_page_state::CPS_OWNED] io finding existing owned page
+        *     and waiting for owner to release the page;
+        *
+        * Page leaves owned state in the following cases:
+        *
+        *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
+        *     the cache, doing nothing;
+        *
+        *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+        *     this page;
+        *
+        *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+        *     transfer for this page;
+        *
+        *     - [cl_page_state::CPS_FREEING] io decides to destroy this
+        *     page (e.g., as part of truncate or extent lock cancellation).
+        *
+        * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+        */
+       CPS_OWNED,
+       /**
+        * Page is being written out, as a part of a transfer. This state is
+        * entered when req-formation logic decided that it wants this page to
+        * be sent through the wire _now_. Specifically, it means that once
+        * this state is achieved, transfer completion handler (with either
+        * success or failure indication) is guaranteed to be executed against
+        * this page independently of any locks and any scheduling decisions
+        * made by the hosting environment (that effectively means that the
+        * page is never put into cl_page_state::CPS_PAGEOUT state "in
+        * advance". This property is mentioned, because it is important when
+        * reasoning about possible dead-locks in the system). The page can
+        * enter this state as a result of
+        *
+        *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
+        *     write-out of this page, or
+        *
+        *     - [cl_page_state::CPS_CACHED] req-forming engine deciding
+        *     that it has enough dirty pages cached to issue a "good"
+        *     transfer.
+        *
+        * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+        * is completed---it is moved into cl_page_state::CPS_CACHED state.
+        *
+        * Underlying VM page is locked for the duration of transfer.
+        *
+        * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+        */
+       CPS_PAGEOUT,
+       /**
+        * Page is being read in, as a part of a transfer. This is quite
+        * similar to the cl_page_state::CPS_PAGEOUT state, except that
+        * read-in is always "immediate"---there is no such thing as a sudden
+        * construction of a read cl_req from cached, presumably not up to date,
+        * pages.
+        *
+        * Underlying VM page is locked for the duration of transfer.
+        *
+        * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+        */
+       CPS_PAGEIN,
+       /**
+        * Page is being destroyed. This state is entered when client decides
+        * that page has to be deleted from its host object, as, e.g., a part
+        * of truncate.
+        *
+        * Once this state is reached, there is no way to escape it.
+        *
+        * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+        */
+       CPS_FREEING,
+       CPS_NR
+};
+
+enum cl_page_type {
+       /** Host page, the page is from the host inode which the cl_page
+        * belongs to. */
+       CPT_CACHEABLE = 1,
+
+       /** Transient page: a transient cl_page is used to bind a cl_page
+        *  to a vmpage which does not belong to the same object as the cl_page.
+        *  It is used in DirectIO, lockless IO and liblustre. */
+       CPT_TRANSIENT,
+};
+
+/**
+ * Flags maintained for every cl_page.
+ */
+enum cl_page_flags {
+       /**
+        * Set when pagein completes. Used for debugging (read completes at
+        * most once for a page).
+        */
+       CPF_READ_COMPLETED = 1 << 0
+};
+
+/**
+ * Fields are protected by the lock on struct page, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+       /** Reference counter. */
+       atomic_t             cp_ref;
+       /** An object this page is a part of. Immutable after creation. */
+       struct cl_object        *cp_obj;
+       /** Logical page index within the object. Immutable after creation. */
+       pgoff_t           cp_index;
+       /** List of slices. Immutable after creation. */
+       struct list_head               cp_layers;
+       /** Parent page, NULL for top-level page. Immutable after creation. */
+       struct cl_page    *cp_parent;
+       /** Lower-layer page. NULL for bottommost page. Immutable after
+        * creation. */
+       struct cl_page    *cp_child;
+       /**
+        * Page state. This field is const to avoid accidental update, it is
+        * modified only internally within cl_page.c. Protected by a VM lock.
+        */
+       const enum cl_page_state cp_state;
+       /** Linkage of pages within group. Protected by cl_page::cp_mutex. */
+       struct list_head                cp_batch;
+       /** Mutex serializing membership of a page in a batch. */
+       struct mutex            cp_mutex;
+       /** Linkage of pages within cl_req. */
+       struct list_head               cp_flight;
+       /** Transfer error. */
+       int                   cp_error;
+
+       /**
+        * Page type. Only CPT_TRANSIENT is used so far. Immutable after
+        * creation.
+        */
+       enum cl_page_type       cp_type;
+
+       /**
+        * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+        * by sub-io. Protected by a VM lock.
+        */
+       struct cl_io        *cp_owner;
+       /**
+        * Debug information: the task owning the page.
+        */
+       task_t        *cp_task;
+       /**
+        * Owning IO request in cl_page_state::CPS_PAGEOUT and
+        * cl_page_state::CPS_PAGEIN states. This field is maintained only in
+        * the top-level pages. Protected by a VM lock.
+        */
+       struct cl_req      *cp_req;
+       /** List of references to this page, for debugging. */
+       struct lu_ref       cp_reference;
+       /** Link to an object, for debugging. */
+       struct lu_ref_link      *cp_obj_ref;
+       /** Link to a queue, for debugging. */
+       struct lu_ref_link      *cp_queue_ref;
+       /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
+       unsigned                 cp_flags;
+       /** Assigned if doing a sync_io */
+       struct cl_sync_io       *cp_sync_io;
+};
+
+/**
+ * Per-layer part of cl_page.
+ *
+ * \see ccc_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+       struct cl_page            *cpl_page;
+       /**
+        * Object slice corresponding to this page slice. Immutable after
+        * creation.
+        */
+       struct cl_object                *cpl_obj;
+       const struct cl_page_operations *cpl_ops;
+       /** Linkage into cl_page::cp_layers. Immutable after creation. */
+       struct list_head                       cpl_linkage;
+};
+
+/**
+ * Lock mode. For the client extent locks.
+ *
+ * \warning: cl_lock_mode_match() assumes particular ordering here.
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+       /**
+        * Mode of a lock that protects no data, and exists only as a
+        * placeholder. This is used for `glimpse' requests. A phantom lock
+        * might get promoted to real lock at some point.
+        */
+       CLM_PHANTOM,
+       CLM_READ,
+       CLM_WRITE,
+       CLM_GROUP
+};
+
+/**
+ * Requested transfer type.
+ * \ingroup cl_req
+ */
+enum cl_req_type {
+       CRT_READ,
+       CRT_WRITE,
+       CRT_NR
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of given \a io. Page is assumed to be owned by that io, except for
+ * the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+       /**
+        * cl_page<->struct page methods. Only one layer in the stack has to
+        * implement these. Current code assumes that this functionality is
+        * provided by the topmost layer, see cl_page_disown0() as an example.
+        */
+
+       /**
+        * \return the underlying VM page. Optional.
+        */
+       struct page *(*cpo_vmpage)(const struct lu_env *env,
+                                 const struct cl_page_slice *slice);
+       /**
+        * Called when \a io acquires this page into exclusive
+        * ownership. When this method returns, it is guaranteed that the page
+        * is not owned by any other io, and that no transfer is going on
+        * against it. Optional.
+        *
+        * \see cl_page_own()
+        * \see vvp_page_own(), lov_page_own()
+        */
+       int  (*cpo_own)(const struct lu_env *env,
+                       const struct cl_page_slice *slice,
+                       struct cl_io *io, int nonblock);
+       /** Called when ownership is yielded. Optional.
+        *
+        * \see cl_page_disown()
+        * \see vvp_page_disown()
+        */
+       void (*cpo_disown)(const struct lu_env *env,
+                          const struct cl_page_slice *slice, struct cl_io *io);
+       /**
+        * Called for a page that is already "owned" by \a io from the VM point of
+        * view. Optional.
+        *
+        * \see cl_page_assume()
+        * \see vvp_page_assume(), lov_page_assume()
+        */
+       void (*cpo_assume)(const struct lu_env *env,
+                          const struct cl_page_slice *slice, struct cl_io *io);
+       /** Dual to cl_page_operations::cpo_assume(). Optional. Called
+        * bottom-to-top when IO releases a page without actually unlocking
+        * it.
+        *
+        * \see cl_page_unassume()
+        * \see vvp_page_unassume()
+        */
+       void (*cpo_unassume)(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *io);
+       /**
+        * Announces whether the page contains valid data or not, as indicated
+        * by \a uptodate.
+        *
+        * \see cl_page_export()
+        * \see vvp_page_export()
+        */
+       void  (*cpo_export)(const struct lu_env *env,
+                           const struct cl_page_slice *slice, int uptodate);
+       /**
+        * Unmaps page from the user space (if it is mapped).
+        *
+        * \see cl_page_unmap()
+        * \see vvp_page_unmap()
+        */
+       int (*cpo_unmap)(const struct lu_env *env,
+                        const struct cl_page_slice *slice, struct cl_io *io);
+       /**
+        * Checks whether underlying VM page is locked (in the suitable
+        * sense). Used for assertions.
+        *
+        * \retval    -EBUSY: page is protected by a lock of a given mode;
+        * \retval  -ENODATA: page is not protected by a lock;
+        * \retval       0: this layer cannot decide. (Should never happen.)
+        */
+       int (*cpo_is_vmlocked)(const struct lu_env *env,
+                              const struct cl_page_slice *slice);
+       /**
+        * Page destruction.
+        */
+
+       /**
+        * Called when page is truncated from the object. Optional.
+        *
+        * \see cl_page_discard()
+        * \see vvp_page_discard(), osc_page_discard()
+        */
+       void (*cpo_discard)(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io);
+       /**
+        * Called when page is removed from the cache, and is about to be
+        * destroyed. Optional.
+        *
+        * \see cl_page_delete()
+        * \see vvp_page_delete(), osc_page_delete()
+        */
+       void (*cpo_delete)(const struct lu_env *env,
+                          const struct cl_page_slice *slice);
+       /** Destructor. Frees resources and slice itself. */
+       void (*cpo_fini)(const struct lu_env *env,
+                        struct cl_page_slice *slice);
+
+       /**
+        * Checks whether the page is protected by a cl_lock. This is a
+        * per-layer method, because certain layers have ways to check for the
+        * lock much more efficiently than through the generic locks scan, or
+        * implement locking mechanisms separate from cl_lock, e.g.,
+        * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
+        * being canceled, or scheduled for cancellation as soon as the last
+        * user goes away, too.
+        *
+        * \retval    -EBUSY: page is protected by a lock of a given mode;
+        * \retval  -ENODATA: page is not protected by a lock;
+        * \retval       0: this layer cannot decide.
+        *
+        * \see cl_page_is_under_lock()
+        */
+       int (*cpo_is_under_lock)(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                struct cl_io *io);
+
+       /**
+        * Optional debugging helper. Prints given page slice.
+        *
+        * \see cl_page_print()
+        */
+       int (*cpo_print)(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        void *cookie, lu_printer_t p);
+       /**
+        * \name transfer
+        *
+        * Transfer methods. See comment on cl_req for a description of
+        * transfer formation and life-cycle.
+        *
+        * @{
+        */
+       /**
+        * Request type dependent vector of operations.
+        *
+        * Transfer operations depend on transfer mode (cl_req_type). To avoid
+        * passing transfer mode to each and every of these methods, and to
+        * avoid branching on request type inside of the methods, separate
+        * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
+        * provided. That is, method invocation usually looks like
+        *
+        *       slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...);
+        */
+       struct {
+               /**
+                * Called when a page is submitted for a transfer as a part of
+                * cl_page_list.
+                *
+                * \return    0  : page is eligible for submission;
+                * \return    -EALREADY : skip this page;
+                * \return    -ve       : error.
+                *
+                * \see cl_page_prep()
+                */
+               int  (*cpo_prep)(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                struct cl_io *io);
+               /**
+                * Completion handler. This is guaranteed to be eventually
+                * fired after cl_page_operations::cpo_prep() or
+                * cl_page_operations::cpo_make_ready() call.
+                *
+                * This method can be called in a non-blocking context. It is
+                * guaranteed however, that the page involved and its object
+                * are pinned in memory (and, hence, calling cl_page_put() is
+                * safe).
+                *
+                * \see cl_page_completion()
+                */
+               void (*cpo_completion)(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      int ioret);
+               /**
+                * Called when cached page is about to be added to the
+                * cl_req as a part of req formation.
+                *
+                * \return    0       : proceed with this page;
+                * \return    -EAGAIN : skip this page;
+                * \return    -ve     : error.
+                *
+                * \see cl_page_make_ready()
+                */
+               int  (*cpo_make_ready)(const struct lu_env *env,
+                                      const struct cl_page_slice *slice);
+               /**
+                * Announce that this page is to be written out
+                * opportunistically, that is, page is dirty, it is not
+                * necessary to start write-out transfer right now, but
+                * eventually page has to be written out.
+                *
+                * Main caller of this is the write path (see
+                * vvp_io_commit_write()), using this method to build a
+                * "transfer cache" from which large transfers are then
+                * constructed by the req-formation engine.
+                *
+                * \todo XXX it would make sense to add page-age tracking
+                * semantics here, and to oblige the req-formation engine to
+                * send the page out before it becomes too old.
+                *
+                * \see cl_page_cache_add()
+                */
+               int  (*cpo_cache_add)(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *io);
+       } io[CRT_NR];
+       /**
+        * Tell transfer engine that only the [from, to] part of a page should be
+        * transmitted.
+        *
+        * This is used for immediate transfers.
+        *
+        * \todo XXX this is not a very good interface. It would be much better
+        * if all transfer parameters were supplied as arguments to
+        * cl_io_operations::cio_submit() call, but it is not clear how to do
+        * this for page queues.
+        *
+        * \see cl_page_clip()
+        */
+       void (*cpo_clip)(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        int from, int to);
+       /**
+        * \pre  the page was queued for transferring.
+        * \post page is removed from client's pending list, or -EBUSY
+        *       is returned if the transfer has already started.
+        *
+        * This is one of the few page operations which is:
+        * 0. called from the top level;
+        * 1. called without the vmpage locked;
+        * 2. required to synchronize execution of its ->cpo_cancel() with
+        *    completion handlers at every layer. Osc uses the client obd lock
+        *    for this purpose. Since there is no vvp_page_cancel() or
+        *    lov_page_cancel(), cpo_cancel() is de facto protected by the
+        *    client lock.
+        *
+        * \see osc_page_cancel().
+        */
+       int (*cpo_cancel)(const struct lu_env *env,
+                         const struct cl_page_slice *slice);
+       /**
+        * Write out a page on behalf of the kernel. This is only called by
+        * ll_writepage() right now.
+        *
+        * \see cl_page_flush()
+        */
+       int (*cpo_flush)(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        struct cl_io *io);
+       /** @} transfer */
+};
+
+/**
+ * Helper macro, dumping detailed information about \a page into a log.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...)                 \
+do {                                                               \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
+                                                                       \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
+               cl_page_print(env, &msgdata, lu_cdebug_printer, page);  \
+               CDEBUG(mask, format , ## __VA_ARGS__);            \
+       }                                                              \
+} while (0)
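+/*
+ * Illustrative (hypothetical) call site for CL_PAGE_DEBUG(); CL_PAGE_HEADER()
+ * below is used the same way. The env and page variables are assumed to be
+ * in scope at the caller:
+ *
+ *     CL_PAGE_DEBUG(D_PAGE, env, page, "unexpected page state\n");
+ */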
+
+/**
+ * Helper macro, dumping shorter information about \a page into a log.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...)                     \
+do {                                                                     \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                      \
+                                                                             \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                    \
+               cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
+               CDEBUG(mask, format , ## __VA_ARGS__);                  \
+       }                                                                    \
+} while (0)
+
+static inline int __page_in_use(const struct cl_page *page, int refc)
+{
+       if (page->cp_type == CPT_CACHEABLE)
+               ++refc;
+       LASSERT(atomic_read(&page->cp_ref) > 0);
+       return (atomic_read(&page->cp_ref) > refc);
+}
+#define cl_page_in_use(pg)       __page_in_use(pg, 1)
+#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ *     struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * All locks for a given object are linked into cl_object_header::coh_locks
+ * list (protected by cl_object_header::coh_lock_guard spin-lock) through
+ * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can
+ * sort it in starting lock offset, or use altogether different data structure
+ * like a tree.
+ *
+ * Typical cl_lock consists of the two layers:
+ *
+ *     - vvp_lock (vvp specific data), and
+ *     - lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ *     - lovsub_lock, and
+ *     - osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing stripe
+ * sub-object or the file to which the top-level cl_lock is associated), and is
+ * linked into that cl_object::coh_locks. In this respect cl_lock is similar to
+ * cl_object (that at lov layer also fans out into multiple sub-objects), and
+ * is different from cl_page, that doesn't fan out (there is usually exactly
+ * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
+ * a "top-lock" and its lovsub-osc portion a "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is reference counted. When reference counter drops to 0, lock is
+ * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING
+ * lock is destroyed when last reference is released. Referencing between
+ * top-lock and its sub-locks is described in the lov documentation module.
+ *
+ * STATE MACHINE
+ *
+ * Also, cl_lock is a state machine. This requires some clarification. One of
+ * the goals of client IO re-write was to make IO path non-blocking, or at
+ * least to make it easier to make it non-blocking in the future. Here
+ * `non-blocking' means that when a system call (read, write, truncate)
+ * reaches a situation where it has to wait for a communication with the
+ * server, it should --instead of waiting-- remember its current state and
+ * switch to some other work.  E.g., instead of waiting for a lock enqueue,
+ * client should proceed doing IO on the next stripe, etc. Obviously this is a
+ * rather radical redesign, and it is not planned to be fully implemented at
+ * this time; instead we are putting some infrastructure in place that would
+ * make it easier to do asynchronous non-blocking IO in the
+ * future. Specifically, where old locking code goes to sleep (waiting for
+ * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When
+ * enqueue reply comes, its completion handler signals that lock state-machine
+ * is ready to transit to the next state. There is some generic code in
+ * cl_lock.c that sleeps, waiting for these signals. As a result, for users of
+ * this cl_lock.c code, it looks like locking is done in normal blocking
+ * fashion, and at the same time it is possible to switch to the non-blocking
+ * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c
+ * functions).
+ *
+ * For a description of state machine states and transitions see enum
+ * cl_lock_state.
+ *
+ * There are two ways to restrict a set of states which lock might move to:
+ *
+ *     - placing a "hold" on a lock guarantees that lock will not be moved
+ *       into cl_lock_state::CLS_FREEING state until hold is released. Hold
+ *       can be only acquired on a lock that is not in
+ *       cl_lock_state::CLS_FREEING. All holds on a lock are counted in
+ *       cl_lock::cll_holds. Hold protects lock from cancellation and
+ *       destruction. Requests to cancel and destroy a lock on hold will be
+ *       recorded, but only honored when last hold on a lock is released;
+ *
+ *     - placing a "user" on a lock guarantees that lock will not leave
+ *       cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of
+ *       states, once it enters this set. That is, if a user is added onto a
+ *       lock in a state not from this set, it doesn't immediately enforce
+ *       lock to move to this set, but once lock enters this set it will
+ *       remain there until all users are removed. Lock users are counted in
+ *       cl_lock::cll_users.
+ *
+ *       User is used to assure that lock is not canceled or destroyed while
+ *       it is being enqueued, or actively used by some IO.
+ *
+ *       Currently, a user always comes with a hold (cl_lock_invariant()
+ *       checks that a number of holds is not less than a number of users).
+ *
+ * CONCURRENCY
+ *
+ * This is how lock state-machine operates. struct cl_lock contains a mutex
+ * cl_lock::cll_guard that protects struct fields.
+ *
+ *     - mutex is taken, and cl_lock::cll_state is examined.
+ *
+ *     - for every state there are possible target states where lock can move
+ *       into. They are tried in order. Attempts to move into next state are
+ *       done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try().
+ *
+ *     - if the transition can be performed immediately, state is changed,
+ *       and mutex is released.
+ *
+ *     - if the transition requires blocking, _try() function returns
+ *       cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to
+ *       sleep, waiting for possibility of lock state change. It is woken
+ *       up when some event occurs, that makes lock state change possible
+ *       (e.g., the reception of the reply from the server), and repeats
+ *       the loop.
+ *
+ * Top-lock and sub-lock have separate mutexes, and the latter has to be taken
+ * first to avoid dead-lock.
+ *
+ * To see an example of interaction of all these issues, take a look at the
+ * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
+ * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
+ * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
+ * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
+ * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
+ * done in parallel, rather than one after another (this is used for glimpse
+ * locks, that cannot dead-lock).
+ *
+ * INTERFACE AND USAGE
+ *
+ * struct cl_lock_operations provide a number of call-backs that are invoked
+ * when events of interest occur. Layers can intercept and handle glimpse,
+ * blocking, cancel ASTs and a reception of the reply from the server.
+ *
+ * One important difference with the old client locking model is that new
+ * client has a representation for the top-lock, whereas in the old code only
+ * sub-locks existed as real data structures and file-level locks are
+ * represented by "request sets" that are created and destroyed on each and
+ * every lock creation.
+ *
+ * Top-locks are cached, and can be found in the cache by the system calls. It
+ * is possible that top-lock is in cache, but some of its sub-locks were
+ * canceled and destroyed. In that case top-lock has to be enqueued again
+ * before it can be used.
+ *
+ * Overall process of the locking during IO operation is as following:
+ *
+ *     - once parameters for IO are set up in cl_io, cl_io_operations::cio_lock()
+ *       is called on each layer. Responsibility of this method is to add locks,
+ *       needed by a given layer into cl_io.ci_lockset.
+ *
+ *     - once locks for all layers were collected, they are sorted to avoid
+ *       dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ *     - when all locks are acquired, IO is performed;
+ *
+ *     - locks are released into cache.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this introduces
+ * inter-server dependency and can lead to cascading evictions.
+ *
+ * Basic solution is to sub-divide large read/write IOs into smaller pieces so
+ * that no multi-stripe locks are taken (note that this design abandons POSIX
+ * read/write semantics). Such pieces ideally can be executed concurrently. At
+ * the same time, certain types of IO cannot be sub-divided without
+ * sacrificing correctness. This includes:
+ *
+ *  - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ *  atomicity;
+ *
+ *  - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
+ * has to be held together with the usual lock on [offset, offset + count].
+ *
+ * As multi-stripe locks have to be allowed, it makes sense to cache them, so
+ * that, for example, a sequence of O_APPEND writes can proceed quickly
+ * without going down to the individual stripes to do lock matching. On the
+ * other hand, multi-stripe locks shouldn't be used by normal read/write
+ * calls. To achieve this, every layer can implement ->clo_fits_into() method,
+ * that is called by lock matching code (cl_lock_lookup()), and that can be
+ * used to selectively disable matching of certain locks for certain IOs. For
+ * example, lov layer implements lov_lock_fits_into() that allows multi-stripe
+ * locks to be matched only for truncates and O_APPEND writes.
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in osc layer, that also matches DLM events (ASTs, cancellation,
+ * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed
+ * description of interaction with DLM.
+ */
+
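+/*
+ * Illustrative sketch of the hold/user discipline described above. This is
+ * not an actual call site; the helper names (cl_lock_hold_add(),
+ * cl_lock_unhold(), cl_lock_user_add(), cl_lock_user_del()) are assumed from
+ * the cl_lock API and the caller is assumed to follow the usual mutex rules:
+ *
+ *     cl_lock_hold_add(env, lock, scope, source);   pin against CLS_FREEING
+ *     cl_lock_user_add(env, lock);                  keep in the "used" states
+ *     ...perform IO under the lock...
+ *     cl_lock_user_del(env, lock);
+ *     cl_lock_unhold(env, lock, scope, source);
+ */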
+/**
+ * Lock description.
+ */
+struct cl_lock_descr {
+       /** Object this lock is granted for. */
+       struct cl_object *cld_obj;
+       /** Index of the first page protected by this lock. */
+       pgoff_t    cld_start;
+       /** Index of the last page (inclusive) protected by this lock. */
+       pgoff_t    cld_end;
+       /** Group ID, for group lock */
+       __u64        cld_gid;
+       /** Lock mode. */
+       enum cl_lock_mode cld_mode;
+       /**
+        * flags to enqueue lock. A combination of bit-flags from
+        * enum cl_enq_flags.
+        */
+       __u32        cld_enq_flags;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]"
+#define PDESCR(descr)                                             \
+       cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,        \
+       (descr)->cld_start, (descr)->cld_end
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode);
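+/*
+ * Hypothetical usage sketch (not an actual call site): DDESCR and PDESCR are
+ * meant to be used together in a format string, e.g.
+ *
+ *     CDEBUG(D_DLMTRACE, "lock: " DDESCR "\n", PDESCR(&lock->cll_descr));
+ */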
+
+/**
+ * Lock state-machine states.
+ *
+ * \htmlonly
+ * <pre>
+ *
+ * Possible state transitions:
+ *
+ *           +------------------>NEW
+ *           |             |
+ *           |             | cl_enqueue_try()
+ *           |             |
+ *           |    cl_unuse_try()  V
+ *           |  +--------------QUEUING (*)
+ *           |  |               |
+ *           |  |               | cl_enqueue_try()
+ *           |  |               |
+ *           |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |            |
+ *           |  |               | cl_wait_try()
+ *           |  |               |
+ *           |  |              (R)
+ *           |  |               |
+ *           |  |               V
+ *           |  |              HELD<---------+
+ *           |  |               |          |
+ *           |  |               |          | cl_use_try()
+ *           |  |  cl_unuse_try() |        |
+ *           |  |               |          |
+ *           |  |               V       ---+
+ *           |  +------------>INTRANSIT (D) <--+
+ *           |             |       |
+ *           |     cl_unuse_try() |        | cached lock found
+ *           |             |       | cl_use_try()
+ *           |             |       |
+ *           |             V       |
+ *           +------------------CACHED---------+
+ *                                |
+ *                               (C)
+ *                                |
+ *                                V
+ *                             FREEING
+ *
+ * Legend:
+ *
+ *      In states marked with (*) transition to the same state (i.e., a loop
+ *      in the diagram) is possible.
+ *
+ *      (R) is the point where Receive call-back is invoked: it allows layers
+ *      to handle arrival of lock reply.
+ *
+ *      (C) is the point where Cancellation call-back is invoked.
+ *
+ *      (D) is the transit state which means the lock is changing.
+ *
+ *      Transition to FREEING state is possible from any other state in the
+ *      diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly
+ *
+ * These states are for individual cl_lock object. Top-lock and its sub-locks
+ * can be in different states. Another way to say this is that we have
+ * nested state-machines.
+ *
+ * Separate QUEUING and ENQUEUED states are needed to support non-blocking
+ * operation for locks with multiple sub-locks. Imagine lock on a file F, that
+ * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send
+ * enqueue to S0, wait for its completion, then send enqueue for S1, wait for
+ * its completion and at last enqueue lock for S2, and wait for its
+ * completion. In that case, top-lock is in QUEUING state while S0, S1 are
+ * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note
+ * that in this case, sub-locks move from state to state, and top-lock remains
+ * in the same state).
+ */
+enum cl_lock_state {
+       /**
+        * Lock that wasn't yet enqueued
+        */
+       CLS_NEW,
+       /**
+        * Enqueue is in progress, blocking for some intermediate interaction
+        * with the other side.
+        */
+       CLS_QUEUING,
+       /**
+        * Lock is fully enqueued, waiting for server to reply when it is
+        * granted.
+        */
+       CLS_ENQUEUED,
+       /**
+        * Lock granted, actively used by some IO.
+        */
+       CLS_HELD,
+       /**
+        * This state is used to mark that the lock is in transition between
+        * being used and unused. We need this state because the lock may have
+        * several sublocks, so there is no atomic way to bring all sublocks
+        * into CLS_HELD state on use, or all sublocks into CLS_CACHED state
+        * on unuse.
+        * If a thread is referring to a lock, and it sees the lock is in this
+        * state, it must wait for the lock.
+        * See state diagram for details.
+        */
+       CLS_INTRANSIT,
+       /**
+        * Lock granted, not used.
+        */
+       CLS_CACHED,
+       /**
+        * Lock is being destroyed.
+        */
+       CLS_FREEING,
+       CLS_NR
+};
+
+enum cl_lock_flags {
+       /**
+        * lock has been cancelled. This flag is never cleared once set (by
+        * cl_lock_cancel0()).
+        */
+       CLF_CANCELLED  = 1 << 0,
+       /** cancellation is pending for this lock. */
+       CLF_CANCELPEND = 1 << 1,
+       /** destruction is pending for this lock. */
+       CLF_DOOMED     = 1 << 2,
+       /** from enqueue RPC reply upcall. */
+       CLF_FROM_UPCALL = 1 << 3,
+};
+
+/**
+ * Lock closure.
+ *
+ * Lock closure is a collection of locks (both top-locks and sub-locks) that
+ * might be updated in a result of an operation on a certain lock (which lock
+ * this is a closure of).
+ *
+ * Closures are needed to guarantee dead-lock freedom in the presence of
+ *
+ *     - nested state-machines (top-lock state-machine composed of sub-lock
+ *       state-machines), and
+ *
+ *     - shared sub-locks.
+ *
+ * Specifically, many operations, such as lock enqueue, wait, unlock,
+ * etc. start from a top-lock, and then operate on a sub-locks of this
+ * top-lock, holding a top-lock mutex. When sub-lock state changes as a result
+ * of such operation, this change has to be propagated to all top-locks that
+ * share this sub-lock. Obviously, no natural lock ordering (e.g.,
+ * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has
+ * to be used. Lock closure systematizes this try-and-repeat logic.
+ */
+struct cl_lock_closure {
+       /**
+        * Lock that is mutexed when closure construction is started. When
+        * closure is in `wait' mode (cl_lock_closure::clc_wait), mutex on
+        * origin is released before waiting.
+        */
+       struct cl_lock   *clc_origin;
+       /**
+        * List of enclosed locks, so far. Locks are linked here through
+        * cl_lock::cll_inclosure.
+        */
+       struct list_head        clc_list;
+       /**
+        * True iff closure is in a `wait' mode. This determines what
+        * cl_lock_enclosure() does when a lock L to be added to the closure
+        * is currently mutexed by some other thread.
+        *
+        * If cl_lock_closure::clc_wait is not set, then closure construction
+        * fails with CLO_REPEAT immediately.
+        *
+        * In wait mode, cl_lock_enclosure() waits until next attempt to build
+        * a closure might succeed. To this end it releases an origin mutex
+        * (cl_lock_closure::clc_origin), that has to be the only lock mutex
+        * owned by the current thread, and then waits on L mutex (by grabbing
+        * it and immediately releasing), before returning CLO_REPEAT to the
+        * caller.
+        */
+       int            clc_wait;
+       /** Number of locks in the closure. */
+       int            clc_nr;
+};
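+/*
+ * Hedged sketch of the try-and-repeat logic described above. The helper names
+ * (cl_lock_closure_init(), cl_lock_enclosure(), cl_lock_disclosure()) are
+ * assumed from the cl_lock API; on CLO_REPEAT the caller is expected to drop
+ * its mutexes and retry the whole operation:
+ *
+ *     cl_lock_closure_init(env, closure, origin, wait);
+ *     rc = cl_lock_enclosure(env, lock, closure);
+ *     if (rc == CLO_REPEAT)
+ *             ...release mutexes and retry...
+ *     cl_lock_disclosure(env, closure);
+ */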
+
+/**
+ * Layered client lock.
+ */
+struct cl_lock {
+       /** Reference counter. */
+       atomic_t          cll_ref;
+       /** List of slices. Immutable after creation. */
+       struct list_head            cll_layers;
+       /**
+        * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
+        * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
+        */
+       struct list_head            cll_linkage;
+       /**
+        * Parameters of this lock. Protected by
+        * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
+        * cl_lock::cll_guard. Modified only on lock creation and in
+        * cl_lock_modify().
+        */
+       struct cl_lock_descr  cll_descr;
+       /** Protected by cl_lock::cll_guard. */
+       enum cl_lock_state    cll_state;
+       /** signals state changes. */
+       wait_queue_head_t          cll_wq;
+       /**
+        * Recursive lock, most fields in cl_lock{} are protected by this.
+        *
+        * Locking rules: this mutex is never held across network
+        * communication, except when lock is being canceled.
+        *
+        * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
+        * on a top-lock. Other direction is implemented through a
+        * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
+        * by try-locking.
+        *
+        * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
+        */
+       struct mutex            cll_guard;
+       task_t     *cll_guarder;
+       int                cll_depth;
+
+       /**
+        * the owner for INTRANSIT state
+        */
+       task_t     *cll_intransit_owner;
+       int                cll_error;
+       /**
+        * Number of holds on a lock. A hold prevents a lock from being
+        * canceled and destroyed. Protected by cl_lock::cll_guard.
+        *
+        * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
+        */
+       int                cll_holds;
+        /**
+         * Number of lock users. Valid in cl_lock_state::CLS_HELD state
+         * only. Lock user pins lock in CLS_HELD state. Protected by
+         * cl_lock::cll_guard.
+         *
+         * \see cl_wait(), cl_unuse().
+         */
+       int                cll_users;
+       /**
+        * Flag bit-mask. Values from enum cl_lock_flags. Updates are
+        * protected by cl_lock::cll_guard.
+        */
+       unsigned long    cll_flags;
+       /**
+        * A linkage into a list of locks in a closure.
+        *
+        * \see cl_lock_closure
+        */
+       struct list_head            cll_inclosure;
+       /**
+        * Conflict lock at queuing time.
+        */
+       struct cl_lock       *cll_conflict;
+       /**
+        * A list of references to this lock, for debugging.
+        */
+       struct lu_ref    cll_reference;
+       /**
+        * A list of holds on this lock, for debugging.
+        */
+       struct lu_ref    cll_holders;
+       /**
+        * A reference for cl_lock::cll_descr::cld_obj. For debugging.
+        */
+       struct lu_ref_link   *cll_obj_ref;
+#ifdef CONFIG_LOCKDEP
+       /* "dep_map" name is assumed by lockdep.h macros. */
+       struct lockdep_map    dep_map;
+#endif
+};
+
+/**
+ * Per-layer part of cl_lock
+ *
+ * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
+ */
+struct cl_lock_slice {
+       struct cl_lock            *cls_lock;
+       /** Object slice corresponding to this lock slice. Immutable after
+        * creation. */
+       struct cl_object                *cls_obj;
+       const struct cl_lock_operations *cls_ops;
+       /** Linkage into cl_lock::cll_layers. Immutable after creation. */
+       struct list_head                       cls_linkage;
+};
+
+/**
+ * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}().
+ *
+ * NOTE: lov_subresult() depends on ordering here.
+ */
+enum cl_lock_transition {
+       /** operation cannot be completed immediately. Wait for state change. */
+       CLO_WAIT        = 1,
+       /** operation had to release lock mutex, restart. */
+       CLO_REPEAT      = 2,
+       /** lower layer re-enqueued. */
+       CLO_REENQUEUED  = 3,
+};
+
+/**
+ *
+ * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+       /**
+        * \name statemachine
+        *
+        * State machine transitions. These 3 methods are called to transfer
+        * lock from one state to another, as described in the commentary
+        * above enum #cl_lock_state.
+        *
+        * \retval 0      this layer has nothing more to do before
+        *                     transition to the target state happens;
+        *
+        * \retval CLO_REPEAT method had to release and re-acquire cl_lock
+        *                  mutex, repeat invocation of transition method
+        *                  across all layers;
+        *
+        * \retval CLO_WAIT   this layer cannot move to the target state
+        *                  immediately, as it has to wait for certain event
+        *                  (e.g., the communication with the server). It
+        *                  is guaranteed, that when the state transfer
+        *                  becomes possible, cl_lock::cll_wq wait-queue
+        *                  is signaled. Caller can wait for this event by
+        *                  calling cl_lock_state_wait();
+        *
+        * \retval -ve  failure, abort state transition, move the lock
+        *                  into cl_lock_state::CLS_FREEING state, and set
+        *                  cl_lock::cll_error.
+        *
+        * Once all layers voted to agree to transition (by returning 0), lock
+        * is moved into corresponding target state. All state transition
+        * methods are optional.
+        */
+       /** @{ */
+       /**
+        * Attempts to enqueue the lock. Called top-to-bottom.
+        *
+        * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
+        * \see osc_lock_enqueue()
+        */
+       int  (*clo_enqueue)(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_io *io, __u32 enqflags);
+       /**
+        * Attempts to wait for enqueue result. Called top-to-bottom.
+        *
+        * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
+        */
+       int  (*clo_wait)(const struct lu_env *env,
+                        const struct cl_lock_slice *slice);
+       /**
+        * Attempts to unlock the lock. Called bottom-to-top. In addition to
+        * usual return values of lock state-machine methods, this can return
+        * -ESTALE to indicate that lock cannot be returned to the cache, and
+        * has to be re-initialized.
+        * unuse is a one-shot operation, so it must NOT return CLO_WAIT.
+        *
+        * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse()
+        */
+       int  (*clo_unuse)(const struct lu_env *env,
+                         const struct cl_lock_slice *slice);
+       /**
+        * Notifies layer that a cached lock has started being used.
+        *
+        * \pre lock->cll_state == CLS_CACHED
+        *
+        * \see lov_lock_use(), osc_lock_use()
+        */
+       int  (*clo_use)(const struct lu_env *env,
+                       const struct cl_lock_slice *slice);
+       /** @} statemachine */
+       /**
+        * A method invoked when lock state is changed (as a result of state
+        * transition). This is used, for example, to track when the state of
+        * a sub-lock changes, to propagate this change to the corresponding
+        * top-lock. Optional
+        *
+        * \see lovsub_lock_state()
+        */
+       void (*clo_state)(const struct lu_env *env,
+                         const struct cl_lock_slice *slice,
+                         enum cl_lock_state st);
+       /**
+        * Returns true iff the given lock is suitable for the given io, the
+        * idea being that there are certain "unsafe" locks, e.g., ones acquired
+        * for O_APPEND writes, that we don't want to re-use for a normal
+        * write, to avoid the danger of cascading evictions. Optional. Runs
+        * under cl_object_header::coh_lock_guard.
+        *
+        * XXX this should take more information about lock needed by
+        * io. Probably lock description or something similar.
+        *
+        * \see lov_fits_into()
+        */
+       int (*clo_fits_into)(const struct lu_env *env,
+                            const struct cl_lock_slice *slice,
+                            const struct cl_lock_descr *need,
+                            const struct cl_io *io);
+       /**
+        * \name ast
+        * Asynchronous System Traps. All of them are optional, all are
+        * executed bottom-to-top.
+        */
+       /** @{ */
+
+       /**
+        * Cancellation callback. Cancel a lock voluntarily, or under
+        * the request of server.
+        */
+       void (*clo_cancel)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice);
+       /**
+        * Lock weighting ast. Executed to estimate how precious this lock
+        * is. The sum of results across all layers is used to determine
+        * whether the lock is worth keeping in cache given present memory usage.
+        *
+        * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
+        */
+       unsigned long (*clo_weigh)(const struct lu_env *env,
+                                  const struct cl_lock_slice *slice);
+       /** @} ast */
+
+       /**
+        * \see lovsub_lock_closure()
+        */
+       int (*clo_closure)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice,
+                          struct cl_lock_closure *closure);
+       /**
+        * Executed bottom-to-top when lock description changes (e.g., as a
+        * result of server granting more generous lock than was requested).
+        *
+        * \see lovsub_lock_modify()
+        */
+       int (*clo_modify)(const struct lu_env *env,
+                         const struct cl_lock_slice *slice,
+                         const struct cl_lock_descr *updated);
+       /**
+        * Notifies layers (bottom-to-top) that lock is going to be
+        * destroyed. Responsibility of layers is to prevent new references on
+        * this lock from being acquired once this method returns.
+        *
+        * This can be called multiple times due to races.
+        *
+        * \see cl_lock_delete()
+        * \see osc_lock_delete(), lovsub_lock_delete()
+        */
+       void (*clo_delete)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice);
+       /**
+        * Destructor. Frees resources and the slice.
+        *
+        * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
+        * \see osc_lock_fini()
+        */
+       void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
+       /**
+        * Optional debugging helper. Prints given lock slice.
+        */
+       int (*clo_print)(const struct lu_env *env,
+                        void *cookie, lu_printer_t p,
+                        const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...)                 \
+do {                                                               \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
+                                                                       \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
+               cl_lock_print(env, &msgdata, lu_cdebug_printer, lock);  \
+               CDEBUG(mask, format , ## __VA_ARGS__);            \
+       }                                                              \
+} while (0)
+
+#define CL_LOCK_ASSERT(expr, env, lock) do {                       \
+       if (likely(expr))                                              \
+               break;                                            \
+                                                                       \
+       CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr);    \
+       LBUG();                                                  \
+} while (0)
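+/*
+ * Hypothetical example of the assertion helper above (cl_lock_is_mutexed() is
+ * assumed to be available to the caller):
+ *
+ *     CL_LOCK_ASSERT(cl_lock_is_mutexed(lock), env, lock);
+ */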
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations on
+ * pages:
+ *
+ *     - submit pages for an immediate transfer,
+ *
+ *     - own pages on behalf of certain io (waiting for each page in turn),
+ *
+ *     - discard pages.
+ *
+ * When list is finalized, it releases references on all pages it still has.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+       unsigned             pl_nr;
+       struct list_head           pl_pages;
+       task_t    *pl_owner;
+};
+
+/**
+ * A 2-queue of pages. A convenience data-type for common use case, 2-queue
+ * contains an incoming page list and an outgoing page list.
+ */
+struct cl_2queue {
+       struct cl_page_list c2_qin;
+       struct cl_page_list c2_qout;
+};
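+/*
+ * Hedged sketch of how the 2-queue is typically driven (helper names such as
+ * cl_2queue_init(), cl_2queue_add() and cl_io_submit_rw() are assumed from
+ * the cl_page_list/cl_io API): pages are collected on c2_qin, submitted, and
+ * the successfully submitted ones end up on c2_qout.
+ *
+ *     cl_2queue_init(queue);
+ *     cl_2queue_add(queue, page);
+ *     rc = cl_io_submit_rw(env, io, CRT_WRITE, queue);
+ *     ...
+ *     cl_2queue_fini(env, queue);
+ */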
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock} but with one
+ * important distinction. We want to minimize number of calls to the allocator
+ * in the fast path, e.g., in the case of read(2) when everything is cached:
+ * client already owns the lock over region being read, and data are cached
+ * due to read-ahead. To avoid allocation of cl_io layers in such situations,
+ * per-layer io state is stored in the session, associated with the io, see
+ * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized
+ * by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in enum
+ * cl_io_type.
+ *
+ * cl_io is a state machine, that can be advanced concurrently by multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when io is done, and its state can be safely
+ * released.
+ *
+ * For read/write io overall execution plan is as following:
+ *
+ *     (0) initialize io state through all layers;
+ *
+ *     (1) loop: prepare chunk of work to do
+ *
+ *     (2) call all layers to collect locks they need to process current chunk
+ *
+ *     (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ *     (4) process the chunk: call per-page methods
+ *      (cl_io_operations::cio_read_page() for read,
+ *      cl_io_operations::cio_prepare_write(),
+ *      cl_io_operations::cio_commit_write() for write)
+ *
+ *     (5) release locks
+ *
+ *     (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to
+ * address allocation efficiency issues mentioned above), and returns with the
+ * special error condition from per-page method when current sub-io has to
+ * block. This causes io loop to be repeated, and lov switches to the next
+ * sub-io in its cl_io_operations::cio_iter_init() implementation.
+ */
+
+/** IO types */
+enum cl_io_type {
+       /** read system call */
+       CIT_READ,
+       /** write system call */
+       CIT_WRITE,
+       /** truncate, utime system calls */
+       CIT_SETATTR,
+       /**
+        * page fault handling
+        */
+       CIT_FAULT,
+       /**
+        * fsync system call handling
+        * To write out a range of file
+        */
+       CIT_FSYNC,
+       /**
+        * Miscellaneous io. This is used for occasional io activity that
+        * doesn't fit into other types. Currently this is used for:
+        *
+        *     - cancellation of an extent lock. This io exists as a context
+        *     to write dirty pages from under the lock being canceled back
+        *     to the server;
+        *
+        *     - VM induced page write-out. An io context for writing page out
+        *     for memory cleansing;
+        *
+        *     - glimpse. An io context to acquire glimpse lock.
+        *
+        *     - grouplock. An io context to acquire group lock.
+        *
+        * CIT_MISC io is used simply as a context in which locks and pages
+        * are manipulated. Such io has no internal "process", that is,
+        * cl_io_loop() is never called for it.
+        */
+       CIT_MISC,
+       CIT_OP_NR
+};
+
+/**
+ * States of cl_io state machine
+ */
+enum cl_io_state {
+       /** Not initialized. */
+       CIS_ZERO,
+       /** Initialized. */
+       CIS_INIT,
+       /** IO iteration started. */
+       CIS_IT_STARTED,
+       /** Locks taken. */
+       CIS_LOCKED,
+       /** Actual IO is in progress. */
+       CIS_IO_GOING,
+       /** IO for the current iteration finished. */
+       CIS_IO_FINISHED,
+       /** Locks released. */
+       CIS_UNLOCKED,
+       /** Iteration completed. */
+       CIS_IT_ENDED,
+       /** cl_io finalized. */
+       CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * \see vvp_io, lov_io, osc_io, ccc_io
+ */
+struct cl_io_slice {
+       struct cl_io              *cis_io;
+       /** corresponding object slice. Immutable after creation. */
+       struct cl_object              *cis_obj;
+       /** io operations. Immutable after creation. */
+       const struct cl_io_operations *cis_iop;
+       /**
+        * linkage into a list of all slices for a given cl_io, hanging off
+        * cl_io::ci_layers. Immutable after creation.
+        */
+       struct list_head                     cis_linkage;
+};
+
+
+/**
+ * Per-layer io operations.
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+       /**
+        * Vector of io state transition methods for every io type.
+        *
+        * \see cl_page_operations::io
+        */
+       struct {
+               /**
+                * Prepare io iteration at a given layer.
+                *
+                * Called top-to-bottom at the beginning of each iteration of
+                * "io loop" (if it makes sense for this type of io). Here
+                * layer selects what work it will do during this iteration.
+                *
+                * \see cl_io_operations::cio_iter_fini()
+                */
+               int (*cio_iter_init) (const struct lu_env *env,
+                                     const struct cl_io_slice *slice);
+               /**
+                * Finalize io iteration.
+                *
+                * Called bottom-to-top at the end of each iteration of "io
+                * loop". Here layers can decide whether IO has to be
+                * continued.
+                *
+                * \see cl_io_operations::cio_iter_init()
+                */
+               void (*cio_iter_fini) (const struct lu_env *env,
+                                      const struct cl_io_slice *slice);
+               /**
+                * Collect locks for the current iteration of io.
+                *
+                * Called top-to-bottom to collect all locks necessary for
+                * this iteration. This method shouldn't actually enqueue
+                * anything, instead it should post a lock through
+                * cl_io_lock_add(). Once all locks are collected, they are
+                * sorted and enqueued in the proper order.
+                */
+               int  (*cio_lock) (const struct lu_env *env,
+                                 const struct cl_io_slice *slice);
+               /**
+                * Finalize unlocking.
+                *
+                * Called bottom-to-top to finish layer specific unlocking
+                * functionality, after generic code released all locks
+                * acquired by cl_io_operations::cio_lock().
+                */
+               void  (*cio_unlock)(const struct lu_env *env,
+                                   const struct cl_io_slice *slice);
+               /**
+                * Start io iteration.
+                *
+                * Once all locks are acquired, called top-to-bottom to
+                * commence actual IO. In the current implementation,
+                * top-level vvp_io_{read,write}_start() does all the work
+                * synchronously by calling generic_file_*(), so other layers
+                * are called when everything is done.
+                */
+               int  (*cio_start)(const struct lu_env *env,
+                                 const struct cl_io_slice *slice);
+               /**
+                * Called top-to-bottom at the end of io loop. Here layer
+                * might wait for an unfinished asynchronous io.
+                */
+               void (*cio_end)  (const struct lu_env *env,
+                                 const struct cl_io_slice *slice);
+               /**
+                * Called bottom-to-top to notify layers that read/write IO
+                * iteration finished, with \a nob bytes transferred.
+                */
+               void (*cio_advance)(const struct lu_env *env,
+                                   const struct cl_io_slice *slice,
+                                   size_t nob);
+               /**
+                * Called once per io, bottom-to-top to release io resources.
+                */
+               void (*cio_fini) (const struct lu_env *env,
+                                 const struct cl_io_slice *slice);
+       } op[CIT_OP_NR];
+       struct {
+               /**
+                * Submit pages from \a queue->c2_qin for IO, and move
+                * successfully submitted pages into \a queue->c2_qout. Return
+                * non-zero if it failed to submit even a single page. If
+                * submission failed after some pages were moved into \a
+                * queue->c2_qout, completion callback with non-zero ioret is
+                * executed on them.
+                */
+               int  (*cio_submit)(const struct lu_env *env,
+                                  const struct cl_io_slice *slice,
+                                  enum cl_req_type crt,
+                                  struct cl_2queue *queue);
+       } req_op[CRT_NR];
+       /**
+        * Read missing page.
+        *
+        * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
+        * method, when it hits a not-up-to-date page in the range. Optional.
+        *
+        * \pre io->ci_type == CIT_READ
+        */
+       int (*cio_read_page)(const struct lu_env *env,
+                            const struct cl_io_slice *slice,
+                            const struct cl_page_slice *page);
+       /**
+        * Prepare write of a \a page. Called bottom-to-top by a top-level
+        * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare the page to
+        * receive data from the user-level buffer.
+        *
+        * \pre io->ci_type == CIT_WRITE
+        *
+        * \see vvp_io_prepare_write(), lov_io_prepare_write(),
+        * osc_io_prepare_write().
+        */
+       int (*cio_prepare_write)(const struct lu_env *env,
+                                const struct cl_io_slice *slice,
+                                const struct cl_page_slice *page,
+                                unsigned from, unsigned to);
+       /**
+        *
+        * \pre io->ci_type == CIT_WRITE
+        *
+        * \see vvp_io_commit_write(), lov_io_commit_write(),
+        * osc_io_commit_write().
+        */
+       int (*cio_commit_write)(const struct lu_env *env,
+                               const struct cl_io_slice *slice,
+                               const struct cl_page_slice *page,
+                               unsigned from, unsigned to);
+       /**
+        * Optional debugging helper. Print given io slice.
+        */
+       int (*cio_print)(const struct lu_env *env, void *cookie,
+                        lu_printer_t p, const struct cl_io_slice *slice);
+};
+
+/**
+ * Flags to lock enqueue procedure.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+       /**
+        * instruct server not to block if a conflicting lock is found. Instead,
+        * -EWOULDBLOCK is returned immediately.
+        */
+       CEF_NONBLOCK     = 0x00000001,
+       /**
+        * take lock asynchronously (out of order), as it cannot
+        * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
+        */
+       CEF_ASYNC       = 0x00000002,
+       /**
+        * tell the server to instruct (though a flag in the blocking ast) an
+        * owner of the conflicting lock, that it can drop dirty pages
+        * protected by this lock, without sending them to the server.
+        */
+       CEF_DISCARD_DATA = 0x00000004,
+       /**
+        * tell the sub layers that it must be a `real' lock. This is used for
+        * mmapped-buffer locks and glimpse locks that must never be converted
+        * into lockless mode.
+        *
+        * \see vvp_mmap_locks(), cl_glimpse_lock().
+        */
+       CEF_MUST         = 0x00000008,
+       /**
+        * tell the sub layers to never request a `real' lock. This flag is
+        * not used currently.
+        *
+        * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
+        * conversion policy: ci_lockreq describes generic information of lock
+        * requirement for this IO, especially for locks which belong to the
+        * object doing IO; however, lock itself may have precise requirements
+        * that are described by the enqueue flags.
+        */
+       CEF_NEVER       = 0x00000010,
+       /**
+        * for async glimpse lock.
+        */
+       CEF_AGL   = 0x00000020,
+       /**
+        * mask of enq_flags.
+        */
+       CEF_MASK         = 0x0000003f,
+};
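+/*
+ * Illustrative (hypothetical) use of the enqueue flags: they are OR-ed into
+ * cl_lock_descr::cld_enq_flags before the lock is enqueued, e.g. for a lock
+ * that must not be lockless and must not block on a conflict:
+ *
+ *     descr->cld_enq_flags = CEF_MUST | CEF_NONBLOCK;
+ */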
+
+/**
+ * Link between lock and io. Intermediate structure is needed, because the
+ * same lock can be part of multiple io's simultaneously.
+ */
+struct cl_io_lock_link {
+       /** linkage into one of cl_lockset lists. */
+       struct list_head           cill_linkage;
+       struct cl_lock_descr cill_descr;
+       struct cl_lock      *cill_lock;
+       /** optional destructor */
+       void           (*cill_fini)(const struct lu_env *env,
+                                       struct cl_io_lock_link *link);
+};
+
+/**
+ * Lock-set represents a collection of locks that an io needs at a
+ * time. Generally speaking, client tries to avoid holding multiple locks when
+ * possible, because
+ *
+ *      - holding extent locks over multiple ost's introduces the danger of
+ *     "cascading timeouts";
+ *
+ *      - holding multiple locks over the same ost is still dead-lock prone,
+ *     see comment in osc_lock_enqueue(),
+ *
+ * but there are certain situations where this is unavoidable:
+ *
+ *      - O_APPEND writes have to take [0, EOF] lock for correctness;
+ *
+ *      - truncate has to take [new-size, EOF] lock for correctness;
+ *
+ *      - SNS has to take locks across full stripe for correctness;
+ *
+ *      - in the case when user level buffer, supplied to {read,write}(file0),
+ *     is a part of a memory mapped lustre file, client has to take a dlm
+ *     locks on file0, and all files that back up the buffer (or a part of
+ *     the buffer, that is being processed in the current chunk, in any
+ *     case, there are situations where at least 2 locks are necessary).
+ *
+ * In such cases we at least try to take locks in the same consistent
+ * order. To this end, all locks are first collected, then sorted, and then
+ * enqueued.
+ */
+struct cl_lockset {
+       /** locks to be acquired. */
+       struct list_head  cls_todo;
+       /** locks currently being processed. */
+       struct list_head  cls_curr;
+       /** locks acquired. */
+       struct list_head  cls_done;
+};
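+/*
+ * Illustrative sketch (assumption: descr has already been filled in by the
+ * layer): a layer's cio_lock() method usually populates cls_todo through
+ * cl_io_lock_alloc_add(), declared later in this header; the generic code
+ * then sorts and enqueues the collected locks:
+ *
+ *     int rc = cl_io_lock_alloc_add(env, io, descr);
+ *     if (rc < 0)
+ *             return rc;
+ */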
+
+/**
+ * Lock requirements (demand) for IO. It should be cl_io_lock_req,
+ * but 'req' is always to be thought of as 'request' :-)
+ */
+enum cl_io_lock_dmd {
+       /** Always lock data (e.g., O_APPEND). */
+       CILR_MANDATORY = 0,
+       /** Layers are free to decide between local and global locking. */
+       CILR_MAYBE,
+       /** Never lock: there is no cache (e.g., liblustre). */
+       CILR_NEVER
+};
+
+enum cl_fsync_mode {
+       /** start writeback, do not wait for it to finish */
+       CL_FSYNC_NONE  = 0,
+       /** start writeback and wait for it to finish */
+       CL_FSYNC_LOCAL = 1,
+       /** discard all dirty pages in a specific file range */
+       CL_FSYNC_DISCARD = 2,
+       /** start writeback and make sure the pages have reached storage
+        * before returning. An OST_SYNC RPC must be issued and completed */
+       CL_FSYNC_ALL   = 3
+};
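+/*
+ * Example (sketch only, using the cl_fsync_io member of struct cl_io defined
+ * below): an fsync(2) over the byte range [start, end] that must reach
+ * stable storage would be set up as
+ *
+ *     io->u.ci_fsync.fi_start = start;
+ *     io->u.ci_fsync.fi_end   = end;
+ *     io->u.ci_fsync.fi_mode  = CL_FSYNC_ALL;
+ */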
+
+struct cl_io_rw_common {
+       loff_t      crw_pos;
+       size_t      crw_count;
+       int      crw_nonblock;
+};
+
+
+/**
+ * State for io.
+ *
+ * cl_io is shared by all threads participating in this IO (in the current
+ * implementation only one thread advances an IO, but parallel IO design and
+ * concurrent copy_*_user() require multiple threads acting on the same IO).
+ * It is up to these threads to serialize their activities, including updates
+ * to mutable cl_io fields.
+ */
+struct cl_io {
+       /** type of this IO. Immutable after creation. */
+       enum cl_io_type         ci_type;
+       /** current state of cl_io state machine. */
+       enum cl_io_state               ci_state;
+       /** main object this io is against. Immutable after creation. */
+       struct cl_object              *ci_obj;
+       /**
+        * Upper layer io, of which this io is a part. Immutable after
+        * creation.
+        */
+       struct cl_io              *ci_parent;
+       /** List of slices. Immutable after creation. */
+       struct list_head                     ci_layers;
+       /** list of locks (to be) acquired by this io. */
+       struct cl_lockset             ci_lockset;
+       /** lock requirements; this is just a hint for sublayers. */
+       enum cl_io_lock_dmd         ci_lockreq;
+       union {
+               struct cl_rd_io {
+                       struct cl_io_rw_common rd;
+               } ci_rd;
+               struct cl_wr_io {
+                       struct cl_io_rw_common wr;
+                       int                 wr_append;
+                       int                 wr_sync;
+               } ci_wr;
+               struct cl_io_rw_common ci_rw;
+               struct cl_setattr_io {
+                       struct ost_lvb   sa_attr;
+                       unsigned int     sa_valid;
+                       struct obd_capa *sa_capa;
+               } ci_setattr;
+               struct cl_fault_io {
+                       /** page index within file. */
+                       pgoff_t  ft_index;
+                       /** number of valid bytes on a faulted page. */
+                       int          ft_nob;
+                       /** writable page? for nopage() only */
+                       int          ft_writable;
+                       /** page of an executable? */
+                       int          ft_executable;
+                       /** page_mkwrite() */
+                       int          ft_mkwrite;
+                       /** resulting page */
+                       struct cl_page *ft_page;
+               } ci_fault;
+               struct cl_fsync_io {
+                       loff_t       fi_start;
+                       loff_t       fi_end;
+                       struct obd_capa   *fi_capa;
+                       /** file system level fid */
+                       struct lu_fid     *fi_fid;
+                       enum cl_fsync_mode fi_mode;
+                       /* how many pages were written/discarded */
+                       unsigned int       fi_nr_written;
+               } ci_fsync;
+       } u;
+       struct cl_2queue     ci_queue;
+       size_t         ci_nob;
+       int               ci_result;
+       unsigned int     ci_continue:1,
+       /**
+        * This io holds a grouplock, to inform sublayers that they
+        * must not do lockless i/o.
+        */
+                            ci_no_srvlock:1,
+       /**
+        * The whole IO needs to be restarted because the layout has changed
+        */
+                            ci_need_restart:1,
+       /**
+        * do not refresh the layout - the IO issuer knows that the layout
+        * won't change (page operations; a layout change causes all pages to
+        * be discarded), or it doesn't matter if it changes (sync).
+        */
+                            ci_ignore_layout:1,
+       /**
+        * Check whether the layout changed after the IO finishes. Mainly for
+        * the HSM requirement. If IO occurs on open files, there is no need to
+        * verify the layout because HSM won't release open files.
+        * Right now, only two operations need to verify the layout: glimpse
+        * and setattr.
+        */
+                            ci_verify_layout:1;
+       /**
+        * Number of pages owned by this IO. For invariant checking.
+        */
+       unsigned             ci_owned_nr;
+};
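+/*
+ * Illustration (not part of the original description): layers dispatch on
+ * ci_type and the matching member of the union above, e.g.
+ *
+ *     if (io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append)
+ *             ... an O_APPEND write: the whole [0, EOF] extent is locked ...
+ *     else if (io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite)
+ *             ... page_mkwrite() path ...
+ *
+ * The cl_io_is_append() and cl_io_is_mkwrite() helpers below wrap exactly
+ * these checks.
+ */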
+
+/** @} cl_io */
+
+/** \addtogroup cl_req cl_req
+ * @{ */
+/** \struct cl_req
+ * Transfer.
+ *
+ * There are two possible modes of transfer initiation on the client:
+ *
+ *     - immediate transfer: this is started when a high level io wants a page
+ *       or a collection of pages to be transferred right away. Examples:
+ *       read-ahead, synchronous read in the case of non-page aligned write,
+ *       page write-out as a part of extent lock cancellation, page write-out
+ *       as a part of memory cleansing. Immediate transfer can be both
+ *       cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
+ *
+ *     - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
+ *       when io wants to transfer a page to the server some time later, when
+ *       it can be done efficiently. Example: pages dirtied by the write(2)
+ *       path.
+ *
+ * In any case, transfer takes place in the form of a cl_req, which is a
+ * representation for a network RPC.
+ *
+ * Pages queued for an opportunistic transfer are cached until it is decided
+ * that efficient RPC can be composed of them. This decision is made by "a
+ * req-formation engine", currently implemented as a part of osc
+ * layer. Req-formation depends on many factors: the size of the resulting
+ * RPC, whether or not multi-object RPCs are supported by the server,
+ * max-rpc-in-flight limitations, size of the dirty cache, etc.
+ *
+ * For the immediate transfer io submits a cl_page_list, that req-formation
+ * engine slices into cl_req's, possibly adding cached pages to some of
+ * the resulting req's.
+ *
+ * Whenever a page from cl_page_list is added to a newly constructed req, its
+ * cl_page_operations::cpo_prep() layer methods are called. At that moment,
+ * page state is atomically changed from cl_page_state::CPS_OWNED to
+ * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
+ * is zeroed, and cl_page::cp_req is set to the
+ * req. cl_page_operations::cpo_prep() method at the particular layer might
+ * return -EALREADY to indicate that it does not need to submit this page
+ * at all. This is possible, for example, if a page submitted for read became
+ * up-to-date in the meantime; for write, this happens when the page does not
+ * have its dirty bit set. \see cl_io_submit_rw()
+ *
+ * Whenever a cached page is added to a newly constructed req, its
+ * cl_page_operations::cpo_make_ready() layer methods are called. At that
+ * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
+ * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
+ * req. cl_page_operations::cpo_make_ready() method at the particular layer
+ * might return -EAGAIN to indicate that this page is not eligible for the
+ * transfer right now.
+ *
+ * FUTURE
+ *
+ * The plan is to divide transfers into "priority bands" (indicated when
+ * submitting a cl_page_list, and when queuing a page for the opportunistic
+ * transfer) and to allow gluing of cached pages to immediate transfers only
+ * within a single band. This would make high priority transfers (like lock
+ * cancellation or memory pressure induced write-out) really high priority.
+ *
+ */
+
+/**
+ * Per-transfer attributes.
+ */
+struct cl_req_attr {
+       /** Generic attributes for the server consumption. */
+       struct obdo     *cra_oa;
+       /** Capability. */
+       struct obd_capa *cra_capa;
+       /** Jobid */
+       char             cra_jobid[JOBSTATS_JOBID_SIZE];
+};
+
+/**
+ * Transfer request operations definable at every layer.
+ *
+ * Concurrency: transfer formation engine synchronizes calls to all transfer
+ * methods.
+ */
+struct cl_req_operations {
+       /**
+        * Invoked top-to-bottom by cl_req_prep() when transfer formation is
+        * complete (all pages are added).
+        *
+        * \see osc_req_prep()
+        */
+       int  (*cro_prep)(const struct lu_env *env,
+                        const struct cl_req_slice *slice);
+       /**
+        * Called top-to-bottom to fill in \a oa fields. This is called twice
+        * with different flags, see bug 10150 and osc_build_req().
+        *
+        * \param obj an object from cl_req whose attributes are to be set in
+        *          \a oa.
+        *
+        * \param oa struct obdo where attributes are placed
+        *
+        * \param flags \a oa fields to be filled.
+        */
+       void (*cro_attr_set)(const struct lu_env *env,
+                            const struct cl_req_slice *slice,
+                            const struct cl_object *obj,
+                            struct cl_req_attr *attr, obd_valid flags);
+       /**
+        * Called top-to-bottom from cl_req_completion() to notify layers that
+        * transfer completed. Has to free all state allocated by
+        * cl_device_operations::cdo_req_init().
+        */
+       void (*cro_completion)(const struct lu_env *env,
+                              const struct cl_req_slice *slice, int ioret);
+};
+
+/**
+ * Per-object state that a (potentially multi-object) transfer request keeps.
+ */
+struct cl_req_obj {
+       /** object itself */
+       struct cl_object   *ro_obj;
+       /** reference to cl_req_obj::ro_obj. For debugging. */
+       struct lu_ref_link *ro_obj_ref;
+       /* something else? Number of pages for a given object? */
+};
+
+/**
+ * Transfer request.
+ *
+ * Transfer requests are not reference counted, because IO sub-system owns
+ * them exclusively and knows when to free them.
+ *
+ * Life cycle.
+ *
+ * cl_req is created by cl_req_alloc() that calls
+ * cl_device_operations::cdo_req_init() device methods to allocate per-req
+ * state in every layer.
+ *
+ * Then pages are added (cl_req_page_add()); the req keeps track of all
+ * objects it contains pages for.
+ *
+ * Once all pages have been collected, the cl_req_operations::cro_prep()
+ * method is called top-to-bottom. At that point layers can modify the req,
+ * let it pass, or
+ * deny it completely. This is to support things like SNS that have transfer
+ * ordering requirements invisible to the individual req-formation engine.
+ *
+ * On transfer completion (or transfer timeout, or failure to initiate the
+ * transfer of an allocated req), cl_req_operations::cro_completion() method
+ * is called, after execution of cl_page_operations::cpo_completion() of all
+ * req's pages.
+ */
+struct cl_req {
+       enum cl_req_type      crq_type;
+       /** A list of pages being transferred */
+       struct list_head            crq_pages;
+       /** Number of pages in cl_req::crq_pages */
+       unsigned              crq_nrpages;
+       /** An array of objects whose pages are in ->crq_pages */
+       struct cl_req_obj    *crq_o;
+       /** Number of elements in cl_req::crq_o[] */
+       unsigned              crq_nrobjs;
+       struct list_head            crq_layers;
+};
+
+/**
+ * Per-layer state for request.
+ */
+struct cl_req_slice {
+       struct cl_req    *crs_req;
+       struct cl_device *crs_dev;
+       struct list_head        crs_linkage;
+       const struct cl_req_operations *crs_ops;
+};
+
+/* @} cl_req */
+
+enum cache_stats_item {
+       /** how many cache lookups were performed */
+       CS_lookup = 0,
+       /** how many times cache lookup resulted in a hit */
+       CS_hit,
+       /** how many entities are in the cache right now */
+       CS_total,
+       /** how many entities in the cache are actively used (and cannot be
+        * evicted) right now */
+       CS_busy,
+       /** how many entities were created at all */
+       CS_create,
+       CS_NR
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches).
+ */
+struct cache_stats {
+       const char    *cs_name;
+       atomic_t   cs_stats[CS_NR];
+};
+
+/** These are not exported so far */
+void cache_stats_init (struct cache_stats *cs, const char *name);
+int  cache_stats_print(const struct cache_stats *cs,
+                      char *page, int count, int header);
+
+/**
+ * Client-side site. This represents a particular client stack. "Global"
+ * variables should (directly or indirectly) be added here to allow multiple
+ * clients to co-exist in a single address space.
+ */
+struct cl_site {
+       struct lu_site  cs_lu;
+       /**
+        * Statistical counters. Atomics do not scale; something better, like
+        * per-cpu counters, is needed.
+        *
+        * These are exported as /proc/fs/lustre/llite/.../site
+        *
+        * When interpreting these, keep in mind that both sub-locks (and
+        * sub-pages) and top-locks (and top-pages) are accounted here.
+        */
+       struct cache_stats    cs_pages;
+       struct cache_stats    cs_locks;
+       atomic_t          cs_pages_state[CPS_NR];
+       atomic_t          cs_locks_state[CLS_NR];
+};
+
+int  cl_site_init (struct cl_site *s, struct cl_device *top);
+void cl_site_fini (struct cl_site *s);
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
+
+/**
+ * Output client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *s, char *page, int count);
+
+/**
+ * \name helpers
+ *
+ * Type conversion and accessor functions.
+ */
+/** @{ */
+
+static inline struct cl_site *lu2cl_site(const struct lu_site *site)
+{
+       return container_of(site, struct cl_site, cs_lu);
+}
+
+static inline int lu_device_is_cl(const struct lu_device *d)
+{
+       return d->ld_type->ldt_tags & LU_DEVICE_CL;
+}
+
+static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
+{
+       LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
+       return container_of0(d, struct cl_device, cd_lu_dev);
+}
+
+static inline struct lu_device *cl2lu_dev(struct cl_device *d)
+{
+       return &d->cd_lu_dev;
+}
+
+static inline struct cl_object *lu2cl(const struct lu_object *o)
+{
+       LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
+       return container_of0(o, struct cl_object, co_lu);
+}
+
+static inline const struct cl_object_conf *
+lu2cl_conf(const struct lu_object_conf *conf)
+{
+       return container_of0(conf, struct cl_object_conf, coc_lu);
+}
+
+static inline struct cl_object *cl_object_next(const struct cl_object *obj)
+{
+       return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
+}
+
+static inline struct cl_device *cl_object_device(const struct cl_object *o)
+{
+       LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
+       return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
+}
+
+static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
+{
+       return container_of0(h, struct cl_object_header, coh_lu);
+}
+
+static inline struct cl_site *cl_object_site(const struct cl_object *obj)
+{
+       return lu2cl_site(obj->co_lu.lo_dev->ld_site);
+}
+
+static inline
+struct cl_object_header *cl_object_header(const struct cl_object *obj)
+{
+       return luh2coh(obj->co_lu.lo_header);
+}
+
+static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
+{
+       return lu_device_init(&d->cd_lu_dev, t);
+}
+
+static inline void cl_device_fini(struct cl_device *d)
+{
+       lu_device_fini(&d->cd_lu_dev);
+}
+
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+                      struct cl_object *obj,
+                      const struct cl_page_operations *ops);
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+                      struct cl_object *obj,
+                      const struct cl_lock_operations *ops);
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+                    struct cl_object *obj, const struct cl_io_operations *ops);
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+                     struct cl_device *dev,
+                     const struct cl_req_operations *ops);
+/** @} helpers */
+
+/** \defgroup cl_object cl_object
+ * @{ */
+struct cl_object *cl_object_top (struct cl_object *o);
+struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
+                                const struct lu_fid *fid,
+                                const struct cl_object_conf *c);
+
+int  cl_object_header_init(struct cl_object_header *h);
+void cl_object_header_fini(struct cl_object_header *h);
+void cl_object_put     (const struct lu_env *env, struct cl_object *o);
+void cl_object_get     (struct cl_object *o);
+void cl_object_attr_lock  (struct cl_object *o);
+void cl_object_attr_unlock(struct cl_object *o);
+int  cl_object_attr_get   (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_attr *attr);
+int  cl_object_attr_set   (const struct lu_env *env, struct cl_object *obj,
+                          const struct cl_attr *attr, unsigned valid);
+int  cl_object_glimpse    (const struct lu_env *env, struct cl_object *obj,
+                          struct ost_lvb *lvb);
+int  cl_conf_set         (const struct lu_env *env, struct cl_object *obj,
+                          const struct cl_object_conf *conf);
+void cl_object_prune      (const struct lu_env *env, struct cl_object *obj);
+void cl_object_kill       (const struct lu_env *env, struct cl_object *obj);
+int  cl_object_has_locks  (struct cl_object *obj);
+
+/**
+ * Returns true, iff \a o0 and \a o1 are slices of the same object.
+ */
+static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
+{
+       return cl_object_header(o0) == cl_object_header(o1);
+}
+
+static inline void cl_object_page_init(struct cl_object *clob, int size)
+{
+       clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
+       cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8);
+}
+
+static inline void *cl_object_page_slice(struct cl_object *clob,
+                                        struct cl_page *page)
+{
+       return (void *)((char *)page + clob->co_slice_off);
+}
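+/*
+ * Usage sketch (the slice type name is hypothetical): a layer reserves room
+ * for its per-page state when the object is set up and later recovers that
+ * state from a cl_page:
+ *
+ *     cl_object_page_init(clob, sizeof(struct my_page_slice));
+ *     ...
+ *     struct my_page_slice *slice = cl_object_page_slice(clob, page);
+ */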
+
+/** @} cl_object */
+
+/** \defgroup cl_page cl_page
+ * @{ */
+enum {
+       CLP_GANG_OKAY = 0,
+       CLP_GANG_RESCHED,
+       CLP_GANG_AGAIN,
+       CLP_GANG_ABORT
+};
+
+/* callback of cl_page_gang_lookup() */
+typedef int   (*cl_page_gang_cb_t)  (const struct lu_env *, struct cl_io *,
+                                    struct cl_page *, void *);
+int         cl_page_gang_lookup (const struct lu_env *env,
+                                    struct cl_object *obj,
+                                    struct cl_io *io,
+                                    pgoff_t start, pgoff_t end,
+                                    cl_page_gang_cb_t cb, void *cbdata);
+struct cl_page *cl_page_lookup      (struct cl_object_header *hdr,
+                                    pgoff_t index);
+struct cl_page *cl_page_find   (const struct lu_env *env,
+                                    struct cl_object *obj,
+                                    pgoff_t idx, struct page *vmpage,
+                                    enum cl_page_type type);
+struct cl_page *cl_page_find_sub    (const struct lu_env *env,
+                                    struct cl_object *obj,
+                                    pgoff_t idx, struct page *vmpage,
+                                    struct cl_page *parent);
+void       cl_page_get  (struct cl_page *page);
+void       cl_page_put  (const struct lu_env *env,
+                                    struct cl_page *page);
+void       cl_page_print       (const struct lu_env *env, void *cookie,
+                                    lu_printer_t printer,
+                                    const struct cl_page *pg);
+void       cl_page_header_print(const struct lu_env *env, void *cookie,
+                                    lu_printer_t printer,
+                                    const struct cl_page *pg);
+struct page     *cl_page_vmpage      (const struct lu_env *env,
+                                    struct cl_page *page);
+struct cl_page *cl_vmpage_page      (struct page *vmpage, struct cl_object *obj);
+struct cl_page *cl_page_top     (struct cl_page *page);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+                                      const struct lu_device_type *dtype);
+
+/**
+ * \name ownership
+ *
+ * Functions dealing with the ownership of page by io.
+ */
+/** @{ */
+
+int  cl_page_own       (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *page);
+int  cl_page_own_try    (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *page);
+void cl_page_assume     (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *page);
+void cl_page_unassume   (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *pg);
+void cl_page_disown     (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *page);
+int  cl_page_is_owned   (const struct cl_page *pg, const struct cl_io *io);
+
+/** @} ownership */
+
+/**
+ * \name transfer
+ *
+ * Functions dealing with the preparation of a page for a transfer, and
+ * tracking transfer state.
+ */
+/** @{ */
+int  cl_page_prep       (const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *pg, enum cl_req_type crt);
+void cl_page_completion (const struct lu_env *env,
+                        struct cl_page *pg, enum cl_req_type crt, int ioret);
+int  cl_page_make_ready (const struct lu_env *env, struct cl_page *pg,
+                        enum cl_req_type crt);
+int  cl_page_cache_add  (const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *pg, enum cl_req_type crt);
+void cl_page_clip       (const struct lu_env *env, struct cl_page *pg,
+                        int from, int to);
+int  cl_page_cancel     (const struct lu_env *env, struct cl_page *page);
+int  cl_page_flush      (const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *pg);
+
+/** @} transfer */
+
+
+/**
+ * \name helper routines
+ * Functions to discard, delete and export a cl_page.
+ */
+/** @{ */
+void    cl_page_discard      (const struct lu_env *env, struct cl_io *io,
+                             struct cl_page *pg);
+void    cl_page_delete       (const struct lu_env *env, struct cl_page *pg);
+int     cl_page_unmap  (const struct lu_env *env, struct cl_io *io,
+                             struct cl_page *pg);
+int     cl_page_is_vmlocked  (const struct lu_env *env,
+                             const struct cl_page *pg);
+void    cl_page_export       (const struct lu_env *env,
+                             struct cl_page *pg, int uptodate);
+int     cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+                             struct cl_page *page);
+loff_t  cl_offset          (const struct cl_object *obj, pgoff_t idx);
+pgoff_t cl_index            (const struct cl_object *obj, loff_t offset);
+int     cl_page_size    (const struct cl_object *obj);
+int     cl_pages_prune       (const struct lu_env *env, struct cl_object *obj);
+
+void cl_lock_print      (const struct lu_env *env, void *cookie,
+                        lu_printer_t printer, const struct cl_lock *lock);
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+                        lu_printer_t printer,
+                        const struct cl_lock_descr *descr);
+/* @} helper */
+
+/** @} cl_page */
+
+/** \defgroup cl_lock cl_lock
+ * @{ */
+
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+                            const struct cl_lock_descr *need,
+                            const char *scope, const void *source);
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+                            const struct cl_lock_descr *need,
+                            const char *scope, const void *source);
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+                               const struct cl_lock_descr *need,
+                               const char *scope, const void *source);
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+                                struct cl_object *obj, pgoff_t index,
+                                struct cl_lock *except, int pending,
+                                int canceld);
+static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
+                                             struct cl_object *obj,
+                                             struct cl_page *page,
+                                             struct cl_lock *except,
+                                             int pending, int canceld)
+{
+       LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
+       return cl_lock_at_pgoff(env, obj, page->cp_index, except,
+                               pending, canceld);
+}
+
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+                                      const struct lu_device_type *dtype);
+
+void  cl_lock_get       (struct cl_lock *lock);
+void  cl_lock_get_trust (struct cl_lock *lock);
+void  cl_lock_put       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_hold_add  (const struct lu_env *env, struct cl_lock *lock,
+                        const char *scope, const void *source);
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+                         const char *scope, const void *source);
+void  cl_lock_unhold    (const struct lu_env *env, struct cl_lock *lock,
+                        const char *scope, const void *source);
+void  cl_lock_release   (const struct lu_env *env, struct cl_lock *lock,
+                        const char *scope, const void *source);
+void  cl_lock_user_add  (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_user_del  (const struct lu_env *env, struct cl_lock *lock);
+
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+                                    struct cl_lock *lock);
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+                      enum cl_lock_state state);
+int cl_lock_is_intransit(struct cl_lock *lock);
+
+int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock,
+                        int keep_mutex);
+
+/** \name statemachine statemachine
+ * Interface to lock state machine consists of 3 parts:
+ *
+ *     - "try" functions that attempt to effect a state transition. If state
+ *     transition is not possible right now (e.g., if it has to wait for some
+ *     asynchronous event to occur), these functions return
+ *     cl_lock_transition::CLO_WAIT.
+ *
+ *     - "non-try" functions that implement synchronous blocking interface on
+ *     top of non-blocking "try" functions. These functions repeatedly call
+ *     corresponding "try" versions, and if state transition is not possible
+ *     immediately, wait for lock state change.
+ *
+ *     - methods from cl_lock_operations, called by "try" functions. A lock
+ *     can be advanced to the target state only when all layers have voted
+ *     that they are ready for this transition. "Try" functions call methods
+ *     under the lock mutex. If a layer had to release the mutex, it
+ *     re-acquires it and returns cl_lock_transition::CLO_REPEAT, causing the
+ *     "try" function to call all layers again.
+ *
+ * TRY       NON-TRY      METHOD                           FINAL STATE
+ *
+ * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED
+ *
+ * cl_wait_try()    cl_wait()    cl_lock_operations::clo_wait()    CLS_HELD
+ *
+ * cl_unuse_try()   cl_unuse()   cl_lock_operations::clo_unuse()   CLS_CACHED
+ *
+ * cl_use_try()     NONE        cl_lock_operations::clo_use()     CLS_HELD
+ *
+ * @{ */
+
+int   cl_enqueue    (const struct lu_env *env, struct cl_lock *lock,
+                    struct cl_io *io, __u32 flags);
+int   cl_wait       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_unuse      (const struct lu_env *env, struct cl_lock *lock);
+int   cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+                    struct cl_io *io, __u32 flags);
+int   cl_unuse_try  (const struct lu_env *env, struct cl_lock *lock);
+int   cl_wait_try   (const struct lu_env *env, struct cl_lock *lock);
+int   cl_use_try    (const struct lu_env *env, struct cl_lock *lock, int atomic);
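+/*
+ * Sketch of the blocking interface above (illustration only; enqflags is a
+ * mask of cl_enq_flags and lock came from cl_lock_request()):
+ *
+ *     rc = cl_enqueue(env, lock, io, enqflags);    (-> CLS_ENQUEUED)
+ *     if (rc == 0)
+ *             rc = cl_wait(env, lock);             (-> CLS_HELD)
+ *     ...
+ *     cl_unuse(env, lock);                         (-> CLS_CACHED)
+ */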
+
+/** @} statemachine */
+
+void cl_lock_signal      (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_state_wait  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_state_set   (const struct lu_env *env, struct cl_lock *lock,
+                         enum cl_lock_state state);
+int  cl_queue_match      (const struct list_head *queue,
+                         const struct cl_lock_descr *need);
+
+void cl_lock_mutex_get  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_mutex_try  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_mutex_put  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_is_mutexed (struct cl_lock *lock);
+int  cl_lock_nr_mutexed (const struct lu_env *env);
+int  cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_ext_match  (const struct cl_lock_descr *has,
+                        const struct cl_lock_descr *need);
+int  cl_lock_descr_match(const struct cl_lock_descr *has,
+                        const struct cl_lock_descr *need);
+int  cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need);
+int  cl_lock_modify     (const struct lu_env *env, struct cl_lock *lock,
+                        const struct cl_lock_descr *desc);
+
+void cl_lock_closure_init (const struct lu_env *env,
+                          struct cl_lock_closure *closure,
+                          struct cl_lock *origin, int wait);
+void cl_lock_closure_fini (struct cl_lock_closure *closure);
+int  cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+                          struct cl_lock_closure *closure);
+void cl_lock_disclosure   (const struct lu_env *env,
+                          struct cl_lock_closure *closure);
+int  cl_lock_enclosure    (const struct lu_env *env, struct cl_lock *lock,
+                          struct cl_lock_closure *closure);
+
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);
+
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);
+
+/** @} cl_lock */
+
+/** \defgroup cl_io cl_io
+ * @{ */
+
+int   cl_io_init        (const struct lu_env *env, struct cl_io *io,
+                         enum cl_io_type iot, struct cl_object *obj);
+int   cl_io_sub_init     (const struct lu_env *env, struct cl_io *io,
+                         enum cl_io_type iot, struct cl_object *obj);
+int   cl_io_rw_init      (const struct lu_env *env, struct cl_io *io,
+                         enum cl_io_type iot, loff_t pos, size_t count);
+int   cl_io_loop        (const struct lu_env *env, struct cl_io *io);
+
+void  cl_io_fini        (const struct lu_env *env, struct cl_io *io);
+int   cl_io_iter_init    (const struct lu_env *env, struct cl_io *io);
+void  cl_io_iter_fini    (const struct lu_env *env, struct cl_io *io);
+int   cl_io_lock        (const struct lu_env *env, struct cl_io *io);
+void  cl_io_unlock       (const struct lu_env *env, struct cl_io *io);
+int   cl_io_start      (const struct lu_env *env, struct cl_io *io);
+void  cl_io_end          (const struct lu_env *env, struct cl_io *io);
+int   cl_io_lock_add     (const struct lu_env *env, struct cl_io *io,
+                         struct cl_io_lock_link *link);
+int   cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+                          struct cl_lock_descr *descr);
+int   cl_io_read_page    (const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page);
+int   cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page, unsigned from, unsigned to);
+int   cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page, unsigned from, unsigned to);
+int   cl_io_submit_rw    (const struct lu_env *env, struct cl_io *io,
+                         enum cl_req_type iot, struct cl_2queue *queue);
+int   cl_io_submit_sync  (const struct lu_env *env, struct cl_io *io,
+                         enum cl_req_type iot, struct cl_2queue *queue,
+                         long timeout);
+void  cl_io_rw_advance   (const struct lu_env *env, struct cl_io *io,
+                         size_t nob);
+int   cl_io_cancel       (const struct lu_env *env, struct cl_io *io,
+                         struct cl_page_list *queue);
+int   cl_io_is_going     (const struct lu_env *env);
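+/*
+ * Sketch of the typical top-level sequence built from the functions above
+ * (illustration only; error handling abbreviated, io points to preallocated
+ * storage):
+ *
+ *     rc = cl_io_rw_init(env, io, CIT_WRITE, pos, count);
+ *     if (rc == 0)
+ *             rc = cl_io_loop(env, io);
+ *     cl_io_fini(env, io);
+ */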
+
+/**
+ * True, iff \a io is an O_APPEND write(2).
+ */
+static inline int cl_io_is_append(const struct cl_io *io)
+{
+       return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
+}
+
+static inline int cl_io_is_sync_write(const struct cl_io *io)
+{
+       return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
+}
+
+static inline int cl_io_is_mkwrite(const struct cl_io *io)
+{
+       return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
+}
+
+/**
+ * True, iff \a io is a truncate(2).
+ */
+static inline int cl_io_is_trunc(const struct cl_io *io)
+{
+       return io->ci_type == CIT_SETATTR &&
+               (io->u.ci_setattr.sa_valid & ATTR_SIZE);
+}
+
+struct cl_io *cl_io_top(struct cl_io *io);
+
+void cl_io_print(const struct lu_env *env, void *cookie,
+                lu_printer_t printer, const struct cl_io *io);
+
+#define CL_IO_SLICE_CLEAN(foo_io, base)                                 \
+do {                                                               \
+       typeof(foo_io) __foo_io = (foo_io);                          \
+                                                                       \
+       CLASSERT(offsetof(typeof(*__foo_io), base) == 0);              \
+       memset(&__foo_io->base + 1, 0,                            \
+              (sizeof *__foo_io) - sizeof __foo_io->base);          \
+} while (0)
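+/*
+ * Example of the macro above (the layer io type and member names are
+ * hypothetical): given
+ *
+ *     struct my_io {
+ *             struct cl_io_slice mio_cl;      (must be the first member)
+ *             int                mio_state;
+ *     };
+ *
+ * CL_IO_SLICE_CLEAN(mio, mio_cl) zeroes everything after mio_cl, leaving the
+ * already initialized slice intact.
+ */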
+
+/** @} cl_io */
+
+/** \defgroup cl_page_list cl_page_list
+ * @{ */
+
+/**
+ * Last page in the page list.
+ */
+static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
+{
+       LASSERT(plist->pl_nr > 0);
+       return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
+}
+
+/**
+ * Iterate over pages in a page list.
+ */
+#define cl_page_list_for_each(page, list)                             \
+       list_for_each_entry((page), &(list)->pl_pages, cp_batch)
+
+/**
+ * Iterate over pages in a page list, taking possible removals into account.
+ */
+#define cl_page_list_for_each_safe(page, temp, list)               \
+       list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
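+/*
+ * Example of the iteration helpers above (illustration only; plist is
+ * assumed to be populated and owned by io):
+ *
+ *     struct cl_page *page;
+ *
+ *     cl_page_list_for_each(page, plist)
+ *             cl_page_clip(env, page, 0, to);
+ */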
+
+void cl_page_list_init   (struct cl_page_list *plist);
+void cl_page_list_add    (struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_move   (struct cl_page_list *dst, struct cl_page_list *src,
+                         struct cl_page *page);
+void cl_page_list_splice (struct cl_page_list *list,
+                         struct cl_page_list *head);
+void cl_page_list_del    (const struct lu_env *env,
+                         struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_disown (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+int  cl_page_list_own    (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_assume (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_discard(const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+int  cl_page_list_unmap  (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_fini   (const struct lu_env *env, struct cl_page_list *plist);
+
+void cl_2queue_init     (struct cl_2queue *queue);
+void cl_2queue_add      (struct cl_2queue *queue, struct cl_page *page);
+void cl_2queue_disown   (const struct lu_env *env,
+                        struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_assume   (const struct lu_env *env,
+                        struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_discard  (const struct lu_env *env,
+                        struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_fini     (const struct lu_env *env, struct cl_2queue *queue);
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
+
+/** @} cl_page_list */
+
+/** \defgroup cl_req cl_req
+ * @{ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+                           enum cl_req_type crt, int nr_objects);
+
+void cl_req_page_add  (const struct lu_env *env, struct cl_req *req,
+                      struct cl_page *page);
+void cl_req_page_done (const struct lu_env *env, struct cl_page *page);
+int  cl_req_prep      (const struct lu_env *env, struct cl_req *req);
+void cl_req_attr_set  (const struct lu_env *env, struct cl_req *req,
+                      struct cl_req_attr *attr, obd_valid flags);
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
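+/*
+ * Sketch of the req life cycle described in the cl_req section above
+ * (illustration only; error handling omitted):
+ *
+ *     req = cl_req_alloc(env, page, CRT_WRITE, 1);
+ *     cl_req_page_add(env, req, page);
+ *     rc = cl_req_prep(env, req);
+ *     ... transfer takes place ...
+ *     cl_req_completion(env, req, ioret);
+ */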
+
+/** \defgroup cl_sync_io cl_sync_io
+ * @{ */
+
+/**
+ * Anchor for synchronous transfer. This is allocated on the stack by the
+ * thread doing a synchronous transfer, and a pointer to this structure is
+ * set up in every page submitted for transfer. The transfer completion
+ * routine updates the anchor and wakes up the waiting thread when the
+ * transfer is complete.
+ */
+struct cl_sync_io {
+       /** number of pages yet to be transferred. */
+       atomic_t                csi_sync_nr;
+       /** error code. */
+       int                     csi_sync_rc;
+       /** barrier to delay destruction of this structure */
+       atomic_t                csi_barrier;
+       /** completion to be signaled when transfer is complete. */
+       wait_queue_head_t               csi_waitq;
+};
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int  cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+                    struct cl_page_list *queue, struct cl_sync_io *anchor,
+                    long timeout);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
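+/*
+ * Sketch of a synchronous transfer built on the anchor above (illustration
+ * only; cl_io_submit_sync() wraps this pattern for the common case):
+ *
+ *     struct cl_sync_io anchor;
+ *
+ *     cl_sync_io_init(&anchor, queue->pl_nr);
+ *     ... submit the pages, pointing each one at &anchor ...
+ *     rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);
+ */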
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ *     - there is a (mostly) fixed number of threads, and
+ *
+ *     - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client both of these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ *     - allocation and destruction of environment is amortized by caching no
+ *     longer used environments instead of destroying them;
+ *
+ *     - there is a notion of "current" environment, attached to the kernel
+ *     data structure representing the current thread. Top-level lustre code
+ *     allocates an environment and makes it current, then calls into
+ *     non-lustre code, which in turn calls back into lustre. Low-level
+ *     lustre code called this way can fetch the environment created by the
+ *     top-level code and reuse it, avoiding an additional environment
+ *     allocation.
+ *       Right now, three interfaces can attach the cl_env to a running thread:
+ *       - cl_env_get
+ *       - cl_env_implant
+ *       - cl_env_reexit (cl_env_reenter must have been called first)
+ *
+ * \see lu_env, lu_context, lu_context_key
+ * @{ */
+
+struct cl_env_nest {
+       int   cen_refcheck;
+       void *cen_cookie;
+};
+
+struct lu_env *cl_env_peek       (int *refcheck);
+struct lu_env *cl_env_get      (int *refcheck);
+struct lu_env *cl_env_alloc      (int *refcheck, __u32 tags);
+struct lu_env *cl_env_nested_get (struct cl_env_nest *nest);
+void      cl_env_put   (struct lu_env *env, int *refcheck);
+void      cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env);
+void     *cl_env_reenter    (void);
+void      cl_env_reexit     (void *cookie);
+void      cl_env_implant    (struct lu_env *env, int *refcheck);
+void      cl_env_unplant    (struct lu_env *env, int *refcheck);
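+/*
+ * Example of the common acquire/release pattern for the interfaces above
+ * (illustration only):
+ *
+ *     int refcheck;
+ *     struct lu_env *env;
+ *
+ *     env = cl_env_get(&refcheck);
+ *     if (IS_ERR(env))
+ *             return PTR_ERR(env);
+ *     ... use env ...
+ *     cl_env_put(env, &refcheck);
+ */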
+
+/** @} cl_env */
+
+/*
+ * Misc
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr);
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+                               struct lu_device_type *ldt,
+                               struct lu_device *next);
+/** @} clio */
+
+int cl_global_init(void);
+void cl_global_fini(void);
+
+#endif /* _LINUX_CL_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/dt_object.h b/drivers/staging/lustre/lustre/include/dt_object.h
new file mode 100644 (file)
index 0000000..e116bb2
--- /dev/null
@@ -0,0 +1,1498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_DT_OBJECT_H
+#define __LUSTRE_DT_OBJECT_H
+
+/** \defgroup dt dt
+ * Sub-class of lu_object with methods common for "data" objects in OST stack.
+ *
+ * Data objects behave like regular files: you can read/write them, get and
+ * set their attributes. An implementation of the dt interface is supposed to
+ * provide some form of garbage collection, normally a reference counting
+ * (nlink) based one.
+ *
+ * Example: osd (lustre/osd) is an implementation of the dt interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+
+struct thandle;
+struct dt_device;
+struct dt_object;
+struct dt_index_features;
+struct niobuf_local;
+struct niobuf_remote;
+struct ldlm_enqueue_info;
+
+typedef enum {
+       MNTOPT_USERXATTR        = 0x00000001,
+       MNTOPT_ACL            = 0x00000002,
+} mntopt_t;
+
+struct dt_device_param {
+       unsigned           ddp_max_name_len;
+       unsigned           ddp_max_nlink;
+       unsigned           ddp_block_shift;
+       mntopt_t           ddp_mntopts;
+       unsigned           ddp_max_ea_size;
+       void          *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */
+       int             ddp_mount_type;
+       unsigned long long ddp_maxbytes;
+       /* percentage of available space to reserve for grant error margin */
+       int             ddp_grant_reserved;
+       /* per-inode space consumption */
+       short         ddp_inodespace;
+       /* per-fragment grant overhead to be used by client for grant
+        * calculation */
+       int             ddp_grant_frag;
+};
+
+/**
+ * Per-transaction commit callback function
+ */
+struct dt_txn_commit_cb;
+typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th,
+                       struct dt_txn_commit_cb *cb, int err);
+/**
+ * Special per-transaction callback for cases when just a commit callback
+ * is needed and per-device callbacks are not convenient to use
+ */
+#define TRANS_COMMIT_CB_MAGIC  0xa0a00a0a
+#define MAX_COMMIT_CB_STR_LEN  32
+
+struct dt_txn_commit_cb {
+       struct list_head        dcb_linkage;
+       dt_cb_t         dcb_func;
+       __u32           dcb_magic;
+       char            dcb_name[MAX_COMMIT_CB_STR_LEN];
+};
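+/*
+ * Sketch (the callback function name is hypothetical): the structure above
+ * is filled in and registered through dt_device_operations::dt_trans_cb_add()
+ * before the transaction is stopped:
+ *
+ *     dcb->dcb_func  = my_commit_cb;
+ *     dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC;
+ *     strlcpy(dcb->dcb_name, "my_cb", sizeof(dcb->dcb_name));
+ */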
+
+/**
+ * Operations on dt device.
+ */
+struct dt_device_operations {
+       /**
+        * Return device-wide statistics.
+        */
+       int   (*dt_statfs)(const struct lu_env *env,
+                          struct dt_device *dev, struct obd_statfs *osfs);
+       /**
+        * Create a new transaction on \a dev.
+        */
+       struct thandle *(*dt_trans_create)(const struct lu_env *env,
+                                          struct dt_device *dev);
+       /**
+        * Start the previously created transaction \a th.
+        */
+       int   (*dt_trans_start)(const struct lu_env *env,
+                               struct dt_device *dev, struct thandle *th);
+       /**
+        * Finish previously started transaction.
+        */
+       int   (*dt_trans_stop)(const struct lu_env *env,
+                              struct thandle *th);
+       /**
+        * Add commit callback to the transaction.
+        */
+       int   (*dt_trans_cb_add)(struct thandle *th,
+                                struct dt_txn_commit_cb *dcb);
+       /**
+        * Return fid of root index object.
+        */
+       int   (*dt_root_get)(const struct lu_env *env,
+                            struct dt_device *dev, struct lu_fid *f);
+       /**
+        * Return device configuration data.
+        */
+       void  (*dt_conf_get)(const struct lu_env *env,
+                            const struct dt_device *dev,
+                            struct dt_device_param *param);
+       /**
+        *  handling device state, mostly for tests
+        */
+       int   (*dt_sync)(const struct lu_env *env, struct dt_device *dev);
+       int   (*dt_ro)(const struct lu_env *env, struct dt_device *dev);
+       /**
+         * Start a transaction commit asynchronously
+         *
+         * \param env environment
+         * \param dev dt_device to start commit on
+         *
+         * \return 0 success, negative value if error
+         */
+        int   (*dt_commit_async)(const struct lu_env *env,
+                                 struct dt_device *dev);
+       /**
+        * Initialize capability context.
+        */
+       int   (*dt_init_capa_ctxt)(const struct lu_env *env,
+                                  struct dt_device *dev,
+                                  int mode, unsigned long timeout,
+                                  __u32 alg, struct lustre_capa_key *keys);
+};
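+/*
+ * Sketch of a transaction life cycle built from the methods above
+ * (illustration only; dd_ops is assumed to be the device's operation vector,
+ * error handling abbreviated):
+ *
+ *     th = dev->dd_ops->dt_trans_create(env, dev);
+ *     rc = dev->dd_ops->dt_trans_start(env, dev, th);
+ *     ... declare and apply updates against th ...
+ *     rc = dev->dd_ops->dt_trans_stop(env, th);
+ */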
+
+struct dt_index_features {
+       /** required feature flags from enum dt_index_flags */
+       __u32 dif_flags;
+       /** minimal required key size */
+       size_t dif_keysize_min;
+       /** maximal required key size, 0 if no limit */
+       size_t dif_keysize_max;
+       /** minimal required record size */
+       size_t dif_recsize_min;
+       /** maximal required record size, 0 if no limit */
+       size_t dif_recsize_max;
+       /** pointer size for record */
+       size_t dif_ptrsize;
+};
+
+enum dt_index_flags {
+       /** index supports variable sized keys */
+       DT_IND_VARKEY = 1 << 0,
+       /** index supports variable sized records */
+       DT_IND_VARREC = 1 << 1,
+       /** index can be modified */
+       DT_IND_UPDATE = 1 << 2,
+       /** index supports records with non-unique (duplicate) keys */
+       DT_IND_NONUNQ = 1 << 3,
+       /**
+        * index supports fixed-size keys sorted in natural numerical order
+        * and is able to return the left-side value if no exact match is found
+        */
+       DT_IND_RANGE = 1 << 4,
+};
+
+/**
+ * Features, required from index to support file system directories (mapping
+ * names to fids).
+ */
+extern const struct dt_index_features dt_directory_features;
+extern const struct dt_index_features dt_otable_features;
+extern const struct dt_index_features dt_lfsck_features;
+
+/* index features supported by the accounting objects */
+extern const struct dt_index_features dt_acct_features;
+
+/* index features supported by the quota global indexes */
+extern const struct dt_index_features dt_quota_glb_features;
+
+/* index features supported by the quota slave indexes */
+extern const struct dt_index_features dt_quota_slv_features;
+
+/**
+ * This is a general purpose dt allocation hint.
+ * It now contains the parent object.
+ * It can contain any allocation hint in the future.
+ */
+struct dt_allocation_hint {
+       struct dt_object           *dah_parent;
+       __u32                  dah_mode;
+};
+
+/**
+ * object type specifier.
+ */
+
+enum dt_format_type {
+       DFT_REGULAR,
+       DFT_DIR,
+       /** for mknod */
+       DFT_NODE,
+       /** for special index */
+       DFT_INDEX,
+       /** for symbolic link */
+       DFT_SYM,
+};
+
+/**
+ * object format specifier.
+ */
+struct dt_object_format {
+       /** type for dt object */
+       enum dt_format_type dof_type;
+       union {
+               struct dof_regular {
+                       int striped;
+               } dof_reg;
+               struct dof_dir {
+               } dof_dir;
+               struct dof_node {
+               } dof_node;
+               /**
+                * a special index needs the feature set as a parameter
+                * to create the index
+                */
+               struct dof_index {
+                       const struct dt_index_features *di_feat;
+               } dof_idx;
+       } u;
+};
+
+enum dt_format_type dt_mode_to_dft(__u32 mode);
+
+typedef __u64 dt_obj_version_t;
+
+/**
+ * Per-dt-object operations.
+ */
+struct dt_object_operations {
+       void  (*do_read_lock)(const struct lu_env *env,
+                             struct dt_object *dt, unsigned role);
+       void  (*do_write_lock)(const struct lu_env *env,
+                              struct dt_object *dt, unsigned role);
+       void  (*do_read_unlock)(const struct lu_env *env,
+                               struct dt_object *dt);
+       void  (*do_write_unlock)(const struct lu_env *env,
+                                struct dt_object *dt);
+       int  (*do_write_locked)(const struct lu_env *env,
+                               struct dt_object *dt);
+       /**
+        * Note: following ->do_{x,}attr_{set,get}() operations are very
+        * similar to ->moo_{x,}attr_{set,get}() operations in struct
+        * md_object_operations (see md_object.h). These operations are not in
+        * lu_object_operations, because ->do_{x,}attr_set() versions take
+        * transaction handle as an argument (this transaction is started by
+        * caller). We might factor ->do_{x,}attr_get() into
+        * lu_object_operations, but that would break existing symmetry.
+        */
+
+       /**
+        * Return standard attributes.
+        *
+        * precondition: lu_object_exists(&dt->do_lu);
+        */
+       int   (*do_attr_get)(const struct lu_env *env,
+                            struct dt_object *dt, struct lu_attr *attr,
+                            struct lustre_capa *capa);
+       /**
+        * Set standard attributes.
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_attr_set)(const struct lu_env *env,
+                                    struct dt_object *dt,
+                                    const struct lu_attr *attr,
+                                    struct thandle *handle);
+       int   (*do_attr_set)(const struct lu_env *env,
+                            struct dt_object *dt,
+                            const struct lu_attr *attr,
+                            struct thandle *handle,
+                            struct lustre_capa *capa);
+       /**
+        * Return a value of an extended attribute.
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt,
+                             struct lu_buf *buf, const char *name,
+                             struct lustre_capa *capa);
+       /**
+        * Set value of an extended attribute.
+        *
+        * \a fl - flags from enum lu_xattr_flags
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_xattr_set)(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     const struct lu_buf *buf,
+                                     const char *name, int fl,
+                                     struct thandle *handle);
+       int   (*do_xattr_set)(const struct lu_env *env,
+                             struct dt_object *dt, const struct lu_buf *buf,
+                             const char *name, int fl, struct thandle *handle,
+                             struct lustre_capa *capa);
+       /**
+        * Delete existing extended attribute.
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_xattr_del)(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     const char *name, struct thandle *handle);
+       int   (*do_xattr_del)(const struct lu_env *env,
+                             struct dt_object *dt,
+                             const char *name, struct thandle *handle,
+                             struct lustre_capa *capa);
+       /**
+        * Place list of existing extended attributes into \a buf (which has
+        * length len).
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_xattr_list)(const struct lu_env *env,
+                              struct dt_object *dt, struct lu_buf *buf,
+                              struct lustre_capa *capa);
+       /**
+        * Init allocation hint using parent object and child mode.
+        * (1) The \a parent might be NULL if this is a partial creation for
+        *     a remote object.
+        * (2) The type of the child is in \a child_mode.
+        * (3) The resulting hint is stored in \a ah.
+        */
+       void  (*do_ah_init)(const struct lu_env *env,
+                           struct dt_allocation_hint *ah,
+                           struct dt_object *parent,
+                           struct dt_object *child,
+                           umode_t child_mode);
+       /**
+        * Create new object on this device.
+        *
+        * precondition: !dt_object_exists(dt);
+        * postcondition: ergo(result == 0, dt_object_exists(dt));
+        */
+       int   (*do_declare_create)(const struct lu_env *env,
+                                  struct dt_object *dt,
+                                  struct lu_attr *attr,
+                                  struct dt_allocation_hint *hint,
+                                  struct dt_object_format *dof,
+                                  struct thandle *th);
+       int   (*do_create)(const struct lu_env *env, struct dt_object *dt,
+                          struct lu_attr *attr,
+                          struct dt_allocation_hint *hint,
+                          struct dt_object_format *dof,
+                          struct thandle *th);
+
+       /**
+        * Destroy object on this device.
+        *
+        * precondition: dt_object_exists(dt);
+        * postcondition: ergo(result == 0, !dt_object_exists(dt));
+        */
+       int   (*do_declare_destroy)(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   struct thandle *th);
+       int   (*do_destroy)(const struct lu_env *env, struct dt_object *dt,
+                           struct thandle *th);
+
+       /**
+        * Announce that this object is going to be used as an index. This
+        * operation checks that the object supports indexing operations and
+        * installs the appropriate dt_index_operations vector on success.
+        *
+        * Also probes for features. Operation is successful if all required
+        * features are supported.
+        */
+       int   (*do_index_try)(const struct lu_env *env,
+                             struct dt_object *dt,
+                             const struct dt_index_features *feat);
+       /**
+        * Increment the nlink count of the object.
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_ref_add)(const struct lu_env *env,
+                                   struct dt_object *dt, struct thandle *th);
+       int   (*do_ref_add)(const struct lu_env *env,
+                           struct dt_object *dt, struct thandle *th);
+       /**
+        * Decrement the nlink count of the object.
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_ref_del)(const struct lu_env *env,
+                                   struct dt_object *dt, struct thandle *th);
+       int   (*do_ref_del)(const struct lu_env *env,
+                           struct dt_object *dt, struct thandle *th);
+
+       struct obd_capa *(*do_capa_get)(const struct lu_env *env,
+                                       struct dt_object *dt,
+                                       struct lustre_capa *old,
+                                       __u64 opc);
+       int (*do_object_sync)(const struct lu_env *, struct dt_object *);
+       /**
+        * Get object info of the next level. Currently this only gets the
+        * inode from the OSD layer, and is only used by quota (b=16542).
+        * precondition: dt_object_exists(dt);
+        */
+       int (*do_data_get)(const struct lu_env *env, struct dt_object *dt,
+                          void **data);
+
+       /**
+        * Lock object.
+        */
+       int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt,
+                             struct lustre_handle *lh,
+                             struct ldlm_enqueue_info *einfo,
+                             void *policy);
+};
+
+/**
+ * Per-dt-object operations on "file body".
+ */
+struct dt_body_operations {
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt,
+                           struct lu_buf *buf, loff_t *pos,
+                           struct lustre_capa *capa);
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       ssize_t (*dbo_declare_write)(const struct lu_env *env,
+                                    struct dt_object *dt,
+                                    const loff_t size, loff_t pos,
+                                    struct thandle *handle);
+       ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt,
+                            const struct lu_buf *buf, loff_t *pos,
+                            struct thandle *handle, struct lustre_capa *capa,
+                            int ignore_quota);
+       /*
+        * methods for zero-copy IO
+        */
+
+       /*
+        * precondition: dt_object_exists(dt);
+        * returns:
+        * < 0 - error code
+        * = 0 - illegal
+        * > 0 - number of local buffers prepared
+        */
+       int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt,
+                           loff_t pos, ssize_t len, struct niobuf_local *lb,
+                           int rw, struct lustre_capa *capa);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt,
+                           struct niobuf_local *lb, int nr);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt,
+                             struct niobuf_local *lb, int nr);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_declare_write_commit)(const struct lu_env *env,
+                                       struct dt_object *dt,
+                                       struct niobuf_local *,
+                                       int, struct thandle *);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt,
+                               struct niobuf_local *, int, struct thandle *);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt,
+                            struct niobuf_local *lnb, int nr);
+       int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt,
+                             struct ll_user_fiemap *fm);
+       /**
+        * Punch object's content
+        * precondition: regular object, not index
+        */
+       int   (*dbo_declare_punch)(const struct lu_env *, struct dt_object *,
+                                 __u64, __u64, struct thandle *th);
+       int   (*dbo_punch)(const struct lu_env *env, struct dt_object *dt,
+                         __u64 start, __u64 end, struct thandle *th,
+                         struct lustre_capa *capa);
+};
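+
+/*
+ * A rough sketch of how a server-side bulk write is expected to drive the
+ * zero-copy hooks above via the dt_bufs_get()/dt_write_prep()/
+ * dt_write_commit() wrappers defined later in this header (illustrative
+ * only; obj, env, th, rnb, lnb, capa and error handling come from the
+ * caller, and the declare step belongs to the transaction declaration
+ * phase, i.e. before dt_trans_start()):
+ *
+ *     nr = dt_bufs_get(env, obj, rnb, lnb, 1, capa);     (rw = 1 for write)
+ *     rc = dt_write_prep(env, obj, lnb, nr);
+ *     rc = dt_declare_write_commit(env, obj, lnb, nr, th);
+ *     ... dt_trans_start(env, dev, th) ...
+ *     rc = dt_write_commit(env, obj, lnb, nr, th);
+ *     dt_bufs_put(env, obj, lnb, nr);
+ */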
+
+/**
+ * Incomplete type of index record.
+ */
+struct dt_rec;
+
+/**
+ * Incomplete type of index key.
+ */
+struct dt_key;
+
+/**
+ * Incomplete type of dt iterator.
+ */
+struct dt_it;
+
+/**
+ * Per-dt-object operations on object as index.
+ */
+struct dt_index_operations {
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt,
+                         struct dt_rec *rec, const struct dt_key *key,
+                         struct lustre_capa *capa);
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dio_declare_insert)(const struct lu_env *env,
+                                 struct dt_object *dt,
+                                 const struct dt_rec *rec,
+                                 const struct dt_key *key,
+                                 struct thandle *handle);
+       int (*dio_insert)(const struct lu_env *env, struct dt_object *dt,
+                         const struct dt_rec *rec, const struct dt_key *key,
+                         struct thandle *handle, struct lustre_capa *capa,
+                         int ignore_quota);
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dio_declare_delete)(const struct lu_env *env,
+                                 struct dt_object *dt,
+                                 const struct dt_key *key,
+                                 struct thandle *handle);
+       int (*dio_delete)(const struct lu_env *env, struct dt_object *dt,
+                         const struct dt_key *key, struct thandle *handle,
+                         struct lustre_capa *capa);
+       /**
+        * Iterator interface
+        */
+       struct dt_it_ops {
+               /**
+                * Allocate and initialize new iterator.
+                *
+                * precondition: dt_object_exists(dt);
+                */
+               struct dt_it *(*init)(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     __u32 attr,
+                                     struct lustre_capa *capa);
+               void      (*fini)(const struct lu_env *env,
+                                     struct dt_it *di);
+               int         (*get)(const struct lu_env *env,
+                                     struct dt_it *di,
+                                     const struct dt_key *key);
+               void       (*put)(const struct lu_env *env,
+                                     struct dt_it *di);
+               int        (*next)(const struct lu_env *env,
+                                     struct dt_it *di);
+               struct dt_key *(*key)(const struct lu_env *env,
+                                     const struct dt_it *di);
+               int       (*key_size)(const struct lu_env *env,
+                                     const struct dt_it *di);
+               int         (*rec)(const struct lu_env *env,
+                                     const struct dt_it *di,
+                                     struct dt_rec *rec,
+                                     __u32 attr);
+               __u64   (*store)(const struct lu_env *env,
+                                     const struct dt_it *di);
+               int        (*load)(const struct lu_env *env,
+                                     const struct dt_it *di, __u64 hash);
+               int     (*key_rec)(const struct lu_env *env,
+                                     const struct dt_it *di, void* key_rec);
+       } dio_it;
+};
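+
+/*
+ * Typical use of the dio_it interface above, assuming the object has
+ * already been set up as an index with ->do_index_try() (a minimal,
+ * illustrative sketch; the exact return-value conventions of load()/get()
+ * and next() are those of the specific implementation):
+ *
+ *     it = iops->init(env, obj, attr, capa);
+ *     rc = iops->load(env, it, hash);        (or iops->get(env, it, key))
+ *     while (rc == 0) {
+ *             key = iops->key(env, it);
+ *             rc  = iops->rec(env, it, rec, attr);
+ *             ...
+ *             rc = iops->next(env, it);      (non-zero means end or error)
+ *     }
+ *     iops->put(env, it);
+ *     iops->fini(env, it);
+ */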
+
+enum dt_otable_it_valid {
+       DOIV_ERROR_HANDLE       = 0x0001,
+};
+
+enum dt_otable_it_flags {
+       /* Exit on failure. */
+       DOIF_FAILOUT    = 0x0001,
+
+       /* Reset iteration position to the device beginning. */
+       DOIF_RESET      = 0x0002,
+
+       /* An upper-layer component is using the iteration. */
+       DOIF_OUTUSED    = 0x0004,
+};
+
+/* otable based iteration needs to use the common DT iteration APIs.
+ * To initialize the iteration, dio_it::init() must be called first.
+ * Here is how the otable based iteration should prepare arguments to
+ * call dt_it_ops::init().
+ *
+ * For otable based iteration, the 32-bit 'attr' for dt_it_ops::init()
+ * is composed of two parts:
+ * the low 16 bits are valid bits, the high 16 bits are flag bits. */
+#define DT_OTABLE_IT_FLAGS_SHIFT       16
+#define DT_OTABLE_IT_FLAGS_MASK        0xffff0000
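+
+/*
+ * For example, an otable iteration that wants DOIF_RESET behaviour together
+ * with the DOIV_ERROR_HANDLE valid bit would build the 'attr' argument for
+ * dt_it_ops::init() as (illustrative only):
+ *
+ *     __u32 attr = DOIV_ERROR_HANDLE |
+ *                  (DOIF_RESET << DT_OTABLE_IT_FLAGS_SHIFT);
+ *
+ * and the flag bits can be recovered with
+ *     (attr & DT_OTABLE_IT_FLAGS_MASK) >> DT_OTABLE_IT_FLAGS_SHIFT
+ */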
+
+struct dt_device {
+       struct lu_device                   dd_lu_dev;
+       const struct dt_device_operations *dd_ops;
+
+       /**
+        * List of dt_txn_callback (see below). This is not protected in any
+        * way, because callbacks are supposed to be added/deleted only during
+        * single-threaded start-up and shut-down procedures.
+        */
+       struct list_head                         dd_txn_callbacks;
+};
+
+int  dt_device_init(struct dt_device *dev, struct lu_device_type *t);
+void dt_device_fini(struct dt_device *dev);
+
+static inline int lu_device_is_dt(const struct lu_device *d)
+{
+       return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT);
+}
+
+static inline struct dt_device *lu2dt_dev(struct lu_device *l)
+{
+       LASSERT(lu_device_is_dt(l));
+       return container_of0(l, struct dt_device, dd_lu_dev);
+}
+
+struct dt_object {
+       struct lu_object                   do_lu;
+       const struct dt_object_operations *do_ops;
+       const struct dt_body_operations   *do_body_ops;
+       const struct dt_index_operations  *do_index_ops;
+};
+
+/*
+ * In-core representation of per-device local object OID storage
+ */
+struct local_oid_storage {
+       /* all initialized llog systems on this node linked by this */
+       struct list_head          los_list;
+
+       /* how many handles reference this los */
+       atomic_t          los_refcount;
+       struct dt_device *los_dev;
+       struct dt_object *los_obj;
+
+       /* data used to generate new fids */
+       struct mutex     los_id_lock;
+       __u64             los_seq;
+       __u32             los_last_oid;
+};
+
+static inline struct dt_object *lu2dt(struct lu_object *l)
+{
+       LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev));
+       return container_of0(l, struct dt_object, do_lu);
+}
+
+int  dt_object_init(struct dt_object *obj,
+                   struct lu_object_header *h, struct lu_device *d);
+
+void dt_object_fini(struct dt_object *obj);
+
+static inline int dt_object_exists(const struct dt_object *dt)
+{
+       return lu_object_exists(&dt->do_lu);
+}
+
+static inline int dt_object_remote(const struct dt_object *dt)
+{
+       return lu_object_remote(&dt->do_lu);
+}
+
+static inline struct dt_object *lu2dt_obj(struct lu_object *o)
+{
+       LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev)));
+       return container_of0(o, struct dt_object, do_lu);
+}
+
+/**
+ * This is the general purpose transaction handle.
+ * 1. Transaction Life Cycle
+ *      This transaction handle is allocated upon starting a new transaction,
+ *      and deallocated after this transaction is committed.
+ * 2. Transaction Nesting
+ *      We do _NOT_ support nested transactions. So every thread should only
+ *      have one active transaction, and a transaction only belongs to one
+ *      thread. Because of this, the transaction handle needs no reference
+ *      count.
+ * 3. Transaction & dt_object locking
+ *      dt_object locks should be taken inside a transaction.
+ * 4. Transaction & RPC
+ *      No RPC request should be issued inside a transaction.
+ */
+struct thandle {
+       /** the dt device on which the transactions are executed */
+       struct dt_device *th_dev;
+
+       /** context for this transaction, tag is LCT_TX_HANDLE */
+       struct lu_context th_ctx;
+
+       /** additional tags (layers can add in declare) */
+       __u32        th_tags;
+
+       /** the last operation result in this transaction;
+        * this value is used in recovery */
+       __s32        th_result;
+
+       /** whether we need sync commit */
+       unsigned int            th_sync:1;
+
+       /* local transaction, no need to inform other layers */
+       unsigned int            th_local:1;
+
+       /* In DNE, one transaction can be disassembled into
+        * updates on several different MDTs, and these updates
+        * will be attached to th_remote_update_list per target.
+        * Only a single thread will access the list, so no lock is needed.
+        */
+       struct list_head                th_remote_update_list;
+       struct update_request   *th_current_request;
+};
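+
+/*
+ * The usual declare/execute pattern for a transaction, following the rules
+ * in the comment above (a minimal sketch; 'dev', 'obj' and 'attr' are
+ * assumed to exist, error handling is abbreviated, and a NULL capa is used
+ * purely for illustration):
+ *
+ *     th = dt_trans_create(env, dev);
+ *     if (IS_ERR(th))
+ *             return PTR_ERR(th);
+ *     rc = dt_declare_attr_set(env, obj, attr, th);
+ *     if (rc == 0)
+ *             rc = dt_trans_start(env, dev, th);
+ *     if (rc == 0) {
+ *             dt_write_lock(env, obj, 0);
+ *             rc = dt_attr_set(env, obj, attr, th, NULL);
+ *             dt_write_unlock(env, obj);
+ *     }
+ *     dt_trans_stop(env, dev, th);
+ */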
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by osd (or underlying transaction engine) when
+ * transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and to
+ * perform some actions for each transaction state transition. A typical
+ * example is the mdt registering a call-back to write into the last-received
+ * file before each transaction commit.
+ */
+struct dt_txn_callback {
+       int (*dtc_txn_start)(const struct lu_env *env,
+                            struct thandle *txn, void *cookie);
+       int (*dtc_txn_stop)(const struct lu_env *env,
+                           struct thandle *txn, void *cookie);
+       void (*dtc_txn_commit)(struct thandle *txn, void *cookie);
+       void            *dtc_cookie;
+       __u32           dtc_tag;
+       struct list_head           dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
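+
+/*
+ * Registering a transaction hook is expected to look roughly like this
+ * (illustrative sketch; my_txn_stop and my_data are hypothetical and only
+ * show the shape of the API):
+ *
+ *     static int my_txn_stop(const struct lu_env *env, struct thandle *txn,
+ *                            void *cookie)
+ *     {
+ *             ... write recovery data before the transaction commits ...
+ *             return 0;
+ *     }
+ *
+ *     static struct dt_txn_callback my_cb = {
+ *             .dtc_txn_stop = my_txn_stop,
+ *             .dtc_cookie   = &my_data,
+ *     };
+ *
+ *     dt_txn_callback_add(dev, &my_cb);      (at setup time)
+ *     dt_txn_callback_del(dev, &my_cb);      (at cleanup time)
+ */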
+
+int dt_txn_hook_start(const struct lu_env *env,
+                     struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+void dt_txn_hook_commit(struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
+ * \see llo_store_resolve
+ */
+typedef int (*dt_entry_func_t)(const struct lu_env *env,
+                           const char *name,
+                           void *pvt);
+
+#define DT_MAX_PATH 1024
+
+int dt_path_parser(const struct lu_env *env,
+                  char *local, dt_entry_func_t entry_func,
+                  void *data);
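+
+/*
+ * An entry callback for dt_path_parser() is assumed to be invoked once per
+ * path component (illustrative skeleton only; print_entry is hypothetical):
+ *
+ *     static int print_entry(const struct lu_env *env, const char *name,
+ *                            void *pvt)
+ *     {
+ *             CDEBUG(D_INFO, "component: %s\n", name);
+ *             return 0;               (non-zero stops the parse)
+ *     }
+ *
+ *     rc = dt_path_parser(env, path_copy, print_entry, NULL);
+ */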
+
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+                const char *path, struct lu_fid *fid);
+
+struct dt_object *dt_store_open(const struct lu_env *env,
+                               struct dt_device *dt,
+                               const char *dirname,
+                               const char *filename,
+                               struct lu_fid *fid);
+
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+                                   struct dt_device *dt,
+                                   const struct lu_fid *fid,
+                                   struct dt_object_format *dof,
+                                   struct lu_attr *attr);
+
+struct dt_object *dt_locate_at(const struct lu_env *env,
+                              struct dt_device *dev,
+                              const struct lu_fid *fid,
+                              struct lu_device *top_dev);
+static inline struct dt_object *
+dt_locate(const struct lu_env *env, struct dt_device *dev,
+         const struct lu_fid *fid)
+{
+       return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev);
+}
+
+
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+                          const struct lu_fid *first_fid,
+                          struct local_oid_storage **los);
+void local_oid_storage_fini(const struct lu_env *env,
+                           struct local_oid_storage *los);
+int local_object_fid_generate(const struct lu_env *env,
+                             struct local_oid_storage *los,
+                             struct lu_fid *fid);
+int local_object_declare_create(const struct lu_env *env,
+                               struct local_oid_storage *los,
+                               struct dt_object *o,
+                               struct lu_attr *attr,
+                               struct dt_object_format *dof,
+                               struct thandle *th);
+int local_object_create(const struct lu_env *env,
+                       struct local_oid_storage *los,
+                       struct dt_object *o,
+                       struct lu_attr *attr, struct dt_object_format *dof,
+                       struct thandle *th);
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+                                           struct local_oid_storage *los,
+                                           struct dt_object *parent,
+                                           const char *name, __u32 mode);
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+                                                    struct dt_device *dt,
+                                                    const struct lu_fid *fid,
+                                                    struct dt_object *parent,
+                                                    const char *name,
+                                                    __u32 mode);
+struct dt_object *
+local_index_find_or_create(const struct lu_env *env,
+                          struct local_oid_storage *los,
+                          struct dt_object *parent,
+                          const char *name, __u32 mode,
+                          const struct dt_index_features *ft);
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+                                   struct dt_device *dt,
+                                   const struct lu_fid *fid,
+                                   struct dt_object *parent,
+                                   const char *name, __u32 mode,
+                                   const struct dt_index_features *ft);
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+                       struct dt_object *parent, const char *name);
+
+static inline int dt_object_lock(const struct lu_env *env,
+                                struct dt_object *o, struct lustre_handle *lh,
+                                struct ldlm_enqueue_info *einfo,
+                                void *policy)
+{
+       LASSERT(o);
+       LASSERT(o->do_ops);
+       LASSERT(o->do_ops->do_object_lock);
+       return o->do_ops->do_object_lock(env, o, lh, einfo, policy);
+}
+
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+                 const char *name, struct lu_fid *fid);
+
+static inline int dt_object_sync(const struct lu_env *env,
+                                struct dt_object *o)
+{
+       LASSERT(o);
+       LASSERT(o->do_ops);
+       LASSERT(o->do_ops->do_object_sync);
+       return o->do_ops->do_object_sync(env, o);
+}
+
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+                          struct thandle *th);
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+                   dt_obj_version_t version, struct thandle *th);
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o);
+
+
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+           struct lu_buf *buf, loff_t *pos);
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+                  struct lu_buf *buf, loff_t *pos);
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+                   const struct lu_buf *buf, loff_t *pos, struct thandle *th);
+typedef int (*dt_index_page_build_t)(const struct lu_env *env,
+                                    union lu_page *lp, int nob,
+                                    const struct dt_it_ops *iops,
+                                    struct dt_it *it, __u32 attr, void *arg);
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+                 const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+                 void *arg);
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+                 struct idx_info *ii, const struct lu_rdpg *rdpg);
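+
+/*
+ * dt_index_walk() calls the supplied filler once per container page; the
+ * filler pulls records through the iterator and packs them into 'lp' until
+ * 'nob' bytes are used (a hedged skeleton only; the packing format is up to
+ * the caller, and demo_filler is hypothetical):
+ *
+ *     static int demo_filler(const struct lu_env *env, union lu_page *lp,
+ *                            int nob, const struct dt_it_ops *iops,
+ *                            struct dt_it *it, __u32 attr, void *arg)
+ *     {
+ *             ... copy iops->key()/iops->rec() results into lp,
+ *                 advancing with iops->next(), while space remains ...
+ *             return 0;
+ *     }
+ */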
+
+static inline struct thandle *dt_trans_create(const struct lu_env *env,
+                                             struct dt_device *d)
+{
+       LASSERT(d->dd_ops->dt_trans_create);
+       return d->dd_ops->dt_trans_create(env, d);
+}
+
+static inline int dt_trans_start(const struct lu_env *env,
+                                struct dt_device *d, struct thandle *th)
+{
+       LASSERT(d->dd_ops->dt_trans_start);
+       return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+/* for this transaction hooks shouldn't be called */
+static inline int dt_trans_start_local(const struct lu_env *env,
+                                      struct dt_device *d, struct thandle *th)
+{
+       LASSERT(d->dd_ops->dt_trans_start);
+       th->th_local = 1;
+       return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+static inline int dt_trans_stop(const struct lu_env *env,
+                               struct dt_device *d, struct thandle *th)
+{
+       LASSERT(d->dd_ops->dt_trans_stop);
+       return d->dd_ops->dt_trans_stop(env, th);
+}
+
+static inline int dt_trans_cb_add(struct thandle *th,
+                                 struct dt_txn_commit_cb *dcb)
+{
+       LASSERT(th->th_dev->dd_ops->dt_trans_cb_add);
+       dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC;
+       return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb);
+}
+/** @} dt */
+
+
+static inline int dt_declare_record_write(const struct lu_env *env,
+                                         struct dt_object *dt,
+                                         int size, loff_t pos,
+                                         struct thandle *th)
+{
+       int rc;
+
+       LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+       LASSERT(th != NULL);
+       LASSERT(dt->do_body_ops);
+       LASSERT(dt->do_body_ops->dbo_declare_write);
+       rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th);
+       return rc;
+}
+
+static inline int dt_declare_create(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   struct lu_attr *attr,
+                                   struct dt_allocation_hint *hint,
+                                   struct dt_object_format *dof,
+                                   struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_create);
+       return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_create(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   struct lu_attr *attr,
+                                   struct dt_allocation_hint *hint,
+                                   struct dt_object_format *dof,
+                                   struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_create);
+       return dt->do_ops->do_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_declare_destroy(const struct lu_env *env,
+                                    struct dt_object *dt,
+                                    struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_destroy);
+       return dt->do_ops->do_declare_destroy(env, dt, th);
+}
+
+static inline int dt_destroy(const struct lu_env *env,
+                            struct dt_object *dt,
+                            struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_destroy);
+       return dt->do_ops->do_destroy(env, dt, th);
+}
+
+static inline void dt_read_lock(const struct lu_env *env,
+                               struct dt_object *dt,
+                               unsigned role)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_read_lock);
+       dt->do_ops->do_read_lock(env, dt, role);
+}
+
+static inline void dt_write_lock(const struct lu_env *env,
+                               struct dt_object *dt,
+                               unsigned role)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_write_lock);
+       dt->do_ops->do_write_lock(env, dt, role);
+}
+
+static inline void dt_read_unlock(const struct lu_env *env,
+                               struct dt_object *dt)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_read_unlock);
+       dt->do_ops->do_read_unlock(env, dt);
+}
+
+static inline void dt_write_unlock(const struct lu_env *env,
+                               struct dt_object *dt)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_write_unlock);
+       dt->do_ops->do_write_unlock(env, dt);
+}
+
+static inline int dt_write_locked(const struct lu_env *env,
+                                 struct dt_object *dt)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_write_locked);
+       return dt->do_ops->do_write_locked(env, dt);
+}
+
+static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt,
+                             struct lu_attr *la, void *arg)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_attr_get);
+       return dt->do_ops->do_attr_get(env, dt, la, arg);
+}
+
+static inline int dt_declare_attr_set(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     const struct lu_attr *la,
+                                     struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_attr_set);
+       return dt->do_ops->do_declare_attr_set(env, dt, la, th);
+}
+
+static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt,
+                             const struct lu_attr *la, struct thandle *th,
+                             struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_attr_set);
+       return dt->do_ops->do_attr_set(env, dt, la, th, capa);
+}
+
+static inline int dt_declare_ref_add(const struct lu_env *env,
+                                    struct dt_object *dt, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_ref_add);
+       return dt->do_ops->do_declare_ref_add(env, dt, th);
+}
+
+static inline int dt_ref_add(const struct lu_env *env,
+                            struct dt_object *dt, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_ref_add);
+       return dt->do_ops->do_ref_add(env, dt, th);
+}
+
+static inline int dt_declare_ref_del(const struct lu_env *env,
+                                    struct dt_object *dt, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_ref_del);
+       return dt->do_ops->do_declare_ref_del(env, dt, th);
+}
+
+static inline int dt_ref_del(const struct lu_env *env,
+                            struct dt_object *dt, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_ref_del);
+       return dt->do_ops->do_ref_del(env, dt, th);
+}
+
+static inline struct obd_capa *dt_capa_get(const struct lu_env *env,
+                                          struct dt_object *dt,
+                                          struct lustre_capa *old, __u64 opc)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_capa_get);
+       return dt->do_ops->do_capa_get(env, dt, old, opc);
+}
+
+static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d,
+                             struct niobuf_remote *rnb,
+                             struct niobuf_local *lnb, int rw,
+                             struct lustre_capa *capa)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_bufs_get);
+       return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset,
+                                           rnb->len, lnb, rw, capa);
+}
+
+static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d,
+                             struct niobuf_local *lnb, int n)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_bufs_put);
+       return d->do_body_ops->dbo_bufs_put(env, d, lnb, n);
+}
+
+static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d,
+                               struct niobuf_local *lnb, int n)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_write_prep);
+       return d->do_body_ops->dbo_write_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_write_commit(const struct lu_env *env,
+                                         struct dt_object *d,
+                                         struct niobuf_local *lnb,
+                                         int n, struct thandle *th)
+{
+       LASSERTF(d != NULL, "dt is NULL when we want to declare write\n");
+       LASSERT(th != NULL);
+       return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th);
+}
+
+
+static inline int dt_write_commit(const struct lu_env *env,
+                                 struct dt_object *d, struct niobuf_local *lnb,
+                                 int n, struct thandle *th)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_write_commit);
+       return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th);
+}
+
+static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d,
+                              struct niobuf_local *lnb, int n)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_read_prep);
+       return d->do_body_ops->dbo_read_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_punch(const struct lu_env *env,
+                                  struct dt_object *dt, __u64 start,
+                                  __u64 end, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_body_ops);
+       LASSERT(dt->do_body_ops->dbo_declare_punch);
+       return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th);
+}
+
+static inline int dt_punch(const struct lu_env *env, struct dt_object *dt,
+                          __u64 start, __u64 end, struct thandle *th,
+                          struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_body_ops);
+       LASSERT(dt->do_body_ops->dbo_punch);
+       return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa);
+}
+
+static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d,
+                               struct ll_user_fiemap *fm)
+{
+       LASSERT(d);
+       if (d->do_body_ops == NULL)
+               return -EPROTO;
+       if (d->do_body_ops->dbo_fiemap_get == NULL)
+               return -EOPNOTSUPP;
+       return d->do_body_ops->dbo_fiemap_get(env, d, fm);
+}
+
+static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev,
+                           struct obd_statfs *osfs)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_statfs);
+       return dev->dd_ops->dt_statfs(env, dev, osfs);
+}
+
+static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev,
+                             struct lu_fid *f)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_root_get);
+       return dev->dd_ops->dt_root_get(env, dev, f);
+}
+
+static inline void dt_conf_get(const struct lu_env *env,
+                              const struct dt_device *dev,
+                              struct dt_device_param *param)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_conf_get);
+       return dev->dd_ops->dt_conf_get(env, dev, param);
+}
+
+static inline int dt_sync(const struct lu_env *env, struct dt_device *dev)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_sync);
+       return dev->dd_ops->dt_sync(env, dev);
+}
+
+static inline int dt_ro(const struct lu_env *env, struct dt_device *dev)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_ro);
+       return dev->dd_ops->dt_ro(env, dev);
+}
+
+static inline int dt_declare_insert(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   const struct dt_rec *rec,
+                                   const struct dt_key *key,
+                                   struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_declare_insert);
+       return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th);
+}
+
+static inline int dt_insert(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   const struct dt_rec *rec,
+                                   const struct dt_key *key,
+                                   struct thandle *th,
+                                   struct lustre_capa *capa,
+                                   int noquota)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_insert);
+       return dt->do_index_ops->dio_insert(env, dt, rec, key, th,
+                                           capa, noquota);
+}
+
+static inline int dt_declare_xattr_del(const struct lu_env *env,
+                                      struct dt_object *dt,
+                                      const char *name,
+                                      struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_xattr_del);
+       return dt->do_ops->do_declare_xattr_del(env, dt, name, th);
+}
+
+static inline int dt_xattr_del(const struct lu_env *env,
+                              struct dt_object *dt, const char *name,
+                              struct thandle *th,
+                              struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_xattr_del);
+       return dt->do_ops->do_xattr_del(env, dt, name, th, capa);
+}
+
+static inline int dt_declare_xattr_set(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     const struct lu_buf *buf,
+                                     const char *name, int fl,
+                                     struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_xattr_set);
+       return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th);
+}
+
+static inline int dt_xattr_set(const struct lu_env *env,
+                             struct dt_object *dt, const struct lu_buf *buf,
+                             const char *name, int fl, struct thandle *th,
+                             struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_xattr_set);
+       return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa);
+}
+
+static inline int dt_xattr_get(const struct lu_env *env,
+                             struct dt_object *dt, struct lu_buf *buf,
+                             const char *name, struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_xattr_get);
+       return dt->do_ops->do_xattr_get(env, dt, buf, name, capa);
+}
+
+static inline int dt_xattr_list(const struct lu_env *env,
+                              struct dt_object *dt, struct lu_buf *buf,
+                              struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_xattr_list);
+       return dt->do_ops->do_xattr_list(env, dt, buf, capa);
+}
+
+static inline int dt_declare_delete(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   const struct dt_key *key,
+                                   struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_declare_delete);
+       return dt->do_index_ops->dio_declare_delete(env, dt, key, th);
+}
+
+static inline int dt_delete(const struct lu_env *env,
+                           struct dt_object *dt,
+                           const struct dt_key *key,
+                           struct thandle *th,
+                           struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_delete);
+       return dt->do_index_ops->dio_delete(env, dt, key, th, capa);
+}
+
+static inline int dt_commit_async(const struct lu_env *env,
+                                 struct dt_device *dev)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_commit_async);
+       return dev->dd_ops->dt_commit_async(env, dev);
+}
+
+static inline int dt_init_capa_ctxt(const struct lu_env *env,
+                                   struct dt_device *dev,
+                                   int mode, unsigned long timeout,
+                                   __u32 alg, struct lustre_capa_key *keys)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_init_capa_ctxt);
+       return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode,
+                                             timeout, alg, keys);
+}
+
+static inline int dt_lookup(const struct lu_env *env,
+                           struct dt_object *dt,
+                           struct dt_rec *rec,
+                           const struct dt_key *key,
+                           struct lustre_capa *capa)
+{
+       int ret;
+
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_lookup);
+
+       ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa);
+       if (ret > 0)
+               ret = 0;
+       else if (ret == 0)
+               ret = -ENOENT;
+       return ret;
+}
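+
+/*
+ * For directory-style indices the record is a FID and the key is the entry
+ * name, so a lookup typically looks like the following (this is essentially
+ * what dt_lookup_dir() above wraps; NULL capa is used only for
+ * illustration):
+ *
+ *     rc = dt_lookup(env, dir, (struct dt_rec *)fid,
+ *                    (const struct dt_key *)name, NULL);
+ *     (rc == 0 on success, -ENOENT if the name is not present)
+ */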
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+struct dt_find_hint {
+       struct lu_fid   *dfh_fid;
+       struct dt_device     *dfh_dt;
+       struct dt_object     *dfh_o;
+};
+
+struct dt_thread_info {
+       char                 dti_buf[DT_MAX_PATH];
+       struct dt_find_hint      dti_dfh;
+       struct lu_attr     dti_attr;
+       struct lu_fid       dti_fid;
+       struct dt_object_format  dti_dof;
+       struct lustre_mdt_attrs  dti_lma;
+       struct lu_buf       dti_lb;
+       loff_t             dti_off;
+};
+
+extern struct lu_context_key dt_key;
+
+static inline struct dt_thread_info *dt_info(const struct lu_env *env)
+{
+       struct dt_thread_info *dti;
+
+       dti = lu_context_key_get(&env->le_ctx, &dt_key);
+       LASSERT(dti);
+       return dti;
+}
+
+int dt_global_init(void);
+void dt_global_fini(void);
+
+# ifdef LPROCFS
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+                         int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+                             int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+                            int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+                             int count, int *eof, void *data);
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+                            int count, int *eof, void *data);
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+                           int count, int *eof, void *data);
+# endif /* LPROCFS */
+
+#endif /* __LUSTRE_DT_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h
new file mode 100644 (file)
index 0000000..dfdb8aa
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/interval_tree.h
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+
+#ifndef _INTERVAL_H__
+#define _INTERVAL_H__
+
+#include <linux/libcfs/libcfs.h>   /* LASSERT. */
+
+struct interval_node {
+       struct interval_node   *in_left;
+       struct interval_node   *in_right;
+       struct interval_node   *in_parent;
+       unsigned                in_color:1,
+                               in_intree:1, /** set if the node is in tree */
+                               in_res1:30;
+       __u8                in_res2[4];  /** tags, 8-bytes aligned */
+       __u64              in_max_high;
+       struct interval_node_extent {
+               __u64 start;
+               __u64 end;
+       } in_extent;
+};
+
+enum interval_iter {
+       INTERVAL_ITER_CONT = 1,
+       INTERVAL_ITER_STOP = 2
+};
+
+static inline int interval_is_intree(struct interval_node *node)
+{
+       return node->in_intree == 1;
+}
+
+static inline __u64 interval_low(struct interval_node *node)
+{
+       return node->in_extent.start;
+}
+
+static inline __u64 interval_high(struct interval_node *node)
+{
+       return node->in_extent.end;
+}
+
+static inline void interval_set(struct interval_node *node,
+                               __u64 start, __u64 end)
+{
+       LASSERT(start <= end);
+       node->in_extent.start = start;
+       node->in_extent.end = end;
+       node->in_max_high = end;
+}
+
+/* Rules to write an interval callback.
+ *  - the callback returns INTERVAL_ITER_STOP when it thinks the iteration
+ *    should be stopped. It will then cause the iteration function to return
+ *    immediately with return value INTERVAL_ITER_STOP.
+ *  - callbacks for interval_iterate and interval_iterate_reverse: every
+ *    node in the tree is passed as @node when the callback is called.
+ *  - callback for interval_search: only overlapping nodes are passed as
+ *    @node when the callback is called.
+ */
+typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
+                                                 void *args);
+
+struct interval_node *interval_insert(struct interval_node *node,
+                                     struct interval_node **root);
+void interval_erase(struct interval_node *node, struct interval_node **root);
+
+/* Search the extents in the tree and call @func for each overlapping
+ * extent. */
+enum interval_iter interval_search(struct interval_node *root,
+                                  struct interval_node_extent *ex,
+                                  interval_callback_t func, void *data);
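+
+/*
+ * A small usage sketch for the interval tree (illustrative only): insert a
+ * node covering [100, 200] and count how many nodes overlap [0, 4095]:
+ *
+ *     static enum interval_iter count_cb(struct interval_node *node,
+ *                                        void *args)
+ *     {
+ *             (*(int *)args)++;
+ *             return INTERVAL_ITER_CONT;
+ *     }
+ *
+ *     interval_set(&node, 100, 200);
+ *     interval_insert(&node, &root);
+ *     ext.start = 0; ext.end = 4095;
+ *     hits = 0;
+ *     interval_search(root, &ext, count_cb, &hits);
+ */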
+
+/* Iterate every node in the tree - by reverse order or regular order. */
+enum interval_iter interval_iterate(struct interval_node *root,
+                                   interval_callback_t func, void *data);
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+                                   interval_callback_t func, void *data);
+
+void interval_expand(struct interval_node *root,
+                    struct interval_node_extent *ext,
+                    struct interval_node_extent *limiter);
+int interval_is_overlapped(struct interval_node *root,
+                          struct interval_node_extent *ex);
+struct interval_node *interval_find(struct interval_node *root,
+                                   struct interval_node_extent *ex);
+#endif
diff --git a/drivers/staging/lustre/lustre/include/ioctl.h b/drivers/staging/lustre/lustre/include/ioctl.h
new file mode 100644 (file)
index 0000000..227c261
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _IOWR
+
+/* On i386 and x86_64, _ASM_I386_IOCTL_H is defined by the kernel's ioctl.h,
+ * and on newer kernels this header is shared as _ASM_GENERIC_IOCTL_H.
+ *
+ * We can avoid any problems with the kernel header being included again by
+ * defining _ASM_I386_IOCTL_H here so that a later occurrence of <asm/ioctl.h>
+ * does not include the kernel's ioctl.h after this one. b=14746 */
+#define _ASM_I386_IOCTL_H
+#define _ASM_GENERIC_IOCTL_H
+
+/* ioctl command encoding: 32 bits total, command in lower 16 bits,
+ * size of the parameter structure in the lower 14 bits of the
+ * upper 16 bits.
+ * The size of the parameter structure is encoded in the ioctl request.
+ * The highest 2 bits are reserved for indicating the ``access mode''.
+ * NOTE: This limits the max parameter size to 16 KiB - 1!
+ */
+
+/*
+ * The following is for compatibility across the various Linux
+ * platforms.  The i386 ioctl numbering scheme doesn't really enforce
+ * a type field.  De facto, however, the top 8 bits of the lower 16
+ * bits are indeed used as a type field, so we might just as well make
+ * this explicit here.  Please be sure to use the decoding macros
+ * below from now on.
+ */
+#define _IOC_NRBITS     8
+#define _IOC_TYPEBITS   8
+#define _IOC_SIZEBITS   14
+#define _IOC_DIRBITS    2
+
+#define _IOC_NRMASK     ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK   ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK   ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK    ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT    0
+#define _IOC_TYPESHIFT  (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE       0U
+#define _IOC_WRITE      1U
+#define _IOC_READ       2U
+
+#define _IOC(dir,type,nr,size) (((dir)  << _IOC_DIRSHIFT) | ((type) << _IOC_TYPESHIFT) | ((nr)   << _IOC_NRSHIFT) | ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr)       _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size)      _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size)      _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size)     _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr)       (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr)     (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)         (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr)     (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
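+
+/*
+ * Example of how these macros compose and decompose an ioctl number
+ * (DEMO_IOC_MAGIC and struct demo_data are hypothetical and shown only to
+ * illustrate the encoding):
+ *
+ *     struct demo_data { __u32 len; __u64 cookie; };
+ *     #define DEMO_IOC_MAGIC  'f'
+ *     #define DEMO_IOC_GET    _IOWR(DEMO_IOC_MAGIC, 1, struct demo_data)
+ *
+ *     _IOC_TYPE(DEMO_IOC_GET) == 'f'
+ *     _IOC_NR(DEMO_IOC_GET)   == 1
+ *     _IOC_SIZE(DEMO_IOC_GET) == sizeof(struct demo_data)
+ *     _IOC_DIR(DEMO_IOC_GET)  == (_IOC_READ | _IOC_WRITE)
+ */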
+
+/* ...and for the drivers/sound files... */
+
+#define IOC_IN   (_IOC_WRITE << _IOC_DIRSHIFT)
+#define IOC_OUT         (_IOC_READ << _IOC_DIRSHIFT)
+#define IOC_INOUT       ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
+#define IOCSIZE_MASK    (_IOC_SIZEMASK << _IOC_SIZESHIFT)
+#define IOCSIZE_SHIFT   (_IOC_SIZESHIFT)
+
+#endif /* _IOWR */
diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h
new file mode 100644 (file)
index 0000000..d00600c
--- /dev/null
@@ -0,0 +1,441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Definitions shared between vvp and liblustre, and other clients in the
+ * future.
+ *
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef LCLIENT_H
+#define LCLIENT_H
+
+blkcnt_t dirty_cnt(struct inode *inode);
+
+int cl_glimpse_size0(struct inode *inode, int agl);
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+                   struct inode *inode, struct cl_object *clob, int agl);
+
+static inline int cl_glimpse_size(struct inode *inode)
+{
+       return cl_glimpse_size0(inode, 0);
+}
+
+static inline int cl_agl(struct inode *inode)
+{
+       return cl_glimpse_size0(inode, 1);
+}
+
+/**
+ * Locking policy for setattr.
+ */
+enum ccc_setattr_lock_type {
+       /** Locking is done by server */
+       SETATTR_NOLOCK,
+       /** Extent lock is enqueued */
+       SETATTR_EXTENT_LOCK,
+       /** Existing local extent lock is used */
+       SETATTR_MATCH_LOCK
+};
+
+
+/**
+ * IO state private to vvp or slp layers.
+ */
+struct ccc_io {
+       /** super class */
+       struct cl_io_slice     cui_cl;
+       struct cl_io_lock_link cui_link;
+       /**
+        * I/O vector describing the user buffers that this read/write
+        * transfers data to or from.
+        */
+       struct iovec *cui_iov;
+       unsigned long cui_nrsegs;
+       /**
+        * Number of iov segments remaining in this IO.
+        */
+       unsigned long cui_tot_nrsegs;
+       /**
+        * Original length of an iov segment that was partially truncated.
+        */
+       size_t cui_iov_olen;
+       /**
+        * Total number of bytes remaining in this IO.
+        */
+       size_t cui_tot_count;
+
+       union {
+               struct {
+                       enum ccc_setattr_lock_type cui_local_lock;
+               } setattr;
+       } u;
+       /**
+        * True iff io is processing glimpse right now.
+        */
+       int               cui_glimpse;
+       /**
+        * Layout version when this IO is initialized
+        */
+       __u32           cui_layout_gen;
+       /**
+        * File descriptor against which IO is done.
+        */
+       struct ll_file_data *cui_fd;
+       struct kiocb *cui_iocb;
+};
+
+/**
+ * True if \a io is a normal io, false for others (sendfile, splice*).
+ * Must be implemented in arch-specific code.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io);
+
+extern struct lu_context_key ccc_key;
+extern struct lu_context_key ccc_session_key;
+
+struct ccc_thread_info {
+       struct cl_lock_descr cti_descr;
+       struct cl_io     cti_io;
+       struct cl_attr       cti_attr;
+};
+
+static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env)
+{
+       struct ccc_thread_info      *info;
+
+       info = lu_context_key_get(&env->le_ctx, &ccc_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env)
+{
+       struct cl_attr *attr = &ccc_env_info(env)->cti_attr;
+       memset(attr, 0, sizeof(*attr));
+       return attr;
+}
+
+static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env)
+{
+       struct cl_io *io = &ccc_env_info(env)->cti_io;
+       memset(io, 0, sizeof(*io));
+       return io;
+}
+
+struct ccc_session {
+       struct ccc_io cs_ios;
+};
+
+static inline struct ccc_session *ccc_env_session(const struct lu_env *env)
+{
+       struct ccc_session *ses;
+
+       ses = lu_context_key_get(env->le_ses, &ccc_session_key);
+       LASSERT(ses != NULL);
+       return ses;
+}
+
+static inline struct ccc_io *ccc_env_io(const struct lu_env *env)
+{
+       return &ccc_env_session(env)->cs_ios;
+}
+
+/**
+ * ccc-private object state.
+ */
+struct ccc_object {
+       struct cl_object_header cob_header;
+       struct cl_object        cob_cl;
+       struct inode       *cob_inode;
+
+       /**
+        * A list of dirty pages pending IO in the cache. Used by
+        * SOM. Protected by ll_inode_info::lli_lock.
+        *
+        * \see ccc_page::cpg_pending_linkage
+        */
+       struct list_head             cob_pending_list;
+
+       /**
+        * Access to this counter is protected by inode->i_sem. Now that
+        * the lifetime of transient pages is covered by the inode
+        * semaphore, no additional lock is needed.
+        */
+       int                  cob_transient_pages;
+       /**
+        * Number of outstanding mmaps on this file.
+        *
+        * \see ll_vm_open(), ll_vm_close().
+        */
+       atomic_t            cob_mmap_cnt;
+
+       /**
+        * Various flags.
+        *
+        * cob_discard_page_warned
+        *     When pages belonging to this object are discarded because the
+        *     client is evicted, some debug info is printed. This flag is
+        *     set while processing the first discarded page, so that the
+        *     debug log is not flooded when many pages are discarded.
+        *
+        * \see ll_dirty_page_discard_warn.
+        */
+       unsigned int            cob_discard_page_warned:1;
+};
+
+/**
+ * ccc-private page state.
+ */
+struct ccc_page {
+       struct cl_page_slice cpg_cl;
+       int               cpg_defer_uptodate;
+       int               cpg_ra_used;
+       int               cpg_write_queued;
+       /**
+        * Non-empty iff this page is already counted in
+        * ccc_object::cob_pending_list. Protected by
+        * ccc_object::cob_pending_guard. This list is only used as a flag,
+        * that is, never iterated through, only checked for list_empty(), but
+        * having a list is useful for debugging.
+        */
+       struct list_head           cpg_pending_linkage;
+       /** VM page */
+       struct page       *cpg_page;
+};
+
+static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice)
+{
+       return container_of(slice, struct ccc_page, cpg_cl);
+}
+
+struct cl_page    *ccc_vmpage_page_transient(struct page *vmpage);
+
+struct ccc_device {
+       struct cl_device    cdv_cl;
+       struct super_block *cdv_sb;
+       struct cl_device   *cdv_next;
+};
+
+struct ccc_lock {
+       struct cl_lock_slice clk_cl;
+};
+
+struct ccc_req {
+       struct cl_req_slice  crq_cl;
+};
+
+void *ccc_key_init     (const struct lu_context *ctx,
+                          struct lu_context_key *key);
+void  ccc_key_fini     (const struct lu_context *ctx,
+                          struct lu_context_key *key, void *data);
+void *ccc_session_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key);
+void  ccc_session_key_fini(const struct lu_context *ctx,
+                          struct lu_context_key *key, void *data);
+
+int          ccc_device_init  (const struct lu_env *env,
+                                  struct lu_device *d,
+                                  const char *name, struct lu_device *next);
+struct lu_device *ccc_device_fini (const struct lu_env *env,
+                                  struct lu_device *d);
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+                                  struct lu_device_type *t,
+                                  struct lustre_cfg *cfg,
+                                  const struct lu_device_operations *luops,
+                                  const struct cl_device_operations *clops);
+struct lu_device *ccc_device_free (const struct lu_env *env,
+                                  struct lu_device *d);
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *hdr,
+                                  struct lu_device *dev,
+                                  const struct cl_object_operations *clops,
+                                  const struct lu_object_operations *luops);
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+                struct cl_req *req);
+void ccc_umount(const struct lu_env *env, struct cl_device *dev);
+int ccc_global_init(struct lu_device_type *device_type);
+void ccc_global_fini(struct lu_device_type *device_type);
+int ccc_object_init0(const struct lu_env *env,struct ccc_object *vob,
+                    const struct cl_object_conf *conf);
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+                   const struct lu_object_conf *conf);
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj);
+int ccc_lock_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_lock *lock, const struct cl_io *io,
+                 const struct cl_lock_operations *lkops);
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_attr *attr, unsigned valid);
+int ccc_object_glimpse(const struct lu_env *env,
+                      const struct cl_object *obj, struct ost_lvb *lvb);
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_object_conf *conf);
+struct page *ccc_page_vmpage(const struct lu_env *env,
+                           const struct cl_page_slice *slice);
+int ccc_page_is_under_lock(const struct lu_env *env,
+                          const struct cl_page_slice *slice, struct cl_io *io);
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice);
+void ccc_transient_page_verify(const struct cl_page *page);
+int  ccc_transient_page_own(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io, int nonblock);
+void ccc_transient_page_assume(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *io);
+void ccc_transient_page_unassume(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                struct cl_io *io);
+void ccc_transient_page_disown(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *io);
+void ccc_transient_page_discard(const struct lu_env *env,
+                               const struct cl_page_slice *slice,
+                               struct cl_io *io);
+int ccc_transient_page_prep(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io);
+void ccc_lock_delete(const struct lu_env *env,
+                    const struct cl_lock_slice *slice);
+void ccc_lock_fini(const struct lu_env *env,struct cl_lock_slice *slice);
+int ccc_lock_enqueue(const struct lu_env *env,const struct cl_lock_slice *slice,
+                    struct cl_io *io, __u32 enqflags);
+int ccc_lock_unuse(const struct lu_env *env,const struct cl_lock_slice *slice);
+int ccc_lock_wait(const struct lu_env *env,const struct cl_lock_slice *slice);
+int ccc_lock_fits_into(const struct lu_env *env,
+                      const struct cl_lock_slice *slice,
+                      const struct cl_lock_descr *need,
+                      const struct cl_io *io);
+void ccc_lock_state(const struct lu_env *env,
+                   const struct cl_lock_slice *slice,
+                   enum cl_lock_state state);
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios);
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+                         __u32 enqflags, enum cl_lock_mode mode,
+                         pgoff_t start, pgoff_t end);
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+                   __u32 enqflags, enum cl_lock_mode mode,
+                   loff_t start, loff_t end);
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios);
+void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios,
+                   size_t nob);
+void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio,
+                      struct cl_io *io);
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_io *io, loff_t start, size_t count, int *exceed);
+void ccc_req_completion(const struct lu_env *env,
+                       const struct cl_req_slice *slice, int ioret);
+void ccc_req_attr_set(const struct lu_env *env,const struct cl_req_slice *slice,
+                     const struct cl_object *obj,
+                     struct cl_req_attr *oa, obd_valid flags);
+
+struct lu_device   *ccc2lu_dev      (struct ccc_device *vdv);
+struct lu_object   *ccc2lu       (struct ccc_object *vob);
+struct ccc_device  *lu2ccc_dev      (const struct lu_device *d);
+struct ccc_device  *cl2ccc_dev      (const struct cl_device *d);
+struct ccc_object  *lu2ccc       (const struct lu_object *obj);
+struct ccc_object  *cl2ccc       (const struct cl_object *obj);
+struct ccc_lock    *cl2ccc_lock     (const struct cl_lock_slice *slice);
+struct ccc_io      *cl2ccc_io       (const struct lu_env *env,
+                                    const struct cl_io_slice *slice);
+struct ccc_req     *cl2ccc_req      (const struct cl_req_slice *slice);
+struct page     *cl2vm_page      (const struct cl_page_slice *slice);
+struct inode       *ccc_object_inode(const struct cl_object *obj);
+struct ccc_object  *cl_inode2ccc    (struct inode *inode);
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+                  struct obd_capa *capa);
+
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage);
+int ccc_object_invariant(const struct cl_object *obj);
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+int cl_local_size(struct inode *inode);
+
+__u16 ll_dirent_type_get(struct lu_dirent *ent);
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32);
+__u32 cl_fid_build_gen(const struct lu_fid *fid);
+
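+/*
+ * No-op variant of the object invariant check: the (void)/sizeof casts below
+ * only keep the macro arguments "used" (so callers do not trigger
+ * unused-variable warnings) and evaluate nothing at run time.
+ */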
+# define CLOBINVRNT(env, clob, expr)                               \
+       ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr))
+
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp);
+int cl_ocd_update(struct obd_device *host,
+                 struct obd_device *watched,
+                 enum obd_notify_event ev, void *owner, void *data);
+
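+/*
+ * State carried for a client group lock: the environment, the IO and the
+ * cl_lock obtained for group id cg_gid, released again by cl_put_grouplock().
+ */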
+struct ccc_grouplock {
+       struct lu_env   *cg_env;
+       struct cl_io    *cg_io;
+       struct cl_lock  *cg_lock;
+       unsigned long    cg_gid;
+};
+
+int  cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+                     struct ccc_grouplock *cg);
+void cl_put_grouplock(struct ccc_grouplock *cg);
+
+/**
+ * New interfaces to get and put lov_stripe_md from the lov layer. This
+ * violates layering, because lov_stripe_md is supposed to be private data of
+ * the lov layer.
+ *
+ * NB: if you find yourself needing these interfaces in new code, please
+ * reconsider. They may be removed in the future in favour of better
+ * layering. */
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj);
+void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm);
+int lov_read_and_clear_async_rc(struct cl_object *clob);
+
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode);
+void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm);
+
+/**
+ * Data structure managing a client's cached pages. A count of
+ * "unstable" pages is maintained, and an LRU of clean pages is
+ * maintained. "unstable" pages are pages pinned by the ptlrpc
+ * layer for recovery purposes.
+ */
+struct cl_client_cache {
+       atomic_t        ccc_users;    /* # of users (OSCs) of this data */
+       struct list_head        ccc_lru;      /* LRU list of cached clean pages */
+       spinlock_t      ccc_lru_lock; /* lock for list */
+       atomic_t        ccc_lru_left; /* # of LRU entries available */
+       unsigned long   ccc_lru_max;  /* Max # of LRU entries possible */
+       unsigned int    ccc_lru_shrinkers; /* # of threads reclaiming */
+       atomic_t        ccc_unstable_nr;    /* # of unstable pages pinned */
+       wait_queue_head_t       ccc_unstable_waitq; /* Signaled on BRW commit */
+};
+
+#endif /*LCLIENT_H */
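
The ccc_env_thread_io()/ccc_env_thread_attr() helpers above return per-environment
scratch structures that are zeroed on every call, so a caller can set up a cl_io
without allocating. A minimal caller sketch, assuming the generic cl_io_init()/
cl_io_loop()/cl_io_fini() interface from cl_object.h; example_sync_io() is purely
illustrative and not part of this patch:

    static int example_sync_io(const struct lu_env *env, struct cl_object *clob)
    {
            /* per-thread, pre-zeroed scratch cl_io taken from the environment */
            struct cl_io *io = ccc_env_thread_io(env);
            int rc;

            rc = cl_io_init(env, io, CIT_MISC, clob);
            if (rc == 0)
                    rc = cl_io_loop(env, io);
            cl_io_fini(env, io);
            return rc;
    }
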
diff --git a/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h
new file mode 100644 (file)
index 0000000..5866922
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LINUX_LPROCFS_SNMP_H
+#define _LINUX_LPROCFS_SNMP_H
+
+#ifndef _LPROCFS_SNMP_H
+#error Do not #include this file directly. #include <lprocfs_status.h> instead
+#endif
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+#include <linux/smp.h>
+#include <linux/rwsem.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/statfs.h>
+
+
+#endif /* LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_acl.h b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h
new file mode 100644 (file)
index 0000000..ff4fc4f
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_acl.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_LINUX_ACL_H
+#define _LUSTRE_LINUX_ACL_H
+
+#ifndef _LUSTRE_ACL_H
+#error Do not #include this file directly. #include <lustre_acl.h> instead
+#endif
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/posix_acl_xattr.h>
+#  define LUSTRE_POSIX_ACL_MAX_ENTRIES 32
+#  define LUSTRE_POSIX_ACL_MAX_SIZE                                    \
+       (sizeof(posix_acl_xattr_header) +                               \
+        LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry))
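+/* With the usual 4-byte posix_acl_xattr_header and 8-byte entries, this caps
+ * the on-wire ACL at 4 + 32 * 8 = 260 bytes. */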
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+# include <linux/xattr.h> /* XATTR_{REPLACE,CREATE} */
+
+#ifndef LUSTRE_POSIX_ACL_MAX_SIZE
+# define LUSTRE_POSIX_ACL_MAX_SIZE   0
+#endif
+
+#endif /* _LUSTRE_LINUX_ACL_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_common.h b/drivers/staging/lustre/lustre/include/linux/lustre_common.h
new file mode 100644 (file)
index 0000000..d1783a3
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef LUSTRE_COMMON_H
+#define LUSTRE_COMMON_H
+
+#include <linux/sched.h>
+
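+/*
+ * Replace the current task's supplementary group list with an empty one
+ * (allocated via groups_alloc(0)); returns 0 on success or -ENOMEM.
+ */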
+static inline int cfs_cleanup_group_info(void)
+{
+       struct group_info *ginfo;
+
+       ginfo = groups_alloc(0);
+       if (!ginfo)
+               return -ENOMEM;
+
+       set_current_groups(ginfo);
+       put_group_info(ginfo);
+
+       return 0;
+}
+
+#define ll_inode_blksize(a)            (1<<(a)->i_blkbits)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
new file mode 100644 (file)
index 0000000..dff0468
--- /dev/null
@@ -0,0 +1,349 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_COMPAT25_H
+#define _LINUX_COMPAT25_H
+
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/lustre_patchless_compat.h>
+
+# define LOCK_FS_STRUCT(fs)    spin_lock(&(fs)->lock)
+# define UNLOCK_FS_STRUCT(fs)  spin_unlock(&(fs)->lock)
+
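+/*
+ * Open-coded equivalent of the kernel's set_fs_pwd(): switch fs->pwd to the
+ * given mnt/dentry under the fs_struct lock and drop the reference to the
+ * previous working directory.
+ */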
+static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+                                struct dentry *dentry)
+{
+       struct path path;
+       struct path old_pwd;
+
+       path.mnt = mnt;
+       path.dentry = dentry;
+       LOCK_FS_STRUCT(fs);
+       old_pwd = fs->pwd;
+       path_get(&path);
+       fs->pwd = path;
+       UNLOCK_FS_STRUCT(fs);
+
+       if (old_pwd.dentry)
+               path_put(&old_pwd);
+}
+
+
+/*
+ * set ATTR_BLOCKS to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_BLOCKS    (1 << 27)
+
+#define current_ngroups current_cred()->group_info->ngroups
+#define current_groups current_cred()->group_info->small_block
+
+/*
+ * OBD needs a working random driver, so all of our initialization routines
+ * must be called after device driver initialization.
+ */
+#ifndef MODULE
+#undef module_init
+#define module_init(a)     late_initcall(a)
+#endif
+
+
+#define LTIME_S(time)             (time.tv_sec)
+
+#define ll_permission(inode,mask,nd)    inode_permission(inode,mask)
+
+# define ll_generic_permission(inode, mask, flags, check_acl) \
+        generic_permission(inode, mask)
+
+#define ll_blkdev_put(a, b) blkdev_put(a, b)
+
+#define ll_dentry_open(a,b,c)  dentry_open(a,b,c)
+
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+                      vfs_symlink(dir, dentry, path)
+
+
+#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \
+               generic_file_llseek_size(file, offset, origin, maxbytes, eof);
+
+/* inode_dio_wait(i) use as-is for write lock */
+# define inode_dio_write_done(i)       do {} while (0) /* for write unlock */
+# define inode_dio_read(i)             atomic_inc(&(i)->i_dio_count)
+/* inode_dio_done(i) use as-is for read unlock */
+
+#define TREE_READ_LOCK_IRQ(mapping)    spin_lock_irq(&(mapping)->tree_lock)
+#define TREE_READ_UNLOCK_IRQ(mapping)  spin_unlock_irq(&(mapping)->tree_lock)
+
+static inline
+int ll_unregister_blkdev(unsigned int dev, const char *name)
+{
+       unregister_blkdev(dev, name);
+       return 0;
+}
+
+#define ll_invalidate_bdev(a,b)         invalidate_bdev((a))
+
+#ifndef FS_HAS_FIEMAP
+#define FS_HAS_FIEMAP                  (0)
+#endif
+
+
+
+/* add a lustre compatible layer for crypto API */
+#include <linux/crypto.h>
+#define ll_crypto_hash   crypto_hash
+#define ll_crypto_cipher       crypto_blkcipher
+#define ll_crypto_alloc_hash(name, type, mask)  crypto_alloc_hash(name, type, mask)
+#define ll_crypto_hash_setkey(tfm, key, keylen) crypto_hash_setkey(tfm, key, keylen)
+#define ll_crypto_hash_init(desc)             crypto_hash_init(desc)
+#define ll_crypto_hash_update(desc, sl, bytes)  crypto_hash_update(desc, sl, bytes)
+#define ll_crypto_hash_final(desc, out)         crypto_hash_final(desc, out)
+#define ll_crypto_blkcipher_setkey(tfm, key, keylen) \
+               crypto_blkcipher_setkey(tfm, key, keylen)
+#define ll_crypto_blkcipher_set_iv(tfm, src, len) \
+               crypto_blkcipher_set_iv(tfm, src, len)
+#define ll_crypto_blkcipher_get_iv(tfm, dst, len) \
+               crypto_blkcipher_get_iv(tfm, dst, len)
+#define ll_crypto_blkcipher_encrypt(desc, dst, src, bytes) \
+               crypto_blkcipher_encrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt(desc, dst, src, bytes) \
+               crypto_blkcipher_decrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) \
+               crypto_blkcipher_encrypt_iv(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) \
+               crypto_blkcipher_decrypt_iv(desc, dst, src, bytes)
+
+static inline
+struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char *name,
+                                                  u32 type, u32 mask)
+{
+       struct ll_crypto_cipher *rtn = crypto_alloc_blkcipher(name, type, mask);
+
+       return (rtn == NULL ? ERR_PTR(-ENOMEM) : rtn);
+}
+
+static inline int ll_crypto_hmac(struct ll_crypto_hash *tfm,
+                                u8 *key, unsigned int *keylen,
+                                struct scatterlist *sg,
+                                unsigned int size, u8 *result)
+{
+       struct hash_desc desc;
+       int           rv;
+       desc.tfm   = tfm;
+       desc.flags = 0;
+       rv = crypto_hash_setkey(desc.tfm, key, *keylen);
+       if (rv) {
+               CERROR("failed to hash setkey: %d\n", rv);
+               return rv;
+       }
+       return crypto_hash_digest(&desc, sg, size, result);
+}
+static inline
+unsigned int ll_crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm)
+{
+       return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.max_keysize;
+}
+static inline
+unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
+{
+       return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize;
+}
+
+#define ll_crypto_hash_blocksize(tfm)       crypto_hash_blocksize(tfm)
+#define ll_crypto_hash_digestsize(tfm)      crypto_hash_digestsize(tfm)
+#define ll_crypto_blkcipher_ivsize(tfm)     crypto_blkcipher_ivsize(tfm)
+#define ll_crypto_blkcipher_blocksize(tfm)  crypto_blkcipher_blocksize(tfm)
+#define ll_crypto_free_hash(tfm)           crypto_free_hash(tfm)
+#define ll_crypto_free_blkcipher(tfm)       crypto_free_blkcipher(tfm)
+
+#define ll_vfs_rmdir(dir,entry,mnt)         vfs_rmdir(dir,entry)
+#define ll_vfs_mkdir(inode,dir,mnt,mode)       vfs_mkdir(inode,dir,mode)
+#define ll_vfs_link(old,mnt,dir,new,mnt1)       vfs_link(old,dir,new)
+#define ll_vfs_unlink(inode,entry,mnt)   vfs_unlink(inode,entry)
+#define ll_vfs_mknod(dir,entry,mnt,mode,dev)    vfs_mknod(dir,entry,mode,dev)
+#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)
+#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
+               vfs_rename(old,old_dir,new,new_dir)
+
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#define cfs_bio_io_error(a,b)   bio_io_error((a))
+#define cfs_bio_endio(a,b,c)    bio_endio((a),(c))
+
+#define cfs_fs_pwd(fs)       ((fs)->pwd.dentry)
+#define cfs_fs_mnt(fs)       ((fs)->pwd.mnt)
+#define cfs_path_put(nd)     path_put(&(nd)->path)
+
+
+#ifndef SLAB_DESTROY_BY_RCU
+#define SLAB_DESTROY_BY_RCU 0
+#endif
+
+
+
+static inline int
+ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount)
+{
+       int rc;
+
+       if (sb->s_qcop->quota_on) {
+               struct path path;
+
+               rc = kern_path(name, LOOKUP_FOLLOW, &path);
+               if (rc)
+                       return rc;
+               rc = sb->s_qcop->quota_on(sb, off, ver, &path);
+               path_put(&path);
+               return rc;
+       } else {
+               return -ENOSYS;
+       }
+}
+
+static inline int ll_quota_off(struct super_block *sb, int off, int remount)
+{
+       if (sb->s_qcop->quota_off)
+               return sb->s_qcop->quota_off(sb, off);
+       else
+               return -ENOSYS;
+}
+
+
+# define ll_vfs_dq_init             dquot_initialize
+# define ll_vfs_dq_drop             dquot_drop
+# define ll_vfs_dq_transfer     dquot_transfer
+# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1)
+
+
+
+
+
+#define queue_max_phys_segments(rq)       queue_max_segments(rq)
+#define queue_max_hw_segments(rq)       queue_max_segments(rq)
+
+#define ll_kmap_atomic(a, b)   kmap_atomic(a)
+#define ll_kunmap_atomic(a, b) kunmap_atomic(a)
+
+
+#define ll_d_hlist_node hlist_node
+#define ll_d_hlist_empty(list) hlist_empty(list)
+#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name)
+#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry)
+#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \
+       p = NULL; hlist_for_each_entry(dentry, i_dentry, alias)
+
+
+#define bio_hw_segments(q, bio) 0
+
+
+#define ll_pagevec_init(pv, cold)       do {} while (0)
+#define ll_pagevec_add(pv, pg)   (0)
+#define ll_pagevec_lru_add_file(pv)     do {} while (0)
+
+
+#ifndef QUOTA_OK
+# define QUOTA_OK 0
+#endif
+#ifndef NO_QUOTA
+# define NO_QUOTA (-EDQUOT)
+#endif
+
+#ifndef SEEK_DATA
+#define SEEK_DATA      3       /* seek to the next data */
+#endif
+#ifndef SEEK_HOLE
+#define SEEK_HOLE      4       /* seek to the next hole */
+#endif
+
+#ifndef FMODE_UNSIGNED_OFFSET
+#define FMODE_UNSIGNED_OFFSET  ((__force fmode_t)0x2000)
+#endif
+
+#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit)
+# define ext2_set_bit       __test_and_set_bit_le
+# define ext2_clear_bit           __test_and_clear_bit_le
+# define ext2_test_bit     test_bit_le
+# define ext2_find_first_zero_bit find_first_zero_bit_le
+# define ext2_find_next_zero_bit  find_next_zero_bit_le
+#endif
+
+#ifdef ATTR_TIMES_SET
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+#else
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET)
+#endif
+
+
+
+/*
+ * Since kernel 3.1, nameidata.intent.open.flags differs from Lustre's
+ * lookup_intent.it_flags: the lower bits of it_flags are FMODE_xxx values,
+ * while the kernel no longer translates the lower bits of
+ * nameidata.intent.open.flags into FMODE_xxx.
+ */
+#include <linux/version.h>
+static inline int ll_namei_to_lookup_intent_flag(int flag)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)
+       flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag);
+#endif
+       return flag;
+}
+
+# define ll_mrf_ret void
+# define LL_MRF_RETURN(rc)
+
+#include <linux/fs.h>
+
+# define ll_umode_t    umode_t
+
+#include <linux/dcache.h>
+
+# define ll_dirty_inode(inode, flag)   (inode)->i_sb->s_op->dirty_inode((inode), flag)
+
+#endif /* _LINUX_COMPAT25_H */
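
The ll_crypto_* wrappers above map Lustre names onto the kernel's legacy
crypto_hash/crypto_blkcipher interfaces. A minimal sketch of how the keyed-hash
helper is meant to be used, assuming an "hmac(sha1)" transform is available;
example_hmac_sha1() is illustrative only and not part of this patch:

    static int example_hmac_sha1(u8 *key, unsigned int keylen,
                                 struct scatterlist *sg, unsigned int len,
                                 u8 *out /* SHA-1 digest: 20 bytes */)
    {
            struct ll_crypto_hash *tfm;
            int rc;

            tfm = ll_crypto_alloc_hash("hmac(sha1)", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);
            /* sets the key on the tfm and digests the scatterlist */
            rc = ll_crypto_hmac(tfm, key, &keylen, sg, len, out);
            ll_crypto_free_hash(tfm);
            return rc;
    }
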
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_debug.h b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h
new file mode 100644 (file)
index 0000000..11deac7
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DEBUG_H
+#define _LINUX_LUSTRE_DEBUG_H
+
+#ifndef _LUSTRE_DEBUG_H
+#error Do not #include this file directly. #include <lustre_debug.h> instead
+#endif
+
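+/*
+ * Dump the state of a VM page (mapping, index, flags, refcount, private data)
+ * together with a caller-supplied format string and arguments, under the
+ * given debug mask.
+ */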
+#define LL_CDEBUG_PAGE(mask, page, fmt, arg...)                               \
+       CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\
+              fmt, page, page->mapping, page->index, (long)page->flags,      \
+              page_count(page), page_private(page), ## arg)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h
new file mode 100644 (file)
index 0000000..207df03
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DLM_H__
+#define _LINUX_LUSTRE_DLM_H__
+
+#ifndef _LUSTRE_DLM_H__
+#error Do not #include this file directly. #include <lustre_dlm.h> instead
+#endif
+
+# include <linux/proc_fs.h>
+#  include <asm/processor.h>
+#  include <linux/bit_spinlock.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h
new file mode 100644 (file)
index 0000000..6c72609
--- /dev/null
@@ -0,0 +1,181 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LINUX_LUSTRE_FSFILT_H
+#define _LINUX_LUSTRE_FSFILT_H
+
+#ifndef _LUSTRE_FSFILT_H
+#error Do not #include this file directly. #include <lustre_fsfilt.h> instead
+#endif
+
+
+#include <obd.h>
+#include <obd_class.h>
+
+typedef void (*fsfilt_cb_t)(struct obd_device *obd, __u64 last_rcvd,
+                           void *data, int error);
+
+struct fsfilt_operations {
+       struct list_head fs_list;
+       module_t *fs_owner;
+       char   *fs_type;
+       char   *(* fs_getlabel)(struct super_block *sb);
+       void   *(* fs_start)(struct inode *inode, int op, void *desc_private,
+                            int logs);
+       int     (* fs_commit)(struct inode *inode, void *handle,int force_sync);
+       int     (* fs_map_inode_pages)(struct inode *inode, struct page **page,
+                                      int pages, unsigned long *blocks,
+                                      int create, struct mutex *sem);
+       int     (* fs_write_record)(struct file *, void *, int size, loff_t *,
+                                   int force_sync);
+       int     (* fs_read_record)(struct file *, void *, int size, loff_t *);
+       int     (* fs_setup)(struct super_block *sb);
+};
+
+extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops);
+extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops);
+extern struct fsfilt_operations *fsfilt_get_ops(const char *type);
+extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
+
+static inline char *fsfilt_get_label(struct obd_device *obd,
+                                    struct super_block *sb)
+{
+       if (obd->obd_fsops->fs_getlabel == NULL)
+               return NULL;
+       if (obd->obd_fsops->fs_getlabel(sb)[0] == '\0')
+               return NULL;
+
+       return obd->obd_fsops->fs_getlabel(sb);
+}
+
+#define FSFILT_OP_UNLINK               1
+#define FSFILT_OP_CANCEL_UNLINK         10
+
+#define __fsfilt_check_slow(obd, start, msg)                         \
+do {                                                                 \
+       if (cfs_time_before(jiffies, start + 15 * HZ))          \
+               break;                                              \
+       else if (cfs_time_before(jiffies, start + 30 * HZ))        \
+               CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name,   \
+                      msg, (jiffies-start) / HZ);                  \
+       else if (cfs_time_before(jiffies, start + DISK_TIMEOUT * HZ)) \
+               CWARN("%s: slow %s %lus\n", obd->obd_name, msg,    \
+                     (jiffies - start) / HZ);                  \
+       else                                                          \
+               CERROR("%s: slow %s %lus\n", obd->obd_name, msg,          \
+                      (jiffies - start) / HZ);                \
+} while (0)
+
+#define fsfilt_check_slow(obd, start, msg)           \
+do {                                               \
+       __fsfilt_check_slow(obd, start, msg);      \
+       start = jiffies;                                \
+} while (0)
+
+static inline void *fsfilt_start_log(struct obd_device *obd,
+                                    struct inode *inode, int op,
+                                    struct obd_trans_info *oti, int logs)
+{
+       unsigned long now = jiffies;
+       void *parent_handle = oti ? oti->oti_handle : NULL;
+       void *handle;
+
+       handle = obd->obd_fsops->fs_start(inode, op, parent_handle, logs);
+       CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle);
+
+       if (oti != NULL) {
+               if (parent_handle == NULL) {
+                       oti->oti_handle = handle;
+               } else if (handle != parent_handle) {
+                       CERROR("mismatch: parent %p, handle %p, oti %p\n",
+                              parent_handle, handle, oti);
+                       LBUG();
+               }
+       }
+       fsfilt_check_slow(obd, now, "journal start");
+       return handle;
+}
+
+static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
+                               void *handle, int force_sync)
+{
+       unsigned long now = jiffies;
+       int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
+       CDEBUG(D_INFO, "committing handle %p\n", handle);
+
+       fsfilt_check_slow(obd, now, "journal commit");
+
+       return rc;
+}
+
+static inline int fsfilt_map_inode_pages(struct obd_device *obd,
+                                        struct inode *inode,
+                                        struct page **page, int pages,
+                                        unsigned long *blocks,
+                                        int create, struct mutex *mutex)
+{
+       return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks,
+                                                 create, mutex);
+}
+
+static inline int fsfilt_read_record(struct obd_device *obd, struct file *file,
+                                    void *buf, loff_t size, loff_t *offs)
+{
+       return obd->obd_fsops->fs_read_record(file, buf, size, offs);
+}
+
+static inline int fsfilt_write_record(struct obd_device *obd, struct file *file,
+                                     void *buf, loff_t size, loff_t *offs,
+                                     int force_sync)
+{
+       return obd->obd_fsops->fs_write_record(file, buf, size,offs,force_sync);
+}
+
+static inline int fsfilt_setup(struct obd_device *obd, struct super_block *fs)
+{
+       if (obd->obd_fsops->fs_setup)
+               return obd->obd_fsops->fs_setup(fs);
+       return 0;
+}
+
+
+
+
+#endif
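
A sketch of the calling pattern the fsfilt helpers above are built around: open a
journal handle, perform the update, then commit. It assumes the backing fsfilt
driver reports start failures through an ERR_PTR-style handle; the function name
and the elided modification are illustrative only and not part of this patch:

    static int example_journalled_update(struct obd_device *obd,
                                         struct inode *inode,
                                         struct obd_trans_info *oti)
    {
            void *handle;
            int rc;

            handle = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, oti, 0);
            if (IS_ERR(handle))
                    return PTR_ERR(handle);

            /* ... perform the journalled filesystem modification here ... */

            /* always close the handle; force_sync == 0 */
            rc = fsfilt_commit(obd, inode, handle, 0);
            return rc;
    }
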
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_handles.h b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h
new file mode 100644 (file)
index 0000000..ecf1840
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_LUSTRE_HANDLES_H_
+#define __LINUX_LUSTRE_HANDLES_H_
+
+#ifndef __LUSTRE_HANDLES_H_
+#error Do not #include this file directly. #include <lustre_handles.h> instead
+#endif
+
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/list.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <linux/rcupdate.h> /* for rcu_head{} */
+typedef struct rcu_head cfs_rcu_head_t;
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_intent.h b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h
new file mode 100644 (file)
index 0000000..b10ddfa
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_INTENT_H
+#define LUSTRE_INTENT_H
+
+/* intent IT_XXX are defined in lustre/include/obd.h */
+struct lustre_intent_data {
+       int             it_disposition;
+       int             it_status;
+       __u64           it_lock_handle;
+       __u64           it_lock_bits;
+       int             it_lock_mode;
+       int             it_remote_lock_mode;
+       __u64      it_remote_lock_handle;
+       void       *it_data;
+       unsigned int    it_lock_set:1;
+};
+
+struct lookup_intent {
+       int     it_op;
+       int     it_flags;
+       int     it_create_mode;
+       union {
+               struct lustre_intent_data lustre;
+       } d;
+};
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lib.h b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h
new file mode 100644 (file)
index 0000000..b2f755a
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LINUX_LUSTRE_LIB_H
+#define _LINUX_LUSTRE_LIB_H
+
+#ifndef _LUSTRE_LIB_H
+#error Do not #include this file directly. #include <lustre_lib.h> instead
+#endif
+
+# include <linux/rwsem.h>
+# include <linux/sched.h>
+# include <linux/signal.h>
+# include <linux/types.h>
+# include <linux/lustre_compat25.h>
+# include <linux/lustre_common.h>
+
+#ifndef LP_POISON
+#if BITS_PER_LONG > 32
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+#else
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a)
+#endif
+#endif
+
+/* This macro is only for compatibility reasons with older Linux Lustre user
+ * tools. New ioctls should NOT use this macro as the ioctl "size". Instead
+ * the ioctl should get a "size" argument which is the actual data type used
+ * by the ioctl, to ensure the ioctl interface is versioned correctly. */
+#define OBD_IOC_DATA_TYPE             long
+
+#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) |                \
+                          sigmask(SIGTERM) | sigmask(SIGQUIT) |               \
+                          sigmask(SIGALRM))
+
+/* initialize ost_lvb according to inode */
+static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
+{
+       lvb->lvb_size = i_size_read(inode);
+       lvb->lvb_blocks = inode->i_blocks;
+       lvb->lvb_mtime = LTIME_S(inode->i_mtime);
+       lvb->lvb_atime = LTIME_S(inode->i_atime);
+       lvb->lvb_ctime = LTIME_S(inode->i_ctime);
+}
+
+#endif /* _LUSTRE_LIB_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
new file mode 100644 (file)
index 0000000..c95dff9
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LL_H
+#define _LINUX_LL_H
+
+#ifndef _LL_H
+#error Do not #include this file directly. #include <lustre_lite.h> instead
+#endif
+
+
+#include <linux/version.h>
+
+#include <asm/statfs.h>
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/proc_fs.h>
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_ha.h>
+
+#include <linux/rbtree.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/pagemap.h>
+
+/* lprocfs.c */
+enum {
+        LPROC_LL_DIRTY_HITS = 0,
+        LPROC_LL_DIRTY_MISSES,
+        LPROC_LL_READ_BYTES,
+        LPROC_LL_WRITE_BYTES,
+        LPROC_LL_BRW_READ,
+        LPROC_LL_BRW_WRITE,
+        LPROC_LL_OSC_READ,
+        LPROC_LL_OSC_WRITE,
+        LPROC_LL_IOCTL,
+        LPROC_LL_OPEN,
+        LPROC_LL_RELEASE,
+        LPROC_LL_MAP,
+        LPROC_LL_LLSEEK,
+        LPROC_LL_FSYNC,
+        LPROC_LL_READDIR,
+        LPROC_LL_SETATTR,
+        LPROC_LL_TRUNC,
+        LPROC_LL_FLOCK,
+        LPROC_LL_GETATTR,
+        LPROC_LL_CREATE,
+        LPROC_LL_LINK,
+        LPROC_LL_UNLINK,
+        LPROC_LL_SYMLINK,
+        LPROC_LL_MKDIR,
+        LPROC_LL_RMDIR,
+        LPROC_LL_MKNOD,
+        LPROC_LL_RENAME,
+        LPROC_LL_STAFS,
+        LPROC_LL_ALLOC_INODE,
+        LPROC_LL_SETXATTR,
+        LPROC_LL_GETXATTR,
+        LPROC_LL_LISTXATTR,
+        LPROC_LL_REMOVEXATTR,
+        LPROC_LL_INODE_PERM,
+        LPROC_LL_FILE_OPCODES
+};
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_log.h b/drivers/staging/lustre/lustre/include/linux/lustre_log.h
new file mode 100644 (file)
index 0000000..e9c8e56
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *  - orphan recovery: OST adds record on create
+ *  - mtime/size consistency: the OST adds a record on first write
+ *  - open/unlinked objects: OST adds a record on destroy
+ *
+ *  - mds unlink log: the MDS adds an entry upon delete
+ *
+ *  - raid1 replication log between OSTs
+ *  - MDS replication logs
+ */
+
+#ifndef _LINUX_LUSTRE_LOG_H
+#define _LINUX_LUSTRE_LOG_H
+
+#ifndef _LUSTRE_LOG_H
+#error Do not #include this file directly. #include <lustre_log.h> instead
+#endif
+
+#define LUSTRE_LOG_SERVER
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_net.h b/drivers/staging/lustre/lustre/include/linux/lustre_net.h
new file mode 100644 (file)
index 0000000..2d7c425
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_NET_H
+#define _LINUX_LUSTRE_NET_H
+
+#ifndef _LUSTRE_NET_H
+#error Do not #include this file directly. #include <lustre_net.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+/* XXX Liang: should be moved to other header instead of here */
+#ifndef WITH_GROUP_INFO
+#define WITH_GROUP_INFO
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
new file mode 100644 (file)
index 0000000..f050808
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_PATCHLESS_COMPAT_H
+#define LUSTRE_PATCHLESS_COMPAT_H
+
+#include <linux/fs.h>
+
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+
+
+#define ll_delete_from_page_cache(page) delete_from_page_cache(page)
+
+static inline void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+       if (page->mapping != mapping)
+               return;
+
+       if (PagePrivate(page))
+               page->mapping->a_ops->invalidatepage(page, 0);
+
+       cancel_dirty_page(page, PAGE_SIZE);
+       ClearPageMappedToDisk(page);
+       ll_delete_from_page_cache(page);
+}
+
+#  define d_refcount(d)                 ((d)->d_count)
+
+#ifdef ATTR_OPEN
+# define ATTR_FROM_OPEN ATTR_OPEN
+#else
+# ifndef ATTR_FROM_OPEN
+#  define ATTR_FROM_OPEN 0
+# endif
+#endif /* ATTR_OPEN */
+
+#ifndef ATTR_RAW
+#define ATTR_RAW 0
+#endif
+
+#ifndef ATTR_CTIME_SET
+/*
+ * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_CTIME_SET (1 << 28)
+#endif
+
+#endif /* LUSTRE_PATCHLESS_COMPAT_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_quota.h b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h
new file mode 100644 (file)
index 0000000..421866b
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_QUOTA_H
+#define _LINUX_LUSTRE_QUOTA_H
+
+#ifndef _LUSTRE_QUOTA_H
+#error Do not #include this file directly. #include <lustre_quota.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+
+#endif /* _LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
new file mode 100644 (file)
index 0000000..ebaf929
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LINUX_LUSTRE_USER_H
+#define _LINUX_LUSTRE_USER_H
+
+# include <linux/version.h>
+# include <linux/quota.h>
+
+/*
+ * asm-x86_64/processor.h on some SLES 9 distros seems to use
+ * kernel-only typedefs.  Fortunately, skipping it altogether is OK
+ * (for now).
+ */
+#define __ASM_X86_64_PROCESSOR_H
+
+#include <linux/string.h>
+
+#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \
+    defined(__craynv) || defined (__mips64__) || defined(__powerpc64__)
+typedef struct stat     lstat_t;
+#define lstat_f         lstat
+#define HAVE_LOV_USER_MDS_DATA
+#else
+typedef struct stat64   lstat_t;
+#define lstat_f         lstat64
+#define HAVE_LOV_USER_MDS_DATA
+#endif
+
+#endif /* _LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs.h b/drivers/staging/lustre/lustre/include/linux/lvfs.h
new file mode 100644 (file)
index 0000000..b4db6cb
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LINUX_LVFS_H__
+#define __LINUX_LVFS_H__
+
+#ifndef __LVFS_H__
+#error Do not #include this file directly. #include <lvfs.h> instead
+#endif
+
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/lvfs_linux.h>
+
+#define LLOG_LVFS
+
+/* simple.c */
+
+struct lvfs_ucred {
+       __u32              luc_uid;
+       __u32              luc_gid;
+       __u32              luc_fsuid;
+       __u32              luc_fsgid;
+       kernel_cap_t    luc_cap;
+       __u32              luc_umask;
+       struct group_info      *luc_ginfo;
+       struct md_identity     *luc_identity;
+};
+
+struct lvfs_callback_ops {
+       struct dentry *(*l_fid2dentry)(__u64 id_ino, __u32 gen, __u64 gr, void *data);
+};
+
+#define OBD_RUN_CTXT_MAGIC      0xC0FFEEAA
+#define OBD_CTXT_DEBUG   /* development-only debugging */
+struct lvfs_run_ctxt {
+       struct vfsmount  *pwdmnt;
+       struct dentry      *pwd;
+       mm_segment_t         fs;
+       struct lvfs_ucred       luc;
+       int                   ngroups;
+       struct lvfs_callback_ops cb_ops;
+       struct group_info       *group_info;
+       struct dt_device        *dt;
+#ifdef OBD_CTXT_DEBUG
+       __u32               magic;
+#endif
+};
+
+#ifdef OBD_CTXT_DEBUG
+#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC
+#else
+#define OBD_SET_CTXT_MAGIC(ctxt) do {} while(0)
+#endif
+
+
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname,
+                 char *newname);
+
+static inline void l_dput(struct dentry *de)
+{
+       if (!de || IS_ERR(de))
+               return;
+       //shrink_dcache_parent(de);
+       LASSERT(d_refcount(de) > 0);
+       dput(de);
+}
+
+/* We need to hold the inode semaphore over the dcache lookup itself, or we
+ * run the risk of entering the filesystem lookup path concurrently on SMP
+ * systems, and instantiating two inodes for the same entry.  We still
+ * protect against concurrent addition/removal races with the DLM locking.
+ */
+static inline struct dentry *ll_lookup_one_len(const char *fid_name,
+                                              struct dentry *dparent,
+                                              int fid_namelen)
+{
+       struct dentry *dchild;
+
+       mutex_lock(&dparent->d_inode->i_mutex);
+       dchild = lookup_one_len(fid_name, dparent, fid_namelen);
+       mutex_unlock(&dparent->d_inode->i_mutex);
+
+       if (IS_ERR(dchild) || dchild->d_inode == NULL)
+               return dchild;
+
+       if (is_bad_inode(dchild->d_inode)) {
+               CERROR("bad inode returned %lu/%u\n",
+                      dchild->d_inode->i_ino, dchild->d_inode->i_generation);
+               dput(dchild);
+               dchild = ERR_PTR(-ENOENT);
+       }
+       return dchild;
+}
+
+
+#endif
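
A minimal usage sketch (hypothetical caller and names, not part of the original patch) showing how ll_lookup_one_len() above is meant to be paired with l_dput():

/* Hypothetical illustration only. */
static int example_lookup_child(struct dentry *dparent, const char *fid_name)
{
        struct dentry *dchild;

        dchild = ll_lookup_one_len(fid_name, dparent, strlen(fid_name));
        if (IS_ERR(dchild))
                return PTR_ERR(dchild);

        /* ... use dchild->d_inode here if it exists ... */

        l_dput(dchild);
        return 0;
}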
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h
new file mode 100644 (file)
index 0000000..140a60f
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LVFS_LINUX_H__
+#define __LVFS_LINUX_H__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include <lvfs.h>
+
+#define l_file file
+#define l_dentry dentry
+
+#define l_filp_open filp_open
+
+struct lvfs_run_ctxt;
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *, struct l_dentry *,
+                            int flags);
+
+struct l_linux_dirent {
+       struct list_head      lld_list;
+       ino_t      lld_ino;
+       unsigned long   lld_off;
+       char        lld_name[LL_FID_NAMELEN];
+};
+struct l_readdir_callback {
+       struct l_linux_dirent *lrc_dirent;
+       struct list_head            *lrc_list;
+};
+
+#endif /*  __LVFS_LINUX_H__ */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd.h b/drivers/staging/lustre/lustre/include/linux/obd.h
new file mode 100644 (file)
index 0000000..2c36c0d
--- /dev/null
@@ -0,0 +1,128 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_OBD_H
+#define __LINUX_OBD_H
+
+#ifndef __OBD_H
+#error Do not #include this file directly. #include <obd.h> instead
+#endif
+
+#include <obd_support.h>
+
+# include <linux/fs.h>
+# include <linux/list.h>
+# include <linux/sched.h>  /* for struct task_struct, for current.h */
+# include <linux/proc_fs.h>
+# include <linux/mount.h>
+# include <linux/lustre_intent.h>
+
+struct ll_iattr {
+       struct iattr    iattr;
+       unsigned int    ia_attr_flags;
+};
+
+#define CLIENT_OBD_LIST_LOCK_DEBUG 1
+
+typedef struct {
+       spinlock_t              lock;
+
+       unsigned long       time;
+       struct task_struct *task;
+       const char       *func;
+       int              line;
+} client_obd_lock_t;
+
+static inline void __client_obd_list_lock(client_obd_lock_t *lock,
+                                         const char *func, int line)
+{
+       unsigned long cur = jiffies;
+       while (1) {
+               if (spin_trylock(&lock->lock)) {
+                       LASSERT(lock->task == NULL);
+                       lock->task = current;
+                       lock->func = func;
+                       lock->line = line;
+                       lock->time = jiffies;
+                       break;
+               }
+
+               if ((jiffies - cur > 5 * HZ) &&
+                   (jiffies - lock->time > 5 * HZ)) {
+                       struct task_struct *task = lock->task;
+
+                       if (task == NULL)
+                               continue;
+
+                       LCONSOLE_WARN("%s:%d: lock %p was acquired"
+                                     " by <%s:%d:%s:%d> for %lu seconds.\n",
+                                     current->comm, current->pid,
+                                     lock, task->comm, task->pid,
+                                     lock->func, lock->line,
+                                     (jiffies - lock->time) / HZ);
+                       LCONSOLE_WARN("====== for process holding the "
+                                     "lock =====\n");
+                       libcfs_debug_dumpstack(task);
+                       LCONSOLE_WARN("====== for current process =====\n");
+                       libcfs_debug_dumpstack(NULL);
+                       LCONSOLE_WARN("====== end =======\n");
+                       cfs_pause(1000 * HZ);
+               }
+               cpu_relax();
+       }
+}
+
+#define client_obd_list_lock(lock) \
+       __client_obd_list_lock(lock, __FUNCTION__, __LINE__)
+
+static inline void client_obd_list_unlock(client_obd_lock_t *lock)
+{
+       LASSERT(lock->task != NULL);
+       lock->task = NULL;
+       lock->time = jiffies;
+       spin_unlock(&lock->lock);
+}
+
+
+static inline void client_obd_list_lock_init(client_obd_lock_t *lock)
+{
+       spin_lock_init(&lock->lock);
+}
+
+static inline void client_obd_list_lock_done(client_obd_lock_t *lock)
+{}
+
+#endif /* __LINUX_OBD_H */
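
An illustrative sketch (hypothetical structure and field names, not part of the original patch) of how the client_obd_lock_t helpers above are intended to be used together:

/* Hypothetical illustration only. */
struct example_client_state {
        client_obd_lock_t       ecs_lock;
        long                    ecs_dirty;
};

static void example_state_init(struct example_client_state *ecs)
{
        client_obd_list_lock_init(&ecs->ecs_lock);
        ecs->ecs_dirty = 0;
}

static void example_state_update(struct example_client_state *ecs, long delta)
{
        client_obd_list_lock(&ecs->ecs_lock);
        ecs->ecs_dirty += delta;
        client_obd_list_unlock(&ecs->ecs_lock);
}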
diff --git a/drivers/staging/lustre/lustre/include/linux/obd_class.h b/drivers/staging/lustre/lustre/include/linux/obd_class.h
new file mode 100644 (file)
index 0000000..021ead6
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_CLASS_OBD_H
+#define __LINUX_CLASS_OBD_H
+
+#ifndef __CLASS_OBD_H
+#error Do not #include this file directly. #include <obd_class.h> instead
+#endif
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+
+/* obdo.c */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid);
+void la_from_obdo(struct lu_attr *la, struct obdo *dst, obd_flag valid);
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+#define ll_inode_flags(inode)   (inode->i_flags)
+
+
+#endif /* __LINUX_OBD_CLASS_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd_support.h b/drivers/staging/lustre/lustre/include/linux/obd_support.h
new file mode 100644 (file)
index 0000000..9166503
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_OBD_SUPPORT
+#define _LINUX_OBD_SUPPORT
+
+#ifndef _OBD_SUPPORT
+#error Do not #include this file directly. #include <obd_support.h> instead
+#endif
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+#include <asm/processor.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+
+# include <linux/types.h>
+# include <linux/blkdev.h>
+# include <lvfs.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h
new file mode 100644 (file)
index 0000000..e4e8f72
--- /dev/null
@@ -0,0 +1,1100 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LPROCFS_SNMP_H
+#define _LPROCFS_SNMP_H
+
+#include <linux/lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <linux/libcfs/params_tree.h>
+
+struct lprocfs_vars {
+       const char           *name;
+       read_proc_t     *read_fptr;
+       write_proc_t       *write_fptr;
+       void               *data;
+       struct file_operations *fops;
+       /**
+        * /proc file mode.
+        */
+       mode_t            proc_mode;
+};
+
+struct lprocfs_static_vars {
+       struct lprocfs_vars *module_vars;
+       struct lprocfs_vars *obd_vars;
+};
+
+/* if we find more consumers this could be generalized */
+#define OBD_HIST_MAX 32
+struct obd_histogram {
+       spinlock_t      oh_lock;
+       unsigned long   oh_buckets[OBD_HIST_MAX];
+};
+
+enum {
+       BRW_R_PAGES = 0,
+       BRW_W_PAGES,
+       BRW_R_RPC_HIST,
+       BRW_W_RPC_HIST,
+       BRW_R_IO_TIME,
+       BRW_W_IO_TIME,
+       BRW_R_DISCONT_PAGES,
+       BRW_W_DISCONT_PAGES,
+       BRW_R_DISCONT_BLOCKS,
+       BRW_W_DISCONT_BLOCKS,
+       BRW_R_DISK_IOSIZE,
+       BRW_W_DISK_IOSIZE,
+       BRW_R_DIO_FRAGS,
+       BRW_W_DIO_FRAGS,
+       BRW_LAST,
+};
+
+struct brw_stats {
+       struct obd_histogram hist[BRW_LAST];
+};
+
+enum {
+       RENAME_SAMEDIR_SIZE = 0,
+       RENAME_CROSSDIR_SRC_SIZE,
+       RENAME_CROSSDIR_TGT_SIZE,
+       RENAME_LAST,
+};
+
+struct rename_stats {
+       struct obd_histogram hist[RENAME_LAST];
+};
+
+/* An lprocfs counter can be configured using the enum bit masks below.
+ *
+ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
+ * protects this counter from concurrent updates. If not specified,
+ * lprocfs uses an internal per-counter lock variable. External locks
+ * are not used to protect counter increments, but are used to protect
+ * counter readout and resets.
+ *
+ * LPROCFS_CNTR_AVGMINMAX indicates a counter with multi-valued samples
+ * (i.e. the counter can be incremented by more than "1"). When specified,
+ * the counter maintains min, max and sum in addition to a simple
+ * invocation count, so that averages can be computed. If not specified,
+ * the counter is an increment-by-1 counter and min, max, sum, etc. are
+ * not maintained.
+ *
+ * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of
+ * squares (for multi-valued counter samples only). This allows
+ * external computation of standard deviation, but involves a 64-bit
+ * multiply per counter increment.
+ */
+
+enum {
+       LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
+       LPROCFS_CNTR_AVGMINMAX    = 0x0002,
+       LPROCFS_CNTR_STDDEV       = 0x0004,
+
+       /* counter data type */
+       LPROCFS_TYPE_REGS        = 0x0100,
+       LPROCFS_TYPE_BYTES      = 0x0200,
+       LPROCFS_TYPE_PAGES      = 0x0400,
+       LPROCFS_TYPE_CYCLE      = 0x0800,
+};
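
As an illustration of the flags above (hypothetical counter indices, names and units, not part of the original patch), a stats set could be registered with lprocfs_alloc_stats() and lprocfs_counter_init(), both declared further down in this header:

/* Hypothetical illustration only. */
enum { EX_READ_BYTES = 0, EX_OPEN_COUNT, EX_NR_COUNTERS };

static struct lprocfs_stats *example_alloc_stats(void)
{
        struct lprocfs_stats *stats;

        stats = lprocfs_alloc_stats(EX_NR_COUNTERS, LPROCFS_STATS_FLAG_NONE);
        if (stats == NULL)
                return NULL;

        /* multi-valued counter: tracks min/max/sum of bytes per sample */
        lprocfs_counter_init(stats, EX_READ_BYTES,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES,
                             "read_bytes", "bytes");
        /* plain increment-by-1 counter */
        lprocfs_counter_init(stats, EX_OPEN_COUNT, LPROCFS_TYPE_REGS,
                             "open", "reqs");
        return stats;
}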
+
+#define LC_MIN_INIT ((~(__u64)0) >> 1)
+
+struct lprocfs_counter_header {
+       unsigned int            lc_config;
+       const char              *lc_name;   /* must be static */
+       const char              *lc_units;  /* must be static */
+};
+
+struct lprocfs_counter {
+       __s64   lc_count;
+       __s64   lc_min;
+       __s64   lc_max;
+       __s64   lc_sumsquare;
+       /*
+        * Every counter has lc_array_sum[0]; lc_array_sum[1] is only used
+        * by counters in irq-safe stats, i.e. stats allocated with the
+        * LPROCFS_STATS_FLAG_IRQ_SAFE flag.
+        */
+       __s64   lc_array_sum[1];
+};
+#define lc_sum         lc_array_sum[0]
+#define lc_sum_irq     lc_array_sum[1]
+
+struct lprocfs_percpu {
+#ifndef __GNUC__
+       __s64                   pad;
+#endif
+       struct lprocfs_counter lp_cntr[0];
+};
+
+#define LPROCFS_GET_NUM_CPU 0x0001
+#define LPROCFS_GET_SMP_ID  0x0002
+
+enum lprocfs_stats_flags {
+       LPROCFS_STATS_FLAG_NONE     = 0x0000, /* per cpu counter */
+       LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu
+                                              * area and need locking */
+       LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */
+};
+
+enum lprocfs_fields_flags {
+       LPROCFS_FIELDS_FLAGS_CONFIG     = 0x0001,
+       LPROCFS_FIELDS_FLAGS_SUM        = 0x0002,
+       LPROCFS_FIELDS_FLAGS_MIN        = 0x0003,
+       LPROCFS_FIELDS_FLAGS_MAX        = 0x0004,
+       LPROCFS_FIELDS_FLAGS_AVG        = 0x0005,
+       LPROCFS_FIELDS_FLAGS_SUMSQUARE  = 0x0006,
+       LPROCFS_FIELDS_FLAGS_COUNT      = 0x0007,
+};
+
+struct lprocfs_stats {
+       /* # of counters */
+       unsigned short                  ls_num;
+       /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */
+       unsigned short                  ls_biggest_alloc_num;
+       enum lprocfs_stats_flags        ls_flags;
+       /* Lock used when there are no percpu stats areas; For percpu stats,
+        * it is used to protect ls_biggest_alloc_num change */
+       spinlock_t                      ls_lock;
+
+       /* has ls_num of counter headers */
+       struct lprocfs_counter_header   *ls_cnt_header;
+       struct lprocfs_percpu           *ls_percpu[0];
+};
+
+#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC)
+
+/* Pack all opcodes down into a single monotonically increasing index */
+static inline int opcode_offset(__u32 opc) {
+       if (opc < OST_LAST_OPC) {
+                /* OST opcode */
+               return (opc - OST_FIRST_OPC);
+       } else if (opc < MDS_LAST_OPC) {
+               /* MDS opcode */
+               return (opc - MDS_FIRST_OPC +
+                       OPC_RANGE(OST));
+       } else if (opc < LDLM_LAST_OPC) {
+               /* LDLM Opcode */
+               return (opc - LDLM_FIRST_OPC +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < MGS_LAST_OPC) {
+               /* MGS Opcode */
+               return (opc - MGS_FIRST_OPC +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < OBD_LAST_OPC) {
+               /* OBD Ping */
+               return (opc - OBD_FIRST_OPC +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < LLOG_LAST_OPC) {
+               /* LLOG Opcode */
+               return (opc - LLOG_FIRST_OPC +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < QUOTA_LAST_OPC) {
+               /* LQUOTA Opcode */
+               return (opc - QUOTA_FIRST_OPC +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < SEQ_LAST_OPC) {
+               /* SEQ opcode */
+               return (opc - SEQ_FIRST_OPC +
+                       OPC_RANGE(QUOTA) +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < SEC_LAST_OPC) {
+               /* SEC opcode */
+               return (opc - SEC_FIRST_OPC +
+                       OPC_RANGE(SEQ) +
+                       OPC_RANGE(QUOTA) +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < FLD_LAST_OPC) {
+               /* FLD opcode */
+                return (opc - FLD_FIRST_OPC +
+                       OPC_RANGE(SEC) +
+                       OPC_RANGE(SEQ) +
+                       OPC_RANGE(QUOTA) +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < UPDATE_LAST_OPC) {
+               /* update opcode */
+               return (opc - UPDATE_FIRST_OPC +
+                       OPC_RANGE(FLD) +
+                       OPC_RANGE(SEC) +
+                       OPC_RANGE(SEQ) +
+                       OPC_RANGE(QUOTA) +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else {
+               /* Unknown Opcode */
+               return -1;
+       }
+}
+
+
+#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST)  + \
+                           OPC_RANGE(MDS)  + \
+                           OPC_RANGE(LDLM) + \
+                           OPC_RANGE(MGS)  + \
+                           OPC_RANGE(OBD)  + \
+                           OPC_RANGE(LLOG) + \
+                           OPC_RANGE(QUOTA) + \
+                           OPC_RANGE(SEQ)  + \
+                           OPC_RANGE(SEC)  + \
+                           OPC_RANGE(FLD)  + \
+                           OPC_RANGE(UPDATE))
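
A short sketch (hypothetical caller, not part of the original patch) of how opcode_offset() and LUSTRE_MAX_OPCODES are meant to cooperate when a stats set keeps one counter slot per opcode, using lprocfs_counter_incr() defined later in this header:

/* Hypothetical illustration only. */
static void example_count_opcode(struct lprocfs_stats *svc_stats, __u32 opc)
{
        int offset = opcode_offset(opc);

        /* opcode_offset() returns -1 for unknown opcodes */
        if (offset < 0 || offset >= LUSTRE_MAX_OPCODES)
                return;
        lprocfs_counter_incr(svc_stats, offset);
}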
+
+#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR)  + \
+                           OPC_RANGE(EXTRA))
+
+enum {
+       PTLRPC_REQWAIT_CNTR = 0,
+       PTLRPC_REQQDEPTH_CNTR,
+       PTLRPC_REQACTIVE_CNTR,
+       PTLRPC_TIMEOUT,
+       PTLRPC_REQBUF_AVAIL_CNTR,
+       PTLRPC_LAST_CNTR
+};
+
+#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
+
+enum {
+       LDLM_GLIMPSE_ENQUEUE = 0,
+       LDLM_PLAIN_ENQUEUE,
+       LDLM_EXTENT_ENQUEUE,
+       LDLM_FLOCK_ENQUEUE,
+       LDLM_IBITS_ENQUEUE,
+       MDS_REINT_SETATTR,
+       MDS_REINT_CREATE,
+       MDS_REINT_LINK,
+       MDS_REINT_UNLINK,
+       MDS_REINT_RENAME,
+       MDS_REINT_OPEN,
+       MDS_REINT_SETXATTR,
+       BRW_READ_BYTES,
+       BRW_WRITE_BYTES,
+       EXTRA_LAST_OPC
+};
+
+#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
+/* class_obd.c */
+extern proc_dir_entry_t *proc_lustre_root;
+
+struct obd_device;
+struct obd_histogram;
+
+/* Days / hours / mins / seconds format */
+struct dhms {
+       int d,h,m,s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+       ts->d = secs / 86400;
+       secs = secs % 86400;
+       ts->h = secs / 3600;
+       secs = secs % 3600;
+       ts->m = secs / 60;
+       ts->s = secs % 60;
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
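
For illustration (hypothetical caller; assumes <linux/seq_file.h> and a start time recorded with get_seconds(); not part of the original patch), the dhms helpers above are used together like this when printing an elapsed time:

/* Hypothetical illustration only. */
static void example_print_age(struct seq_file *m, time_t start)
{
        struct dhms ts;

        s2dhms(&ts, (time_t)get_seconds() - start);
        seq_printf(m, "age: " DHMS_FMT "\n", DHMS_VARS(&ts));
}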
+
+#define JOBSTATS_JOBID_VAR_MAX_LEN     20
+#define JOBSTATS_DISABLE               "disable"
+#define JOBSTATS_PROCNAME_UID          "procname_uid"
+
+typedef void (*cntr_init_callback)(struct lprocfs_stats *stats);
+
+struct obd_job_stats {
+       cfs_hash_t      *ojs_hash;
+       struct list_head         ojs_list;
+       rwlock_t       ojs_lock; /* protect the obj_list */
+       cntr_init_callback ojs_cntr_init_fn;
+       int             ojs_cntr_num;
+       int             ojs_cleanup_interval;
+       time_t             ojs_last_cleanup;
+};
+
+#ifdef LPROCFS
+
+extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
+                                  unsigned int cpuid);
+/*
+ * \return cpu id (for LPROCFS_GET_SMP_ID) or the number of per-cpu slots
+ *         to walk (for LPROCFS_GET_NUM_CPU);
+ *         < 0 on error (only possible when opc is LPROCFS_GET_SMP_ID)
+ */
+static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc,
+                                    unsigned long *flags)
+{
+       int             rc = 0;
+
+       switch (opc) {
+       default:
+               LBUG();
+
+       case LPROCFS_GET_SMP_ID:
+               if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+                               spin_lock_irqsave(&stats->ls_lock, *flags);
+                       else
+                               spin_lock(&stats->ls_lock);
+                       return 0;
+               } else {
+                       unsigned int cpuid = get_cpu();
+
+                       if (unlikely(stats->ls_percpu[cpuid] == NULL)) {
+                               rc = lprocfs_stats_alloc_one(stats, cpuid);
+                               if (rc < 0) {
+                                       put_cpu();
+                                       return rc;
+                               }
+                       }
+                       return cpuid;
+               }
+
+       case LPROCFS_GET_NUM_CPU:
+               if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+                               spin_lock_irqsave(&stats->ls_lock, *flags);
+                       else
+                               spin_lock(&stats->ls_lock);
+                       return 1;
+               } else {
+                       return stats->ls_biggest_alloc_num;
+               }
+       }
+}
+
+static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc,
+                                       unsigned long *flags)
+{
+       switch (opc) {
+       default:
+               LBUG();
+
+       case LPROCFS_GET_SMP_ID:
+               if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+                               spin_unlock_irqrestore(&stats->ls_lock,
+                                                          *flags);
+                       } else {
+                               spin_unlock(&stats->ls_lock);
+                       }
+               } else {
+                       put_cpu();
+               }
+               return;
+
+       case LPROCFS_GET_NUM_CPU:
+               if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+                               spin_unlock_irqrestore(&stats->ls_lock,
+                                                          *flags);
+                       } else {
+                               spin_unlock(&stats->ls_lock);
+                       }
+               }
+               return;
+       }
+}
+
+static inline unsigned int
+lprocfs_stats_counter_size(struct lprocfs_stats *stats)
+{
+       unsigned int percpusize;
+
+       percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
+
+       /* irq safe stats need lc_array_sum[1] */
+       if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+               percpusize += stats->ls_num * sizeof(__s64);
+
+       if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
+               percpusize = L1_CACHE_ALIGN(percpusize);
+
+       return percpusize;
+}
+
+static inline struct lprocfs_counter *
+lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
+                         int index)
+{
+       struct lprocfs_counter *cntr;
+
+       cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];
+
+       if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+               cntr = (void *)cntr + index * sizeof(__s64);
+
+       return cntr;
+}
+
+/* Two optimized LPROCFS counter increment functions are provided:
+ *     lprocfs_counter_incr(stats, idx) - optimized for by-one counters
+ *     lprocfs_counter_add(stats, idx, amount) - use for multi-valued counters
+ * Counter data layout allows config flag, counter lock and the
+ * count itself to reside within a single cache line.
+ */
+
+extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
+                               long amount);
+extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
+                               long amount);
+
+#define lprocfs_counter_incr(stats, idx) \
+       lprocfs_counter_add(stats, idx, 1)
+#define lprocfs_counter_decr(stats, idx) \
+       lprocfs_counter_sub(stats, idx, 1)
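
A brief sketch (hypothetical stats set and counter indices, not part of the original patch) of the intended split between the two helpers:

/* Hypothetical illustration only. */
static void example_account_read(struct lprocfs_stats *stats, int calls_idx,
                                 int bytes_idx, long nob)
{
        /* by-one event counter */
        lprocfs_counter_incr(stats, calls_idx);
        /* multi-valued counter: add the number of bytes transferred */
        lprocfs_counter_add(stats, bytes_idx, nob);
}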
+
+extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+                                struct lprocfs_counter_header *header,
+                                enum lprocfs_stats_flags flags,
+                                enum lprocfs_fields_flags field);
+static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
+                                           int idx,
+                                           enum lprocfs_fields_flags field)
+{
+       int           i;
+       unsigned int  num_cpu;
+       unsigned long flags     = 0;
+       __u64         ret       = 0;
+
+       LASSERT(stats != NULL);
+
+       num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+       for (i = 0; i < num_cpu; i++) {
+               if (stats->ls_percpu[i] == NULL)
+                       continue;
+               ret += lprocfs_read_helper(
+                               lprocfs_stats_counter_get(stats, i, idx),
+                               &stats->ls_cnt_header[idx], stats->ls_flags,
+                               field);
+       }
+       lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+       return ret;
+}
+
+extern struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags);
+extern void lprocfs_clear_stats(struct lprocfs_stats *stats);
+extern void lprocfs_free_stats(struct lprocfs_stats **stats);
+extern void lprocfs_init_ops_stats(int num_private_stats,
+                                  struct lprocfs_stats *stats);
+extern void lprocfs_init_mps_stats(int num_private_stats,
+                                  struct lprocfs_stats *stats);
+extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
+extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+                                  unsigned int num_private_stats);
+extern int lprocfs_alloc_md_stats(struct obd_device *obddev,
+                                 unsigned int num_private_stats);
+extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+                                unsigned conf, const char *name,
+                                const char *units);
+extern void lprocfs_free_obd_stats(struct obd_device *obddev);
+extern void lprocfs_free_md_stats(struct obd_device *obddev);
+struct obd_export;
+struct nid_stat;
+extern int lprocfs_add_clear_entry(struct obd_device * obd,
+                                  proc_dir_entry_t *entry);
+extern int lprocfs_exp_setup(struct obd_export *exp,
+                            lnet_nid_t *peer_nid, int *newnid);
+extern int lprocfs_exp_cleanup(struct obd_export *exp);
+extern proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                               char *name,
+                                               read_proc_t *read_proc,
+                                               write_proc_t *write_proc,
+                                               void *data,
+                                               struct file_operations *fops);
+extern struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+                   const char *format, ...);
+extern void lprocfs_free_per_client_stats(struct obd_device *obd);
+extern int
+lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+                             unsigned long count, void *data);
+extern int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
+                                       int count, int *eof,  void *data);
+
+extern int lprocfs_register_stats(proc_dir_entry_t *root, const char *name,
+                                 struct lprocfs_stats *stats);
+
+/* lprocfs_status.c */
+extern int lprocfs_add_vars(proc_dir_entry_t *root,
+                           struct lprocfs_vars *var,
+                           void *data);
+
+extern proc_dir_entry_t *lprocfs_register(const char *name,
+                                             proc_dir_entry_t *parent,
+                                             struct lprocfs_vars *list,
+                                             void *data);
+
+extern void lprocfs_remove(proc_dir_entry_t **root);
+extern void lprocfs_remove_proc_entry(const char *name,
+                                     struct proc_dir_entry *parent);
+extern void lprocfs_try_remove_proc_entry(const char *name,
+                                         struct proc_dir_entry *parent);
+
+extern proc_dir_entry_t *lprocfs_srch(proc_dir_entry_t *root,
+                                         const char *name);
+
+extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
+extern int lprocfs_obd_cleanup(struct obd_device *obd);
+extern struct file_operations lprocfs_evict_client_fops;
+
+extern int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name,
+                             mode_t mode,
+                             const struct file_operations *seq_fops,
+                             void *data);
+extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
+                                 mode_t mode,
+                                 const struct file_operations *seq_fops,
+                                 void *data);
+
+/* Generic callbacks */
+
+extern int lprocfs_rd_u64(char *page, char **start, off_t off,
+                         int count, int *eof, void *data);
+extern int lprocfs_rd_atomic(char *page, char **start, off_t off,
+                            int count, int *eof, void *data);
+extern int lprocfs_wr_atomic(struct file *file, const char *buffer,
+                            unsigned long count, void *data);
+extern int lprocfs_rd_uint(char *page, char **start, off_t off,
+                          int count, int *eof, void *data);
+extern int lprocfs_wr_uint(struct file *file, const char *buffer,
+                          unsigned long count, void *data);
+extern int lprocfs_rd_uuid(char *page, char **start, off_t off,
+                          int count, int *eof, void *data);
+extern int lprocfs_rd_name(char *page, char **start, off_t off,
+                          int count, int *eof, void *data);
+extern int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data);
+extern int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
+                               int count, int *eof, void *data);
+extern int lprocfs_rd_import(char *page, char **start, off_t off, int count,
+                            int *eof, void *data);
+extern int lprocfs_rd_state(char *page, char **start, off_t off, int count,
+                           int *eof, void *data);
+extern int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+                                   int count, int *eof, void *data);
+extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data);
+extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
+                             int count, int *eof, void *data);
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(char *page, int count, int rc,
+                                 struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+                              int count, int *eof, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+                              unsigned long count, void *data);
+extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+                                  unsigned long count, void *data);
+extern int lprocfs_wr_ping(struct file *file, const char *buffer,
+                          unsigned long count, void *data);
+extern int lprocfs_wr_import(struct file *file, const char *buffer,
+                            unsigned long count, void *data);
+extern int lprocfs_rd_pinger_recov(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data);
+extern int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+                                  unsigned long count, void *data);
+
+/* Statfs helpers */
+extern int lprocfs_rd_blksize(char *page, char **start, off_t off,
+                             int count, int *eof, void *data);
+extern int lprocfs_rd_kbytestotal(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data);
+extern int lprocfs_rd_kbytesfree(char *page, char **start, off_t off,
+                                int count, int *eof, void *data);
+extern int lprocfs_rd_kbytesavail(char *page, char **start, off_t off,
+                                int count, int *eof, void *data);
+extern int lprocfs_rd_filestotal(char *page, char **start, off_t off,
+                                int count, int *eof, void *data);
+extern int lprocfs_rd_filesfree(char *page, char **start, off_t off,
+                               int count, int *eof, void *data);
+extern int lprocfs_rd_filegroups(char *page, char **start, off_t off,
+                                int count, int *eof, void *data);
+
+extern int lprocfs_write_helper(const char *buffer, unsigned long count,
+                               int *val);
+extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+                                    int *val, int mult);
+extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
+                                   long val, int mult);
+extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count,
+                                   __u64 *val);
+extern int lprocfs_write_frac_u64_helper(const char *buffer,
+                                        unsigned long count,
+                                        __u64 *val, int mult);
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+                               unsigned long *count);
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_clear(struct obd_histogram *oh);
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
+
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+                          struct lprocfs_counter *cnt);
+
+/* lprocfs_status.c: recovery status */
+int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data);
+
+/* lprocfs_status.c: hash statistics */
+int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
+                       int count, int *eof, void *data);
+
+/* lprocfs_status.c: IR factor */
+int lprocfs_obd_rd_ir_factor(char *page, char **start, off_t off,
+                            int count, int *eof, void *data);
+int lprocfs_obd_wr_ir_factor(struct file *file, const char *buffer,
+                            unsigned long count, void *data);
+
+extern int lprocfs_single_release(cfs_inode_t *, struct file *);
+extern int lprocfs_seq_release(cfs_inode_t *, struct file *);
+
+/* You must use these macros when you want to refer to
+ * the import in a client obd_device for a lprocfs entry */
+#define LPROCFS_CLIMP_CHECK(obd) do {     \
+       typecheck(struct obd_device *, obd);    \
+       down_read(&(obd)->u.cli.cl_sem);    \
+       if ((obd)->u.cli.cl_import == NULL) {   \
+            up_read(&(obd)->u.cli.cl_sem); \
+            return -ENODEV;                \
+       }                                      \
+} while(0)
+#define LPROCFS_CLIMP_EXIT(obd)                 \
+       up_read(&(obd)->u.cli.cl_sem);
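
An illustrative read handler (hypothetical, not part of the original patch) showing the required pairing of LPROCFS_CLIMP_CHECK() and LPROCFS_CLIMP_EXIT() around any access to the client import:

/* Hypothetical illustration only. */
static int example_rd_import_ptr(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
{
        struct obd_device *obd = data;
        int rc;

        LPROCFS_CLIMP_CHECK(obd);
        rc = snprintf(page, count, "import: %p\n", obd->u.cli.cl_import);
        LPROCFS_CLIMP_EXIT(obd);
        return rc;
}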
+
+
+/* Write the name##_seq_show function and call LPROC_SEQ_FOPS_RO for a
+  read-only proc entry; for a read-write proc entry, also define the
+  name##_seq_write function and call LPROC_SEQ_FOPS instead. Finally,
+  call lprocfs_obd_seq_create(obd, filename, 0444, &name##_fops, data); */
+#define __LPROC_SEQ_FOPS(name, custom_seq_write)                          \
+static int name##_single_open(cfs_inode_t *inode, struct file *file) {     \
+       struct proc_dir_entry *dp = PDE(inode);                     \
+       int rc;                                                     \
+       LPROCFS_ENTRY_AND_CHECK(dp);                                   \
+       rc = single_open(file, name##_seq_show, dp->data);               \
+       if (rc) {                                                         \
+               LPROCFS_EXIT();                                     \
+               return rc;                                               \
+       }                                                                 \
+       return 0;                                                         \
+}                                                                        \
+struct file_operations name##_fops = {                              \
+       .owner   = THIS_MODULE,                                     \
+       .open    = name##_single_open,                               \
+       .read    = seq_read,                                           \
+       .write   = custom_seq_write,                                   \
+       .llseek  = seq_lseek,                                         \
+       .release = lprocfs_single_release,                               \
+}
+
+#define LPROC_SEQ_FOPS_RO(name)         __LPROC_SEQ_FOPS(name, NULL)
+#define LPROC_SEQ_FOPS(name)       __LPROC_SEQ_FOPS(name, name##_seq_write)
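
A sketch of the pattern described in the comment above (hypothetical entry name; assumes the usual obd_name field of struct obd_device; not part of the original patch), using LPROC_SEQ_FOPS_RO() together with lprocfs_obd_seq_create():

/* Hypothetical illustration only. */
static int example_name_seq_show(struct seq_file *m, void *unused)
{
        struct obd_device *obd = m->private;

        seq_printf(m, "%s\n", obd->obd_name);
        return 0;
}
LPROC_SEQ_FOPS_RO(example_name);

/* registration, e.g. from an obd setup path */
static int example_register(struct obd_device *obd)
{
        return lprocfs_obd_seq_create(obd, "example_name", 0444,
                                      &example_name_fops, obd);
}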
+
+/* lprocfs_jobstats.c */
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+                         int event, long amount);
+void lprocfs_job_stats_fini(struct obd_device *obd);
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+                          cntr_init_callback fn);
+int lprocfs_rd_job_interval(char *page, char **start, off_t off,
+                           int count, int *eof, void *data);
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+                           unsigned long count, void *data);
+
+/* lproc_ptlrpc.c */
+struct ptlrpc_request;
+extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
+
+/* lproc_status.c */
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_soft(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count, void *data);
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_hard(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count, void *data);
+int lprocfs_obd_rd_max_pages_per_rpc(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data);
+int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer,
+                                    unsigned long count, void *data);
+int lprocfs_target_rd_instance(char *page, char **start, off_t off,
+                              int count, int *eof, void *data);
+
+/* all quota proc functions */
+extern int lprocfs_quota_rd_bunit(char *page, char **start,
+                                 off_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_btune(char *page, char **start,
+                                 off_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_iunit(char *page, char **start,
+                                 off_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_itune(char *page, char **start,
+                                 off_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count,
+                                int *eof, void *data);
+extern int lprocfs_quota_wr_type(struct file *file, const char *buffer,
+                                unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off,
+                                          int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_seconds(struct file *file,
+                                          const char *buffer,
+                                          unsigned long count, void *data);
+extern int lprocfs_quota_rd_sync_blk(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data);
+extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
+                                    unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_qs(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_qs(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count, void *data);
+extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, off_t off,
+                                           int count, int *eof, void *data);
+extern int lprocfs_quota_wr_boundary_factor(struct file *file,
+                                           const char *buffer,
+                                           unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_bunit(char *page, char **start, off_t off,
+                                       int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_bunit(struct file *file,
+                                       const char *buffer,
+                                       unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_iunit(char *page, char **start, off_t off,
+                                       int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_iunit(struct file *file,
+                                       const char *buffer,
+                                       unsigned long count, void *data);
+extern int lprocfs_quota_rd_qs_factor(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data);
+extern int lprocfs_quota_wr_qs_factor(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count, void *data);
+
+
+
+#else
+/* LPROCFS is not defined */
+
+#define proc_lustre_root NULL
+
+static inline void lprocfs_counter_add(struct lprocfs_stats *stats,
+                                      int index, long amount)
+{ return; }
+static inline void lprocfs_counter_incr(struct lprocfs_stats *stats,
+                                       int index)
+{ return; }
+static inline void lprocfs_counter_sub(struct lprocfs_stats *stats,
+                                      int index, long amount)
+{ return; }
+static inline void lprocfs_counter_decr(struct lprocfs_stats *stats,
+                                       int index)
+{ return; }
+static inline void lprocfs_counter_init(struct lprocfs_stats *stats,
+                                       int index, unsigned conf,
+                                       const char *name, const char *units)
+{ return; }
+
+static inline __u64 lc_read_helper(struct lprocfs_counter *lc,
+                                  enum lprocfs_fields_flags field)
+{ return 0; }
+
+/* NB: we return !NULL to satisfy error checker */
+static inline struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags)
+{ return (struct lprocfs_stats *)1; }
+static inline void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_free_stats(struct lprocfs_stats **stats)
+{ return; }
+static inline int lprocfs_register_stats(proc_dir_entry_t *root,
+                                        const char *name,
+                                        struct lprocfs_stats *stats)
+{ return 0; }
+static inline void lprocfs_init_ops_stats(int num_private_stats,
+                                         struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_mps_stats(int num_private_stats,
+                                         struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{ return; }
+static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+                                         unsigned int num_private_stats)
+{ return 0; }
+static inline int lprocfs_alloc_md_stats(struct obd_device *obddev,
+                                        unsigned int num_private_stats)
+{ return 0; }
+static inline void lprocfs_free_obd_stats(struct obd_device *obddev)
+{ return; }
+static inline void lprocfs_free_md_stats(struct obd_device *obddev)
+{ return; }
+
+struct obd_export;
+static inline int lprocfs_add_clear_entry(struct obd_export *exp)
+{ return 0; }
+static inline int lprocfs_exp_setup(struct obd_export *exp,
+                                   lnet_nid_t *peer_nid, int *newnid)
+{ return 0; }
+static inline int lprocfs_exp_cleanup(struct obd_export *exp)
+{ return 0; }
+static inline proc_dir_entry_t *
+lprocfs_add_simple(struct proc_dir_entry *root, char *name,
+                  read_proc_t *read_proc, write_proc_t *write_proc,
+                  void *data, struct file_operations *fops)
+{ return NULL; }
+static inline struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+                   const char *format, ...)
+{ return NULL; }
+static inline void lprocfs_free_per_client_stats(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{ return count; }
+static inline
+int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
+                                int count, int *eof,  void *data)
+{ return count; }
+
+static inline proc_dir_entry_t *
+lprocfs_register(const char *name, proc_dir_entry_t *parent,
+                struct lprocfs_vars *list, void *data)
+{ return NULL; }
+static inline int lprocfs_add_vars(proc_dir_entry_t *root,
+                                  struct lprocfs_vars *var,
+                                  void *data)
+{ return 0; }
+static inline void lprocfs_remove(proc_dir_entry_t **root)
+{ return; }
+static inline void lprocfs_remove_proc_entry(const char *name,
+                                            struct proc_dir_entry *parent)
+{ return; }
+static inline void lprocfs_try_remove_proc_entry(const char *name,
+                                                struct proc_dir_entry *parent)
+{ return; }
+static inline proc_dir_entry_t *lprocfs_srch(proc_dir_entry_t *head,
+                                                const char *name)
+{ return NULL; }
+static inline int lprocfs_obd_setup(struct obd_device *dev,
+                                   struct lprocfs_vars *list)
+{ return 0; }
+static inline int lprocfs_obd_cleanup(struct obd_device *dev)
+{ return 0; }
+static inline int lprocfs_rd_u64(char *page, char **start, off_t off,
+                                int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_uuid(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_name(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
+                                        int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_import(char *page, char **start, off_t off,
+                                   int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_pinger_recov(char *page, char **start, off_t off,
+                                         int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_state(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+                                          int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off,
+                                        int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{ return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(char *page, int count, int rc,
+                                        struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_wr_timeouts(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_evict_client(struct file *file,
+                                         const char *buffer,
+                                         unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_ping(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_import(struct file *file, const char *buffer,
+                                   unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+                                   unsigned long count, void *data)
+{ return 0; }
+
+/* Statfs helpers */
+static inline
+int lprocfs_rd_blksize(char *page, char **start, off_t off,
+                      int count, int *eof, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytestotal(char *page, char **start, off_t off,
+                          int count, int *eof, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesfree(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesavail(char *page, char **start, off_t off,
+                          int count, int *eof, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filestotal(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filesfree(char *page, char **start, off_t off,
+                        int count, int *eof, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filegroups(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{ return 0; }
+static inline
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{ return; }
+static inline
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{ return 0; }
+static inline
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+                          struct lprocfs_counter *cnt)
+{ return; }
+static inline
+__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
+                              enum lprocfs_fields_flags field)
+{ return (__u64)0; }
+
+#define LPROC_SEQ_FOPS_RO(name)
+#define LPROC_SEQ_FOPS(name)
+
+/* lprocfs_jobstats.c */
+static inline
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event,
+                         long amount)
+{ return 0; }
+static inline
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+                          cntr_init_callback fn)
+{ return 0; }
+
+
+/* lproc_ptlrpc.c */
+#define target_print_req NULL
+
+#endif /* LPROCFS */
+
+#endif /* LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
new file mode 100644 (file)
index 0000000..4bd11bb
--- /dev/null
@@ -0,0 +1,1346 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_LU_OBJECT_H
+#define __LUSTRE_LU_OBJECT_H
+
+#include <stdarg.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+struct lprocfs_stats;
+
+/** \defgroup lu lu
+ * lu_* data-types represent server-side entities shared by data and meta-data
+ * stacks.
+ *
+ * Design goals:
+ *
+ * -# support for layering.
+ *
+ *     Server side object is split into layers, one per device in the
+ *     corresponding device stack. Individual layer is represented by struct
+ *     lu_object. Compound layered object --- by struct lu_object_header. Most
+ *     interface functions take lu_object as an argument and operate on the
+ *     whole compound object. This decision was made due to the following
+ *     reasons:
+ *
+ *     - it's envisaged that lu_object will be used much more often than
+ *     lu_object_header;
+ *
+ *     - we want lower (non-top) layers to be able to initiate operations
+ *     on the whole object.
+ *
+ *     Generic code supports layering more complex than simple stacking, e.g.,
+ *     it is possible that at some layer object "spawns" multiple sub-objects
+ *     on the lower layer.
+ *
+ * -# fid-based identification.
+ *
+ *     A compound object is uniquely identified by its fid. Objects are
+ *     indexed by their fids (a hash table is used for the index).
+ *
+ * -# caching and life-cycle management.
+ *
+ *     Object's life-time is controlled by reference counting. When reference
+ *     count drops to 0, object is returned to cache. Cached objects still
+ *     retain their identity (i.e., fid), and can be recovered from cache.
+ *
+ *     Objects are kept in the global LRU list, and lu_site_purge() function
+ *     can be used to reclaim given number of unused objects from the tail of
+ *     the LRU.
+ *
+ * -# avoiding recursion.
+ *
+ *     Generic code tries to replace recursion through layers with iteration
+ *     where possible. Additionally, to reduce stack consumption, data are,
+ *     whenever practical, allocated through the lu_context_key interface
+ *     rather than on the stack.
+ * @{
+ */
+
+struct lu_site;
+struct lu_object;
+struct lu_device;
+struct lu_object_header;
+struct lu_context;
+struct lu_env;
+
+/**
+ * Operations common for data and meta-data devices.
+ */
+struct lu_device_operations {
+       /**
+        * Allocate object for the given device (without lower-layer
+        * parts). This is called by lu_object_operations::loo_object_init()
+        * from the parent layer, and should setup at least lu_object::lo_dev
+        * and lu_object::lo_ops fields of resulting lu_object.
+        *
+        * Object creation protocol.
+        *
+        * Due to design goal of avoiding recursion, object creation (see
+        * lu_object_alloc()) is somewhat involved:
+        *
+        *  - first, lu_device_operations::ldo_object_alloc() method of the
+        *  top-level device in the stack is called. It should allocate top
+        *  level object (including lu_object_header), but without any
+        *  lower-layer sub-object(s).
+        *
+        *  - then lu_object_alloc() sets fid in the header of newly created
+        *  object.
+        *
+        *  - then lu_object_operations::loo_object_init() is called. It has
+        *  to allocate lower-layer object(s). To do this,
+        *  lu_object_operations::loo_object_init() calls ldo_object_alloc()
+        *  of the lower-layer device(s).
+        *
+        *  - for all new objects allocated by
+        *  lu_object_operations::loo_object_init() (and inserted into object
+        *  stack), lu_object_operations::loo_object_init() is called again
+        *  repeatedly, until no new objects are created.
+        *
+        * \post ergo(!IS_ERR(result), result->lo_dev == d &&
+        *                           result->lo_ops != NULL);
+        */
+       struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
+                                             const struct lu_object_header *h,
+                                             struct lu_device *d);
+       /**
+        * process config specific for device.
+        */
+       int (*ldo_process_config)(const struct lu_env *env,
+                                 struct lu_device *, struct lustre_cfg *);
+       int (*ldo_recovery_complete)(const struct lu_env *,
+                                    struct lu_device *);
+
+       /**
+        * Initialize local objects for the device. This method is called
+        * after the layer has been initialized (after the LCFG_SETUP stage)
+        * and before it starts serving user requests.
+        */
+
+       int (*ldo_prepare)(const struct lu_env *,
+                          struct lu_device *parent,
+                          struct lu_device *dev);
+
+};
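+
+/*
+ * A minimal sketch of lu_device_operations::ldo_object_alloc() for a
+ * hypothetical non-top "foo" layer (all foo_* names are illustrative):
+ *
+ *     static struct lu_object *foo_object_alloc(const struct lu_env *env,
+ *                                     const struct lu_object_header *h,
+ *                                     struct lu_device *d)
+ *     {
+ *             struct foo_object *obj;
+ *
+ *             OBD_ALLOC_PTR(obj);
+ *             if (obj == NULL)
+ *                     return NULL;
+ *             lu_object_init(&obj->fo_obj, NULL, d);
+ *             obj->fo_obj.lo_ops = &foo_lu_obj_ops;
+ *             return &obj->fo_obj;
+ *     }
+ */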
+
+/**
+ * For lu_object_conf flags
+ */
+typedef enum {
+       /* This is a new object to be allocated, or the file
+        * corresponding to the object does not exist. */
+       LOC_F_NEW       = 0x00000001,
+} loc_flags_t;
+
+/**
+ * Object configuration, describing particulars of the object being created.
+ * On the server this is not used, as server objects are fully identified by
+ * their fid. On the client the configuration contains struct lustre_md.
+ */
+struct lu_object_conf {
+       /**
+        * Some hints for obj find and alloc.
+        */
+       loc_flags_t     loc_flags;
+};
+
+/**
+ * Type of "printer" function used by lu_object_operations::loo_object_print()
+ * method.
+ *
+ * Printer function is needed to provide some flexibility in (semi-)debugging
+ * output: possible implementations: printk, CDEBUG, sysfs/seq_file
+ */
+typedef int (*lu_printer_t)(const struct lu_env *env,
+                           void *cookie, const char *format, ...)
+       __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Operations specific for particular lu_object.
+ */
+struct lu_object_operations {
+
+       /**
+        * Allocate lower-layer parts of the object by calling
+        * lu_device_operations::ldo_object_alloc() of the corresponding
+        * underlying device.
+        *
+        * This method is called once for each object inserted into the object
+        * stack. It is the responsibility of this method to insert the
+        * lower-layer object(s) it creates into the appropriate places of the
+        * object stack.
+        */
+       int (*loo_object_init)(const struct lu_env *env,
+                              struct lu_object *o,
+                              const struct lu_object_conf *conf);
+       /**
+        * Called (in top-to-bottom order) during object allocation after all
+        * layers were allocated and initialized. Can be used to perform
+        * initialization depending on lower layers.
+        */
+       int (*loo_object_start)(const struct lu_env *env,
+                               struct lu_object *o);
+       /**
+        * Called before lu_object_operations::loo_object_free() to signal
+        * that object is being destroyed. Dual to
+        * lu_object_operations::loo_object_init().
+        */
+       void (*loo_object_delete)(const struct lu_env *env,
+                                 struct lu_object *o);
+       /**
+        * Dual to lu_device_operations::ldo_object_alloc(). Called when
+        * object is removed from memory.
+        */
+       void (*loo_object_free)(const struct lu_env *env,
+                               struct lu_object *o);
+       /**
+        * Called when last active reference to the object is released (and
+        * object returns to the cache). This method is optional.
+        */
+       void (*loo_object_release)(const struct lu_env *env,
+                                  struct lu_object *o);
+       /**
+        * Optional debugging helper. Print given object.
+        */
+       int (*loo_object_print)(const struct lu_env *env, void *cookie,
+                               lu_printer_t p, const struct lu_object *o);
+       /**
+        * Optional debugging method. Returns true iff the object is
+        * internally consistent.
+        */
+       int (*loo_object_invariant)(const struct lu_object *o);
+};
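+
+/*
+ * Sketch of lu_object_operations::loo_object_init() for a layer with a
+ * single device below it; foo2next() and the other foo_* names are assumed
+ * helpers:
+ *
+ *     static int foo_object_init(const struct lu_env *env,
+ *                                struct lu_object *o,
+ *                                const struct lu_object_conf *conf)
+ *     {
+ *             struct lu_device *under = foo2next(o->lo_dev);
+ *             struct lu_object *below;
+ *
+ *             below = under->ld_ops->ldo_object_alloc(env, o->lo_header,
+ *                                                     under);
+ *             if (below == NULL)
+ *                     return -ENOMEM;
+ *             lu_object_add(o, below);
+ *             return 0;
+ *     }
+ */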
+
+/**
+ * Type of lu_device.
+ */
+struct lu_device_type;
+
+/**
+ * Device: a layer in the server side abstraction stacking.
+ */
+struct lu_device {
+       /**
+        * reference count. This is incremented, in particular, on each object
+        * created at this layer.
+        *
+        * \todo XXX which means that atomic_t is probably too small.
+        */
+       atomic_t                       ld_ref;
+       /**
+        * Pointer to device type. Never modified once set.
+        */
+       struct lu_device_type       *ld_type;
+       /**
+        * Operation vector for this device.
+        */
+       const struct lu_device_operations *ld_ops;
+       /**
+        * Stack this device belongs to.
+        */
+       struct lu_site              *ld_site;
+       struct proc_dir_entry        *ld_proc_entry;
+
+       /** \todo XXX: temporary back pointer into obd. */
+       struct obd_device                *ld_obd;
+       /**
+        * A list of references to this object, for debugging.
+        */
+       struct lu_ref                 ld_reference;
+       /**
+        * Link the device to the site.
+        **/
+       struct list_head                         ld_linkage;
+};
+
+struct lu_device_type_operations;
+
+/**
+ * Tag bits for device type. They are used to distinguish certain groups of
+ * device types.
+ */
+enum lu_device_tag {
+       /** this is meta-data device */
+       LU_DEVICE_MD = (1 << 0),
+       /** this is data device */
+       LU_DEVICE_DT = (1 << 1),
+       /** data device in the client stack */
+       LU_DEVICE_CL = (1 << 2)
+};
+
+/**
+ * Type of device.
+ */
+struct lu_device_type {
+       /**
+        * Tag bits. Taken from enum lu_device_tag. Never modified once set.
+        */
+       __u32                              ldt_tags;
+       /**
+        * Name of this class. Unique system-wide. Never modified once set.
+        */
+       char                               *ldt_name;
+       /**
+        * Operations for this type.
+        */
+       const struct lu_device_type_operations *ldt_ops;
+       /**
+        * \todo XXX: temporary pointer to associated obd_type.
+        */
+       struct obd_type                 *ldt_obd_type;
+       /**
+        * \todo XXX: temporary: context tags used by obd_*() calls.
+        */
+       __u32                              ldt_ctx_tags;
+       /**
+        * Number of existing device type instances.
+        */
+       unsigned                                ldt_device_nr;
+       /**
+        * Linkage into a global list of all device types.
+        *
+        * \see lu_device_types.
+        */
+       struct list_head                              ldt_linkage;
+};
+
+/**
+ * Operations on a device type.
+ */
+struct lu_device_type_operations {
+       /**
+        * Allocate new device.
+        */
+       struct lu_device *(*ldto_device_alloc)(const struct lu_env *env,
+                                              struct lu_device_type *t,
+                                              struct lustre_cfg *lcfg);
+       /**
+        * Free device. Dual to
+        * lu_device_type_operations::ldto_device_alloc(). Returns pointer to
+        * the next device in the stack.
+        */
+       struct lu_device *(*ldto_device_free)(const struct lu_env *,
+                                             struct lu_device *);
+
+       /**
+        * Initialize the devices after allocation
+        */
+       int  (*ldto_device_init)(const struct lu_env *env,
+                                struct lu_device *, const char *,
+                                struct lu_device *);
+       /**
+        * Finalize device. Dual to
+        * lu_device_type_operations::ldto_device_init(). Returns pointer to
+        * the next device in the stack.
+        */
+       struct lu_device *(*ldto_device_fini)(const struct lu_env *env,
+                                             struct lu_device *);
+       /**
+        * Initialize device type. This is called on module load.
+        */
+       int  (*ldto_init)(struct lu_device_type *t);
+       /**
+        * Finalize device type. Dual to
+        * lu_device_type_operations::ldto_init(). Called on module unload.
+        */
+       void (*ldto_fini)(struct lu_device_type *t);
+       /**
+        * Called when the first device is created.
+        */
+       void (*ldto_start)(struct lu_device_type *t);
+       /**
+        * Called when number of devices drops to 0.
+        */
+       void (*ldto_stop)(struct lu_device_type *t);
+};
+
+static inline int lu_device_is_md(const struct lu_device *d)
+{
+       return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD);
+}
+
+/**
+ * Flags for the object layers.
+ */
+enum lu_object_flags {
+       /**
+        * This flag is set if lu_object_operations::loo_object_init() has
+        * been called for this layer. Used by lu_object_alloc().
+        */
+       LU_OBJECT_ALLOCATED = (1 << 0)
+};
+
+/**
+ * Common object attributes.
+ */
+struct lu_attr {
+       /** size in bytes */
+       __u64     la_size;
+       /** modification time in seconds since Epoch */
+       obd_time       la_mtime;
+       /** access time in seconds since Epoch */
+       obd_time       la_atime;
+       /** change time in seconds since Epoch */
+       obd_time       la_ctime;
+       /** 512-byte blocks allocated to object */
+       __u64     la_blocks;
+       /** permission bits and file type */
+       __u32     la_mode;
+       /** owner id */
+       __u32     la_uid;
+       /** group id */
+       __u32     la_gid;
+       /** object flags */
+       __u32     la_flags;
+       /** number of persistent references to this object */
+       __u32     la_nlink;
+       /** blk bits of the object*/
+       __u32     la_blkbits;
+       /** blk size of the object*/
+       __u32     la_blksize;
+       /** real device */
+       __u32     la_rdev;
+       /**
+        * valid bits
+        *
+        * \see enum la_valid
+        */
+       __u64     la_valid;
+};
+
+/** Bit-mask of valid attributes */
+enum la_valid {
+       LA_ATIME = 1 << 0,
+       LA_MTIME = 1 << 1,
+       LA_CTIME = 1 << 2,
+       LA_SIZE  = 1 << 3,
+       LA_MODE  = 1 << 4,
+       LA_UID   = 1 << 5,
+       LA_GID   = 1 << 6,
+       LA_BLOCKS = 1 << 7,
+       LA_TYPE   = 1 << 8,
+       LA_FLAGS  = 1 << 9,
+       LA_NLINK  = 1 << 10,
+       LA_RDEV   = 1 << 11,
+       LA_BLKSIZE = 1 << 12,
+       LA_KILL_SUID = 1 << 13,
+       LA_KILL_SGID = 1 << 14,
+};
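+
+/*
+ * Sketch: an attribute update is described by filling in the fields of
+ * interest and marking them in la_valid ("attr" is a caller-owned variable):
+ *
+ *     struct lu_attr attr = { 0 };
+ *
+ *     attr.la_mode  = S_IFREG | 0644;
+ *     attr.la_valid = LA_MODE | LA_TYPE;
+ */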
+
+/**
+ * Layer in the layered object.
+ */
+struct lu_object {
+       /**
+        * Header for this object.
+        */
+       struct lu_object_header    *lo_header;
+       /**
+        * Device for this layer.
+        */
+       struct lu_device                  *lo_dev;
+       /**
+        * Operations for this object.
+        */
+       const struct lu_object_operations *lo_ops;
+       /**
+        * Linkage into list of all layers.
+        */
+       struct list_head                         lo_linkage;
+       /**
+        * Depth. Top level layer depth is 0.
+        */
+       int                             lo_depth;
+       /**
+        * Flags from enum lu_object_flags.
+        */
+       __u32                                   lo_flags;
+       /**
+        * Link to the device, for debugging.
+        */
+       struct lu_ref_link              *lo_dev_ref;
+};
+
+enum lu_object_header_flags {
+       /**
+        * Don't keep this object in cache. Object will be destroyed as soon
+        * as last reference to it is released. This flag cannot be cleared
+        * once set.
+        */
+       LU_OBJECT_HEARD_BANSHEE = 0,
+       /**
+        * Mark that this object has already been taken out of the cache.
+        */
+       LU_OBJECT_UNHASHED = 1
+};
+
+enum lu_object_header_attr {
+       LOHA_EXISTS   = 1 << 0,
+       LOHA_REMOTE   = 1 << 1,
+       /**
+        * UNIX file type is stored in S_IFMT bits.
+        */
+       LOHA_FT_START = 001 << 12, /**< S_IFIFO */
+       LOHA_FT_END   = 017 << 12, /**< S_IFMT */
+};
+
+/**
+ * "Compound" object, consisting of multiple layers.
+ *
+ * A compound object with a given fid is unique within a given lu_site.
+ *
+ * Note that an object does *not* necessarily correspond to a real object in
+ * persistent storage: an object is an anchor for locking and method calling,
+ * so it is created for things like a not-yet-existing child created by mkdir
+ * or create calls. lu_object_exists() can be used to check whether an object
+ * is backed by a persistent storage entity.
+ */
+struct lu_object_header {
+       /**
+        * Object flags from enum lu_object_header_flags. Set and checked
+        * atomically.
+        */
+       unsigned long     loh_flags;
+       /**
+        * Object reference count. Protected by lu_site::ls_guard.
+        */
+       atomic_t           loh_ref;
+       /**
+        * Fid, uniquely identifying this object.
+        */
+       struct lu_fid     loh_fid;
+       /**
+        * Common object attributes, cached for efficiency. From enum
+        * lu_object_header_attr.
+        */
+       __u32             loh_attr;
+       /**
+        * Linkage into per-site hash table. Protected by lu_site::ls_guard.
+        */
+       struct hlist_node       loh_hash;
+       /**
+        * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
+        */
+       struct list_head             loh_lru;
+       /**
+        * Linkage into the list of layers. Never modified once set (except at
+        * the very end, during object destruction). No locking is necessary.
+        */
+       struct list_head             loh_layers;
+       /**
+        * A list of references to this object, for debugging.
+        */
+       struct lu_ref     loh_reference;
+};
+
+struct fld;
+
+struct lu_site_bkt_data {
+       /**
+        * number of busy object on this bucket
+        * number of busy objects on this bucket
+       long                  lsb_busy;
+       /**
+        * LRU list, updated on each access to object. Protected by
+        * bucket lock of lu_site::ls_obj_hash.
+        *
+        * "Cold" end of LRU is lu_site::ls_lru.next. Accessed objects are
+        * moved to the lu_site::ls_lru.prev (this is due to the non-existence
+        * of list_for_each_entry_safe_reverse()).
+        */
+       struct list_head                lsb_lru;
+       /**
+        * Wait-queue signaled when an object in this site is ultimately
+        * destroyed (lu_object_free()). It is used by lu_object_find() to
+        * wait before re-trying when object in the process of destruction is
+        * found in the hash table.
+        *
+        * \see htable_lookup().
+        */
+       wait_queue_head_t              lsb_marche_funebre;
+};
+
+enum {
+       LU_SS_CREATED    = 0,
+       LU_SS_CACHE_HIT,
+       LU_SS_CACHE_MISS,
+       LU_SS_CACHE_RACE,
+       LU_SS_CACHE_DEATH_RACE,
+       LU_SS_LRU_PURGED,
+       LU_SS_LAST_STAT
+};
+
+/**
+ * lu_site is a "compartment" within which objects are unique, and LRU
+ * discipline is maintained.
+ *
+ * lu_site exists so that multiple layered stacks can co-exist in the same
+ * address space.
+ *
+ * lu_site has the same relation to lu_device as lu_object_header to
+ * lu_object.
+ */
+struct lu_site {
+       /**
+        * objects hash table
+        */
+       cfs_hash_t             *ls_obj_hash;
+       /**
+        * index of bucket on hash table while purging
+        */
+       int                    ls_purge_start;
+       /**
+        * Top-level device for this stack.
+        */
+       struct lu_device         *ls_top_dev;
+       /**
+        * Bottom-level device for this stack
+        */
+       struct lu_device        *ls_bottom_dev;
+       /**
+        * Linkage into global list of sites.
+        */
+       struct list_head                ls_linkage;
+       /**
+        * List of lu devices for this site, protected
+        * by ls_ld_lock.
+        **/
+       struct list_head                ls_ld_linkage;
+       spinlock_t              ls_ld_lock;
+
+       /**
+        * lu_site stats
+        */
+       struct lprocfs_stats    *ls_stats;
+       /**
+        * XXX: a hack! fld has to find md_site via site, remove when possible
+        */
+       struct seq_server_site  *ld_seq_site;
+};
+
+static inline struct lu_site_bkt_data *
+lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid)
+{
+       cfs_hash_bd_t bd;
+
+       cfs_hash_bd_get(site->ls_obj_hash, fid, &bd);
+       return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+}
+
+/** \name ctors
+ * Constructors/destructors.
+ * @{
+ */
+
+int  lu_site_init       (struct lu_site *s, struct lu_device *d);
+void lu_site_fini       (struct lu_site *s);
+int  lu_site_init_finish  (struct lu_site *s);
+void lu_stack_fini     (const struct lu_env *env, struct lu_device *top);
+void lu_device_get     (struct lu_device *d);
+void lu_device_put     (struct lu_device *d);
+int  lu_device_init       (struct lu_device *d, struct lu_device_type *t);
+void lu_device_fini       (struct lu_device *d);
+int  lu_object_header_init(struct lu_object_header *h);
+void lu_object_header_fini(struct lu_object_header *h);
+int  lu_object_init       (struct lu_object *o,
+                          struct lu_object_header *h, struct lu_device *d);
+void lu_object_fini       (struct lu_object *o);
+void lu_object_add_top    (struct lu_object_header *h, struct lu_object *o);
+void lu_object_add     (struct lu_object *before, struct lu_object *o);
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d);
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d);
+
+/**
+ * Helpers to initialize and finalize device types.
+ */
+
+int  lu_device_type_init(struct lu_device_type *ldt);
+void lu_device_type_fini(struct lu_device_type *ldt);
+void lu_types_stop(void);
+
+/** @} ctors */
+
+/** \name caching
+ * Caching and reference counting.
+ * @{
+ */
+
+/**
+ * Acquire an additional reference to the given object. To acquire the
+ * initial reference, use lu_object_find().
+ */
+static inline void lu_object_get(struct lu_object *o)
+{
+       LASSERT(atomic_read(&o->lo_header->loh_ref) > 0);
+       atomic_inc(&o->lo_header->loh_ref);
+}
+
+/**
+ * Return true if the object will not be cached after the last reference to
+ * it is released.
+ */
+static inline int lu_object_is_dying(const struct lu_object_header *h)
+{
+       return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags);
+}
+
+void lu_object_put(const struct lu_env *env, struct lu_object *o);
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o);
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o);
+
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr);
+
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+                  lu_printer_t printer);
+struct lu_object *lu_object_find(const struct lu_env *env,
+                                struct lu_device *dev, const struct lu_fid *f,
+                                const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+                                   struct lu_device *dev,
+                                   const struct lu_fid *f,
+                                   const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+                                      struct lu_device *dev,
+                                      const struct lu_fid *f,
+                                      const struct lu_object_conf *conf);
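+
+/*
+ * Typical lookup/release cycle (sketch; "dev" and "fid" are supplied by the
+ * caller, and no special configuration is passed):
+ *
+ *     struct lu_object *o;
+ *
+ *     o = lu_object_find(env, dev, fid, NULL);
+ *     if (IS_ERR(o))
+ *             return PTR_ERR(o);
+ *     ...use the object, e.g. locate a slice with lu_object_locate()...
+ *     lu_object_put(env, o);
+ */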
+/** @} caching */
+
+/** \name helpers
+ * Helpers.
+ * @{
+ */
+
+/**
+ * First (topmost) sub-object of given compound object
+ */
+static inline struct lu_object *lu_object_top(struct lu_object_header *h)
+{
+       LASSERT(!list_empty(&h->loh_layers));
+       return container_of0(h->loh_layers.next, struct lu_object, lo_linkage);
+}
+
+/**
+ * Next sub-object in the layering
+ */
+static inline struct lu_object *lu_object_next(const struct lu_object *o)
+{
+       return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage);
+}
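+
+/*
+ * Sketch: visiting every layer of a compound object, top to bottom ("scan"
+ * is a local variable of the caller):
+ *
+ *     struct lu_object *scan;
+ *
+ *     list_for_each_entry(scan, &h->loh_layers, lo_linkage) {
+ *             if (scan->lo_ops->loo_object_print != NULL)
+ *                     scan->lo_ops->loo_object_print(env, cookie, p, scan);
+ *     }
+ */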
+
+/**
+ * Pointer to the fid of this object.
+ */
+static inline const struct lu_fid *lu_object_fid(const struct lu_object *o)
+{
+       return &o->lo_header->loh_fid;
+}
+
+/**
+ * return device operations vector for this object
+ */
+static inline const struct lu_device_operations *
+lu_object_ops(const struct lu_object *o)
+{
+       return o->lo_dev->ld_ops;
+}
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+                                  const struct lu_device_type *dtype);
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+                     void *cookie, const char *format, ...);
+
+/**
+ * Print object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_DEBUG(mask, env, object, format, ...)                   \
+do {                                                                 \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                  \
+                                                                         \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                \
+               lu_object_print(env, &msgdata, lu_cdebug_printer, object);\
+               CDEBUG(mask, format , ## __VA_ARGS__);              \
+       }                                                                \
+} while (0)
+
+/**
+ * Print short object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_HEADER(mask, env, object, format, ...)               \
+do {                                                               \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
+                                                                       \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
+               lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
+                                      (object)->lo_header);        \
+               lu_cdebug_printer(env, &msgdata, "\n");          \
+               CDEBUG(mask, format , ## __VA_ARGS__);            \
+       }                                                              \
+} while (0)
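+
+/*
+ * Example invocations (sketch; "env", "obj" and "rc" come from the caller):
+ *
+ *     LU_OBJECT_DEBUG(D_INFO, env, obj, "find rc = %d\n", rc);
+ *     LU_OBJECT_HEADER(D_INODE, env, obj, "%s\n", "destroying");
+ */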
+
+void lu_object_print       (const struct lu_env *env, void *cookie,
+                           lu_printer_t printer, const struct lu_object *o);
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t printer,
+                           const struct lu_object_header *hdr);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o);
+
+
+/**
+ * Check whether the object exists, regardless of whether it is on local or
+ * remote storage.
+ * Note: LOHA_EXISTS is set as soon as someone has created the object; it
+ * does not need to have been committed to storage.
+ */
+#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
+
+/**
+ * Check whether the object is on remote storage.
+ */
+#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
+
+static inline int lu_object_assert_exists(const struct lu_object *o)
+{
+       return lu_object_exists(o);
+}
+
+static inline int lu_object_assert_not_exists(const struct lu_object *o)
+{
+       return !lu_object_exists(o);
+}
+
+/**
+ * Attr of this object.
+ */
+static inline __u32 lu_object_attr(const struct lu_object *o)
+{
+       LASSERT(lu_object_exists(o) != 0);
+       return o->lo_header->loh_attr;
+}
+
+static inline struct lu_ref_link *lu_object_ref_add(struct lu_object *o,
+                                                   const char *scope,
+                                                   const void *source)
+{
+       return lu_ref_add(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del(struct lu_object *o,
+                                    const char *scope, const void *source)
+{
+       lu_ref_del(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del_at(struct lu_object *o,
+                                       struct lu_ref_link *link,
+                                       const char *scope, const void *source)
+{
+       lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+/** input params, should be filled out by mdt */
+struct lu_rdpg {
+       /** hash */
+       __u64              rp_hash;
+       /** count in bytes */
+       unsigned int        rp_count;
+       /** number of pages */
+       unsigned int        rp_npages;
+       /** requested attr */
+       __u32              rp_attrs;
+       /** pointers to pages */
+       struct page        **rp_pages;
+};
+
+enum lu_xattr_flags {
+       LU_XATTR_REPLACE = (1 << 0),
+       LU_XATTR_CREATE  = (1 << 1)
+};
+
+/** @} helpers */
+
+/** \name lu_context
+ * @{ */
+
+/** For lu_context health-checks */
+enum lu_context_state {
+       LCS_INITIALIZED = 1,
+       LCS_ENTERED,
+       LCS_LEFT,
+       LCS_FINALIZED
+};
+
+/**
+ * lu_context. Execution context for lu_object methods. Currently associated
+ * with thread.
+ *
+ * All lu_object methods, except device and device type methods (called during
+ * system initialization and shutdown) are executed "within" some
+ * lu_context. This means that a pointer to the "current" lu_context is passed
+ * as an argument to all methods.
+ *
+ * All service ptlrpc threads create lu_context as part of their
+ * initialization. It is possible to create "stand-alone" context for other
+ * execution environments (like system calls).
+ *
+ * lu_object methods mainly use lu_context through lu_context_key interface
+ * that allows each layer to associate arbitrary pieces of data with each
+ * context (see pthread_key_create(3) for similar interface).
+ *
+ * On a client, lu_context is bound to a thread, see cl_env_get().
+ *
+ * \see lu_context_key
+ */
+struct lu_context {
+       /**
+        * lu_context is used on the client side too. Yet we don't want to
+        * allocate values of server-side keys for the client contexts and
+        * vice versa.
+        *
+        * To achieve this, a set of tags is introduced. Contexts and keys are
+        * marked with tags. Key values are created only for contexts whose
+        * set of tags has a non-empty intersection with that of the key.
+        * Tags are taken from enum lu_context_tag.
+        */
+       __u32             lc_tags;
+       enum lu_context_state  lc_state;
+       /**
+        * Pointer to the home service thread. NULL for other execution
+        * contexts.
+        */
+       struct ptlrpc_thread  *lc_thread;
+       /**
+        * Pointer to an array with key values. Internal implementation
+        * detail.
+        */
+       void             **lc_value;
+       /**
+        * Linkage into a list of all remembered contexts. Only
+        * `non-transient' contexts, i.e., ones created for service threads
+        * are placed here.
+        */
+       struct list_head             lc_remember;
+       /**
+        * Version counter used to skip calls to lu_context_refill() when no
+        * keys were registered.
+        */
+       unsigned               lc_version;
+       /**
+        * Debugging cookie.
+        */
+       unsigned               lc_cookie;
+};
+
+/**
+ * lu_context_key interface. Similar to pthread_key.
+ */
+
+enum lu_context_tag {
+       /**
+        * Thread on md server
+        */
+       LCT_MD_THREAD = 1 << 0,
+       /**
+        * Thread on dt server
+        */
+       LCT_DT_THREAD = 1 << 1,
+       /**
+        * Context for transaction handle
+        */
+       LCT_TX_HANDLE = 1 << 2,
+       /**
+        * Thread on client
+        */
+       LCT_CL_THREAD = 1 << 3,
+       /**
+        * A per-request session on a server, and a per-system-call session on
+        * a client.
+        */
+       LCT_SESSION   = 1 << 4,
+       /**
+        * A per-request data on OSP device
+        */
+       LCT_OSP_THREAD = 1 << 5,
+       /**
+        * MGS device thread
+        */
+       LCT_MG_THREAD = 1 << 6,
+       /**
+        * Context for local operations
+        */
+       LCT_LOCAL = 1 << 7,
+       /**
+        * Set when at least one of the keys that have values in this context
+        * has a non-NULL lu_context_key::lct_exit() method. This is used to
+        * optimize lu_context_exit() call.
+        */
+       LCT_HAS_EXIT  = 1 << 28,
+       /**
+        * Don't add references for modules creating key values in that context.
+        * This is only for contexts used internally by lu_object framework.
+        */
+       LCT_NOREF     = 1 << 29,
+       /**
+        * Key is being prepared for retiring, don't create new values for it.
+        */
+       LCT_QUIESCENT = 1 << 30,
+       /**
+        * Context should be remembered.
+        */
+       LCT_REMEMBER  = 1 << 31,
+       /**
+        * Contexts usable in cache shrinker thread.
+        */
+       LCT_SHRINKER  = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF
+};
+
+/**
+ * Key. Represents per-context value slot.
+ *
+ * Keys are usually registered when module owning the key is initialized, and
+ * de-registered when module is unloaded. Once key is registered, all new
+ * contexts with matching tags will get the key value. "Old" contexts, already
+ * initialized at the time of key registration, can be forced to get key value
+ * by calling lu_context_refill().
+ *
+ * Every key value is counted in lu_context_key::lct_used and acquires a
+ * reference on the owning module. This means that all key values have to be
+ * destroyed before the module can be unloaded. This is usually achieved by
+ * stopping the threads started by the module, which created contexts in their
+ * entry functions. The situation is complicated by threads shared by multiple
+ * modules, like ptlrpcd daemon on a client. To work around this problem,
+ * contexts, created in such threads, are `remembered' (see
+ * LCT_REMEMBER)---i.e., added into a global list. When module is preparing
+ * for unloading it does the following:
+ *
+ *     - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT)
+ *       preventing new key values from being allocated in the new contexts,
+ *       and
+ *
+ *     - scans a list of remembered contexts, destroying values of module
+ *       keys, thus releasing references to the module.
+ *
+ * This is done by lu_context_key_quiesce(). If module is re-activated
+ * before key has been de-registered, lu_context_key_revive() call clears
+ * `quiescent' marker.
+ *
+ * lu_context code doesn't provide any internal synchronization for these
+ * activities---it's assumed that startup (including threads start-up) and
+ * shutdown are serialized by some external means.
+ *
+ * \see lu_context
+ */
+struct lu_context_key {
+       /**
+        * Set of tags for which values of this key are to be instantiated.
+        */
+       __u32 lct_tags;
+       /**
+        * Value constructor. This is called when a new value is created for a
+        * context. Returns a pointer to the new value or an error pointer.
+        */
+       void  *(*lct_init)(const struct lu_context *ctx,
+                          struct lu_context_key *key);
+       /**
+        * Value destructor. Called when context with previously allocated
+        * value of this slot is destroyed. \a data is a value that was returned
+        * by a matching call to lu_context_key::lct_init().
+        */
+       void   (*lct_fini)(const struct lu_context *ctx,
+                          struct lu_context_key *key, void *data);
+       /**
+        * Optional method called on lu_context_exit() for all allocated
+        * keys. Can be used by debugging code checking that locks are
+        * released, etc.
+        */
+       void   (*lct_exit)(const struct lu_context *ctx,
+                          struct lu_context_key *key, void *data);
+       /**
+        * Internal implementation detail: index within lu_context::lc_value[]
+        * reserved for this key.
+        */
+       int      lct_index;
+       /**
+        * Internal implementation detail: number of values created for this
+        * key.
+        */
+       atomic_t lct_used;
+       /**
+        * Internal implementation detail: module for this key.
+        */
+       module_t *lct_owner;
+       /**
+        * References to this key. For debugging.
+        */
+       struct lu_ref  lct_reference;
+};
+
+#define LU_KEY_INIT(mod, type)                             \
+       static void* mod##_key_init(const struct lu_context *ctx, \
+                                   struct lu_context_key *key)   \
+       {                                                        \
+               type *value;                                  \
+                                                                 \
+               CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value));       \
+                                                                 \
+               OBD_ALLOC_PTR(value);                        \
+               if (value == NULL)                              \
+                       value = ERR_PTR(-ENOMEM);                \
+                                                                 \
+               return value;                                \
+       }                                                        \
+       struct __##mod##__dummy_init {;} /* semicolon catcher */
+
+#define LU_KEY_FINI(mod, type)                                       \
+       static void mod##_key_fini(const struct lu_context *ctx,            \
+                                   struct lu_context_key *key, void* data) \
+       {                                                                  \
+               type *info = data;                                        \
+                                                                           \
+               OBD_FREE_PTR(info);                                      \
+       }                                                                  \
+       struct __##mod##__dummy_fini {;} /* semicolon catcher */
+
+#define LU_KEY_INIT_FINI(mod, type)   \
+       LU_KEY_INIT(mod,type);  \
+       LU_KEY_FINI(mod,type)
+
+#define LU_CONTEXT_KEY_DEFINE(mod, tags)               \
+       struct lu_context_key mod##_thread_key = {      \
+               .lct_tags = tags,                      \
+               .lct_init = mod##_key_init,          \
+               .lct_fini = mod##_key_fini            \
+       }
+
+#define LU_CONTEXT_KEY_INIT(key)                       \
+do {                                               \
+       (key)->lct_owner = THIS_MODULE;          \
+} while (0)
+
+int   lu_context_key_register(struct lu_context_key *key);
+void  lu_context_key_degister(struct lu_context_key *key);
+void *lu_context_key_get     (const struct lu_context *ctx,
+                              const struct lu_context_key *key);
+void  lu_context_key_quiesce (struct lu_context_key *key);
+void  lu_context_key_revive  (struct lu_context_key *key);
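+
+/*
+ * Putting the pieces together for a hypothetical "foo" module (the
+ * foo_thread_info type and the LCT_MD_THREAD tag are illustrative choices):
+ *
+ *     LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ *     LU_CONTEXT_KEY_DEFINE(foo, LCT_MD_THREAD);
+ *
+ *     module initialization:
+ *             LU_CONTEXT_KEY_INIT(&foo_thread_key);
+ *             rc = lu_context_key_register(&foo_thread_key);
+ *
+ *     per-context access from code running with a matching tag:
+ *             struct foo_thread_info *info;
+ *
+ *             info = lu_context_key_get(ctx, &foo_thread_key);
+ */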
+
+
+/*
+ * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an
+ * owning module.
+ */
+
+#define LU_KEY_INIT_GENERIC(mod)                                       \
+       static void mod##_key_init_generic(struct lu_context_key *k, ...) \
+       {                                                              \
+               struct lu_context_key *key = k;                  \
+               va_list args;                                      \
+                                                                       \
+               va_start(args, k);                                    \
+               do {                                                \
+                       LU_CONTEXT_KEY_INIT(key);                      \
+                       key = va_arg(args, struct lu_context_key *);    \
+               } while (key != NULL);                            \
+               va_end(args);                                      \
+       }
+
+#define LU_TYPE_INIT(mod, ...)                                   \
+       LU_KEY_INIT_GENERIC(mod)                                        \
+       static int mod##_type_init(struct lu_device_type *t)        \
+       {                                                              \
+               mod##_key_init_generic(__VA_ARGS__, NULL);            \
+               return lu_context_key_register_many(__VA_ARGS__, NULL); \
+       }                                                              \
+       struct __##mod##_dummy_type_init {;}
+
+#define LU_TYPE_FINI(mod, ...)                                   \
+       static void mod##_type_fini(struct lu_device_type *t)      \
+       {                                                              \
+               lu_context_key_degister_many(__VA_ARGS__, NULL);        \
+       }                                                              \
+       struct __##mod##_dummy_type_fini {;}
+
+#define LU_TYPE_START(mod, ...)                                 \
+       static void mod##_type_start(struct lu_device_type *t)  \
+       {                                                      \
+               lu_context_key_revive_many(__VA_ARGS__, NULL);  \
+       }                                                      \
+       struct __##mod##_dummy_type_start {;}
+
+#define LU_TYPE_STOP(mod, ...)                           \
+       static void mod##_type_stop(struct lu_device_type *t)   \
+       {                                                      \
+               lu_context_key_quiesce_many(__VA_ARGS__, NULL); \
+       }                                                      \
+       struct __##mod##_dummy_type_stop {;}
+
+
+
+#define LU_TYPE_INIT_FINI(mod, ...)         \
+       LU_TYPE_INIT(mod, __VA_ARGS__);  \
+       LU_TYPE_FINI(mod, __VA_ARGS__);  \
+       LU_TYPE_START(mod, __VA_ARGS__);        \
+       LU_TYPE_STOP(mod, __VA_ARGS__)
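+
+/*
+ * Illustrative sketch (hypothetical module "foo", not part of this header):
+ * how a layer typically combines the key/type macros above; LCT_MD_THREAD is
+ * just one possible context tag.
+ *
+ *     struct foo_thread_info { ... };
+ *
+ *     LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ *     LU_CONTEXT_KEY_DEFINE(foo, LCT_MD_THREAD);
+ *     LU_TYPE_INIT_FINI(foo, &foo_thread_key);
+ */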
+
+int   lu_context_init  (struct lu_context *ctx, __u32 tags);
+void  lu_context_fini  (struct lu_context *ctx);
+void  lu_context_enter (struct lu_context *ctx);
+void  lu_context_exit  (struct lu_context *ctx);
+int   lu_context_refill(struct lu_context *ctx);
+
+/*
+ * Helper functions to operate on multiple keys. These are used by the default
+ * device type operations, defined by LU_TYPE_INIT_FINI().
+ */
+
+int  lu_context_key_register_many(struct lu_context_key *k, ...);
+void lu_context_key_degister_many(struct lu_context_key *k, ...);
+void lu_context_key_revive_many  (struct lu_context_key *k, ...);
+void lu_context_key_quiesce_many (struct lu_context_key *k, ...);
+
+/*
+ * update/clear ctx/ses tags.
+ */
+void lu_context_tags_update(__u32 tags);
+void lu_context_tags_clear(__u32 tags);
+void lu_session_tags_update(__u32 tags);
+void lu_session_tags_clear(__u32 tags);
+
+/**
+ * Environment.
+ */
+struct lu_env {
+       /**
+        * "Local" context, used to store data that would otherwise live on
+        * the stack.
+        */
+       struct lu_context  le_ctx;
+       /**
+        * "Session" context for per-request data.
+        */
+       struct lu_context *le_ses;
+};
+
+int  lu_env_init  (struct lu_env *env, __u32 tags);
+void lu_env_fini  (struct lu_env *env);
+int  lu_env_refill(struct lu_env *env);
+int  lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags);
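+
+/*
+ * Illustrative sketch (not part of this header): the usual life cycle of an
+ * environment in a thread, assuming a context tag such as LCT_CL_THREAD from
+ * enum lu_context_tag.
+ *
+ *     struct lu_env env;
+ *     int rc;
+ *
+ *     rc = lu_env_init(&env, LCT_CL_THREAD);
+ *     if (rc == 0) {
+ *             ... use env.le_ctx for per-thread data ...
+ *             lu_env_fini(&env);
+ *     }
+ */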
+
+/** @} lu_context */
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int lu_site_stats_print(const struct lu_site *s, char *page, int count);
+
+/**
+ * Common name structure to be passed around for various name related methods.
+ */
+struct lu_name {
+       const char    *ln_name;
+       int         ln_namelen;
+};
+
+/**
+ * Common buffer structure to be passed around for various xattr_{s,g}et()
+ * methods.
+ */
+struct lu_buf {
+       void   *lb_buf;
+       ssize_t lb_len;
+};
+
+#define DLUBUF "(%p %zu)"
+#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len
+/**
+ * One-time initializers, called at obdclass module initialization, not
+ * exported.
+ */
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void);
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void);
+
+struct lu_kmem_descr {
+       struct kmem_cache **ckd_cache;
+       const char       *ckd_name;
+       const size_t      ckd_size;
+};
+
+int  lu_kmem_init(struct lu_kmem_descr *caches);
+void lu_kmem_fini(struct lu_kmem_descr *caches);
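+
+/*
+ * Illustrative sketch (hypothetical cache and object type, not part of this
+ * header): the descriptor array is terminated by an entry whose ckd_cache
+ * is NULL.
+ *
+ *     static struct kmem_cache *foo_object_kmem;
+ *
+ *     static struct lu_kmem_descr foo_caches[] = {
+ *             {
+ *                     .ckd_cache = &foo_object_kmem,
+ *                     .ckd_name  = "foo_object_kmem",
+ *                     .ckd_size  = sizeof(struct foo_object)
+ *             },
+ *             {
+ *                     .ckd_cache = NULL
+ *             }
+ *     };
+ *
+ *     rc = lu_kmem_init(foo_caches);       at module init
+ *     lu_kmem_fini(foo_caches);            at module exit
+ */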
+
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+                         const struct lu_fid *fid);
+struct lu_object *lu_object_anon(const struct lu_env *env,
+                                struct lu_device *dev,
+                                const struct lu_object_conf *conf);
+
+/** null buffer */
+extern struct lu_buf LU_BUF_NULL;
+
+void lu_buf_free(struct lu_buf *buf);
+void lu_buf_alloc(struct lu_buf *buf, int size);
+void lu_buf_realloc(struct lu_buf *buf, int size);
+
+int lu_buf_check_and_grow(struct lu_buf *buf, int len);
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len);
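+
+/*
+ * Illustrative sketch (not part of this header): growing a lu_buf on demand,
+ * assuming lu_buf_check_and_grow() returns 0 on success.
+ *
+ *     struct lu_buf buf = { NULL, 0 };
+ *
+ *     lu_buf_alloc(&buf, 512);
+ *     if (buf.lb_buf != NULL) {
+ *             if (lu_buf_check_and_grow(&buf, 4096) == 0)
+ *                     ... buf.lb_buf now holds at least 4096 bytes ...
+ *             lu_buf_free(&buf);
+ *     }
+ */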
+
+/** @} lu */
+#endif /* __LUSTRE_LU_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h
new file mode 100644 (file)
index 0000000..624c19b
--- /dev/null
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LUSTRE_LU_REF_H
+#define __LUSTRE_LU_REF_H
+
+#include <linux/list.h>
+
+/** \defgroup lu_ref lu_ref
+ *
+ * An interface to track references between objects. Mostly for debugging.
+ *
+ * Suppose there is a reference-counted data structure struct foo. To track
+ * who acquired references to an instance of struct foo, add a lu_ref field
+ * to it:
+ *
+ * \code
+ *      struct foo {
+ *              atomic_t      foo_refcount;
+ *              struct lu_ref foo_reference;
+ *              ...
+ *      };
+ * \endcode
+ *
+ * foo::foo_reference has to be initialized by calling
+ * lu_ref_init(). Typically there will be functions or macros to increment and
+ * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo)
+ * and foo_put(struct foo *foo), respectively.
+ *
+ * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add()
+ * has to be called to insert into foo::foo_reference a record describing the
+ * acquired reference. Dually, lu_ref_del() removes the matching record.
+ * Typical usages are:
+ *
+ * \code
+ *     struct bar *bar;
+ *
+ *     // bar owns a reference to foo.
+ *     bar->bar_foo = foo_get(foo);
+ *     lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ *     ...
+ *
+ *     // reference from bar to foo is released.
+ *     lu_ref_del(&foo->foo_reference, "bar", bar);
+ *     foo_put(bar->bar_foo);
+ *
+ *
+ *     // current thread acquired a temporary reference to foo.
+ *     foo_get(foo);
+ *     lu_ref_add(&foo->reference, __FUNCTION__, current);
+ *
+ *     ...
+ *
+ *     // temporary reference is released.
+ *     lu_ref_del(&foo->reference, __FUNCTION__, current);
+ *     foo_put(foo);
+ * \endcode
+ *
+ * \e Et \e cetera. Often it makes sense to include the lu_ref_add() and
+ * lu_ref_del() calls in foo_get() and foo_put(). When an instance of struct
+ * foo is destroyed, lu_ref_fini() has to be called to check that no
+ * pending references remain. lu_ref_print() can be used to dump the list of
+ * pending references while hunting down a leak.
+ *
+ * For objects to which a large number of references can be acquired,
+ * lu_ref_del() can become CPU-consuming, as it has to scan the list of
+ * references. To work around this, remember the result of lu_ref_add()
+ * (usually in the same place where the pointer to struct foo is stored) and
+ * use lu_ref_del_at():
+ *
+ * \code
+ *     // There is a large number of bar's for a single foo.
+ *     bar->bar_foo     = foo_get(foo);
+ *     bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ *     ...
+ *
+ *     // reference from bar to foo is released.
+ *     lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar);
+ *     foo_put(bar->bar_foo);
+ * \endcode
+ *
+ * lu_ref interface degrades gracefully in case of memory shortages.
+ *
+ * @{
+ */
+
+
+struct lu_ref  {};
+
+static inline void lu_ref_init(struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_fini(struct lu_ref *ref)
+{
+}
+
+static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref,
+                                            const char *scope,
+                                            const void *source)
+{
+       return NULL;
+}
+
+static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref,
+                                                   const char *scope,
+                                                   const void *source)
+{
+       return NULL;
+}
+
+static inline void lu_ref_del(struct lu_ref *ref, const char *scope,
+                             const void *source)
+{
+}
+
+static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link,
+                                const char *scope, const void *source0,
+                                const void *source1)
+{
+}
+
+static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link,
+                                const char *scope, const void *source)
+{
+}
+
+static inline int lu_ref_global_init(void)
+{
+       return 0;
+}
+
+static inline void lu_ref_global_fini(void)
+{
+}
+
+static inline void lu_ref_print(const struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_print_all(void)
+{
+}
+
+/** @} lu */
+
+#endif /* __LUSTRE_LU_REF_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_target.h b/drivers/staging/lustre/lustre/include/lu_target.h
new file mode 100644 (file)
index 0000000..8d48cf4
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_LU_TARGET_H
+#define _LUSTRE_LU_TARGET_H
+
+#include <dt_object.h>
+#include <lustre_disk.h>
+
+struct lu_target {
+       struct obd_device       *lut_obd;
+       struct dt_device        *lut_bottom;
+       /** last_rcvd file */
+       struct dt_object        *lut_last_rcvd;
+       /* transaction callbacks */
+       struct dt_txn_callback   lut_txn_cb;
+       /** server data in last_rcvd file */
+       struct lr_server_data    lut_lsd;
+       /** Server last transaction number */
+       __u64               lut_last_transno;
+       /** Lock protecting last transaction number */
+       spinlock_t               lut_translock;
+       /** Lock protecting client bitmap */
+       spinlock_t               lut_client_bitmap_lock;
+       /** Bitmap of known clients */
+       unsigned long      *lut_client_bitmap;
+};
+
+typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno,
+                        void *data, int err);
+struct tgt_commit_cb {
+       tgt_cb_t  tgt_cb_func;
+       void     *tgt_cb_data;
+};
+
+void tgt_boot_epoch_update(struct lu_target *lut);
+int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *lut,
+                          struct obd_export *exp, __u64 transno);
+int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp);
+int tgt_init(const struct lu_env *env, struct lu_target *lut,
+            struct obd_device *obd, struct dt_device *dt);
+void tgt_fini(const struct lu_env *env, struct lu_target *lut);
+int tgt_client_alloc(struct obd_export *exp);
+void tgt_client_free(struct obd_export *exp);
+int tgt_client_del(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int);
+int tgt_client_new(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_data_read(const struct lu_env *env, struct lu_target *tg,
+                        struct lsd_client_data *lcd, loff_t *off, int index);
+int tgt_client_data_write(const struct lu_env *env, struct lu_target *tg,
+                         struct lsd_client_data *lcd, loff_t *off, struct thandle *th);
+int tgt_server_data_read(const struct lu_env *env, struct lu_target *tg);
+int tgt_server_data_write(const struct lu_env *env, struct lu_target *tg,
+                         struct thandle *th);
+int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, int sync);
+int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tg, loff_t off);
+
+#endif /* __LUSTRE_LU_TARGET_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/libiam.h b/drivers/staging/lustre/lustre/include/lustre/libiam.h
new file mode 100644 (file)
index 0000000..e8e0b08
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/libiam.h
+ *
+ * iam user level library
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+/*
+ *  lustre/libiam.h
+ */
+
+#ifndef __IAM_ULIB_H__
+#define __IAM_ULIB_H__
+
+/** \defgroup libiam libiam
+ *
+ * @{
+ */
+
+
+#define DX_FMT_NAME_LEN 16
+
+enum iam_fmt_t {
+       FMT_LFIX,
+       FMT_LVAR
+};
+
+struct iam_uapi_info {
+       __u16 iui_keysize;
+       __u16 iui_recsize;
+       __u16 iui_ptrsize;
+       __u16 iui_height;
+       char  iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Create an iam file, but do NOT open it.
+ * Return 0 on success, else -1.
+ */
+int iam_creat(char *filename, enum iam_fmt_t fmt,
+             int blocksize, int keysize, int recsize, int ptrsize);
+
+/*
+ * Open an iam file, but do NOT create it if the file doesn't exist.
+ * Please use iam_creat() to create the file before calling iam_open().
+ * Return the file descriptor (fd) on success, else -1.
+ */
+int iam_open(char *filename, struct iam_uapi_info *ua);
+
+/*
+ * Close file opened by iam_open.
+ */
+int iam_close(int fd);
+
+/*
+ * Please call iam_open() before using this function.
+ */
+int iam_insert(int fd, struct iam_uapi_info *ua,
+              int key_need_convert, char *keybuf,
+              int rec_need_convert, char *recbuf);
+
+/*
+ * Please call iam_open() before using this function.
+ */
+int iam_lookup(int fd, struct iam_uapi_info *ua,
+              int key_need_convert, char *key_buf,
+              int *keysize, char *save_key,
+              int rec_need_convert, char *rec_buf,
+              int *recsize, char *save_rec);
+
+/*
+ * Please call iam_open() before using this function.
+ */
+int iam_delete(int fd, struct iam_uapi_info *ua,
+              int key_need_convert, char *keybuf,
+              int rec_need_convert, char *recbuf);
+
+/*
+ * Please call iam_open() before using this function.
+ */
+int iam_it_start(int fd, struct iam_uapi_info *ua,
+                int key_need_convert, char *key_buf,
+                int *keysize, char *save_key,
+                int rec_need_convert, char *rec_buf,
+                int *recsize, char *save_rec);
+
+/*
+ * Please call iam_open() before using this function.
+ */
+int iam_it_next(int fd, struct iam_uapi_info *ua,
+               int key_need_convert, char *key_buf,
+               int *keysize, char *save_key,
+               int rec_need_convert, char *rec_buf,
+               int *recsize, char *save_rec);
+
+/*
+ * Please call iam_open() before using this function.
+ */
+int iam_it_stop(int fd, struct iam_uapi_info *ua,
+               int key_need_convert, char *keybuf,
+               int rec_need_convert, char *recbuf);
+
+/*
+ * Change iam file mode.
+ */
+int iam_polymorph(char *filename, unsigned long mode);
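+
+/*
+ * Illustrative sketch (hypothetical file name, sizes and buffers; not part of
+ * this library): create an LFIX-format iam file, insert one record and close
+ * it again.
+ *
+ *     struct iam_uapi_info ua;
+ *     char key[8] = { 0 };
+ *     char rec[16] = { 0 };
+ *     int fd;
+ *
+ *     if (iam_creat("/mnt/lustre/iamfile", FMT_LFIX, 4096, 8, 16, 4) == 0) {
+ *             fd = iam_open("/mnt/lustre/iamfile", &ua);
+ *             if (fd >= 0) {
+ *                     iam_insert(fd, &ua, 0, key, 0, rec);
+ *                     iam_close(fd);
+ *             }
+ *     }
+ */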
+
+/** @} libiam */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h
new file mode 100644 (file)
index 0000000..707eb74
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ * NOTE: This file is DEPRECATED!  Please include lustreapi.h directly
+ * instead of this file.  This file will be removed from a future version
+ * of lustre!
+ */
+
+#ifndef _LIBLUSTREAPI_H_
+#define _LIBLUSTREAPI_H_
+
+#include <lustre/lustreapi.h>
+#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly."
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
new file mode 100644 (file)
index 0000000..ad253c6
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/ll_fiemap.h
+ *
+ * FIEMAP data structures and flags. This header file will be used until
+ * fiemap.h is available in the upstream kernel.
+ *
+ * Author: Kalpak Shah <kalpak.shah@sun.com>
+ * Author: Andreas Dilger <adilger@sun.com>
+ */
+
+#ifndef _LUSTRE_FIEMAP_H
+#define _LUSTRE_FIEMAP_H
+
+
+
+struct ll_fiemap_extent {
+       __u64 fe_logical;  /* logical offset in bytes for the start of
+                           * the extent from the beginning of the file */
+       __u64 fe_physical; /* physical offset in bytes for the start
+                           * of the extent from the beginning of the disk */
+       __u64 fe_length;   /* length in bytes for this extent */
+       __u64 fe_reserved64[2];
+       __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+       __u32 fe_device;   /* device number for this extent */
+       __u32 fe_reserved[2];
+};
+
+struct ll_user_fiemap {
+       __u64 fm_start;  /* logical offset (inclusive) at
+                         * which to start mapping (in) */
+       __u64 fm_length; /* logical length of mapping which
+                         * userspace wants (in) */
+       __u32 fm_flags;  /* FIEMAP_FLAG_* flags for request (in/out) */
+       __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+       __u32 fm_extent_count;  /* size of fm_extents array (in) */
+       __u32 fm_reserved;
+       struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET      (~0ULL)
+
+#define FIEMAP_FLAG_SYNC        0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR      0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_EXTENT_LAST           0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN     0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC   0x00000004 /* Location still pending.
+                                                   * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED     0x00000008 /* Data can not be read
+                                                   * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED    0x00000080 /* Data is encrypted by fs.
+                                                   * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NOT_ALIGNED       0x00000100 /* Extent offsets may not be
+                                                   * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE       0x00000200 /* Data mixed with metadata.
+                                                   * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL         0x00000400 /* Multiple files in block.
+                                                   * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN         0x00000800 /* Space allocated, but
+                                                   * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED       0x00001000 /* File does not natively
+                                                   * support extents. Result
+                                                   * merged for efficiency. */
+
+
+static inline size_t fiemap_count_to_size(size_t extent_count)
+{
+       return (sizeof(struct ll_user_fiemap) + extent_count *
+                                              sizeof(struct ll_fiemap_extent));
+}
+
+static inline unsigned fiemap_size_to_count(size_t array_size)
+{
+       return ((array_size - sizeof(struct ll_user_fiemap)) /
+                                              sizeof(struct ll_fiemap_extent));
+}
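+
+/*
+ * Illustrative sketch (not part of this header): sizing a request buffer for
+ * up to 32 extents with the helpers above.
+ *
+ *     size_t len = fiemap_count_to_size(32);
+ *     struct ll_user_fiemap *fm = kzalloc(len, GFP_KERNEL);
+ *
+ *     if (fm != NULL) {
+ *             fm->fm_start = 0;
+ *             fm->fm_length = FIEMAP_MAX_OFFSET;
+ *             fm->fm_extent_count = fiemap_size_to_count(len);
+ *             ... issue the mapping request, then read
+ *                 fm->fm_mapped_extents and fm->fm_extents[] ...
+ *             kfree(fm);
+ *     }
+ */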
+
+#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */
+
+#ifdef FIEMAP_FLAGS_COMPAT
+#undef FIEMAP_FLAGS_COMPAT
+#endif
+
+/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
+#define FIEMAP_EXTENT_NO_DIRECT         0x40000000 /* Data mapping undefined */
+#define FIEMAP_EXTENT_NET             0x80000000 /* Data stored remotely.
+                                                   * Sets NO_DIRECT flag */
+
+#endif /* _LUSTRE_FIEMAP_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
new file mode 100644 (file)
index 0000000..93a3d7d
--- /dev/null
@@ -0,0 +1,2 @@
+#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0"
+#define LUSTRE_RELEASE 3.9.0_g6e62c21
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
new file mode 100644 (file)
index 0000000..029aa2f
--- /dev/null
@@ -0,0 +1,3629 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_idl.h
+ *
+ * Lustre wire protocol definitions.
+ */
+
+/** \defgroup lustreidl lustreidl
+ *
+ * Lustre wire protocol definitions.
+ *
+ * ALL structs passing over the wire should be declared here.  Structs
+ * that are used in interfaces with userspace should go in lustre_user.h.
+ *
+ * All structs being declared here should be built from simple fixed-size
+ * types (__u8, __u16, __u32, __u64) or be built from other types or
+ * structs also declared in this file.  Similarly, all flags and magic
+ * values in those structs should also be declared here.  This ensures
+ * that the Lustre wire protocol is not influenced by external dependencies.
+ *
+ * The only other acceptable items in this file are VERY SIMPLE accessor
+ * functions to avoid callers grubbing inside the structures, and the
+ * prototypes of the swabber functions for each struct.  Nothing that
+ * depends on external functions or definitions should be in here.
+ *
+ * Structs must be properly aligned to put 64-bit values on an 8-byte
+ * boundary.  Any structs being added here must also be added to
+ * utils/wirecheck.c and "make newwiretest" run to regenerate the
+ * utils/wiretest.c sources.  This allows us to verify that wire structs
+ * have the proper alignment/size on all architectures.
+ *
+ * DO NOT CHANGE any of the structs, flags, values declared here and used
+ * in released Lustre versions.  Some structs may have padding fields that
+ * can be used.  Some structs might allow addition at the end (verify this
+ * in the code to ensure that new/old clients that see this larger struct
+ * do not fail, otherwise you need to implement protocol compatibility).
+ *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format.  The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
+ * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines,
+ * implemented either here, inline (trivial implementations) or in
+ * ptlrpc/pack_generic.c.  These 'swabbers' convert the type from "other"
+ * endian, in-place in the message buffer.
+ *
+ * A swabber takes a single pointer argument.  The caller must already have
+ * verified that the length of the message buffer >= sizeof (type).
+ *
+ * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine
+ * may be defined that swabs just the variable part, after the caller has
+ * verified that the message buffer is large enough.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_IDL_H_
+#define _LUSTRE_IDL_H_
+
+#if !defined(LASSERT) && !defined(LPU64)
+#include <linux/libcfs/libcfs.h> /* for LASSERT, LPU64, etc */
+#endif
+
+/* Defn's shared with user-space. */
+#include <lustre/lustre_user.h>
+
+/*
+ *  GENERAL STUFF
+ */
+/* FOO_REQUEST_PORTAL is for incoming requests on the FOO
+ * FOO_REPLY_PORTAL   is for incoming replies on the FOO
+ * FOO_BULK_PORTAL    is for incoming bulk on the FOO
+ */
+
+#define CONNMGR_REQUEST_PORTAL   1
+#define CONNMGR_REPLY_PORTAL       2
+//#define OSC_REQUEST_PORTAL       3
+#define OSC_REPLY_PORTAL               4
+//#define OSC_BULK_PORTAL             5
+#define OST_IO_PORTAL             6
+#define OST_CREATE_PORTAL             7
+#define OST_BULK_PORTAL                 8
+//#define MDC_REQUEST_PORTAL       9
+#define MDC_REPLY_PORTAL              10
+//#define MDC_BULK_PORTAL            11
+#define MDS_REQUEST_PORTAL          12
+//#define MDS_REPLY_PORTAL          13
+#define MDS_BULK_PORTAL                14
+#define LDLM_CB_REQUEST_PORTAL  15
+#define LDLM_CB_REPLY_PORTAL      16
+#define LDLM_CANCEL_REQUEST_PORTAL     17
+#define LDLM_CANCEL_REPLY_PORTAL       18
+//#define PTLBD_REQUEST_PORTAL    19
+//#define PTLBD_REPLY_PORTAL        20
+//#define PTLBD_BULK_PORTAL          21
+#define MDS_SETATTR_PORTAL          22
+#define MDS_READPAGE_PORTAL        23
+#define MDS_MDS_PORTAL          24
+
+#define MGC_REPLY_PORTAL              25
+#define MGS_REQUEST_PORTAL          26
+#define MGS_REPLY_PORTAL              27
+#define OST_REQUEST_PORTAL          28
+#define FLD_REQUEST_PORTAL          29
+#define SEQ_METADATA_PORTAL        30
+#define SEQ_DATA_PORTAL                31
+#define SEQ_CONTROLLER_PORTAL    32
+#define MGS_BULK_PORTAL                33
+
+/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */
+
+/* packet types */
+#define PTL_RPC_MSG_REQUEST 4711
+#define PTL_RPC_MSG_ERR     4712
+#define PTL_RPC_MSG_REPLY   4713
+
+/* DON'T use swabbed values of MAGIC as magic! */
+#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0
+#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3
+
+#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B
+#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B
+
+#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2
+
+#define PTLRPC_MSG_VERSION  0x00000003
+#define LUSTRE_VERSION_MASK 0xffff0000
+#define LUSTRE_OBD_VERSION  0x00010000
+#define LUSTRE_MDS_VERSION  0x00020000
+#define LUSTRE_OST_VERSION  0x00030000
+#define LUSTRE_DLM_VERSION  0x00040000
+#define LUSTRE_LOG_VERSION  0x00050000
+#define LUSTRE_MGS_VERSION  0x00060000
+
+typedef __u32 mdsno_t;
+typedef __u64 seqno_t;
+typedef __u64 obd_id;
+typedef __u64 obd_seq;
+typedef __s64 obd_time;
+typedef __u64 obd_size;
+typedef __u64 obd_off;
+typedef __u64 obd_blocks;
+typedef __u64 obd_valid;
+typedef __u32 obd_blksize;
+typedef __u32 obd_mode;
+typedef __u32 obd_uid;
+typedef __u32 obd_gid;
+typedef __u32 obd_flag;
+typedef __u32 obd_count;
+
+/**
+ * Describes a range of sequence numbers: lsr_start is included in the range
+ * but lsr_end is not.
+ * The same structure is used in the fld module, where the lsr_index field
+ * holds the mdt id of the home mdt.
+ */
+struct lu_seq_range {
+       __u64 lsr_start;
+       __u64 lsr_end;
+       __u32 lsr_index;
+       __u32 lsr_flags;
+};
+
+#define LU_SEQ_RANGE_MDT       0x0
+#define LU_SEQ_RANGE_OST       0x1
+#define LU_SEQ_RANGE_ANY       0x3
+
+#define LU_SEQ_RANGE_MASK      0x3
+
+static inline unsigned fld_range_type(const struct lu_seq_range *range)
+{
+       return range->lsr_flags & LU_SEQ_RANGE_MASK;
+}
+
+static inline int fld_range_is_ost(const struct lu_seq_range *range)
+{
+       return fld_range_type(range) == LU_SEQ_RANGE_OST;
+}
+
+static inline int fld_range_is_mdt(const struct lu_seq_range *range)
+{
+       return fld_range_type(range) == LU_SEQ_RANGE_MDT;
+}
+
+/**
+ * The ANY range type is only used when the fld client sends an fld query
+ * request but does not know whether the seq belongs to an MDT or an OST;
+ * the request is sent with the ANY type, so either seq type returned by the
+ * lookup can be expected.
+ */
+static inline unsigned fld_range_is_any(const struct lu_seq_range *range)
+{
+       return fld_range_type(range) == LU_SEQ_RANGE_ANY;
+}
+
+static inline void fld_range_set_type(struct lu_seq_range *range,
+                                     unsigned flags)
+{
+       LASSERT(!(flags & ~LU_SEQ_RANGE_MASK));
+       range->lsr_flags |= flags;
+}
+
+static inline void fld_range_set_mdt(struct lu_seq_range *range)
+{
+       fld_range_set_type(range, LU_SEQ_RANGE_MDT);
+}
+
+static inline void fld_range_set_ost(struct lu_seq_range *range)
+{
+       fld_range_set_type(range, LU_SEQ_RANGE_OST);
+}
+
+static inline void fld_range_set_any(struct lu_seq_range *range)
+{
+       fld_range_set_type(range, LU_SEQ_RANGE_ANY);
+}
+
+/**
+ * returns the width of the given range \a range
+ */
+
+static inline __u64 range_space(const struct lu_seq_range *range)
+{
+       return range->lsr_end - range->lsr_start;
+}
+
+/**
+ * initialize range to zero
+ */
+
+static inline void range_init(struct lu_seq_range *range)
+{
+       range->lsr_start = range->lsr_end = range->lsr_index = 0;
+}
+
+/**
+ * check if the given seq id \a s is within the given range \a range
+ */
+
+static inline int range_within(const struct lu_seq_range *range,
+                              __u64 s)
+{
+       return s >= range->lsr_start && s < range->lsr_end;
+}
+
+static inline int range_is_sane(const struct lu_seq_range *range)
+{
+       return (range->lsr_end >= range->lsr_start);
+}
+
+static inline int range_is_zero(const struct lu_seq_range *range)
+{
+       return (range->lsr_start == 0 && range->lsr_end == 0);
+}
+
+static inline int range_is_exhausted(const struct lu_seq_range *range)
+
+{
+       return range_space(range) == 0;
+}
+
+/* return 0 if two range have the same location */
+static inline int range_compare_loc(const struct lu_seq_range *r1,
+                                   const struct lu_seq_range *r2)
+{
+       return r1->lsr_index != r2->lsr_index ||
+              r1->lsr_flags != r2->lsr_flags;
+}
+
+#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x):%x:%s"
+
+#define PRANGE(range)          \
+       (range)->lsr_start,     \
+       (range)->lsr_end,       \
+       (range)->lsr_index,     \
+       fld_range_is_mdt(range) ? "mdt" : "ost"
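+
+/*
+ * Illustrative sketch (hypothetical values, not part of this header): a
+ * half-open range of 0x400 sequences assigned to an MDT.
+ *
+ *     struct lu_seq_range r;
+ *
+ *     range_init(&r);
+ *     r.lsr_start = 0x200000400ULL;
+ *     r.lsr_end   = 0x200000800ULL;
+ *     fld_range_set_mdt(&r);
+ *
+ *     range_space(&r);                     returns 0x400
+ *     range_within(&r, 0x200000400ULL);    returns 1, lsr_start is included
+ *     range_within(&r, 0x200000800ULL);    returns 0, lsr_end is excluded
+ */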
+
+
+/** \defgroup lu_fid lu_fid
+ * @{ */
+
+/**
+ * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat.
+ * Deprecated since HSM and SOM attributes are now stored in separate on-disk
+ * xattr.
+ */
+enum lma_compat {
+       LMAC_HSM = 0x00000001,
+       LMAC_SOM = 0x00000002,
+};
+
+/**
+ * Masks for all features that should be supported by a Lustre version to
+ * access a specific file.
+ * This information is stored in lustre_mdt_attrs::lma_incompat.
+ */
+enum lma_incompat {
+       LMAI_RELEASED = 0x0000001, /* file is released */
+       LMAI_AGENT = 0x00000002, /* agent inode */
+       LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object
+                                           is on the remote MDT */
+};
+#define LMA_INCOMPAT_SUPP      (LMAI_AGENT | LMAI_REMOTE_PARENT)
+
+extern void lustre_lma_swab(struct lustre_mdt_attrs *lma);
+extern void lustre_lma_init(struct lustre_mdt_attrs *lma,
+                           const struct lu_fid *fid, __u32 incompat);
+/**
+ * SOM on-disk attributes stored in a separate xattr.
+ */
+struct som_attrs {
+       /** Bitfield for supported data in this structure. For future use. */
+       __u32   som_compat;
+
+       /** Incompat feature list. The supported feature mask is available in
+        * SOM_INCOMPAT_SUPP */
+       __u32   som_incompat;
+
+       /** IO Epoch these SOM attributes belong to */
+       __u64   som_ioepoch;
+       /** total file size in objects */
+       __u64   som_size;
+       /** total fs blocks in objects */
+       __u64   som_blocks;
+       /** mds mount id the size is valid for */
+       __u64   som_mountid;
+};
+extern void lustre_som_swab(struct som_attrs *attrs);
+
+#define SOM_INCOMPAT_SUPP 0x0
+
+/**
+ * HSM on-disk attributes stored in a separate xattr.
+ */
+struct hsm_attrs {
+       /** Bitfield for supported data in this structure. For future use. */
+       __u32   hsm_compat;
+
+       /** HSM flags, see hsm_flags enum below */
+       __u32   hsm_flags;
+       /** backend archive id associated with the file */
+       __u64   hsm_arch_id;
+       /** version associated with the last archiving, if any */
+       __u64   hsm_arch_ver;
+};
+extern void lustre_hsm_swab(struct hsm_attrs *attrs);
+
+/**
+ * fid constants
+ */
+enum {
+       /** initial fid id value */
+       LUSTRE_FID_INIT_OID  = 1UL
+};
+
+/** returns fid object sequence */
+static inline __u64 fid_seq(const struct lu_fid *fid)
+{
+       return fid->f_seq;
+}
+
+/** returns fid object id */
+static inline __u32 fid_oid(const struct lu_fid *fid)
+{
+       return fid->f_oid;
+}
+
+/** returns fid object version */
+static inline __u32 fid_ver(const struct lu_fid *fid)
+{
+       return fid->f_ver;
+}
+
+static inline void fid_zero(struct lu_fid *fid)
+{
+       memset(fid, 0, sizeof(*fid));
+}
+
+static inline obd_id fid_ver_oid(const struct lu_fid *fid)
+{
+       return ((__u64)fid_ver(fid) << 32 | fid_oid(fid));
+}
+
+/**
+ * Note that reserved SEQ numbers below 12 will conflict with ldiskfs
+ * inodes in the IGIF namespace, so these reserved SEQ numbers can be
+ * used for other purposes and not risk collisions with existing inodes.
+ *
+ * Different FID Format
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0
+ */
+enum fid_seq {
+       FID_SEQ_OST_MDT0        = 0,
+       FID_SEQ_LLOG            = 1, /* unnamed llogs */
+       FID_SEQ_ECHO            = 2,
+       FID_SEQ_OST_MDT1        = 3,
+       FID_SEQ_OST_MAX         = 9, /* Max MDT count before OST_on_FID */
+       FID_SEQ_LLOG_NAME       = 10, /* named llogs */
+       FID_SEQ_RSVD            = 11,
+       FID_SEQ_IGIF            = 12,
+       FID_SEQ_IGIF_MAX        = 0x0ffffffffULL,
+       FID_SEQ_IDIF            = 0x100000000ULL,
+       FID_SEQ_IDIF_MAX        = 0x1ffffffffULL,
+       /* Normal FID sequence starts from this value, i.e. 1<<33 */
+       FID_SEQ_START           = 0x200000000ULL,
+       /* sequence for local pre-defined FIDs listed in local_oid */
+       FID_SEQ_LOCAL_FILE      = 0x200000001ULL,
+       FID_SEQ_DOT_LUSTRE      = 0x200000002ULL,
+       /* sequence is used for local named objects FIDs generated
+        * by local_object_storage library */
+       FID_SEQ_LOCAL_NAME      = 0x200000003ULL,
+       /* Because the current FLD only caches the fid sequence (not the
+        * oid) on the client side, a FID that needs to be exposed to
+        * clients must ensure that all fids under one sequence are
+        * located on one MDT. */
+       FID_SEQ_SPECIAL         = 0x200000004ULL,
+       FID_SEQ_QUOTA           = 0x200000005ULL,
+       FID_SEQ_QUOTA_GLB       = 0x200000006ULL,
+       FID_SEQ_ROOT            = 0x200000007ULL,  /* Located on MDT0 */
+       FID_SEQ_NORMAL          = 0x200000400ULL,
+       FID_SEQ_LOV_DEFAULT     = 0xffffffffffffffffULL
+};
+
+#define OBIF_OID_MAX_BITS         32
+#define OBIF_MAX_OID           (1ULL << OBIF_OID_MAX_BITS)
+#define OBIF_OID_MASK         ((1ULL << OBIF_OID_MAX_BITS) - 1)
+#define IDIF_OID_MAX_BITS         48
+#define IDIF_MAX_OID           (1ULL << IDIF_OID_MAX_BITS)
+#define IDIF_OID_MASK         ((1ULL << IDIF_OID_MAX_BITS) - 1)
+
+/** OID for FID_SEQ_SPECIAL */
+enum special_oid {
+       /* Big Filesystem Lock to serialize rename operations */
+       FID_OID_SPECIAL_BFL     = 1UL,
+};
+
+/** OID for FID_SEQ_DOT_LUSTRE */
+enum dot_lustre_oid {
+       FID_OID_DOT_LUSTRE  = 1UL,
+       FID_OID_DOT_LUSTRE_OBF = 2UL,
+};
+
+static inline int fid_seq_is_mdt0(obd_seq seq)
+{
+       return (seq == FID_SEQ_OST_MDT0);
+}
+
+static inline int fid_seq_is_mdt(const __u64 seq)
+{
+       return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL;
+};
+
+static inline int fid_seq_is_echo(obd_seq seq)
+{
+       return (seq == FID_SEQ_ECHO);
+}
+
+static inline int fid_is_echo(const struct lu_fid *fid)
+{
+       return fid_seq_is_echo(fid_seq(fid));
+}
+
+static inline int fid_seq_is_llog(obd_seq seq)
+{
+       return (seq == FID_SEQ_LLOG);
+}
+
+static inline int fid_is_llog(const struct lu_fid *fid)
+{
+       /* file with OID == 1 is not llog but contains last oid */
+       return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 1;
+}
+
+static inline int fid_seq_is_rsvd(const __u64 seq)
+{
+       return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD);
+};
+
+static inline int fid_seq_is_special(const __u64 seq)
+{
+       return seq == FID_SEQ_SPECIAL;
+};
+
+static inline int fid_seq_is_local_file(const __u64 seq)
+{
+       return seq == FID_SEQ_LOCAL_FILE ||
+              seq == FID_SEQ_LOCAL_NAME;
+};
+
+static inline int fid_seq_is_root(const __u64 seq)
+{
+       return seq == FID_SEQ_ROOT;
+}
+
+static inline int fid_seq_is_dot(const __u64 seq)
+{
+       return seq == FID_SEQ_DOT_LUSTRE;
+}
+
+static inline int fid_seq_is_default(const __u64 seq)
+{
+       return seq == FID_SEQ_LOV_DEFAULT;
+}
+
+static inline int fid_is_mdt0(const struct lu_fid *fid)
+{
+       return fid_seq_is_mdt0(fid_seq(fid));
+}
+
+static inline void lu_root_fid(struct lu_fid *fid)
+{
+       fid->f_seq = FID_SEQ_ROOT;
+       fid->f_oid = 1;
+       fid->f_ver = 0;
+}
+
+/**
+ * Check if a fid is igif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is an igif; otherwise false.
+ */
+static inline int fid_seq_is_igif(const __u64 seq)
+{
+       return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX;
+}
+
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+       return fid_seq_is_igif(fid_seq(fid));
+}
+
+/**
+ * Check if a fid is idif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is an idif; otherwise false.
+ */
+static inline int fid_seq_is_idif(const __u64 seq)
+{
+       return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX;
+}
+
+static inline int fid_is_idif(const struct lu_fid *fid)
+{
+       return fid_seq_is_idif(fid_seq(fid));
+}
+
+static inline int fid_is_local_file(const struct lu_fid *fid)
+{
+       return fid_seq_is_local_file(fid_seq(fid));
+}
+
+static inline int fid_seq_is_norm(const __u64 seq)
+{
+       return (seq >= FID_SEQ_NORMAL);
+}
+
+static inline int fid_is_norm(const struct lu_fid *fid)
+{
+       return fid_seq_is_norm(fid_seq(fid));
+}
+
+/* convert an OST objid into an IDIF FID SEQ number */
+static inline obd_seq fid_idif_seq(obd_id id, __u32 ost_idx)
+{
+       return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff);
+}
+
+/* convert a packed IDIF FID into an OST objid */
+static inline obd_id fid_idif_id(obd_seq seq, __u32 oid, __u32 ver)
+{
+       return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid;
+}
+
+/* extract ost index from IDIF FID */
+static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid)
+{
+       LASSERT(fid_is_idif(fid));
+       return (fid_seq(fid) >> 16) & 0xffff;
+}
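+
+/*
+ * Worked example (illustrative values, not part of this header): packing
+ * OST index 5 and object id 0x123456789 into an IDIF SEQ and back.
+ *
+ *     fid_idif_seq(0x123456789ULL, 5) == 0x100050001ULL
+ *             FID_SEQ_IDIF | (ost_idx << 16) | top 16 bits of the object id
+ *     fid_idif_id(0x100050001ULL, 0x23456789, 0) == 0x123456789ULL
+ *             the low 16 bits of the SEQ restore the high object-id bits
+ */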
+
+/* extract OST sequence (group) from a wire ost_id (id/seq) pair */
+static inline obd_seq ostid_seq(const struct ost_id *ostid)
+{
+       if (fid_seq_is_mdt0(ostid->oi.oi_seq))
+               return FID_SEQ_OST_MDT0;
+
+       if (fid_seq_is_default(ostid->oi.oi_seq))
+               return FID_SEQ_LOV_DEFAULT;
+
+       if (fid_is_idif(&ostid->oi_fid))
+               return FID_SEQ_OST_MDT0;
+
+       return fid_seq(&ostid->oi_fid);
+}
+
+/* extract OST objid from a wire ost_id (id/seq) pair */
+static inline obd_id ostid_id(const struct ost_id *ostid)
+{
+       if (fid_seq_is_mdt0(ostid_seq(ostid)))
+               return ostid->oi.oi_id & IDIF_OID_MASK;
+
+       if (fid_is_idif(&ostid->oi_fid))
+               return fid_idif_id(fid_seq(&ostid->oi_fid),
+                                  fid_oid(&ostid->oi_fid), 0);
+
+       return fid_oid(&ostid->oi_fid);
+}
+
+static inline void ostid_set_seq(struct ost_id *oi, __u64 seq)
+{
+       if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) {
+               oi->oi.oi_seq = seq;
+       } else {
+               oi->oi_fid.f_seq = seq;
+               /* Note: if f_oid + f_ver is zero, we need to initialize
+                * it to 1; otherwise ostid_seq() will treat this
+                * as an old ostid (oi_seq == 0) */
+               if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0)
+                       oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID;
+       }
+}
+
+static inline void ostid_set_seq_mdt0(struct ost_id *oi)
+{
+       ostid_set_seq(oi, FID_SEQ_OST_MDT0);
+}
+
+static inline void ostid_set_seq_echo(struct ost_id *oi)
+{
+       ostid_set_seq(oi, FID_SEQ_ECHO);
+}
+
+static inline void ostid_set_seq_llog(struct ost_id *oi)
+{
+       ostid_set_seq(oi, FID_SEQ_LLOG);
+}
+
+/**
+ * Note: we need to check oi_seq to decide where to store oi_id,
+ * so oi_seq must always be set before oi_id.
+ */
+static inline void ostid_set_id(struct ost_id *oi, __u64 oid)
+{
+       if (fid_seq_is_mdt0(ostid_seq(oi))) {
+               if (oid >= IDIF_MAX_OID) {
+                       CERROR("Bad "LPU64" to set "DOSTID"\n",
+                               oid, POSTID(oi));
+                       return;
+               }
+               oi->oi.oi_id = oid;
+       } else {
+               if (oid > OBIF_MAX_OID) {
+                       CERROR("Bad "LPU64" to set "DOSTID"\n",
+                               oid, POSTID(oi));
+                       return;
+               }
+               oi->oi_fid.f_oid = oid;
+       }
+}
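+
+/*
+ * Illustrative sketch (not part of this header): following the ordering rule
+ * above, the sequence is always set before the object id.
+ *
+ *     struct ost_id oi = { };
+ *
+ *     ostid_set_seq_mdt0(&oi);        selects the legacy oi_id/oi_seq layout
+ *     ostid_set_id(&oi, 0x12345);     stored in oi.oi.oi_id
+ */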
+
+static inline void ostid_inc_id(struct ost_id *oi)
+{
+       if (fid_seq_is_mdt0(ostid_seq(oi))) {
+               if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) {
+                       CERROR("Bad inc "DOSTID"\n", POSTID(oi));
+                       return;
+               }
+               oi->oi.oi_id++;
+       } else {
+               oi->oi_fid.f_oid++;
+       }
+}
+
+static inline void ostid_dec_id(struct ost_id *oi)
+{
+       if (fid_seq_is_mdt0(ostid_seq(oi)))
+               oi->oi.oi_id--;
+       else
+               oi->oi_fid.f_oid--;
+}
+
+/**
+ * Unpack an OST object id/seq (group) into a FID.  This is needed for
+ * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper
+ * FIDs.  Note that if an id/seq is already in FID/IDIF format it will
+ * be passed through unchanged.  Only legacy OST objects in "group 0"
+ * will be mapped into the IDIF namespace so that they can fit into the
+ * struct lu_fid fields without loss.  For reference see:
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs
+ */
+static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid,
+                              __u32 ost_idx)
+{
+       if (ost_idx > 0xffff) {
+               CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid),
+                      ost_idx);
+               return -EBADF;
+       }
+
+       if (fid_seq_is_mdt0(ostid_seq(ostid))) {
+               /* This is a "legacy" (old 1.x/2.early) OST object in "group 0"
+                * that we map into the IDIF namespace.  It allows up to 2^48
+                * objects per OST, as this is the object namespace that has
+                * been in production for years.  This can handle create rates
+                * of 1M objects/s/OST for 9 years, or combinations thereof. */
+               if (ostid_id(ostid) >= IDIF_MAX_OID) {
+                        CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+                               POSTID(ostid), ost_idx);
+                        return -EBADF;
+               }
+               fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx);
+               /* truncate to 32 bits by assignment */
+               fid->f_oid = ostid_id(ostid);
+               /* in theory, not currently used */
+               fid->f_ver = ostid_id(ostid) >> 48;
+       } else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ {
+              /* This is either an IDIF object, which identifies objects across
+               * all OSTs, or a regular FID.  The IDIF namespace maps legacy
+               * OST objects into the FID namespace.  In both cases, we just
+               * pass the FID through, no conversion needed. */
+               if (ostid->oi_fid.f_ver != 0) {
+                       CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+                               POSTID(ostid), ost_idx);
+                       return -EBADF;
+               }
+               *fid = ostid->oi_fid;
+       }
+
+       return 0;
+}
+
+/* pack any OST FID into an ostid (id/seq) for the wire/disk */
+static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid)
+{
+       if (unlikely(fid_seq_is_igif(fid->f_seq))) {
+               CERROR("bad IGIF, "DFID"\n", PFID(fid));
+               return -EBADF;
+       }
+
+       if (fid_is_idif(fid)) {
+               ostid_set_seq_mdt0(ostid);
+               ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid),
+                                               fid_ver(fid)));
+       } else {
+               ostid->oi_fid = *fid;
+       }
+
+       return 0;
+}
+
+/* Check whether the fid is for LAST_ID */
+static inline int fid_is_last_id(const struct lu_fid *fid)
+{
+       return (fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid)) &&
+               fid_oid(fid) == 0;
+}
+
+/**
+ * Get inode number from an igif.
+ * \param fid an igif to get the inode number from.
+ * \return inode number for the igif.
+ */
+static inline ino_t lu_igif_ino(const struct lu_fid *fid)
+{
+       return fid_seq(fid);
+}
+
+extern void lustre_swab_ost_id(struct ost_id *oid);
+
+/**
+ * Get inode generation from an igif.
+ * \param fid an igif to get the inode generation from.
+ * \return inode generation for the igif.
+ */
+static inline __u32 lu_igif_gen(const struct lu_fid *fid)
+{
+       return fid_oid(fid);
+}
+
+/**
+ * Build igif from the inode number/generation.
+ */
+static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen)
+{
+       fid->f_seq = ino;
+       fid->f_oid = gen;
+       fid->f_ver = 0;
+}
+
+/*
+ * Fids are transmitted across network (in the sender byte-ordering),
+ * and stored on disk in big-endian order.
+ */
+static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
+{
+       /* check that all fields are converted */
+       CLASSERT(sizeof *src ==
+                sizeof fid_seq(src) +
+                sizeof fid_oid(src) + sizeof fid_ver(src));
+       dst->f_seq = cpu_to_le64(fid_seq(src));
+       dst->f_oid = cpu_to_le32(fid_oid(src));
+       dst->f_ver = cpu_to_le32(fid_ver(src));
+}
+
+static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+       /* check that all fields are converted */
+       CLASSERT(sizeof *src ==
+                sizeof fid_seq(src) +
+                sizeof fid_oid(src) + sizeof fid_ver(src));
+       dst->f_seq = le64_to_cpu(fid_seq(src));
+       dst->f_oid = le32_to_cpu(fid_oid(src));
+       dst->f_ver = le32_to_cpu(fid_ver(src));
+}
+
+static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src)
+{
+       /* check that all fields are converted */
+       CLASSERT(sizeof *src ==
+                sizeof fid_seq(src) +
+                sizeof fid_oid(src) + sizeof fid_ver(src));
+       dst->f_seq = cpu_to_be64(fid_seq(src));
+       dst->f_oid = cpu_to_be32(fid_oid(src));
+       dst->f_ver = cpu_to_be32(fid_ver(src));
+}
+
+static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+       /* check that all fields are converted */
+       CLASSERT(sizeof *src ==
+                sizeof fid_seq(src) +
+                sizeof fid_oid(src) + sizeof fid_ver(src));
+       dst->f_seq = be64_to_cpu(fid_seq(src));
+       dst->f_oid = be32_to_cpu(fid_oid(src));
+       dst->f_ver = be32_to_cpu(fid_ver(src));
+}
+
+static inline int fid_is_sane(const struct lu_fid *fid)
+{
+       return fid != NULL &&
+              ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) ||
+               fid_is_igif(fid) || fid_is_idif(fid) ||
+               fid_seq_is_rsvd(fid_seq(fid)));
+}
+
+static inline int fid_is_zero(const struct lu_fid *fid)
+{
+       return fid_seq(fid) == 0 && fid_oid(fid) == 0;
+}
+
+extern void lustre_swab_lu_fid(struct lu_fid *fid);
+extern void lustre_swab_lu_seq_range(struct lu_seq_range *range);
+
+static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1)
+{
+       /* Check that there is no alignment padding. */
+       CLASSERT(sizeof *f0 ==
+                sizeof f0->f_seq + sizeof f0->f_oid + sizeof f0->f_ver);
+       return memcmp(f0, f1, sizeof *f0) == 0;
+}
+
+#define __diff_normalize(val0, val1)                       \
+({                                                           \
+       typeof(val0) __val0 = (val0);                      \
+       typeof(val1) __val1 = (val1);                      \
+                                                               \
+       (__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1);     \
+})
+
+static inline int lu_fid_cmp(const struct lu_fid *f0,
+                            const struct lu_fid *f1)
+{
+       return
+               __diff_normalize(fid_seq(f0), fid_seq(f1)) ?:
+               __diff_normalize(fid_oid(f0), fid_oid(f1)) ?:
+               __diff_normalize(fid_ver(f0), fid_ver(f1));
+}
+
+static inline void ostid_cpu_to_le(struct ost_id *src_oi,
+                                  struct ost_id *dst_oi)
+{
+       if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+               dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+               dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+       } else {
+               fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid);
+       }
+}
+
+static inline void ostid_le_to_cpu(struct ost_id *src_oi,
+                                  struct ost_id *dst_oi)
+{
+       if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+               dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+               dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+       } else {
+               fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid);
+       }
+}
+
+/** @} lu_fid */
+
+/** \defgroup lu_dir lu_dir
+ * @{ */
+
+/**
+ * Enumeration of possible directory entry attributes.
+ *
+ * Attributes follow directory entry header in the order they appear in this
+ * enumeration.
+ */
+enum lu_dirent_attrs {
+       LUDA_FID                = 0x0001,
+       LUDA_TYPE               = 0x0002,
+       LUDA_64BITHASH          = 0x0004,
+
+       /* The following attrs are used for MDT internal use only,
+        * and are not visible to the client */
+
+       /* Verify the dirent consistency */
+       LUDA_VERIFY             = 0x8000,
+       /* Only check but not repair the dirent inconsistency */
+       LUDA_VERIFY_DRYRUN      = 0x4000,
+       /* The dirent has been repaired, or is to be repaired (dryrun). */
+       LUDA_REPAIR             = 0x2000,
+       /* The system is upgraded; the dirent has been or is to be
+        * repaired (dryrun). */
+       LUDA_UPGRADE            = 0x1000,
+       /* Ignore this record, go to next directly. */
+       LUDA_IGNORE             = 0x0800,
+};
+
+#define LU_DIRENT_ATTRS_MASK   0xf800
+
+/**
+ * Layout of readdir pages, as transmitted on wire.
+ */
+struct lu_dirent {
+       /** valid if LUDA_FID is set. */
+       struct lu_fid lde_fid;
+       /** a unique entry identifier: a hash or an offset. */
+       __u64    lde_hash;
+       /** total record length, including all attributes. */
+       __u16    lde_reclen;
+       /** name length */
+       __u16    lde_namelen;
+       /** optional variable size attributes following this entry.
+        *  taken from enum lu_dirent_attrs.
+        */
+       __u32    lde_attrs;
+       /** name is followed by the attributes indicated in ->lde_attrs, in
+        *  their natural order. After the last attribute, padding bytes are
+        *  added to make ->lde_reclen a multiple of 8.
+        */
+       char      lde_name[0];
+};
+
+/*
+ * Definitions of optional directory entry attributes formats.
+ *
+ * Individual attributes do not have their length encoded in a generic way. It
+ * is assumed that the consumer of an attribute knows its format. This means that
+ * it is impossible to skip over an unknown attribute, except by skipping over all
+ * remaining attributes (by using ->lde_reclen), which is not too
+ * constraining, because new server versions will append new attributes at
+ * the end of an entry.
+ */
+
+/**
+ * Fid directory attribute: a fid of an object referenced by the entry. This
+ * will be almost always requested by the client and supplied by the server.
+ *
+ * Aligned to 8 bytes.
+ */
+/* For compatibility with 1.8, keep the fid in the lu_dirent struct. */
+
+/**
+ * File type.
+ *
+ * Aligned to 2 bytes.
+ */
+struct luda_type {
+       __u16 lt_type;
+};
+
+struct lu_dirpage {
+       __u64       ldp_hash_start;
+       __u64       ldp_hash_end;
+       __u32       ldp_flags;
+       __u32       ldp_pad0;
+       struct lu_dirent ldp_entries[0];
+};
+
+enum lu_dirpage_flags {
+       /**
+        * dirpage contains no entry.
+        */
+       LDF_EMPTY   = 1 << 0,
+       /**
+        * last entry's lde_hash equals ldp_hash_end.
+        */
+       LDF_COLLIDE = 1 << 1
+};
+
+static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
+{
+       if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY)
+               return NULL;
+       else
+               return dp->ldp_entries;
+}
+
+static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
+{
+       struct lu_dirent *next;
+
+       if (le16_to_cpu(ent->lde_reclen) != 0)
+               next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
+       else
+               next = NULL;
+
+       return next;
+}
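+/*
+ * A minimal usage sketch for the helpers above, given a struct lu_dirpage *dp
+ * (handle_entry() is a hypothetical callback, not part of this header):
+ *
+ *     struct lu_dirent *ent;
+ *
+ *     for (ent = lu_dirent_start(dp); ent != NULL; ent = lu_dirent_next(ent))
+ *             handle_entry(ent->lde_name, le16_to_cpu(ent->lde_namelen));
+ *
+ * lu_dirent_start() returns NULL for an LDF_EMPTY page, and lu_dirent_next()
+ * returns NULL once it reaches an entry whose lde_reclen is 0.
+ */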
+
+static inline int lu_dirent_calc_size(int namelen, __u16 attr)
+{
+       int size;
+
+       if (attr & LUDA_TYPE) {
+               const unsigned align = sizeof(struct luda_type) - 1;
+               size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
+               size += sizeof(struct luda_type);
+       } else
+               size = sizeof(struct lu_dirent) + namelen;
+
+       return (size + 7) & ~7;
+}
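+/*
+ * Worked example (assuming no compiler padding, so sizeof(struct lu_dirent)
+ * is 32): lu_dirent_calc_size(5, LUDA_TYPE) rounds 32 + 5 = 37 up to the
+ * 2-byte luda_type alignment (38), adds sizeof(struct luda_type) = 2 (40),
+ * and finally rounds up to a multiple of 8, giving 40 bytes.
+ */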
+
+static inline int lu_dirent_size(struct lu_dirent *ent)
+{
+       if (le16_to_cpu(ent->lde_reclen) == 0) {
+               return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
+                                          le32_to_cpu(ent->lde_attrs));
+       }
+       return le16_to_cpu(ent->lde_reclen);
+}
+
+#define MDS_DIR_END_OFF 0xfffffffffffffffeULL
+
+/**
+ * MDS_READPAGE page size
+ *
+ * This is the directory page size packed in MDS_READPAGE RPC.
+ * It differs from PAGE_CACHE_SIZE because the client needs to
+ * access the struct lu_dirpage header packed at the beginning of
+ * the "page"; without a fixed size there would be no way to find the
+ * lu_dirpage header if client and server PAGE_CACHE_SIZE differ.
+ */
+#define LU_PAGE_SHIFT 12
+#define LU_PAGE_SIZE  (1UL << LU_PAGE_SHIFT)
+#define LU_PAGE_MASK  (~(LU_PAGE_SIZE - 1))
+
+#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT))
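+/*
+ * For example, with 4KB kernel pages (PAGE_CACHE_SHIFT == 12) each cache
+ * page holds exactly one lu_dirpage (LU_PAGE_COUNT == 1), while with 64KB
+ * pages (PAGE_CACHE_SHIFT == 16) one cache page holds LU_PAGE_COUNT == 16
+ * lu_dirpages of LU_PAGE_SIZE == 4096 bytes each.
+ */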
+
+/** @} lu_dir */
+
+struct lustre_handle {
+       __u64 cookie;
+};
+#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL
+
+static inline int lustre_handle_is_used(struct lustre_handle *lh)
+{
+       return lh->cookie != 0ull;
+}
+
+static inline int lustre_handle_equal(const struct lustre_handle *lh1,
+                                     const struct lustre_handle *lh2)
+{
+       return lh1->cookie == lh2->cookie;
+}
+
+static inline void lustre_handle_copy(struct lustre_handle *tgt,
+                                     struct lustre_handle *src)
+{
+       tgt->cookie = src->cookie;
+}
+
+/* flags for lm_flags */
+#define MSGHDR_AT_SUPPORT             0x1
+#define MSGHDR_CKSUM_INCOMPAT18         0x2
+
+#define lustre_msg lustre_msg_v2
+/* we depend on this structure to be 8-byte aligned */
+/* this type is only endian-adjusted in lustre_unpack_msg() */
+struct lustre_msg_v2 {
+       __u32 lm_bufcount;
+       __u32 lm_secflvr;
+       __u32 lm_magic;
+       __u32 lm_repsize;
+       __u32 lm_cksum;
+       __u32 lm_flags;
+       __u32 lm_padding_2;
+       __u32 lm_padding_3;
+       __u32 lm_buflens[0];
+};
+
+/* without gss, ptlrpc_body is put in the first buffer. */
+#define PTLRPC_NUM_VERSIONS     4
+#define JOBSTATS_JOBID_SIZE     32  /* 32 bytes string */
+struct ptlrpc_body_v3 {
+       struct lustre_handle pb_handle;
+       __u32 pb_type;
+       __u32 pb_version;
+       __u32 pb_opc;
+       __u32 pb_status;
+       __u64 pb_last_xid;
+       __u64 pb_last_seen;
+       __u64 pb_last_committed;
+       __u64 pb_transno;
+       __u32 pb_flags;
+       __u32 pb_op_flags;
+       __u32 pb_conn_cnt;
+       __u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+       __u32 pb_service_time; /* for rep, actual service time */
+       __u32 pb_limit;
+       __u64 pb_slv;
+       /* VBR: pre-versions */
+       __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+       /* padding for future needs */
+       __u64 pb_padding[4];
+       char  pb_jobid[JOBSTATS_JOBID_SIZE];
+};
+#define ptlrpc_body     ptlrpc_body_v3
+
+struct ptlrpc_body_v2 {
+       struct lustre_handle pb_handle;
+       __u32 pb_type;
+       __u32 pb_version;
+       __u32 pb_opc;
+       __u32 pb_status;
+       __u64 pb_last_xid;
+       __u64 pb_last_seen;
+       __u64 pb_last_committed;
+       __u64 pb_transno;
+       __u32 pb_flags;
+       __u32 pb_op_flags;
+       __u32 pb_conn_cnt;
+       __u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+       __u32 pb_service_time; /* for rep, actual service time, also used for
+                                 net_latency of req */
+       __u32 pb_limit;
+       __u64 pb_slv;
+       /* VBR: pre-versions */
+       __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+       /* padding for future needs */
+       __u64 pb_padding[4];
+};
+
+extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
+
+/* message body offset for lustre_msg_v2 */
+/* ptlrpc body offset in all request/reply messages */
+#define MSG_PTLRPC_BODY_OFF         0
+
+/* normal request/reply message record offset */
+#define REQ_REC_OFF                 1
+#define REPLY_REC_OFF             1
+
+/* ldlm request message body offset */
+#define DLM_LOCKREQ_OFF                 1 /* lockreq offset */
+#define DLM_REQ_REC_OFF                 2 /* normal dlm request record offset */
+
+/* ldlm intent lock message body offset */
+#define DLM_INTENT_IT_OFF             2 /* intent lock it offset */
+#define DLM_INTENT_REC_OFF           3 /* intent lock record offset */
+
+/* ldlm reply message body offset */
+#define DLM_LOCKREPLY_OFF             1 /* lockrep offset */
+#define DLM_REPLY_REC_OFF             2 /* reply record offset */
+
+/** only use in req->rq_{req,rep}_swab_mask */
+#define MSG_PTLRPC_HEADER_OFF     31
+
+/* Flags that are operation-specific go in the top 16 bits. */
+#define MSG_OP_FLAG_MASK   0xffff0000
+#define MSG_OP_FLAG_SHIFT  16
+
+/* Flags that apply to all requests are in the bottom 16 bits */
+#define MSG_GEN_FLAG_MASK     0x0000ffff
+#define MSG_LAST_REPLAY           0x0001
+#define MSG_RESENT             0x0002
+#define MSG_REPLAY             0x0004
+/* #define MSG_AT_SUPPORT       0x0008
+ * This was used in early prototypes of adaptive timeouts, and while there
+ * shouldn't be any users of that code, there also isn't a need to reuse this
+ * bit. Defer reuse until at least 1.10 to avoid potential conflicts. */
+#define MSG_DELAY_REPLAY         0x0010
+#define MSG_VERSION_REPLAY     0x0020
+#define MSG_REQ_REPLAY_DONE       0x0040
+#define MSG_LOCK_REPLAY_DONE      0x0080
+
+/*
+ * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
+ */
+
+#define MSG_CONNECT_RECOVERING  0x00000001
+#define MSG_CONNECT_RECONNECT   0x00000002
+#define MSG_CONNECT_REPLAYABLE  0x00000004
+//#define MSG_CONNECT_PEER     0x8
+#define MSG_CONNECT_LIBCLIENT   0x00000010
+#define MSG_CONNECT_INITIAL     0x00000020
+#define MSG_CONNECT_ASYNC       0x00000040
+#define MSG_CONNECT_NEXT_VER    0x00000080 /* use next version of lustre_msg */
+#define MSG_CONNECT_TRANSNO     0x00000100 /* report transno */
+
+/* Connect flags */
+#define OBD_CONNECT_RDONLY             0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX               0x2ULL /*connect specific LOV idx */
+#define OBD_CONNECT_MDS                   0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT               0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK          0x10ULL /*server takes locks for cli */
+#define OBD_CONNECT_VERSION          0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL      0x40ULL /*Separate non-IO req portal */
+#define OBD_CONNECT_ACL                  0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR             0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW               0x200ULL /*MDS+OST create obj on write*/
+#define OBD_CONNECT_TRUNCLOCK     0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO         0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS            0x1000ULL /*support for inodebits locks*/
+#define OBD_CONNECT_JOIN              0x2000ULL /*files can be concatenated.
+                                                 *JOIN FILE is no longer
+                                                 *supported; this flag is
+                                                 *reserved only to prevent
+                                                 *the bit from being reused.*/
+#define OBD_CONNECT_ATTRFID        0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH        0x8000ULL /*No open hndl on specl nodes*/
+#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE  0x20000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE     0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64       0x80000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_MDS_CAPA    0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA    0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET  0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM              0x800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT       0x1000000ULL /*client uses AT */
+#define OBD_CONNECT_LRU_RESIZE      0x2000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS     0x4000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL           0x8000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS      0x10000000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_CKSUM        0x20000000ULL /*support several cksum algos*/
+#define OBD_CONNECT_FID            0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR            0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3     0x100000000ULL /*client supports LOV v3 EA */
+#define OBD_CONNECT_GRANT_SHRINK  0x200000000ULL /* support grant shrink */
+#define OBD_CONNECT_SKIP_ORPHAN   0x400000000ULL /* don't reuse orphan objids */
+#define OBD_CONNECT_MAX_EASIZE    0x800000000ULL /* preserved for large EA */
+#define OBD_CONNECT_FULL20       0x1000000000ULL /* it is 2.0 client */
+#define OBD_CONNECT_LAYOUTLOCK   0x2000000000ULL /* client uses layout lock */
+#define OBD_CONNECT_64BITHASH    0x4000000000ULL /* client supports 64-bits
+                                                 * directory hash */
+#define OBD_CONNECT_MAXBYTES     0x8000000000ULL /* max stripe size */
+#define OBD_CONNECT_IMP_RECOV   0x10000000000ULL /* imp recovery support */
+#define OBD_CONNECT_JOBSTATS    0x20000000000ULL /* jobid in ptlrpc_body */
+#define OBD_CONNECT_UMASK       0x40000000000ULL /* create uses client umask */
+#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS
+                                                 * RPC error properly */
+#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for
+                                                 * finer space reservation */
+#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8
+                                                  * policy and 2.x server */
+#define OBD_CONNECT_LVB_TYPE   0x400000000000ULL /* variable type of LVB */
+#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */
+#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
+#define OBD_CONNECT_SHORTIO     0x2000000000000ULL/* short io */
+#define OBD_CONNECT_PINGLESS   0x4000000000000ULL/* pings not required */
+/* XXX README XXX:
+ * Please DO NOT add flag values here before first ensuring that this same
+ * flag value is not in use on some other branch.  Please clear any such
+ * changes with senior engineers before starting to use a new flag.  Then,
+ * submit a small patch against EVERY branch that ONLY adds the new flag,
+ * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the
+ * flag to check_obd_connect_data(), and updates wiretests accordingly, so it
+ * can be approved and landed easily to reserve the flag for future use. */
+
+/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS
+ * connection.  It is a temporary bug fix for Imperative Recovery interop
+ * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for
+ * 2.2 clients/servers is no longer needed.  LU-1252/LU-1644. */
+#define OBD_CONNECT_MNE_SWAB            OBD_CONNECT_MDS_MDS
+
+#define OCD_HAS_FLAG(ocd, flg)  \
+       (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg))
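+/*
+ * For illustration, OCD_HAS_FLAG(ocd, GRANT) expands to
+ * !!((ocd)->ocd_connect_flags & OBD_CONNECT_GRANT), i.e. a 0/1 test of a
+ * single OBD_CONNECT_* bit by its short name.
+ */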
+
+
+#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE
+
+#define MDT_CONNECT_SUPPORTED  (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
+                               OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
+                               OBD_CONNECT_IBITS | \
+                               OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
+                               OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+                               OBD_CONNECT_RMT_CLIENT | \
+                               OBD_CONNECT_RMT_CLIENT_FORCE | \
+                               OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \
+                               OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \
+                               OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \
+                               OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \
+                               OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \
+                               OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
+                               OBD_CONNECT_EINPROGRESS | \
+                               OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
+                               OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
+                               OBD_CONNECT_PINGLESS)
+#define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
+                               OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
+                               OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
+                               OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \
+                               OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+                               LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \
+                               OBD_CONNECT_RMT_CLIENT | \
+                               OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \
+                               OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \
+                               OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \
+                               OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \
+                               OBD_CONNECT_MAX_EASIZE | \
+                               OBD_CONNECT_EINPROGRESS | \
+                               OBD_CONNECT_JOBSTATS | \
+                               OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
+                               OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \
+                               OBD_CONNECT_PINGLESS)
+#define ECHO_CONNECT_SUPPORTED (0)
+#define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
+                               OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
+                               OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS)
+
+/* Features required for this version of the client to work with server */
+#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
+                                OBD_CONNECT_FULL20)
+
+#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\
+                                               ((patch)<<8) + (fix))
+#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255)
+#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255)
+#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255)
+#define OBD_OCD_VERSION_FIX(version)   ((int)(version)&255)
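+/*
+ * Worked example: OBD_OCD_VERSION(2, 4, 0, 0) == 0x02040000, and from that
+ * value OBD_OCD_VERSION_MAJOR() == 2, OBD_OCD_VERSION_MINOR() == 4,
+ * OBD_OCD_VERSION_PATCH() == 0 and OBD_OCD_VERSION_FIX() == 0.
+ */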
+
+/* This structure is used for both request and reply.
+ *
+ * If we eventually have separate connect data for different types, which we
+ * almost certainly will, then perhaps we stick a union in here. */
+struct obd_connect_data_v1 {
+       __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+       __u32 ocd_version;       /* lustre release version number */
+       __u32 ocd_grant;         /* initial cache grant amount (bytes) */
+       __u32 ocd_index;         /* LOV index to connect to */
+       __u32 ocd_brw_size;      /* Maximum BRW size in bytes, must be 2^n */
+       __u64 ocd_ibits_known;   /* inode bits this client understands */
+       __u8  ocd_blocksize;     /* log2 of the backend filesystem blocksize */
+       __u8  ocd_inodespace;    /* log2 of the per-inode space consumption */
+       __u16 ocd_grant_extent;  /* per-extent grant overhead, in 1K blocks */
+       __u32 ocd_unused;       /* also fix lustre_swab_connect */
+       __u64 ocd_transno;       /* first transno from client to be replayed */
+       __u32 ocd_group;         /* MDS group on OST */
+       __u32 ocd_cksum_types;   /* supported checksum algorithms */
+       __u32 ocd_max_easize;    /* How big LOV EA can be on MDS */
+       __u32 ocd_instance;      /* also fix lustre_swab_connect */
+       __u64 ocd_maxbytes;      /* Maximum stripe size in bytes */
+};
+
+struct obd_connect_data {
+       __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+       __u32 ocd_version;       /* lustre release version number */
+       __u32 ocd_grant;         /* initial cache grant amount (bytes) */
+       __u32 ocd_index;         /* LOV index to connect to */
+       __u32 ocd_brw_size;      /* Maximum BRW size in bytes */
+       __u64 ocd_ibits_known;   /* inode bits this client understands */
+       __u8  ocd_blocksize;     /* log2 of the backend filesystem blocksize */
+       __u8  ocd_inodespace;    /* log2 of the per-inode space consumption */
+       __u16 ocd_grant_extent;  /* per-extent grant overhead, in 1K blocks */
+       __u32 ocd_unused;       /* also fix lustre_swab_connect */
+       __u64 ocd_transno;       /* first transno from client to be replayed */
+       __u32 ocd_group;         /* MDS group on OST */
+       __u32 ocd_cksum_types;   /* supported checksum algorithms */
+       __u32 ocd_max_easize;    /* How big LOV EA can be on MDS */
+       __u32 ocd_instance;      /* instance # of this target */
+       __u64 ocd_maxbytes;      /* Maximum stripe size in bytes */
+       /* Fields after ocd_maxbytes are only accessible by the receiver
+        * if the corresponding flag in ocd_connect_flags is set. Accessing
+        * any field after ocd_maxbytes on the receiver without a valid flag
+        * may result in out-of-bounds memory access and a kernel oops. */
+       __u64 padding1;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding2;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding3;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding4;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding5;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding6;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding7;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding8;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding9;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingA;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingB;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingC;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingD;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingE;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingF;   /* added 2.1.0. also fix lustre_swab_connect */
+};
+/* XXX README XXX:
+ * Please DO NOT use any fields here before first ensuring that this same
+ * field is not in use on some other branch.  Please clear any such changes
+ * with senior engineers before starting to use a new field.  Then, submit
+ * a small patch against EVERY branch that ONLY adds the new field along with
+ * the matching OBD_CONNECT flag, so that can be approved and landed easily to
+ * reserve the flag for future use. */
+
+
+extern void lustre_swab_connect(struct obd_connect_data *ocd);
+
+/*
+ * Supported checksum algorithms. Up to 32 checksum types are supported.
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types)
+ * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new
+ * algorithm and also the OBD_FL_CKSUM* flags.
+ */
+typedef enum {
+       OBD_CKSUM_CRC32 = 0x00000001,
+       OBD_CKSUM_ADLER = 0x00000002,
+       OBD_CKSUM_CRC32C= 0x00000004,
+} cksum_type_t;
+
+/*
+ *   OST requests: OBDO & OBD request records
+ */
+
+/* opcodes */
+typedef enum {
+       OST_REPLY      =  0,       /* reply ? */
+       OST_GETATTR    =  1,
+       OST_SETATTR    =  2,
+       OST_READ       =  3,
+       OST_WRITE      =  4,
+       OST_CREATE     =  5,
+       OST_DESTROY    =  6,
+       OST_GET_INFO   =  7,
+       OST_CONNECT    =  8,
+       OST_DISCONNECT =  9,
+       OST_PUNCH      = 10,
+       OST_OPEN       = 11,
+       OST_CLOSE      = 12,
+       OST_STATFS     = 13,
+       OST_SYNC       = 16,
+       OST_SET_INFO   = 17,
+       OST_QUOTACHECK = 18,
+       OST_QUOTACTL   = 19,
+       OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */
+       OST_LAST_OPC
+} ost_cmd_t;
+#define OST_FIRST_OPC  OST_REPLY
+
+enum obdo_flags {
+       OBD_FL_INLINEDATA   = 0x00000001,
+       OBD_FL_OBDMDEXISTS  = 0x00000002,
+       OBD_FL_DELORPHAN    = 0x00000004, /* if set in o_flags delete orphans */
+       OBD_FL_NORPC    = 0x00000008, /* set in o_flags do in OSC not OST */
+       OBD_FL_IDONLY       = 0x00000010, /* set in o_flags only adjust obj id*/
+       OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */
+       OBD_FL_DEBUG_CHECK  = 0x00000040, /* echo client/server debug check */
+       OBD_FL_NO_USRQUOTA  = 0x00000100, /* the object's owner is over quota */
+       OBD_FL_NO_GRPQUOTA  = 0x00000200, /* the object's group is over quota */
+       OBD_FL_CREATE_CROW  = 0x00000400, /* object should be created on write */
+       OBD_FL_SRVLOCK      = 0x00000800, /* delegate DLM locking to server */
+       OBD_FL_CKSUM_CRC32  = 0x00001000, /* CRC32 checksum type */
+       OBD_FL_CKSUM_ADLER  = 0x00002000, /* ADLER checksum type */
+       OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */
+       OBD_FL_CKSUM_RSVD2  = 0x00008000, /* for future cksum types */
+       OBD_FL_CKSUM_RSVD3  = 0x00010000, /* for future cksum types */
+       OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */
+       OBD_FL_MMAP      = 0x00040000, /* object is mmapped on the client.
+                                          * XXX: obsolete - reserved for old
+                                          * clients prior to 2.2 */
+       OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
+       OBD_FL_NOSPC_BLK    = 0x00100000, /* no more block space on OST */
+
+       /* Note that while these checksum values are currently separate bits,
+        * in 2.x we can actually allow all values from 1-31 if we wanted. */
+       OBD_FL_CKSUM_ALL    = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER |
+                             OBD_FL_CKSUM_CRC32C,
+
+       /* mask for local-only flag, which won't be sent over network */
+       OBD_FL_LOCAL_MASK   = 0xF0000000,
+};
+
+#define LOV_MAGIC_V1      0x0BD10BD0
+#define LOV_MAGIC       LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_MAGIC_V3      0x0BD30BD0
+
+/*
+ * magic for fully defined striping
+ * the idea is that we should have different magics for striping "hints"
+ * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct
+ * lov_mds_md_v[13]). at the moment the magics are used in the wire protocol,
+ * so we can't just change them without lengthy preparation, but we still need
+ * a mechanism to allow LOD to differentiate hint versus ready striping.
+ * so, at the moment we use a trick: the MDT knows what to expect from a
+ * request depending on the case (replay uses ready striping, a non-replay
+ * req uses hints), so the MDT replaces the magic with the appropriate one
+ * and LOD can then easily understand what's inside -bzzz
+ */
+#define LOV_MAGIC_V1_DEF  0x0CD10BD0
+#define LOV_MAGIC_V3_DEF  0x0CD30BD0
+
+#define LOV_PATTERN_RAID0 0x001   /* stripes are used round-robin */
+#define LOV_PATTERN_RAID1 0x002   /* stripes are mirrors of each other */
+#define LOV_PATTERN_FIRST 0x100   /* first stripe is not in round-robin */
+#define LOV_PATTERN_CMOBD 0x200
+
+#define lov_ost_data lov_ost_data_v1
+struct lov_ost_data_v1 {         /* per-stripe data structure (little-endian)*/
+       struct ost_id l_ost_oi;   /* OST object ID */
+       __u32 l_ost_gen;          /* generation of this l_ost_idx */
+       __u32 l_ost_idx;          /* OST index in LOV (lov_tgt_desc->tgts) */
+};
+
+#define lov_mds_md lov_mds_md_v1
+struct lov_mds_md_v1 {     /* LOV EA mds/wire data (little-endian) */
+       __u32 lmm_magic;          /* magic number = LOV_MAGIC_V1 */
+       __u32 lmm_pattern;      /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+       struct ost_id   lmm_oi;   /* LOV object ID */
+       __u32 lmm_stripe_size;    /* size of stripe in bytes */
+       /* lmm_stripe_count used to be __u32 */
+       __u16 lmm_stripe_count;   /* num stripes in use for this object */
+       __u16 lmm_layout_gen;     /* layout generation number */
+       struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+/**
+ * Sigh, because pre-2.4 uses
+ * struct lov_mds_md_v1 {
+ *     ........
+ *     __u64 lmm_object_id;
+ *     __u64 lmm_object_seq;
+ *      ......
+ *      }
+ * to identify the LOV(MDT) object, and lmm_object_seq will
+ * be normal_fid, which makes it hard to combine these conversions
+ * into ostid-to-FID. so we will do the lmm_oi/fid conversion separately
+ *
+ * We can tell the lmm_oi by this way,
+ * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0
+ * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL
+ * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k},
+ *      lmm_oi.f_ver = 0
+ *
+ * But currently lmm_oi/lsm_oi does not have any "real" usages,
+ * except for printing some information, and the user can always
+ * get the real FID from the LMA; besides, this multiple-case check might
+ * make swabbing more complicated. So we will keep using id/seq for lmm_oi.
+ */
+
+static inline void fid_to_lmm_oi(const struct lu_fid *fid,
+                                struct ost_id *oi)
+{
+       oi->oi.oi_id = fid_oid(fid);
+       oi->oi.oi_seq = fid_seq(fid);
+}
+
+static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq)
+{
+       oi->oi.oi_seq = seq;
+}
+
+static inline __u64 lmm_oi_id(struct ost_id *oi)
+{
+       return oi->oi.oi_id;
+}
+
+static inline __u64 lmm_oi_seq(struct ost_id *oi)
+{
+       return oi->oi.oi_seq;
+}
+
+static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi,
+                                   struct ost_id *src_oi)
+{
+       dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+       dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+}
+
+static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi,
+                                   struct ost_id *src_oi)
+{
+       dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+       dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+}
+
+/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */
+
+#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data))
+#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data))
+
+#define XATTR_NAME_ACL_ACCESS   "system.posix_acl_access"
+#define XATTR_NAME_ACL_DEFAULT  "system.posix_acl_default"
+#define XATTR_USER_PREFIX       "user."
+#define XATTR_TRUSTED_PREFIX    "trusted."
+#define XATTR_SECURITY_PREFIX   "security."
+#define XATTR_LUSTRE_PREFIX     "lustre."
+
+#define XATTR_NAME_LOV   "trusted.lov"
+#define XATTR_NAME_LMA   "trusted.lma"
+#define XATTR_NAME_LMV   "trusted.lmv"
+#define XATTR_NAME_LINK         "trusted.link"
+#define XATTR_NAME_FID   "trusted.fid"
+#define XATTR_NAME_VERSION      "trusted.version"
+#define XATTR_NAME_SOM         "trusted.som"
+#define XATTR_NAME_HSM         "trusted.hsm"
+#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace"
+
+struct lov_mds_md_v3 {     /* LOV EA mds/wire data (little-endian) */
+       __u32 lmm_magic;          /* magic number = LOV_MAGIC_V3 */
+       __u32 lmm_pattern;      /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+       struct ost_id   lmm_oi;   /* LOV object ID */
+       __u32 lmm_stripe_size;    /* size of stripe in bytes */
+       /* lmm_stripe_count used to be __u32 */
+       __u16 lmm_stripe_count;   /* num stripes in use for this object */
+       __u16 lmm_layout_gen;     /* layout generation number */
+       char  lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */
+       struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+#define OBD_MD_FLID    (0x00000001ULL) /* object ID */
+#define OBD_MD_FLATIME     (0x00000002ULL) /* access time */
+#define OBD_MD_FLMTIME     (0x00000004ULL) /* data modification time */
+#define OBD_MD_FLCTIME     (0x00000008ULL) /* change time */
+#define OBD_MD_FLSIZE      (0x00000010ULL) /* size */
+#define OBD_MD_FLBLOCKS    (0x00000020ULL) /* allocated blocks count */
+#define OBD_MD_FLBLKSZ     (0x00000040ULL) /* block size */
+#define OBD_MD_FLMODE      (0x00000080ULL) /* access bits (mode & ~S_IFMT) */
+#define OBD_MD_FLTYPE      (0x00000100ULL) /* object type (mode & S_IFMT) */
+#define OBD_MD_FLUID       (0x00000200ULL) /* user ID */
+#define OBD_MD_FLGID       (0x00000400ULL) /* group ID */
+#define OBD_MD_FLFLAGS     (0x00000800ULL) /* flags word */
+#define OBD_MD_FLNLINK     (0x00002000ULL) /* link count */
+#define OBD_MD_FLGENER     (0x00004000ULL) /* generation number */
+/*#define OBD_MD_FLINLINE    (0x00008000ULL)  inline data. used until 1.6.5 */
+#define OBD_MD_FLRDEV      (0x00010000ULL) /* device number */
+#define OBD_MD_FLEASIZE    (0x00020000ULL) /* extended attribute data */
+#define OBD_MD_LINKNAME    (0x00040000ULL) /* symbolic link target */
+#define OBD_MD_FLHANDLE    (0x00080000ULL) /* file/lock handle */
+#define OBD_MD_FLCKSUM     (0x00100000ULL) /* bulk data checksum */
+#define OBD_MD_FLQOS       (0x00200000ULL) /* quality of service stats */
+/*#define OBD_MD_FLOSCOPQ    (0x00400000ULL) osc opaque data, never used */
+#define OBD_MD_FLCOOKIE    (0x00800000ULL) /* log cancellation cookie */
+#define OBD_MD_FLGROUP     (0x01000000ULL) /* group */
+#define OBD_MD_FLFID       (0x02000000ULL) /* ->ost write inline fid */
+#define OBD_MD_FLEPOCH     (0x04000000ULL) /* ->ost write with ioepoch */
+                                          /* ->mds if epoch opens or closes */
+#define OBD_MD_FLGRANT     (0x08000000ULL) /* ost preallocation space grant */
+#define OBD_MD_FLDIREA     (0x10000000ULL) /* dir's extended attribute data */
+#define OBD_MD_FLUSRQUOTA  (0x20000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLGRPQUOTA  (0x40000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */
+
+#define OBD_MD_MDS      (0x0000000100000000ULL) /* where an inode lives */
+#define OBD_MD_REINT       (0x0000000200000000ULL) /* reintegrate oa */
+#define OBD_MD_MEA      (0x0000000400000000ULL) /* CMD split EA  */
+
+/* OBD_MD_MDTIDX is used to get the MDT index, but it has never been used over
+ * the wire, and it has been obsolete since 2.3 */
+/* #define OBD_MD_MDTIDX      (0x0000000800000000ULL) */
+
+#define OBD_MD_FLXATTR       (0x0000001000000000ULL) /* xattr */
+#define OBD_MD_FLXATTRLS     (0x0000002000000000ULL) /* xattr list */
+#define OBD_MD_FLXATTRRM     (0x0000004000000000ULL) /* xattr remove */
+#define OBD_MD_FLACL    (0x0000008000000000ULL) /* ACL */
+#define OBD_MD_FLRMTPERM     (0x0000010000000000ULL) /* remote permission */
+#define OBD_MD_FLMDSCAPA     (0x0000020000000000ULL) /* MDS capability */
+#define OBD_MD_FLOSSCAPA     (0x0000040000000000ULL) /* OSS capability */
+#define OBD_MD_FLCKSPLIT     (0x0000080000000000ULL) /* Check split on server */
+#define OBD_MD_FLCROSSREF    (0x0000100000000000ULL) /* Cross-ref case */
+#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes
+                                                     * under lock */
+#define OBD_MD_FLOBJCOUNT    (0x0000400000000000ULL) /* for multiple destroy */
+
+#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */
+#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */
+#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */
+#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
+
+#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
+
+#define OBD_MD_FLGETATTR (OBD_MD_FLID    | OBD_MD_FLATIME | OBD_MD_FLMTIME | \
+                         OBD_MD_FLCTIME | OBD_MD_FLSIZE  | OBD_MD_FLBLKSZ | \
+                         OBD_MD_FLMODE  | OBD_MD_FLTYPE  | OBD_MD_FLUID   | \
+                         OBD_MD_FLGID   | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \
+                         OBD_MD_FLGENER | OBD_MD_FLRDEV  | OBD_MD_FLGROUP)
+
+/* don't forget obdo_fid which is way down at the bottom so it can
+ * come after the definition of llog_cookie */
+
+enum hss_valid {
+       HSS_SETMASK     = 0x01,
+       HSS_CLEARMASK   = 0x02,
+       HSS_ARCHIVE_ID  = 0x04,
+};
+
+struct hsm_state_set {
+       __u32   hss_valid;
+       __u32   hss_archive_id;
+       __u64   hss_setmask;
+       __u64   hss_clearmask;
+};
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss);
+
+extern void lustre_swab_obd_statfs (struct obd_statfs *os);
+
+/* ost_body.data values for OST_BRW */
+
+#define OBD_BRW_READ       0x01
+#define OBD_BRW_WRITE     0x02
+#define OBD_BRW_RWMASK   (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_SYNC       0x08 /* this page is a part of synchronous
+                                     * transfer and is not accounted for in
+                                     * the grant. */
+#define OBD_BRW_CHECK     0x10
+#define OBD_BRW_FROM_GRANT      0x20 /* the osc manages this under llite */
+#define OBD_BRW_GRANTED         0x40 /* the ost manages this */
+#define OBD_BRW_NOCACHE         0x80 /* this page is a part of non-cached IO */
+#define OBD_BRW_NOQUOTA        0x100
+#define OBD_BRW_SRVLOCK        0x200 /* Client holds no lock over this page */
+#define OBD_BRW_ASYNC    0x400 /* Server may delay commit to disk */
+#define OBD_BRW_MEMALLOC       0x800 /* Client runs in the "kswapd" context */
+#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
+#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+
+#define OBD_OBJECT_EOF 0xffffffffffffffffULL
+
+#define OST_MIN_PRECREATE 32
+#define OST_MAX_PRECREATE 20000
+
+struct obd_ioobj {
+       struct ost_id   ioo_oid;        /* object ID, if multi-obj BRW */
+       __u32           ioo_max_brw;    /* low 16 bits were o_mode before 2.4,
+                                        * now (PTLRPC_BULK_OPS_COUNT - 1) in
+                                        * high 16 bits in 2.4 and later */
+       __u32           ioo_bufcnt;     /* number of niobufs for this object */
+};
+
+#define IOOBJ_MAX_BRW_BITS     16
+#define IOOBJ_TYPE_MASK                ((1U << IOOBJ_MAX_BRW_BITS) - 1)
+#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1)
+#define ioobj_max_brw_set(ioo, num)                                    \
+do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0)
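+/*
+ * For illustration: ioobj_max_brw_set(ioo, 4) stores (4 - 1) << 16, i.e.
+ * 0x30000, in ioo_max_brw, and ioobj_max_brw_get(ioo) then recovers
+ * (0x30000 >> 16) + 1 == 4.
+ */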
+
+extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo);
+
+/* multiple of 8 bytes => can array */
+struct niobuf_remote {
+       __u64 offset;
+       __u32 len;
+       __u32 flags;
+};
+
+extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr);
+
+/* lock value block communicated between the filter and llite */
+
+/* OST_LVB_ERR_INIT is needed because the return code in rc is
+ * negative, i.e. because ((MASK + rc) & MASK) != MASK. */
+#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL
+#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL
+#define OST_LVB_IS_ERR(blocks)                                   \
+       ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK)
+#define OST_LVB_SET_ERR(blocks, rc)                                 \
+       do { blocks = OST_LVB_ERR_INIT + rc; } while (0)
+#define OST_LVB_GET_ERR(blocks)    (int)(blocks - OST_LVB_ERR_INIT)
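+/*
+ * Worked example: OST_LVB_SET_ERR(blocks, -ENOENT) stores
+ * 0xffbadbad80000000ULL + (-2) == 0xffbadbad7ffffffeULL; that value still
+ * matches OST_LVB_ERR_MASK in OST_LVB_IS_ERR(), and OST_LVB_GET_ERR()
+ * recovers (int)(blocks - OST_LVB_ERR_INIT) == -2.
+ */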
+
+struct ost_lvb_v1 {
+       __u64           lvb_size;
+       obd_time        lvb_mtime;
+       obd_time        lvb_atime;
+       obd_time        lvb_ctime;
+       __u64           lvb_blocks;
+};
+
+extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb);
+
+struct ost_lvb {
+       __u64           lvb_size;
+       obd_time        lvb_mtime;
+       obd_time        lvb_atime;
+       obd_time        lvb_ctime;
+       __u64           lvb_blocks;
+       __u32           lvb_mtime_ns;
+       __u32           lvb_atime_ns;
+       __u32           lvb_ctime_ns;
+       __u32           lvb_padding;
+};
+
+extern void lustre_swab_ost_lvb(struct ost_lvb *lvb);
+
+/*
+ *   lquota data structures
+ */
+
+#ifndef QUOTABLOCK_BITS
+#define QUOTABLOCK_BITS 10
+#endif
+
+#ifndef QUOTABLOCK_SIZE
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+#endif
+
+#ifndef toqb
+#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS)
+#endif
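+/*
+ * For example, toqb() rounds a byte count up to 1KB quota blocks:
+ * toqb(0) == 0, toqb(1) == 1, toqb(1024) == 1 and toqb(1025) == 2.
+ */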
+
+/* The lquota_id structure is a union of all the possible identifier types that
+ * can be used with quota; this includes:
+ * - 64-bit user ID
+ * - 64-bit group ID
+ * - a FID which can be used for per-directory quota in the future */
+union lquota_id {
+       struct lu_fid   qid_fid; /* FID for per-directory quota */
+       __u64           qid_uid; /* user identifier */
+       __u64           qid_gid; /* group identifier */
+};
+
+/* quotactl management */
+struct obd_quotactl {
+       __u32                   qc_cmd;
+       __u32                   qc_type; /* see Q_* flag below */
+       __u32                   qc_id;
+       __u32                   qc_stat;
+       struct obd_dqinfo       qc_dqinfo;
+       struct obd_dqblk        qc_dqblk;
+};
+
+extern void lustre_swab_obd_quotactl(struct obd_quotactl *q);
+
+#define Q_QUOTACHECK   0x800100 /* deprecated as of 2.4 */
+#define Q_INITQUOTA    0x800101 /* deprecated as of 2.4  */
+#define Q_GETOINFO     0x800102 /* get obd quota info */
+#define Q_GETOQUOTA    0x800103 /* get obd quotas */
+#define Q_FINVALIDATE  0x800104 /* deprecated as of 2.4 */
+
+#define Q_COPY(out, in, member) (out)->member = (in)->member
+
+#define QCTL_COPY(out, in)             \
+do {                                   \
+       Q_COPY(out, in, qc_cmd);        \
+       Q_COPY(out, in, qc_type);       \
+       Q_COPY(out, in, qc_id);         \
+       Q_COPY(out, in, qc_stat);       \
+       Q_COPY(out, in, qc_dqinfo);     \
+       Q_COPY(out, in, qc_dqblk);      \
+} while (0)
+
+/* Body of quota request used for quota acquire/release RPCs between quota
+ * master (aka QMT) and slaves (aka QSD). */
+struct quota_body {
+       struct lu_fid   qb_fid;     /* FID of global index packing the pool ID
+                                     * and type (data or metadata) as well as
+                                     * the quota type (user or group). */
+       union lquota_id qb_id;      /* uid or gid or directory FID */
+       __u32           qb_flags;   /* see below */
+       __u32           qb_padding;
+       __u64           qb_count;   /* acquire/release count (kbytes/inodes) */
+       __u64           qb_usage;   /* current slave usage (kbytes/inodes) */
+       __u64           qb_slv_ver; /* slave index file version */
+       struct lustre_handle    qb_lockh;     /* per-ID lock handle */
+       struct lustre_handle    qb_glb_lockh; /* global lock handle */
+       __u64           qb_padding1[4];
+};
+
+/* When the quota_body is used in the reply to the quota global intent
+ * lock (IT_QUOTA_CONN) request, qb_fid contains the slave index file FID. */
+#define qb_slv_fid     qb_fid
+/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in
+ * quota reply */
+#define qb_qunit       qb_usage
+
+#define QUOTA_DQACQ_FL_ACQ     0x1  /* acquire quota */
+#define QUOTA_DQACQ_FL_PREACQ  0x2  /* pre-acquire */
+#define QUOTA_DQACQ_FL_REL     0x4  /* release quota */
+#define QUOTA_DQACQ_FL_REPORT  0x8  /* report usage */
+
+extern void lustre_swab_quota_body(struct quota_body *b);
+
+/* Quota types currently supported */
+enum {
+       LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */
+       LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */
+       LQUOTA_TYPE_MAX
+};
+
+/* There are 2 different resource types on which a quota limit can be enforced:
+ * - inodes on the MDTs
+ * - blocks on the OSTs */
+enum {
+       LQUOTA_RES_MD           = 0x01, /* skip 0 to avoid null oid in FID */
+       LQUOTA_RES_DT           = 0x02,
+       LQUOTA_LAST_RES,
+       LQUOTA_FIRST_RES        = LQUOTA_RES_MD
+};
+#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1)
+
+/*
+ * Space accounting support
+ * Format of an accounting record, providing disk usage information for a given
+ * user or group
+ */
+struct lquota_acct_rec { /* 16 bytes */
+       __u64 bspace;  /* current space in use */
+       __u64 ispace;  /* current # inodes in use */
+};
+
+/*
+ * Global quota index support
+ * Format of a global record, providing global quota settings for a given quota
+ * identifier
+ */
+struct lquota_glb_rec { /* 32 bytes */
+       __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */
+       __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */
+       __u64 qbr_time;      /* grace time, in seconds */
+       __u64 qbr_granted;   /* how much is granted to slaves, in #inodes or
+                             * kbytes */
+};
+
+/*
+ * Slave index support
+ * Format of a slave record, recording how much space is granted to a given
+ * slave
+ */
+struct lquota_slv_rec { /* 8 bytes */
+       __u64 qsr_granted; /* space granted to the slave for the key=ID,
+                           * in #inodes or kbytes */
+};
+
+/* Data structures associated with the quota locks */
+
+/* Glimpse descriptor used for the index & per-ID quota locks */
+struct ldlm_gl_lquota_desc {
+       union lquota_id gl_id;    /* quota ID subject to the glimpse */
+       __u64           gl_flags; /* see LQUOTA_FL* below */
+       __u64           gl_ver;   /* new index version */
+       __u64           gl_hardlimit; /* new hardlimit or qunit value */
+       __u64           gl_softlimit; /* new softlimit */
+       __u64           gl_time;
+       __u64           gl_pad2;
+};
+#define gl_qunit       gl_hardlimit /* current qunit value used when
+                                     * glimpsing per-ID quota locks */
+
+/* quota glimpse flags */
+#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */
+
+/* LVB used with quota (global and per-ID) locks */
+struct lquota_lvb {
+       __u64   lvb_flags;      /* see LQUOTA_FL* above */
+       __u64   lvb_id_may_rel; /* space that might be released later */
+       __u64   lvb_id_rel;     /* space released by the slave for this ID */
+       __u64   lvb_id_qunit;   /* current qunit value */
+       __u64   lvb_pad1;
+};
+
+extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb);
+
+/* LVB used with global quota lock */
+#define lvb_glb_ver  lvb_id_may_rel /* current version of the global index */
+
+/* op codes */
+typedef enum {
+       QUOTA_DQACQ     = 601,
+       QUOTA_DQREL     = 602,
+       QUOTA_LAST_OPC
+} quota_cmd_t;
+#define QUOTA_FIRST_OPC        QUOTA_DQACQ
+
+/*
+ *   MDS REQ RECORDS
+ */
+
+/* opcodes */
+typedef enum {
+       MDS_GETATTR             = 33,
+       MDS_GETATTR_NAME        = 34,
+       MDS_CLOSE               = 35,
+       MDS_REINT               = 36,
+       MDS_READPAGE            = 37,
+       MDS_CONNECT             = 38,
+       MDS_DISCONNECT          = 39,
+       MDS_GETSTATUS           = 40,
+       MDS_STATFS              = 41,
+       MDS_PIN                 = 42,
+       MDS_UNPIN               = 43,
+       MDS_SYNC                = 44,
+       MDS_DONE_WRITING        = 45,
+       MDS_SET_INFO            = 46,
+       MDS_QUOTACHECK          = 47,
+       MDS_QUOTACTL            = 48,
+       MDS_GETXATTR            = 49,
+       MDS_SETXATTR            = 50, /* obsolete, now it's MDS_REINT op */
+       MDS_WRITEPAGE           = 51,
+       MDS_IS_SUBDIR           = 52,
+       MDS_GET_INFO            = 53,
+       MDS_HSM_STATE_GET       = 54,
+       MDS_HSM_STATE_SET       = 55,
+       MDS_HSM_ACTION          = 56,
+       MDS_HSM_PROGRESS        = 57,
+       MDS_HSM_REQUEST         = 58,
+       MDS_HSM_CT_REGISTER     = 59,
+       MDS_HSM_CT_UNREGISTER   = 60,
+       MDS_SWAP_LAYOUTS        = 61,
+       MDS_LAST_OPC
+} mds_cmd_t;
+
+#define MDS_FIRST_OPC    MDS_GETATTR
+
+
+/* opcodes for object update */
+typedef enum {
+       UPDATE_OBJ      = 1000,
+       UPDATE_LAST_OPC
+} update_cmd_t;
+
+#define UPDATE_FIRST_OPC    UPDATE_OBJ
+
+/*
+ * Do not exceed 63
+ */
+
+typedef enum {
+       REINT_SETATTR  = 1,
+       REINT_CREATE   = 2,
+       REINT_LINK     = 3,
+       REINT_UNLINK   = 4,
+       REINT_RENAME   = 5,
+       REINT_OPEN     = 6,
+       REINT_SETXATTR = 7,
+       REINT_RMENTRY  = 8,
+//      REINT_WRITE    = 9,
+       REINT_MAX
+} mds_reint_t, mdt_reint_t;
+
+extern void lustre_swab_generic_32s (__u32 *val);
+
+/* the disposition of the intent outlines what was executed */
+#define DISP_IT_EXECD  0x00000001
+#define DISP_LOOKUP_EXECD    0x00000002
+#define DISP_LOOKUP_NEG      0x00000004
+#define DISP_LOOKUP_POS      0x00000008
+#define DISP_OPEN_CREATE     0x00000010
+#define DISP_OPEN_OPEN       0x00000020
+#define DISP_ENQ_COMPLETE    0x00400000
+#define DISP_ENQ_OPEN_REF    0x00800000
+#define DISP_ENQ_CREATE_REF  0x01000000
+#define DISP_OPEN_LOCK       0x02000000
+
+/* INODE LOCK PARTS */
+#define MDS_INODELOCK_LOOKUP 0x000001       /* dentry, mode, owner, group */
+#define MDS_INODELOCK_UPDATE 0x000002       /* size, links, timestamps */
+#define MDS_INODELOCK_OPEN   0x000004       /* For opened files */
+#define MDS_INODELOCK_LAYOUT 0x000008       /* for layout */
+#define MDS_INODELOCK_PERM   0x000010       /* for permission */
+
+#define MDS_INODELOCK_MAXSHIFT 4
+/* This FULL lock is useful to take for unlink-type operations */
+#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
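+/*
+ * With MDS_INODELOCK_MAXSHIFT == 4 this works out to (1 << 5) - 1 == 0x1f,
+ * i.e. all five inode lock bits (LOOKUP|UPDATE|OPEN|LAYOUT|PERM) at once.
+ */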
+
+extern void lustre_swab_ll_fid (struct ll_fid *fid);
+
+/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * name[2,3] fields that need to be used for the quota id (also a FID). */
+enum {
+       LUSTRE_RES_ID_SEQ_OFF = 0,
+       LUSTRE_RES_ID_VER_OID_OFF = 1,
+       LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */
+       LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2,
+       LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3,
+       LUSTRE_RES_ID_HSH_OFF = 3
+};
+
+#define MDS_STATUS_CONN 1
+#define MDS_STATUS_LOV 2
+
+/* mdt_thread_info.mti_flags. */
+enum md_op_flags {
+       /* The flag indicates that Size-on-MDS attributes have changed. */
+       MF_SOM_CHANGE      = (1 << 0),
+       /* Flags indicating that an epoch opens or closes. */
+       MF_EPOCH_OPEN      = (1 << 1),
+       MF_EPOCH_CLOSE    = (1 << 2),
+       MF_MDC_CANCEL_FID1      = (1 << 3),
+       MF_MDC_CANCEL_FID2      = (1 << 4),
+       MF_MDC_CANCEL_FID3      = (1 << 5),
+       MF_MDC_CANCEL_FID4      = (1 << 6),
+       /* There is a pending attribute update. */
+       MF_SOM_AU              = (1 << 7),
+       /* Cancel OST locks while getattr OST attributes. */
+       MF_GETATTR_LOCK  = (1 << 8),
+       MF_GET_MDT_IDX    = (1 << 9),
+};
+
+#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE)
+
+#define LUSTRE_BFLAG_UNCOMMITTED_WRITES   0x1
+
+/* these should be identical to their EXT4_*_FL counterparts, they are
+ * redefined here only to avoid dragging in fs/ext4/ext4.h */
+#define LUSTRE_SYNC_FL  0x00000008 /* Synchronous updates */
+#define LUSTRE_IMMUTABLE_FL    0x00000010 /* Immutable file */
+#define LUSTRE_APPEND_FL       0x00000020 /* writes to file may only append */
+#define LUSTRE_NOATIME_FL      0x00000080 /* do not update atime */
+#define LUSTRE_DIRSYNC_FL      0x00010000 /* dirsync behaviour (dir only) */
+
+/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values
+ * for the client inode i_flags.  The LUSTRE_*_FL are the Lustre wire
+ * protocol equivalents of LDISKFS_*_FL values stored on disk, while
+ * the S_* flags are kernel-internal values that change between kernel
+ * versions.  These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS.
+ * See b=16526 for a full history. */
+static inline int ll_ext_to_inode_flags(int flags)
+{
+       return (((flags & LUSTRE_SYNC_FL)      ? S_SYNC      : 0) |
+               ((flags & LUSTRE_NOATIME_FL)   ? S_NOATIME   : 0) |
+               ((flags & LUSTRE_APPEND_FL)    ? S_APPEND    : 0) |
+#if defined(S_DIRSYNC)
+               ((flags & LUSTRE_DIRSYNC_FL)   ? S_DIRSYNC   : 0) |
+#endif
+               ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0));
+}
+
+static inline int ll_inode_to_ext_flags(int iflags)
+{
+       return (((iflags & S_SYNC)      ? LUSTRE_SYNC_FL      : 0) |
+               ((iflags & S_NOATIME)   ? LUSTRE_NOATIME_FL   : 0) |
+               ((iflags & S_APPEND)    ? LUSTRE_APPEND_FL    : 0) |
+#if defined(S_DIRSYNC)
+               ((iflags & S_DIRSYNC)   ? LUSTRE_DIRSYNC_FL   : 0) |
+#endif
+               ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0));
+}
+
+struct mdt_body {
+       struct lu_fid  fid1;
+       struct lu_fid  fid2;
+       struct lustre_handle handle;
+       __u64     valid;
+       __u64     size;   /* Offset, in the case of MDS_READPAGE */
+       obd_time        mtime;
+       obd_time        atime;
+       obd_time        ctime;
+       __u64     blocks; /* XID, in the case of MDS_READPAGE */
+       __u64     ioepoch;
+       __u64          unused1; /* was "ino" until 2.4.0 */
+       __u32     fsuid;
+       __u32     fsgid;
+       __u32     capability;
+       __u32     mode;
+       __u32     uid;
+       __u32     gid;
+       __u32     flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */
+       __u32     rdev;
+       __u32     nlink; /* #bytes to read in the case of MDS_READPAGE */
+       __u32          unused2; /* was "generation" until 2.4.0 */
+       __u32     suppgid;
+       __u32     eadatasize;
+       __u32     aclsize;
+       __u32     max_mdsize;
+       __u32     max_cookiesize;
+       __u32     uid_h; /* high 32-bits of uid, for FUID */
+       __u32     gid_h; /* high 32-bits of gid, for FUID */
+       __u32     padding_5; /* also fix lustre_swab_mdt_body */
+       __u64     padding_6;
+       __u64     padding_7;
+       __u64     padding_8;
+       __u64     padding_9;
+       __u64     padding_10;
+}; /* 216 */
+
+extern void lustre_swab_mdt_body (struct mdt_body *b);
+
+struct mdt_ioepoch {
+       struct lustre_handle handle;
+       __u64  ioepoch;
+       __u32  flags;
+       __u32  padding;
+};
+
+extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b);
+
+/* permissions for md_perm.mp_perm */
+enum {
+       CFS_SETUID_PERM = 0x01,
+       CFS_SETGID_PERM = 0x02,
+       CFS_SETGRP_PERM = 0x04,
+       CFS_RMTACL_PERM = 0x08,
+       CFS_RMTOWN_PERM = 0x10
+};
+
+/* inode access permission for a remote user; the inode info is omitted,
+ * since the client already knows it. */
+struct mdt_remote_perm {
+       __u32      rp_uid;
+       __u32      rp_gid;
+       __u32      rp_fsuid;
+       __u32      rp_fsuid_h;
+       __u32      rp_fsgid;
+       __u32      rp_fsgid_h;
+       __u32      rp_access_perm; /* MAY_READ/WRITE/EXEC */
+       __u32      rp_padding;
+};
+
+extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p);
+
+struct mdt_rec_setattr {
+       __u32      sa_opcode;
+       __u32      sa_cap;
+       __u32      sa_fsuid;
+       __u32      sa_fsuid_h;
+       __u32      sa_fsgid;
+       __u32      sa_fsgid_h;
+       __u32      sa_suppgid;
+       __u32      sa_suppgid_h;
+       __u32      sa_padding_1;
+       __u32      sa_padding_1_h;
+       struct lu_fid   sa_fid;
+       __u64      sa_valid;
+       __u32      sa_uid;
+       __u32      sa_gid;
+       __u64      sa_size;
+       __u64      sa_blocks;
+       obd_time        sa_mtime;
+       obd_time        sa_atime;
+       obd_time        sa_ctime;
+       __u32      sa_attr_flags;
+       __u32      sa_mode;
+       __u32      sa_bias;      /* some operation flags */
+       __u32      sa_padding_3;
+       __u32      sa_padding_4;
+       __u32      sa_padding_5;
+};
+
+extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa);
+
+/*
+ * Attribute flags used in mdt_rec_setattr::sa_valid.
+ * The kernel's #defines for ATTR_* should not be used over the network
+ * since the client and MDS may run different kernels (see bug 13828)
+ * Therefore, we should only use MDS_ATTR_* attributes for sa_valid.
+ */
+#define MDS_ATTR_MODE    0x1ULL /* = 1 */
+#define MDS_ATTR_UID      0x2ULL /* = 2 */
+#define MDS_ATTR_GID      0x4ULL /* = 4 */
+#define MDS_ATTR_SIZE    0x8ULL /* = 8 */
+#define MDS_ATTR_ATIME 0x10ULL /* = 16 */
+#define MDS_ATTR_MTIME 0x20ULL /* = 32 */
+#define MDS_ATTR_CTIME 0x40ULL /* = 64 */
+#define MDS_ATTR_ATIME_SET    0x80ULL /* = 128 */
+#define MDS_ATTR_MTIME_SET   0x100ULL /* = 256 */
+#define MDS_ATTR_FORCE       0x200ULL /* = 512, not a change, but force the change through */
+#define MDS_ATTR_ATTR_FLAG   0x400ULL /* = 1024 */
+#define MDS_ATTR_KILL_SUID   0x800ULL /* = 2048 */
+#define MDS_ATTR_KILL_SGID  0x1000ULL /* = 4096 */
+#define MDS_ATTR_CTIME_SET  0x2000ULL /* = 8192 */
+#define MDS_ATTR_FROM_OPEN  0x4000ULL /* = 16384, called from open path, ie O_TRUNC */
+#define MDS_ATTR_BLOCKS     0x8000ULL /* = 32768 */
+
+#ifndef FMODE_READ
+#define FMODE_READ            00000001
+#define FMODE_WRITE          00000002
+#endif
+
+#define MDS_FMODE_CLOSED        00000000
+#define MDS_FMODE_EXEC    00000004
+/* IO Epoch is opened on a closed file. */
+#define MDS_FMODE_EPOCH          01000000
+/* IO Epoch is opened on a file truncate. */
+#define MDS_FMODE_TRUNC          02000000
+/* Size-on-MDS Attribute Update is pending. */
+#define MDS_FMODE_SOM      04000000
+
+#define MDS_OPEN_CREATED        00000010
+#define MDS_OPEN_CROSS    00000020
+
+#define MDS_OPEN_CREAT    00000100
+#define MDS_OPEN_EXCL      00000200
+#define MDS_OPEN_TRUNC    00001000
+#define MDS_OPEN_APPEND          00002000
+#define MDS_OPEN_SYNC      00010000
+#define MDS_OPEN_DIRECTORY       00200000
+
+#define MDS_OPEN_BY_FID                040000000 /* open_by_fid for known object */
+#define MDS_OPEN_DELAY_CREATE  0100000000 /* delay initial object create */
+#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE     0400000000 /* open for join file.
+                                          * JOIN FILE is no longer
+                                          * supported; this flag is kept
+                                          * only to prevent the bit from
+                                          * being reused. */
+
+#define MDS_OPEN_LOCK   04000000000 /* This open requires open lock */
+#define MDS_OPEN_HAS_EA      010000000000 /* specify object create pattern */
+#define MDS_OPEN_HAS_OBJS    020000000000 /* Just set the EA; the objects already exist */
+#define MDS_OPEN_NORESTORE  0100000000000ULL /* Do not restore file at open */
+#define MDS_OPEN_NEWSTRIPE  0200000000000ULL /* New stripe needed (restripe or
+                                             * hsm restore) */
+#define MDS_OPEN_VOLATILE   0400000000000ULL /* File is volatile = created
+                                               unlinked */
+
+/* permission to create a non-directory file */
+#define MAY_CREATE      (1 << 7)
+/* permission to create a directory */
+#define MAY_LINK       (1 << 8)
+/* permission to delete from a directory */
+#define MAY_UNLINK      (1 << 9)
+/* source's permission for rename */
+#define MAY_RENAME_SRC  (1 << 10)
+/* target's permission for rename */
+#define MAY_RENAME_TAR  (1 << 11)
+/* part (parent's) VTX permission check */
+#define MAY_VTX_PART    (1 << 12)
+/* full VTX permission check */
+#define MAY_VTX_FULL    (1 << 13)
+/* lfs rgetfacl permission check */
+#define MAY_RGETFACL    (1 << 14)
+
+enum {
+       MDS_CHECK_SPLIT         = 1 << 0,
+       MDS_CROSS_REF           = 1 << 1,
+       MDS_VTX_BYPASS          = 1 << 2,
+       MDS_PERM_BYPASS         = 1 << 3,
+       MDS_SOM                 = 1 << 4,
+       MDS_QUOTA_IGNORE        = 1 << 5,
+       MDS_CLOSE_CLEANUP       = 1 << 6,
+       MDS_KEEP_ORPHAN         = 1 << 7,
+       MDS_RECOV_OPEN          = 1 << 8,
+       MDS_DATA_MODIFIED       = 1 << 9,
+       MDS_CREATE_VOLATILE     = 1 << 10,
+       MDS_OWNEROVERRIDE       = 1 << 11,
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_create {
+       __u32      cr_opcode;
+       __u32      cr_cap;
+       __u32      cr_fsuid;
+       __u32      cr_fsuid_h;
+       __u32      cr_fsgid;
+       __u32      cr_fsgid_h;
+       __u32      cr_suppgid1;
+       __u32      cr_suppgid1_h;
+       __u32      cr_suppgid2;
+       __u32      cr_suppgid2_h;
+       struct lu_fid   cr_fid1;
+       struct lu_fid   cr_fid2;
+       struct lustre_handle cr_old_handle; /* handle in case of open replay */
+       obd_time        cr_time;
+       __u64      cr_rdev;
+       __u64      cr_ioepoch;
+       __u64      cr_padding_1;   /* rr_blocks */
+       __u32      cr_mode;
+       __u32      cr_bias;
+       /* use of helpers set/get_mrc_cr_flags() is needed to access
+        * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to
+        * extend cr_flags size without breaking 1.8 compat */
+       __u32      cr_flags_l;     /* for use with open, low  32 bits  */
+       __u32      cr_flags_h;     /* for use with open, high 32 bits */
+       __u32      cr_umask;       /* umask for create */
+       __u32      cr_padding_4;   /* rr_padding_4 */
+};
+
+static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags)
+{
+       mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll);
+       mrc->cr_flags_h = (__u32)(flags >> 32);
+}
+
+static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc)
+{
+       return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32));
+}
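
A short usage sketch for the helpers above (illustrative only; MDS_OPEN_VOLATILE is chosen because it sits above bit 31, so both halves of cr_flags are exercised):

    static inline void demo_mrc_cr_flags(void)
    {
            struct mdt_rec_create mrc = { 0 };

            set_mrc_cr_flags(&mrc, MDS_OPEN_CREAT | MDS_OPEN_VOLATILE);
            /* MDS_OPEN_VOLATILE is above bit 31, so it lands in cr_flags_h */
            LASSERT(get_mrc_cr_flags(&mrc) ==
                    (MDS_OPEN_CREAT | MDS_OPEN_VOLATILE));
    }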
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_link {
+       __u32      lk_opcode;
+       __u32      lk_cap;
+       __u32      lk_fsuid;
+       __u32      lk_fsuid_h;
+       __u32      lk_fsgid;
+       __u32      lk_fsgid_h;
+       __u32      lk_suppgid1;
+       __u32      lk_suppgid1_h;
+       __u32      lk_suppgid2;
+       __u32      lk_suppgid2_h;
+       struct lu_fid   lk_fid1;
+       struct lu_fid   lk_fid2;
+       obd_time        lk_time;
+       __u64      lk_padding_1;   /* rr_atime */
+       __u64      lk_padding_2;   /* rr_ctime */
+       __u64      lk_padding_3;   /* rr_size */
+       __u64      lk_padding_4;   /* rr_blocks */
+       __u32      lk_bias;
+       __u32      lk_padding_5;   /* rr_mode */
+       __u32      lk_padding_6;   /* rr_flags */
+       __u32      lk_padding_7;   /* rr_padding_2 */
+       __u32      lk_padding_8;   /* rr_padding_3 */
+       __u32      lk_padding_9;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_unlink {
+       __u32      ul_opcode;
+       __u32      ul_cap;
+       __u32      ul_fsuid;
+       __u32      ul_fsuid_h;
+       __u32      ul_fsgid;
+       __u32      ul_fsgid_h;
+       __u32      ul_suppgid1;
+       __u32      ul_suppgid1_h;
+       __u32      ul_suppgid2;
+       __u32      ul_suppgid2_h;
+       struct lu_fid   ul_fid1;
+       struct lu_fid   ul_fid2;
+       obd_time        ul_time;
+       __u64      ul_padding_2;   /* rr_atime */
+       __u64      ul_padding_3;   /* rr_ctime */
+       __u64      ul_padding_4;   /* rr_size */
+       __u64      ul_padding_5;   /* rr_blocks */
+       __u32      ul_bias;
+       __u32      ul_mode;
+       __u32      ul_padding_6;   /* rr_flags */
+       __u32      ul_padding_7;   /* rr_padding_2 */
+       __u32      ul_padding_8;   /* rr_padding_3 */
+       __u32      ul_padding_9;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_rename {
+       __u32      rn_opcode;
+       __u32      rn_cap;
+       __u32      rn_fsuid;
+       __u32      rn_fsuid_h;
+       __u32      rn_fsgid;
+       __u32      rn_fsgid_h;
+       __u32      rn_suppgid1;
+       __u32      rn_suppgid1_h;
+       __u32      rn_suppgid2;
+       __u32      rn_suppgid2_h;
+       struct lu_fid   rn_fid1;
+       struct lu_fid   rn_fid2;
+       obd_time        rn_time;
+       __u64      rn_padding_1;   /* rr_atime */
+       __u64      rn_padding_2;   /* rr_ctime */
+       __u64      rn_padding_3;   /* rr_size */
+       __u64      rn_padding_4;   /* rr_blocks */
+       __u32      rn_bias;     /* some operation flags */
+       __u32      rn_mode;     /* cross-ref rename has mode */
+       __u32      rn_padding_5;   /* rr_flags */
+       __u32      rn_padding_6;   /* rr_padding_2 */
+       __u32      rn_padding_7;   /* rr_padding_3 */
+       __u32      rn_padding_8;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_setxattr {
+       __u32      sx_opcode;
+       __u32      sx_cap;
+       __u32      sx_fsuid;
+       __u32      sx_fsuid_h;
+       __u32      sx_fsgid;
+       __u32      sx_fsgid_h;
+       __u32      sx_suppgid1;
+       __u32      sx_suppgid1_h;
+       __u32      sx_suppgid2;
+       __u32      sx_suppgid2_h;
+       struct lu_fid   sx_fid;
+       __u64      sx_padding_1;   /* These three are rr_fid2 */
+       __u32      sx_padding_2;
+       __u32      sx_padding_3;
+       __u64      sx_valid;
+       obd_time        sx_time;
+       __u64      sx_padding_5;   /* rr_ctime */
+       __u64      sx_padding_6;   /* rr_size */
+       __u64      sx_padding_7;   /* rr_blocks */
+       __u32      sx_size;
+       __u32      sx_flags;
+       __u32      sx_padding_8;   /* rr_flags */
+       __u32      sx_padding_9;   /* rr_padding_2 */
+       __u32      sx_padding_10;  /* rr_padding_3 */
+       __u32      sx_padding_11;  /* rr_padding_4 */
+};
+
+/*
+ * mdt_rec_reint is the template for all mdt_reint_xxx structures.
+ * Do NOT change the size of the various members, otherwise the values
+ * will be corrupted by lustre_swab_mdt_rec_reint().
+ *
+ * If you add new members in other mdt_reint_xxx structures and need to use
+ * the rr_padding_x fields, then also update lustre_swab_mdt_rec_reint().
+ */
+struct mdt_rec_reint {
+       __u32      rr_opcode;
+       __u32      rr_cap;
+       __u32      rr_fsuid;
+       __u32      rr_fsuid_h;
+       __u32      rr_fsgid;
+       __u32      rr_fsgid_h;
+       __u32      rr_suppgid1;
+       __u32      rr_suppgid1_h;
+       __u32      rr_suppgid2;
+       __u32      rr_suppgid2_h;
+       struct lu_fid   rr_fid1;
+       struct lu_fid   rr_fid2;
+       obd_time        rr_mtime;
+       obd_time        rr_atime;
+       obd_time        rr_ctime;
+       __u64      rr_size;
+       __u64      rr_blocks;
+       __u32      rr_bias;
+       __u32      rr_mode;
+       __u32      rr_flags;
+       __u32      rr_flags_h;
+       __u32      rr_umask;
+       __u32      rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
+};
+
+extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr);
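
One hedged way to keep the size rule above honest at compile time (an illustrative check, not part of the patch) is a negative-array-size assertion:

    /* fails to compile if mdt_rec_create ever drifts from the template size */
    typedef char demo_reint_size_check[sizeof(struct mdt_rec_create) ==
                                       sizeof(struct mdt_rec_reint) ? 1 : -1];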
+
+struct lmv_desc {
+       __u32 ld_tgt_count;             /* how many MDS's */
+       __u32 ld_active_tgt_count;       /* how many active */
+       __u32 ld_default_stripe_count;     /* how many objects are used */
+       __u32 ld_pattern;                 /* default MEA_MAGIC_* */
+       __u64 ld_default_hash_size;
+       __u64 ld_padding_1;             /* also fix lustre_swab_lmv_desc */
+       __u32 ld_padding_2;             /* also fix lustre_swab_lmv_desc */
+       __u32 ld_qos_maxage;           /* in seconds */
+       __u32 ld_padding_3;             /* also fix lustre_swab_lmv_desc */
+       __u32 ld_padding_4;             /* also fix lustre_swab_lmv_desc */
+       struct obd_uuid ld_uuid;
+};
+
+extern void lustre_swab_lmv_desc (struct lmv_desc *ld);
+
+/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */
+struct lmv_stripe_md {
+       __u32    mea_magic;
+       __u32    mea_count;
+       __u32    mea_master;
+       __u32    mea_padding;
+       char      mea_pool_name[LOV_MAXPOOLNAME];
+       struct lu_fid mea_ids[0];
+};
+
+extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea);
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32        0x7fffffffUL
+#define MAX_HASH_SIZE      0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+enum fld_rpc_opc {
+       FLD_QUERY                      = 900,
+       FLD_LAST_OPC,
+       FLD_FIRST_OPC              = FLD_QUERY
+};
+
+enum seq_rpc_opc {
+       SEQ_QUERY                      = 700,
+       SEQ_LAST_OPC,
+       SEQ_FIRST_OPC              = SEQ_QUERY
+};
+
+enum seq_op {
+       SEQ_ALLOC_SUPER = 0,
+       SEQ_ALLOC_META = 1
+};
+
+/*
+ *  LOV data structures
+ */
+
+#define LOV_MAX_UUID_BUFFER_SIZE  8192
+/* The size of the buffer the lov/mdc reserves for the
+ * array of UUIDs returned by the MDS.  With the current
+ * protocol, this will limit the max number of OSTs per LOV */
+
+#define LOV_DESC_MAGIC 0xB0CCDE5C
+
+/* LOV settings descriptor (should only contain static info) */
+struct lov_desc {
+       __u32 ld_tgt_count;             /* how many OBD's */
+       __u32 ld_active_tgt_count;       /* how many active */
+       __u32 ld_default_stripe_count;     /* how many objects are used */
+       __u32 ld_pattern;                 /* default PATTERN_RAID0 */
+       __u64 ld_default_stripe_size;      /* in bytes */
+       __u64 ld_default_stripe_offset;    /* in bytes */
+       __u32 ld_padding_0;             /* unused */
+       __u32 ld_qos_maxage;           /* in seconds */
+       __u32 ld_padding_1;             /* also fix lustre_swab_lov_desc */
+       __u32 ld_padding_2;             /* also fix lustre_swab_lov_desc */
+       struct obd_uuid ld_uuid;
+};
+
+#define ld_magic ld_active_tgt_count       /* for swabbing from llogs */
+
+extern void lustre_swab_lov_desc (struct lov_desc *ld);
+
+/*
+ *   LDLM requests:
+ */
+/* opcodes -- MUST be distinct from OST/MDS opcodes */
+typedef enum {
+       LDLM_ENQUEUE     = 101,
+       LDLM_CONVERT     = 102,
+       LDLM_CANCEL      = 103,
+       LDLM_BL_CALLBACK = 104,
+       LDLM_CP_CALLBACK = 105,
+       LDLM_GL_CALLBACK = 106,
+       LDLM_SET_INFO    = 107,
+       LDLM_LAST_OPC
+} ldlm_cmd_t;
+#define LDLM_FIRST_OPC LDLM_ENQUEUE
+
+#define RES_NAME_SIZE 4
+struct ldlm_res_id {
+       __u64 name[RES_NAME_SIZE];
+};
+
+extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id);
+
+static inline int ldlm_res_eq(const struct ldlm_res_id *res0,
+                             const struct ldlm_res_id *res1)
+{
+       return !memcmp(res0, res1, sizeof(*res0));
+}
+
+/* lock types */
+typedef enum {
+       LCK_MINMODE = 0,
+       LCK_EX      = 1,
+       LCK_PW      = 2,
+       LCK_PR      = 4,
+       LCK_CW      = 8,
+       LCK_CR      = 16,
+       LCK_NL      = 32,
+       LCK_GROUP   = 64,
+       LCK_COS     = 128,
+       LCK_MAXMODE
+} ldlm_mode_t;
+
+#define LCK_MODE_NUM    8
+
+typedef enum {
+       LDLM_PLAIN     = 10,
+       LDLM_EXTENT    = 11,
+       LDLM_FLOCK     = 12,
+       LDLM_IBITS     = 13,
+       LDLM_MAX_TYPE
+} ldlm_type_t;
+
+#define LDLM_MIN_TYPE LDLM_PLAIN
+
+struct ldlm_extent {
+       __u64 start;
+       __u64 end;
+       __u64 gid;
+};
+
+static inline int ldlm_extent_overlap(struct ldlm_extent *ex1,
+                                     struct ldlm_extent *ex2)
+{
+       return (ex1->start <= ex2->end) && (ex2->start <= ex1->end);
+}
+
+/* check if @ex1 contains @ex2 */
+static inline int ldlm_extent_contain(struct ldlm_extent *ex1,
+                                     struct ldlm_extent *ex2)
+{
+       return (ex1->start <= ex2->start) && (ex1->end >= ex2->end);
+}
+
+struct ldlm_inodebits {
+       __u64 bits;
+};
+
+struct ldlm_flock_wire {
+       __u64 lfw_start;
+       __u64 lfw_end;
+       __u64 lfw_owner;
+       __u32 lfw_padding;
+       __u32 lfw_pid;
+};
+
+/* it's important that the fields of the ldlm_extent structure match
+ * the first fields of the ldlm_flock structure because there is only
+ * one ldlm_swab routine to process the ldlm_policy_data_t union. if
+ * this ever changes we will need to swab the union differently based
+ * on the resource type. */
+
+typedef union {
+       struct ldlm_extent l_extent;
+       struct ldlm_flock_wire l_flock;
+       struct ldlm_inodebits l_inodebits;
+} ldlm_wire_policy_data_t;
+
+extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d);
+
+union ldlm_gl_desc {
+       struct ldlm_gl_lquota_desc      lquota_desc;
+};
+
+extern void lustre_swab_gl_desc(union ldlm_gl_desc *);
+
+struct ldlm_intent {
+       __u64 opc;
+};
+
+extern void lustre_swab_ldlm_intent (struct ldlm_intent *i);
+
+struct ldlm_resource_desc {
+       ldlm_type_t lr_type;
+       __u32 lr_padding;       /* also fix lustre_swab_ldlm_resource_desc */
+       struct ldlm_res_id lr_name;
+};
+
+extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r);
+
+struct ldlm_lock_desc {
+       struct ldlm_resource_desc l_resource;
+       ldlm_mode_t l_req_mode;
+       ldlm_mode_t l_granted_mode;
+       ldlm_wire_policy_data_t l_policy_data;
+};
+
+extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l);
+
+#define LDLM_LOCKREQ_HANDLES 2
+#define LDLM_ENQUEUE_CANCEL_OFF 1
+
+struct ldlm_request {
+       __u32 lock_flags;
+       __u32 lock_count;
+       struct ldlm_lock_desc lock_desc;
+       struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES];
+};
+
+extern void lustre_swab_ldlm_request (struct ldlm_request *rq);
+
+/* If LDLM_ENQUEUE, 1 slot is already occupied, 1 is available.
+ * Otherwise, 2 are available. */
+#define ldlm_request_bufsize(count,type)                               \
+({                                                                   \
+       int _avail = LDLM_LOCKREQ_HANDLES;                            \
+       _avail -= (type == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \
+       sizeof(struct ldlm_request) +                              \
+       (count > _avail ? count - _avail : 0) *                  \
+       sizeof(struct lustre_handle);                              \
+})
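
A worked example of the slot accounting described above (illustrative only): an enqueue carrying five cancel handles leaves one embedded slot free, so four extra lustre_handle entries are appended after the fixed part of the request.

    static inline int demo_enqueue_bufsize(void)
    {
            /* LDLM_ENQUEUE: 2 embedded slots - 1 occupied = 1 available,
             * so 5 cancel handles need 4 extra lustre_handle entries */
            return ldlm_request_bufsize(5, LDLM_ENQUEUE);
    }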
+
+struct ldlm_reply {
+       __u32 lock_flags;
+       __u32 lock_padding;     /* also fix lustre_swab_ldlm_reply */
+       struct ldlm_lock_desc lock_desc;
+       struct lustre_handle lock_handle;
+       __u64  lock_policy_res1;
+       __u64  lock_policy_res2;
+};
+
+extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
+
+#define ldlm_flags_to_wire(flags)    ((__u32)(flags))
+#define ldlm_flags_from_wire(flags)  ((__u64)(flags))
+
+/*
+ * Opcodes for mountconf (mgs and mgc)
+ */
+typedef enum {
+       MGS_CONNECT = 250,
+       MGS_DISCONNECT,
+       MGS_EXCEPTION,   /* node died, etc. */
+       MGS_TARGET_REG, /* whenever target starts up */
+       MGS_TARGET_DEL,
+       MGS_SET_INFO,
+       MGS_CONFIG_READ,
+       MGS_LAST_OPC
+} mgs_cmd_t;
+#define MGS_FIRST_OPC MGS_CONNECT
+
+#define MGS_PARAM_MAXLEN 1024
+#define KEY_SET_INFO "set_info"
+
+struct mgs_send_param {
+       char         mgs_param[MGS_PARAM_MAXLEN];
+};
+
+/* We pass this info to the MGS so it can write config logs */
+#define MTI_NAME_MAXLEN  64
+#define MTI_PARAM_MAXLEN 4096
+#define MTI_NIDS_MAX     32
+struct mgs_target_info {
+       __u32       mti_lustre_ver;
+       __u32       mti_stripe_index;
+       __u32       mti_config_ver;
+       __u32       mti_flags;
+       __u32       mti_nid_count;
+       __u32       mti_instance; /* Running instance of target */
+       char         mti_fsname[MTI_NAME_MAXLEN];
+       char         mti_svname[MTI_NAME_MAXLEN];
+       char         mti_uuid[sizeof(struct obd_uuid)];
+       __u64       mti_nids[MTI_NIDS_MAX];     /* host nids (lnet_nid_t)*/
+       char         mti_params[MTI_PARAM_MAXLEN];
+};
+extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo);
+
+struct mgs_nidtbl_entry {
+       __u64      mne_version;    /* table version of this entry */
+       __u32      mne_instance;   /* target instance # */
+       __u32      mne_index;      /* target index */
+       __u32      mne_length;     /* length of this entry, in bytes */
+       __u8        mne_type;       /* target type LDD_F_SV_TYPE_OST/MDT */
+       __u8        mne_nid_type;   /* type of nid(mbz). for ipv6. */
+       __u8        mne_nid_size;   /* size of each NID, in bytes */
+       __u8        mne_nid_count;  /* # of NIDs in buffer */
+       union {
+               lnet_nid_t nids[0];     /* variable size buffer for NIDs. */
+       } u;
+};
+extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo);
+
+struct mgs_config_body {
+       char     mcb_name[MTI_NAME_MAXLEN]; /* logname */
+       __u64    mcb_offset;    /* next index of config log to request */
+       __u16    mcb_type;      /* type of log: CONFIG_T_[CONFIG|RECOVER] */
+       __u8     mcb_reserved;
+       __u8     mcb_bits;      /* bits unit size of config log */
+       __u32    mcb_units;     /* # of units for bulk transfer */
+};
+extern void lustre_swab_mgs_config_body(struct mgs_config_body *body);
+
+struct mgs_config_res {
+       __u64    mcr_offset;    /* index of last config log */
+       __u64    mcr_size;      /* size of the log */
+};
+extern void lustre_swab_mgs_config_res(struct mgs_config_res *body);
+
+/* Config marker flags (in config log) */
+#define CM_START       0x01
+#define CM_END  0x02
+#define CM_SKIP        0x04
+#define CM_UPGRADE146  0x08
+#define CM_EXCLUDE     0x10
+#define CM_START_SKIP (CM_START | CM_SKIP)
+
+struct cfg_marker {
+       __u32        cm_step;       /* aka config version */
+       __u32        cm_flags;
+       __u32        cm_vers;       /* lustre release version number */
+       __u32        cm_padding;    /* 64 bit align */
+       obd_time          cm_createtime; /*when this record was first created */
+       obd_time          cm_canceltime; /*when this record is no longer valid*/
+       char          cm_tgtname[MTI_NAME_MAXLEN];
+       char          cm_comment[MTI_NAME_MAXLEN];
+};
+
+extern void lustre_swab_cfg_marker(struct cfg_marker *marker,
+                                  int swab, int size);
+
+/*
+ * Opcodes for multiple servers.
+ */
+
+typedef enum {
+       OBD_PING = 400,
+       OBD_LOG_CANCEL,
+       OBD_QC_CALLBACK,
+       OBD_IDX_READ,
+       OBD_LAST_OPC
+} obd_cmd_t;
+#define OBD_FIRST_OPC OBD_PING
+
+/* catalog of log objects */
+
+/** Identifier for a single log object */
+struct llog_logid {
+       struct ost_id           lgl_oi;
+       __u32              lgl_ogen;
+} __attribute__((packed));
+
+/** Records written to the CATALOGS list */
+#define CATLIST "CATALOGS"
+struct llog_catid {
+       struct llog_logid       lci_logid;
+       __u32              lci_padding1;
+       __u32              lci_padding2;
+       __u32              lci_padding3;
+} __attribute__((packed));
+
+/* Log data record types - there is no specific reason that these need to
+ * be related to the RPC opcodes, but no reason not to (may be handy later?)
+ */
+#define LLOG_OP_MAGIC 0x10600000
+#define LLOG_OP_MASK  0xfff00000
+
+typedef enum {
+       LLOG_PAD_MAGIC          = LLOG_OP_MAGIC | 0x00000,
+       OST_SZ_REC              = LLOG_OP_MAGIC | 0x00f00,
+       /* OST_RAID1_REC        = LLOG_OP_MAGIC | 0x01000, never used */
+       MDS_UNLINK_REC          = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) |
+                                 REINT_UNLINK, /* obsolete after 2.5.0 */
+       MDS_UNLINK64_REC        = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+                                 REINT_UNLINK,
+       /* MDS_SETATTR_REC      = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */
+       MDS_SETATTR64_REC       = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+                                 REINT_SETATTR,
+       OBD_CFG_REC             = LLOG_OP_MAGIC | 0x20000,
+       /* PTL_CFG_REC          = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */
+       LLOG_GEN_REC            = LLOG_OP_MAGIC | 0x40000,
+       /* LLOG_JOIN_REC        = LLOG_OP_MAGIC | 0x50000, obsolete  1.8.0 */
+       CHANGELOG_REC           = LLOG_OP_MAGIC | 0x60000,
+       CHANGELOG_USER_REC      = LLOG_OP_MAGIC | 0x70000,
+       LLOG_HDR_MAGIC          = LLOG_OP_MAGIC | 0x45539,
+       LLOG_LOGID_MAGIC        = LLOG_OP_MAGIC | 0x4553b,
+} llog_op_type;
+
+#define LLOG_REC_HDR_NEEDS_SWABBING(r) \
+       (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC))
+
+/** Log record header - stored in little endian order.
+ * Each record must start with this struct, end with a llog_rec_tail,
+ * and be a multiple of 256 bits in size.
+ */
+struct llog_rec_hdr {
+       __u32   lrh_len;
+       __u32   lrh_index;
+       __u32   lrh_type;
+       __u32   lrh_id;
+};
+
+struct llog_rec_tail {
+       __u32   lrt_len;
+       __u32   lrt_index;
+};
+
+/* Data follows just after the header */
+#define REC_DATA(ptr)                                          \
+       ((void *)((char *)ptr + sizeof(struct llog_rec_hdr)))
+
+#define REC_DATA_LEN(rec)                                      \
+       (rec->lrh_len - sizeof(struct llog_rec_hdr) -           \
+        sizeof(struct llog_rec_tail))
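
To make the record layout concrete (a hedged sketch; the function name is hypothetical): the payload starts right after the header, REC_DATA_LEN() gives its length, and lrh_len spans the whole record up to and including the tail.

    static inline void demo_llog_rec_layout(struct llog_rec_hdr *rec)
    {
            void *payload = REC_DATA(rec);     /* data starts just past the header */
            int   datalen = REC_DATA_LEN(rec); /* lrh_len minus header and tail */
            struct llog_rec_tail *tail =
                    (void *)((char *)rec + rec->lrh_len - sizeof(*tail));

            /* the tail mirrors the header's length/index for integrity checks */
            LASSERT(tail->lrt_len == rec->lrh_len &&
                    tail->lrt_index == rec->lrh_index);
            (void)payload; (void)datalen;
    }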
+
+struct llog_logid_rec {
+       struct llog_rec_hdr     lid_hdr;
+       struct llog_logid       lid_id;
+       __u32                   lid_padding1;
+       __u64                   lid_padding2;
+       __u64                   lid_padding3;
+       struct llog_rec_tail    lid_tail;
+} __attribute__((packed));
+
+struct llog_unlink_rec {
+       struct llog_rec_hdr     lur_hdr;
+       obd_id                  lur_oid;
+       obd_count               lur_oseq;
+       obd_count               lur_count;
+       struct llog_rec_tail    lur_tail;
+} __attribute__((packed));
+
+struct llog_unlink64_rec {
+       struct llog_rec_hdr     lur_hdr;
+       struct lu_fid           lur_fid;
+       obd_count               lur_count; /* to destroy the lost precreated objects */
+       __u32                   lur_padding1;
+       __u64                   lur_padding2;
+       __u64                   lur_padding3;
+       struct llog_rec_tail    lur_tail;
+} __attribute__((packed));
+
+struct llog_setattr64_rec {
+       struct llog_rec_hdr     lsr_hdr;
+       struct ost_id           lsr_oi;
+       __u32                   lsr_uid;
+       __u32                   lsr_uid_h;
+       __u32                   lsr_gid;
+       __u32                   lsr_gid_h;
+       __u64                   lsr_padding;
+       struct llog_rec_tail    lsr_tail;
+} __attribute__((packed));
+
+struct llog_size_change_rec {
+       struct llog_rec_hdr     lsc_hdr;
+       struct ll_fid           lsc_fid;
+       __u32                   lsc_ioepoch;
+       __u32                   lsc_padding1;
+       __u64                   lsc_padding2;
+       __u64                   lsc_padding3;
+       struct llog_rec_tail    lsc_tail;
+} __attribute__((packed));
+
+#define CHANGELOG_MAGIC 0xca103000
+
+/** \a changelog_rec_type's that can't be masked */
+#define CHANGELOG_MINMASK (1 << CL_MARK)
+/** bits covering all \a changelog_rec_type's */
+#define CHANGELOG_ALLMASK 0XFFFFFFFF
+/** default \a changelog_rec_type mask */
+#define CHANGELOG_DEFMASK CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE)
+
+/* changelog llog name, needed by client replicators */
+#define CHANGELOG_CATALOG "changelog_catalog"
+
+struct changelog_setinfo {
+       __u64 cs_recno;
+       __u32 cs_id;
+} __attribute__((packed));
+
+/** changelog record */
+struct llog_changelog_rec {
+       struct llog_rec_hdr  cr_hdr;
+       struct changelog_rec cr;
+       struct llog_rec_tail cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+struct llog_changelog_ext_rec {
+       struct llog_rec_hdr      cr_hdr;
+       struct changelog_ext_rec cr;
+       struct llog_rec_tail     cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+#define CHANGELOG_USER_PREFIX "cl"
+
+struct llog_changelog_user_rec {
+       struct llog_rec_hdr   cur_hdr;
+       __u32            cur_id;
+       __u32            cur_padding;
+       __u64            cur_endrec;
+       struct llog_rec_tail  cur_tail;
+} __attribute__((packed));
+
+/* Old llog gen for compatibility */
+struct llog_gen {
+       __u64 mnt_cnt;
+       __u64 conn_cnt;
+} __attribute__((packed));
+
+struct llog_gen_rec {
+       struct llog_rec_hdr     lgr_hdr;
+       struct llog_gen         lgr_gen;
+       __u64                   padding1;
+       __u64                   padding2;
+       __u64                   padding3;
+       struct llog_rec_tail    lgr_tail;
+};
+
+/* On-disk header structure of each log object, stored in little endian order */
+#define LLOG_CHUNK_SIZE         8192
+#define LLOG_HEADER_SIZE       (96)
+#define LLOG_BITMAP_BYTES       (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE)
+
+#define LLOG_MIN_REC_SIZE       (24) /* round(llog_rec_hdr + llog_rec_tail) */
+
+/* flags for the logs */
+enum llog_flag {
+       LLOG_F_ZAP_WHEN_EMPTY   = 0x1,
+       LLOG_F_IS_CAT           = 0x2,
+       LLOG_F_IS_PLAIN         = 0x4,
+};
+
+struct llog_log_hdr {
+       struct llog_rec_hdr     llh_hdr;
+       obd_time                llh_timestamp;
+       __u32              llh_count;
+       __u32              llh_bitmap_offset;
+       __u32              llh_size;
+       __u32              llh_flags;
+       __u32              llh_cat_idx;
+       /* for a catalog the first plain slot is next to it */
+       struct obd_uuid  llh_tgtuuid;
+       __u32              llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23];
+       __u32              llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)];
+       struct llog_rec_tail    llh_tail;
+} __attribute__((packed));
+
+#define LLOG_BITMAP_SIZE(llh)  (__u32)((llh->llh_hdr.lrh_len -         \
+                                       llh->llh_bitmap_offset -        \
+                                       sizeof(llh->llh_tail)) * 8)
+
+/** log cookies are used to reference a specific log file and a record therein */
+struct llog_cookie {
+       struct llog_logid       lgc_lgl;
+       __u32              lgc_subsys;
+       __u32              lgc_index;
+       __u32              lgc_padding;
+} __attribute__((packed));
+
+/** llog protocol */
+enum llogd_rpc_ops {
+       LLOG_ORIGIN_HANDLE_CREATE       = 501,
+       LLOG_ORIGIN_HANDLE_NEXT_BLOCK   = 502,
+       LLOG_ORIGIN_HANDLE_READ_HEADER  = 503,
+       LLOG_ORIGIN_HANDLE_WRITE_REC    = 504,
+       LLOG_ORIGIN_HANDLE_CLOSE        = 505,
+       LLOG_ORIGIN_CONNECT          = 506,
+       LLOG_CATINFO                    = 507,  /* deprecated */
+       LLOG_ORIGIN_HANDLE_PREV_BLOCK   = 508,
+       LLOG_ORIGIN_HANDLE_DESTROY      = 509,  /* for destroy llog object*/
+       LLOG_LAST_OPC,
+       LLOG_FIRST_OPC            = LLOG_ORIGIN_HANDLE_CREATE
+};
+
+struct llogd_body {
+       struct llog_logid  lgd_logid;
+       __u32 lgd_ctxt_idx;
+       __u32 lgd_llh_flags;
+       __u32 lgd_index;
+       __u32 lgd_saved_index;
+       __u32 lgd_len;
+       __u64 lgd_cur_offset;
+} __attribute__((packed));
+
+struct llogd_conn_body {
+       struct llog_gen  lgdc_gen;
+       struct llog_logid       lgdc_logid;
+       __u32              lgdc_ctxt_idx;
+} __attribute__((packed));
+
+/* Note: 64-bit types are 64-bit aligned in structure */
+struct obdo {
+       obd_valid              o_valid; /* hot fields in this obdo */
+       struct ost_id      o_oi;
+       obd_id            o_parent_seq;
+       obd_size                o_size;  /* o_size-o_blocks == ost_lvb */
+       obd_time                o_mtime;
+       obd_time                o_atime;
+       obd_time                o_ctime;
+       obd_blocks            o_blocks;       /* brw: cli sent cached bytes */
+       obd_size                o_grant;
+
+       /* 32-bit fields start here: keep an even number of them via padding */
+       obd_blksize          o_blksize;      /* optimal IO blocksize */
+       obd_mode                o_mode;  /* brw: cli sent cache remain */
+       obd_uid          o_uid;
+       obd_gid          o_gid;
+       obd_flag                o_flags;
+       obd_count              o_nlink; /* brw: checksum */
+       obd_count              o_parent_oid;
+       obd_count               o_misc;         /* brw: o_dropped */
+
+       __u64              o_ioepoch;      /* epoch in ost writes */
+       __u32              o_stripe_idx;   /* holds stripe idx */
+       __u32              o_parent_ver;
+       struct lustre_handle    o_handle;       /* brw: lock handle to prolong
+                                                * locks */
+       struct llog_cookie      o_lcookie;      /* destroy: unlink cookie from
+                                                * MDS */
+       __u32                   o_uid_h;
+       __u32                   o_gid_h;
+
+       __u64                   o_data_version; /* getattr: sum of iversion for
+                                                * each stripe.
+                                                * brw: grant space consumed on
+                                                * the client for the write */
+       __u64                   o_padding_4;
+       __u64                   o_padding_5;
+       __u64                   o_padding_6;
+};
+
+#define o_dirty   o_blocks
+#define o_undirty o_mode
+#define o_dropped o_misc
+#define o_cksum   o_nlink
+#define o_grant_used o_data_version
+
+static inline void lustre_set_wire_obdo(struct obdo *wobdo, struct obdo *lobdo)
+{
+       memcpy(wobdo, lobdo, sizeof(*lobdo));
+       wobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+}
+
+static inline void lustre_get_wire_obdo(struct obdo *lobdo, struct obdo *wobdo)
+{
+       obd_flag local_flags = 0;
+
+       if (lobdo->o_valid & OBD_MD_FLFLAGS)
+                local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK;
+
+       LASSERT(!(wobdo->o_flags & OBD_FL_LOCAL_MASK));
+
+       memcpy(lobdo, wobdo, sizeof(*lobdo));
+       if (local_flags != 0) {
+                lobdo->o_valid |= OBD_MD_FLFLAGS;
+                lobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+                lobdo->o_flags |= local_flags;
+       }
+}
+
+extern void lustre_swab_obdo (struct obdo *o);
+
+/* request structure for OST's */
+struct ost_body {
+       struct  obdo oa;
+};
+
+/* Key for FIEMAP to be used in get_info calls */
+struct ll_fiemap_info_key {
+       char    name[8];
+       struct  obdo oa;
+       struct  ll_user_fiemap fiemap;
+};
+
+extern void lustre_swab_ost_body (struct ost_body *b);
+extern void lustre_swab_ost_last_id(obd_id *id);
+extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap);
+
+extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum);
+extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+                                           int stripe_count);
+extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm);
+
+/* llog_swab.c */
+extern void lustre_swab_llogd_body (struct llogd_body *d);
+extern void lustre_swab_llog_hdr (struct llog_log_hdr *h);
+extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d);
+extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec);
+
+struct lustre_cfg;
+extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
+
+/* Functions for dumping PTLRPC fields */
+void dump_rniobuf(struct niobuf_remote *rnb);
+void dump_ioo(struct obd_ioobj *nb);
+void dump_obdo(struct obdo *oa);
+void dump_ost_body(struct ost_body *ob);
+void dump_rcs(__u32 *rc);
+
+#define IDX_INFO_MAGIC 0x3D37CC37
+
+/* Index file transfer through the network. The server serializes the index into
+ * a byte stream which is sent to the client via a bulk transfer */
+struct idx_info {
+       __u32           ii_magic;
+
+       /* reply: see idx_info_flags below */
+       __u32           ii_flags;
+
+       /* request & reply: number of lu_idxpage (to be) transferred */
+       __u16           ii_count;
+       __u16           ii_pad0;
+
+       /* request: requested attributes passed down to the iterator API */
+       __u32           ii_attrs;
+
+       /* request & reply: index file identifier (FID) */
+       struct lu_fid   ii_fid;
+
+       /* reply: version of the index file before starting to walk the index.
+        * Please note that the version can be modified at any time during the
+        * transfer */
+       __u64           ii_version;
+
+       /* request: hash to start with:
+        * reply: hash of the first entry of the first lu_idxpage and hash
+        *      of the entry to read next if any */
+       __u64           ii_hash_start;
+       __u64           ii_hash_end;
+
+       /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is
+        * set */
+       __u16           ii_keysize;
+
+       /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC
+        * is set */
+       __u16           ii_recsize;
+
+       __u32           ii_pad1;
+       __u64           ii_pad2;
+       __u64           ii_pad3;
+};
+extern void lustre_swab_idx_info(struct idx_info *ii);
+
+#define II_END_OFF     MDS_DIR_END_OFF /* all entries have been read */
+
+/* List of flags used in idx_info::ii_flags */
+enum idx_info_flags {
+       II_FL_NOHASH    = 1 << 0, /* client doesn't care about hash value */
+       II_FL_VARKEY    = 1 << 1, /* keys can be of variable size */
+       II_FL_VARREC    = 1 << 2, /* records can be of variable size */
+       II_FL_NONUNQ    = 1 << 3, /* index supports non-unique keys */
+};
+
+#define LIP_MAGIC 0x8A6D6B6C
+
+/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */
+struct lu_idxpage {
+       /* 16-byte header */
+       __u32   lip_magic;
+       __u16   lip_flags;
+       __u16   lip_nr;   /* number of entries in the container */
+       __u64   lip_pad0; /* additional padding for future use */
+
+       /* key/record pairs are stored in the remaining 4080 bytes.
+        * depending upon the flags in idx_info::ii_flags, each key/record
+        * pair might be preceded by:
+        * - a hash value
+        * - the key size (II_FL_VARKEY is set)
+        * - the record size (II_FL_VARREC is set)
+        *
+        * For the time being, we only support fixed-size key & record. */
+       char    lip_entries[0];
+};
+extern void lustre_swab_lip_header(struct lu_idxpage *lip);
+
+#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries))
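
Assuming the fixed-size case described above and no per-entry hash (both assumptions made here only for illustration), locating the i-th key/record pair could look like this:

    static inline char *demo_lip_entry(struct lu_idxpage *lip,
                                       const struct idx_info *ii, int i)
    {
            /* assumes fixed-size keys/records and no per-entry hash stored */
            size_t pair = (size_t)ii->ii_keysize + ii->ii_recsize;

            LASSERT(lip->lip_magic == LIP_MAGIC && i < lip->lip_nr);
            return lip->lip_entries + i * pair;
    }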
+
+/* Gather all possible type associated with a 4KB container */
+union lu_page {
+       struct lu_dirpage       lp_dir; /* for MDS_READPAGE */
+       struct lu_idxpage       lp_idx; /* for OBD_IDX_READ */
+       char                    lp_array[LU_PAGE_SIZE];
+};
+
+/* security opcodes */
+typedef enum {
+       SEC_CTX_INIT        = 801,
+       SEC_CTX_INIT_CONT       = 802,
+       SEC_CTX_FINI        = 803,
+       SEC_LAST_OPC,
+       SEC_FIRST_OPC      = SEC_CTX_INIT
+} sec_cmd_t;
+
+/*
+ * capa related definitions
+ */
+#define CAPA_HMAC_MAX_LEN       64
+#define CAPA_HMAC_KEY_MAX_LEN   56
+
+/* NB take care when changing the sequence of elements in this struct,
+ * because the offset info is used in find_capa() */
+struct lustre_capa {
+       struct lu_fid   lc_fid;  /** fid */
+       __u64      lc_opc;       /** operations allowed */
+       __u64      lc_uid;       /** file owner */
+       __u64      lc_gid;       /** file group */
+       __u32      lc_flags;       /** HMAC algorithm & flags */
+       __u32      lc_keyid;       /** key# used for the capability */
+       __u32      lc_timeout;     /** capa timeout value (sec) */
+       __u32      lc_expiry;      /** expiry time (sec) */
+       __u8        lc_hmac[CAPA_HMAC_MAX_LEN];   /** HMAC */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa(struct lustre_capa *c);
+
+/** lustre_capa::lc_opc */
+enum {
+       CAPA_OPC_BODY_WRITE   = 1<<0,  /**< write object data */
+       CAPA_OPC_BODY_READ    = 1<<1,  /**< read object data */
+       CAPA_OPC_INDEX_LOOKUP = 1<<2,  /**< lookup object fid */
+       CAPA_OPC_INDEX_INSERT = 1<<3,  /**< insert object fid */
+       CAPA_OPC_INDEX_DELETE = 1<<4,  /**< delete object fid */
+       CAPA_OPC_OSS_WRITE    = 1<<5,  /**< write oss object data */
+       CAPA_OPC_OSS_READ     = 1<<6,  /**< read oss object data */
+       CAPA_OPC_OSS_TRUNC    = 1<<7,  /**< truncate oss object */
+       CAPA_OPC_OSS_DESTROY  = 1<<8,  /**< destroy oss object */
+       CAPA_OPC_META_WRITE   = 1<<9,  /**< write object meta data */
+       CAPA_OPC_META_READ    = 1<<10, /**< read object meta data */
+};
+
+#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE)
+#define CAPA_OPC_MDS_ONLY                                                 \
+       (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \
+        CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE)
+#define CAPA_OPC_OSS_ONLY                                                 \
+       (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC |      \
+        CAPA_OPC_OSS_DESTROY)
+#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY
+#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY)
+
+/* MDS capability covers object capability for operations of body r/w
+ * (dir readpage/sendpage), index lookup/insert/delete and meta data r/w,
+ * while OSS capability only covers object capability for operations of
+ * oss data(file content) r/w/truncate.
+ */
+static inline int capa_for_mds(struct lustre_capa *c)
+{
+       return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0;
+}
+
+static inline int capa_for_oss(struct lustre_capa *c)
+{
+       return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0;
+}
+
+/* lustre_capa::lc_hmac_alg */
+enum {
+       CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */
+       CAPA_HMAC_ALG_MAX,
+};
+
+#define CAPA_FL_MASK       0x00ffffff
+#define CAPA_HMAC_ALG_MASK      0xff000000
+
+struct lustre_capa_key {
+       __u64   lk_seq;       /**< mds# */
+       __u32   lk_keyid;     /**< key# */
+       __u32   lk_padding;
+       __u8    lk_key[CAPA_HMAC_KEY_MAX_LEN];    /**< key */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k);
+
+/** The link ea holds 1 \a link_ea_entry for each hardlink */
+#define LINK_EA_MAGIC 0x11EAF1DFUL
+struct link_ea_header {
+       __u32 leh_magic;
+       __u32 leh_reccount;
+       __u64 leh_len;      /* total size */
+       /* future use */
+       __u32 padding1;
+       __u32 padding2;
+};
+
+/** Hardlink data is name and parent fid.
+ * Stored in this crazy struct for maximum packing and endian-neutrality
+ */
+struct link_ea_entry {
+       /** __u16 stored big-endian, unaligned */
+       unsigned char      lee_reclen[2];
+       unsigned char      lee_parent_fid[sizeof(struct lu_fid)];
+       char           lee_name[0];
+}__attribute__((packed));
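
Because lee_reclen is big-endian and unaligned, it has to be reassembled byte by byte; a minimal sketch (hypothetical helper name):

    static inline int demo_lee_reclen(const struct link_ea_entry *lee)
    {
            /* __u16 stored big-endian in two unaligned bytes */
            return (lee->lee_reclen[0] << 8) | lee->lee_reclen[1];
    }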
+
+/** fid2path request/reply structure */
+struct getinfo_fid2path {
+       struct lu_fid   gf_fid;
+       __u64      gf_recno;
+       __u32      gf_linkno;
+       __u32      gf_pathlen;
+       char        gf_path[0];
+} __attribute__((packed));
+
+void lustre_swab_fid2path (struct getinfo_fid2path *gf);
+
+enum {
+       LAYOUT_INTENT_ACCESS    = 0,
+       LAYOUT_INTENT_READ      = 1,
+       LAYOUT_INTENT_WRITE     = 2,
+       LAYOUT_INTENT_GLIMPSE   = 3,
+       LAYOUT_INTENT_TRUNC     = 4,
+       LAYOUT_INTENT_RELEASE   = 5,
+       LAYOUT_INTENT_RESTORE   = 6
+};
+
+/* enqueue layout lock with intent */
+struct layout_intent {
+       __u32 li_opc; /* intent operation for enqueue, read, write etc */
+       __u32 li_flags;
+       __u64 li_start;
+       __u64 li_end;
+};
+
+void lustre_swab_layout_intent(struct layout_intent *li);
+
+/**
+ * On the wire version of hsm_progress structure.
+ *
+ * Contains the userspace hsm_progress and some internal fields.
+ */
+struct hsm_progress_kernel {
+       /* Field taken from struct hsm_progress */
+       lustre_fid              hpk_fid;
+       __u64                   hpk_cookie;
+       struct hsm_extent       hpk_extent;
+       __u16                   hpk_flags;
+       __u16                   hpk_errval; /* positive val */
+       __u32                   hpk_padding1;
+       /* Additional fields */
+       __u64                   hpk_data_version;
+       __u64                   hpk_padding2;
+} __attribute__((packed));
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_current_action(struct hsm_current_action *action);
+extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk);
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui);
+extern void lustre_swab_hsm_request(struct hsm_request *hr);
+
+/**
+ * These are the object update opcodes under UPDATE_OBJ, currently
+ * used by cross-ref operations between MDTs.
+ *
+ * During a cross-ref operation, the master MDT, which the client sends the
+ * request to, disassembles the operation into object updates; OSP then
+ * sends these updates to the remote MDT to be executed.
+ *
+ *   Update request format
+ *   magic:  UPDATE_BUFFER_MAGIC_V1
+ *   count:  how many updates are in the request.
+ *   bufs[0]: the object update packets follow.
+ *   update[0]:
+ *             type: object_update_op, the op code of update
+ *             fid: The object fid of the update.
+ *             lens/bufs: other parameters of the update.
+ *   update[1]:
+ *             type: object_update_op, the op code of update
+ *             fid: The object fid of the update.
+ *             lens/bufs: other parameters of the update.
+ *   ..........
+ *   update[7]:        type: object_update_op, the op code of update
+ *             fid: The object fid of the update.
+ *             lens/bufs: other parameters of the update.
+ *   Currently at most 8 updates per object update request.
+ *
+ *******************************************************************
+ *   update reply format:
+ *
+ *   ur_version: UPDATE_REPLY_V1
+ *   ur_count:   The count of the reply, which is usually equal
+ *              to the number of updates in the request.
+ *   ur_lens:    The reply lengths of each object update.
+ *
+ *   replies:    1st update reply  [4bytes_ret: other body]
+ *              2nd update reply  [4bytes_ret: other body]
+ *              .....
+ *              nth update reply  [4bytes_ret: other body]
+ *
+ *   For each reply of the update, the format would be
+ *      result(4 bytes):Other stuff
+ */
+
+#define UPDATE_MAX_OPS         10
+#define UPDATE_BUFFER_MAGIC_V1 0xBDDE0001
+#define UPDATE_BUFFER_MAGIC    UPDATE_BUFFER_MAGIC_V1
+#define UPDATE_BUF_COUNT       8
+enum object_update_op {
+       OBJ_CREATE              = 1,
+       OBJ_DESTROY             = 2,
+       OBJ_REF_ADD             = 3,
+       OBJ_REF_DEL             = 4,
+       OBJ_ATTR_SET            = 5,
+       OBJ_ATTR_GET            = 6,
+       OBJ_XATTR_SET           = 7,
+       OBJ_XATTR_GET           = 8,
+       OBJ_INDEX_LOOKUP        = 9,
+       OBJ_INDEX_INSERT        = 10,
+       OBJ_INDEX_DELETE        = 11,
+       OBJ_LAST
+};
+
+struct update {
+       __u32           u_type;
+       __u32           u_batchid;
+       struct lu_fid   u_fid;
+       __u32           u_lens[UPDATE_BUF_COUNT];
+       __u32           u_bufs[0];
+};
+
+struct update_buf {
+       __u32   ub_magic;
+       __u32   ub_count;
+       __u32   ub_bufs[0];
+};
+
+#define UPDATE_REPLY_V1                0x00BD0001
+struct update_reply {
+       __u32   ur_version;
+       __u32   ur_count;
+       __u32   ur_lens[0];
+};
+
+void lustre_swab_update_buf(struct update_buf *ub);
+void lustre_swab_update_reply_buf(struct update_reply *ur);
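
A hedged sketch linking these structures to the request format described above: a receiver would first check ub_magic and then treat ub_count as the number of packed updates that follow (the UPDATE_MAX_OPS bound here is illustrative).

    static inline int demo_update_buf_sane(const struct update_buf *ub)
    {
            /* a V1 buffer with a plausible update count (bound is illustrative) */
            return ub->ub_magic == UPDATE_BUFFER_MAGIC &&
                   ub->ub_count > 0 && ub->ub_count <= UPDATE_MAX_OPS;
    }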
+
+/** layout swap request structure
+ * fid1 and fid2 are in mdt_body
+ */
+struct mdc_swap_layouts {
+       __u64      msl_flags;
+} __packed;
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
+
+#endif
+/** @} lustreidl */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
new file mode 100644 (file)
index 0000000..1c87a61
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre/lustre_lfsck_user.h
+ *
+ * Lustre LFSCK userspace interfaces.
+ *
+ * Author: Fan Yong <yong.fan@whamcloud.com>
+ */
+
+#ifndef _LUSTRE_LFSCK_USER_H
+# define _LUSTRE_LFSCK_USER_H
+
+enum lfsck_param_flags {
+       /* Reset LFSCK iterator position to the device beginning. */
+       LPF_RESET       = 0x0001,
+
+       /* Exit when fail. */
+       LPF_FAILOUT     = 0x0002,
+
+       /* Dryrun mode, only check without modification */
+       LPF_DRYRUN      = 0x0004,
+};
+
+enum lfsck_type {
+       /* For MDT-OST consistency check/repair. */
+       LT_LAYOUT       = 0x0001,
+
+       /* For MDT-MDT consistency check/repair. */
+       LT_DNE          = 0x0002,
+
+       /* For FID-in-dirent and linkEA consistency check/repair. */
+       LT_NAMESPACE    = 0x0004,
+};
+
+#define LFSCK_VERSION_V1       1
+#define LFSCK_VERSION_V2       2
+
+#define LFSCK_TYPES_ALL                ((__u16)(~0))
+#define LFSCK_TYPES_DEF                ((__u16)0)
+#define LFSCK_TYPES_SUPPORTED  LT_NAMESPACE
+
+#define LFSCK_SPEED_NO_LIMIT   0
+#define LFSCK_SPEED_LIMIT_DEF  LFSCK_SPEED_NO_LIMIT
+
+enum lfsck_start_valid {
+       LSV_SPEED_LIMIT         = 0x00000001,
+       LSV_ERROR_HANDLE        = 0x00000002,
+       LSV_DRYRUN              = 0x00000004,
+};
+
+/* Arguments for starting lfsck. */
+struct lfsck_start {
+       /* Which arguments are valid, see 'enum lfsck_start_valid'. */
+       __u32   ls_valid;
+
+       /* How many items can be scanned at most per second. */
+       __u32   ls_speed_limit;
+
+       /* For compatibility between user space tools and kernel service. */
+       __u16   ls_version;
+
+       /* Which LFSCK components to be (have been) started. */
+       __u16   ls_active;
+
+       /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */
+       __u16   ls_flags;
+
+       /* For 64-bits aligned. */
+       __u16   ls_padding;
+};
+
+#endif /* _LUSTRE_LFSCK_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
new file mode 100644 (file)
index 0000000..eaa94f5
--- /dev/null
@@ -0,0 +1,1146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LUSTRE_USER_H
+#define _LUSTRE_USER_H
+
+/** \defgroup lustreuser lustreuser
+ *
+ * @{
+ */
+
+#include <lustre/ll_fiemap.h>
+#include <linux/lustre_user.h>
+
+/* for statfs() */
+#define LL_SUPER_MAGIC 0x0BD00BD0
+
+#ifndef FSFILT_IOC_GETFLAGS
+#define FSFILT_IOC_GETFLAGS           _IOR('f', 1, long)
+#define FSFILT_IOC_SETFLAGS           _IOW('f', 2, long)
+#define FSFILT_IOC_GETVERSION       _IOR('f', 3, long)
+#define FSFILT_IOC_SETVERSION       _IOW('f', 4, long)
+#define FSFILT_IOC_GETVERSION_OLD       _IOR('v', 1, long)
+#define FSFILT_IOC_SETVERSION_OLD       _IOW('v', 2, long)
+#define FSFILT_IOC_FIEMAP               _IOWR('f', 11, struct ll_user_fiemap)
+#endif
+
+/* FIEMAP flags supported by Lustre */
+#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER)
+
+enum obd_statfs_state {
+       OS_STATE_DEGRADED       = 0x00000001, /**< RAID degraded/rebuilding */
+       OS_STATE_READONLY       = 0x00000002, /**< filesystem is read-only */
+       OS_STATE_RDONLY_1       = 0x00000004, /**< obsolete 1.6, was EROFS=30 */
+       OS_STATE_RDONLY_2       = 0x00000008, /**< obsolete 1.6, was EROFS=30 */
+       OS_STATE_RDONLY_3       = 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+};
+
+struct obd_statfs {
+       __u64      os_type;
+       __u64      os_blocks;
+       __u64      os_bfree;
+       __u64      os_bavail;
+       __u64      os_files;
+       __u64      os_ffree;
+       __u8        os_fsid[40];
+       __u32      os_bsize;
+       __u32      os_namelen;
+       __u64      os_maxbytes;
+       __u32      os_state;       /**< obd_statfs_state OS_STATE_* flag */
+       __u32      os_fprecreated;      /* objs available now to the caller */
+                                       /* used in QoS code to find preferred
+                                        * OSTs */
+       __u32      os_spare2;
+       __u32      os_spare3;
+       __u32      os_spare4;
+       __u32      os_spare5;
+       __u32      os_spare6;
+       __u32      os_spare7;
+       __u32      os_spare8;
+       __u32      os_spare9;
+};
+
+/**
+ * File IDentifier.
+ *
+ * FID is a cluster-wide unique identifier of a file or an object (stripe).
+ * FIDs are never reused.
+ **/
+struct lu_fid {
+       /**
+       * FID sequence. Sequence is a unit of migration: all files (objects)
+       * with FIDs from a given sequence are stored on the same server.
+       * Lustre should support 2^64 objects, so even if each sequence
+       * has only a single object we can still enumerate 2^64 objects.
+       **/
+       __u64 f_seq;
+       /* FID number within sequence. */
+       __u32 f_oid;
+       /**
+        * FID version, used to distinguish different versions (in the sense
+        * of snapshots, etc.) of the same file system object. Not currently
+        * used.
+        **/
+       __u32 f_ver;
+};
+
+struct filter_fid {
+       struct lu_fid   ff_parent;  /* ff_parent.f_ver == file stripe number */
+};
+
+/* keep this one for compatibility */
+struct filter_fid_old {
+       struct lu_fid   ff_parent;
+       __u64           ff_objid;
+       __u64           ff_seq;
+};
+
+/* Userspace should treat lu_fid as opaque, and only use the following methods
+ * to print or parse them.  Other functions (e.g. compare, swab) could be moved
+ * here from lustre_idl.h if needed. */
+typedef struct lu_fid lustre_fid;
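
For illustration only (userspace code, hypothetical helper): the conventional textual form of a FID, as printed by e.g. lfs path2fid, is "[0xSEQ:0xOID:0xVER]".

    #include <stdio.h>

    static void demo_print_fid(const struct lu_fid *fid)
    {
            /* the "[0xSEQ:0xOID:0xVER]" form printed by the Lustre tools */
            printf("[0x%llx:0x%x:0x%x]\n",
                   (unsigned long long)fid->f_seq, fid->f_oid, fid->f_ver);
    }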
+
+/**
+ * The following struct holds object attributes that are kept in the inode's EA.
+ * Introduced in the 2.0 release (please see b15993 for details).
+ * Added to all objects since Lustre 2.4, as it contains the self FID.
+ */
+struct lustre_mdt_attrs {
+       /**
+        * Bitfield for supported data in this structure. From enum lma_compat.
+        * lma_self_fid and lma_flags are always available.
+        */
+       __u32   lma_compat;
+       /**
+        * Per-file incompat feature list. Lustre version should support all
+        * flags set in this field. The supported feature mask is available in
+        * LMA_INCOMPAT_SUPP.
+        */
+       __u32   lma_incompat;
+       /** FID of this inode */
+       struct lu_fid  lma_self_fid;
+};
+
+/**
+ * Prior to 2.4, the LMA structure also included SOM attributes, which have
+ * since been moved to a dedicated xattr.
+ * lma_flags was also removed in favour of the lma_compat/lma_incompat fields.
+ */
+#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
+
+/**
+ * OST object IDentifier.
+ */
+struct ost_id {
+       union {
+               struct ostid {
+                       __u64   oi_id;
+                       __u64   oi_seq;
+               } oi;
+               struct lu_fid oi_fid;
+       };
+};
+
+#define DOSTID LPX64":"LPU64
+#define POSTID(oi) ostid_seq(oi), ostid_id(oi)
+
+/*
+ * The ioctl naming rules:
+ * LL_*     - works on the currently opened filehandle instead of parent dir
+ * *_OBD_*  - gets data for either OSC or MDC (LOV, LMV indirectly)
+ * *_MDC_*  - gets/sets data related to MDC
+ * *_LOV_*  - gets/sets data related to OSC/LOV
+ * *FILE*   - called on parent dir and passes in a filename
+ * *STRIPE* - set/get lov_user_md
+ * *INFO    - set/get lov_user_mds_data
+ */
+/* see <lustre_lib.h> for ioctl numbers 101-150 */
+#define LL_IOC_GETFLAGS                 _IOR ('f', 151, long)
+#define LL_IOC_SETFLAGS                 _IOW ('f', 152, long)
+#define LL_IOC_CLRFLAGS                 _IOW ('f', 153, long)
+/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */
+#define LL_IOC_LOV_SETSTRIPE       _IOW ('f', 154, long)
+/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */
+#define LL_IOC_LOV_GETSTRIPE       _IOW ('f', 155, long)
+/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */
+#define LL_IOC_LOV_SETEA               _IOW ('f', 156, long)
+#define LL_IOC_RECREATE_OBJ         _IOW ('f', 157, long)
+#define LL_IOC_RECREATE_FID         _IOW ('f', 157, struct lu_fid)
+#define LL_IOC_GROUP_LOCK             _IOW ('f', 158, long)
+#define LL_IOC_GROUP_UNLOCK         _IOW ('f', 159, long)
+/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */
+#define LL_IOC_QUOTACHECK             _IOW ('f', 160, int)
+/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */
+#define LL_IOC_POLL_QUOTACHECK   _IOR ('f', 161, struct if_quotacheck *)
+/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */
+#define LL_IOC_QUOTACTL                 _IOWR('f', 162, struct if_quotactl)
+#define IOC_OBD_STATFS           _IOWR('f', 164, struct obd_statfs *)
+#define IOC_LOV_GETINFO                 _IOWR('f', 165, struct lov_user_mds_data *)
+#define LL_IOC_FLUSHCTX                 _IOW ('f', 166, long)
+#define LL_IOC_RMTACL             _IOW ('f', 167, long)
+#define LL_IOC_GETOBDCOUNT           _IOR ('f', 168, long)
+#define LL_IOC_LLOOP_ATTACH         _IOWR('f', 169, long)
+#define LL_IOC_LLOOP_DETACH         _IOWR('f', 170, long)
+#define LL_IOC_LLOOP_INFO             _IOWR('f', 171, struct lu_fid)
+#define LL_IOC_LLOOP_DETACH_BYDEV       _IOWR('f', 172, long)
+#define LL_IOC_PATH2FID                 _IOR ('f', 173, long)
+#define LL_IOC_GET_CONNECT_FLAGS       _IOWR('f', 174, __u64 *)
+#define LL_IOC_GET_MDTIDX             _IOR ('f', 175, int)
+
+/* see <lustre_lib.h> for ioctl numbers 177-210 */
+
+#define LL_IOC_HSM_STATE_GET           _IOR('f', 211, struct hsm_user_state)
+#define LL_IOC_HSM_STATE_SET           _IOW('f', 212, struct hsm_state_set)
+#define LL_IOC_HSM_CT_START            _IOW('f', 213, struct lustre_kernelcomm)
+#define LL_IOC_HSM_COPY_START          _IOW('f', 214, struct hsm_copy *)
+#define LL_IOC_HSM_COPY_END            _IOW('f', 215, struct hsm_copy *)
+#define LL_IOC_HSM_PROGRESS            _IOW('f', 216, struct hsm_user_request)
+#define LL_IOC_HSM_REQUEST             _IOW('f', 217, struct hsm_user_request)
+#define LL_IOC_DATA_VERSION            _IOR('f', 218, struct ioc_data_version)
+#define LL_IOC_LOV_SWAP_LAYOUTS                _IOW('f', 219, \
+                                               struct lustre_swap_layouts)
+#define LL_IOC_HSM_ACTION              _IOR('f', 220, \
+                                               struct hsm_current_action)
+/* see <lustre_lib.h> for ioctl numbers 221-232 */
+
+#define LL_IOC_LMV_SETSTRIPE       _IOWR('f', 240, struct lmv_user_md)
+#define LL_IOC_LMV_GETSTRIPE       _IOWR('f', 241, struct lmv_user_md)
+#define LL_IOC_REMOVE_ENTRY        _IOWR('f', 242, __u64)
+
+#define LL_STATFS_LMV     1
+#define LL_STATFS_LOV     2
+#define LL_STATFS_NODELAY      4
+
+#define IOC_MDC_TYPE       'i'
+#define IOC_MDC_LOOKUP   _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
+#define IOC_MDC_GETFILESTRIPE   _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *)
+#define IOC_MDC_GETFILEINFO     _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *)
+#define LL_IOC_MDC_GETINFO      _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *)
+
+/* Keep these for backward compatibility. */
+#define LL_IOC_OBD_STATFS       IOC_OBD_STATFS
+#define IOC_MDC_GETSTRIPE       IOC_MDC_GETFILESTRIPE
+
+
+#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */
+
+/* Hopefully O_LOV_DELAY_CREATE does not conflict with standard O_xxx flags.
+ * Previously it was defined as 0100000000, which conflicts with FMODE_NONOTIFY
+ * (added in kernel 2.6.36), so we redefine it as 020000000.
+ * To stay compatible with statically linked binaries built against older
+ * versions, we finally define it as (020000000 | 0100000000).
+ */
+#define O_LOV_DELAY_CREATE      0120000000
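+
+/* Usage sketch (illustrative only; "fd", "path", "lum" and "rc" are
+ * placeholder names): an application that wants to choose the file layout
+ * itself typically opens the file with this flag and then sets the stripe
+ * through the LL_IOC_LOV_SETSTRIPE ioctl defined above:
+ *
+ *     fd = open(path, O_CREAT | O_WRONLY | O_LOV_DELAY_CREATE, 0644);
+ *     rc = ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);
+ *
+ * where lum is a struct lov_user_md (defined below) describing the layout.
+ */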
+
+#define LL_FILE_IGNORE_LOCK     0x00000001
+#define LL_FILE_GROUP_LOCKED    0x00000002
+#define LL_FILE_READAHEA       0x00000004
+#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */
+#define LL_FILE_LOCKLESS_IO     0x00000010 /* server-side locks with cio */
+#define LL_FILE_RMTACL   0x00000020
+
+#define LOV_USER_MAGIC_V1 0x0BD10BD0
+#define LOV_USER_MAGIC    LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_USER_MAGIC_V3 0x0BD30BD0
+
+#define LMV_MAGIC_V1      0x0CD10CD0    /*normal stripe lmv magic */
+#define LMV_USER_MAGIC    0x0CD20CD0    /*default lmv magic*/
+
+#define LOV_PATTERN_RAID0 0x001
+#define LOV_PATTERN_RAID1 0x002
+#define LOV_PATTERN_FIRST 0x100
+
+#define LOV_MAXPOOLNAME 16
+#define LOV_POOLNAMEF "%.16s"
+
+#define LOV_MIN_STRIPE_BITS 16   /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
+#define LOV_MAX_STRIPE_COUNT_OLD 160
+/* This calculation is crafted so that an input of 4096 results in 160,
+ * which in turn equals the old maximal stripe count.
+ * XXX: In fact this is too simplified for now; it also needs an ea_type
+ * argument to know exactly how much space each stripe consumes.
+ *
+ * The limit of 12 pages is somewhat arbitrary, but is a reasonably large
+ * allocation that is sufficient for the current generation of systems.
+ *
+ * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */
+#define LOV_MAX_STRIPE_COUNT 2000  /* ((12 * 4096 - 256) / 24) */
+#define LOV_ALL_STRIPES       0xffff /* only valid for directories */
+#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
+
+#define lov_user_ost_data lov_user_ost_data_v1
+struct lov_user_ost_data_v1 {     /* per-stripe data structure */
+       struct ost_id l_ost_oi;   /* OST object ID */
+       __u32 l_ost_gen;          /* generation of this OST index */
+       __u32 l_ost_idx;          /* OST index in LOV */
+} __attribute__((packed));
+
+#define lov_user_md lov_user_md_v1
+struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
+       __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V1 */
+       __u32 lmm_pattern;      /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+       struct ost_id lmm_oi;     /* LOV object ID */
+       __u32 lmm_stripe_size;    /* size of stripe in bytes */
+       __u16 lmm_stripe_count;   /* num stripes in use for this object */
+       union {
+               __u16 lmm_stripe_offset;  /* starting stripe offset in
+                                          * lmm_objects, use when writing */
+               __u16 lmm_layout_gen;     /* layout generation number
+                                          * used when reading */
+       };
+       struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed,  __may_alias__));
+
+struct lov_user_md_v3 {           /* LOV EA user data (host-endian) */
+       __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V3 */
+       __u32 lmm_pattern;      /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+       struct ost_id lmm_oi;     /* LOV object ID */
+       __u32 lmm_stripe_size;    /* size of stripe in bytes */
+       __u16 lmm_stripe_count;   /* num stripes in use for this object */
+       union {
+               __u16 lmm_stripe_offset;  /* starting stripe offset in
+                                          * lmm_objects, use when writing */
+               __u16 lmm_layout_gen;     /* layout generation number
+                                          * used when reading */
+       };
+       char  lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+       struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
+/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
+ * use this.  It is unsafe to #define those values in this header as it
+ * is possible the application has already #included <sys/stat.h>. */
+#ifdef HAVE_LOV_USER_MDS_DATA
+#define lov_user_mds_data lov_user_mds_data_v1
+struct lov_user_mds_data_v1 {
+       lstat_t lmd_st;          /* MDS stat struct */
+       struct lov_user_md_v1 lmd_lmm;  /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v3 {
+       lstat_t lmd_st;          /* MDS stat struct */
+       struct lov_user_md_v3 lmd_lmm;  /* LOV EA V3 user data */
+} __attribute__((packed));
+#endif
+
+/* keep this to be the same size as lov_user_ost_data_v1 */
+struct lmv_user_mds_data {
+       struct lu_fid   lum_fid;
+       __u32           lum_padding;
+       __u32           lum_mds;
+};
+
+/* lum_type */
+enum {
+       LMV_STRIPE_TYPE = 0,
+       LMV_DEFAULT_TYPE = 1,
+};
+
+#define lmv_user_md lmv_user_md_v1
+struct lmv_user_md_v1 {
+       __u32   lum_magic;       /* must be the first field */
+       __u32   lum_stripe_count;  /* dirstripe count */
+       __u32   lum_stripe_offset; /* MDT idx for default dirstripe */
+       __u32   lum_hash_type;     /* Dir stripe policy */
+       __u32   lum_type;         /* LMV type: default or normal */
+       __u32   lum_padding1;
+       __u32   lum_padding2;
+       __u32   lum_padding3;
+       char    lum_pool_name[LOV_MAXPOOLNAME];
+       struct  lmv_user_mds_data  lum_objects[0];
+};
+
+static inline int lmv_user_md_size(int stripes, int lmm_magic)
+{
+       return sizeof(struct lmv_user_md) +
+                     stripes * sizeof(struct lmv_user_mds_data);
+}
+
+extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
+
+struct ll_recreate_obj {
+       __u64 lrc_id;
+       __u32 lrc_ost_idx;
+};
+
+struct ll_fid {
+       __u64 id;        /* holds object id */
+       __u32 generation; /* holds object generation */
+       __u32 f_type;     /* holds object type or stripe idx when passing it to
+                          * OST for saving into EA. */
+};
+
+#define UUID_MAX       40
+struct obd_uuid {
+       char uuid[UUID_MAX];
+};
+
+static inline int obd_uuid_equals(const struct obd_uuid *u1,
+                                 const struct obd_uuid *u2)
+{
+       return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0;
+}
+
+static inline int obd_uuid_empty(struct obd_uuid *uuid)
+{
+       return uuid->uuid[0] == '\0';
+}
+
+static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp)
+{
+       strncpy((char *)uuid->uuid, tmp, sizeof(*uuid));
+       uuid->uuid[sizeof(*uuid) - 1] = '\0';
+}
+
+/* For printf's only, make sure uuid is terminated */
+static inline char *obd_uuid2str(struct obd_uuid *uuid)
+{
+       if (uuid->uuid[sizeof(*uuid) - 1] != '\0') {
+               /* Obviously not safe, but for printfs, no real harm done...
+                  we're always null-terminated, even in a race. */
+               static char temp[sizeof(*uuid)];
+               memcpy(temp, uuid->uuid, sizeof(*uuid) - 1);
+               temp[sizeof(*uuid) - 1] = '\0';
+               return temp;
+       }
+       return (char *)(uuid->uuid);
+}
+
+/* Extract fsname from uuid (or target name) of a target
+   e.g. (myfs-OST0007_UUID -> myfs)
+   see also deuuidify. */
+static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen)
+{
+       char *p;
+
+       strncpy(buf, uuid, buflen - 1);
+       buf[buflen - 1] = '\0';
+       p = strrchr(buf, '-');
+       if (p)
+          *p = '\0';
+}
+
+/* printf display format
+   e.g. printf("file FID is "DFID"\n", PFID(fid)); */
+#define DFID_NOBRACE LPX64":0x%x:0x%x"
+#define DFID "["DFID_NOBRACE"]"
+#define PFID(fid)     \
+       (fid)->f_seq, \
+       (fid)->f_oid, \
+       (fid)->f_ver
+
+/* scanf input parse format -- strip '[' first.
+   e.g. sscanf(fidstr, SFID, RFID(&fid)); */
+/* #define SFID "0x"LPX64i":0x"LPSZX":0x"LPSZX""
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 4 has type 'unsigned int *'
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 5 has type 'unsigned int *'
+*/
+#define SFID "0x"LPX64i":0x%x:0x%x"
+#define RFID(fid)     \
+       &((fid)->f_seq), \
+       &((fid)->f_oid), \
+       &((fid)->f_ver)
+
+
+/********* Quotas **********/
+
+/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
+#define LUSTRE_Q_QUOTAON    0x800002     /* turn quotas on */
+#define LUSTRE_Q_QUOTAOFF   0x800003     /* turn quotas off */
+#define LUSTRE_Q_GETINFO    0x800005     /* get information about quota files */
+#define LUSTRE_Q_SETINFO    0x800006     /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA   0x800007     /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA   0x800008     /* set user quota structure */
+/* lustre-specific control commands */
+#define LUSTRE_Q_INVALIDATE  0x80000b     /* invalidate quota data */
+#define LUSTRE_Q_FINVALIDATE 0x80000c     /* invalidate filter quota data */
+
+#define UGQUOTA 2       /* set both USRQUOTA and GRPQUOTA */
+
+struct if_quotacheck {
+       char                obd_type[16];
+       struct obd_uuid  obd_uuid;
+};
+
+#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629
+
+/* permission */
+#define N_PERMS_MAX      64
+
+struct perm_downcall_data {
+       __u64 pdd_nid;
+       __u32 pdd_perm;
+       __u32 pdd_padding;
+};
+
+struct identity_downcall_data {
+       __u32                       idd_magic;
+       __u32                       idd_err;
+       __u32                       idd_uid;
+       __u32                       idd_gid;
+       __u32                       idd_nperms;
+       __u32                       idd_ngroups;
+       struct perm_downcall_data idd_perms[N_PERMS_MAX];
+       __u32                       idd_groups[0];
+};
+
+/* for non-mapped uid/gid */
+#define NOBODY_UID      99
+#define NOBODY_GID      99
+
+#define INVALID_ID      (-1)
+
+enum {
+       RMT_LSETFACL    = 1,
+       RMT_LGETFACL    = 2,
+       RMT_RSETFACL    = 3,
+       RMT_RGETFACL    = 4
+};
+
+#ifdef NEED_QUOTA_DEFS
+#ifndef QIF_BLIMITS
+#define QIF_BLIMITS     1
+#define QIF_SPACE       2
+#define QIF_ILIMITS     4
+#define QIF_INODES      8
+#define QIF_BTIME       16
+#define QIF_ITIME       32
+#define QIF_LIMITS      (QIF_BLIMITS | QIF_ILIMITS)
+#define QIF_USAGE       (QIF_SPACE | QIF_INODES)
+#define QIF_TIMES       (QIF_BTIME | QIF_ITIME)
+#define QIF_ALL         (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
+#endif
+
+#endif /* !__KERNEL__ */
+
+/* lustre volatile file support
+ * file name header: ".^L^S^T^R:VOLATILE"
+ */
+#define LUSTRE_VOLATILE_HDR    ".\x0c\x13\x14\x12:VOLATILE"
+#define LUSTRE_VOLATILE_HDR_LEN        14
+/* hdr + MDT index */
+#define LUSTRE_VOLATILE_IDX    LUSTRE_VOLATILE_HDR":%.4X:"
+
+typedef enum lustre_quota_version {
+       LUSTRE_QUOTA_V2 = 1
+} lustre_quota_version_t;
+
+/* XXX: same as if_dqinfo struct in kernel */
+struct obd_dqinfo {
+       __u64 dqi_bgrace;
+       __u64 dqi_igrace;
+       __u32 dqi_flags;
+       __u32 dqi_valid;
+};
+
+/* XXX: same as if_dqblk struct in kernel, plus one padding */
+struct obd_dqblk {
+       __u64 dqb_bhardlimit;
+       __u64 dqb_bsoftlimit;
+       __u64 dqb_curspace;
+       __u64 dqb_ihardlimit;
+       __u64 dqb_isoftlimit;
+       __u64 dqb_curinodes;
+       __u64 dqb_btime;
+       __u64 dqb_itime;
+       __u32 dqb_valid;
+       __u32 dqb_padding;
+};
+
+enum {
+       QC_GENERAL      = 0,
+       QC_MDTIDX       = 1,
+       QC_OSTIDX       = 2,
+       QC_UUID  = 3
+};
+
+struct if_quotactl {
+       __u32              qc_cmd;
+       __u32              qc_type;
+       __u32              qc_id;
+       __u32              qc_stat;
+       __u32              qc_valid;
+       __u32              qc_idx;
+       struct obd_dqinfo       qc_dqinfo;
+       struct obd_dqblk        qc_dqblk;
+       char                obd_type[16];
+       struct obd_uuid  obd_uuid;
+};
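+
+/* Usage sketch (illustrative only; "root_fd", "uid" and "rc" are
+ * placeholders, and USRQUOTA comes from <sys/quota.h>): a user's quota is
+ * typically read by filling an if_quotactl and issuing LL_IOC_QUOTACTL on a
+ * file descriptor for the filesystem root:
+ *
+ *     struct if_quotactl qctl = { 0 };
+ *
+ *     qctl.qc_cmd  = LUSTRE_Q_GETQUOTA;
+ *     qctl.qc_type = USRQUOTA;
+ *     qctl.qc_id   = uid;
+ *     rc = ioctl(root_fd, LL_IOC_QUOTACTL, &qctl);
+ *
+ * On success, qctl.qc_dqblk holds the usage and limits.
+ */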
+
+/* swap layout flags */
+#define        SWAP_LAYOUTS_CHECK_DV1          (1 << 0)
+#define        SWAP_LAYOUTS_CHECK_DV2          (1 << 1)
+#define        SWAP_LAYOUTS_KEEP_MTIME         (1 << 2)
+#define        SWAP_LAYOUTS_KEEP_ATIME         (1 << 3)
+struct lustre_swap_layouts {
+       __u64   sl_flags;
+       __u32   sl_fd;
+       __u32   sl_gid;
+       __u64   sl_dv1;
+       __u64   sl_dv2;
+};
+
+
+/********* Changelogs **********/
+/** Changelog record types */
+enum changelog_rec_type {
+       CL_MARK     = 0,
+       CL_CREATE   = 1,  /* namespace */
+       CL_MKDIR    = 2,  /* namespace */
+       CL_HARDLINK = 3,  /* namespace */
+       CL_SOFTLINK = 4,  /* namespace */
+       CL_MKNOD    = 5,  /* namespace */
+       CL_UNLINK   = 6,  /* namespace */
+       CL_RMDIR    = 7,  /* namespace */
+       CL_RENAME   = 8,  /* namespace */
+       CL_EXT      = 9,  /* namespace extended record (2nd half of rename) */
+       CL_OPEN     = 10, /* not currently used */
+       CL_CLOSE    = 11, /* may be written to log only with mtime change */
+       CL_IOCTL    = 12,
+       CL_TRUNC    = 13,
+       CL_SETATTR  = 14,
+       CL_XATTR    = 15,
+       CL_HSM      = 16, /* HSM specific events, see flags */
+       CL_MTIME    = 17, /* Precedence: setattr > mtime > ctime > atime */
+       CL_CTIME    = 18,
+       CL_ATIME    = 19,
+       CL_LAYOUT   = 20,
+       CL_LAST
+};
+
+static inline const char *changelog_type2str(int type) {
+       static const char *changelog_str[] = {
+               "MARK",  "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
+               "RMDIR", "RENME", "RNMTO", "OPEN",  "CLOSE", "IOCTL", "TRUNC",
+               "SATTR", "XATTR", "HSM",   "MTIME", "CTIME", "ATIME", "LAYOUT"
+       };
+
+       if (type >= 0 && type < CL_LAST)
+               return changelog_str[type];
+       return NULL;
+}
+
+/* per-record flags */
+#define CLF_VERSION     0x1000
+#define CLF_EXT_VERSION 0x2000
+#define CLF_FLAGSHIFT   12
+#define CLF_FLAGMASK    ((1U << CLF_FLAGSHIFT) - 1)
+#define CLF_VERMASK     (~CLF_FLAGMASK)
+/* Anything under the flagmask may be per-type (if desired) */
+/* Flags for unlink */
+#define CLF_UNLINK_LAST       0x0001 /* Unlink of last hardlink */
+#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */
+                                    /* HSM cleaning needed */
+/* Flags for rename */
+#define CLF_RENAME_LAST       0x0001 /* rename unlink last hardlink of target */
+
+/* Flags for HSM */
+/* 12b used (from high weight to low weight):
+ * 2b for flags
+ * 3b for event
+ * 7b for error code
+ */
+#define CLF_HSM_ERR_L  0 /* HSM return code, 7 bits */
+#define CLF_HSM_ERR_H  6
+#define CLF_HSM_EVENT_L      7 /* HSM event, 3 bits, see enum hsm_event */
+#define CLF_HSM_EVENT_H      9
+#define CLF_HSM_FLAG_L      10 /* HSM flags, 2 bits, 1 used, 1 spare */
+#define CLF_HSM_FLAG_H      11
+#define CLF_HSM_SPARE_L     12 /* 4 spare bits */
+#define CLF_HSM_SPARE_H     15
+#define CLF_HSM_LAST   15
+
+/* Remove bits higher than _h, then extract the value
+ * between _h and _l by shifting the lower weight to bit 0. */
+#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \
+                                  >> (CLF_HSM_LAST - _h + _l))
+
+#define CLF_HSM_SUCCESS      0x00
+#define CLF_HSM_MAXERROR     0x7E
+#define CLF_HSM_ERROVERFLOW  0x7F
+
+#define CLF_HSM_DIRTY  1 /* file is dirty after HSM request end */
+
+/* 3 bits field => 8 values allowed */
+enum hsm_event {
+       HE_ARCHIVE      = 0,
+       HE_RESTORE      = 1,
+       HE_CANCEL       = 2,
+       HE_RELEASE      = 3,
+       HE_REMOVE       = 4,
+       HE_STATE        = 5,
+       HE_SPARE1       = 6,
+       HE_SPARE2       = 7,
+};
+
+static inline enum hsm_event hsm_get_cl_event(__u16 flags)
+{
+       return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L);
+}
+
+static inline void hsm_set_cl_event(int *flags, enum hsm_event he)
+{
+       *flags |= (he << CLF_HSM_EVENT_L);
+}
+
+static inline __u16 hsm_get_cl_flags(int flags)
+{
+       return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L);
+}
+
+static inline void hsm_set_cl_flags(int *flags, int bits)
+{
+       *flags |= (bits << CLF_HSM_FLAG_L);
+}
+
+static inline int hsm_get_cl_error(int flags)
+{
+       return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L);
+}
+
+static inline void hsm_set_cl_error(int *flags, int error)
+{
+       *flags |= (error << CLF_HSM_ERR_L);
+}
+
+#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + sizeof(struct changelog_rec))
+
+struct changelog_rec {
+       __u16            cr_namelen;
+       __u16            cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */
+       __u32            cr_type;  /**< \a changelog_rec_type */
+       __u64            cr_index; /**< changelog record number */
+       __u64            cr_prev;  /**< last index for this target fid */
+       __u64            cr_time;
+       union {
+               lustre_fid    cr_tfid;  /**< target fid */
+               __u32    cr_markerflags; /**< CL_MARK flags */
+       };
+       lustre_fid          cr_pfid;    /**< parent fid */
+       char              cr_name[0];     /**< last element */
+} __attribute__((packed));
+
+/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec. To save
+ * space, only rename uses changelog_ext_rec, while other record types use
+ * changelog_rec to store records.
+ */
+struct changelog_ext_rec {
+       __u16                   cr_namelen;
+       __u16                   cr_flags; /**< (flags & CLF_FLAGMASK) |
+                                               CLF_EXT_VERSION */
+       __u32                   cr_type;  /**< \a changelog_rec_type */
+       __u64                   cr_index; /**< changelog record number */
+       __u64                   cr_prev;  /**< last index for this target fid */
+       __u64                   cr_time;
+       union {
+               lustre_fid      cr_tfid;        /**< target fid */
+               __u32           cr_markerflags; /**< CL_MARK flags */
+       };
+       lustre_fid              cr_pfid;        /**< target parent fid */
+       lustre_fid              cr_sfid;        /**< source fid, or zero */
+       lustre_fid              cr_spfid;       /**< source parent fid, or zero */
+       char                    cr_name[0];     /**< last element */
+} __attribute__((packed));
+
+#define CHANGELOG_REC_EXTENDED(rec) \
+       (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION)
+
+static inline int changelog_rec_size(struct changelog_rec *rec)
+{
+       return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec):
+                                            sizeof(*rec);
+}
+
+static inline char *changelog_rec_name(struct changelog_rec *rec)
+{
+       return CHANGELOG_REC_EXTENDED(rec) ?
+               ((struct changelog_ext_rec *)rec)->cr_name: rec->cr_name;
+}
+
+static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec)
+{
+       return rec->cr_namelen - strlen(rec->cr_name) - 1;
+}
+
+static inline char *changelog_rec_sname(struct changelog_ext_rec *rec)
+{
+       return rec->cr_name + strlen(rec->cr_name) + 1;
+}
+
+struct ioc_changelog {
+       __u64 icc_recno;
+       __u32 icc_mdtindex;
+       __u32 icc_id;
+       __u32 icc_flags;
+};
+
+enum changelog_message_type {
+       CL_RECORD = 10, /* message is a changelog_rec */
+       CL_EOF    = 11, /* at end of current changelog */
+};
+
+/********* Misc **********/
+
+struct ioc_data_version {
+       __u64 idv_version;
+       __u64 idv_flags;     /* See LL_DV_xxx */
+};
+#define LL_DV_NOFLUSH 0x01   /* Do not take READ EXTENT LOCK before sampling
+                               version. Dirty caches are left unchanged. */
+
+#ifndef offsetof
+# define offsetof(typ,memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define dot_lustre_name ".lustre"
+
+
+/********* HSM **********/
+
+/** HSM per-file state
+ * See HSM_FLAGS below.
+ */
+enum hsm_states {
+       HS_EXISTS       = 0x00000001,
+       HS_DIRTY        = 0x00000002,
+       HS_RELEASED     = 0x00000004,
+       HS_ARCHIVED     = 0x00000008,
+       HS_NORELEASE    = 0x00000010,
+       HS_NOARCHIVE    = 0x00000020,
+       HS_LOST         = 0x00000040,
+};
+
+/* HSM user-settable flags. */
+#define HSM_USER_MASK   (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY)
+
+/* Other HSM flags. */
+#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED)
+
+/*
+ * All HSM-related possible flags that could be applied to a file.
+ * This should be kept in sync with hsm_states.
+ */
+#define HSM_FLAGS_MASK  (HSM_USER_MASK | HSM_STATUS_MASK)
+
+/**
+ * HSM request progress state
+ */
+enum hsm_progress_states {
+       HPS_WAITING     = 1,
+       HPS_RUNNING     = 2,
+       HPS_DONE        = 3,
+};
+#define HPS_NONE       0
+
+static inline char *hsm_progress_state2name(enum hsm_progress_states s)
+{
+       switch  (s) {
+       case HPS_WAITING:       return "waiting";
+       case HPS_RUNNING:       return "running";
+       case HPS_DONE:          return "done";
+       default:                return "unknown";
+       }
+}
+
+struct hsm_extent {
+       __u64 offset;
+       __u64 length;
+} __attribute__((packed));
+
+/**
+ * Current HSM states of a Lustre file.
+ *
+ * This structure is mainly meant to be sent to user space. It describes the
+ * current HSM flags and in-progress action.
+ */
+struct hsm_user_state {
+       /** Current HSM states, from enum hsm_states. */
+       __u32                   hus_states;
+       __u32                   hus_archive_id;
+       /** The action currently in progress, if any */
+       __u32                   hus_in_progress_state;
+       __u32                   hus_in_progress_action;
+       struct hsm_extent       hus_in_progress_location;
+       char                    hus_extended_info[];
+};
+
+struct hsm_state_set_ioc {
+       struct lu_fid   hssi_fid;
+       __u64           hssi_setmask;
+       __u64           hssi_clearmask;
+};
+
+/*
+ * This structure describes the current in-progress action for a file.
+ * It is returned to user space and sent over the wire.
+ */
+struct hsm_current_action {
+       /** The action currently in progress, if any */
+       /* state is one of hsm_progress_states */
+       __u32                   hca_state;
+       /* action is one of hsm_user_action */
+       __u32                   hca_action;
+       struct hsm_extent       hca_location;
+};
+
+/***** HSM user requests ******/
+/* User-generated (lfs/ioctl) request types */
+enum hsm_user_action {
+       HUA_NONE    =  1, /* no action (noop) */
+       HUA_ARCHIVE = 10, /* copy to hsm */
+       HUA_RESTORE = 11, /* prestage */
+       HUA_RELEASE = 12, /* drop ost objects */
+       HUA_REMOVE  = 13, /* remove from archive */
+       HUA_CANCEL  = 14  /* cancel a request */
+};
+
+static inline char *hsm_user_action2name(enum hsm_user_action  a)
+{
+       switch  (a) {
+       case HUA_NONE:    return "NOOP";
+       case HUA_ARCHIVE: return "ARCHIVE";
+       case HUA_RESTORE: return "RESTORE";
+       case HUA_RELEASE: return "RELEASE";
+       case HUA_REMOVE:  return "REMOVE";
+       case HUA_CANCEL:  return "CANCEL";
+       default:          return "UNKNOWN";
+       }
+}
+
+/*
+ * List of hr_flags (bit field)
+ */
+#define HSM_FORCE_ACTION 0x0001
+/* used by CT, cannot be set by user */
+#define HSM_GHOST_COPY   0x0002
+
+/**
+ * Contains the fixed part of struct hsm_user_request.
+ *
+ */
+struct hsm_request {
+       __u32 hr_action;        /* enum hsm_user_action */
+       __u32 hr_archive_id;    /* archive id, used only with HUA_ARCHIVE */
+       __u64 hr_flags;         /* request flags */
+       __u32 hr_itemcount;     /* item count in hur_user_item vector */
+       __u32 hr_data_len;
+};
+
+struct hsm_user_item {
+       lustre_fid      hui_fid;
+       struct hsm_extent hui_extent;
+} __attribute__((packed));
+
+struct hsm_user_request {
+       struct hsm_request      hur_request;
+       struct hsm_user_item    hur_user_item[0];
+       /* extra data blob at end of struct (after all
+        * hur_user_items), only use helpers to access it
+        */
+} __attribute__((packed));
+
+/** Return pointer to data field in a hsm user request */
+static inline void *hur_data(struct hsm_user_request *hur)
+{
+       return &(hur->hur_user_item[hur->hur_request.hr_itemcount]);
+}
+
+/** Compute the current length of the provided hsm_user_request. */
+static inline int hur_len(struct hsm_user_request *hur)
+{
+       return offsetof(struct hsm_user_request,
+                       hur_user_item[hur->hur_request.hr_itemcount]) +
+               hur->hur_request.hr_data_len;
+}
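+
+/* Usage sketch (illustrative only; "n", "data", "data_len", "fd" and "rc"
+ * are placeholders, with fd open on a file or directory in the filesystem,
+ * e.g. the mount root): a request is normally built with the
+ * llapi_hsm_user_request_alloc() helper from lustreapi.h and submitted
+ * through the LL_IOC_HSM_REQUEST ioctl defined above:
+ *
+ *     struct hsm_user_request *hur;
+ *
+ *     hur = llapi_hsm_user_request_alloc(n, data_len);
+ *     hur->hur_request.hr_action     = HUA_ARCHIVE;
+ *     hur->hur_request.hr_archive_id = 1;
+ *     hur->hur_request.hr_itemcount  = n;
+ *     hur->hur_request.hr_data_len   = data_len;
+ *     ... fill hur->hur_user_item[i].hui_fid / hui_extent for each item ...
+ *     memcpy(hur_data(hur), data, data_len);
+ *     rc = ioctl(fd, LL_IOC_HSM_REQUEST, hur);
+ *
+ * hur_len(hur) then gives the total size of the buffer that was sent.
+ */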
+
+/****** HSM RPCs to copytool *****/
+/* Message types the copytool may receive */
+enum hsm_message_type {
+       HMT_ACTION_LIST = 100, /* message is a hsm_action_list */
+};
+
+/* Actions the copytool may be instructed to take for a given action_item */
+enum hsm_copytool_action {
+       HSMA_NONE    = 10, /* no action */
+       HSMA_ARCHIVE = 20, /* arbitrary offset */
+       HSMA_RESTORE = 21,
+       HSMA_REMOVE  = 22,
+       HSMA_CANCEL  = 23
+};
+
+static inline char *hsm_copytool_action2name(enum hsm_copytool_action  a)
+{
+       switch  (a) {
+       case HSMA_NONE:    return "NOOP";
+       case HSMA_ARCHIVE: return "ARCHIVE";
+       case HSMA_RESTORE: return "RESTORE";
+       case HSMA_REMOVE:  return "REMOVE";
+       case HSMA_CANCEL:  return "CANCEL";
+       default:           return "UNKNOWN";
+       }
+}
+
+/* Copytool item action description */
+struct hsm_action_item {
+       __u32      hai_len;     /* valid size of this struct */
+       __u32      hai_action;  /* hsm_copytool_action, but use known size */
+       lustre_fid hai_fid;     /* Lustre FID to operate on */
+       lustre_fid hai_dfid;    /* fid used for data access */
+       struct hsm_extent hai_extent;  /* byte range to operate on */
+       __u64      hai_cookie;  /* action cookie from coordinator */
+       __u64      hai_gid;     /* grouplock id */
+       char       hai_data[0]; /* variable length */
+} __attribute__((packed));
+
+/*
+ * helper function which prints in hex the first bytes of
+ * the hai opaque field
+ * \param hai [IN] record to print
+ * \param buffer [OUT] output buffer
+ * \param len [IN] max buffer len
+ * \retval buffer
+ */
+static inline char *hai_dump_data_field(struct hsm_action_item *hai,
+                                       char *buffer, int len)
+{
+       int i, sz, data_len;
+       char *ptr;
+
+       ptr = buffer;
+       sz = len;
+       data_len = hai->hai_len - sizeof(*hai);
+       for (i = 0 ; (i < data_len) && (sz > 0) ; i++)
+       {
+               int cnt;
+
+               cnt = snprintf(ptr, sz, "%.2X",
+                              (unsigned char)hai->hai_data[i]);
+               ptr += cnt;
+               sz -= cnt;
+       }
+       *ptr = '\0';
+       return buffer;
+}
+
+/* Copytool action list */
+#define HAL_VERSION 1
+#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */
+struct hsm_action_list {
+       __u32 hal_version;
+       __u32 hal_count;       /* number of hai's to follow */
+       __u64 hal_compound_id; /* returned by coordinator */
+       __u64 hal_flags;
+       __u32 hal_archive_id; /* which archive backend */
+       __u32 padding1;
+       char  hal_fsname[0];   /* null-terminated */
+       /* struct hsm_action_item[hal_count] follows, aligned on 8-byte
+          boundaries. See hai_zero */
+} __attribute__((packed));
+
+#ifndef HAVE_CFS_SIZE_ROUND
+static inline int cfs_size_round (int val)
+{
+       return (val + 7) & (~0x7);
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+/* Return pointer to first hai in action list */
+static inline struct hsm_action_item * hai_zero(struct hsm_action_list *hal)
+{
+       return (struct hsm_action_item *)(hal->hal_fsname +
+                                         cfs_size_round(strlen(hal-> \
+                                                               hal_fsname)));
+}
+/* Return pointer to next hai */
+static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai)
+{
+       return (struct hsm_action_item *)((char *)hai +
+                                         cfs_size_round(hai->hai_len));
+}
+
+/* Return size of an hsm_action_list */
+static inline int hal_size(struct hsm_action_list *hal)
+{
+       int i, sz;
+       struct hsm_action_item *hai;
+
+       sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname));
+       hai = hai_zero(hal);
+       for (i = 0 ; i < hal->hal_count ; i++) {
+               sz += cfs_size_round(hai->hai_len);
+               hai = hai_next(hai);
+       }
+       return(sz);
+}
+
+/* Copytool progress reporting */
+#define HP_FLAG_COMPLETED 0x01
+#define HP_FLAG_RETRY     0x02
+
+struct hsm_progress {
+       lustre_fid              hp_fid;
+       __u64                   hp_cookie;
+       struct hsm_extent       hp_extent;
+       __u16                   hp_flags;
+       __u16                   hp_errval; /* positive val */
+       __u32                   padding;
+};
+
+/**
+ * Used by the copytool for any HSM request it handles.
+ * This structure is initialized by llapi_hsm_copy_start(),
+ * which is a helper over the ioctl() interface.
+ * It stores data for Lustre internal use only.
+ */
+struct hsm_copy {
+       __u64                   hc_data_version;
+       __u16                   hc_flags;
+       __u16                   hc_errval; /* positive val */
+       __u32                   padding;
+       struct hsm_action_item  hc_hai;
+};
+
+/** @} lustreuser */
+
+#endif /* _LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustreapi.h b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h
new file mode 100644 (file)
index 0000000..63da665
--- /dev/null
@@ -0,0 +1,310 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTREAPI_H_
+#define _LUSTREAPI_H_
+
+/** \defgroup llapi llapi
+ *
+ * @{
+ */
+
+#include <lustre/lustre_user.h>
+
+typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args);
+
+/* lustreapi message severity level */
+enum llapi_message_level {
+       LLAPI_MSG_OFF    = 0,
+       LLAPI_MSG_FATAL  = 1,
+       LLAPI_MSG_ERROR  = 2,
+       LLAPI_MSG_WARN   = 3,
+       LLAPI_MSG_NORMAL = 4,
+       LLAPI_MSG_INFO   = 5,
+       LLAPI_MSG_DEBUG  = 6,
+       LLAPI_MSG_MAX
+};
+
+/* the bottom three bits reserved for llapi_message_level */
+#define LLAPI_MSG_MASK   0x00000007
+#define LLAPI_MSG_NO_ERRNO      0x00000010
+
+extern void llapi_msg_set_level(int level);
+extern void llapi_error(int level, int rc, char *fmt, ...);
+#define llapi_err_noerrno(level, fmt, a...)                         \
+       llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a)
+extern void llapi_printf(int level, char *fmt, ...);
+extern int llapi_file_create(const char *name, unsigned long long stripe_size,
+                            int stripe_offset, int stripe_count,
+                            int stripe_pattern);
+extern int llapi_file_open(const char *name, int flags, int mode,
+                          unsigned long long stripe_size, int stripe_offset,
+                          int stripe_count, int stripe_pattern);
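+
+/* Usage sketch (illustrative only; the path and values are placeholders):
+ *
+ *     rc = llapi_file_create("/mnt/lustre/file", 1048576, -1, 4, 0);
+ *
+ * creates an empty file striped over 4 OSTs with a 1 MiB stripe size; a
+ * stripe_offset of -1 lets the MDS pick the starting OST and a
+ * stripe_pattern of 0 selects the default (RAID0) pattern.
+ */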
+extern int llapi_file_create_pool(const char *name,
+                                 unsigned long long stripe_size,
+                                 int stripe_offset, int stripe_count,
+                                 int stripe_pattern, char *pool_name);
+extern int llapi_file_open_pool(const char *name, int flags, int mode,
+                               unsigned long long stripe_size,
+                               int stripe_offset, int stripe_count,
+                               int stripe_pattern, char *pool_name);
+extern int llapi_poollist(const char *name);
+extern int llapi_get_poollist(const char *name, char **poollist, int list_size,
+                             char *buffer, int buffer_size);
+extern int llapi_get_poolmembers(const char *poolname, char **members,
+                                int list_size, char *buffer, int buffer_size);
+extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
+#define HAVE_LLAPI_FILE_LOOKUP
+extern int llapi_file_lookup(int dirfd, const char *name);
+
+#define VERBOSE_COUNT      0x1
+#define VERBOSE_SIZE       0x2
+#define VERBOSE_OFFSET     0x4
+#define VERBOSE_POOL       0x8
+#define VERBOSE_DETAIL     0x10
+#define VERBOSE_OBJID      0x20
+#define VERBOSE_GENERATION 0x40
+#define VERBOSE_MDTINDEX   0x80
+#define VERBOSE_ALL    (VERBOSE_COUNT | VERBOSE_SIZE | VERBOSE_OFFSET | \
+                           VERBOSE_POOL | VERBOSE_OBJID | VERBOSE_GENERATION)
+
+struct find_param {
+       unsigned int maxdepth;
+       time_t  atime;
+       time_t  mtime;
+       time_t  ctime;
+       int     asign;  /* cannot be bitfields due to using pointers to */
+       int     csign;  /* access them during argument parsing. */
+       int     msign;
+       int     type;
+       int          size_sign:2,       /* these need to be signed values */
+                       stripesize_sign:2,
+                       stripecount_sign:2;
+       unsigned long long size;
+       unsigned long long size_units;
+       uid_t uid;
+       gid_t gid;
+
+       unsigned long   zeroend:1,
+                       recursive:1,
+                       exclude_pattern:1,
+                       exclude_type:1,
+                       exclude_obd:1,
+                       exclude_mdt:1,
+                       exclude_gid:1,
+                       exclude_uid:1,
+                       check_gid:1,        /* group ID */
+                       check_uid:1,        /* user ID */
+                       check_pool:1,      /* LOV pool name */
+                       check_size:1,      /* file size */
+                       exclude_pool:1,
+                       exclude_size:1,
+                       exclude_atime:1,
+                       exclude_mtime:1,
+                       exclude_ctime:1,
+                       get_lmv:1,            /* get MDT list from LMV */
+                       raw:1,            /* do not fill in defaults */
+                       check_stripesize:1,     /* LOV stripe size */
+                       exclude_stripesize:1,
+                       check_stripecount:1,    /* LOV stripe count */
+                       exclude_stripecount:1;
+
+       int     verbose;
+       int     quiet;
+
+       /* regular expression */
+       char   *pattern;
+
+       char   *print_fmt;
+
+       struct  obd_uuid       *obduuid;
+       int                  num_obds;
+       int                  num_alloc_obds;
+       int                  obdindex;
+       int                 *obdindexes;
+
+       struct  obd_uuid       *mdtuuid;
+       int                  num_mdts;
+       int                  num_alloc_mdts;
+       int                  mdtindex;
+       int                 *mdtindexes;
+       int                  file_mdtindex;
+
+       int     lumlen;
+       struct  lov_user_mds_data *lmd;
+
+       char poolname[LOV_MAXPOOLNAME + 1];
+
+       int                     fp_lmv_count;
+       struct lmv_user_md      *fp_lmv_md;
+
+       unsigned long long stripesize;
+       unsigned long long stripesize_units;
+       unsigned long long stripecount;
+
+       /* In-process parameters. */
+       unsigned long   got_uuids:1,
+                       obds_printed:1,
+                       have_fileinfo:1;        /* file attrs and LOV xattr */
+       unsigned int    depth;
+       dev_t      st_dev;
+};
+
+extern int llapi_ostlist(char *path, struct find_param *param);
+extern int llapi_uuid_match(char *real_uuid, char *search_uuid);
+extern int llapi_getstripe(char *path, struct find_param *param);
+extern int llapi_find(char *path, struct find_param *param);
+
+extern int llapi_file_fget_mdtidx(int fd, int *mdtidx);
+extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset,
+                                int stripe_count, int stripe_pattern,
+                                char *poolname);
+int llapi_direntry_remove(char *dname);
+extern int llapi_obd_statfs(char *path, __u32 type, __u32 index,
+                    struct obd_statfs *stat_buf,
+                    struct obd_uuid *uuid_buf);
+extern int llapi_ping(char *obd_type, char *obd_name);
+extern int llapi_target_check(int num_types, char **obd_types, char *dir);
+extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid);
+extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid);
+extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid);
+extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
+extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count);
+extern int llapi_is_lustre_mnttype(const char *type);
+extern int llapi_search_ost(char *fsname, char *poolname, char *ostname);
+extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt);
+extern int parse_size(char *optarg, unsigned long long *size,
+                     unsigned long long *size_units, int bytes_spec);
+extern int llapi_search_mounts(const char *pathname, int index,
+                              char *mntdir, char *fsname);
+extern int llapi_search_fsname(const char *pathname, char *fsname);
+extern int llapi_getname(const char *path, char *buf, size_t size);
+
+extern void llapi_ping_target(char *obd_type, char *obd_name,
+                             char *obd_uuid, void *args);
+
+extern int llapi_search_rootpath(char *pathname, const char *fsname);
+
+struct mntent;
+#define HAVE_LLAPI_IS_LUSTRE_MNT
+extern int llapi_is_lustre_mnt(struct mntent *mnt);
+extern int llapi_quotachown(char *path, int flag);
+extern int llapi_quotacheck(char *mnt, int check_type);
+extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk);
+extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl);
+extern int llapi_target_iterate(int type_num, char **obd_type, void *args,
+                               llapi_cb_t cb);
+extern int llapi_get_connect_flags(const char *mnt, __u64 *flags);
+extern int llapi_lsetfacl(int argc, char *argv[]);
+extern int llapi_lgetfacl(int argc, char *argv[]);
+extern int llapi_rsetfacl(int argc, char *argv[]);
+extern int llapi_rgetfacl(int argc, char *argv[]);
+extern int llapi_cp(int argc, char *argv[]);
+extern int llapi_ls(int argc, char *argv[]);
+extern int llapi_fid2path(const char *device, const char *fidstr, char *path,
+                         int pathlen, long long *recno, int *linkno);
+extern int llapi_path2fid(const char *path, lustre_fid *fid);
+extern int llapi_fd2fid(const int fd, lustre_fid *fid);
+
+extern int llapi_get_version(char *buffer, int buffer_size, char **version);
+extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags);
+extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus);
+extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask,
+                              __u32 archive_id);
+
+extern int llapi_create_volatile_idx(char *directory, int idx, int mode);
+static inline int llapi_create_volatile(char *directory, int mode)
+{
+       return llapi_create_volatile_idx(directory, -1, mode);
+}
+
+
+extern int llapi_fswap_layouts(const int fd1, const int fd2,
+                              __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_swap_layouts(const char *path1, const char *path2,
+                             __u64 dv1, __u64 dv2, __u64 flags);
+
+/* Changelog interface.  priv is private state, managed internally
+   by these functions */
+#define CHANGELOG_FLAG_FOLLOW 0x01   /* Not yet implemented */
+#define CHANGELOG_FLAG_BLOCK  0x02   /* Blocking IO makes sense in case of
+   slow user parsing of the records, but it also prevents us from cleaning
+   up if the records are not consumed. */
+
+/* Records received are now in extended format. Though most are still written
+ * to disk in changelog_rec format (to save space and time), they are converted
+ * to the extended format in the lustre API to ease changelog analysis. */
+#define HAVE_CHANGELOG_EXTEND_REC 1
+
+extern int llapi_changelog_start(void **priv, int flags, const char *mdtname,
+                                long long startrec);
+extern int llapi_changelog_fini(void **priv);
+extern int llapi_changelog_recv(void *priv, struct changelog_ext_rec **rech);
+extern int llapi_changelog_free(struct changelog_ext_rec **rech);
+/* Allow records up to endrec to be destroyed; requires registered id. */
+extern int llapi_changelog_clear(const char *mdtname, const char *idstr,
+                                long long endrec);
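+
+/* Usage sketch (illustrative only; "mdtname" is a placeholder such as
+ * "lustre-MDT0000", and "rc" a return code): a typical reader loop is
+ *
+ *     void *priv;
+ *     struct changelog_ext_rec *rec;
+ *
+ *     rc = llapi_changelog_start(&priv, CHANGELOG_FLAG_BLOCK, mdtname, 0);
+ *     while (llapi_changelog_recv(priv, &rec) == 0) {
+ *             ... process rec, then ...
+ *             llapi_changelog_free(&rec);
+ *     }
+ *     llapi_changelog_fini(&priv);
+ */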
+
+/* HSM copytool interface.
+ * priv is private state, managed internally by these functions
+ */
+struct hsm_copytool_private;
+extern int llapi_hsm_copytool_start(struct hsm_copytool_private **priv,
+                                   char *fsname, int flags,
+                                   int archive_count, int *archives);
+extern int llapi_hsm_copytool_fini(struct hsm_copytool_private **priv);
+extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv,
+                                  struct hsm_action_list **hal, int *msgsize);
+extern int llapi_hsm_copytool_free(struct hsm_action_list **hal);
+extern int llapi_hsm_copy_start(char *mnt, struct hsm_copy *copy,
+                               const struct hsm_action_item *hai);
+extern int llapi_hsm_copy_end(char *mnt, struct hsm_copy *copy,
+                             const struct hsm_progress *hp);
+extern int llapi_hsm_progress(char *mnt, struct hsm_progress *hp);
+extern int llapi_hsm_import(const char *dst, int archive, struct stat *st,
+                           unsigned long long stripe_size, int stripe_offset,
+                           int stripe_count, int stripe_pattern,
+                           char *pool_name, lustre_fid *newfid);
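+
+/* Usage sketch (illustrative only; "fsname", "archive_count", "archives" and
+ * "rc" are placeholders): a minimal copytool receive loop looks like
+ *
+ *     struct hsm_copytool_private *priv;
+ *     struct hsm_action_list *hal;
+ *     int msgsize;
+ *
+ *     rc = llapi_hsm_copytool_start(&priv, fsname, 0, archive_count, archives);
+ *     while (llapi_hsm_copytool_recv(priv, &hal, &msgsize) == 0) {
+ *             ... walk the hsm_action_items in hal (see hai_zero()/hai_next()
+ *             in lustre_user.h), calling llapi_hsm_copy_start()/_end() and
+ *             llapi_hsm_progress() as the work proceeds ...
+ *             llapi_hsm_copytool_free(&hal);
+ *     }
+ *     llapi_hsm_copytool_fini(&priv);
+ */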
+
+/* HSM user interface */
+extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount,
+                                                            int data_len);
+extern int llapi_hsm_request(char *mnt, struct hsm_user_request *request);
+extern int llapi_hsm_current_action(const char *path,
+                                   struct hsm_current_action *hca);
+/** @} llapi */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h
new file mode 100644 (file)
index 0000000..5cfb87b
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_acl.h
+ */
+
+#ifndef _LUSTRE_ACL_H
+#define _LUSTRE_ACL_H
+
+#include <linux/lustre_acl.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_capa.h b/drivers/staging/lustre/lustre/include/lustre_capa.h
new file mode 100644 (file)
index 0000000..d77bffc
--- /dev/null
@@ -0,0 +1,305 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_capa.h
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#ifndef __LINUX_CAPA_H_
+#define __LINUX_CAPA_H_
+
+/** \defgroup capa capa
+ *
+ * @{
+ */
+
+/*
+ * capability
+ */
+#include <linux/crypto.h>
+#include <lustre/lustre_idl.h>
+
+#define CAPA_TIMEOUT 1800              /* sec, == 30 min */
+#define CAPA_KEY_TIMEOUT (24 * 60 * 60)  /* sec, == 1 day */
+
+struct capa_hmac_alg {
+       const char     *ha_name;
+       int          ha_len;
+       int          ha_keylen;
+};
+
+#define DEF_CAPA_HMAC_ALG(name, type, len, keylen)      \
+[CAPA_HMAC_ALG_ ## type] = {                       \
+       .ha_name         = name,                        \
+       .ha_len   = len,                         \
+       .ha_keylen       = keylen,                    \
+}
+
+struct client_capa {
+       struct inode         *inode;
+       struct list_head                lli_list;     /* link to lli_oss_capas */
+};
+
+struct target_capa {
+       struct hlist_node         c_hash;       /* link to capa hash */
+};
+
+struct obd_capa {
+       struct list_head                c_list;       /* link to capa_list */
+
+       struct lustre_capa      c_capa;       /* capa */
+       atomic_t              c_refc;       /* ref count */
+       cfs_time_t              c_expiry;     /* jiffies */
+       spinlock_t              c_lock; /* protect capa content */
+       int                     c_site;
+
+       union {
+               struct client_capa      cli;
+               struct target_capa      tgt;
+       } u;
+};
+
+enum {
+       CAPA_SITE_CLIENT = 0,
+       CAPA_SITE_SERVER,
+       CAPA_SITE_MAX
+};
+
+static inline struct lu_fid *capa_fid(struct lustre_capa *capa)
+{
+       return &capa->lc_fid;
+}
+
+static inline __u64 capa_opc(struct lustre_capa *capa)
+{
+       return capa->lc_opc;
+}
+
+static inline __u64 capa_uid(struct lustre_capa *capa)
+{
+       return capa->lc_uid;
+}
+
+static inline __u64 capa_gid(struct lustre_capa *capa)
+{
+       return capa->lc_gid;
+}
+
+static inline __u32 capa_flags(struct lustre_capa *capa)
+{
+       return capa->lc_flags & 0xffffff;
+}
+
+static inline __u32 capa_alg(struct lustre_capa *capa)
+{
+       return (capa->lc_flags >> 24);
+}
+
+static inline __u32 capa_keyid(struct lustre_capa *capa)
+{
+       return capa->lc_keyid;
+}
+
+static inline __u64 capa_key_seq(struct lustre_capa_key *key)
+{
+       return key->lk_seq;
+}
+
+static inline __u32 capa_key_keyid(struct lustre_capa_key *key)
+{
+       return key->lk_keyid;
+}
+
+static inline __u32 capa_timeout(struct lustre_capa *capa)
+{
+       return capa->lc_timeout;
+}
+
+static inline __u32 capa_expiry(struct lustre_capa *capa)
+{
+       return capa->lc_expiry;
+}
+
+void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *,
+                const char *fmt, ... );
+#define DEBUG_CAPA(level, capa, fmt, args...)                            \
+do {                                                                      \
+       if (((level) & D_CANTMASK) != 0 ||                                   \
+           ((libcfs_debug & (level)) != 0 &&                             \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) {               \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);             \
+               _debug_capa((capa), &msgdata, fmt, ##args);                 \
+       }                                                                     \
+} while (0)
+
+#define DEBUG_CAPA_KEY(level, k, fmt, args...)                          \
+do {                                                                      \
+CDEBUG(level, fmt " capability key@%p seq "LPU64" keyid %u\n",          \
+       ##args, k, capa_key_seq(k), capa_key_keyid(k));                  \
+} while (0)
+
+typedef int (* renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *);
+
+/* obdclass/capa.c */
+extern struct list_head capa_list[];
+extern spinlock_t capa_lock;
+extern int capa_count[];
+extern struct kmem_cache *capa_cachep;
+
+struct hlist_head *init_capa_hash(void);
+void cleanup_capa_hash(struct hlist_head *hash);
+
+struct obd_capa *capa_add(struct hlist_head *hash,
+                         struct lustre_capa *capa);
+struct obd_capa *capa_lookup(struct hlist_head *hash,
+                            struct lustre_capa *capa, int alive);
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key);
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+void capa_cpy(void *dst, struct obd_capa *ocapa);
+static inline struct obd_capa *alloc_capa(int site)
+{
+       struct obd_capa *ocapa;
+
+       if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER))
+               return ERR_PTR(-EINVAL);
+
+       OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep);
+       if (unlikely(!ocapa))
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&ocapa->c_list);
+       atomic_set(&ocapa->c_refc, 1);
+       spin_lock_init(&ocapa->c_lock);
+       ocapa->c_site = site;
+       if (ocapa->c_site == CAPA_SITE_CLIENT)
+               INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+       else
+               INIT_HLIST_NODE(&ocapa->u.tgt.c_hash);
+
+       return ocapa;
+}
+
+static inline struct obd_capa *capa_get(struct obd_capa *ocapa)
+{
+       if (!ocapa)
+               return NULL;
+
+       atomic_inc(&ocapa->c_refc);
+       return ocapa;
+}
+
+static inline void capa_put(struct obd_capa *ocapa)
+{
+       if (!ocapa)
+               return;
+
+       if (atomic_read(&ocapa->c_refc) == 0) {
+               DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for");
+               LBUG();
+       }
+
+       if (atomic_dec_and_test(&ocapa->c_refc)) {
+               LASSERT(list_empty(&ocapa->c_list));
+               if (ocapa->c_site == CAPA_SITE_CLIENT) {
+                       LASSERT(list_empty(&ocapa->u.cli.lli_list));
+               } else {
+                       struct hlist_node *hnode;
+
+                       hnode = &ocapa->u.tgt.c_hash;
+                       LASSERT(!hnode->next && !hnode->pprev);
+               }
+               OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa));
+       }
+}
+
+static inline int open_flags_to_accmode(int flags)
+{
+       int mode = flags;
+
+       if ((mode + 1) & O_ACCMODE)
+               mode++;
+       if (mode & O_TRUNC)
+               mode |= 2;
+
+       return mode;
+}
+
+static inline __u64 capa_open_opc(int mode)
+{
+       return mode & FMODE_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ;
+}
+
+static inline void set_capa_expiry(struct obd_capa *ocapa)
+{
+       cfs_time_t expiry = cfs_time_sub((cfs_time_t)ocapa->c_capa.lc_expiry,
+                                        cfs_time_current_sec());
+       ocapa->c_expiry = cfs_time_add(cfs_time_current(),
+                                      cfs_time_seconds(expiry));
+}
+
+static inline int capa_is_expired_sec(struct lustre_capa *capa)
+{
+       return (capa->lc_expiry - cfs_time_current_sec() <= 0);
+}
+
+static inline int capa_is_expired(struct obd_capa *ocapa)
+{
+       return cfs_time_beforeq(ocapa->c_expiry, cfs_time_current());
+}
+
+static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc)
+{
+       return (capa_opc(capa) & opc) == opc;
+}
+
+struct filter_capa_key {
+       struct list_head              k_list;
+       struct lustre_capa_key  k_key;
+};
+
+enum {
+       LC_ID_NONE      = 0,
+       LC_ID_PLAIN     = 1,
+       LC_ID_CONVERT   = 2
+};
+
+#define BYPASS_CAPA ((struct lustre_capa *)ERR_PTR(-ENOENT))
+
+/** @} capa */
+
+#endif /* __LINUX_CAPA_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h
new file mode 100644 (file)
index 0000000..f12429f
--- /dev/null
@@ -0,0 +1,299 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_CFG_H
+#define _LUSTRE_CFG_H
+
+/** \defgroup cfg cfg
+ *
+ * @{
+ */
+
+/*
+ * 1cf6
+ * lcfG
+ */
+#define LUSTRE_CFG_VERSION 0x1cf60001
+#define LUSTRE_CFG_MAX_BUFCOUNT 8
+
+#define LCFG_HDR_SIZE(count) \
+    cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)]))
+
+/** If the LCFG_REQUIRED bit is set in a configuration command,
+ * then the client is required to understand this parameter
+ * in order to mount the filesystem. If it does not understand
+ * a REQUIRED command the client mount will fail. */
+#define LCFG_REQUIRED   0x0001000
+
+enum lcfg_command_type {
+       LCFG_ATTACH          = 0x00cf001, /**< create a new obd instance */
+       LCFG_DETACH          = 0x00cf002, /**< destroy obd instance */
+       LCFG_SETUP            = 0x00cf003, /**< call type-specific setup */
+       LCFG_CLEANUP        = 0x00cf004, /**< call type-specific cleanup */
+       LCFG_ADD_UUID      = 0x00cf005, /**< add a nid to a niduuid */
+       LCFG_DEL_UUID      = 0x00cf006, /**< remove a nid from a niduuid */
+       LCFG_MOUNTOPT      = 0x00cf007, /**< create a profile (mdc, osc) */
+       LCFG_DEL_MOUNTOPT       = 0x00cf008, /**< destroy a profile */
+       LCFG_SET_TIMEOUT        = 0x00cf009, /**< set obd_timeout */
+       LCFG_SET_UPCALL  = 0x00cf00a, /**< deprecated */
+       LCFG_ADD_CONN      = 0x00cf00b, /**< add a failover niduuid to an obd */
+       LCFG_DEL_CONN      = 0x00cf00c, /**< remove a failover niduuid */
+       LCFG_LOV_ADD_OBD        = 0x00cf00d, /**< add an osc to a lov */
+       LCFG_LOV_DEL_OBD        = 0x00cf00e, /**< remove an osc from a lov */
+       LCFG_PARAM            = 0x00cf00f, /**< set a proc parameter */
+       LCFG_MARKER          = 0x00cf010, /**< metadata about next cfg rec */
+       LCFG_LOG_START    = 0x00ce011, /**< mgc only, process a cfg log */
+       LCFG_LOG_END        = 0x00ce012, /**< stop processing updates */
+       LCFG_LOV_ADD_INA        = 0x00ce013, /**< like LOV_ADD_OBD, inactive */
+       LCFG_ADD_MDC        = 0x00cf014, /**< add an mdc to a lmv */
+       LCFG_DEL_MDC        = 0x00cf015, /**< remove an mdc from a lmv */
+       LCFG_SPTLRPC_CONF       = 0x00ce016, /**< security */
+       LCFG_POOL_NEW      = 0x00ce020, /**< create an ost pool name */
+       LCFG_POOL_ADD      = 0x00ce021, /**< add an ost to a pool */
+       LCFG_POOL_REM      = 0x00ce022, /**< remove an ost from a pool */
+       LCFG_POOL_DEL      = 0x00ce023, /**< destroy an ost pool name */
+       LCFG_SET_LDLM_TIMEOUT   = 0x00ce030, /**< set ldlm_timeout */
+       LCFG_PRE_CLEANUP        = 0x00cf031, /**< call type-specific
+                                             * pre-cleanup */
+};
+
+struct lustre_cfg_bufs {
+       void    *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT];
+       __u32    lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT];
+       __u32    lcfg_bufcount;
+};
+
+struct lustre_cfg {
+       __u32 lcfg_version;
+       __u32 lcfg_command;
+
+       __u32 lcfg_num;
+       __u32 lcfg_flags;
+       __u64 lcfg_nid;
+       __u32 lcfg_nal;         /* not used any more */
+
+       __u32 lcfg_bufcount;
+       __u32 lcfg_buflens[0];
+};
+
+enum cfg_record_type {
+       PORTALS_CFG_TYPE = 1,
+       LUSTRE_CFG_TYPE = 123,
+};
+
+#define LUSTRE_CFG_BUFLEN(lcfg, idx)       \
+       ((lcfg)->lcfg_bufcount <= (idx)  \
+        ? 0                                \
+        : (lcfg)->lcfg_buflens[(idx)])
+
+static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs,
+                                      __u32               index,
+                                      void                *buf,
+                                      __u32               buflen)
+{
+       if (index >= LUSTRE_CFG_MAX_BUFCOUNT)
+               return;
+       if (bufs == NULL)
+               return;
+
+       if (bufs->lcfg_bufcount <= index)
+               bufs->lcfg_bufcount = index + 1;
+
+       bufs->lcfg_buf[index]    = buf;
+       bufs->lcfg_buflen[index] = buflen;
+}
+
+static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs,
+                                             __u32 index,
+                                             char *str)
+{
+       lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0);
+}
+
+static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name)
+{
+       memset((bufs), 0, sizeof(*bufs));
+       if (name)
+               lustre_cfg_bufs_set_string(bufs, 0, name);
+}
+
+static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index)
+{
+       int i;
+       int offset;
+       int bufcount;
+       LASSERT (lcfg != NULL);
+       LASSERT (index >= 0);
+
+       bufcount = lcfg->lcfg_bufcount;
+       if (index >= bufcount)
+               return NULL;
+
+       offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+       for (i = 0; i < index; i++)
+               offset += cfs_size_round(lcfg->lcfg_buflens[i]);
+       return (char *)lcfg + offset;
+}
+
+static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs,
+                                       struct lustre_cfg *lcfg)
+{
+       int i;
+       bufs->lcfg_bufcount = lcfg->lcfg_bufcount;
+       for (i = 0; i < bufs->lcfg_bufcount; i++) {
+               bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i];
+               bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i);
+       }
+}
+
+static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index)
+{
+       char *s;
+
+       if (lcfg->lcfg_buflens[index] == 0)
+               return NULL;
+
+       s = lustre_cfg_buf(lcfg, index);
+       if (s == NULL)
+               return NULL;
+
+       /*
+        * make sure it's NULL terminated, even if this kills a char
+        * of data.  Try to use the padding first though.
+        */
+       if (s[lcfg->lcfg_buflens[index] - 1] != '\0') {
+               int last = min((int)lcfg->lcfg_buflens[index],
+                              cfs_size_round(lcfg->lcfg_buflens[index]) - 1);
+               char lost = s[last];
+               s[last] = '\0';
+               if (lost != '\0') {
+                       CWARN("Truncated buf %d to '%s' (lost '%c'...)\n",
+                             index, s, lost);
+               }
+       }
+       return s;
+}
+
+static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens)
+{
+       int i;
+       int len;
+       ENTRY;
+
+       len = LCFG_HDR_SIZE(bufcount);
+       for (i = 0; i < bufcount; i++)
+               len += cfs_size_round(buflens[i]);
+
+       RETURN(cfs_size_round(len));
+}
+
+
+#include <obd_support.h>
+
+static inline struct lustre_cfg *lustre_cfg_new(int cmd,
+                                               struct lustre_cfg_bufs *bufs)
+{
+       struct lustre_cfg *lcfg;
+       char *ptr;
+       int i;
+
+       ENTRY;
+
+       OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
+                                      bufs->lcfg_buflen));
+       if (!lcfg)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       lcfg->lcfg_version = LUSTRE_CFG_VERSION;
+       lcfg->lcfg_command = cmd;
+       lcfg->lcfg_bufcount = bufs->lcfg_bufcount;
+
+       ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+       for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+               lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i];
+               LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr);
+       }
+       RETURN(lcfg);
+}
+
+static inline void lustre_cfg_free(struct lustre_cfg *lcfg)
+{
+       int len;
+
+       len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens);
+
+       OBD_FREE(lcfg, len);
+       EXIT;
+       return;
+}
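A rough sketch of how these helpers are typically combined; the target name and parameter string below are invented for illustration.

	struct lustre_cfg_bufs bufs;
	struct lustre_cfg *lcfg;

	lustre_cfg_bufs_reset(&bufs, "lustre-MDT0000");       /* buf 0: target name */
	lustre_cfg_bufs_set_string(&bufs, 1, "timeout=100");  /* buf 1: parameter */

	lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
	if (!IS_ERR(lcfg)) {
		/* ... hand the record to the config log / obd layer ... */
		lustre_cfg_free(lcfg);
	}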
+
+static inline int lustre_cfg_sanity_check(void *buf, int len)
+{
+       struct lustre_cfg *lcfg = (struct lustre_cfg *)buf;
+       ENTRY;
+       if (!lcfg)
+               RETURN(-EINVAL);
+
+       /* check that the first bits of the struct are valid */
+       if (len < LCFG_HDR_SIZE(0))
+               RETURN(-EINVAL);
+
+       if (lcfg->lcfg_version != LUSTRE_CFG_VERSION)
+               RETURN(-EINVAL);
+
+       if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT)
+               RETURN(-EINVAL);
+
+       /* check that the buflens are valid */
+       if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount))
+               RETURN(-EINVAL);
+
+       /* make sure all the pointers point inside the data */
+       if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens))
+               RETURN(-EINVAL);
+
+       RETURN(0);
+}
+
+#include <lustre/lustre_user.h>
+
+#ifndef INVALID_UID
+#define INVALID_UID     (-1)
+#endif
+
+/** @} cfg */
+
+#endif /* _LUSTRE_CFG_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h
new file mode 100644 (file)
index 0000000..3d9e446
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_DEBUG_H
+#define _LUSTRE_DEBUG_H
+
+/** \defgroup debug debug
+ *
+ * @{
+ */
+
+#include <lustre_net.h>
+#include <obd.h>
+
+#include <linux/lustre_debug.h>
+
+#define ASSERT_MAX_SIZE_MB 60000ULL
+#define ASSERT_PAGE_INDEX(index, OP)                               \
+do { if (index > ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT)) {       \
+       CERROR("bad page index %lu > %llu\n", index,                \
+              ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT));      \
+       libcfs_debug = ~0UL;                                        \
+       OP;                                                          \
+}} while(0)
+
+#define ASSERT_FILE_OFFSET(offset, OP)                           \
+do { if (offset > ASSERT_MAX_SIZE_MB << 20) {                     \
+       CERROR("bad file offset %llu > %llu\n", offset,          \
+              ASSERT_MAX_SIZE_MB << 20);                              \
+       libcfs_debug = ~0UL;                                        \
+       OP;                                                          \
+}} while(0)
+
+/* lib/debug.c */
+void dump_lniobuf(struct niobuf_local *lnb);
+int dump_req(struct ptlrpc_request *req);
+void dump_lsm(int level, struct lov_stripe_md *lsm);
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id);
+int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id);
+
+/** @} debug */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
new file mode 100644 (file)
index 0000000..c2504c5
--- /dev/null
@@ -0,0 +1,553 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_disk.h
+ *
+ * Lustre disk format definitions.
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_DISK_H
+#define _LUSTRE_DISK_H
+
+/** \defgroup disk disk
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+/****************** on-disk files *********************/
+
+#define MDT_LOGS_DIR      "LOGS"  /* COMPAT_146 */
+#define MOUNT_CONFIGS_DIR "CONFIGS"
+#define CONFIGS_FILE      "mountdata"
+/** Persistent mount data are stored on the disk in this file. */
+#define MOUNT_DATA_FILE    MOUNT_CONFIGS_DIR"/"CONFIGS_FILE
+#define LAST_RCVD       "last_rcvd"
+#define LOV_OBJID       "lov_objid"
+#define LOV_OBJSEQ             "lov_objseq"
+#define HEALTH_CHECK      "health_check"
+#define CAPA_KEYS       "capa_keys"
+#define CHANGELOG_USERS   "changelog_users"
+#define MGS_NIDTBL_DIR    "NIDTBL_VERSIONS"
+#define QMT_DIR           "quota_master"
+#define QSD_DIR           "quota_slave"
+
+/****************** persistent mount data *********************/
+
+#define LDD_F_SV_TYPE_MDT   0x0001
+#define LDD_F_SV_TYPE_OST   0x0002
+#define LDD_F_SV_TYPE_MGS   0x0004
+#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT  | \
+                           LDD_F_SV_TYPE_OST  | \
+                           LDD_F_SV_TYPE_MGS)
+#define LDD_F_SV_ALL   0x0008
+/** need an index assignment */
+#define LDD_F_NEED_INDEX    0x0010
+/** never registered */
+#define LDD_F_VIRGIN   0x0020
+/** update the config logs for this server */
+#define LDD_F_UPDATE   0x0040
+/** rewrite the LDD */
+#define LDD_F_REWRITE_LDD   0x0080
+/** regenerate config logs for this fs or server */
+#define LDD_F_WRITECONF     0x0100
+/** COMPAT_14 */
+#define LDD_F_UPGRADE14     0x0200
+/** process as lctl conf_param */
+#define LDD_F_PARAM     0x0400
+/** all nodes are specified as service nodes */
+#define LDD_F_NO_PRIMNODE   0x1000
+/** IR enable flag */
+#define LDD_F_IR_CAPABLE    0x2000
+/** the MGS refused to register the target. */
+#define LDD_F_ERROR     0x4000
+
+/* opc for target register */
+#define LDD_F_OPC_REG   0x10000000
+#define LDD_F_OPC_UNREG 0x20000000
+#define LDD_F_OPC_READY 0x40000000
+#define LDD_F_OPC_MASK  0xf0000000
+
+#define LDD_F_ONDISK_MASK  (LDD_F_SV_TYPE_MASK)
+
+#define LDD_F_MASK       0xFFFF
+
+enum ldd_mount_type {
+       LDD_MT_EXT3 = 0,
+       LDD_MT_LDISKFS,
+       LDD_MT_SMFS,
+       LDD_MT_REISERFS,
+       LDD_MT_LDISKFS2,
+       LDD_MT_ZFS,
+       LDD_MT_LAST
+};
+
+static inline char *mt_str(enum ldd_mount_type mt)
+{
+       static char *mount_type_string[] = {
+               "ext3",
+               "ldiskfs",
+               "smfs",
+               "reiserfs",
+               "ldiskfs2",
+               "zfs",
+       };
+       return mount_type_string[mt];
+}
+
+static inline char *mt_type(enum ldd_mount_type mt)
+{
+       static char *mount_type_string[] = {
+               "osd-ldiskfs",
+               "osd-ldiskfs",
+               "osd-smfs",
+               "osd-reiserfs",
+               "osd-ldiskfs",
+               "osd-zfs",
+       };
+       return mount_type_string[mt];
+}
+
+#define LDD_INCOMPAT_SUPP 0
+#define LDD_ROCOMPAT_SUPP 0
+
+#define LDD_MAGIC 0x1dd00001
+
+/* On-disk configuration file. In host-endian order. */
+struct lustre_disk_data {
+       __u32      ldd_magic;
+       __u32      ldd_feature_compat;  /* compatible feature flags */
+       __u32      ldd_feature_rocompat;/* read-only compatible feature flags */
+       __u32      ldd_feature_incompat;/* incompatible feature flags */
+
+       __u32      ldd_config_ver;      /* config rewrite count - not used */
+       __u32      ldd_flags;      /* LDD_SV_TYPE */
+       __u32      ldd_svindex;  /* server index (0001), must match
+                                          svname */
+       __u32      ldd_mount_type;      /* target fs type LDD_MT_* */
+       char       ldd_fsname[64];      /* filesystem this server is part of,
+                                          MTI_NAME_MAXLEN */
+       char       ldd_svname[64];      /* this server's name (lustre-mdt0001)*/
+       __u8       ldd_uuid[40];        /* server UUID (COMPAT_146) */
+
+/*200*/ char       ldd_userdata[1024 - 200]; /* arbitrary user string */
+/*1024*/__u8       ldd_padding[4096 - 1024];
+/*4096*/char       ldd_mount_opts[4096]; /* target fs mount opts */
+/*8192*/char       ldd_params[4096];     /* key=value pairs */
+};
+
+
+#define IS_MDT(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_MDT)
+#define IS_OST(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_OST)
+#define IS_MGS(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_MGS)
+#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \
+                        LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST))
+#define MT_STR(data)    mt_str((data)->ldd_mount_type)
+
+/* Make the mdt/ost server obd name based on the filesystem name */
+static inline int server_make_name(__u32 flags, __u16 index, char *fs,
+                                  char *name)
+{
+       if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) {
+               if (!(flags & LDD_F_SV_ALL))
+                       sprintf(name, "%.8s%c%s%04x", fs,
+                               (flags & LDD_F_VIRGIN) ? ':' :
+                                       ((flags & LDD_F_WRITECONF) ? '=' : '-'),
+                               (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST",
+                               index);
+       } else if (flags & LDD_F_SV_TYPE_MGS) {
+               sprintf(name, "MGS");
+       } else {
+               CERROR("unknown server type %#x\n", flags);
+               return 1;
+       }
+       return 0;
+}
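For example (hypothetical values, not taken from the patch), a registered MDT at index 1 and a never-registered OST at index 2 of a filesystem named "lustre" come out as follows:

	char name[64];

	server_make_name(LDD_F_SV_TYPE_MDT, 1, "lustre", name);
	/* name == "lustre-MDT0001" */
	server_make_name(LDD_F_SV_TYPE_OST | LDD_F_VIRGIN, 2, "lustre", name);
	/* name == "lustre:OST0002", ':' marking a virgin target */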
+
+/****************** mount command *********************/
+
+/* The lmd is only used internally by Lustre; mount simply passes
+   everything as string options */
+
+#define LMD_MAGIC    0xbdacbd03
+
+/* gleaned from the mount command - no persistent info here */
+struct lustre_mount_data {
+       __u32      lmd_magic;
+       __u32      lmd_flags;    /* lustre mount flags */
+       int     lmd_mgs_failnodes; /* mgs failover node count */
+       int     lmd_exclude_count;
+       int     lmd_recovery_time_soft;
+       int     lmd_recovery_time_hard;
+       char      *lmd_dev;        /* device name */
+       char      *lmd_profile;       /* client only */
+       char      *lmd_mgssec;  /* sptlrpc flavor to mgs */
+       char      *lmd_opts;      /* lustre mount options (as opposed to
+                                        _device_ mount options) */
+       char      *lmd_params;  /* lustre params */
+       __u32     *lmd_exclude;       /* array of OSTs to ignore */
+       char    *lmd_mgs;          /* MGS nid */
+       char    *lmd_osd_type;      /* OSD type */
+};
+
+#define LMD_FLG_SERVER       0x0001  /* Mounting a server */
+#define LMD_FLG_CLIENT       0x0002  /* Mounting a client */
+#define LMD_FLG_ABORT_RECOV  0x0008  /* Abort recovery */
+#define LMD_FLG_NOSVC  0x0010  /* Only start MGS/MGC for servers,
+                                       no other services */
+#define LMD_FLG_NOMGS  0x0020  /* Only start target for servers, reusing
+                                       existing MGS services */
+#define LMD_FLG_WRITECONF    0x0040  /* Rewrite config log */
+#define LMD_FLG_NOIR    0x0080  /* NO imperative recovery */
+#define LMD_FLG_NOSCRUB             0x0100  /* Do not trigger scrub automatically */
+#define LMD_FLG_MGS         0x0200  /* Also start MGS along with server */
+#define LMD_FLG_IAM         0x0400  /* IAM dir */
+#define LMD_FLG_NO_PRIMNODE  0x0800  /* all nodes are service nodes */
+#define LMD_FLG_VIRGIN      0x1000  /* the service registers first time */
+#define LMD_FLG_UPDATE      0x2000  /* update parameters */
+
+#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT)
+
+
+/****************** last_rcvd file *********************/
+
+/** version recovery epoch */
+#define LR_EPOCH_BITS   32
+#define lr_epoch(a) ((a) >> LR_EPOCH_BITS)
+#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
+#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */
+
+#define LR_SERVER_SIZE   512
+#define LR_CLIENT_START 8192
+#define LR_CLIENT_SIZE   128
+#if LR_CLIENT_START < LR_SERVER_SIZE
+#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
+#endif
+
+/*
+ * This limit is arbitrary (131072 clients on x86), but it is convenient to use
+ * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation.
+ * If we need more than 131072 clients (order-2 allocation on x86) then this
+ * should become an array of single-page pointers that are allocated on demand.
+ */
+#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8)
+#define LR_MAX_CLIENTS (128 * 1024UL)
+#else
+#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8)
+#endif
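To make the arithmetic concrete: with the 4 KB PAGE_CACHE_SIZE typical of x86, an order-2 allocation holds 4 * 4096 * 8 = 131072 bits, which is where the 131072-client figure above comes from; on that configuration the 128 * 1024 branch is the one selected.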
+
+/** COMPAT_146: this is an OST (temporary) */
+#define OBD_COMPAT_OST   0x00000002
+/** COMPAT_146: this is an MDT (temporary) */
+#define OBD_COMPAT_MDT   0x00000004
+/** 2.0 server, interop flag to show server version is changed */
+#define OBD_COMPAT_20     0x00000008
+
+/** MDS handles LOV_OBJID file */
+#define OBD_ROCOMPAT_LOVOBJID   0x00000001
+
+/** OST handles group subdirs */
+#define OBD_INCOMPAT_GROUPS     0x00000001
+/** this is an OST */
+#define OBD_INCOMPAT_OST       0x00000002
+/** this is an MDT */
+#define OBD_INCOMPAT_MDT       0x00000004
+/** common last_rcvd format */
+#define OBD_INCOMPAT_COMMON_LR  0x00000008
+/** FID is enabled */
+#define OBD_INCOMPAT_FID       0x00000010
+/** Size-on-MDS is enabled */
+#define OBD_INCOMPAT_SOM       0x00000020
+/** filesystem using iam format to store directory entries */
+#define OBD_INCOMPAT_IAM_DIR    0x00000040
+/** LMA attribute contains per-inode incompatible flags */
+#define OBD_INCOMPAT_LMA       0x00000080
+/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16
+ * bits are now used to store a generation. Once we start changing the layout
+ * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count
+ * will be confused by interpreting stripe_count | gen << 16 as the actual
+ * stripe count */
+#define OBD_INCOMPAT_LMM_VER    0x00000100
+/** multiple OI files for MDT */
+#define OBD_INCOMPAT_MULTI_OI   0x00000200
+
+/* Data stored per server at the head of the last_rcvd file.  In le32 order.
+   This should be common to filter_internal.h, lustre_mds.h */
+struct lr_server_data {
+       __u8  lsd_uuid[40];     /* server UUID */
+       __u64 lsd_last_transno;    /* last completed transaction ID */
+       __u64 lsd_compat14;     /* reserved - compat with old last_rcvd */
+       __u64 lsd_mount_count;     /* incarnation number */
+       __u32 lsd_feature_compat;  /* compatible feature flags */
+       __u32 lsd_feature_rocompat;/* read-only compatible feature flags */
+       __u32 lsd_feature_incompat;/* incompatible feature flags */
+       __u32 lsd_server_size;     /* size of server data area */
+       __u32 lsd_client_start;    /* start of per-client data area */
+       __u16 lsd_client_size;     /* size of per-client data area */
+       __u16 lsd_subdir_count;    /* number of subdirectories for objects */
+       __u64 lsd_catalog_oid;     /* recovery catalog object id */
+       __u32 lsd_catalog_ogen;    /* recovery catalog inode generation */
+       __u8  lsd_peeruuid[40];    /* UUID of MDS associated with this OST */
+       __u32 lsd_osd_index;       /* index number of OST in LOV */
+       __u32 lsd_padding1;     /* was lsd_mdt_index, unused in 2.4.0 */
+       __u32 lsd_start_epoch;     /* VBR: start epoch from last boot */
+       /** transaction values since lsd_trans_table_time */
+       __u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
+       /** start point of transno table below */
+       __u32 lsd_trans_table_time; /* time of first slot in table above */
+       __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
+       __u8  lsd_padding[LR_SERVER_SIZE - 288];
+};
+
+/* Data stored per client in the last_rcvd file.  In le32 order. */
+struct lsd_client_data {
+       __u8  lcd_uuid[40];      /* client UUID */
+       __u64 lcd_last_transno; /* last completed transaction ID */
+       __u64 lcd_last_xid;     /* xid for the last transaction */
+       __u32 lcd_last_result;  /* result from last RPC */
+       __u32 lcd_last_data;    /* per-op data (disposition for open &c.) */
+       /* for MDS_CLOSE requests */
+       __u64 lcd_last_close_transno; /* last completed transaction ID */
+       __u64 lcd_last_close_xid;     /* xid for the last transaction */
+       __u32 lcd_last_close_result;  /* result from last RPC */
+       __u32 lcd_last_close_data;    /* per-op data */
+       /* VBR: last versions */
+       __u64 lcd_pre_versions[4];
+       __u32 lcd_last_epoch;
+       /** orphan handling for delayed exports relies on this */
+       __u32 lcd_first_epoch;
+       __u8  lcd_padding[LR_CLIENT_SIZE - 128];
+};
+
+/* bug20354: the lcd_uuid for export of clients may be wrong */
+static inline void check_lcd(char *obd_name, int index,
+                            struct lsd_client_data *lcd)
+{
+       int length = sizeof(lcd->lcd_uuid);
+       if (strnlen((char*)lcd->lcd_uuid, length) == length) {
+               lcd->lcd_uuid[length - 1] = '\0';
+
+               LCONSOLE_ERROR("the client UUID (%s) on %s for exports "
+                              "stored in last_rcvd (index = %d) is bad!\n",
+                              lcd->lcd_uuid, obd_name, index);
+       }
+}
+
+/* last_rcvd handling */
+static inline void lsd_le_to_cpu(struct lr_server_data *buf,
+                                struct lr_server_data *lsd)
+{
+       int i;
+       memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid));
+       lsd->lsd_last_transno     = le64_to_cpu(buf->lsd_last_transno);
+       lsd->lsd_compat14        = le64_to_cpu(buf->lsd_compat14);
+       lsd->lsd_mount_count      = le64_to_cpu(buf->lsd_mount_count);
+       lsd->lsd_feature_compat   = le32_to_cpu(buf->lsd_feature_compat);
+       lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat);
+       lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat);
+       lsd->lsd_server_size      = le32_to_cpu(buf->lsd_server_size);
+       lsd->lsd_client_start     = le32_to_cpu(buf->lsd_client_start);
+       lsd->lsd_client_size      = le16_to_cpu(buf->lsd_client_size);
+       lsd->lsd_subdir_count     = le16_to_cpu(buf->lsd_subdir_count);
+       lsd->lsd_catalog_oid      = le64_to_cpu(buf->lsd_catalog_oid);
+       lsd->lsd_catalog_ogen     = le32_to_cpu(buf->lsd_catalog_ogen);
+       memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid));
+       lsd->lsd_osd_index      = le32_to_cpu(buf->lsd_osd_index);
+       lsd->lsd_padding1       = le32_to_cpu(buf->lsd_padding1);
+       lsd->lsd_start_epoch      = le32_to_cpu(buf->lsd_start_epoch);
+       for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+               lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]);
+       lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time);
+       lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals);
+}
+
+static inline void lsd_cpu_to_le(struct lr_server_data *lsd,
+                                struct lr_server_data *buf)
+{
+       int i;
+       memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid));
+       buf->lsd_last_transno     = cpu_to_le64(lsd->lsd_last_transno);
+       buf->lsd_compat14        = cpu_to_le64(lsd->lsd_compat14);
+       buf->lsd_mount_count      = cpu_to_le64(lsd->lsd_mount_count);
+       buf->lsd_feature_compat   = cpu_to_le32(lsd->lsd_feature_compat);
+       buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat);
+       buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat);
+       buf->lsd_server_size      = cpu_to_le32(lsd->lsd_server_size);
+       buf->lsd_client_start     = cpu_to_le32(lsd->lsd_client_start);
+       buf->lsd_client_size      = cpu_to_le16(lsd->lsd_client_size);
+       buf->lsd_subdir_count     = cpu_to_le16(lsd->lsd_subdir_count);
+       buf->lsd_catalog_oid      = cpu_to_le64(lsd->lsd_catalog_oid);
+       buf->lsd_catalog_ogen     = cpu_to_le32(lsd->lsd_catalog_ogen);
+       memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid));
+       buf->lsd_osd_index        = cpu_to_le32(lsd->lsd_osd_index);
+       buf->lsd_padding1         = cpu_to_le32(lsd->lsd_padding1);
+       buf->lsd_start_epoch      = cpu_to_le32(lsd->lsd_start_epoch);
+       for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+               buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]);
+       buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time);
+       buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals);
+}
+
+static inline void lcd_le_to_cpu(struct lsd_client_data *buf,
+                                struct lsd_client_data *lcd)
+{
+       memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid));
+       lcd->lcd_last_transno       = le64_to_cpu(buf->lcd_last_transno);
+       lcd->lcd_last_xid          = le64_to_cpu(buf->lcd_last_xid);
+       lcd->lcd_last_result    = le32_to_cpu(buf->lcd_last_result);
+       lcd->lcd_last_data        = le32_to_cpu(buf->lcd_last_data);
+       lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno);
+       lcd->lcd_last_close_xid     = le64_to_cpu(buf->lcd_last_close_xid);
+       lcd->lcd_last_close_result  = le32_to_cpu(buf->lcd_last_close_result);
+       lcd->lcd_last_close_data    = le32_to_cpu(buf->lcd_last_close_data);
+       lcd->lcd_pre_versions[0]    = le64_to_cpu(buf->lcd_pre_versions[0]);
+       lcd->lcd_pre_versions[1]    = le64_to_cpu(buf->lcd_pre_versions[1]);
+       lcd->lcd_pre_versions[2]    = le64_to_cpu(buf->lcd_pre_versions[2]);
+       lcd->lcd_pre_versions[3]    = le64_to_cpu(buf->lcd_pre_versions[3]);
+       lcd->lcd_last_epoch      = le32_to_cpu(buf->lcd_last_epoch);
+       lcd->lcd_first_epoch    = le32_to_cpu(buf->lcd_first_epoch);
+}
+
+static inline void lcd_cpu_to_le(struct lsd_client_data *lcd,
+                                struct lsd_client_data *buf)
+{
+       memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid));
+       buf->lcd_last_transno       = cpu_to_le64(lcd->lcd_last_transno);
+       buf->lcd_last_xid          = cpu_to_le64(lcd->lcd_last_xid);
+       buf->lcd_last_result    = cpu_to_le32(lcd->lcd_last_result);
+       buf->lcd_last_data        = cpu_to_le32(lcd->lcd_last_data);
+       buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno);
+       buf->lcd_last_close_xid     = cpu_to_le64(lcd->lcd_last_close_xid);
+       buf->lcd_last_close_result  = cpu_to_le32(lcd->lcd_last_close_result);
+       buf->lcd_last_close_data    = cpu_to_le32(lcd->lcd_last_close_data);
+       buf->lcd_pre_versions[0]    = cpu_to_le64(lcd->lcd_pre_versions[0]);
+       buf->lcd_pre_versions[1]    = cpu_to_le64(lcd->lcd_pre_versions[1]);
+       buf->lcd_pre_versions[2]    = cpu_to_le64(lcd->lcd_pre_versions[2]);
+       buf->lcd_pre_versions[3]    = cpu_to_le64(lcd->lcd_pre_versions[3]);
+       buf->lcd_last_epoch      = cpu_to_le32(lcd->lcd_last_epoch);
+       buf->lcd_first_epoch    = cpu_to_le32(lcd->lcd_first_epoch);
+}
+
+static inline __u64 lcd_last_transno(struct lsd_client_data *lcd)
+{
+       return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ?
+               lcd->lcd_last_transno : lcd->lcd_last_close_transno);
+}
+
+static inline __u64 lcd_last_xid(struct lsd_client_data *lcd)
+{
+       return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ?
+               lcd->lcd_last_xid : lcd->lcd_last_close_xid);
+}
+
+/****************** superblock additional info *********************/
+
+struct ll_sb_info;
+
+struct lustre_sb_info {
+       int                    lsi_flags;
+       struct obd_device       *lsi_mgc;     /* mgc obd */
+       struct lustre_mount_data *lsi_lmd;     /* mount command info */
+       struct ll_sb_info       *lsi_llsbi;   /* add'l client sbi info */
+       struct dt_device         *lsi_dt_dev;  /* dt device to access disk fs*/
+       struct vfsmount   *lsi_srv_mnt; /* the one server mount */
+       atomic_t              lsi_mounts;  /* references to the srv_mnt */
+       char                      lsi_svname[MTI_NAME_MAXLEN];
+       char                      lsi_osd_obdname[64];
+       char                      lsi_osd_uuid[64];
+       struct obd_export        *lsi_osd_exp;
+       char                      lsi_osd_type[16];
+       char                      lsi_fstype[16];
+       struct backing_dev_info   lsi_bdi;     /* each client mountpoint needs
+                                                 its own backing_dev_info */
+};
+
+#define LSI_UMOUNT_FAILOVER          0x00200000
+#define LSI_BDI_INITIALIZED          0x00400000
+
+#define     s2lsi(sb)  ((struct lustre_sb_info *)((sb)->s_fs_info))
+#define     s2lsi_nocast(sb) ((sb)->s_fs_info)
+
+#define     get_profile_name(sb)   (s2lsi(sb)->lsi_lmd->lmd_profile)
+#define            get_mount_flags(sb)    (s2lsi(sb)->lsi_lmd->lmd_flags)
+#define            get_mntdev_name(sb)    (s2lsi(sb)->lsi_lmd->lmd_dev)
+
+
+/****************** mount lookup info *********************/
+
+struct lustre_mount_info {
+       char             *lmi_name;
+       struct super_block   *lmi_sb;
+       struct vfsmount      *lmi_mnt;
+       struct list_head            lmi_list_chain;
+};
+
+/* On-disk structure describing local object OID storage;
+ * it is used with any sequence managed by the
+ * local object library. */
+struct los_ondisk {
+       __u32 lso_magic;
+       __u32 lso_next_oid;
+};
+
+#define LOS_MAGIC      0xdecafbee
+
+/****************** prototypes *********************/
+
+/* obd_mount.c */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr);
+int server_name2index(const char *svname, __u32 *idx, const char **endptr);
+int server_name2svname(const char *label, char *svname, const char **endptr,
+                      size_t svsize);
+
+int lustre_put_lsi(struct super_block *sb);
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+                       char *s1, char *s2, char *s3, char *s4);
+int lustre_start_mgc(struct super_block *sb);
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+                                                 struct vfsmount *mnt));
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
+int lustre_common_put_super(struct super_block *sb);
+
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type);
+
+/** @} disk */
+
+#endif /* _LUSTRE_DISK_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h
new file mode 100644 (file)
index 0000000..f6eaed8
--- /dev/null
@@ -0,0 +1,1668 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** \defgroup LDLM Lustre Distributed Lock Manager
+ *
+ * Lustre DLM is based on VAX DLM.
+ * Its two main roles are:
+ *   - To provide locking assuring consistency of data on all Lustre nodes.
+ *   - To allow clients to cache state protected by a lock by holding the
+ *     lock until a conflicting lock is requested or it is expired by the LRU.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_DLM_H__
+#define _LUSTRE_DLM_H__
+
+#include <linux/lustre_dlm.h>
+
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_handles.h>
+#include <interval_tree.h> /* for interval_node{}, ldlm_extent */
+#include <lu_ref.h>
+
+struct obd_ops;
+struct obd_device;
+
+#define OBD_LDLM_DEVICENAME  "ldlm"
+
+#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus())
+#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000))
+#define LDLM_CTIME_AGE_LIMIT (10)
+#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024
+
+/**
+ * LDLM non-error return states
+ */
+typedef enum {
+       ELDLM_OK = 0,
+
+       ELDLM_LOCK_CHANGED = 300,
+       ELDLM_LOCK_ABORTED = 301,
+       ELDLM_LOCK_REPLACED = 302,
+       ELDLM_NO_LOCK_DATA = 303,
+       ELDLM_LOCK_WOULDBLOCK = 304,
+
+       ELDLM_NAMESPACE_EXISTS = 400,
+       ELDLM_BAD_NAMESPACE    = 401
+} ldlm_error_t;
+
+/**
+ * LDLM namespace type.
+ * The "client" type is actually an indication that this is a narrow local view
+ * into the complete namespace on the server. Such namespaces cannot make any
+ * decisions about lack of conflicts or do any autonomous lock granting without
+ * first speaking to a server.
+ */
+typedef enum {
+       LDLM_NAMESPACE_SERVER = 1 << 0,
+       LDLM_NAMESPACE_CLIENT = 1 << 1
+} ldlm_side_t;
+
+/**
+ * Declaration of flags sent through the wire.
+ **/
+#define LDLM_FL_LOCK_CHANGED   0x000001 /* extent, mode, or resource changed */
+
+/**
+ * If the server returns one of these flags, then the lock was put on that list.
+ * If the client sends one of these flags (during recovery ONLY!), it wants the
+ * lock added to the specified list, no questions asked.
+ */
+#define LDLM_FL_BLOCK_GRANTED  0x000002
+#define LDLM_FL_BLOCK_CONV     0x000004
+#define LDLM_FL_BLOCK_WAIT     0x000008
+
+/* Used to be LDLM_FL_CBPENDING 0x000010 moved to non-wire flags */
+
+#define LDLM_FL_AST_SENT       0x000020 /* blocking or cancel packet was
+                                        * queued for sending. */
+/* Used to be LDLM_FL_WAIT_NOREPROC 0x000040   moved to non-wire flags */
+/* Used to be LDLM_FL_CANCEL   0x000080   moved to non-wire flags */
+
+/**
+ * Lock is being replayed.  This could probably be implied by the fact that one
+ * of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous.
+ */
+#define LDLM_FL_REPLAY  0x000100
+
+#define LDLM_FL_INTENT_ONLY    0x000200 /* Don't grant lock, just do intent. */
+
+/* Used to be LDLM_FL_LOCAL_ONLY 0x000400  moved to non-wire flags */
+/* Used to be LDLM_FL_FAILED     0x000800  moved to non-wire flags */
+
+#define LDLM_FL_HAS_INTENT     0x001000 /* lock request has intent */
+
+/* Used to be LDLM_FL_CANCELING  0x002000  moved to non-wire flags */
+/* Used to be LDLM_FL_LOCAL      0x004000  moved to non-wire flags */
+
+#define LDLM_FL_DISCARD_DATA   0x010000 /* discard (no writeback) on cancel */
+
+#define LDLM_FL_NO_TIMEOUT     0x020000 /* Blocked by group lock - wait
+                                        * indefinitely */
+
+/** file & record locking */
+#define LDLM_FL_BLOCK_NOWAIT   0x040000 /* Server told not to wait if blocked.
+                                        * For AGL, OST will not send glimpse
+                                        * callback. */
+#define LDLM_FL_TEST_LOCK      0x080000 /* return blocking lock */
+
+/* Used to be LDLM_FL_LVB_READY  0x100000 moved to non-wire flags */
+/* Used to be LDLM_FL_KMS_IGNORE 0x200000 moved to non-wire flags */
+/* Used to be LDLM_FL_NO_LRU     0x400000 moved to non-wire flags */
+
+/* Immediately cancel such locks when they block some other locks. Send
+ * a cancel notification to the original lock holder, but expect no reply. This
+ * is for clients (like liblustre) that cannot be expected to reliably respond
+ * to blocking ASTs. */
+#define LDLM_FL_CANCEL_ON_BLOCK 0x800000
+
+/* Flags inherited from the parent lock when doing intents. */
+#define LDLM_INHERIT_FLAGS     (LDLM_FL_CANCEL_ON_BLOCK)
+
+/* Used to be LDLM_FL_CP_REQD  0x1000000 moved to non-wire flags */
+/* Used to be LDLM_FL_CLEANED  0x2000000 moved to non-wire flags */
+/* Used to be LDLM_FL_ATOMIC_CB      0x4000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_AST    0x10000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_DONE  0x20000000 moved to non-wire flags */
+
+/* measure lock contention and return -EUSERS if locking contention is high */
+#define LDLM_FL_DENY_ON_CONTENTION 0x40000000
+
+/* These are flags that are mapped into the flags and ASTs of blocking locks */
+#define LDLM_AST_DISCARD_DATA  0x80000000 /* Add FL_DISCARD to blocking ASTs */
+
+/* Flags sent in AST lock_flags to be mapped into the receiving lock. */
+#define LDLM_AST_FLAGS  (LDLM_FL_DISCARD_DATA)
+
+/*
+ * --------------------------------------------------------------------------
+ * NOTE: starting from this point, LDLM_FL_* flags with values above
+ * 0x80000000 are not sent over the wire.
+ * --------------------------------------------------------------------------
+ */
+
+/**
+ * Declaration of flags not sent through the wire.
+ **/
+
+/**
+ * Used for marking lock as a target for -EINTR while cp_ast sleep
+ * emulation + race with upcoming bl_ast.
+ */
+#define LDLM_FL_FAIL_LOC       0x100000000ULL
+
+/**
+ * Used while processing the unused list to know that we have already
+ * handled this lock and decided to skip it.
+ */
+#define LDLM_FL_SKIPPED        0x200000000ULL
+/* this lock is being destroyed */
+#define LDLM_FL_CBPENDING      0x400000000ULL
+/* not a real flag, not saved in lock */
+#define LDLM_FL_WAIT_NOREPROC  0x800000000ULL
+/* cancellation callback already run */
+#define LDLM_FL_CANCEL  0x1000000000ULL
+#define LDLM_FL_LOCAL_ONLY     0x2000000000ULL
+/* don't run the cancel callback under ldlm_cli_cancel_unused */
+#define LDLM_FL_FAILED  0x4000000000ULL
+/* lock cancel has already been sent */
+#define LDLM_FL_CANCELING      0x8000000000ULL
+/* local lock (ie, no srv/cli split) */
+#define LDLM_FL_LOCAL    0x10000000000ULL
+/* XXX FIXME: This is being added to b_size as a low-risk fix to the fact that
+ * the LVB filling happens _after_ the lock has been granted, so another thread
+ * can match it before the LVB has been updated.  As a dirty hack, we set
+ * LDLM_FL_LVB_READY only after we've done the LVB poop.
+ * this is only needed on LOV/OSC now, where LVB is actually used and callers
+ * must set it in input flags.
+ *
+ * The proper fix is to do the granting inside of the completion AST, which can
+ * be replaced with a LVB-aware wrapping function for OSC locks.  That change is
+ * pretty high-risk, though, and would need a lot more testing. */
+#define LDLM_FL_LVB_READY      0x20000000000ULL
+/* A lock contributes to the known minimum size (KMS) calculation until it has
+ * finished the part of its cancelation that performs write back on its dirty
+ * pages.  It can remain on the granted list during this whole time.  Threads
+ * racing to update the KMS after performing their writeback need to know to
+ * exclude each other's locks from the calculation as they walk the granted
+ * list. */
+#define LDLM_FL_KMS_IGNORE     0x40000000000ULL
+/* completion AST to be executed */
+#define LDLM_FL_CP_REQD        0x80000000000ULL
+/* cleanup_resource has already handled the lock */
+#define LDLM_FL_CLEANED        0x100000000000ULL
+/* optimization hint: LDLM can run blocking callback from current context
+ * w/o involving separate thread. in order to decrease cs rate */
+#define LDLM_FL_ATOMIC_CB      0x200000000000ULL
+
+/* It may happen that a client initiates two operations, e.g. unlink and
+ * mkdir, such that the server sends a blocking AST for conflicting
+ * locks to this client for the first operation, whereas the second
+ * operation has canceled this lock and is waiting for rpc_lock which is
+ * taken by the first operation. LDLM_FL_BL_AST is set by
+ * ldlm_callback_handler() in the lock to prevent the Early Lock Cancel
+ * (ELC) code from cancelling it.
+ *
+ * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock
+ * cache is dropped to let ldlm_callback_handler() return EINVAL to the
+ * server. It is used when ELC RPC is already prepared and is waiting
+ * for rpc_lock, too late to send a separate CANCEL RPC. */
+#define LDLM_FL_BL_AST   0x400000000000ULL
+#define LDLM_FL_BL_DONE         0x800000000000ULL
+/* Don't put lock into the LRU list, so that it is not canceled due to aging.
+ * Used by MGC locks, they are cancelled only at unmount or by callback. */
+#define LDLM_FL_NO_LRU         0x1000000000000ULL
+
+/**
+ * The blocking callback is overloaded to perform two functions.  These flags
+ * indicate which operation should be performed.
+ */
+#define LDLM_CB_BLOCKING    1
+#define LDLM_CB_CANCELING   2
+
+/**
+ * \name Lock Compatibility Matrix.
+ *
+ * A lock has both a type (extent, flock, inode bits, or plain) and a mode.
+ * Lock types are described in their respective implementation files:
+ * ldlm_{extent,flock,inodebits,plain}.c.
+ *
+ * There are six lock modes along with a compatibility matrix to indicate if
+ * two locks are compatible.
+ *
+ * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock
+ *   on the parent.
+ * - PW: Protective Write (normal write) mode. When a client requests a write
+ *   lock from an OST, a lock with PW mode will be issued.
+ * - PR: Protective Read (normal read) mode. When a client requests a read from
+ *   an OST, a lock with PR mode will be issued. Also, if the client opens a
+ *   file for execution, it is granted a lock with PR mode.
+ * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client
+ *   requests a write lock during a file open operation.
+ * - CR: Concurrent Read mode. When a client performs a path lookup, the MDS
+ *   grants an inodebits lock with the CR mode on the intermediate path
+ *   component.
+ * - NL: Null mode.
+ *
+ * <PRE>
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * </PRE>
+ */
+/** @{ */
+#define LCK_COMPAT_EX  LCK_NL
+#define LCK_COMPAT_PW  (LCK_COMPAT_EX | LCK_CR)
+#define LCK_COMPAT_PR  (LCK_COMPAT_PW | LCK_PR)
+#define LCK_COMPAT_CW  (LCK_COMPAT_PW | LCK_CW)
+#define LCK_COMPAT_CR  (LCK_COMPAT_CW | LCK_PR | LCK_PW)
+#define LCK_COMPAT_NL  (LCK_COMPAT_CR | LCK_EX | LCK_GROUP)
+#define LCK_COMPAT_GROUP  (LCK_GROUP | LCK_NL)
+#define LCK_COMPAT_COS (LCK_COS)
+/** @} Lock Compatibility Matrix */
+
+extern ldlm_mode_t lck_compat_array[];
+
+static inline void lockmode_verify(ldlm_mode_t mode)
+{
+       LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE);
+}
+
+static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode)
+{
+       return (lck_compat_array[exist_mode] & new_mode);
+}
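A quick, purely illustrative use of the matrix: two protected-read locks are compatible, while a protected-write lock conflicts with a protected-read lock.

	int rr = lockmode_compat(LCK_PR, LCK_PR); /* non-zero: PR/PR is 1 in the matrix */
	int wr = lockmode_compat(LCK_PW, LCK_PR); /* zero: PW excludes PR */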
+
+/*
+ *
+ * cluster name spaces
+ *
+ */
+
+#define DLM_OST_NAMESPACE 1
+#define DLM_MDS_NAMESPACE 2
+
+/* XXX
+   - do we just separate this by security domains and use a prefix for
+     multiple namespaces in the same domain?
+   -
+*/
+
+/**
+ * Locking rules for LDLM:
+ *
+ * lr_lock
+ *
+ * lr_lock
+ *     waiting_locks_spinlock
+ *
+ * lr_lock
+ *     led_lock
+ *
+ * lr_lock
+ *     ns_lock
+ *
+ * lr_lvb_mutex
+ *     lr_lock
+ *
+ */
+
+struct ldlm_pool;
+struct ldlm_lock;
+struct ldlm_resource;
+struct ldlm_namespace;
+
+/**
+ * Operations on LDLM pools.
+ * LDLM pool is a pool of locks in the namespace without any implicitly
+ * specified limits.
+ * Locks in the pool are organized in LRU.
+ * Local memory pressure or server instructions (e.g. mempressure on server)
+ * can trigger freeing of locks from the pool
+ */
+struct ldlm_pool_ops {
+       /** Recalculate pool \a pl usage */
+       int (*po_recalc)(struct ldlm_pool *pl);
+       /** Cancel at least \a nr locks from pool \a pl */
+       int (*po_shrink)(struct ldlm_pool *pl, int nr,
+                        unsigned int gfp_mask);
+       int (*po_setup)(struct ldlm_pool *pl, int limit);
+};
+
+/** One second for pools thread check interval. Each pool has its own period. */
+#define LDLM_POOLS_THREAD_PERIOD (1)
+
+/** ~6% margin for modest pools. See ldlm_pool.c for details. */
+#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4)
+
+/** Default recalc period for server side pools in sec. */
+#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
+
+/** Default recalc period for client side pools in sec. */
+#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
+
+/**
+ * LDLM pool structure to track granted locks.
+ * It is used to determine when to release locks under e.g. memory pressure.
+ * This feature is commonly referred to as lru_resize.
+ */
+struct ldlm_pool {
+       /** Pool proc directory. */
+       proc_dir_entry_t        *pl_proc_dir;
+       /** Pool name, must be long enough to hold compound proc entry name. */
+       char                    pl_name[100];
+       /** Lock for protecting SLV/CLV updates. */
+       spinlock_t              pl_lock;
+       /** Number of allowed locks in pool, both client and server side. */
+       atomic_t                pl_limit;
+       /** Number of granted locks in the pool */
+       atomic_t                pl_granted;
+       /** Grant rate per T. */
+       atomic_t                pl_grant_rate;
+       /** Cancel rate per T. */
+       atomic_t                pl_cancel_rate;
+       /** Server lock volume (SLV). Protected by pl_lock. */
+       __u64                   pl_server_lock_volume;
+       /** Current biggest client lock volume. Protected by pl_lock. */
+       __u64                   pl_client_lock_volume;
+       /** Lock volume factor. SLV on client is calculated as follows:
+        *  server_slv * lock_volume_factor. */
+       atomic_t                pl_lock_volume_factor;
+       /** Time when last SLV from server was obtained. */
+       time_t                  pl_recalc_time;
+       /** Recalculation period for pool. */
+       time_t                  pl_recalc_period;
+       /** Recalculation and shrink operations. */
+       struct ldlm_pool_ops    *pl_ops;
+       /** Number of planned locks for next period. */
+       int                     pl_grant_plan;
+       /** Pool statistics. */
+       struct lprocfs_stats    *pl_stats;
+};
+
+typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
+                              void *req_cookie, ldlm_mode_t mode, __u64 flags,
+                              void *data);
+
+typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+
+/**
+ * LVB operations.
+ * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could
+ * be associated with an LDLM lock and transferred from client to server and
+ * back.
+ *
+ * Currently LVBs are used by:
+ *  - OSC-OST code to maintain current object size/times
+ *  - layout lock code to return the layout when the layout lock is granted
+ */
+struct ldlm_valblock_ops {
+       int (*lvbo_init)(struct ldlm_resource *res);
+       int (*lvbo_update)(struct ldlm_resource *res,
+                          struct ptlrpc_request *r,
+                          int increase);
+       int (*lvbo_free)(struct ldlm_resource *res);
+       /* Return the size of the LVB data so that an appropriately
+        * sized RPC buffer can be reserved */
+       int (*lvbo_size)(struct ldlm_lock *lock);
+       /* Called to fill in lvb data to RPC buffer @buf */
+       int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen);
+};
+
+/**
+ * LDLM pools related, type of lock pool in the namespace.
+ * Greedy means release cached locks aggressively
+ */
+typedef enum {
+       LDLM_NAMESPACE_GREEDY = 1 << 0,
+       LDLM_NAMESPACE_MODEST = 1 << 1
+} ldlm_appetite_t;
+
+/**
+ * Default values for the "max_nolock_size", "contention_time" and
+ * "contended_locks" namespace tunables.
+ */
+#define NS_DEFAULT_MAX_NOLOCK_BYTES 0
+#define NS_DEFAULT_CONTENTION_SECONDS 2
+#define NS_DEFAULT_CONTENDED_LOCKS 32
+
+struct ldlm_ns_bucket {
+       /** back pointer to namespace */
+       struct ldlm_namespace      *nsb_namespace;
+       /**
+        * Estimated lock callback time.  Used by adaptive timeout code to
+        * avoid spurious client evictions due to unresponsiveness when in
+        * fact the network or overall system load is at fault
+        */
+       struct adaptive_timeout     nsb_at_estimate;
+};
+
+enum {
+       /** LDLM namespace lock stats */
+       LDLM_NSS_LOCKS    = 0,
+       LDLM_NSS_LAST
+};
+
+typedef enum {
+       /** invalid type */
+       LDLM_NS_TYPE_UNKNOWN    = 0,
+       /** mdc namespace */
+       LDLM_NS_TYPE_MDC,
+       /** mds namespace */
+       LDLM_NS_TYPE_MDT,
+       /** osc namespace */
+       LDLM_NS_TYPE_OSC,
+       /** ost namespace */
+       LDLM_NS_TYPE_OST,
+       /** mgc namespace */
+       LDLM_NS_TYPE_MGC,
+       /** mgs namespace */
+       LDLM_NS_TYPE_MGT,
+} ldlm_ns_type_t;
+
+/**
+ * LDLM Namespace.
+ *
+ * Namespace serves to contain locks related to a particular service.
+ * There are two kinds of namespaces:
+ * - Server namespace has knowledge of all locks and is therefore authoritative
+ *   to make decisions like what locks could be granted and what conflicts
+ *   exist during new lock enqueue.
+ * - Client namespace only has limited knowledge about locks in the namespace,
+ *   only seeing locks held by the client.
+ *
+ * Every Lustre service has one server namespace present on the server serving
+ * that service. Every client connected to the service has a client namespace
+ * for it.
+ * Every lock obtained by client in that namespace is actually represented by
+ * two in-memory locks. One on the server and one on the client. The locks are
+ * linked by a special cookie by which one node can tell the other which lock
+ * it actually means during communications. Such locks are called remote locks.
+ * The locks held by server only without any reference to a client are called
+ * local locks.
+ */
+struct ldlm_namespace {
+       /** Backward link to OBD, required for LDLM pool to store new SLV. */
+       struct obd_device       *ns_obd;
+
+       /** Flag indicating if namespace is on client instead of server */
+       ldlm_side_t             ns_client;
+
+       /** Resource hash table for namespace. */
+       cfs_hash_t              *ns_rs_hash;
+
+       /** Spinlock serializing access to namespace state such as the LRU list */
+       spinlock_t              ns_lock;
+
+       /** big refcount (by bucket) */
+       atomic_t                ns_bref;
+
+       /**
+        * Namespace connect flags supported by server (may be changed via
+        * /proc, LRU resize may be disabled/enabled).
+        */
+       __u64                   ns_connect_flags;
+
+       /** Client side original connect flags supported by server. */
+       __u64                   ns_orig_connect_flags;
+
+       /**
+        * Position in global namespace list linking all namespaces on
+        * the node.
+        */
+       struct list_head                ns_list_chain;
+
+       /**
+        * List of unused locks for this namespace. This list is also called
+        * LRU lock list.
+        * Unused locks are locks with zero reader/writer reference counts.
+        * This list is only used on clients for lock caching purposes.
+        * When we want to release some locks voluntarily or if server wants
+        * us to release some locks due to e.g. memory pressure, we take locks
+        * to release from the head of this list.
+        * Locks are linked via l_lru field in \see struct ldlm_lock.
+        */
+       struct list_head                ns_unused_list;
+       /** Number of locks in the LRU list above */
+       int                     ns_nr_unused;
+
+       /**
+        * Maximum number of locks permitted in the LRU. If 0, locks are
+        * managed by pools and there is no preset limit; it is instead
+        * controlled by available memory on this client and on the server.
+        */
+       unsigned int            ns_max_unused;
+       /** Maximum allowed age (last used time) for locks in the LRU */
+       unsigned int            ns_max_age;
+       /**
+        * Server only: number of times we evicted clients due to lack of reply
+        * to ASTs.
+        */
+       unsigned int            ns_timeouts;
+       /**
+        * Number of seconds since the file change time after which the
+        * MDT will return an UPDATE lock along with a LOOKUP lock.
+        * This allows the client to start caching negative dentries
+        * for a directory and may save an RPC for a later stat.
+        */
+       unsigned int            ns_ctime_age_limit;
+
+       /**
+        * Used to rate-limit ldlm_namespace_dump calls.
+        * \see ldlm_namespace_dump. Increased by 10 seconds every time
+        * it is called.
+        */
+       cfs_time_t              ns_next_dump;
+
+       /** "policy" function that does actual lock conflict determination */
+       ldlm_res_policy         ns_policy;
+
+       /**
+        * LVB operations for this namespace.
+        * \see struct ldlm_valblock_ops
+        */
+       struct ldlm_valblock_ops *ns_lvbo;
+
+       /**
+        * Used by filter code to store pointer to OBD of the service.
+        * Should be dropped in favor of \a ns_obd
+        */
+       void                    *ns_lvbp;
+
+       /**
+        * Wait queue used by __ldlm_namespace_free. Gets woken up every time
+        * a resource is removed.
+        */
+       wait_queue_head_t               ns_waitq;
+       /** LDLM pool structure for this namespace */
+       struct ldlm_pool        ns_pool;
+       /** Definition of how eagerly unused locks will be released from LRU */
+       ldlm_appetite_t         ns_appetite;
+
+       /**
+        * If more than \a ns_contended_locks are found, the resource is
+        * considered to be contended. Lock enqueues might specify that no
+        * contended locks should be granted
+        */
+       unsigned                ns_contended_locks;
+
+       /**
+        * The resources in this namespace remember contended state during
+        * \a ns_contention_time, in seconds.
+        */
+       unsigned                ns_contention_time;
+
+       /**
+        * Limit size of contended extent locks, in bytes.
+        * If an extent lock is requested for more than this many bytes and the
+        * caller instructs us not to grant contended locks, we would disregard
+        * such a request.
+        */
+       unsigned                ns_max_nolock_size;
+
+       /** Limit of parallel AST RPC count. */
+       unsigned                ns_max_parallel_ast;
+
+       /** Callback to cancel locks before replaying them during recovery. */
+       ldlm_cancel_for_recovery ns_cancel_for_recovery;
+
+       /** LDLM lock stats */
+       struct lprocfs_stats    *ns_stats;
+
+       /**
+        * Flag to indicate namespace is being freed. Used to determine if
+        * recalculation of LDLM pool statistics should be skipped.
+        */
+       unsigned                ns_stopping:1;
+};
+
+/**
+ * Returns 1 if namespace \a ns is a client namespace.
+ */
+static inline int ns_is_client(struct ldlm_namespace *ns)
+{
+       LASSERT(ns != NULL);
+       LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+                                   LDLM_NAMESPACE_SERVER)));
+       LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+               ns->ns_client == LDLM_NAMESPACE_SERVER);
+       return ns->ns_client == LDLM_NAMESPACE_CLIENT;
+}
+
+/**
+ * Returns 1 if namespace \a ns is a server namespace.
+ */
+static inline int ns_is_server(struct ldlm_namespace *ns)
+{
+       LASSERT(ns != NULL);
+       LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+                                   LDLM_NAMESPACE_SERVER)));
+       LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+               ns->ns_client == LDLM_NAMESPACE_SERVER);
+       return ns->ns_client == LDLM_NAMESPACE_SERVER;
+}
+
+/**
+ * Returns 1 if namespace \a ns supports early lock cancel (ELC).
+ */
+static inline int ns_connect_cancelset(struct ldlm_namespace *ns)
+{
+       LASSERT(ns != NULL);
+       return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET);
+}
+
+/**
+ * Returns 1 if this namespace supports lru_resize.
+ */
+static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
+{
+       LASSERT(ns != NULL);
+       return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE);
+}
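+
+/*
+ * Sketch: client code typically uses these predicates to gate optional
+ * protocol features, e.g. only packing early lock cancels into a request
+ * when the server advertised support (exp, req, cancels and count here are
+ * the caller's own variables):
+ *
+ *	if (ns_connect_cancelset(ns))
+ *		rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
+ */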
+
+static inline void ns_register_cancel(struct ldlm_namespace *ns,
+                                     ldlm_cancel_for_recovery arg)
+{
+       LASSERT(ns != NULL);
+       ns->ns_cancel_for_recovery = arg;
+}
+
+struct ldlm_lock;
+
+/** Type for blocking callback function of a lock. */
+typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
+                                     struct ldlm_lock_desc *new, void *data,
+                                     int flag);
+/** Type for completion callback function of a lock. */
+typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags,
+                                       void *data);
+/** Type for glimpse callback function of a lock. */
+typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
+/** Type for weight callback function of a lock. */
+typedef unsigned long (*ldlm_weigh_callback)(struct ldlm_lock *lock);
+
+/** Work list for sending GL ASTs to multiple locks. */
+struct ldlm_glimpse_work {
+       struct ldlm_lock        *gl_lock; /* lock to glimpse */
+       struct list_head                 gl_list; /* linkage to other gl work structs */
+       __u32                    gl_flags;/* see LDLM_GL_WORK_* below */
+       union ldlm_gl_desc      *gl_desc; /* glimpse descriptor to be packed in
+                                          * glimpse callback request */
+};
+
+/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */
+#define LDLM_GL_WORK_NOFREE 0x1
+
+/** Interval node data for each LDLM_EXTENT lock. */
+struct ldlm_interval {
+       struct interval_node    li_node;  /* node for tree management */
+       struct list_head                li_group; /* the locks which have the same
+                                          * policy - group of the policy */
+};
+#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
+
+/**
+ * Interval tree for extent locks.
+ * The interval tree must be accessed under the resource lock.
+ * Interval trees are used for granted extent locks to speed up conflict
+ * lookup. See ldlm/interval_tree.c for more details.
+ */
+struct ldlm_interval_tree {
+       /** Tree size. */
+       int                     lit_size;
+       ldlm_mode_t             lit_mode;  /* lock mode */
+       struct interval_node    *lit_root; /* actual ldlm_interval */
+};
+
+/** Whether to track references to exports by LDLM locks. */
+#define LUSTRE_TRACKS_LOCK_EXP_REFS (0)
+
+/** Cancel flags. */
+typedef enum {
+       LCF_ASYNC      = 0x1, /* Cancel locks asynchronously. */
+       LCF_LOCAL      = 0x2, /* Cancel locks locally, not notifying server */
+       LCF_BL_AST     = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST
+                              * in the same RPC */
+} ldlm_cancel_flags_t;
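+
+/*
+ * Sketch: the flags may be combined, e.g. dropping a lock from the local
+ * namespace without notifying the server and without waiting for the
+ * cancel to complete (lockh being a handle held by the caller):
+ *
+ *	ldlm_cli_cancel(lockh, LCF_ASYNC | LCF_LOCAL);
+ */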
+
+struct ldlm_flock {
+       __u64 start;
+       __u64 end;
+       __u64 owner;
+       __u64 blocking_owner;
+       struct obd_export *blocking_export;
+       /* Protected by the hash lock */
+       __u32 blocking_refs;
+       __u32 pid;
+};
+
+typedef union {
+       struct ldlm_extent l_extent;
+       struct ldlm_flock l_flock;
+       struct ldlm_inodebits l_inodebits;
+} ldlm_policy_data_t;
+
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+                                const ldlm_policy_data_t *lpolicy,
+                                ldlm_wire_policy_data_t *wpolicy);
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+                                 const ldlm_wire_policy_data_t *wpolicy,
+                                 ldlm_policy_data_t *lpolicy);
+
+enum lvb_type {
+       LVB_T_NONE      = 0,
+       LVB_T_OST       = 1,
+       LVB_T_LQUOTA    = 2,
+       LVB_T_LAYOUT    = 3,
+};
+
+/**
+ * LDLM lock structure
+ *
+ * Represents a single LDLM lock and its state in memory. Each lock is
+ * associated with a single ldlm_resource, the object which is being
+ * locked. There may be multiple ldlm_locks on a single resource,
+ * depending on the lock type and whether the locks are conflicting or
+ * not.
+ */
+struct ldlm_lock {
+       /**
+        * Local lock handle.
+        * When remote side wants to tell us about a lock, they address
+        * it by this opaque handle.  The handle does not hold a
+        * reference on the ldlm_lock, so it can be safely passed to
+        * other threads or nodes. When the lock needs to be accessed
+        * from the handle, it is looked up again in the lock table, and
+        * may no longer exist.
+        *
+        * Must be first in the structure.
+        */
+       struct portals_handle   l_handle;
+       /**
+        * Lock reference count.
+        * This is how many users have pointers to actual structure, so that
+        * we do not accidentally free lock structure that is in use.
+        */
+       atomic_t                l_refc;
+       /**
+        * Internal spinlock protects l_resource.  We should hold this lock
+        * first before taking res_lock.
+        */
+       spinlock_t              l_lock;
+       /**
+        * Pointer to actual resource this lock is in.
+        * ldlm_lock_change_resource() can change this.
+        */
+       struct ldlm_resource    *l_resource;
+       /**
+        * List item for client side LRU list.
+        * Protected by ns_lock in struct ldlm_namespace.
+        */
+       struct list_head                l_lru;
+       /**
+        * Linkage to resource's lock queues according to current lock state.
+        * (could be granted, waiting or converting)
+        * Protected by lr_lock in struct ldlm_resource.
+        */
+       struct list_head                l_res_link;
+       /**
+        * Tree node for ldlm_extent.
+        */
+       struct ldlm_interval    *l_tree_node;
+       /**
+        * Per export hash of locks.
+        * Protected by per-bucket exp->exp_lock_hash locks.
+        */
+       struct hlist_node       l_exp_hash;
+       /**
+        * Per export hash of flock locks.
+        * Protected by per-bucket exp->exp_flock_hash locks.
+        */
+       struct hlist_node       l_exp_flock_hash;
+       /**
+        * Requested mode.
+        * Protected by lr_lock.
+        */
+       ldlm_mode_t             l_req_mode;
+       /**
+        * Granted mode, also protected by lr_lock.
+        */
+       ldlm_mode_t             l_granted_mode;
+       /** Lock completion handler pointer. Called when lock is granted. */
+       ldlm_completion_callback l_completion_ast;
+       /**
+        * Lock blocking AST handler pointer.
+        * It plays two roles:
+        * - as a notification of an attempt to queue a conflicting lock (once)
+        * - as a notification when the lock is being cancelled.
+        *
+        * As such it's typically called twice: once for the initial conflict
+        * and then once more when the last user went away and the lock is
+        * cancelled (could happen recursively).
+        */
+       ldlm_blocking_callback  l_blocking_ast;
+       /**
+        * Lock glimpse handler.
+        * Glimpse handler is used by the server to obtain LVB updates from
+        * a client.
+        */
+       ldlm_glimpse_callback   l_glimpse_ast;
+
+       /** XXX apparently unused "weight" handler. To be removed? */
+       ldlm_weigh_callback     l_weigh_ast;
+
+       /**
+        * Lock export.
+        * This is a pointer to actual client export for locks that were granted
+        * to clients. Used server-side.
+        */
+       struct obd_export       *l_export;
+       /**
+        * Lock connection export.
+        * Pointer to server export on a client.
+        */
+       struct obd_export       *l_conn_export;
+
+       /**
+        * Remote lock handle.
+        * If the lock is remote, this is the handle of the other side lock
+        * (l_handle)
+        */
+       struct lustre_handle    l_remote_handle;
+
+       /**
+        * Representation of private data specific for a lock type.
+        * Examples are: extent range for extent lock or bitmask for ibits locks
+        */
+       ldlm_policy_data_t      l_policy_data;
+
+       /**
+        * Lock state flags.
+        * E.g. whether any blocking requests have been received for this lock.
+        * Protected by lr_lock.
+        */
+       __u64                   l_flags;
+       /**
+        * Lock r/w usage counters.
+        * Protected by lr_lock.
+        */
+       __u32                   l_readers;
+       __u32                   l_writers;
+       /**
+        * If the lock is granted, a process sleeps on this waitq to learn when
+        * it's no longer in use.  If the lock is not granted, a process sleeps
+        * on this waitq to learn when it becomes granted.
+        */
+       wait_queue_head_t               l_waitq;
+
+       /**
+        * Seconds. It will be updated if there is any activity related to
+        * the lock, e.g. enqueue the lock or send blocking AST.
+        */
+       cfs_time_t              l_last_activity;
+
+       /**
+        * Time last used by e.g. being matched by lock match.
+        * Jiffies. Should be converted to time if needed.
+        */
+       cfs_time_t              l_last_used;
+
+       /** Originally requested extent for the extent lock. */
+       struct ldlm_extent      l_req_extent;
+
+       unsigned int            l_failed:1,
+       /**
+        * Set for locks that were removed from class hash table and will be
+        * destroyed when last reference to them is released. Set by
+        * ldlm_lock_destroy_internal().
+        *
+        * Protected by lock and resource locks.
+        */
+                               l_destroyed:1,
+       /*
+        * it's set in lock_res_and_lock() and unset in unlock_res_and_lock().
+        *
+        * NB: compared with check_res_locked(), checking this bit is cheaper.
+        * Also, spin_is_locked() is deprecated for kernel code; one reason is
+        * that it only works on SMP, so users need to add extra macros like
+        * LASSERT_SPIN_LOCKED for uniprocessor kernels.
+        */
+                               l_res_locked:1,
+       /*
+        * It's set once we call ldlm_add_waiting_lock_res_locked()
+        * to start the lock-timeout timer and it will never be reset.
+        *
+        * Protected by lock_res_and_lock().
+        */
+                               l_waited:1,
+       /** Flag whether this is a server namespace lock. */
+                               l_ns_srv:1;
+
+       /*
+        * Client-side-only members.
+        */
+
+       enum lvb_type         l_lvb_type;
+
+       /**
+        * Temporary storage for a LVB received during an enqueue operation.
+        */
+       __u32                   l_lvb_len;
+       void                    *l_lvb_data;
+
+       /** Private storage for lock user. Opaque to LDLM. */
+       void                    *l_ast_data;
+
+       /*
+        * Server-side-only members.
+        */
+
+       /**
+        * Connection cookie for the client originating the operation.
+        * Used by Commit on Share (COS) code. Currently only used for
+        * inodebits locks on MDS.
+        */
+       __u64                   l_client_cookie;
+
+       /**
+        * List item for locks waiting for cancellation from clients.
+        * The lists this could be linked into are:
+        * waiting_locks_list (protected by waiting_locks_spinlock),
+        * then if the lock timed out, it is moved to
+        * expired_lock_thread.elt_expired_locks for further processing.
+        * Protected by elt_lock.
+        */
+       struct list_head                l_pending_chain;
+
+       /**
+        * Set when lock is sent a blocking AST. Time in seconds when timeout
+        * is reached and client holding this lock could be evicted.
+        * This timeout could be further extended by e.g. certain IO activity
+        * under this lock.
+        * \see ost_rw_prolong_locks
+        */
+       cfs_time_t              l_callback_timeout;
+
+       /** Local PID of process which created this lock. */
+       __u32                   l_pid;
+
+       /**
+        * Number of times blocking AST was sent for this lock.
+        * This is for debugging. Valid values are 0 and 1; if there is an
+        * attempt to send a blocking AST more than once, an assertion would be
+        * hit. \see ldlm_work_bl_ast_lock
+        */
+       int                     l_bl_ast_run;
+       /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */
+       struct list_head                l_bl_ast;
+       /** List item ldlm_add_ast_work_item() for case of completion ASTs. */
+       struct list_head                l_cp_ast;
+       /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */
+       struct list_head                l_rk_ast;
+
+       /**
+        * Pointer to a conflicting lock that caused blocking AST to be sent
+        * for this lock
+        */
+       struct ldlm_lock        *l_blocking_lock;
+
+       /**
+        * Protected by lr_lock, linkages to "skip lists".
+        * For more explanations of skip lists see ldlm/ldlm_inodebits.c
+        */
+       struct list_head                l_sl_mode;
+       struct list_head                l_sl_policy;
+
+       /** Reference tracking structure to debug leaked locks. */
+       struct lu_ref           l_reference;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       /* Debugging stuff for bug 20498, for tracking export references. */
+       /** number of export references taken */
+       int                     l_exp_refs_nr;
+       /** link all locks referencing one export */
+       struct list_head                l_exp_refs_link;
+       /** referenced export object */
+       struct obd_export       *l_exp_refs_target;
+#endif
+       /**
+        * export blocking dlm lock list, protected by
+        * l_export->exp_bl_list_lock.
+        * Lock order of waiting_locks_spinlock, exp_bl_list_lock and res lock
+        * is: res lock -> exp_bl_list_lock -> waiting_locks_spinlock.
+        */
+       struct list_head                l_exp_list;
+};
+
+/**
+ * LDLM resource description.
+ * Basically, resource is a representation for a single object.
+ * Object has a name which is currently 4 64-bit integers. LDLM user is
+ * responsible for creation of a mapping between objects it wants to be
+ * protected and resource names.
+ *
+ * A resource can only hold locks of a single lock type, though there may be
+ * multiple ldlm_locks on a single resource, depending on the lock type and
+ * whether the locks are conflicting or not.
+ */
+struct ldlm_resource {
+       struct ldlm_ns_bucket   *lr_ns_bucket;
+
+       /**
+        * List item for list in namespace hash.
+        * protected by ns_lock
+        */
+       struct hlist_node       lr_hash;
+
+       /** Spinlock to protect locks under this resource. */
+       spinlock_t              lr_lock;
+
+       /**
+        * protected by lr_lock
+        * @{ */
+       /** List of locks in granted state */
+       struct list_head                lr_granted;
+       /** List of locks waiting to change their granted mode (converted) */
+       struct list_head                lr_converting;
+       /**
+        * List of locks that could not be granted due to conflicts and
+        * that are waiting for conflicts to go away */
+       struct list_head                lr_waiting;
+       /** @} */
+
+       /* XXX No longer needed? Remove ASAP */
+       ldlm_mode_t             lr_most_restr;
+
+       /** Type of locks this resource can hold. Only one type per resource. */
+       ldlm_type_t             lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */
+
+       /** Resource name */
+       struct ldlm_res_id      lr_name;
+       /** Reference count for this resource */
+       atomic_t                lr_refcount;
+
+       /**
+        * Interval trees (only for extent locks) for all modes of this resource
+        */
+       struct ldlm_interval_tree lr_itree[LCK_MODE_NUM];
+
+       /**
+        * Server-side-only lock value block elements.
+        * To serialize lvbo_init.
+        */
+       struct mutex            lr_lvb_mutex;
+       int                     lr_lvb_len;
+       /** protected by lr_lock */
+       void                    *lr_lvb_data;
+
+       /** When the resource was last considered contended. */
+       cfs_time_t              lr_contention_time;
+       /** List of references to this resource. For debugging. */
+       struct lu_ref           lr_reference;
+
+       struct inode            *lr_lvb_inode;
+};
+
+static inline bool ldlm_has_layout(struct ldlm_lock *lock)
+{
+       return lock->l_resource->lr_type == LDLM_IBITS &&
+               lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT;
+}
+
+static inline char *
+ldlm_ns_name(struct ldlm_namespace *ns)
+{
+       return ns->ns_rs_hash->hs_name;
+}
+
+static inline struct ldlm_namespace *
+ldlm_res_to_ns(struct ldlm_resource *res)
+{
+       return res->lr_ns_bucket->nsb_namespace;
+}
+
+static inline struct ldlm_namespace *
+ldlm_lock_to_ns(struct ldlm_lock *lock)
+{
+       return ldlm_res_to_ns(lock->l_resource);
+}
+
+static inline char *
+ldlm_lock_to_ns_name(struct ldlm_lock *lock)
+{
+       return ldlm_ns_name(ldlm_lock_to_ns(lock));
+}
+
+static inline struct adaptive_timeout *
+ldlm_lock_to_ns_at(struct ldlm_lock *lock)
+{
+       return &lock->l_resource->lr_ns_bucket->nsb_at_estimate;
+}
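+
+/*
+ * Sketch: the accessors above let lock-side code reach namespace state
+ * without open-coding the pointer chain, e.g. in a debug message:
+ *
+ *	CDEBUG(D_DLMTRACE, "lock %p in namespace %s\n",
+ *	       lock, ldlm_lock_to_ns_name(lock));
+ */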
+
+static inline int ldlm_lvbo_init(struct ldlm_resource *res)
+{
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+       if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_init != NULL)
+               return ns->ns_lvbo->lvbo_init(res);
+
+       return 0;
+}
+
+static inline int ldlm_lvbo_size(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL)
+               return ns->ns_lvbo->lvbo_size(lock);
+
+       return 0;
+}
+
+static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       if (ns->ns_lvbo != NULL) {
+               LASSERT(ns->ns_lvbo->lvbo_fill != NULL);
+               return ns->ns_lvbo->lvbo_fill(lock, buf, len);
+       }
+       return 0;
+}
+
+struct ldlm_ast_work {
+       struct ldlm_lock      *w_lock;
+       int                 w_blocking;
+       struct ldlm_lock_desc  w_desc;
+       struct list_head             w_list;
+       int                 w_flags;
+       void              *w_data;
+       int                 w_datalen;
+};
+
+/**
+ * Common ldlm_enqueue parameters
+ */
+struct ldlm_enqueue_info {
+       __u32 ei_type;   /** Type of the lock being enqueued. */
+       __u32 ei_mode;   /** Mode of the lock being enqueued. */
+       void *ei_cb_bl;  /** blocking lock callback */
+       void *ei_cb_cp;  /** lock completion callback */
+       void *ei_cb_gl;  /** lock glimpse callback */
+       void *ei_cb_wg;  /** lock weigh callback */
+       void *ei_cbdata; /** Data to be passed into callbacks. */
+};
+
+extern struct obd_ops ldlm_obd_ops;
+
+extern char *ldlm_lockname[];
+extern char *ldlm_typename[];
+extern char *ldlm_it2str(int it);
+
+/**
+ * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG.
+ * For the cases where we do not have an actual lock to print along
+ * with an LDLM-related debugging message
+ */
+#define LDLM_DEBUG_NOLOCK(format, a...)                        \
+       CDEBUG(D_DLMTRACE, "### " format "\n" , ##a)
+
+/**
+ * Support function for lock information printing into debug logs.
+ * \see LDLM_DEBUG
+ */
+#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do {      \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                      \
+                                                                       \
+       if (((mask) & D_CANTMASK) != 0 ||                              \
+           ((libcfs_debug & (mask)) != 0 &&                        \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))    \
+               _ldlm_lock_debug(lock, msgdata, fmt, ##a);            \
+} while(0)
+
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+                     struct libcfs_debug_msg_data *data,
+                     const char *fmt, ...)
+       __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Rate-limited version of lock printing function.
+ */
+#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do {                    \
+       static cfs_debug_limit_state_t _ldlm_cdls;                         \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls);       \
+       ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\
+} while (0)
+
+#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a)
+#define LDLM_WARN(lock, fmt, a...)  LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a)
+
+/** Non-rate-limited lock printing function for debugging purposes. */
+#define LDLM_DEBUG(lock, fmt, a...)   do {                               \
+       if (likely(lock != NULL)) {                                         \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL);      \
+               ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock,           \
+                               "### " fmt , ##a);                          \
+       } else {                                                            \
+               LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a);                \
+       }                                                                   \
+} while (0)
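+
+/*
+ * Sketch of typical use: the caller passes the lock (which may be NULL for
+ * LDLM_DEBUG) plus a printf-style message; lock and timeout below are the
+ * caller's own variables:
+ *
+ *	LDLM_DEBUG(lock, "client-side enqueue START");
+ *	LDLM_ERROR(lock, "lock callback timer expired after %ds", timeout);
+ */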
+
+typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags,
+                                     int first_enq, ldlm_error_t *err,
+                                     struct list_head *work_list);
+
+/**
+ * Return values for lock iterators.
+ * Also used when deciding on lock grants and cancellations.
+ */
+#define LDLM_ITER_CONTINUE 1 /* keep iterating */
+#define LDLM_ITER_STOP     2 /* stop iterating */
+
+typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *);
+typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *);
+
+/** \defgroup ldlm_iterator Lock iterators
+ *
+ * LDLM provides for a way to iterate through every lock on a resource or
+ * namespace or every resource in a namespace.
+ * @{ */
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+                         void *closure);
+void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
+                           void *closure);
+int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *,
+                         ldlm_iterator_t iter, void *data);
+/** @} ldlm_iterator */
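+
+/*
+ * Sketch of an iterator callback: count granted locks on a resource.  The
+ * count_granted() helper below is illustrative only and is not declared in
+ * this header:
+ *
+ *	static int count_granted(struct ldlm_lock *lock, void *closure)
+ *	{
+ *		if (lock->l_granted_mode == lock->l_req_mode)
+ *			(*(int *)closure)++;
+ *		return LDLM_ITER_CONTINUE;
+ *	}
+ *
+ *	int granted = 0;
+ *	ldlm_resource_foreach(res, count_granted, &granted);
+ */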
+
+int ldlm_replay_locks(struct obd_import *imp);
+
+/* ldlm_flock.c */
+int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+
+/* ldlm_extent.c */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms);
+
+struct ldlm_callback_suite {
+       ldlm_completion_callback lcs_completion;
+       ldlm_blocking_callback   lcs_blocking;
+       ldlm_glimpse_callback    lcs_glimpse;
+       ldlm_weigh_callback      lcs_weigh;
+};
+
+/* ldlm_lockd.c */
+int ldlm_del_waiting_lock(struct ldlm_lock *lock);
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
+int ldlm_get_ref(void);
+void ldlm_put_ref(void);
+int ldlm_init_export(struct obd_export *exp);
+void ldlm_destroy_export(struct obd_export *exp);
+struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req);
+
+/* ldlm_lock.c */
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg);
+void ldlm_lock2handle(const struct ldlm_lock *lock,
+                     struct lustre_handle *lockh);
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags);
+void ldlm_cancel_callback(struct ldlm_lock *);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *);
+int ldlm_lock_set_data(struct lustre_handle *, void *);
+
+/**
+ * Obtain a lock reference by its handle.
+ */
+static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h)
+{
+       return __ldlm_handle2lock(h, 0);
+}
+
+#define LDLM_LOCK_REF_DEL(lock) \
+       lu_ref_del(&lock->l_reference, "handle", current)
+
+static inline struct ldlm_lock *
+ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags)
+{
+       struct ldlm_lock *lock;
+
+       lock = __ldlm_handle2lock(h, flags);
+       if (lock != NULL)
+               LDLM_LOCK_REF_DEL(lock);
+       return lock;
+}
+
+/**
+ * Update Lock Value Block Operations (LVBO) on a resource taking into account
+ * data from request \a r
+ */
+static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+                                      struct ptlrpc_request *r, int increase)
+{
+       if (ldlm_res_to_ns(res)->ns_lvbo &&
+           ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) {
+               return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r,
+                                                                increase);
+       }
+       return 0;
+}
+
+int ldlm_error2errno(ldlm_error_t error);
+ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this
+                                           * confuses user-space. */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp);
+#endif
+
+/**
+ * Release a temporary lock reference obtained by ldlm_handle2lock() or
+ * __ldlm_handle2lock().
+ */
+#define LDLM_LOCK_PUT(lock)                 \
+do {                                       \
+       LDLM_LOCK_REF_DEL(lock);                \
+       /*LDLM_DEBUG((lock), "put");*/    \
+       ldlm_lock_put(lock);                \
+} while (0)
+
+/**
+ * Release a lock reference obtained by some other means (see
+ * LDLM_LOCK_PUT()).
+ */
+#define LDLM_LOCK_RELEASE(lock)                 \
+do {                                       \
+       /*LDLM_DEBUG((lock), "put");*/    \
+       ldlm_lock_put(lock);                \
+} while (0)
+
+#define LDLM_LOCK_GET(lock)                 \
+({                                           \
+       ldlm_lock_get(lock);                \
+       /*LDLM_DEBUG((lock), "get");*/    \
+       lock;                              \
+})
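+
+/*
+ * Sketch: the usual lookup pattern converts a handle back to a lock, checks
+ * for NULL (the lock may already have been freed), and drops the temporary
+ * reference when done (lockh is the caller's handle; -ESTALE is just an
+ * example error choice):
+ *
+ *	struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+ *	if (lock == NULL)
+ *		return -ESTALE;
+ *	... inspect or update the lock ...
+ *	LDLM_LOCK_PUT(lock);
+ */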
+
+#define ldlm_lock_list_put(head, member, count)                     \
+({                                                               \
+       struct ldlm_lock *_lock, *_next;                            \
+       int c = count;                                        \
+       list_for_each_entry_safe(_lock, _next, head, member) {  \
+               if (c-- == 0)                                  \
+                       break;                                \
+               list_del_init(&_lock->member);            \
+               LDLM_LOCK_RELEASE(_lock);                          \
+       }                                                          \
+       LASSERT(c <= 0);                                            \
+})
+
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+void ldlm_lock_put(struct ldlm_lock *lock);
+void ldlm_lock_destroy(struct ldlm_lock *lock);
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc);
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode);
+int  ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock);
+void ldlm_lock_fail_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock);
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+                           const struct ldlm_res_id *, ldlm_type_t type,
+                           ldlm_policy_data_t *, ldlm_mode_t mode,
+                           struct lustre_handle *, int unref);
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+                                       __u64 *bits);
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+                                       __u32 *flags);
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode);
+void ldlm_lock_cancel(struct ldlm_lock *lock);
+void ldlm_reprocess_all(struct ldlm_resource *res);
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns);
+void ldlm_lock_dump_handle(int level, struct lustre_handle *);
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req);
+
+/* resource.c */
+struct ldlm_namespace *
+ldlm_namespace_new(struct obd_device *obd, char *name,
+                  ldlm_side_t client, ldlm_appetite_t apt,
+                  ldlm_ns_type_t ns_type);
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags);
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+                        struct obd_import *imp, int force);
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client);
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client);
+void ldlm_namespace_get(struct ldlm_namespace *ns);
+void ldlm_namespace_put(struct ldlm_namespace *ns);
+int ldlm_proc_setup(void);
+#ifdef LPROCFS
+void ldlm_proc_cleanup(void);
+#else
+static inline void ldlm_proc_cleanup(void) {}
+#endif
+
+/* resource.c - internal */
+struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns,
+                                       struct ldlm_resource *parent,
+                                       const struct ldlm_res_id *,
+                                       ldlm_type_t type, int create);
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res);
+int ldlm_resource_putref(struct ldlm_resource *res);
+void ldlm_resource_add_lock(struct ldlm_resource *res,
+                           struct list_head *head,
+                           struct ldlm_lock *lock);
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock);
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc);
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level);
+void ldlm_namespace_dump(int level, struct ldlm_namespace *);
+void ldlm_resource_dump(int level, struct ldlm_resource *);
+int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
+                             const struct ldlm_res_id *);
+
+#define LDLM_RESOURCE_ADDREF(res) do {                           \
+       lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current);  \
+} while (0)
+
+#define LDLM_RESOURCE_DELREF(res) do {                           \
+       lu_ref_del(&(res)->lr_reference, __FUNCTION__, current);  \
+} while (0)
+
+/* ldlm_request.c */
+int ldlm_expired_completion_wait(void *data);
+/** \defgroup ldlm_local_ast Default AST handlers for local locks
+ * These AST handlers are typically used for server-side local locks and are
+ * also used by client-side lock handlers to perform minimum level base
+ * processing.
+ * @{ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock);
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                     void *data, int flag);
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp);
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data);
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+/** @} ldlm_local_ast */
+
+/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users.
+ * These are typically used by client and server (*_local versions)
+ * to obtain and release locks.
+ * @{ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+                    struct ldlm_enqueue_info *einfo,
+                    const struct ldlm_res_id *res_id,
+                    ldlm_policy_data_t const *policy, __u64 *flags,
+                    void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+                    struct lustre_handle *lockh, int async);
+int ldlm_prep_enqueue_req(struct obd_export *exp,
+                         struct ptlrpc_request *req,
+                         struct list_head *cancels,
+                         int count);
+int ldlm_prep_elc_req(struct obd_export *exp,
+                     struct ptlrpc_request *req,
+                     int version, int opc, int canceloff,
+                     struct list_head *cancels, int count);
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len);
+int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req,
+                        const struct ldlm_request *dlm_req,
+                        const struct ldlm_callback_suite *cbs);
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+                         ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+                         __u64 *flags, void *lvb, __u32 lvb_len,
+                         struct lustre_handle *lockh, int rc);
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+                          const struct ldlm_res_id *res_id,
+                          ldlm_type_t type, ldlm_policy_data_t *policy,
+                          ldlm_mode_t mode, __u64 *flags,
+                          ldlm_blocking_callback blocking,
+                          ldlm_completion_callback completion,
+                          ldlm_glimpse_callback glimpse,
+                          void *data, __u32 lvb_len, enum lvb_type lvb_type,
+                          const __u64 *client_cookie,
+                          struct lustre_handle *lockh);
+int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new,
+                   void *data, __u32 data_len);
+int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags);
+int ldlm_cli_update_pool(struct ptlrpc_request *req);
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+                   ldlm_cancel_flags_t cancel_flags);
+int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *,
+                          ldlm_cancel_flags_t flags, void *opaque);
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+                                   const struct ldlm_res_id *res_id,
+                                   ldlm_policy_data_t *policy,
+                                   ldlm_mode_t mode,
+                                   ldlm_cancel_flags_t flags,
+                                   void *opaque);
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head,
+                       int count, ldlm_cancel_flags_t flags);
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+                              struct list_head *cancels,
+                              ldlm_policy_data_t *policy,
+                              ldlm_mode_t mode, int lock_flags,
+                              ldlm_cancel_flags_t cancel_flags, void *opaque);
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+                              ldlm_cancel_flags_t flags);
+int ldlm_cli_cancel_list(struct list_head *head, int count,
+                        struct ptlrpc_request *req, ldlm_cancel_flags_t flags);
+/** @} ldlm_cli_api */
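+
+/*
+ * Sketch: flushing all unused cached locks on one resource from a client,
+ * e.g. before the object is destroyed.  ns and res_id are the caller's;
+ * LCK_MINMODE (matching any mode) is assumed to come from the lock mode
+ * definitions earlier in this header:
+ *
+ *	ldlm_cli_cancel_unused_resource(ns, res_id, NULL, LCK_MINMODE,
+ *					LCF_ASYNC, NULL);
+ */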
+
+/* mds/handler.c */
+/* This has to be here because recursive inclusion sucks. */
+int intent_disposition(struct ldlm_reply *rep, int flag);
+void intent_set_disposition(struct ldlm_reply *rep, int flag);
+
+
+/* ioctls for trying requests */
+#define IOC_LDLM_TYPE             'f'
+#define IOC_LDLM_MIN_NR                 40
+
+#define IOC_LDLM_TEST             _IOWR('f', 40, long)
+#define IOC_LDLM_DUMP             _IOWR('f', 41, long)
+#define IOC_LDLM_REGRESS_START   _IOWR('f', 42, long)
+#define IOC_LDLM_REGRESS_STOP     _IOWR('f', 43, long)
+#define IOC_LDLM_MAX_NR                 43
+
+/**
+ * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more
+ * than one lock_res is dead-lock safe.
+ */
+enum lock_res_type {
+       LRT_NORMAL,
+       LRT_NEW
+};
+
+/** Lock resource. */
+static inline void lock_res(struct ldlm_resource *res)
+{
+       spin_lock(&res->lr_lock);
+}
+
+/** Lock resource, with a way to tell lockdep that this nested locking is safe. */
+static inline void lock_res_nested(struct ldlm_resource *res,
+                                  enum lock_res_type mode)
+{
+       spin_lock_nested(&res->lr_lock, mode);
+}
+
+/** Unlock resource. */
+static inline void unlock_res(struct ldlm_resource *res)
+{
+       spin_unlock(&res->lr_lock);
+}
+
+/** Check if resource is already locked, assert if not. */
+static inline void check_res_locked(struct ldlm_resource *res)
+{
+       LASSERT(spin_is_locked(&res->lr_lock));
+}
+
+struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock);
+void unlock_res_and_lock(struct ldlm_lock *lock);
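+
+/*
+ * Sketch: per-resource lists are manipulated under lr_lock, either directly
+ * or via the lock-aware helpers:
+ *
+ *	lock_res(res);
+ *	list_del_init(&lock->l_res_link);
+ *	unlock_res(res);
+ *
+ * or, when starting from a lock rather than a resource:
+ *
+ *	struct ldlm_resource *res = lock_res_and_lock(lock);
+ *	... modify lock/resource state under lr_lock ...
+ *	unlock_res_and_lock(lock);
+ */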
+
+/* ldlm_pool.c */
+/** \defgroup ldlm_pools Various LDLM pool related functions
+ * These are not used outside of ldlm.
+ * @{
+ */
+void ldlm_pools_recalc(ldlm_side_t client);
+int ldlm_pools_init(void);
+void ldlm_pools_fini(void);
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+                  int idx, ldlm_side_t client);
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+                    unsigned int gfp_mask);
+void ldlm_pool_fini(struct ldlm_pool *pl);
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit);
+int ldlm_pool_recalc(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl);
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv);
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv);
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit);
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock);
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock);
+/** @} */
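+
+/*
+ * Sketch: a namespace's pool is normally set up right after the namespace
+ * itself and torn down with it (idx and client being whatever the caller
+ * passed to namespace creation):
+ *
+ *	rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client);
+ *	...
+ *	ldlm_pool_fini(&ns->ns_pool);
+ */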
+
+#endif
+/** @} LDLM */
diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h
new file mode 100644 (file)
index 0000000..b94f76a
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_eacl.h
+ *
+ * Extended ACL data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_EACL_H
+#define _LUSTRE_EACL_H
+
+/** \defgroup eacl eacl
+ *
+ * @{
+ */
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <linux/posix_acl_xattr.h>
+
+typedef struct {
+       __u16              e_tag;
+       __u16              e_perm;
+       __u32              e_id;
+       __u32              e_stat;
+} ext_acl_xattr_entry;
+
+typedef struct {
+       __u32              a_count;
+       ext_acl_xattr_entry     a_entries[0];
+} ext_acl_xattr_header;
+
+#define CFS_ACL_XATTR_SIZE(count, prefix) \
+       (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry))
+
+#define CFS_ACL_XATTR_COUNT(size, prefix) \
+       (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry))
+
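+/*
+ * Sketch: the two macros above convert between an entry count and the byte
+ * size of the corresponding xattr buffer; acl_count and buf_size are the
+ * caller's values:
+ *
+ *	size_t bytes   = CFS_ACL_XATTR_SIZE(acl_count, ext_acl_xattr);
+ *	int    entries = CFS_ACL_XATTR_COUNT(buf_size, ext_acl_xattr);
+ */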
+
+extern ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size);
+extern int
+lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+                             posix_acl_xattr_header **out);
+extern void
+lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size);
+extern void
+lustre_ext_acl_xattr_free(ext_acl_xattr_header *header);
+extern int
+lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+                            ext_acl_xattr_header *ext_header,
+                            posix_acl_xattr_header **out);
+extern ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+                          ext_acl_xattr_header *ext_header);
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+/** @} eacl */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h
new file mode 100644 (file)
index 0000000..d61c020
--- /dev/null
@@ -0,0 +1,389 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_export PortalRPC export definitions
+ *
+ * @{
+ */
+
+#ifndef __EXPORT_H
+#define __EXPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+struct mds_client_data;
+struct mdt_client_data;
+struct mds_idmap_table;
+struct mdt_idmap_table;
+
+/**
+ * Target-specific export data
+ */
+struct tg_export_data {
+       /** Protects ted_lcd below */
+       struct mutex            ted_lcd_lock;
+       /** Per-client data for each export */
+       struct lsd_client_data  *ted_lcd;
+       /** Offset of record in last_rcvd file */
+       loff_t                  ted_lr_off;
+       /** Client index in last_rcvd file */
+       int                     ted_lr_idx;
+};
+
+/**
+ * MDT-specific export data
+ */
+struct mdt_export_data {
+       struct tg_export_data   med_ted;
+       /** List of all files opened by client on this MDT */
+       struct list_head                med_open_head;
+       spinlock_t              med_open_lock; /* med_open_head, mfd_list */
+       /** Bitmask of all ibit locks this MDT understands */
+       __u64                   med_ibits_known;
+       struct mutex            med_idmap_mutex;
+       struct lustre_idmap_table *med_idmap;
+};
+
+struct ec_export_data { /* echo client */
+       struct list_head eced_locks;
+};
+
+/* In-memory access to client data from OST struct */
+/** Filter (oss-side) specific import data */
+struct filter_export_data {
+       struct tg_export_data   fed_ted;
+       spinlock_t              fed_lock;       /**< protects fed_mod_list */
+       long                   fed_dirty;    /* in bytes */
+       long                   fed_grant;    /* in bytes */
+       struct list_head                 fed_mod_list; /* files being modified */
+       int                     fed_mod_count;/* items in fed_mod_list */
+       long                   fed_pending;  /* bytes just being written */
+       __u32                 fed_group;
+       __u8                   fed_pagesize; /* log2 of client page size */
+};
+
+struct mgs_export_data {
+       struct list_head                med_clients;    /* mgc fs client via this exp */
+       spinlock_t              med_lock;       /* protect med_clients */
+};
+
+/**
+ * per-NID statistics structure.
+ * It tracks access patterns to this export on a per-client-NID basis
+ */
+struct nid_stat {
+       lnet_nid_t             nid;
+       struct hlist_node        nid_hash;
+       struct list_head               nid_list;
+       struct obd_device       *nid_obd;
+       struct proc_dir_entry   *nid_proc;
+       struct lprocfs_stats    *nid_stats;
+       struct lprocfs_stats    *nid_ldlm_stats;
+       atomic_t             nid_exp_ref_count; /* for obd_nid_stats_hash
+                                                          exp_nid_stats */
+};
+
+#define nidstat_getref(nidstat)                                                \
+do {                                                                      \
+       atomic_inc(&(nidstat)->nid_exp_ref_count);                       \
+} while(0)
+
+#define nidstat_putref(nidstat)                                                \
+do {                                                                      \
+       atomic_dec(&(nidstat)->nid_exp_ref_count);                       \
+       LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0,         \
+                "stat %p nid_exp_ref_count < 0\n", nidstat);             \
+} while(0)
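+
+/*
+ * Sketch: nid_stat reference counting brackets any use of the per-NID
+ * statistics; lprocfs_counter_incr() is assumed to come from
+ * lprocfs_status.h and opcode is the caller's counter index:
+ *
+ *	nidstat_getref(exp->exp_nid_stats);
+ *	lprocfs_counter_incr(exp->exp_nid_stats->nid_stats, opcode);
+ *	nidstat_putref(exp->exp_nid_stats);
+ */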
+
+enum obd_option {
+       OBD_OPT_FORCE =  0x0001,
+       OBD_OPT_FAILOVER =      0x0002,
+       OBD_OPT_ABORT_RECOV =   0x0004,
+};
+
+/**
+ * Export structure. Represents target-side of connection in portals.
+ * Also used in Lustre to connect between layers on the same node when
+ * there is no network-connection in-between.
+ * For every connected client there is an export structure on the server
+ * attached to the same obd device.
+ */
+struct obd_export {
+       /**
+        * Export handle, it's id is provided to client on connect
+        * Subsequent client RPCs contain this handle id to identify
+        * what export they are talking to.
+        */
+       struct portals_handle     exp_handle;
+       atomic_t              exp_refcount;
+       /**
+        * Set of counters below is to track where export references are
+        * kept. The exp_rpc_count is used for reconnect handling also,
+        * the cb_count and locks_count are for debug purposes only for now.
+        * The sum of them should be less than exp_refcount by 3
+        */
+       atomic_t              exp_rpc_count; /* RPC references */
+       atomic_t              exp_cb_count; /* Commit callback references */
+       /** Number of queued replay requests to be processed */
+       atomic_t                  exp_replay_count;
+       atomic_t              exp_locks_count; /** Lock references */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       struct list_head                exp_locks_list;
+       spinlock_t                exp_locks_list_guard;
+#endif
+       /** UUID of client connected to this export */
+       struct obd_uuid    exp_client_uuid;
+       /** To link all exports on an obd device */
+       struct list_head                exp_obd_chain;
+       struct hlist_node         exp_uuid_hash; /** uuid-export hash*/
+       struct hlist_node         exp_nid_hash; /** nid-export hash */
+       /**
+        * All exports eligible for ping evictor are linked into a list
+        * through this field in "most time since last request on this export"
+        * order
+        * order.
+        * Protected by obd_dev_lock.
+       struct list_head                exp_obd_chain_timed;
+       /** Obd device of this export */
+       struct obd_device       *exp_obd;
+       /**
+        * "reverse" import to send requests (e.g. from ldlm) back to client.
+        * exp_lock protects its change.
+        */
+       struct obd_import       *exp_imp_reverse;
+       struct nid_stat   *exp_nid_stats;
+       struct lprocfs_stats     *exp_md_stats;
+       /** Active connection */
+       struct ptlrpc_connection *exp_connection;
+       /** Connection count value from the last successful reconnect RPC */
+       __u32                exp_conn_cnt;
+       /** Hash list of all ldlm locks granted on this export */
+       cfs_hash_t             *exp_lock_hash;
+       /**
+        * Hash list for Posix lock deadlock detection, added with
+        * ldlm_lock::l_exp_flock_hash.
+        */
+       cfs_hash_t             *exp_flock_hash;
+       struct list_head                exp_outstanding_replies;
+       struct list_head                exp_uncommitted_replies;
+       spinlock_t                exp_uncommitted_replies_lock;
+       /** Last committed transno for this export */
+       __u64                exp_last_committed;
+       /** When was last request received */
+       cfs_time_t              exp_last_request_time;
+       /** On replay all requests waiting for replay are linked here */
+       struct list_head                exp_req_replay_queue;
+       /**
+        * protects exp_flags, exp_outstanding_replies and the change
+        * of exp_imp_reverse
+        */
+       spinlock_t                exp_lock;
+       /** Compatibility flags for this export are embedded into
+        *  exp_connect_data */
+       struct obd_connect_data   exp_connect_data;
+       enum obd_option    exp_flags;
+       unsigned long        exp_failed:1,
+                                 exp_in_recovery:1,
+                                 exp_disconnected:1,
+                                 exp_connecting:1,
+                                 /** VBR: export missed recovery */
+                                 exp_delayed:1,
+                                 /** VBR: failed version checking */
+                                 exp_vbr_failed:1,
+                                 exp_req_replay_needed:1,
+                                 exp_lock_replay_needed:1,
+                                 exp_need_sync:1,
+                                 exp_flvr_changed:1,
+                                 exp_flvr_adapt:1,
+                                 exp_libclient:1, /* liblustre client? */
+                                 /* client timed out and tried to reconnect,
+                                  * but couldn't because of active rpcs */
+                                 exp_abort_active_req:1,
+                                 /* whether to swap nidtbl entries for 2.2 clients.
+                                  * Only used by the MGS to fix LU-1644. */
+                                 exp_need_mne_swab:1;
+       /* also protected by exp_lock */
+       enum lustre_sec_part      exp_sp_peer;
+       struct sptlrpc_flavor     exp_flvr;          /* current */
+       struct sptlrpc_flavor     exp_flvr_old[2];      /* about-to-expire */
+       cfs_time_t              exp_flvr_expire[2];   /* seconds */
+
+       /** protects exp_hp_rpcs */
+       spinlock_t                exp_rpc_lock;
+       struct list_head                  exp_hp_rpcs;  /* (potential) HP RPCs */
+
+       /** blocking dlm lock list, protected by exp_bl_list_lock */
+       struct list_head                exp_bl_list;
+       spinlock_t                exp_bl_list_lock;
+
+       /** Target specific data */
+       union {
+               struct tg_export_data     eu_target_data;
+               struct mdt_export_data    eu_mdt_data;
+               struct filter_export_data eu_filter_data;
+               struct ec_export_data     eu_ec_data;
+               struct mgs_export_data    eu_mgs_data;
+       } u;
+};
+
+#define exp_target_data u.eu_target_data
+#define exp_mdt_data    u.eu_mdt_data
+#define exp_filter_data u.eu_filter_data
+#define exp_ec_data     u.eu_ec_data
+
+static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp)
+{
+       return &exp->exp_connect_data.ocd_connect_flags;
+}
+
+static inline __u64 exp_connect_flags(struct obd_export *exp)
+{
+       return *exp_connect_flags_ptr(exp);
+}
+
+static inline int exp_max_brw_size(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE)
+               return exp->exp_connect_data.ocd_brw_size;
+
+       return ONE_MB_BRW_SIZE;
+}
+
+static inline int exp_connect_multibulk(struct obd_export *exp)
+{
+       return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE;
+}
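A hedged sketch of how a caller might use the two helpers above to size bulk I/O for a connected client; choose_brw_size() is a hypothetical function, not part of this header:

static int choose_brw_size(struct obd_export *exp)
{
	/* exp_max_brw_size() already falls back to ONE_MB_BRW_SIZE when the
	 * client did not negotiate OBD_CONNECT_BRW_SIZE at connect time. */
	if (exp_connect_multibulk(exp))
		return exp_max_brw_size(exp);	/* client supports > 1MB bulk */
	return ONE_MB_BRW_SIZE;
}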
+
+static inline int exp_expired(struct obd_export *exp, cfs_duration_t age)
+{
+       LASSERT(exp->exp_delayed);
+       return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age),
+                              cfs_time_current_sec());
+}
+
+static inline int exp_connect_cancelset(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET);
+}
+
+static inline int exp_connect_lru_resize(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline int exp_connect_rmtclient(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT);
+}
+
+static inline int client_is_remote(struct obd_export *exp)
+{
+       struct obd_import *imp = class_exp2cliimp(exp);
+
+       return !!(imp->imp_connect_data.ocd_connect_flags &
+                 OBD_CONNECT_RMT_CLIENT);
+}
+
+static inline int exp_connect_vbr(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       LASSERT(exp->exp_connection);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR);
+}
+
+static inline int exp_connect_som(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM);
+}
+
+static inline int exp_connect_umask(struct obd_export *exp)
+{
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK);
+}
+
+static inline int imp_connect_lru_resize(struct obd_import *imp)
+{
+       struct obd_connect_data *ocd;
+
+       LASSERT(imp != NULL);
+       ocd = &imp->imp_connect_data;
+       return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline int exp_connect_layout(struct obd_export *exp)
+{
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK);
+}
+
+static inline bool exp_connect_lvb_type(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE)
+               return true;
+       else
+               return false;
+}
+
+static inline bool imp_connect_lvb_type(struct obd_import *imp)
+{
+       struct obd_connect_data *ocd;
+
+       LASSERT(imp != NULL);
+       ocd = &imp->imp_connect_data;
+       if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE)
+               return true;
+       else
+               return false;
+}
+
+extern struct obd_export *class_conn2export(struct lustre_handle *conn);
+extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
+
+/** @} export */
+
+#endif /* __EXPORT_H */
+/** @} obd_export */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h
new file mode 100644 (file)
index 0000000..acaa1c4
--- /dev/null
@@ -0,0 +1,761 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fid.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#ifndef __LINUX_FID_H
+#define __LINUX_FID_H
+
+/** \defgroup fid fid
+ *
+ * @{
+ *
+ * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs
+ * describes the FID namespace and interoperability requirements for FIDs.
+ * The important parts of that document are included here for reference.
+ *
+ * FID
+ *   File IDentifier generated by client from range allocated by the SEQuence
+ *   service and stored in struct lu_fid. The FID is composed of three parts:
+ *   SEQuence, ObjectID, and VERsion.  The SEQ component is a filesystem
+ *   unique 64-bit integer, and only one client is ever assigned any SEQ value.
+ *   The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved
+ *   for system use.  The OID component is a 32-bit value generated by the
+ *   client on a per-SEQ basis to allow creating many unique FIDs without
+ *   communication with the server.  The VER component is a 32-bit value that
+ *   distinguishes between different FID instantiations, such as snapshots or
+ *   separate subtrees within the filesystem.  FIDs with the same VER field
+ *   are considered part of the same namespace.
+ *
+ * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and
+ *   MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while
+ *   OSTs use 64-bit Lustre object IDs and generation numbers.
+ *
+ * NEW filesystems are those formatted since the introduction of FIDs.
+ *
+ * IGIF
+ *   Inode and Generation In FID, a surrogate FID used to globally identify
+ *   an existing object on OLD formatted MDT file system. This would only be
+ *   used on MDT0 in a DNE filesystem, because there cannot be more than one
+ *   MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1]
+ *   range, where inode number is stored in SEQ, and inode generation is in OID.
+ *   NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem,
+ *   which is the maximum possible for an ldiskfs backend.  It also assumes
+ *   that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible
+ *   to clients, which has always been true.
+ *
+ * IDIF
+ *   object ID In FID, a surrogate FID used to globally identify an existing
+ *   OST object on OLD formatted OST file system. Belongs to a sequence in
+ *   [2^32, 2^33 - 1]. Sequence number is calculated as:
+ *
+ *      1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff)
+ *
+ *   that is, the SEQ consists of the 16-bit OST index and the high 16 bits
+ *   of the object ID. Generating unique SEQ values per OST allows IDIF FIDs to
+ *   be identified in the FLD correctly. The OID field is calculated as:
+ *
+ *      objid & 0xffffffff
+ *
+ *   that is, it consists of lower 32 bits of object ID.  For objects within
+ *   the IDIF range, object ID extraction will be:
+ *
+ *      o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid;
+ *      o_seq = 0;  // formerly group number
+ *
+ *   NOTE: This assumes that no more than 2^48-1 objects have ever been created
+ *   on any OST, and that no more than 65535 OSTs are in use.  Both are very
+ *   reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming
+ *   a maximum creation rate of 1M objects per second for a maximum of 9 years,
+ *   or combinations thereof.
+ *
+ * OST_MDT0
+ *   Surrogate FID used to identify an existing object on OLD formatted OST
+ *   filesystem. Belongs to the reserved SEQuence 0, and is used prior to
+ *   the introduction of FID-on-OST, at which point IDIF will be used to
+ *   identify objects as residing on a specific OST.
+ *
+ * LLOG
+ *   For Lustre Log objects the object sequence 1 is used. This is compatible
+ *   with both OLD and NEW namespaces, as this SEQ number is in the
+ *   ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ *   sequence numbers.
+ *
+ * ECHO
+ *   For testing OST IO performance the object sequence 2 is used. This is
+ *   compatible with both OLD and NEW namespaces, as this SEQ number is in
+ *   the ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ *   sequence numbers.
+ *
+ * OST_MDT1 .. OST_MAX
+ *   For testing with multiple MDTs the object sequence 3 through 9 is used,
+ *   allowing direct mapping of MDTs 1 through 7 respectively, for a total
+ *   of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group"
+ *   mappings. However, this SEQ range is only for testing prior to any
+ *   production DNE release, as the objects in this range conflict across all
+ *   OSTs, as the OST index is not part of the FID.  For production DNE usage,
+ *   OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs.
+ *
+ * DLM OST objid to IDIF mapping
+ *   For compatibility with existing OLD OST network protocol structures, the
+ *   FID must map onto the o_id and o_seq in a manner that ensures existing
+ *   objects are identified consistently for IO, as well as onto the LDLM
+ *   namespace, so that for IDIFs there is only a single resource name for any
+ *   object in the DLM.  The OLD OST object DLM resource mapping is:
+ *
+ *      resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases
+ *
+ *   The NEW OST object DLM resource mapping is the same for both MDT and OST:
+ *
+ *      resource[] = {SEQ, OID, VER, HASH};
+ *
+ *  NOTE: for mapping IDIF values to DLM resource names the o_id may be
+ *  larger than the 2^33 reserved sequence numbers for IDIF, so it is possible
+ *  for the o_id numbers to overlap FID SEQ numbers in the resource. However,
+ *  in all production releases the OLD o_seq field is always zero, and all
+ *  valid FID OID values are non-zero, so the lock resources will not collide.
+ *  Even so, the MDT and OST resources are also in different LDLM namespaces.
+ */
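To make the IDIF packing formulas documented above concrete, here is a standalone sketch (plain C, not the in-tree helpers; idif_seq(), idif_oid() and the sample values are illustrative only) that packs an OST index and object ID into an IDIF SEQ/OID pair and recovers them again:

#include <assert.h>
#include <stdint.h>

/* SEQ = 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) */
static uint64_t idif_seq(uint32_t ost_index, uint64_t objid)
{
	return (1ULL << 32) | ((uint64_t)ost_index << 16) |
	       ((objid >> 32) & 0xffff);
}

/* OID = objid & 0xffffffff (low 32 bits of the object ID) */
static uint32_t idif_oid(uint64_t objid)
{
	return (uint32_t)(objid & 0xffffffff);
}

int main(void)
{
	uint64_t objid = 0x1234deadbeefULL;	/* arbitrary 48-bit object ID */
	uint32_t ost_index = 7;			/* arbitrary OST index */
	uint64_t seq = idif_seq(ost_index, objid);
	uint32_t oid = idif_oid(objid);

	/* the high 16 bits of objid live in SEQ, the low 32 bits in OID */
	uint64_t recovered = ((seq & 0xffff) << 32) | oid;

	assert(recovered == objid);
	assert(((seq >> 16) & 0xffff) == ost_index);
	return 0;
}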
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <lustre_mdt.h>
+#include <obd.h>
+
+
+struct lu_site;
+struct lu_context;
+
+/* Whole sequences space range and zero range definitions */
+extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE;
+extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE;
+extern const struct lu_fid LUSTRE_BFL_FID;
+extern const struct lu_fid LU_OBF_FID;
+extern const struct lu_fid LU_DOT_LUSTRE_FID;
+
+enum {
+       /*
+        * This is how many metadata FIDs may be allocated in one sequence (128k)
+        */
+       LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
+
+       /*
+        * This is how many data FIDs could be allocated in one sequence (4B - 1)
+        */
+       LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+
+       /*
+        * How many sequences to allocate to a client at once.
+        */
+       LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL,
+
+       /*
+        * seq allocation pool size.
+        */
+       LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000,
+
+       /*
+        * This is how many sequences may be in one super-sequence allocated to
+        * MDTs.
+        */
+       LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH)
+};
+
+enum {
+       /** 2^6 FIDs for OI containers */
+       OSD_OI_FID_OID_BITS     = 6,
+       /** reserve enough FIDs in case we want more in the future */
+       OSD_OI_FID_OID_BITS_MAX = 10,
+};
+
+/** special OID for local objects */
+enum local_oid {
+       /** \see fld_mod_init */
+       FLD_INDEX_OID           = 3UL,
+       /** \see fid_mod_init */
+       FID_SEQ_CTL_OID         = 4UL,
+       FID_SEQ_SRV_OID         = 5UL,
+       /** \see mdd_mod_init */
+       MDD_ROOT_INDEX_OID      = 6UL, /* deprecated in 2.4 */
+       MDD_ORPHAN_OID          = 7UL, /* deprecated in 2.4 */
+       MDD_LOV_OBJ_OID         = 8UL,
+       MDD_CAPA_KEYS_OID       = 9UL,
+       /** \see mdt_mod_init */
+       LAST_RECV_OID           = 11UL,
+       OSD_FS_ROOT_OID         = 13UL,
+       ACCT_USER_OID           = 15UL,
+       ACCT_GROUP_OID          = 16UL,
+       LFSCK_BOOKMARK_OID      = 17UL,
+       OTABLE_IT_OID           = 18UL,
+       /* These two definitions are obsolete
+        * OFD_GROUP0_LAST_OID     = 20UL,
+        * OFD_GROUP4K_LAST_OID    = 20UL+4096,
+        */
+       OFD_LAST_GROUP_OID      = 4117UL,
+       LLOG_CATALOGS_OID       = 4118UL,
+       MGS_CONFIGS_OID         = 4119UL,
+       OFD_HEALTH_CHECK_OID    = 4120UL,
+       MDD_LOV_OBJ_OSEQ        = 4121UL,
+       LFSCK_NAMESPACE_OID     = 4122UL,
+       REMOTE_PARENT_DIR_OID   = 4123UL,
+};
+
+static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+       fid->f_seq = FID_SEQ_LOCAL_FILE;
+       fid->f_oid = oid;
+       fid->f_ver = 0;
+}
+
+static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+       fid->f_seq = FID_SEQ_LOCAL_NAME;
+       fid->f_oid = oid;
+       fid->f_ver = 0;
+}
+
+/* For a new FS (>= 2.4), the root FID is changed to
+ * [FID_SEQ_ROOT:1:0]; for an existing FS (upgraded to 2.4),
+ * the root FID will still be an IGIF */
+static inline int fid_is_root(const struct lu_fid *fid)
+{
+       return unlikely((fid_seq(fid) == FID_SEQ_ROOT &&
+                        fid_oid(fid) == 1));
+}
+
+static inline int fid_is_dot_lustre(const struct lu_fid *fid)
+{
+       return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+                       fid_oid(fid) == FID_OID_DOT_LUSTRE);
+}
+
+static inline int fid_is_obf(const struct lu_fid *fid)
+{
+       return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+                       fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF);
+}
+
+static inline int fid_is_otable_it(const struct lu_fid *fid)
+{
+       return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+                       fid_oid(fid) == OTABLE_IT_OID);
+}
+
+static inline int fid_is_acct(const struct lu_fid *fid)
+{
+       return fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+              (fid_oid(fid) == ACCT_USER_OID ||
+               fid_oid(fid) == ACCT_GROUP_OID);
+}
+
+static inline int fid_is_quota(const struct lu_fid *fid)
+{
+       return fid_seq(fid) == FID_SEQ_QUOTA ||
+              fid_seq(fid) == FID_SEQ_QUOTA_GLB;
+}
+
+static inline int fid_is_namespace_visible(const struct lu_fid *fid)
+{
+       const __u64 seq = fid_seq(fid);
+
+       /* Here, we cannot distinguish whether the normal FID is for OST
+        * object or not. It is the caller's duty to check further if needed. */
+       return (!fid_is_last_id(fid) &&
+               (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) ||
+              fid_is_root(fid) || fid_is_dot_lustre(fid);
+}
+
+static inline int fid_seq_in_fldb(__u64 seq)
+{
+       return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) ||
+              fid_seq_is_root(seq) || fid_seq_is_dot(seq);
+}
+
+static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq)
+{
+       if (fid_seq_is_mdt0(seq)) {
+               fid->f_seq = fid_idif_seq(0, 0);
+       } else {
+               LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) ||
+                        fid_seq_is_idif(seq), LPX64"\n", seq);
+               fid->f_seq = seq;
+       }
+       fid->f_oid = 0;
+       fid->f_ver = 0;
+}
+
+enum lu_mgr_type {
+       LUSTRE_SEQ_SERVER,
+       LUSTRE_SEQ_CONTROLLER
+};
+
+struct lu_server_seq;
+
+/* Client sequence manager interface. */
+struct lu_client_seq {
+       /* Sequence-controller export. */
+       struct obd_export      *lcs_exp;
+       struct mutex            lcs_mutex;
+
+       /*
+        * Range of sequences allowed for allocation. When lu_client_seq is
+        * used on clients, this holds the meta-sequence range; on servers it
+        * holds the super-sequence range.
+        */
+       struct lu_seq_range      lcs_space;
+
+       /* Seq related proc */
+       proc_dir_entry_t   *lcs_proc_dir;
+
+       /* This holds last allocated fid in last obtained seq */
+       struct lu_fid      lcs_fid;
+
+       /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */
+       enum lu_cli_type        lcs_type;
+
+       /*
+        * Service UUID, passed from the MDT; combined with the seq name to
+        * form a unique name for use with procfs.
+        */
+       char                lcs_name[80];
+
+       /*
+        * Sequence width, that is how many objects may be allocated in one
+        * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH.
+        */
+       __u64              lcs_width;
+
+       /* Seq server for direct access */
+       struct lu_server_seq   *lcs_srv;
+
+       /* wait queue for fid allocation and update indicator */
+       wait_queue_head_t            lcs_waitq;
+       int                  lcs_update;
+};
+
+/* server sequence manager interface */
+struct lu_server_seq {
+       /* Available sequences space */
+       struct lu_seq_range      lss_space;
+
+       /* keeps highwater in lsr_end for seq allocation algorithm */
+       struct lu_seq_range      lss_lowater_set;
+       struct lu_seq_range      lss_hiwater_set;
+
+       /*
+        * Device for server side seq manager needs (saving sequences to backing
+        * store).
+        */
+       struct dt_device       *lss_dev;
+
+       /* /seq file object device */
+       struct dt_object       *lss_obj;
+
+       /* Seq related proc */
+       proc_dir_entry_t   *lss_proc_dir;
+
+       /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */
+       enum lu_mgr_type       lss_type;
+
+       /* Client interface to the request controller */
+       struct lu_client_seq   *lss_cli;
+
+       /* Mutex for protecting allocation */
+       struct mutex            lss_mutex;
+
+       /*
+        * Service UUID, passed from the MDT; combined with the seq name to
+        * form a unique name for use with procfs.
+        */
+       char                lss_name[80];
+
+       /*
+        * Allocation chunks for super and meta sequences. Default values are
+        * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH.
+        */
+       __u64              lss_width;
+
+       /*
+        * minimum lss_alloc_set size that should be allocated from
+        * lss_space
+        */
+       __u64              lss_set_width;
+
+       /* sync is needed for update operation */
+       __u32              lss_need_sync;
+
+       /**
+        * Pointer to site object, required to access site fld.
+        */
+       struct seq_server_site  *lss_site;
+};
+
+int seq_query(struct com_thread_info *info);
+int seq_handle(struct ptlrpc_request *req);
+
+/* Server methods */
+int seq_server_init(struct lu_server_seq *seq,
+                   struct dt_device *dev,
+                   const char *prefix,
+                   enum lu_mgr_type type,
+                   struct seq_server_site *ss,
+                   const struct lu_env *env);
+
+void seq_server_fini(struct lu_server_seq *seq,
+                    const struct lu_env *env);
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+                          struct lu_seq_range *out,
+                          const struct lu_env *env);
+
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+                         struct lu_seq_range *out,
+                         const struct lu_env *env);
+
+int seq_server_set_cli(struct lu_server_seq *seq,
+                      struct lu_client_seq *cli,
+                      const struct lu_env *env);
+
+/* Client methods */
+int seq_client_init(struct lu_client_seq *seq,
+                   struct obd_export *exp,
+                   enum lu_cli_type type,
+                   const char *prefix,
+                   struct lu_server_seq *srv);
+
+void seq_client_fini(struct lu_client_seq *seq);
+
+void seq_client_flush(struct lu_client_seq *seq);
+
+int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq,
+                        struct lu_fid *fid);
+int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq,
+                      seqno_t *seqnr);
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss);
+/* Fids common stuff */
+int fid_is_local(const struct lu_env *env,
+                struct lu_site *site, const struct lu_fid *fid);
+
+int client_fid_init(struct obd_device *obd, struct obd_export *exp,
+                   enum lu_cli_type type);
+int client_fid_fini(struct obd_device *obd);
+
+/* fid locking */
+
+struct ldlm_namespace;
+
+/*
+ * Build (DLM) resource name from FID.
+ *
+ * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * remaining name[2,3] fields that need to be used for the quota identifier.
+ */
+static inline struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *f,
+                      struct ldlm_res_id *name)
+{
+       memset(name, 0, sizeof *name);
+       name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f);
+       name->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(f);
+       return name;
+}
+
+/*
+ * Build (DLM) resource identifier from global quota FID and quota ID.
+ */
+static inline struct ldlm_res_id *
+fid_build_quota_resid(const struct lu_fid *glb_fid, union lquota_id *qid,
+                     struct ldlm_res_id *res)
+{
+       fid_build_reg_res_name(glb_fid, res);
+       res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid);
+       res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid);
+       return res;
+}
+
+/*
+ * Extract global FID and quota ID from resource name
+ */
+static inline void fid_extract_quota_resid(struct ldlm_res_id *res,
+                                          struct lu_fid *glb_fid,
+                                          union lquota_id *qid)
+{
+       glb_fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF];
+       glb_fid->f_oid = (__u32)res->name[LUSTRE_RES_ID_VER_OID_OFF];
+       glb_fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+       qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF];
+       qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF];
+       qid->qid_fid.f_ver =
+               (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32);
+}
+
+/*
+ * Return true if resource is for object identified by fid.
+ */
+static inline int fid_res_name_eq(const struct lu_fid *f,
+                                 const struct ldlm_res_id *name)
+{
+       return name->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(f) &&
+              name->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(f);
+}
+
+/* reverse function of fid_build_reg_res_name() */
+static inline void fid_build_from_res_name(struct lu_fid *f,
+                                          const struct ldlm_res_id *name)
+{
+       fid_zero(f);
+       f->f_seq = name->name[LUSTRE_RES_ID_SEQ_OFF];
+       f->f_oid = name->name[LUSTRE_RES_ID_VER_OID_OFF] & 0xffffffff;
+       f->f_ver = name->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32;
+       LASSERT(fid_res_name_eq(f, name));
+}
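A minimal round-trip sketch using the two helpers above; the FID values are arbitrary examples and fid_res_name_roundtrip_sketch() is hypothetical:

static void fid_res_name_roundtrip_sketch(void)
{
	struct lu_fid fid = { .f_seq = 0x200000401ULL, .f_oid = 42, .f_ver = 0 };
	struct lu_fid out;
	struct ldlm_res_id name;

	/* SEQ goes into name[0]; VER and OID are packed together into name[1] */
	fid_build_reg_res_name(&fid, &name);

	/* recover the FID from the resource name; out now equals fid */
	fid_build_from_res_name(&out, &name);
}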
+
+static inline struct ldlm_res_id *
+fid_build_pdo_res_name(const struct lu_fid *f,
+                      unsigned int hash,
+                      struct ldlm_res_id *name)
+{
+       fid_build_reg_res_name(f, name);
+       name->name[LUSTRE_RES_ID_HSH_OFF] = hash;
+       return name;
+}
+
+/**
+ * Build the DLM resource name from object id & seq. This will eventually be
+ * removed once ost_id is replaced with the FID in the data stack.
+ *
+ * Currently, a resid from an old client, with res[0] = object_id and
+ * res[1] = object_seq, is just the opposite of the metadata
+ * resid, where res[0] = fid->f_seq and res[1] = fid->f_oid.
+ * To unify resid identification, the data resid is reversed to
+ * match the metadata resid, i.e.
+ *
+ * For a resid from an old client,
+ *    res[0] = objid,  res[1] = 0, keeping the original order
+ *    for compatibility.
+ *
+ * For new resid
+ *    res will be built from normal FID directly, i.e. res[0] = f_seq,
+ *    res[1] = f_oid + f_ver.
+ */
+static inline void ostid_build_res_name(struct ost_id *oi,
+                                       struct ldlm_res_id *name)
+{
+       memset(name, 0, sizeof *name);
+       if (fid_seq_is_mdt0(ostid_seq(oi))) {
+               name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi);
+               name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi);
+       } else {
+               fid_build_reg_res_name((struct lu_fid *)oi, name);
+       }
+}
+
+static inline void ostid_res_name_to_id(struct ost_id *oi,
+                                       struct ldlm_res_id *name)
+{
+       if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) {
+               /* old resid */
+               ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+               ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+       } else {
+               /* new resid */
+               fid_build_from_res_name((struct lu_fid *)oi, name);
+       }
+}
+
+/**
+ * Return true if the resource is for the object identified by this id & group.
+ */
+static inline int ostid_res_name_eq(struct ost_id *oi,
+                                   struct ldlm_res_id *name)
+{
+       /* Note: this is just a shortcut to save some effort; the correct
+        * way would be to convert both into FIDs and compare them */
+       if (fid_seq_is_mdt0(ostid_seq(oi))) {
+               return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) &&
+                      name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi);
+       } else {
+               return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) &&
+                      name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi);
+       }
+}
+
+/* The same as osc_build_res_name() */
+static inline void ost_fid_build_resid(const struct lu_fid *fid,
+                                      struct ldlm_res_id *resname)
+{
+       if (fid_is_mdt0(fid) || fid_is_idif(fid)) {
+               struct ost_id oi;
+               if (fid_to_ostid(fid, &oi) != 0)
+                       return;
+               ostid_build_res_name(&oi, resname);
+       } else {
+               fid_build_reg_res_name(fid, resname);
+       }
+}
+
+static inline void ost_fid_from_resid(struct lu_fid *fid,
+                                     const struct ldlm_res_id *name)
+{
+       if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) {
+               /* old resid */
+               struct ost_id oi;
+               ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+               ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+               ostid_to_fid(fid, &oi, 0);
+       } else {
+               /* new resid */
+               fid_build_from_res_name(fid, name);
+       }
+}
+
+/**
+ * Flatten 128-bit FID values into a 64-bit value for use as an inode number.
+ * For non-IGIF FIDs this starts just over 2^32, and continues without
+ * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ
+ * into the range where there may not be many OID values in use, to minimize
+ * the risk of conflict.
+ *
+ * Assuming LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently
+ * true, the time between reused inode numbers is very long - 2^40 SEQ numbers,
+ * or about 2^40 client mounts, if clients create fewer than 2^24 files/mount.
+ */
+static inline __u64 fid_flatten(const struct lu_fid *fid)
+{
+       __u64 ino;
+       __u64 seq;
+
+       if (fid_is_igif(fid)) {
+               ino = lu_igif_ino(fid);
+               RETURN(ino);
+       }
+
+       seq = fid_seq(fid);
+
+       ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
+
+       RETURN(ino ? ino : fid_oid(fid));
+}
+
+static inline __u32 fid_hash(const struct lu_fid *f, int bits)
+{
+       /* all objects with the same ID and different versions will map to the
+        * same collision list. */
+       return cfs_hash_long(fid_flatten(f), bits);
+}
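An illustrative (hypothetical) caller of the flattening helpers above, deriving both an inode number and a hash-table bucket from a FID:

static void fid_ino_and_bucket_sketch(const struct lu_fid *fid,
				      __u64 *ino, unsigned int *bucket)
{
	*ino = fid_flatten(fid);	/* 64-bit inode number */
	*bucket = fid_hash(fid, 10);	/* bucket in a 2^10-entry hash table */
}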
+
+/**
+ * Map a FID to a 32-bit value for use as an inode number on 32-bit systems. */
+static inline __u32 fid_flatten32(const struct lu_fid *fid)
+{
+       __u32 ino;
+       __u64 seq;
+
+       if (fid_is_igif(fid)) {
+               ino = lu_igif_ino(fid);
+               RETURN(ino);
+       }
+
+       seq = fid_seq(fid) - FID_SEQ_START;
+
+       /* Map the high bits of the OID into higher bits of the inode number so
+        * that inodes generated at about the same time have a reduced chance
+        * of collisions. This will give a period of 2^12 = 4096 unique clients
+        * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
+        * (from OID), or up to 128M inodes without collisions for new files. */
+       ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
+              (seq >> (64 - (40-8)) & 0xffffff00) +
+              (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
+
+       RETURN(ino ? ino : fid_oid(fid));
+}
+
+static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2)
+{
+       LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
+                PFID(fid1), PFID(fid2));
+
+       if (fid_is_idif(fid1) && fid_is_idif(fid2))
+               return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) -
+                      fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver);
+
+       return fid_oid(fid1) - fid_oid(fid2);
+}
+
+#define LUSTRE_SEQ_SRV_NAME "seq_srv"
+#define LUSTRE_SEQ_CTL_NAME "seq_ctl"
+
+/* Range common stuff */
+static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+       dst->lsr_start = cpu_to_le64(src->lsr_start);
+       dst->lsr_end = cpu_to_le64(src->lsr_end);
+       dst->lsr_index = cpu_to_le32(src->lsr_index);
+       dst->lsr_flags = cpu_to_le32(src->lsr_flags);
+}
+
+static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+       dst->lsr_start = le64_to_cpu(src->lsr_start);
+       dst->lsr_end = le64_to_cpu(src->lsr_end);
+       dst->lsr_index = le32_to_cpu(src->lsr_index);
+       dst->lsr_flags = le32_to_cpu(src->lsr_flags);
+}
+
+static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+       dst->lsr_start = cpu_to_be64(src->lsr_start);
+       dst->lsr_end = cpu_to_be64(src->lsr_end);
+       dst->lsr_index = cpu_to_be32(src->lsr_index);
+       dst->lsr_flags = cpu_to_be32(src->lsr_flags);
+}
+
+static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+       dst->lsr_start = be64_to_cpu(src->lsr_start);
+       dst->lsr_end = be64_to_cpu(src->lsr_end);
+       dst->lsr_index = be32_to_cpu(src->lsr_index);
+       dst->lsr_flags = be32_to_cpu(src->lsr_flags);
+}
+
+/** @} fid */
+
+#endif /* __LINUX_FID_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fld.h b/drivers/staging/lustre/lustre/include/lustre_fld.h
new file mode 100644 (file)
index 0000000..11e034a
--- /dev/null
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_FLD_H
+#define __LINUX_FLD_H
+
+/** \defgroup fld fld
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_mdt.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct lu_client_fld;
+struct lu_server_fld;
+struct lu_fld_hash;
+struct fld_cache;
+
+extern const struct dt_index_features fld_index_features;
+extern const char fld_index_name[];
+
+/*
+ * FLD (Fid Location Database) interface.
+ */
+enum {
+       LUSTRE_CLI_FLD_HASH_DHT = 0,
+       LUSTRE_CLI_FLD_HASH_RRB
+};
+
+
+struct lu_fld_target {
+       struct list_head               ft_chain;
+       struct obd_export       *ft_exp;
+       struct lu_server_fld    *ft_srv;
+       __u64               ft_idx;
+};
+
+struct lu_server_fld {
+       /**
+        * Fld dir proc entry. */
+       proc_dir_entry_t    *lsf_proc_dir;
+
+       /**
+        * /fld file object device */
+       struct dt_object        *lsf_obj;
+
+       /**
+        * super sequence controller export, needed to forward fld
+        * lookup requests. */
+       struct obd_export       *lsf_control_exp;
+
+       /**
+        * Client FLD cache. */
+       struct fld_cache        *lsf_cache;
+
+       /**
+        * Protect index modifications */
+       struct mutex            lsf_lock;
+
+       /**
+        * Fld service name in form "fld-srv-lustre-MDTXXX" */
+       char                 lsf_name[80];
+
+};
+
+struct lu_client_fld {
+       /**
+        * Client side proc entry. */
+       proc_dir_entry_t    *lcf_proc_dir;
+
+       /**
+        * List of exports client FLD knows about. */
+       struct list_head               lcf_targets;
+
+       /**
+        * Current hash to be used to choose an export. */
+       struct lu_fld_hash      *lcf_hash;
+
+       /**
+        * Exports count. */
+       int                   lcf_count;
+
+       /**
+        * Lock protecting exports list and fld_hash. */
+       spinlock_t               lcf_lock;
+
+       /**
+        * Client FLD cache. */
+       struct fld_cache        *lcf_cache;
+
+       /**
+        * Client fld proc entry name. */
+       char                 lcf_name[80];
+
+       const struct lu_context *lcf_ctx;
+
+       int                   lcf_flags;
+};
+
+/**
+ * number of blocks to reserve for particular operations. Should be function of
+ * ... something. Stub for now.
+ */
+enum {
+       /* one insert operation can involve two delete and one insert */
+       FLD_TXN_INDEX_INSERT_CREDITS  = 60,
+       FLD_TXN_INDEX_DELETE_CREDITS  = 20,
+};
+
+int fld_query(struct com_thread_info *info);
+
+/* Server methods */
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+                   struct dt_device *dt, const char *prefix, int mds_node_id,
+                   int type);
+
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_server_create(const struct lu_env *env,
+                             struct lu_server_fld *fld,
+                             struct lu_seq_range *new,
+                             struct thandle *th);
+
+int fld_server_create(const struct lu_env *env,
+                     struct lu_server_fld *fld,
+                     struct lu_seq_range *add_range,
+                     struct thandle *th);
+
+int fld_insert_entry(const struct lu_env *env,
+                    struct lu_server_fld *fld,
+                    const struct lu_seq_range *range);
+
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+                     seqno_t seq, struct lu_seq_range *range);
+
+/* Client methods */
+int fld_client_init(struct lu_client_fld *fld,
+                   const char *prefix, int hash);
+
+void fld_client_fini(struct lu_client_fld *fld);
+
+void fld_client_flush(struct lu_client_fld *fld);
+
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+                     __u32 flags, const struct lu_env *env);
+
+int fld_client_create(struct lu_client_fld *fld,
+                     struct lu_seq_range *range,
+                     const struct lu_env *env);
+
+int fld_client_delete(struct lu_client_fld *fld,
+                     seqno_t seq,
+                     const struct lu_env *env);
+
+int fld_client_add_target(struct lu_client_fld *fld,
+                         struct lu_fld_target *tar);
+
+int fld_client_del_target(struct lu_client_fld *fld,
+                         __u64 idx);
+
+void fld_client_proc_fini(struct lu_client_fld *fld);
+
+/** @} fld */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h
new file mode 100644 (file)
index 0000000..9dcc332
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LUSTRE_FSFILT_H
+#define _LUSTRE_FSFILT_H
+
+#include <linux/lustre_fsfilt.h>
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h
new file mode 100644 (file)
index 0000000..105f6d6
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_HA_H
+#define _LUSTRE_HA_H
+
+/** \defgroup ha ha
+ *
+ * @{
+ */
+
+struct obd_import;
+struct obd_export;
+struct obd_device;
+struct ptlrpc_request;
+
+
+int ptlrpc_replay(struct obd_import *imp);
+int ptlrpc_resend(struct obd_import *imp);
+void ptlrpc_free_committed(struct obd_import *imp);
+void ptlrpc_wake_delayed(struct obd_import *imp);
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async);
+int ptlrpc_set_import_active(struct obd_import *imp, int active);
+void ptlrpc_activate_import(struct obd_import *imp);
+void ptlrpc_deactivate_import(struct obd_import *imp);
+void ptlrpc_invalidate_import(struct obd_import *imp);
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt);
+int ptlrpc_check_suspend(void);
+void ptlrpc_activate_timeouts(struct obd_import *imp);
+void ptlrpc_deactivate_timeouts(struct obd_import *imp);
+
+/** @} ha */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h
new file mode 100644 (file)
index 0000000..fcd40f3
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_HANDLES_H_
+#define __LUSTRE_HANDLES_H_
+
+/** \defgroup handles handles
+ *
+ * @{
+ */
+
+#include <linux/lustre_handles.h>
+
+#include <linux/libcfs/libcfs.h>
+
+
+struct portals_handle_ops {
+       void (*hop_addref)(void *object);
+       void (*hop_free)(void *object, int size);
+};
+
+/* These handles are most easily used by having them appear at the very top of
+ * whatever object you want to make handles for, e.g.:
+ *
+ * struct ldlm_lock {
+ *      struct portals_handle handle;
+ *      ...
+ * };
+ *
+ * Now you're able to assign the results of cookie2handle directly to an
+ * ldlm_lock.  If it's not at the top, you'll want to use container_of()
+ * to compute the start of the structure based on the handle field. */
+struct portals_handle {
+       struct list_head                        h_link;
+       __u64                           h_cookie;
+       struct portals_handle_ops       *h_ops;
+
+       /* newly added fields to handle the RCU issue. -jxiong */
+       cfs_rcu_head_t                  h_rcu;
+       spinlock_t                      h_lock;
+       unsigned int                    h_size:31;
+       unsigned int                    h_in:1;
+};
+#define RCU2HANDLE(rcu)    container_of(rcu, struct portals_handle, h_rcu)
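A sketch of the embedding pattern described in the comment above; struct example_obj and example_obj_from_handle() are hypothetical and only illustrate placing the handle at the top of an object and recovering the object with container_of():

struct example_obj {
	struct portals_handle eo_handle;	/* first member, per the note above */
	int		      eo_payload;
};

static inline struct example_obj *
example_obj_from_handle(struct portals_handle *h)
{
	return container_of(h, struct example_obj, eo_handle);
}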
+
+/* handles.c */
+
+/* Add a handle to the hash table */
+void class_handle_hash(struct portals_handle *,
+                      struct portals_handle_ops *ops);
+void class_handle_unhash(struct portals_handle *);
+void class_handle_hash_back(struct portals_handle *);
+void *class_handle2object(__u64 cookie);
+void class_handle_free_cb(cfs_rcu_head_t *);
+int class_handle_init(void);
+void class_handle_cleanup(void);
+
+/** @} handles */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_idmap.h b/drivers/staging/lustre/lustre/include/lustre_idmap.h
new file mode 100644 (file)
index 0000000..084bdd6
--- /dev/null
@@ -0,0 +1,104 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_idmap.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_IDMAP_H
+#define _LUSTRE_IDMAP_H
+
+/** \defgroup idmap idmap
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+#define CFS_NGROUPS_PER_BLOCK   ((int)(PAGE_CACHE_SIZE / sizeof(gid_t)))
+
+#define CFS_GROUP_AT(gi, i) \
+       ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK])
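A hedged sketch of how the block-indexing macros above might be used to scan a supplementary group list; the function is hypothetical and assumes group_info_t exposes an ngroups count alongside its blocks array:

static int group_list_contains(group_info_t *gi, gid_t grp)
{
	int i;

	/* walk every gid, letting CFS_GROUP_AT() pick the block and offset */
	for (i = 0; i < gi->ngroups; i++)
		if (CFS_GROUP_AT(gi, i) == grp)
			return 1;
	return 0;
}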
+
+enum {
+       CFS_IC_NOTHING     = 0,    /* convert nothing */
+       CFS_IC_ALL       = 1,    /* convert all items */
+       CFS_IC_MAPPED      = 2,    /* convert mapped uid/gid */
+       CFS_IC_UNMAPPED    = 3     /* convert unmapped uid/gid */
+};
+
+#define  CFS_IDMAP_NOTFOUND     (-1)
+
+#define CFS_IDMAP_HASHSIZE      32
+
+enum lustre_idmap_idx {
+       RMT_UIDMAP_IDX,
+       LCL_UIDMAP_IDX,
+       RMT_GIDMAP_IDX,
+       LCL_GIDMAP_IDX,
+       CFS_IDMAP_N_HASHES
+};
+
+struct lustre_idmap_table {
+       spinlock_t      lit_lock;
+       struct list_head        lit_idmaps[CFS_IDMAP_N_HASHES][CFS_IDMAP_HASHSIZE];
+};
+
+struct lu_ucred;
+
+extern void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist);
+extern void lustre_groups_sort(group_info_t *group_info);
+extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp);
+
+extern int lustre_idmap_add(struct lustre_idmap_table *t,
+                           uid_t ruid, uid_t luid,
+                           gid_t rgid, gid_t lgid);
+extern int lustre_idmap_del(struct lustre_idmap_table *t,
+                           uid_t ruid, uid_t luid,
+                           gid_t rgid, gid_t lgid);
+extern int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+                                  struct lustre_idmap_table *t,
+                                  int reverse, uid_t uid);
+extern int lustre_idmap_lookup_gid(struct lu_ucred *mu,
+                                  struct lustre_idmap_table *t,
+                                  int reverse, gid_t gid);
+extern struct lustre_idmap_table *lustre_idmap_init(void);
+extern void lustre_idmap_fini(struct lustre_idmap_table *t);
+
+/** @} idmap */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h
new file mode 100644 (file)
index 0000000..3a5dd6a
--- /dev/null
@@ -0,0 +1,367 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_import PtlRPC import definitions
+ * Imports are the client-side representation of a remote obd target.
+ *
+ * @{
+ */
+
+#ifndef __IMPORT_H
+#define __IMPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <lustre/lustre_idl.h>
+
+
+/**
+ * Adaptive Timeout stuff
+ *
+ * @{
+ */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4                /* "bin" means "N seconds of history" */
+#define AT_FLG_NOHIST 0x1        /* use last reported value only */
+
+struct adaptive_timeout {
+       time_t          at_binstart;     /* bin start time */
+       unsigned int    at_hist[AT_BINS];    /* timeout history bins */
+       unsigned int    at_flags;
+       unsigned int    at_current;       /* current timeout value */
+       unsigned int    at_worst_ever;       /* worst-ever timeout value */
+       time_t          at_worst_time;       /* worst-ever timeout timestamp */
+       spinlock_t      at_lock;
+};
+
+struct ptlrpc_at_array {
+       struct list_head       *paa_reqs_array; /** array to hold requests */
+       __u32        paa_size;       /** the size of array */
+       __u32        paa_count;      /** the total count of reqs */
+       time_t      paa_deadline;   /** the earliest deadline of reqs */
+       __u32       *paa_reqs_count; /** the count of reqs in each entry */
+};
+
+#define IMP_AT_MAX_PORTALS 8
+struct imp_at {
+       int                  iat_portal[IMP_AT_MAX_PORTALS];
+       struct adaptive_timeout iat_net_latency;
+       struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+};
+
+
+/** @} */
+
+/** Possible import states */
+enum lustre_imp_state {
+       LUSTRE_IMP_CLOSED     = 1,
+       LUSTRE_IMP_NEW  = 2,
+       LUSTRE_IMP_DISCON     = 3,
+       LUSTRE_IMP_CONNECTING = 4,
+       LUSTRE_IMP_REPLAY     = 5,
+       LUSTRE_IMP_REPLAY_LOCKS = 6,
+       LUSTRE_IMP_REPLAY_WAIT  = 7,
+       LUSTRE_IMP_RECOVER    = 8,
+       LUSTRE_IMP_FULL       = 9,
+       LUSTRE_IMP_EVICTED    = 10,
+};
+
+/** Returns the text string representation of numeric import state \a state */
+static inline char * ptlrpc_import_state_name(enum lustre_imp_state state)
+{
+       static char* import_state_names[] = {
+               "<UNKNOWN>", "CLOSED",  "NEW", "DISCONN",
+               "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT",
+               "RECOVER", "FULL", "EVICTED",
+       };
+
+       LASSERT (state <= LUSTRE_IMP_EVICTED);
+       return import_state_names[state];
+}
+
+/**
+ * List of import event types
+ */
+enum obd_import_event {
+       IMP_EVENT_DISCON     = 0x808001,
+       IMP_EVENT_INACTIVE   = 0x808002,
+       IMP_EVENT_INVALIDATE = 0x808003,
+       IMP_EVENT_ACTIVE     = 0x808004,
+       IMP_EVENT_OCD   = 0x808005,
+       IMP_EVENT_DEACTIVATE = 0x808006,
+       IMP_EVENT_ACTIVATE   = 0x808007,
+};
+
+/**
+ * Definition of import connection structure
+ */
+struct obd_import_conn {
+       /** Item for linking connections together */
+       struct list_head                oic_item;
+       /** Pointer to actual PortalRPC connection */
+       struct ptlrpc_connection *oic_conn;
+       /** uuid of remote side */
+       struct obd_uuid    oic_uuid;
+       /**
+        * Time (64 bit jiffies) of last connection attempt on this connection
+        */
+       __u64                oic_last_attempt;
+};
+
+/* state history */
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+       enum lustre_imp_state ish_state;
+       time_t          ish_time;
+};
+
+/**
+ * Definition of PortalRPC import structure.
+ * An import is the client-side representation of a remote target.
+ */
+struct obd_import {
+       /** Local handle (== id) for this import. */
+       struct portals_handle     imp_handle;
+       /** Reference counter */
+       atomic_t              imp_refcount;
+       struct lustre_handle      imp_dlm_handle; /* client's ldlm export */
+       /** Currently active connection */
+       struct ptlrpc_connection *imp_connection;
+       /** PortalRPC client structure for this import */
+       struct ptlrpc_client     *imp_client;
+       /** List element for linking into pinger chain */
+       struct list_head                imp_pinger_chain;
+       /** List element for linking into chain for destruction */
+       struct list_head                imp_zombie_chain;
+
+       /**
+        * Lists of requests that are retained for replay, waiting for a reply,
+        * or waiting for recovery to complete, respectively.
+        * @{
+        */
+       struct list_head                imp_replay_list;
+       struct list_head                imp_sending_list;
+       struct list_head                imp_delayed_list;
+       /** @} */
+
+       /** obd device for this import */
+       struct obd_device       *imp_obd;
+
+       /**
+        * some security-related fields
+        * @{
+        */
+       struct ptlrpc_sec       *imp_sec;
+       struct mutex              imp_sec_mutex;
+       cfs_time_t              imp_sec_expire;
+       /** @} */
+
+       /** Wait queue for those who need to wait for recovery completion */
+       wait_queue_head_t              imp_recovery_waitq;
+
+       /** Number of requests currently in-flight */
+       atomic_t              imp_inflight;
+       /** Number of requests currently unregistering */
+       atomic_t              imp_unregistering;
+       /** Number of replay requests inflight */
+       atomic_t              imp_replay_inflight;
+       /** Number of import invalidations currently in progress */
+       atomic_t              imp_inval_count;
+       /** Number of request timeouts */
+       atomic_t              imp_timeouts;
+       /** Current import state */
+       enum lustre_imp_state     imp_state;
+       /** History of import states */
+       struct import_state_hist  imp_state_hist[IMP_STATE_HIST_LEN];
+       int                    imp_state_hist_idx;
+       /** Current import generation. Incremented on every reconnect */
+       int                    imp_generation;
+       /** Incremented every time we send reconnection request */
+       __u32                imp_conn_cnt;
+       /**
+       * \see ptlrpc_free_committed remembers imp_generation value here
+       * after a check to save on unnecessary replay list iterations
+       */
+       int                    imp_last_generation_checked;
+       /** Last transno we replayed */
+       __u64                imp_last_replay_transno;
+       /** Last transno committed on remote side */
+       __u64                imp_peer_committed_transno;
+       /**
+        * \see ptlrpc_free_committed remembers the last_transno it saw at its
+        * last check here; if last_transno has not changed since the last run
+        * of ptlrpc_free_committed and the import generation is the same, we
+        * can skip scanning the replay list for requests to remove, as an
+        * optimisation
+        */
+       __u64                imp_last_transno_checked;
+       /**
+        * Remote export handle. This is how remote side knows what export
+        * we are talking to. Filled from response to connect request
+        */
+       struct lustre_handle      imp_remote_handle;
+       /** When to perform next ping. Time in jiffies. */
+       cfs_time_t              imp_next_ping;
+       /** When we last successfully connected. Time in 64-bit jiffies. */
+       __u64                imp_last_success_conn;
+
+       /** List of all possible connections for this import. */
+       struct list_head                imp_conn_list;
+       /**
+        * Current connection. \a imp_connection is imp_conn_current->oic_conn
+        */
+       struct obd_import_conn   *imp_conn_current;
+
+       /** Protects flags, level, generation, conn_cnt, *_list */
+       spinlock_t                imp_lock;
+
+       /* flags */
+       unsigned long        imp_no_timeout:1, /* timeouts are disabled */
+                                 imp_invalid:1,    /* evicted */
+                                 /* administratively disabled */
+                                 imp_deactive:1,
+                                 /* try to recover the import */
+                                 imp_replayable:1,
+                                 /* don't run recovery (timeout instead) */
+                                 imp_dlm_fake:1,
+                                 /* use 1/2 timeout on MDS' OSCs */
+                                 imp_server_timeout:1,
+                                 /* VBR: imp in delayed recovery */
+                                 imp_delayed_recovery:1,
+                                 /* VBR: if gap was found then no lock replays
+                                  */
+                                 imp_no_lock_replay:1,
+                                 /* recovery by versions has failed */
+                                 imp_vbr_failed:1,
+                                 /* force an immediate ping */
+                                 imp_force_verify:1,
+                                 /* force a scheduled ping */
+                                 imp_force_next_verify:1,
+                                 /* pingable */
+                                 imp_pingable:1,
+                                 /* resend for replay */
+                                 imp_resend_replay:1,
+                                 /* disable normal recovery, for test only. */
+                                 imp_no_pinger_recover:1,
+                                 /* need IR MNE swab */
+                                 imp_need_mne_swab:1,
+                                 /* import must be reconnected instead of
+                                  * choosing a new connection */
+                                 imp_force_reconnect:1,
+                                 /* import has tried to connect with server */
+                                 imp_connect_tried:1;
+       __u32                imp_connect_op;
+       struct obd_connect_data   imp_connect_data;
+       __u64                imp_connect_flags_orig;
+       int                    imp_connect_error;
+
+       __u32                imp_msg_magic;
+       __u32                imp_msghdr_flags;       /* adjusted based on server capability */
+
+       struct ptlrpc_request_pool *imp_rq_pool;          /* emergency request pool */
+
+       struct imp_at        imp_at;             /* adaptive timeout data */
+       time_t              imp_last_reply_time;    /* for health check */
+};
+
+typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
+                                   int event, void *event_arg, void *cb_data);
+
+/**
+ * Structure for import observer.
+ * It is possible to register an "observer" on an import; every time
+ * something happens to the import (like connect/evict/disconnect)
+ * the observer's callback is invoked with the event type.
+ */
+struct obd_import_observer {
+       struct list_head           oio_chain;
+       obd_import_callback  oio_cb;
+       void            *oio_cb_data;
+};
+
+void class_observe_import(struct obd_import *imp, obd_import_callback cb,
+                         void *cb_data);
+void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
+                           void *cb_data);
+void class_notify_import_observers(struct obd_import *imp, int event,
+                                  void *event_arg);
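+
+/*
+ * Illustrative sketch, not part of the original patch: registering an
+ * observer that reacts to connection-state events on an import.  The
+ * callback name is an assumption; CDEBUG/D_HA come from libcfs.
+ */
+static inline void example_import_event_cb(struct obd_import *imp,
+                                           void *closure, int event,
+                                           void *event_arg, void *cb_data)
+{
+        if (event == IMP_EVENT_DISCON || event == IMP_EVENT_ACTIVE)
+                CDEBUG(D_HA, "import %p: connection event %#x\n", imp, event);
+}
+/* A caller would pair class_observe_import(imp, example_import_event_cb,
+ * NULL) with class_unobserve_import() on teardown. */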
+
+/* import.c */
+static inline unsigned int at_est2timeout(unsigned int val)
+{
+       /* add an arbitrary minimum: 125% +5 sec */
+       return (val + (val >> 2) + 5);
+}
+
+static inline unsigned int at_timeout2est(unsigned int val)
+{
+       /* restore estimate value from timeout: e=4/5(t-5) */
+       LASSERT(val);
+       return (max((val << 2) / 5, 5U) - 4);
+}
+
+static inline void at_reset(struct adaptive_timeout *at, int val) {
+       at->at_current = val;
+       at->at_worst_ever = val;
+       at->at_worst_time = cfs_time_current_sec();
+}
+static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
+       memset(at, 0, sizeof(*at));
+       spin_lock_init(&at->at_lock);
+       at->at_flags = flags;
+       at_reset(at, val);
+}
+extern unsigned int at_min;
+static inline int at_get(struct adaptive_timeout *at) {
+       return (at->at_current > at_min) ? at->at_current : at_min;
+}
+int at_measured(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+extern unsigned int at_max;
+#define AT_OFF (at_max == 0)
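+
+/*
+ * Illustrative sketch, not part of the original patch: feeding a measured
+ * service time into an adaptive_timeout and deriving an RPC timeout from
+ * the current estimate (125% + 5 sec, per at_est2timeout() above).
+ */
+static inline unsigned int example_at_update(struct adaptive_timeout *at,
+                                             unsigned int measured_secs)
+{
+        at_measured(at, measured_secs);      /* record in the current bin */
+        return at_est2timeout(at_get(at));   /* estimate -> timeout, >= at_min */
+}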
+
+/* genops.c */
+struct obd_export;
+extern struct obd_import *class_exp2cliimp(struct obd_export *);
+extern struct obd_import *class_conn2cliimp(struct lustre_handle *);
+
+/** @} import */
+
+#endif /* __IMPORT_H */
+
+/** @} obd_import */
diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
new file mode 100644 (file)
index 0000000..bdfc539
--- /dev/null
@@ -0,0 +1,667 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LUSTRE_LIB_H
+#define _LUSTRE_LIB_H
+
+/** \defgroup lib lib
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ver.h>
+#include <lustre_cfg.h>
+#include <linux/lustre_lib.h>
+
+/* target.c */
+struct ptlrpc_request;
+struct obd_export;
+struct lu_target;
+struct l_wait_info;
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lvfs.h>
+
+
+int target_pack_pool_reply(struct ptlrpc_request *req);
+int do_set_info_async(struct obd_import *imp,
+                     int opcode, int version,
+                     obd_count keylen, void *key,
+                     obd_count vallen, void *val,
+                     struct ptlrpc_request_set *set);
+
+#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
+#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
+
+/* client.c */
+
+int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+struct client_obd *client_conn2cli(struct lustre_handle *conn);
+
+struct md_open_data;
+struct obd_client_handle {
+       struct lustre_handle  och_fh;
+       struct lu_fid    och_fid;
+       struct md_open_data  *och_mod;
+       __u32 och_magic;
+       int och_flags;
+};
+#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed
+
+/* statfs_pack.c */
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs);
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs);
+
+/* l_lock.c */
+struct lustre_lock {
+       int                     l_depth;
+       task_t          *l_owner;
+       struct semaphore        l_sem;
+       spinlock_t              l_spin;
+};
+
+void l_lock_init(struct lustre_lock *);
+void l_lock(struct lustre_lock *);
+void l_unlock(struct lustre_lock *);
+int l_has_lock(struct lustre_lock *);
+
+/*
+ * For md echo client
+ */
+enum md_echo_cmd {
+       ECHO_MD_CREATE       = 1, /* Open/Create file on MDT */
+       ECHO_MD_MKDIR   = 2, /* Mkdir on MDT */
+       ECHO_MD_DESTROY      = 3, /* Unlink file on MDT */
+       ECHO_MD_RMDIR   = 4, /* Rmdir on MDT */
+       ECHO_MD_LOOKUP       = 5, /* Lookup on MDT */
+       ECHO_MD_GETATTR      = 6, /* Getattr on MDT */
+       ECHO_MD_SETATTR      = 7, /* Setattr on MDT */
+       ECHO_MD_ALLOC_FID    = 8, /* Get FIDs from MDT */
+};
+
+/*
+ *   OBD IOCTLS
+ */
+#define OBD_IOCTL_VERSION 0x00010004
+
+struct obd_ioctl_data {
+       __u32 ioc_len;
+       __u32 ioc_version;
+
+       union {
+               __u64 ioc_cookie;
+               __u64 ioc_u64_1;
+       };
+       union {
+               __u32 ioc_conn1;
+               __u32 ioc_u32_1;
+       };
+       union {
+               __u32 ioc_conn2;
+               __u32 ioc_u32_2;
+       };
+
+       struct obdo ioc_obdo1;
+       struct obdo ioc_obdo2;
+
+       obd_size ioc_count;
+       obd_off  ioc_offset;
+       __u32    ioc_dev;
+       __u32    ioc_command;
+
+       __u64 ioc_nid;
+       __u32 ioc_nal;
+       __u32 ioc_type;
+
+       /* buffers the kernel will treat as user pointers */
+       __u32  ioc_plen1;
+       char  *ioc_pbuf1;
+       __u32  ioc_plen2;
+       char  *ioc_pbuf2;
+
+       /* inline buffers for various arguments */
+       __u32  ioc_inllen1;
+       char  *ioc_inlbuf1;
+       __u32  ioc_inllen2;
+       char  *ioc_inlbuf2;
+       __u32  ioc_inllen3;
+       char  *ioc_inlbuf3;
+       __u32  ioc_inllen4;
+       char  *ioc_inlbuf4;
+
+       char    ioc_bulk[0];
+};
+
+struct obd_ioctl_hdr {
+       __u32 ioc_len;
+       __u32 ioc_version;
+};
+
+static inline int obd_ioctl_packlen(struct obd_ioctl_data *data)
+{
+       int len = cfs_size_round(sizeof(struct obd_ioctl_data));
+       len += cfs_size_round(data->ioc_inllen1);
+       len += cfs_size_round(data->ioc_inllen2);
+       len += cfs_size_round(data->ioc_inllen3);
+       len += cfs_size_round(data->ioc_inllen4);
+       return len;
+}
+
+
+static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data)
+{
+       if (data->ioc_len > (1<<30)) {
+               CERROR("OBD ioctl: ioc_len larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen1 > (1<<30)) {
+               CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen2 > (1<<30)) {
+               CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen3 > (1<<30)) {
+               CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen4 > (1<<30)) {
+               CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+               CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+               CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf3 && !data->ioc_inllen3) {
+               CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf4 && !data->ioc_inllen4) {
+               CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_pbuf1 && !data->ioc_plen1) {
+               CERROR("OBD ioctl: pbuf1 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_pbuf2 && !data->ioc_plen2) {
+               CERROR("OBD ioctl: pbuf2 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_plen1 && !data->ioc_pbuf1) {
+               CERROR("OBD ioctl: plen1 set but NULL pointer\n");
+               return 1;
+       }
+       if (data->ioc_plen2 && !data->ioc_pbuf2) {
+               CERROR("OBD ioctl: plen2 set but NULL pointer\n");
+               return 1;
+       }
+       if (obd_ioctl_packlen(data) > data->ioc_len) {
+               CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n",
+                      obd_ioctl_packlen(data), data->ioc_len);
+               return 1;
+       }
+       return 0;
+}
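+
+/*
+ * Illustrative sketch, not part of the original patch: filling in an
+ * obd_ioctl_data so that it passes the checks above.  ioc_len must cover
+ * the header plus every (rounded) inline buffer, and each inline buffer
+ * pointer needs a matching non-zero length.  Assumes *data is pre-zeroed;
+ * the function name is an assumption.
+ */
+static inline int example_obd_ioctl_fill(struct obd_ioctl_data *data,
+                                         char *name, __u32 namelen)
+{
+        data->ioc_version = OBD_IOCTL_VERSION;
+        data->ioc_inlbuf1 = name;
+        data->ioc_inllen1 = namelen;
+        data->ioc_len     = obd_ioctl_packlen(data);
+
+        return obd_ioctl_is_invalid(data) ? -EINVAL : 0;
+}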
+
+
+#include <obd_support.h>
+
+/* function defined in lustre/obdclass/<platform>/<platform>-module.c */
+int obd_ioctl_getdata(char **buf, int *len, void *arg);
+int obd_ioctl_popdata(void *arg, void *data, int len);
+
+static inline void obd_ioctl_freedata(char *buf, int len)
+{
+       ENTRY;
+
+       OBD_FREE_LARGE(buf, len);
+       EXIT;
+       return;
+}
+
+/*
+ * BSD ioctl description:
+ * #define IOC_V1       _IOR(g, n1, long)
+ * #define IOC_V2       _IOW(g, n2, long)
+ *
+ * ioctl(f, IOC_V1, arg);
+ * arg will be treated as a long value,
+ *
+ * ioctl(f, IOC_V2, arg)
+ * arg will be treated as a pointer, BSD will call
+ * copyin(buf, arg, sizeof(long))
+ *
+ * To make BSD ioctl handle the argument correctly and simply,
+ * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data
+ * for us. Does this change affect Linux?  (XXX Liang)
+ */
+#define OBD_IOC_CREATE          _IOWR('f', 101, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DESTROY                _IOW ('f', 104, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PREALLOCATE        _IOWR('f', 105, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_SETATTR                _IOW ('f', 107, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETATTR                _IOWR ('f', 108, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ              _IOWR('f', 109, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_WRITE            _IOWR('f', 110, OBD_IOC_DATA_TYPE)
+
+
+#define OBD_IOC_STATFS          _IOWR('f', 113, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SYNC              _IOW ('f', 114, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ2            _IOWR('f', 115, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FORMAT          _IOWR('f', 116, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARTITION            _IOWR('f', 117, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_COPY              _IOWR('f', 120, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_MIGR              _IOWR('f', 121, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PUNCH            _IOWR('f', 122, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_MODULE_DEBUG      _IOWR('f', 124, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_READ              _IOWR('f', 125, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_WRITE            _IOWR('f', 126, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_NAME2DEV              _IOWR('f', 127, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_UUID2DEV              _IOWR('f', 130, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GETNAME                _IOWR('f', 131, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETMDNAME            _IOR('f', 131, char[MAX_OBD_NAME])
+#define OBD_IOC_GETDTNAME             OBD_IOC_GETNAME
+
+#define OBD_IOC_LOV_GET_CONFIG  _IOWR('f', 132, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLIENT_RECOVER  _IOW ('f', 133, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PING_TARGET        _IOW ('f', 136, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_DEC_FS_USE_COUNT       _IO  ('f', 139      )
+#define OBD_IOC_NO_TRANSNO          _IOW ('f', 140, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SET_READONLY      _IOW ('f', 141, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ABORT_RECOVERY  _IOR ('f', 142, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_ROOT_SQUASH        _IOWR('f', 143, OBD_IOC_DATA_TYPE)
+
+#define OBD_GET_VERSION                _IOWR ('f', 144, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GSS_SUPPORT        _IOWR('f', 145, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CLOSE_UUID          _IOWR ('f', 147, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CHANGELOG_SEND  _IOW ('f', 148, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETDEVICE            _IOWR ('f', 149, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FID2PATH              _IOWR ('f', 150, OBD_IOC_DATA_TYPE)
+/* see also <lustre/lustre_user.h> for ioctls 151-153 */
+/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */
+#define OBD_IOC_LOV_SETSTRIPE    _IOW ('f', 154, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */
+#define OBD_IOC_LOV_GETSTRIPE    _IOW ('f', 155, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */
+#define OBD_IOC_LOV_SETEA            _IOW ('f', 156, OBD_IOC_DATA_TYPE)
+/* see <lustre/lustre_user.h> for ioctls 157-159 */
+/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */
+#define OBD_IOC_QUOTACHECK          _IOW ('f', 160, int)
+/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */
+#define OBD_IOC_POLL_QUOTACHECK        _IOR ('f', 161, struct if_quotacheck *)
+/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */
+#define OBD_IOC_QUOTACTL              _IOWR('f', 162, struct if_quotactl)
+/* see  also <lustre/lustre_user.h> for ioctls 163-176 */
+#define OBD_IOC_CHANGELOG_REG    _IOW ('f', 177, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_DEREG        _IOW ('f', 178, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_CLEAR        _IOW ('f', 179, struct obd_ioctl_data)
+#define OBD_IOC_RECORD          _IOWR('f', 180, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ENDRECORD            _IOWR('f', 181, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARSE            _IOWR('f', 182, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DORECORD              _IOWR('f', 183, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PROCESS_CFG        _IOWR('f', 184, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DUMP_LOG              _IOWR('f', 185, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLEAR_LOG            _IOWR('f', 186, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARAM            _IOW ('f', 187, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_POOL              _IOWR('f', 188, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_REPLACE_NIDS      _IOWR('f', 189, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CATLOGLIST          _IOWR('f', 190, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_INFO            _IOWR('f', 191, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_PRINT          _IOWR('f', 192, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CANCEL        _IOWR('f', 193, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_REMOVE        _IOWR('f', 194, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CHECK          _IOWR('f', 195, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LLOG_CATINFO is deprecated */
+#define OBD_IOC_LLOG_CATINFO      _IOWR('f', 196, OBD_IOC_DATA_TYPE)
+
+#define ECHO_IOC_GET_STRIPE        _IOWR('f', 200, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_SET_STRIPE        _IOWR('f', 201, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_ENQUEUE              _IOWR('f', 202, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_CANCEL                _IOWR('f', 203, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GET_OBJ_VERSION        _IOR('f', 210, OBD_IOC_DATA_TYPE)
+
+/* <lustre/lustre_user.h> defines ioctl number 218-219 */
+#define OBD_IOC_GET_MNTOPT          _IOW('f', 220, mntopt_t)
+
+#define OBD_IOC_ECHO_MD                _IOR('f', 221, struct obd_ioctl_data)
+#define OBD_IOC_ECHO_ALLOC_SEQ  _IOWR('f', 222, struct obd_ioctl_data)
+
+#define OBD_IOC_START_LFSCK           _IOWR('f', 230, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_STOP_LFSCK            _IOW('f', 231, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PAUSE_LFSCK           _IOW('f', 232, OBD_IOC_DATA_TYPE)
+
+/* XXX _IOWR('f', 250, long) has been defined in
+ * libcfs/include/libcfs/libcfs_private.h for debug, don't use it
+ */
+
+/* Until such time as we get_info the per-stripe maximum from the OST,
+ * we define this to be 2T - 4k, which is the ext3 maxbytes. */
+#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL
+
+/* Special values for remove LOV EA from disk */
+#define LOVEA_DELETE_VALUES(size, count, offset) (size == 0 && count == 0 && \
+                                                offset == (typeof(offset))(-1))
+
+/* #define POISON_BULK 0 */
+
+/*
+ * l_wait_event is a flexible sleeping function, permitting simple caller
+ * configuration of interrupt and timeout sensitivity along with actions to
+ * be performed in the event of either exception.
+ *
+ * The first form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler,
+ *                                        intr_handler, callback_data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * l_wait_event() makes the current process wait on 'waitq' until 'condition'
+ * is TRUE or a "killable" signal (SIGTERM, SIGKILL, SIGINT) is pending.  It
+ * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before
+ * 'condition' becomes true, it optionally calls the specified 'intr_handler'
+ * if not NULL, and returns -EINTR.
+ *
+ * If a non-zero timeout is specified, signals are ignored until the timeout
+ * has expired.  At this time, if 'timeout_handler' is not NULL it is called.
+ * If it returns FALSE l_wait_event() continues to wait as described above with
+ * signals enabled.  Otherwise it returns -ETIMEDOUT.
+ *
+ * LWI_INTR(intr_handler, callback_data) is shorthand for
+ * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data)
+ *
+ * The second form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * This form is the same as the first except that it COMPLETELY IGNORES
+ * SIGNALS.  The caller must therefore beware that if 'timeout' is zero, or if
+ * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that
+ * can unblock the current process is 'condition' becoming TRUE.
+ *
+ * Another form of usage is:
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval,
+ *                                            timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ * This is the same as previous case, but condition is checked once every
+ * 'interval' jiffies (if non-zero).
+ *
+ * Subtle synchronization point: this macro does *not* necessarily take the
+ * wait-queue spin-lock before returning and, hence, the following idiom is
+ * safe ONLY when the caller provides some external locking:
+ *
+ *          Thread1                        Thread2
+ *
+ *   l_wait_event(&obj->wq, ....);                                    (1)
+ *
+ *                                 wake_up(&obj->wq):           (2)
+ *                                      spin_lock(&q->lock);     (2.1)
+ *                                      __wake_up_common(q, ...);     (2.2)
+ *                                      spin_unlock(&q->lock, flags); (2.3)
+ *
+ *   OBD_FREE_PTR(obj);                                                  (3)
+ *
+ * As l_wait_event() may "short-cut" execution and return without taking
+ * wait-queue spin-lock, some additional synchronization is necessary to
+ * guarantee that step (3) can begin only after (2.3) finishes.
+ *
+ * XXX nikita: some ptlrpc daemon threads have races of that sort.
+ *
+ */
+static inline int back_to_sleep(void *arg)
+{
+       return 0;
+}
+
+#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1))
+
+struct l_wait_info {
+       cfs_duration_t lwi_timeout;
+       cfs_duration_t lwi_interval;
+       int         lwi_allow_intr;
+       int  (*lwi_on_timeout)(void *);
+       void (*lwi_on_signal)(void *);
+       void  *lwi_cb_data;
+};
+
+/* NB: LWI_TIMEOUT ignores signals completely */
+#define LWI_TIMEOUT(time, cb, data)         \
+((struct l_wait_info) {                         \
+       .lwi_timeout    = time,          \
+       .lwi_on_timeout = cb,              \
+       .lwi_cb_data    = data,          \
+       .lwi_interval   = 0,                \
+       .lwi_allow_intr = 0                  \
+})
+
+#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data)  \
+((struct l_wait_info) {                                 \
+       .lwi_timeout    = time,                  \
+       .lwi_on_timeout = cb,                      \
+       .lwi_cb_data    = data,                  \
+       .lwi_interval   = interval,                  \
+       .lwi_allow_intr = 0                          \
+})
+
+#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data)   \
+((struct l_wait_info) {                                 \
+       .lwi_timeout    = time,                  \
+       .lwi_on_timeout = time_cb,                    \
+       .lwi_on_signal  = sig_cb,                      \
+       .lwi_cb_data    = data,                  \
+       .lwi_interval   = 0,                        \
+       .lwi_allow_intr = 0                          \
+})
+
+#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data)       \
+((struct l_wait_info) {                                         \
+       .lwi_timeout    = time,                          \
+       .lwi_on_timeout = time_cb,                            \
+       .lwi_on_signal  = sig_cb,                              \
+       .lwi_cb_data    = data,                          \
+       .lwi_interval   = 0,                                \
+       .lwi_allow_intr = 1                                  \
+})
+
+#define LWI_INTR(cb, data)  LWI_TIMEOUT_INTR(0, NULL, cb, data)
+
+
+/*
+ * wait for @condition to become true, but no longer than timeout, specified
+ * by @info.
+ */
+#define __l_wait_event(wq, condition, info, ret, l_add_wait)              \
+do {                                                                      \
+       wait_queue_t __wait;                                             \
+       cfs_duration_t __timeout = info->lwi_timeout;                     \
+       sigset_t   __blocked;                                         \
+       int   __allow_intr = info->lwi_allow_intr;                           \
+                                                                              \
+       ret = 0;                                                               \
+       if (condition)                                                   \
+               break;                                                   \
+                                                                              \
+       init_waitqueue_entry_current(&__wait);                                      \
+       l_add_wait(&wq, &__wait);                                             \
+                                                                              \
+       /* Block all signals (just the non-fatal ones if no timeout). */       \
+       if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr))   \
+               __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);             \
+       else                                                               \
+               __blocked = cfs_block_sigsinv(0);                             \
+                                                                              \
+       for (;;) {                                                           \
+               unsigned       __wstate;                                       \
+                                                                              \
+               __wstate = info->lwi_on_signal != NULL &&                     \
+                          (__timeout == 0 || __allow_intr) ?             \
+                       TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;             \
+                                                                              \
+               set_current_state(TASK_INTERRUPTIBLE);           \
+                                                                              \
+               if (condition)                                           \
+                       break;                                           \
+                                                                              \
+               if (__timeout == 0) {                                     \
+                       waitq_wait(&__wait, __wstate);               \
+               } else {                                                       \
+                       cfs_duration_t interval = info->lwi_interval?     \
+                                            min_t(cfs_duration_t,           \
+                                                info->lwi_interval,__timeout):\
+                                            __timeout;                 \
+                       cfs_duration_t remaining = waitq_timedwait(&__wait,\
+                                                  __wstate,               \
+                                                  interval);             \
+                       __timeout = cfs_time_sub(__timeout,                 \
+                                           cfs_time_sub(interval, remaining));\
+                       if (__timeout == 0) {                             \
+                               if (info->lwi_on_timeout == NULL ||         \
+                                   info->lwi_on_timeout(info->lwi_cb_data)) { \
+                                       ret = -ETIMEDOUT;                     \
+                                       break;                           \
+                               }                                             \
+                               /* Take signals after the timeout expires. */  \
+                               if (info->lwi_on_signal != NULL)               \
+                                   (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\
+                       }                                                     \
+               }                                                             \
+                                                                              \
+               if (condition)                                           \
+                       break;                                           \
+               if (cfs_signal_pending()) {                                 \
+                       if (info->lwi_on_signal != NULL &&                   \
+                           (__timeout == 0 || __allow_intr)) {         \
+                               if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \
+                                       info->lwi_on_signal(info->lwi_cb_data);\
+                               ret = -EINTR;                             \
+                               break;                                   \
+                       }                                                     \
+                       /* We have to do this here because some signals */     \
+                       /* are not blockable - ie from strace(1).       */     \
+                       /* In these cases we want to schedule_timeout() */     \
+                       /* again, because we don't want that to return  */     \
+                       /* -EINTR when the RPC actually succeeded.      */     \
+                       /* the recalc_sigpending() below will deliver the */     \
+                       /* signal properly.                          */     \
+                       cfs_clear_sigpending();                         \
+               }                                                             \
+       }                                                                     \
+                                                                              \
+       cfs_restore_sigs(__blocked);                                       \
+                                                                              \
+       set_current_state(TASK_RUNNING);                               \
+       remove_wait_queue(&wq, &__wait);                                           \
+} while (0)
+
+
+
+#define l_wait_event(wq, condition, info)                     \
+({                                                           \
+       int              __ret;                       \
+       struct l_wait_info *__info = (info);                \
+                                                               \
+       __l_wait_event(wq, condition, __info,              \
+                      __ret, add_wait_queue);             \
+       __ret;                                            \
+})
+
+#define l_wait_event_exclusive(wq, condition, info)         \
+({                                                           \
+       int              __ret;                       \
+       struct l_wait_info *__info = (info);                \
+                                                               \
+       __l_wait_event(wq, condition, __info,              \
+                      __ret, add_wait_queue_exclusive);         \
+       __ret;                                            \
+})
+
+#define l_wait_event_exclusive_head(wq, condition, info)       \
+({                                                           \
+       int              __ret;                       \
+       struct l_wait_info *__info = (info);                \
+                                                               \
+       __l_wait_event(wq, condition, __info,              \
+                      __ret, add_wait_queue_exclusive_head);    \
+       __ret;                                            \
+})
+
+#define l_wait_condition(wq, condition)                         \
+({                                                           \
+       struct l_wait_info lwi = { 0 };                  \
+       l_wait_event(wq, condition, &lwi);                    \
+})
+
+#define l_wait_condition_exclusive(wq, condition)             \
+({                                                           \
+       struct l_wait_info lwi = { 0 };                  \
+       l_wait_event_exclusive(wq, condition, &lwi);        \
+})
+
+#define l_wait_condition_exclusive_head(wq, condition)   \
+({                                                           \
+       struct l_wait_info lwi = { 0 };                  \
+       l_wait_event_exclusive_head(wq, condition, &lwi);       \
+})
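+
+/*
+ * Illustrative sketch, not part of the original patch: the "first form"
+ * described in the comment above -- wait for a flag with a timeout, taking
+ * fatal signals only, using LWI_TIMEOUT_INTR.  The 30 second timeout and
+ * the function name are arbitrary example values.
+ */
+static inline int example_wait_for_flag(wait_queue_head_t *waitq, int *flag)
+{
+        struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
+                                                  LWI_ON_SIGNAL_NOOP, NULL);
+
+        /* 0 when *flag became non-zero, -ETIMEDOUT or -EINTR otherwise */
+        return l_wait_event(*waitq, *flag != 0, &lwi);
+}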
+
+#define LIBLUSTRE_CLIENT (0)
+
+/** @} lib */
+
+#endif /* _LUSTRE_LIB_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h
new file mode 100644 (file)
index 0000000..5790be9
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: di wang <di.wang@intel.com>
+ */
+
+struct linkea_data {
+       /**
+        * Buffer to keep link EA body.
+        */
+       struct lu_buf           *ld_buf;
+       /**
+        * The matched header, entry and its length in the EA
+        */
+       struct link_ea_header   *ld_leh;
+       struct link_ea_entry    *ld_lee;
+       int                     ld_reclen;
+};
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf);
+int linkea_init(struct linkea_data *ldata);
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+                        struct lu_name *lname, struct lu_fid *pfid);
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+                  const struct lu_fid *pfid);
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname);
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+                     const struct lu_fid  *pfid);
+
+#define LINKEA_NEXT_ENTRY(ldata)       \
+       (struct link_ea_entry *)((char *)ldata.ld_lee + ldata.ld_reclen)
+
+#define LINKEA_FIRST_ENTRY(ldata)      \
+       (struct link_ea_entry *)(ldata.ld_leh + 1)
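+
+/*
+ * Illustrative sketch, not part of the original patch: walking every entry
+ * of a link EA after linkea_init().  leh_reccount is taken from struct
+ * link_ea_header in lustre_idl.h; the function name is an assumption.  The
+ * LINKEA_* macros above expect a struct linkea_data variable, not a pointer.
+ */
+static inline void example_linkea_walk(struct linkea_data ldata)
+{
+        int i;
+
+        ldata.ld_lee = LINKEA_FIRST_ENTRY(ldata);
+        for (i = 0; i < ldata.ld_leh->leh_reccount; i++) {
+                struct lu_name lname;
+                struct lu_fid  pfid;
+
+                linkea_entry_unpack(ldata.ld_lee, &ldata.ld_reclen,
+                                    &lname, &pfid);
+                /* lname/pfid now describe one hard link (name, parent fid) */
+                ldata.ld_lee = LINKEA_NEXT_ENTRY(ldata);
+        }
+}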
diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h
new file mode 100644 (file)
index 0000000..25f8bfa
--- /dev/null
@@ -0,0 +1,147 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LL_H
+#define _LL_H
+
+/** \defgroup lite lite
+ *
+ * @{
+ */
+
+#include <linux/lustre_lite.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre_net.h>
+#include <lustre_mds.h>
+#include <lustre_ha.h>
+
+/* 4UL * 1024 * 1024 */
+#define LL_MAX_BLKSIZE_BITS     (22)
+#define LL_MAX_BLKSIZE   (1UL<<LL_MAX_BLKSIZE_BITS)
+
+#include <lustre/lustre_user.h>
+
+
+struct lustre_rw_params {
+       int             lrp_lock_mode;
+       ldlm_policy_data_t lrp_policy;
+       obd_flag           lrp_brw_flags;
+       int             lrp_ast_flags;
+};
+
+/*
+ * XXX nikita: this function lives in the header because it is used by both
+ * the llite kernel module and the liblustre library, and there is no (?)
+ * better place to put it.
+ */
+static inline void lustre_build_lock_params(int cmd, unsigned long open_flags,
+                                           __u64 connect_flags,
+                                           loff_t pos, ssize_t len,
+                                           struct lustre_rw_params *params)
+{
+       params->lrp_lock_mode = (cmd == OBD_BRW_READ) ? LCK_PR : LCK_PW;
+       params->lrp_brw_flags = 0;
+
+       params->lrp_policy.l_extent.start = pos;
+       params->lrp_policy.l_extent.end = pos + len - 1;
+       /*
+        * for now O_APPEND always takes local locks.
+        */
+       if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) {
+               params->lrp_policy.l_extent.start = 0;
+               params->lrp_policy.l_extent.end   = OBD_OBJECT_EOF;
+       } else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) {
+               /*
+                * liblustre: OST-side locking for all non-O_APPEND
+                * reads/writes.
+                */
+               params->lrp_lock_mode = LCK_NL;
+               params->lrp_brw_flags = OBD_BRW_SRVLOCK;
+       } else {
+               /*
+                * nothing special for the kernel. In the future llite may use
+                * OST-side locks for small writes into highly contended
+                * files.
+                */
+       }
+       params->lrp_ast_flags = (open_flags & O_NONBLOCK) ?
+               LDLM_FL_BLOCK_NOWAIT : 0;
+}
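+
+/*
+ * Illustrative sketch, not part of the original patch: a write path asking
+ * for lock parameters before enqueueing an extent lock.  The function name
+ * and arguments are assumptions for the example.
+ */
+static inline void example_write_lock_params(unsigned long open_flags,
+                                             __u64 connect_flags,
+                                             loff_t pos, ssize_t count,
+                                             struct lustre_rw_params *params)
+{
+        lustre_build_lock_params(OBD_BRW_WRITE, open_flags, connect_flags,
+                                 pos, count, params);
+        /* params->lrp_lock_mode is LCK_PW here; O_APPEND widens the extent
+         * to [0, OBD_OBJECT_EOF] as handled above */
+}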
+
+/*
+ * This is embedded into liblustre and llite super-blocks to keep track of
+ * the connect flags (capabilities) supported by all imports a given mount is
+ * connected to.
+ */
+struct lustre_client_ocd {
+       /*
+        * This is conjunction of connect_flags across all imports (LOVs) this
+        * mount is connected to. This field is updated by cl_ocd_update()
+        * under ->lco_lock.
+        */
+       __u64         lco_flags;
+       struct mutex       lco_lock;
+       struct obd_export *lco_md_exp;
+       struct obd_export *lco_dt_exp;
+};
+
+/*
+ * Chain of hash overflow pages.
+ */
+struct ll_dir_chain {
+       /* XXX something. Later */
+};
+
+static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
+{
+}
+
+static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
+{
+}
+
+static inline unsigned long hash_x_index(__u64 hash, int hash64)
+{
+       if (BITS_PER_LONG == 32 && hash64)
+               hash >>= 32;
+       return ~0UL - hash;
+}
+
+/** @} lite */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h
new file mode 100644 (file)
index 0000000..714ab37
--- /dev/null
@@ -0,0 +1,576 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *
+ * - orphan recovery: OST adds record on create
+ * - mtime/size consistency: the OST adds a record on first write
+ * - open/unlinked objects: OST adds a record on destroy
+ *
+ * - mds unlink log: the MDS adds an entry upon delete
+ *
+ * - raid1 replication log between OST's
+ * - MDS replication logs
+ */
+
+#ifndef _LUSTRE_LOG_H
+#define _LUSTRE_LOG_H
+
+/** \defgroup log log
+ *
+ * @{
+ */
+
+#include <linux/lustre_log.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#define LOG_NAME_LIMIT(logname, name)             \
+       snprintf(logname, sizeof(logname), "LOGS/%s", name)
+#define LLOG_EEMPTY 4711
+
+enum llog_open_param {
+       LLOG_OPEN_EXISTS        = 0x0000,
+       LLOG_OPEN_NEW           = 0x0001,
+};
+
+struct plain_handle_data {
+       struct list_head          phd_entry;
+       struct llog_handle *phd_cat_handle;
+       struct llog_cookie  phd_cookie; /* cookie of this log in its cat */
+};
+
+struct cat_handle_data {
+       struct list_head              chd_head;
+       struct llog_handle     *chd_current_log; /* currently open log */
+       struct llog_handle      *chd_next_log; /* llog to be used next */
+};
+
+static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid)
+{
+       /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS)
+        * logids by a non-zero ogen (inode generation) and convert them
+        * into IGIFs */
+       if (id->lgl_ogen == 0) {
+               fid->f_seq = id->lgl_oi.oi.oi_seq;
+               fid->f_oid = id->lgl_oi.oi.oi_id;
+               fid->f_ver = 0;
+       } else {
+               lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen);
+       }
+}
+
+static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id)
+{
+       id->lgl_oi.oi.oi_seq = fid->f_seq;
+       id->lgl_oi.oi.oi_id = fid->f_oid;
+       id->lgl_ogen = 0;
+}
+
+static inline void logid_set_id(struct llog_logid *log_id, __u64 id)
+{
+       log_id->lgl_oi.oi.oi_id = id;
+}
+
+static inline __u64 logid_id(struct llog_logid *log_id)
+{
+       return log_id->lgl_oi.oi.oi_id;
+}
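+
+/*
+ * Illustrative sketch, not part of the original patch: the helpers above
+ * round-trip for post-OSD logids, since fid_to_logid() always produces
+ * lgl_ogen == 0 and logid_to_fid() then takes the non-IGIF branch.
+ */
+static inline void example_fid_logid_roundtrip(struct lu_fid *fid)
+{
+        struct llog_logid id;
+        struct lu_fid     out;
+
+        fid_to_logid(fid, &id);
+        logid_to_fid(&id, &out);
+        /* out.f_seq == fid->f_seq, out.f_oid == fid->f_oid, out.f_ver == 0 */
+}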
+
+struct llog_handle;
+
+/* llog.c  -  general API */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+                    int flags, struct obd_uuid *uuid);
+int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh,
+                     struct llog_rec_hdr *rec, void *data);
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+                llog_cb_t cb, void *data, void *catdata);
+int llog_process_or_fork(const struct lu_env *env,
+                        struct llog_handle *loghandle,
+                        llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_reverse_process(const struct lu_env *env,
+                        struct llog_handle *loghandle, llog_cb_t cb,
+                        void *data, void *catdata);
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+                   int index);
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+             struct llog_handle **lgh, struct llog_logid *logid,
+             char *name, enum llog_open_param open_param);
+int llog_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_get_size(struct llog_handle *loghandle);
+
+/* llog_process flags */
+#define LLOG_FLAG_NODEAMON 0x0001
+
+/* llog_cat.c - catalog api */
+struct llog_process_data {
+       /**
+        * Any useful data needed while processing catalog. This is
+        * passed later to process callback.
+        */
+       void            *lpd_data;
+       /**
+        * Catalog process callback function, called for each record
+        * in catalog.
+        */
+       llog_cb_t           lpd_cb;
+       /**
+        * Start processing the catalog from startcat/startidx
+        */
+       int               lpd_startcat;
+       int               lpd_startidx;
+};
+
+struct llog_process_cat_data {
+       /**
+        * Temporary stored first_idx while scanning log.
+        */
+       int               lpcd_first_idx;
+       /**
+        * Temporary stored last_idx while scanning log.
+        */
+       int               lpcd_last_idx;
+};
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+                    struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+                    void *buf, struct thandle *th);
+int llog_cat_declare_add_rec(const struct lu_env *env,
+                            struct llog_handle *cathandle,
+                            struct llog_rec_hdr *rec, struct thandle *th);
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+                struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+                void *buf);
+int llog_cat_cancel_records(const struct lu_env *env,
+                           struct llog_handle *cathandle, int count,
+                           struct llog_cookie *cookies);
+int llog_cat_process_or_fork(const struct lu_env *env,
+                            struct llog_handle *cat_llh, llog_cb_t cb,
+                            void *data, int startcat, int startidx, bool fork);
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+                    llog_cb_t cb, void *data, int startcat, int startidx);
+int llog_cat_reverse_process(const struct lu_env *env,
+                            struct llog_handle *cat_llh, llog_cb_t cb,
+                            void *data);
+int llog_cat_init_and_process(const struct lu_env *env,
+                             struct llog_handle *llh);
+
+/* llog_obd.c */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+              struct obd_llog_group *olg, int index,
+              struct obd_device *disk_obd, struct llog_operations *op);
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt);
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *);
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags);
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+                struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+                struct llog_cookie *logcookies, int numcookies);
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+               struct lov_stripe_md *lsm, int count,
+               struct llog_cookie *cookies, int flags);
+
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                 struct obd_device *disk_obd, int *idx);
+
+int obd_llog_finish(struct obd_device *obd, int count);
+
+/* llog_ioctl.c */
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+              struct obd_ioctl_data *data);
+
+/* llog_net.c */
+int llog_initiator_connect(struct llog_ctxt *ctxt);
+
+struct llog_operations {
+       int (*lop_destroy)(const struct lu_env *env,
+                          struct llog_handle *handle);
+       int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h,
+                             int *curr_idx, int next_idx, __u64 *offset,
+                             void *buf, int len);
+       int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h,
+                             int prev_idx, void *buf, int len);
+       int (*lop_read_header)(const struct lu_env *env,
+                              struct llog_handle *handle);
+       int (*lop_setup)(const struct lu_env *env, struct obd_device *obd,
+                        struct obd_llog_group *olg, int ctxt_idx,
+                        struct obd_device *disk_obd);
+       int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp,
+                       int flags);
+       int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt);
+       int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt,
+                         struct lov_stripe_md *lsm, int count,
+                         struct llog_cookie *cookies, int flags);
+       int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid,
+                          struct llog_gen *gen, struct obd_uuid *uuid);
+       /**
+        * Any llog file must be opened first using llog_open().  A llog can
+        * be opened by name, by logid, or with neither; in the last case a
+        * new logid will be generated.
+        */
+       int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh,
+                       struct llog_logid *logid, char *name,
+                       enum llog_open_param);
+       /**
+        * An opened llog may not exist; this must be checked where needed
+        * using the llog_exist() call.
+        */
+       int (*lop_exist)(struct llog_handle *lgh);
+       /**
+        * Close the llog file and call llog_free_handle() implicitly.
+        * Any opened llog must be closed with a llog_close() call.
+        */
+       int (*lop_close)(const struct lu_env *env, struct llog_handle *handle);
+       /**
+        * Create a new llog file. The llog must already be opened.
+        * Used only for local llog operations.
+        */
+       int (*lop_declare_create)(const struct lu_env *env,
+                                 struct llog_handle *handle,
+                                 struct thandle *th);
+       int (*lop_create)(const struct lu_env *env, struct llog_handle *handle,
+                         struct thandle *th);
+       /**
+        * Write a new record to the llog. Usually appends records, but can
+        * also edit existing ones.
+        */
+       int (*lop_declare_write_rec)(const struct lu_env *env,
+                                    struct llog_handle *lgh,
+                                    struct llog_rec_hdr *rec,
+                                    int idx, struct thandle *th);
+       int (*lop_write_rec)(const struct lu_env *env,
+                            struct llog_handle *loghandle,
+                            struct llog_rec_hdr *rec,
+                            struct llog_cookie *cookie, int cookiecount,
+                            void *buf, int idx, struct thandle *th);
+       /**
+        * Add a new record to the llog catalog. Does the same as
+        * llog_write_rec(), but through the llog catalog.
+        */
+       int (*lop_declare_add)(const struct lu_env *env,
+                              struct llog_handle *lgh,
+                              struct llog_rec_hdr *rec, struct thandle *th);
+       int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh,
+                      struct llog_rec_hdr *rec, struct llog_cookie *cookie,
+                      void *buf, struct thandle *th);
+       /* Old llog_add version, used in MDS-LOV-OSC now; it will go away with
+        * the LOD/OSP replacement */
+       int (*lop_obd_add)(const struct lu_env *env, struct llog_ctxt *ctxt,
+                          struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+                          struct llog_cookie *logcookies, int numcookies);
+};
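The lop_declare_*/lop_* pairs above follow the usual two-phase transaction pattern: reserve credits against a transaction handle first, then execute once the transaction has started. Below is a minimal sketch of driving the create pair through the llog_declare_create()/llog_create() wrappers declared further down in this header; the dt_trans_create()/dt_trans_start()/dt_trans_stop() helpers are assumed to come from dt_object.h, and error handling is abbreviated.

/* Illustrative sketch only, not part of the patch. */
static int example_llog_create_in_trans(const struct lu_env *env,
                                        struct dt_device *dev,
                                        struct llog_handle *lgh)
{
        struct thandle *th;
        int rc;

        th = dt_trans_create(env, dev);         /* allocate a transaction handle */
        if (IS_ERR(th))
                return PTR_ERR(th);

        rc = llog_declare_create(env, lgh, th); /* phase 1: reserve credits */
        if (rc == 0) {
                rc = dt_trans_start(env, dev, th);
                if (rc == 0)
                        rc = llog_create(env, lgh, th); /* phase 2: execute */
        }
        dt_trans_stop(env, dev, th);            /* commit or abort */
        return rc;
}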
+
+/* In-memory descriptor for a log object or log catalog */
+struct llog_handle {
+       struct rw_semaphore      lgh_lock;
+       spinlock_t               lgh_hdr_lock; /* protect lgh_hdr data */
+       struct llog_logid        lgh_id; /* id of this log */
+       struct llog_log_hdr     *lgh_hdr;
+       struct file             *lgh_file;
+       struct dt_object        *lgh_obj;
+       int                      lgh_last_idx;
+       int                      lgh_cur_idx; /* used during llog_process */
+       __u64                    lgh_cur_offset; /* used during llog_process */
+       struct llog_ctxt        *lgh_ctxt;
+       union {
+               struct plain_handle_data         phd;
+               struct cat_handle_data           chd;
+       } u;
+       char                    *lgh_name;
+       void                    *private_data;
+       struct llog_operations  *lgh_logops;
+       atomic_t                 lgh_refcount;
+};
+
+/* llog_lvfs.c */
+extern struct llog_operations llog_lvfs_ops;
+
+/* llog_osd.c */
+extern struct llog_operations llog_osd_ops;
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+                         int idx, int count,
+                         struct llog_catid *idarray);
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+                         int idx, int count,
+                         struct llog_catid *idarray);
+
+#define LLOG_CTXT_FLAG_UNINITIALIZED     0x00000001
+#define LLOG_CTXT_FLAG_STOP             0x00000002
+
+struct llog_ctxt {
+       int                      loc_idx; /* my index in the obd array of ctxts */
+       struct obd_device       *loc_obd; /* points back to the containing obd */
+       struct obd_llog_group   *loc_olg; /* group containing this ctxt */
+       struct obd_export       *loc_exp; /* parent "disk" export (e.g. MDS) */
+       struct obd_import       *loc_imp; /* import to use in RPCs; can be a
+                                            backward-pointing import */
+       struct llog_operations  *loc_logops;
+       struct llog_handle      *loc_handle;
+       struct mutex             loc_mutex; /* protect loc_imp */
+       atomic_t             loc_refcount;
+       long                 loc_flags; /* flags, see above defines */
+       struct dt_object        *loc_dir;
+};
+
+#define LLOG_PROC_BREAK 0x0001
+#define LLOG_DEL_RECORD 0x0002
+
+static inline int llog_obd2ops(struct llog_ctxt *ctxt,
+                              struct llog_operations **lop)
+{
+       if (ctxt == NULL)
+               return -ENOTCONN;
+
+       *lop = ctxt->loc_logops;
+       if (*lop == NULL)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+static inline int llog_handle2ops(struct llog_handle *loghandle,
+                                 struct llog_operations **lop)
+{
+       if (loghandle == NULL || loghandle->lgh_logops == NULL)
+               return -EINVAL;
+
+       *lop = loghandle->lgh_logops;
+       return 0;
+}
+
+static inline int llog_data_len(int len)
+{
+       return cfs_size_round(len);
+}
+
+static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt)
+{
+       atomic_inc(&ctxt->loc_refcount);
+       CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt,
+              atomic_read(&ctxt->loc_refcount));
+       return ctxt;
+}
+
+static inline void llog_ctxt_put(struct llog_ctxt *ctxt)
+{
+       if (ctxt == NULL)
+               return;
+       LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON);
+       CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt,
+              atomic_read(&ctxt->loc_refcount) - 1);
+       __llog_ctxt_put(NULL, ctxt);
+}
+
+static inline void llog_group_init(struct obd_llog_group *olg, int group)
+{
+       init_waitqueue_head(&olg->olg_waitq);
+       spin_lock_init(&olg->olg_lock);
+       mutex_init(&olg->olg_cat_processing);
+       olg->olg_seq = group;
+}
+
+static inline int llog_group_set_ctxt(struct obd_llog_group *olg,
+                                     struct llog_ctxt *ctxt, int index)
+{
+       LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+       spin_lock(&olg->olg_lock);
+       if (olg->olg_ctxts[index] != NULL) {
+               spin_unlock(&olg->olg_lock);
+               return -EEXIST;
+       }
+       olg->olg_ctxts[index] = ctxt;
+       spin_unlock(&olg->olg_lock);
+       return 0;
+}
+
+static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg,
+                                                   int index)
+{
+       struct llog_ctxt *ctxt;
+
+       LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+       spin_lock(&olg->olg_lock);
+       if (olg->olg_ctxts[index] == NULL)
+               ctxt = NULL;
+       else
+               ctxt = llog_ctxt_get(olg->olg_ctxts[index]);
+       spin_unlock(&olg->olg_lock);
+       return ctxt;
+}
+
+static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index)
+{
+       LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+       spin_lock(&olg->olg_lock);
+       olg->olg_ctxts[index] = NULL;
+       spin_unlock(&olg->olg_lock);
+}
+
+static inline struct llog_ctxt *llog_get_context(struct obd_device *obd,
+                                                int index)
+{
+       return llog_group_get_ctxt(&obd->obd_olg, index);
+}
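A sketch of the reference-counting discipline around contexts: llog_get_context() takes a reference via llog_ctxt_get(), and every successful lookup must be balanced by llog_ctxt_put(). LLOG_CONFIG_ORIG_CTXT is used here only as an example index.

/* Illustrative sketch only, not part of the patch. */
static void example_ctxt_usage(struct obd_device *obd)
{
        struct llog_ctxt *ctxt;

        ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); /* takes a ref */
        if (ctxt == NULL)
                return;

        /* ... issue llog calls through this context here ... */

        llog_ctxt_put(ctxt);    /* drop the reference taken above */
}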
+
+static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index)
+{
+       return (olg->olg_ctxts[index] == NULL);
+}
+
+static inline int llog_ctxt_null(struct obd_device *obd, int index)
+{
+       return (llog_group_ctxt_null(&obd->obd_olg, index));
+}
+
+static inline int llog_destroy(const struct lu_env *env,
+                              struct llog_handle *handle)
+{
+       struct llog_operations *lop;
+       int rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_destroy == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_destroy(env, handle);
+       RETURN(rc);
+}
+
+static inline int llog_next_block(const struct lu_env *env,
+                                 struct llog_handle *loghandle, int *cur_idx,
+                                 int next_idx, __u64 *cur_offset, void *buf,
+                                 int len)
+{
+       struct llog_operations *lop;
+       int rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_next_block == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx,
+                                cur_offset, buf, len);
+       RETURN(rc);
+}
+
+static inline int llog_prev_block(const struct lu_env *env,
+                                 struct llog_handle *loghandle,
+                                 int prev_idx, void *buf, int len)
+{
+       struct llog_operations *lop;
+       int rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_prev_block == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len);
+       RETURN(rc);
+}
+
+static inline int llog_connect(struct llog_ctxt *ctxt,
+                              struct llog_logid *logid, struct llog_gen *gen,
+                              struct obd_uuid *uuid)
+{
+       struct llog_operations  *lop;
+       int                      rc;
+
+       ENTRY;
+
+       rc = llog_obd2ops(ctxt, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_connect == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_connect(ctxt, logid, gen, uuid);
+       RETURN(rc);
+}
+
+/* llog.c */
+int llog_exist(struct llog_handle *loghandle);
+int llog_declare_create(const struct lu_env *env,
+                       struct llog_handle *loghandle, struct thandle *th);
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+               struct thandle *th);
+int llog_declare_write_rec(const struct lu_env *env,
+                          struct llog_handle *handle,
+                          struct llog_rec_hdr *rec, int idx,
+                          struct thandle *th);
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+                  struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+                  int numcookies, void *buf, int idx, struct thandle *th);
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+            struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+            void *buf, struct thandle *th);
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+                    struct llog_rec_hdr *rec, struct thandle *th);
+int lustre_process_log(struct super_block *sb, char *logname,
+                      struct config_llog_instance *cfg);
+int lustre_end_log(struct super_block *sb, char *logname,
+                  struct config_llog_instance *cfg);
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+                    struct llog_handle **res, struct llog_logid *logid,
+                    char *name);
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+              struct llog_logid *logid, char *name);
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+              struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+              int cookiecount, void *buf, int idx);
+
+/** @} log */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h
new file mode 100644 (file)
index 0000000..fb1561a
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mdc.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDC_H
+#define _LUSTRE_MDC_H
+
+/** \defgroup mdc mdc
+ *
+ * @{
+ */
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/posix_acl_xattr.h>
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct ptlrpc_client;
+struct obd_export;
+struct ptlrpc_request;
+struct obd_device;
+
+struct mdc_rpc_lock {
+       struct mutex            rpcl_mutex;
+       struct lookup_intent    *rpcl_it;
+       int                     rpcl_fakes;
+};
+
+#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)
+
+static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
+{
+       mutex_init(&lck->rpcl_mutex);
+       lck->rpcl_it = NULL;
+}
+
+static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
+                                   struct lookup_intent *it)
+{
+       ENTRY;
+
+       if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+               return;
+
+       /* This would normally block until the existing request finishes.
+        * If fail_loc is set it will block until the regular request is
+        * done, then set rpcl_it to MDC_FAKE_RPCL_IT.  Once that is set
+        * it will only be cleared when all fake requests are finished.
+        * Only when all fake requests are finished can normal requests
+        * be sent, to ensure they are recoverable again. */
+ again:
+       mutex_lock(&lck->rpcl_mutex);
+
+       if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) {
+               lck->rpcl_it = MDC_FAKE_RPCL_IT;
+               lck->rpcl_fakes++;
+               mutex_unlock(&lck->rpcl_mutex);
+               return;
+       }
+
+       /* This will only happen when the CFS_FAIL_CHECK() was
+        * just turned off but there are still requests in progress.
+        * Wait until they finish.  It doesn't need to be efficient
+        * in this extremely rare case, just have low overhead in
+        * the common case when it isn't true. */
+       while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) {
+               mutex_unlock(&lck->rpcl_mutex);
+               schedule_timeout(cfs_time_seconds(1) / 4);
+               goto again;
+       }
+
+       LASSERT(lck->rpcl_it == NULL);
+       lck->rpcl_it = it;
+}
+
+static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
+                                   struct lookup_intent *it)
+{
+       if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+               goto out;
+
+       if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */
+               mutex_lock(&lck->rpcl_mutex);
+
+               LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes);
+               lck->rpcl_fakes--;
+
+               if (lck->rpcl_fakes == 0)
+                       lck->rpcl_it = NULL;
+
+       } else {
+               LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it);
+               lck->rpcl_it = NULL;
+       }
+
+       mutex_unlock(&lck->rpcl_mutex);
+ out:
+       EXIT;
+}
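A hedged sketch of how a caller is expected to bracket a modifying metadata RPC with this lock; the "send the request" step is a placeholder, not a real mdc entry point.

/* Illustrative sketch only, not part of the patch. */
static inline int example_mdc_modify(struct mdc_rpc_lock *rpc_lock,
                                     struct lookup_intent *it)
{
        int rc = 0;

        mdc_get_rpc_lock(rpc_lock, it);  /* serialize with other modifying RPCs */
        /* ... build and send the ptlrpc request here ... */
        mdc_put_rpc_lock(rpc_lock, it);  /* let the next modifying RPC through */

        return rc;
}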
+
+static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
+                                              struct mdt_body *body)
+{
+       if (body->valid & OBD_MD_FLMODEASIZE) {
+               if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize)
+                       exp->exp_obd->u.cli.cl_max_mds_easize =
+                                               body->max_mdsize;
+               if (exp->exp_obd->u.cli.cl_max_mds_cookiesize <
+                                               body->max_cookiesize)
+                       exp->exp_obd->u.cli.cl_max_mds_cookiesize =
+                                               body->max_cookiesize;
+       }
+}
+
+
+struct mdc_cache_waiter {
+       struct list_head              mcw_entry;
+       wait_queue_head_t            mcw_waitq;
+};
+
+/* mdc/mdc_locks.c */
+int it_disposition(struct lookup_intent *it, int flag);
+void it_clear_disposition(struct lookup_intent *it, int flag);
+void it_set_disposition(struct lookup_intent *it, int flag);
+int it_open_error(int phase, struct lookup_intent *it);
+
+/** @} mdc */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h
new file mode 100644 (file)
index 0000000..b386f87
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mds.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDS_H
+#define _LUSTRE_MDS_H
+
+/** \defgroup mds mds
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct mds_group_info {
+       struct obd_uuid *uuid;
+       int group;
+};
+
+struct mds_capa_info {
+       struct obd_uuid *uuid;
+       struct lustre_capa_key *capa;
+};
+
+#define MDD_OBD_NAME     "mdd_obd"
+#define MDD_OBD_UUID     "mdd_obd_uuid"
+
+static inline int md_should_create(__u64 flags)
+{
+       return !(flags & MDS_OPEN_DELAY_CREATE ||
+              !(flags & FMODE_WRITE));
+}
+
+/* these are local flags, used only on the client, private */
+#define M_CHECK_STALE     0200000000
+
+/** @} mds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdt.h b/drivers/staging/lustre/lustre/include/lustre_mdt.h
new file mode 100644 (file)
index 0000000..dba26a6
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_MDT_H
+#define __LINUX_MDT_H
+
+/** \defgroup mdt mdt
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <md_object.h>
+#include <dt_object.h>
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Common thread info for mdt, seq and fld
+ */
+struct com_thread_info {
+       /*
+        * for req-layout interface.
+        */
+       struct req_capsule *cti_pill;
+};
+
+enum {
+       ESERIOUS = 0x0001000
+};
+
+static inline int err_serious(int rc)
+{
+       LASSERT(rc < 0);
+       LASSERT(-rc < ESERIOUS);
+       return -(-rc | ESERIOUS);
+}
+
+static inline int clear_serious(int rc)
+{
+       if (rc < 0)
+               rc = -(-rc & ~ESERIOUS);
+       return rc;
+}
+
+static inline int is_serious(int rc)
+{
+       return (rc < 0 && -rc & ESERIOUS);
+}
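A small worked example of the ESERIOUS encoding above: a handler tags -ENOMEM as serious, and an upper layer detects and strips the flag before returning the error to callers.

/* Illustrative sketch only, not part of the patch. */
static inline int example_serious_usage(void)
{
        int rc = err_serious(-ENOMEM);  /* becomes -(ENOMEM | ESERIOUS) */

        if (is_serious(rc))
                rc = clear_serious(rc); /* back to plain -ENOMEM */

        return rc;
}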
+
+/** @} mdt */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h
new file mode 100644 (file)
index 0000000..874412e
--- /dev/null
@@ -0,0 +1,3453 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup PtlRPC Portal RPC and networking module.
+ *
+ * PortalRPC is the layer used by the rest of the Lustre code to achieve
+ * network communications: establish connections with corresponding export
+ * and import states, listen for a service, send and receive RPCs.
+ * PortalRPC also includes the base recovery framework: packet resending and
+ * replaying, reconnections, and the pinger.
+ *
+ * PortalRPC utilizes LNet as its transport layer.
+ *
+ * @{
+ */
+
+
+#ifndef _LUSTRE_NET_H
+#define _LUSTRE_NET_H
+
+/** \defgroup net net
+ *
+ * @{
+ */
+
+#include <linux/lustre_net.h>
+
+#include <linux/libcfs/libcfs.h>
+// #include <obd.h>
+#include <linux/lnet/lnet.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ha.h>
+#include <lustre_sec.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lu_object.h>
+#include <lustre_req_layout.h>
+
+#include <obd_support.h>
+#include <lustre_ver.h>
+
+/* MD flags we _always_ use */
+#define PTLRPC_MD_OPTIONS  0
+
+/**
+ * Max # of bulk operations in one request.
+ * In order for the client and server to properly negotiate the maximum
+ * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two
+ * value.  The client is free to limit the actual RPC size for any bulk
+ * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */
+#define PTLRPC_BULK_OPS_BITS   2
+#define PTLRPC_BULK_OPS_COUNT  (1U << PTLRPC_BULK_OPS_BITS)
+/**
+ * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and
+ * should not be used on the server at all.  Otherwise, it imposes a
+ * protocol limitation on the maximum RPC size that can be used by any
+ * RPC sent to that server in the future.  Instead, the server should
+ * use the negotiated per-client ocd_brw_size to determine the bulk
+ * RPC count. */
+#define PTLRPC_BULK_OPS_MASK   (~((__u64)PTLRPC_BULK_OPS_COUNT - 1))
+
+/**
+ * Define maxima for bulk I/O.
+ *
+ * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT
+ * of LNET_MTU sized RDMA transfers.  Clients and servers negotiate the
+ * currently supported maximum between peers at connect via ocd_brw_size.
+ */
+#define PTLRPC_MAX_BRW_BITS    (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS)
+#define PTLRPC_MAX_BRW_SIZE    (1 << PTLRPC_MAX_BRW_BITS)
+#define PTLRPC_MAX_BRW_PAGES   (PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+#define ONE_MB_BRW_SIZE                (1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_SIZE                (1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_PAGES       (MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define DT_MAX_BRW_SIZE                PTLRPC_MAX_BRW_SIZE
+#define DT_MAX_BRW_PAGES       (DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define OFD_MAX_BRW_SIZE       (1 << LNET_MTU_BITS)
+
+/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
+# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0)
+#  error "PTLRPC_MAX_BRW_PAGES isn't a power of two"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE))
+#  error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT)
+#  error "PTLRPC_MAX_BRW_SIZE too big"
+# endif
+# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT)
+#  error "PTLRPC_MAX_BRW_PAGES too big"
+# endif
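For concreteness, assuming LNET_MTU_BITS = 20 (a 1 MB MTU) and 4 KB pages (PAGE_CACHE_SHIFT = 12), which are illustrative values only, the maxima above work out as follows.

/*
 * Worked example (illustrative values, not part of the patch):
 *   PTLRPC_MAX_BRW_BITS  = 20 + 2  = 22
 *   PTLRPC_MAX_BRW_SIZE  = 1 << 22 = 4 MB (= LNET_MTU * PTLRPC_BULK_OPS_COUNT)
 *   PTLRPC_MAX_BRW_PAGES = 4 MB >> 12 = 1024 pages
 */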
+
+#define PTLRPC_NTHRS_INIT      2
+
+/**
+ * Buffer Constants
+ *
+ * Constants determine how memory is used to buffer incoming service requests.
+ *
+ * ?_NBUFS           # buffers to allocate when growing the pool
+ * ?_BUFSIZE       # bytes in a single request buffer
+ * ?_MAXREQSIZE         # maximum request service will receive
+ *
+ * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk
+ * of ?_NBUFS is added to the pool.
+ *
+ * Messages larger than ?_MAXREQSIZE are dropped.  Request buffers are
+ * considered full when less than ?_MAXREQSIZE is left in them.
+ */
+/**
+ * Thread Constants
+ *
+ * Constants determine how threads are created for ptlrpc service.
+ *
+ * ?_NTHRS_INIT                # threads to create for each service partition at
+ *                       initialization time. If it is a non-affinity service
+ *                       and there is only one partition, it is the overall
+ *                       number of threads for the service while initializing.
+ * ?_NTHRS_BASE                minimum # threads that should be created for
+ *                       each ptlrpc partition to keep the service healthy.
+ *                       It is the low-water mark of the threads upper-limit
+ *                       for each partition.
+ * ?_THR_FACTOR         # threads that can be added to the threads upper-limit
+ *                       for each CPU core. This factor is only for reference;
+ *                       we might decrease its value if the number of cores
+ *                       per CPT is above a limit.
+ * ?_NTHRS_MAX         maximum # threads that can be created for a service.
+ *                       It is a soft limit, because if the service is running
+ *                       on a machine with hundreds of cores and tens of
+ *                       CPU partitions, we need to guarantee that each
+ *                       partition has ?_NTHRS_BASE threads, which means the
+ *                       total thread count will be ?_NTHRS_BASE * number_of_cpts,
+ *                       which can exceed ?_NTHRS_MAX.
+ *
+ * Examples
+ *
+ * #define MDS_NTHRS_INIT      2
+ * #define MDS_NTHRS_BASE      64
+ * #define MDS_NTHRS_FACTOR    8
+ * #define MDS_NTHRS_MAX       1024
+ *
+ * Example 1):
+ * ---------------------------------------------------------------------
+ * Server(A) has 16 cores, user configured it to 4 partitions so each
+ * partition has 4 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96
+ *
+ * Total number of threads for the service is:
+ *     96 * partitions(4) = 384
+ *
+ * Example 2):
+ * ---------------------------------------------------------------------
+ * Server(B) has 32 cores, user configured it to 4 partitions so each
+ * partition has 8 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128
+ *
+ * Total number of threads for the service is:
+ *     128 * partitions(4) = 512
+ *
+ * Example 3):
+ * ---------------------------------------------------------------------
+ * Server(B) has 96 cores, user configured it to 8 partitions so each
+ * partition has 12 cores, then actual number of service threads on each
+ * partition is:
+ *     MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160
+ *
+ * Total number of threads for the service is:
+ *     160 * partitions(8) = 1280
+ *
+ * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number
+ * as upper limit of threads number for each partition:
+ *     MDS_NTHRS_MAX(1024) / partitions(8) = 128
+ *
+ * Example 4):
+ * ---------------------------------------------------------------------
+ * Server(C) has a thousand cores and the user configured it to 32 partitions
+ *     MDS_NTHRS_BASE(64) * 32 = 2048
+ *
+ * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need
+ * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads
+ * to keep service healthy, so total number of threads will just be 2048.
+ *
+ * NB: we don't suggest choosing a server with that many cores, because the
+ *     backend filesystem itself, the buffer cache, or the underlying network
+ *     stack might have SMP scalability issues at that large a scale.
+ *
+ *     If the user already has a fat machine with hundreds or thousands of
+ *     cores, there are two configuration choices:
+ *     a) create a CPU table from a subset of all CPUs and run Lustre on
+ *     top of this subset
+ *     b) bind service threads to a few partitions, see the module parameters
+ *     of MDS and OSS for details
+ *
+ * NB: these calculations (and the examples below) are simplified to aid
+ *     understanding; the real implementation is a little more complex,
+ *     please see ptlrpc_server_nthreads_check() for details.
+ *
+ */
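The sizing rule described by the examples above can be summarized in the sketch below; the function and parameter names are illustrative only, and the real logic lives in ptlrpc_server_nthreads_check().

/* Illustrative sketch only, not part of the patch. */
static inline int example_nthrs_per_partition(int nthrs_base, int thr_factor,
                                              int nthrs_max, int ncpts,
                                              int cores_per_cpt)
{
        int nthrs = nthrs_base + cores_per_cpt * thr_factor;

        /* soft limit: keep the service total within nthrs_max ... */
        if (nthrs * ncpts > nthrs_max)
                nthrs = nthrs_max / ncpts;
        /* ... but never drop below nthrs_base per partition */
        if (nthrs < nthrs_base)
                nthrs = nthrs_base;

        return nthrs;
}

With the MDS values above this reproduces the quoted numbers: Example 3 gives 1024 / 8 = 128 threads per partition, and Example 4 clamps back up to the 64-thread base, for 2048 threads total.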
+
+ /*
+  * LDLM threads constants:
+  *
+  * Given 8 as factor and 24 as base threads number
+  *
+  * example 1)
+  * On 4-core machine we will have 24 + 8 * 4 = 56 threads.
+  *
+  * example 2)
+  * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56
+  * threads for each partition and total threads number will be 112.
+  *
+  * example 3)
+  * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24)
+  * threads for each partition to keep service healthy, so total threads
+  * number should be 24 * 8 = 192.
+  *
+  * So with these constants, the number of threads will be at a similar level
+  * to old versions, unless the target machine has over a hundred cores
+  */
+#define LDLM_THR_FACTOR                8
+#define LDLM_NTHRS_INIT                PTLRPC_NTHRS_INIT
+#define LDLM_NTHRS_BASE                24
+#define LDLM_NTHRS_MAX         (num_online_cpus() == 1 ? 64 : 128)
+
+#define LDLM_BL_THREADS   LDLM_NTHRS_AUTO_INIT
+#define LDLM_CLIENT_NBUFS 1
+#define LDLM_SERVER_NBUFS 64
+#define LDLM_BUFSIZE      (8 * 1024)
+#define LDLM_MAXREQSIZE   (5 * 1024)
+#define LDLM_MAXREPSIZE   (1024)
+
+ /*
+  * MDS threads constants:
+  *
+  * Please see the examples in "Thread Constants"; the MDS thread count will be
+  * at a comparable level to old versions, unless the server has many cores.
+  */
+#ifndef MDS_MAX_THREADS
+#define MDS_MAX_THREADS                1024
+#define MDS_MAX_OTHR_THREADS   256
+
+#else /* MDS_MAX_THREADS */
+#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT
+#undef MDS_MAX_THREADS
+#define MDS_MAX_THREADS        PTLRPC_NTHRS_INIT
+#endif
+#define MDS_MAX_OTHR_THREADS   max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2)
+#endif
+
+/* default service */
+#define MDS_THR_FACTOR         8
+#define MDS_NTHRS_INIT         PTLRPC_NTHRS_INIT
+#define MDS_NTHRS_MAX          MDS_MAX_THREADS
+#define MDS_NTHRS_BASE         min(64, MDS_NTHRS_MAX)
+
+/* read-page service */
+#define MDS_RDPG_THR_FACTOR    4
+#define MDS_RDPG_NTHRS_INIT    PTLRPC_NTHRS_INIT
+#define MDS_RDPG_NTHRS_MAX     MDS_MAX_OTHR_THREADS
+#define MDS_RDPG_NTHRS_BASE    min(48, MDS_RDPG_NTHRS_MAX)
+
+/* these should be removed when we remove setattr service in the future */
+#define MDS_SETA_THR_FACTOR    4
+#define MDS_SETA_NTHRS_INIT    PTLRPC_NTHRS_INIT
+#define MDS_SETA_NTHRS_MAX     MDS_MAX_OTHR_THREADS
+#define MDS_SETA_NTHRS_BASE    min(48, MDS_SETA_NTHRS_MAX)
+
+/* non-affinity threads */
+#define MDS_OTHR_NTHRS_INIT    PTLRPC_NTHRS_INIT
+#define MDS_OTHR_NTHRS_MAX     MDS_MAX_OTHR_THREADS
+
+#define MDS_NBUFS              64
+
+/**
+ * Assume file name length = FNAME_MAX = 256 (true for ext3).
+ *       path name length = PATH_MAX = 4096
+ *       LOV MD size max  = EA_MAX = 24 * 2000
+ *             (NB: 24 is size of lov_ost_data)
+ *       LOV LOGCOOKIE size max = 32 * 2000
+ *             (NB: 32 is size of llog_cookie)
+ * symlink:  FNAME_MAX + PATH_MAX  <- largest
+ * link:     FNAME_MAX + PATH_MAX  (mds_rec_link < mds_rec_create)
+ * rename:   FNAME_MAX + FNAME_MAX
+ * open:     FNAME_MAX + EA_MAX
+ *
+ * MDS_MAXREQSIZE ~= 4736 bytes =
+ * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX
+ * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
+ *
+ * Realistic size is about 512 bytes (20 character name + 128 char symlink),
+ * except in the open case where there are a large number of OSTs in a LOV.
+ */
+#define MDS_MAXREQSIZE         (5 * 1024)      /* >= 4736 */
+#define MDS_MAXREPSIZE         (9 * 1024)      /* >= 8300 */
+
+/**
+ * MDS incoming request with LOV EA
+ * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate
+ */
+#define MDS_LOV_MAXREQSIZE     max(MDS_MAXREQSIZE, \
+                                   362 + LOV_MAX_STRIPE_COUNT * 24)
+/**
+ * MDS outgoing reply with LOV EA
+ *
+ * NB: the max reply size a Lustre 2.4+ client can get from an old MDS is:
+ * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes
+ *
+ * but a 2.4 or later MDS will never send a reply with llog_cookie to any
+ * client version. This macro is defined for the server-side reply buffer size.
+ */
+#define MDS_LOV_MAXREPSIZE     MDS_LOV_MAXREQSIZE
+
+/**
+ * This is the size of a maximum REINT_SETXATTR request:
+ *
+ *   lustre_msg                 56 (32 + 4 x 5 + 4)
+ *   ptlrpc_body       184
+ *   mdt_rec_setxattr  136
+ *   lustre_capa       120
+ *   name              256 (XATTR_NAME_MAX)
+ *   value           65536 (XATTR_SIZE_MAX)
+ */
+#define MDS_EA_MAXREQSIZE      66288
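The value above is simply the sum of the sizes listed in the comment.

/*
 * Worked sum (from the table above):
 *   56 + 184 + 136 + 120 + 256 + 65536 = 66288
 */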
+
+/**
+ * These are the maximum request and reply sizes (rounded up to 1 KB
+ * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL.
+ */
+#define MDS_REG_MAXREQSIZE     (((max(MDS_EA_MAXREQSIZE, \
+                                      MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10)
+#define MDS_REG_MAXREPSIZE     MDS_REG_MAXREQSIZE
+
+/**
+ * The update request includes all of the updates from the create, which
+ * might include linkea (4K maximum); together with other updates, we set
+ * it to 9K: lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K)
+ */
+#define MDS_OUT_MAXREQSIZE     (9 * 1024)
+#define MDS_OUT_MAXREPSIZE     MDS_MAXREPSIZE
+
+/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
+#define MDS_BUFSIZE            max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+                                   8 * 1024)
+
+/**
+ * MDS_REG_BUFSIZE should be at least MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD.
+ * However, we need to allocate a much larger buffer for it because LNet
+ * requires each MD (rqbd) to have at least MDS_REG_MAXREQSIZE bytes left to
+ * avoid dropping a maximum-sized incoming request.  So if MDS_REG_BUFSIZE is
+ * only a little larger than MDS_REG_MAXREQSIZE, then it can only fit one
+ * request even when there are about MDS_REG_MAXREQSIZE bytes left in an rqbd,
+ * and memory utilization is very low.
+ *
+ * Meanwhile, the size of an rqbd can't be too large, because an rqbd can't be
+ * reused until all requests that fit in it have been processed and released,
+ * which means one long-blocked request can prevent the rqbd from being
+ * reused.  Now we set the request buffer size to 160 KB, so even if each rqbd
+ * is unlinked from LNet with 65 KB unused, buffer utilization will be about
+ * 59%.  Please check LU-2432 for details.
+ */
+#define MDS_REG_BUFSIZE                max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+                                   160 * 1024)
+
+/**
+ * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K), which is
+ * about 10K.  For the same reason as MDS_REG_BUFSIZE, we also give some
+ * extra bytes to each request buffer to improve the buffer utilization rate.
+ */
+#define MDS_OUT_BUFSIZE                max(MDS_OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+                                   24 * 1024)
+
+/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */
+#define FLD_MAXREQSIZE  (160)
+
+/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */
+#define FLD_MAXREPSIZE  (152)
+#define FLD_BUFSIZE    (1 << 12)
+
+/**
+ * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range +
+ * __u32 padding */
+#define SEQ_MAXREQSIZE  (160)
+
+/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */
+#define SEQ_MAXREPSIZE  (152)
+#define SEQ_BUFSIZE    (1 << 12)
+
+/** MGS threads must be >= 3, see bug 22458 comment #28 */
+#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1)
+#define MGS_NTHRS_MAX  32
+
+#define MGS_NBUFS       64
+#define MGS_BUFSIZE     (8 * 1024)
+#define MGS_MAXREQSIZE  (7 * 1024)
+#define MGS_MAXREPSIZE  (9 * 1024)
+
+ /*
+  * OSS threads constants:
+  *
+  * Given 8 as factor and 64 as base threads number
+  *
+  * example 1):
+  * On 8-core server configured to 2 partitions, we will have
+  * 64 + 8 * 4 = 96 threads for each partition, 192 total threads.
+  *
+  * example 2):
+  * On 32-core machine configured to 4 partitions, we will have
+  * 64 + 8 * 8 = 112 threads for each partition, so total threads number
+  * will be 112 * 4 = 448.
+  *
+  * example 3):
+  * On 64-core machine configured to 4 partitions, we will have
+  * 64 + 16 * 8 = 192 threads for each partition, so total threads number
+  * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we
+  * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads
+  * for each partition.
+  *
+  * So we can see that with these constants, the number of threads will be at
+  * a similar level to old versions, unless the server has many cores.
+  */
+ /* reduce the thread factor for VMs with a small memory size */
+#define OSS_THR_FACTOR         min_t(int, 8, \
+                               NUM_CACHEPAGES >> (28 - PAGE_CACHE_SHIFT))
+#define OSS_NTHRS_INIT         (PTLRPC_NTHRS_INIT + 1)
+#define OSS_NTHRS_BASE         64
+#define OSS_NTHRS_MAX          512
+
+/* threads for handling "create" request */
+#define OSS_CR_THR_FACTOR      1
+#define OSS_CR_NTHRS_INIT      PTLRPC_NTHRS_INIT
+#define OSS_CR_NTHRS_BASE      8
+#define OSS_CR_NTHRS_MAX       64
+
+/**
+ * OST_IO_MAXREQSIZE ~=
+ *     lustre_msg + ptlrpc_body + obdo + obd_ioobj +
+ *     DT_MAX_BRW_PAGES * niobuf_remote
+ *
+ * - single object with 16 pages is 512 bytes
+ * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover
+ * - Must be a multiple of 1024
+ * - actual size is about 18K
+ */
+#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \
+                            sizeof(struct ptlrpc_body) + \
+                            sizeof(struct obdo) + \
+                            sizeof(struct obd_ioobj) + \
+                            sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES)
+/**
+ * FIEMAP request can be 4K+ for now
+ */
+#define OST_MAXREQSIZE         (5 * 1024)
+#define OST_IO_MAXREQSIZE      max_t(int, OST_MAXREQSIZE, \
+                               (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1))
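The (((x - 1) | (1024 - 1)) + 1) expression rounds x up to the next multiple of 1024; the helper below is only a standalone illustration of the idiom, and its name is made up.

/* Illustrative sketch only, not part of the patch. */
static inline unsigned int example_round_up_1k(unsigned int x)
{
        /* e.g. x = 17000: ((16999 | 1023) + 1) = 17408 = 17 * 1024 */
        return ((x - 1) | (1024 - 1)) + 1;
}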
+
+#define OST_MAXREPSIZE         (9 * 1024)
+#define OST_IO_MAXREPSIZE      OST_MAXREPSIZE
+
+#define OST_NBUFS              64
+/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */
+#define OST_BUFSIZE            max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024)
+/**
+ * OST_IO_MAXREQSIZE is about 18K; giving an extra 46K can increase the buffer
+ * utilization rate of the request buffer.  Please check the comment for
+ * MDS_LOV_BUFSIZE for details.
+ */
+#define OST_IO_BUFSIZE         max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024)
+
+/* Macro to hide a typecast. */
+#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args)
+
+/**
+ * Structure to define a single portal connection.
+ */
+struct ptlrpc_connection {
+       /** linkage for connections hash table */
+       struct hlist_node       c_hash;
+       /** Our own lnet nid for this connection */
+       lnet_nid_t            c_self;
+       /** Remote side nid for this connection */
+       lnet_process_id_t       c_peer;
+       /** UUID of the other side */
+       struct obd_uuid  c_remote_uuid;
+       /** reference counter for this connection */
+       atomic_t            c_refcount;
+};
+
+/** Client definition for PortalRPC */
+struct ptlrpc_client {
+       /** What lnet portal does this client send messages to by default */
+       __u32              cli_request_portal;
+       /** What portal do we expect replies on */
+       __u32              cli_reply_portal;
+       /** Name of the client */
+       char               *cli_name;
+};
+
+/** state flags of requests */
+/* XXX only ones left are those used by the bulk descs as well! */
+#define PTL_RPC_FL_INTR      (1 << 0)  /* reply wait was interrupted by user */
+#define PTL_RPC_FL_TIMEOUT   (1 << 7)  /* request timed out waiting for reply */
+
+#define REQ_MAX_ACK_LOCKS 8
+
+union ptlrpc_async_args {
+       /**
+        * Scratchpad for passing args to completion interpreter. Users
+        * cast to the struct of their choosing, and CLASSERT that this is
+        * big enough.  For _tons_ of context, OBD_ALLOC a struct and store
+        * a pointer to it here.  The pointer_arg ensures this struct is at
+        * least big enough for that.
+        */
+       void      *pointer_arg[11];
+       __u64      space[7];
+};
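The pattern the comment above describes, sketched with made-up struct and field names: callers define a per-request argument struct, assert at compile time that it fits in the union, and reach it through ptlrpc_req_async_args().

/* Illustrative sketch only, not part of the patch. */
struct example_async_args {
        struct obd_export       *aa_exp;
        int                      aa_flags;
};

static inline void example_store_args(struct ptlrpc_request *req,
                                      struct obd_export *exp)
{
        struct example_async_args *aa;

        CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
        aa = ptlrpc_req_async_args(req);        /* points into rq_async_args */
        aa->aa_exp = exp;
        aa->aa_flags = 0;
}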
+
+struct ptlrpc_request_set;
+typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int);
+typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *);
+
+/**
+ * Definition of the request set structure.
+ * A request set is a list of requests (not necessarily to the same target)
+ * that, once populated with RPCs, can be sent in parallel.
+ * There are two kinds of request sets: general purpose, and with a dedicated
+ * serving thread. An example of the latter is the ptlrpcd set.
+ * For general purpose sets, once the set has started sending it is impossible
+ * to add new requests to it.
+ * Provides a way to call "completion callbacks" when all requests in the set
+ * have returned.
+ */
+struct ptlrpc_request_set {
+       atomic_t          set_refcount;
+       /** number of in queue requests */
+       atomic_t          set_new_count;
+       /** number of uncompleted requests */
+       atomic_t          set_remaining;
+       /** wait queue to wait on for request events */
+       wait_queue_head_t          set_waitq;
+       wait_queue_head_t         *set_wakeup_ptr;
+       /** List of requests in the set */
+       struct list_head            set_requests;
+       /**
+        * List of completion callbacks to be called when the set is completed
+        * This is only used if \a set_interpret is NULL.
+        * Links struct ptlrpc_set_cbdata.
+        */
+       struct list_head            set_cblist;
+       /** Completion callback, if only one. */
+       set_interpreter_func  set_interpret;
+       /** opaque argument passed to the completion \a set_interpret callback. */
+       void             *set_arg;
+       /**
+        * Lock for \a set_new_requests manipulations;
+        * locked so that any caller can communicate requests to
+        * the set holder, who can then fold them into the lock-free set.
+        */
+       spinlock_t              set_new_req_lock;
+       /** List of new yet unsent requests. Only used with ptlrpcd now. */
+       struct list_head            set_new_requests;
+
+       /** rq_status of requests that have been freed already */
+       int                set_rc;
+       /** Additional fields used by the flow control extension */
+       /** Maximum number of RPCs in flight */
+       int                set_max_inflight;
+       /** Callback function used to generate RPCs */
+       set_producer_func     set_producer;
+       /** opaque argument passed to the producer callback */
+       void             *set_producer_arg;
+};
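A hedged sketch of the general-purpose flow: populate a set with already-prepared requests, send them all in parallel and wait, then destroy the set. The ptlrpc_prep_set()/ptlrpc_set_add_req()/ptlrpc_set_wait()/ptlrpc_set_destroy() names reflect my reading of the ptlrpc API at this point; treat them as assumptions.

/* Illustrative sketch only, not part of the patch. */
static int example_send_batch(struct ptlrpc_request **reqs, int count)
{
        struct ptlrpc_request_set *set;
        int i, rc;

        set = ptlrpc_prep_set();                /* allocate an empty set */
        if (set == NULL)
                return -ENOMEM;

        for (i = 0; i < count; i++)
                ptlrpc_set_add_req(set, reqs[i]);  /* queue, nothing sent yet */

        rc = ptlrpc_set_wait(set);      /* send everything, wait for completion */
        ptlrpc_set_destroy(set);
        return rc;
}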
+
+/**
+ * Description of a single ptrlrpc_set callback
+ */
+struct ptlrpc_set_cbdata {
+       /** List linkage item */
+       struct list_head              psc_item;
+       /** Pointer to interpreting function */
+       set_interpreter_func    psc_interpret;
+       /** Opaque argument to pass to the callback */
+       void               *psc_data;
+};
+
+struct ptlrpc_bulk_desc;
+struct ptlrpc_service_part;
+struct ptlrpc_service;
+
+/**
+ * ptlrpc callback & work item stuff
+ */
+struct ptlrpc_cb_id {
+       void   (*cbid_fn)(lnet_event_t *ev);     /* specific callback fn */
+       void    *cbid_arg;                    /* additional arg */
+};
+
+/** Maximum number of locks to fit into reply state */
+#define RS_MAX_LOCKS 8
+#define RS_DEBUG     0
+
+/**
+ * Structure to define reply state on the server
+ * Structure to define the reply state on the server.
+ * The reply state holds various reply message information. Also, for
+ * "difficult" replies (the rep-ack case) we store the state after sending the
+ * reply and wait for the client to acknowledge the reception. In these cases
+ * locks could be added to the state for replay/failover consistency guarantees.
+struct ptlrpc_reply_state {
+       /** Callback description */
+       struct ptlrpc_cb_id    rs_cb_id;
+       /** Linkage for list of all reply states in a system */
+       struct list_head             rs_list;
+       /** Linkage for list of all reply states on same export */
+       struct list_head             rs_exp_list;
+       /** Linkage for list of all reply states for same obd */
+       struct list_head             rs_obd_list;
+#if RS_DEBUG
+       struct list_head             rs_debug_list;
+#endif
+       /** A spinlock to protect the reply state flags */
+       spinlock_t              rs_lock;
+       /** Reply state flags */
+       unsigned long     rs_difficult:1;     /* ACK/commit stuff */
+       unsigned long     rs_no_ack:1;    /* no ACK, even for
+                                                 difficult requests */
+       unsigned long     rs_scheduled:1;     /* being handled? */
+       unsigned long     rs_scheduled_ever:1;/* any schedule attempts? */
+       unsigned long     rs_handled:1;  /* been handled yet? */
+       unsigned long     rs_on_net:1;   /* reply_out_callback pending? */
+       unsigned long     rs_prealloc:1; /* rs from prealloc list */
+       unsigned long     rs_committed:1;/* the transaction was committed
+                                                and the rs was dispatched
+                                                by ptlrpc_commit_replies */
+       /** Size of the state */
+       int                 rs_size;
+       /** opcode */
+       __u32             rs_opc;
+       /** Transaction number */
+       __u64             rs_transno;
+       /** xid */
+       __u64             rs_xid;
+       struct obd_export     *rs_export;
+       struct ptlrpc_service_part *rs_svcpt;
+       /** Lnet metadata handle for the reply */
+       lnet_handle_md_t       rs_md_h;
+       atomic_t           rs_refcount;
+
+       /** Context for the service thread */
+       struct ptlrpc_svc_ctx *rs_svc_ctx;
+       /** Reply buffer (actually sent to the client), encoded if needed */
+       struct lustre_msg     *rs_repbuf;       /* wrapper */
+       /** Size of the reply buffer */
+       int                 rs_repbuf_len;   /* wrapper buf length */
+       /** Size of the reply message */
+       int                 rs_repdata_len;  /* wrapper msg length */
+       /**
+        * Actual reply message. Its content is encrypted (if needed) to
+        * produce the reply buffer for actual sending. In the simple case
+        * of no network encryption we just set \a rs_repbuf to \a rs_msg.
+        */
+       struct lustre_msg     *rs_msg;    /* reply message */
+
+       /** Number of locks awaiting client ACK */
+       int                 rs_nlocks;
+       /** Handles of locks awaiting client reply ACK */
+       struct lustre_handle   rs_locks[RS_MAX_LOCKS];
+       /** Lock modes of locks in \a rs_locks */
+       ldlm_mode_t         rs_modes[RS_MAX_LOCKS];
+};
+
+struct ptlrpc_thread;
+
+/** RPC stages */
+enum rq_phase {
+       RQ_PHASE_NEW        = 0xebc0de00,
+       RQ_PHASE_RPC        = 0xebc0de01,
+       RQ_PHASE_BULK      = 0xebc0de02,
+       RQ_PHASE_INTERPRET      = 0xebc0de03,
+       RQ_PHASE_COMPLETE       = 0xebc0de04,
+       RQ_PHASE_UNREGISTERING  = 0xebc0de05,
+       RQ_PHASE_UNDEFINED      = 0xebc0de06
+};
+
+/** Type of request interpreter call-back */
+typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
+                                   struct ptlrpc_request *req,
+                                   void *arg, int rc);
+
+/**
+ * Definition of request pool structure.
+ * The pool is used to store empty preallocated requests for the case
+ * when we would actually need to send something without performing
+ * any allocations (to avoid e.g. OOM).
+ */
+struct ptlrpc_request_pool {
+       /** Locks the list */
+       spinlock_t prp_lock;
+       /** list of ptlrpc_request structs */
+       struct list_head prp_req_list;
+       /** Maximum message size that would fit into a request from this pool */
+       int prp_rq_size;
+       /** Function to allocate more requests for this pool */
+       void (*prp_populate)(struct ptlrpc_request_pool *, int);
+};
+
+struct lu_context;
+struct lu_env;
+
+struct ldlm_lock;
+
+/**
+ * \defgroup nrs Network Request Scheduler
+ * @{
+ */
+struct ptlrpc_nrs_policy;
+struct ptlrpc_nrs_resource;
+struct ptlrpc_nrs_request;
+
+/**
+ * NRS control operations.
+ *
+ * These are common for all policies.
+ */
+enum ptlrpc_nrs_ctl {
+       /**
+        * Not a valid opcode.
+        */
+       PTLRPC_NRS_CTL_INVALID,
+       /**
+        * Activate the policy.
+        */
+       PTLRPC_NRS_CTL_START,
+       /**
+        * Reserved for multiple primary policies, which may be a possibility
+        * in the future.
+        */
+       PTLRPC_NRS_CTL_STOP,
+       /**
+        * Policies can start using opcodes from this value and onwards for
+        * their own purposes; the assigned value itself is arbitrary.
+        */
+       PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20,
+};
+
+/**
+ * ORR policy operations
+ */
+enum nrs_ctl_orr {
+       NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+       NRS_CTL_ORR_WR_QUANTUM,
+       NRS_CTL_ORR_RD_OFF_TYPE,
+       NRS_CTL_ORR_WR_OFF_TYPE,
+       NRS_CTL_ORR_RD_SUPP_REQ,
+       NRS_CTL_ORR_WR_SUPP_REQ,
+};
+
+/**
+ * NRS policy operations.
+ *
+ * These determine the behaviour of a policy, and are called in response to
+ * NRS core events.
+ */
+struct ptlrpc_nrs_pol_ops {
+       /**
+        * Called during policy registration; this operation is optional.
+        *
+        * \param[in,out] policy The policy being initialized
+        */
+       int     (*op_policy_init) (struct ptlrpc_nrs_policy *policy);
+       /**
+        * Called during policy unregistration; this operation is optional.
+        *
+        * \param[in,out] policy The policy being unregistered/finalized
+        */
+       void    (*op_policy_fini) (struct ptlrpc_nrs_policy *policy);
+       /**
+        * Called when activating a policy via lprocfs; policies allocate and
+        * initialize their resources here; this operation is optional.
+        *
+        * \param[in,out] policy The policy being started
+        *
+        * \see nrs_policy_start_locked()
+        */
+       int     (*op_policy_start) (struct ptlrpc_nrs_policy *policy);
+       /**
+        * Called when deactivating a policy via lprocfs; policies deallocate
+        * their resources here; this operation is optional
+        *
+        * \param[in,out] policy The policy being stopped
+        *
+        * \see nrs_policy_stop0()
+        */
+       void    (*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
+       /**
+        * Used for policy-specific operations; i.e. not generic ones like
+        * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
+        * to an ioctl; this operation is optional.
+        *
+        * \param[in,out]        policy The policy carrying out operation \a opc
+        * \param[in]     opc    The command operation being carried out
+        * \param[in,out] arg    A generic buffer for communication between the
+        *                       user and the control operation
+        *
+        * \retval -ve error
+        * \retval   0 success
+        *
+        * \see ptlrpc_nrs_policy_control()
+        */
+       int     (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
+                                 enum ptlrpc_nrs_ctl opc, void *arg);
+
+       /**
+        * Called when obtaining references to the resources of the resource
+        * hierarchy for a request that has arrived for handling at the PTLRPC
+        * service. Policies should return -ve for requests they do not wish
+        * to handle. This operation is mandatory.
+        *
+        * \param[in,out] policy  The policy we're getting resources for.
+        * \param[in,out] nrq     The request we are getting resources for.
+        * \param[in]     parent  The parent resource of the resource being
+        *                        requested; set to NULL if none.
+        * \param[out]    resp    The resource is to be returned here; the
+        *                        fallback policy in an NRS head should
+        *                        \e always return a non-NULL pointer value.
+        * \param[in]  moving_req When set, signifies that this is an attempt
+        *                        to obtain resources for a request being moved
+        *                        to the high-priority NRS head by
+        *                        ldlm_lock_reorder_req().
+        *                        This implies two things:
+        *                        1. We are under obd_export::exp_rpc_lock and
+        *                        so should not sleep.
+        *                        2. We should not perform non-idempotent
+        *                        operations, and may skip idempotent
+        *                        operations that were already carried out when
+        *                        resources were first taken for the request,
+        *                        when it was initialized in
+        *                        ptlrpc_nrs_req_initialize().
+        *
+        * \retval 0, +ve The level of the returned resource in the resource
+        *                hierarchy; currently only 0 (for a non-leaf resource)
+        *                and 1 (for a leaf resource) are supported by the
+        *                framework.
+        * \retval -ve    error
+        *
+        * \see ptlrpc_nrs_req_initialize()
+        * \see ptlrpc_nrs_hpreq_add_nolock()
+        * \see ptlrpc_nrs_req_hp_move()
+        */
+       int     (*op_res_get) (struct ptlrpc_nrs_policy *policy,
+                              struct ptlrpc_nrs_request *nrq,
+                              const struct ptlrpc_nrs_resource *parent,
+                              struct ptlrpc_nrs_resource **resp,
+                              bool moving_req);
+       /**
+        * Called when releasing references taken for resources in the resource
+        * hierarchy for the request; this operation is optional.
+        *
+        * \param[in,out] policy The policy the resource belongs to
+        * \param[in] res        The resource to be freed
+        *
+        * \see ptlrpc_nrs_req_finalize()
+        * \see ptlrpc_nrs_hpreq_add_nolock()
+        * \see ptlrpc_nrs_req_hp_move()
+        */
+       void    (*op_res_put) (struct ptlrpc_nrs_policy *policy,
+                              const struct ptlrpc_nrs_resource *res);
+
+       /**
+        * Obtains a request for handling from the policy, and optionally
+        * removes the request from the policy; this operation is mandatory.
+        *
+        * \param[in,out] policy The policy to poll
+        * \param[in]     peek   When set, signifies that we just want to
+        *                       examine the request, and not handle it, so the
+        *                       request is not removed from the policy.
+        * \param[in]     force  When set, it will force a policy to return a
+        *                       request if it has one queued.
+        *
+        * \retval NULL No request available for handling
+        * \retval valid-pointer The request polled for handling
+        *
+        * \see ptlrpc_nrs_req_get_nolock()
+        */
+       struct ptlrpc_nrs_request *
+               (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
+                              bool force);
+       /**
+        * Called when attempting to add a request to a policy for later
+        * handling; this operation is mandatory.
+        *
+        * \param[in,out] policy  The policy on which to enqueue \a nrq
+        * \param[in,out] nrq The request to enqueue
+        *
+        * \retval 0    success
+        * \retval != 0 error
+        *
+        * \see ptlrpc_nrs_req_add_nolock()
+        */
+       int     (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
+                                  struct ptlrpc_nrs_request *nrq);
+       /**
+        * Removes a request from the policy's set of pending requests. Normally
+        * called after a request has been polled successfully from the policy
+        * for handling; this operation is mandatory.
+        *
+        * \param[in,out] policy The policy the request \a nrq belongs to
+        * \param[in,out] nrq    The request to dequeue
+        *
+        * \see ptlrpc_nrs_req_del_nolock()
+        */
+       void    (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
+                                  struct ptlrpc_nrs_request *nrq);
+       /**
+        * Called after the request has been handled. Could be used for
+        * job/resource control; this operation is optional.
+        *
+        * \param[in,out] policy The policy that has finished handling request
+        *                       \a nrq
+        * \param[in,out] nrq    The request
+        *
+        * \pre spin_is_locked(&svcpt->scp_req_lock)
+        *
+        * \see ptlrpc_nrs_req_stop_nolock()
+        */
+       void    (*op_req_stop) (struct ptlrpc_nrs_policy *policy,
+                               struct ptlrpc_nrs_request *nrq);
+       /**
+        * Registers the policy's lprocfs interface with a PTLRPC service.
+        *
+        * \param[in] svc The service
+        *
+        * \retval 0    success
+        * \retval != 0 error
+        */
+       int     (*op_lprocfs_init) (struct ptlrpc_service *svc);
+       /**
+        * Unregisters the policy's lprocfs interface from a PTLRPC service.
+        *
+        * In cases of failed policy registration in
+        * \e ptlrpc_nrs_policy_register(), this function may be called for a
+        * service which has not registered the policy successfully, so
+        * implementations of this method should make sure their operations are
+        * safe in such cases.
+        *
+        * \param[in] svc The service
+        */
+       void    (*op_lprocfs_fini) (struct ptlrpc_service *svc);
+};
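+
+/*
+ * For illustration only: a minimal policy might wire up just the mandatory
+ * handlers described above; the nrs_example_* names are hypothetical.
+ *
+ *     static const struct ptlrpc_nrs_pol_ops nrs_example_ops = {
+ *             .op_res_get     = nrs_example_res_get,
+ *             .op_req_get     = nrs_example_req_get,
+ *             .op_req_enqueue = nrs_example_req_enqueue,
+ *             .op_req_dequeue = nrs_example_req_dequeue,
+ *     };
+ */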
+
+/**
+ * Policy flags
+ */
+enum nrs_policy_flags {
+       /**
+        * Fallback policy; use this flag on only a single supported policy per
+        * service. The flag cannot be used on policies that use
+        * \e PTLRPC_NRS_FL_REG_EXTERN.
+        */
+       PTLRPC_NRS_FL_FALLBACK          = (1 << 0),
+       /**
+        * Start policy immediately after registering.
+        */
+       PTLRPC_NRS_FL_REG_START         = (1 << 1),
+       /**
+        * This is a policy registering from a module different to the one NRS
+        * core ships in (currently ptlrpc).
+        */
+       PTLRPC_NRS_FL_REG_EXTERN        = (1 << 2),
+};
+
+/**
+ * NRS queue type.
+ *
+ * Denotes whether an NRS instance is for handling normal or high-priority
+ * RPCs, or whether an operation pertains to one or both of the NRS instances
+ * in a service.
+ */
+enum ptlrpc_nrs_queue_type {
+       PTLRPC_NRS_QUEUE_REG    = (1 << 0),
+       PTLRPC_NRS_QUEUE_HP     = (1 << 1),
+       PTLRPC_NRS_QUEUE_BOTH   = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
+};
+
+/**
+ * NRS head
+ *
+ * A PTLRPC service has at least one NRS head instance for handling normal
+ * priority RPCs, and may optionally have a second NRS head instance for
+ * handling high-priority RPCs. Each NRS head maintains a list of available
+ * policies, of which one and only one policy is acting as the fallback policy,
+ * and optionally a different policy may be acting as the primary policy. For
+ * all RPCs handled by this NRS head instance, NRS core will first attempt to
+ * enqueue the RPC using the primary policy (if any). The fallback policy is
+ * used in the following cases:
+ * - when there was no primary policy in the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
+ *   was initialized.
+ * - when the primary policy that was in the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the RPC was
+ *   initialized indicated that it did not wish, or for some other reason was
+ *   unable, to handle the request, by returning a non-valid NRS resource
+ *   reference.
+ * - when the primary policy that was in the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the RPC was
+ *   initialized fails later during the request enqueueing stage.
+ *
+ * \see nrs_resource_get_safe()
+ * \see nrs_request_enqueue()
+ */
+struct ptlrpc_nrs {
+       spinlock_t                      nrs_lock;
+       /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
+       /**
+        * List of registered policies
+        */
+       struct list_head                        nrs_policy_list;
+       /**
+        * List of policies with queued requests. Policies that have any
+        * outstanding requests are queued here, and this list is queried
+        * in a round-robin manner from NRS core when obtaining a request
+        * for handling. This ensures that requests from policies that at some
+        * point transition away from the
+        * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
+        */
+       struct list_head                        nrs_policy_queued;
+       /**
+        * Service partition for this NRS head
+        */
+       struct ptlrpc_service_part     *nrs_svcpt;
+       /**
+        * Primary policy, which is the preferred policy for handling RPCs
+        */
+       struct ptlrpc_nrs_policy       *nrs_policy_primary;
+       /**
+        * Fallback policy, which is the backup policy for handling RPCs
+        */
+       struct ptlrpc_nrs_policy       *nrs_policy_fallback;
+       /**
+        * This NRS head handles either HP or regular requests
+        */
+       enum ptlrpc_nrs_queue_type      nrs_queue_type;
+       /**
+        * # queued requests from all policies in this NRS head
+        */
+       unsigned long                   nrs_req_queued;
+       /**
+        * # scheduled requests from all policies in this NRS head
+        */
+       unsigned long                   nrs_req_started;
+       /**
+        * # policies on this NRS
+        */
+       unsigned                        nrs_num_pols;
+       /**
+        * This NRS head is in the process of starting a policy
+        */
+       unsigned                        nrs_policy_starting:1;
+       /**
+        * In the process of shutting down the whole NRS head; used during
+        * unregistration
+        */
+       unsigned                        nrs_stopping:1;
+};
+
+#define NRS_POL_NAME_MAX               16
+
+struct ptlrpc_nrs_pol_desc;
+
+/**
+ * Service compatibility predicate; this determines whether a policy is adequate
+ * for handling RPCs of a particular PTLRPC service.
+ *
+ * XXX:This should give the same result during policy registration and
+ * unregistration, and for all partitions of a service; so the result should not
+ * depend on temporal service or other properties, that may influence the
+ * result.
+ */
+typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
+                                      const struct ptlrpc_nrs_pol_desc *desc);
+
+struct ptlrpc_nrs_pol_conf {
+       /**
+        * Human-readable policy name
+        */
+       char                               nc_name[NRS_POL_NAME_MAX];
+       /**
+        * NRS operations for this policy
+        */
+       const struct ptlrpc_nrs_pol_ops   *nc_ops;
+       /**
+        * Service compatibility predicate
+        */
+       nrs_pol_desc_compat_t              nc_compat;
+       /**
+        * Set for policies that support a single ptlrpc service, i.e. ones that
+        * have \a nc_compat set to nrs_policy_compat_one(). The value of this
+        * field is the name of the single service that such policies are
+        * compatible with.
+        */
+       const char                        *nc_compat_svc_name;
+       /**
+        * Owner module for this policy descriptor; policies registering from a
+        * module different from the one the NRS framework is contained in
+        * (currently ptlrpc) should set this field to THIS_MODULE.
+        */
+       module_t                          *nc_owner;
+       /**
+        * Policy registration flags; a bitmask of \e nrs_policy_flags
+        */
+       unsigned                           nc_flags;
+};
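+
+/*
+ * For illustration only: an externally-registering policy module might fill
+ * in a configuration like the one below in its init function and pass it to
+ * ptlrpc_nrs_policy_register(); all nrs_example_* names are hypothetical.
+ *
+ *     static struct ptlrpc_nrs_pol_conf nrs_example_conf = {
+ *             .nc_name        = "example",
+ *             .nc_ops         = &nrs_example_ops,
+ *             .nc_compat      = nrs_example_compat,
+ *             .nc_owner       = THIS_MODULE,
+ *             .nc_flags       = PTLRPC_NRS_FL_REG_EXTERN,
+ *     };
+ *
+ *     rc = ptlrpc_nrs_policy_register(&nrs_example_conf);
+ */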
+
+/**
+ * NRS policy registering descriptor
+ *
+ * Is used to hold a description of a policy that can be passed to NRS core in
+ * order to register the policy with NRS heads in different PTLRPC services.
+ */
+struct ptlrpc_nrs_pol_desc {
+       /**
+        * Human-readable policy name
+        */
+       char                                    pd_name[NRS_POL_NAME_MAX];
+       /**
+        * Link into nrs_core::nrs_policies
+        */
+       struct list_head                                pd_list;
+       /**
+        * NRS operations for this policy
+        */
+       const struct ptlrpc_nrs_pol_ops        *pd_ops;
+       /**
+        * Service compatibility predicate
+        */
+       nrs_pol_desc_compat_t                   pd_compat;
+       /**
+        * Set for policies that are compatible with only one PTLRPC service.
+        *
+        * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
+        */
+       const char                             *pd_compat_svc_name;
+       /**
+        * Owner module for this policy descriptor.
+        *
+        * We need to hold a reference to the module whenever we might make use
+        * of any of the module's contents, i.e.
+        * - If one or more instances of the policy are at a state where they
+        *   might be handling a request, i.e.
+        *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+        *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
+        *   call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
+        *   is taken on the module when
+        *   \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
+        *   becomes 0, so that we hold only one reference to the module maximum
+        *   at any time.
+        *
+        *   We do not need to hold a reference to the module, even though we
+        *   might use code and data from the module, in the following cases:
+        * - During external policy registration, because this should happen in
+        *   the module's init() function, in which case the module is safe from
+        *   removal because a reference is being held on the module by the
+        *   kernel, and iirc kmod (and I guess module-init-tools also) will
+        *   serialize any racing processes properly anyway.
+        * - During external policy unregistration, because this should happen
+        *   in a module's exit() function, and any attempts to start a policy
+        *   instance would need to take a reference on the module, and this is
+        *   not possible once we have reached the point where the exit()
+        *   handler is called.
+        * - During service registration and unregistration, as service setup
+        *   and cleanup, and policy registration, unregistration and policy
+        *   instance starting, are serialized by \e nrs_core::nrs_mutex, so
+        *   as long as users adhere to the convention of registering policies
+        *   in init() and unregistering them in module exit() functions, there
+        *   should not be a race between these operations.
+        * - During any policy-specific lprocfs operations, because a reference
+        *   is held by the kernel on a proc entry that has been entered by a
+        *   syscall, so as long as proc entries are removed during unregistration time,
+        *   then unregistration and lprocfs operations will be properly
+        *   serialized.
+        */
+       module_t                               *pd_owner;
+       /**
+        * Bitmask of \e nrs_policy_flags
+        */
+       unsigned                                pd_flags;
+       /**
+        * # of references on this descriptor
+        */
+       atomic_t                                pd_refs;
+};
+
+/**
+ * NRS policy state
+ *
+ * Policies transition from one state to the other during their lifetime
+ */
+enum ptlrpc_nrs_pol_state {
+       /**
+        * Not a valid policy state.
+        */
+       NRS_POL_STATE_INVALID,
+       /**
+        * Policies are in this state either at the start of their life, or
+        * transition here when the user selects a different policy to act
+        * as the primary one.
+        */
+       NRS_POL_STATE_STOPPED,
+       /**
+        * Policy is in the process of stopping
+        */
+       NRS_POL_STATE_STOPPING,
+       /**
+        * Policy is in the process of starting
+        */
+       NRS_POL_STATE_STARTING,
+       /**
+        * A policy is in this state in two cases:
+        * - it is the fallback policy, which is always in this state.
+        * - it has been activated by the user; i.e. it is the primary policy,
+        */
+       NRS_POL_STATE_STARTED,
+};
+
+/**
+ * NRS policy information
+ *
+ * Used for obtaining information on the status of a policy via lprocfs
+ */
+struct ptlrpc_nrs_pol_info {
+       /**
+        * Policy name
+        */
+       char                            pi_name[NRS_POL_NAME_MAX];
+       /**
+        * Current policy state
+        */
+       enum ptlrpc_nrs_pol_state       pi_state;
+       /**
+        * # RPCs enqueued for later dispatching by the policy
+        */
+       long                            pi_req_queued;
+       /**
+        * # RPCs started for dispatch by the policy
+        */
+       long                            pi_req_started;
+       /**
+        * Is this a fallback policy?
+        */
+       unsigned                        pi_fallback:1;
+};
+
+/**
+ * NRS policy
+ *
+ * There is one instance of this for each policy in each NRS head of each
+ * PTLRPC service partition.
+ */
+struct ptlrpc_nrs_policy {
+       /**
+        * Linkage into the NRS head's list of policies,
+        * ptlrpc_nrs:nrs_policy_list
+        */
+       struct list_head                        pol_list;
+       /**
+        * Linkage into the NRS head's list of policies with enqueued
+        * requests ptlrpc_nrs:nrs_policy_queued
+        */
+       struct list_head                        pol_list_queued;
+       /**
+        * Current state of this policy
+        */
+       enum ptlrpc_nrs_pol_state       pol_state;
+       /**
+        * Bitmask of nrs_policy_flags
+        */
+       unsigned                        pol_flags;
+       /**
+        * # RPCs enqueued for later dispatching by the policy
+        */
+       long                            pol_req_queued;
+       /**
+        * # RPCs started for dispatch by the policy
+        */
+       long                            pol_req_started;
+       /**
+        * Usage reference count taken on the policy instance
+        */
+       long                            pol_ref;
+       /**
+        * The NRS head this policy has been created at
+        */
+       struct ptlrpc_nrs              *pol_nrs;
+       /**
+        * Private policy data; varies by policy type
+        */
+       void                           *pol_private;
+       /**
+        * Policy descriptor for this policy instance.
+        */
+       struct ptlrpc_nrs_pol_desc     *pol_desc;
+};
+
+/**
+ * NRS resource
+ *
+ * Resources are embedded into two types of NRS entities:
+ * - Inside NRS policies, in the policy's private data in
+ *   ptlrpc_nrs_policy::pol_private
+ * - In objects that act as prime-level scheduling entities in different NRS
+ *   policies; e.g. on a policy that performs round robin or similar order
+ *   scheduling across client NIDs, there would be one NRS resource per unique
+ *   client NID. On a policy which performs round robin scheduling across
+ *   backend filesystem objects, there would be one resource associated with
+ *   each of the backend filesystem objects partaking in the scheduling
+ *   performed by the policy.
+ *
+ * NRS resources share a parent-child relationship, in which resources embedded
+ * in policy instances are the parent entities, with all scheduling entities
+ * a policy schedules across being the children, thus forming a simple resource
+ * hierarchy. This hierarchy may be extended with one or more levels in the
+ * future if the ability to have more than one primary policy is added.
+ *
+ * Upon request initialization, references to the then active NRS policies are
+ * taken and used to later handle the dispatching of the request with one of
+ * these policies.
+ *
+ * \see nrs_resource_get_safe()
+ * \see ptlrpc_nrs_req_add()
+ */
+struct ptlrpc_nrs_resource {
+       /**
+        * This NRS resource's parent; is NULL for resources embedded in NRS
+        * policy instances; i.e. those are top-level ones.
+        */
+       struct ptlrpc_nrs_resource     *res_parent;
+       /**
+        * The policy associated with this resource.
+        */
+       struct ptlrpc_nrs_policy       *res_policy;
+};
+
+enum {
+       NRS_RES_FALLBACK,
+       NRS_RES_PRIMARY,
+       NRS_RES_MAX
+};
+
+/**
+ * \name fifo
+ *
+ * FIFO policy
+ *
+ * This policy is a logical wrapper around previous, non-NRS functionality.
+ * It dispatches RPCs in the same order as they arrive from the network. This
+ * policy is currently used as the fallback policy, and the only enabled policy
+ * on all NRS heads of all PTLRPC service partitions.
+ * @{
+ */
+
+/**
+ * Private data structure for the FIFO policy
+ */
+struct nrs_fifo_head {
+       /**
+        * Resource object for policy instance.
+        */
+       struct ptlrpc_nrs_resource      fh_res;
+       /**
+        * List of queued requests.
+        */
+       struct list_head                        fh_list;
+       /**
+        * For debugging purposes.
+        */
+       __u64                           fh_sequence;
+};
+
+struct nrs_fifo_req {
+       struct list_head                fr_list;
+       __u64                   fr_sequence;
+};
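+
+/*
+ * For illustration only: FIFO enqueue/dequeue reduces to simple list
+ * operations on fh_list, with fh_sequence stamped on each request for
+ * debugging; roughly:
+ *
+ *     nrq->nr_u.fifo.fr_sequence = head->fh_sequence++;
+ *     list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list);
+ *     ...
+ *     list_del_init(&nrq->nr_u.fifo.fr_list);
+ */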
+
+/** @} fifo */
+
+/**
+ * \name CRR-N
+ *
+ * CRR-N, Client Round Robin over NIDs
+ * @{
+ */
+
+/**
+ * private data structure for CRR-N NRS
+ */
+struct nrs_crrn_net {
+       struct ptlrpc_nrs_resource      cn_res;
+       cfs_binheap_t                  *cn_binheap;
+       cfs_hash_t                     *cn_cli_hash;
+       /**
+        * Used when a new scheduling round commences, in order to synchronize
+        * all clients with the new round number.
+        */
+       __u64                           cn_round;
+       /**
+        * Determines the relevant ordering amongst request batches within a
+        * scheduling round.
+        */
+       __u64                           cn_sequence;
+       /**
+        * Round Robin quantum; the maximum number of RPCs that each request
+        * batch for each client can have in a scheduling round.
+        */
+       __u16                           cn_quantum;
+};
+
+/**
+ * Object representing a client in CRR-N, as identified by its NID
+ */
+struct nrs_crrn_client {
+       struct ptlrpc_nrs_resource      cc_res;
+       struct hlist_node               cc_hnode;
+       lnet_nid_t                      cc_nid;
+       /**
+        * The round number against which this client is currently scheduling
+        * requests.
+        */
+       __u64                           cc_round;
+       /**
+        * The sequence number used for requests scheduled by this client during
+        * the current round number.
+        */
+       __u64                           cc_sequence;
+       atomic_t                        cc_ref;
+       /**
+        * Round Robin quantum; the maximum number of RPCs the client is allowed
+        * to schedule in a single batch of each round.
+        */
+       __u16                           cc_quantum;
+       /**
+        * # of pending requests for this client, on all existing rounds
+        */
+       __u16                           cc_active;
+};
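+
+/*
+ * For illustration only: a two-level op_res_get for CRR-N would typically
+ * return the policy-wide resource when parent is NULL, and a per-client leaf
+ * resource otherwise; net and cli below are hypothetical local variables:
+ *
+ *     if (parent == NULL) {
+ *             *resp = &net->cn_res;
+ *             return 0;
+ *     }
+ *     *resp = &cli->cc_res;
+ *     return 1;
+ */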
+
+/**
+ * CRR-N NRS request definition
+ */
+struct nrs_crrn_req {
+       /**
+        * Round number for this request; shared with all other requests in the
+        * same batch.
+        */
+       __u64                   cr_round;
+       /**
+        * Sequence number for this request; shared with all other requests in
+        * the same batch.
+        */
+       __u64                   cr_sequence;
+};
+
+/**
+ * CRR-N policy operations.
+ */
+enum nrs_ctl_crr {
+       /**
+        * Read the RR quantum size of a CRR-N policy.
+        */
+       NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+       /**
+        * Write the RR quantum size of a CRR-N policy.
+        */
+       NRS_CTL_CRRN_WR_QUANTUM,
+};
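+
+/*
+ * For illustration only: an op_policy_ctl handler could service these opcodes
+ * roughly as follows, with arg pointing to a __u16 quantum value and net
+ * being a hypothetical pointer to the policy's nrs_crrn_net:
+ *
+ *     case NRS_CTL_CRRN_RD_QUANTUM:
+ *             *(__u16 *)arg = net->cn_quantum;
+ *             break;
+ *     case NRS_CTL_CRRN_WR_QUANTUM:
+ *             net->cn_quantum = *(__u16 *)arg;
+ *             break;
+ */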
+
+/** @} CRR-N */
+
+/**
+ * \name ORR/TRR
+ *
+ * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies
+ * @{
+ */
+
+/**
+ * Lower and upper byte offsets of a brw RPC
+ */
+struct nrs_orr_req_range {
+       __u64           or_start;
+       __u64           or_end;
+};
+
+/**
+ * RPC types supported by the ORR/TRR policies
+ */
+enum nrs_orr_supp {
+       NOS_OST_READ  = (1 << 0),
+       NOS_OST_WRITE = (1 << 1),
+       NOS_OST_RW    = (NOS_OST_READ | NOS_OST_WRITE),
+       /**
+        * Default value for policies.
+        */
+       NOS_DFLT      = NOS_OST_READ
+};
+
+/**
+ * As unique keys for grouping RPCs together, we use the object's OST FID for
+ * the ORR policy, and the OST index for the TRR policy.
+ *
+ * XXX: We waste some space for TRR policy instances by using a union, but it
+ *     allows us to consolidate some of the code between ORR and TRR, and
+ *     these policies will probably eventually merge into one anyway.
+ */
+struct nrs_orr_key {
+       union {
+               /** object FID for ORR */
+               struct lu_fid   ok_fid;
+               /** OST index for TRR */
+               __u32           ok_idx;
+       };
+};
+
+/**
+ * The largest base string for unique hash/slab object names is
+ * "nrs_orr_reg_", so 13 characters. We add 3 to this to be used for the CPT
+ * id number, so this _should_ be more than enough for the maximum number of
+ * CPTs on any system. If it does happen that this statement is incorrect,
+ * nrs_orr_genobjname() will inevitably yield a non-unique name and cause
+ * kmem_cache_create() to complain (on Linux), so the erroneous situation
+ * will hopefully not go unnoticed.
+ */
+#define NRS_ORR_OBJ_NAME_MAX   (sizeof("nrs_orr_reg_") + 3)
+
+/**
+ * private data structure for ORR and TRR NRS
+ */
+struct nrs_orr_data {
+       struct ptlrpc_nrs_resource      od_res;
+       cfs_binheap_t                  *od_binheap;
+       cfs_hash_t                     *od_obj_hash;
+       struct kmem_cache                      *od_cache;
+       /**
+        * Used when a new scheduling round commences, in order to synchronize
+        * all object or OST batches with the new round number.
+        */
+       __u64                           od_round;
+       /**
+        * Determines the relevant ordering amongst request batches within a
+        * scheduling round.
+        */
+       __u64                           od_sequence;
+       /**
+        * RPC types that are currently supported.
+        */
+       enum nrs_orr_supp               od_supp;
+       /**
+        * Round Robin quantum; the maximum number of RPCs that each request
+        * batch for each object or OST can have in a scheduling round.
+        */
+       __u16                           od_quantum;
+       /**
+        * Whether to use physical disk offsets or logical file offsets.
+        */
+       bool                            od_physical;
+       /**
+        * XXX: We need to provide a persistently allocated string to hold
+        * unique object names for this policy, since in the Linux kernel
+        * versions currently supported by Lustre, kmem_cache_create() just
+        * sets a pointer to the name string provided. kstrdup() is used in the
+        * version of kmem_cache_create() in current Linux mainline, so we may
+        * be able to
+        * remove this in the future.
+        */
+       char                            od_objname[NRS_ORR_OBJ_NAME_MAX];
+};
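+
+/*
+ * For illustration only: given the above, nrs_orr_genobjname() can be
+ * expected to produce names of the form "nrs_orr_reg_<cpt-id>", roughly
+ * (cpt_id being a hypothetical CPT index):
+ *
+ *     snprintf(od->od_objname, sizeof(od->od_objname), "%s%d",
+ *              "nrs_orr_reg_", cpt_id);
+ */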
+
+/**
+ * Represents a backend-fs object or OST in the ORR and TRR policies
+ * respectively
+ */
+struct nrs_orr_object {
+       struct ptlrpc_nrs_resource      oo_res;
+       struct hlist_node               oo_hnode;
+       /**
+        * The round number against which requests are being scheduled for this
+        * object or OST
+        */
+       __u64                           oo_round;
+       /**
+        * The sequence number used for requests scheduled for this object or
+        * OST during the current round number.
+        */
+       __u64                           oo_sequence;
+       /**
+        * The key of the object or OST for which this structure instance is
+        * scheduling RPCs
+        */
+       struct nrs_orr_key              oo_key;
+       atomic_t                        oo_ref;
+       /**
+        * Round Robin quantum; the maximum number of RPCs that are allowed to
+        * be scheduled for the object or OST in a single batch of each round.
+        */
+       __u16                           oo_quantum;
+       /**
+        * # of pending requests for this object or OST, on all existing rounds
+        */
+       __u16                           oo_active;
+};
+
+/**
+ * ORR/TRR NRS request definition
+ */
+struct nrs_orr_req {
+       /**
+        * The offset range this request covers
+        */
+       struct nrs_orr_req_range        or_range;
+       /**
+        * Round number for this request; shared with all other requests in the
+        * same batch.
+        */
+       __u64                           or_round;
+       /**
+        * Sequence number for this request; shared with all other requests in
+        * the same batch.
+        */
+       __u64                           or_sequence;
+       /**
+        * For debugging purposes.
+        */
+       struct nrs_orr_key              or_key;
+       /**
+        * An ORR policy instance has filled in request information while
+        * enqueueing the request on the service partition's regular NRS head.
+        */
+       unsigned int                    or_orr_set:1;
+       /**
+        * A TRR policy instance has filled in request information while
+        * enqueueing the request on the service partition's regular NRS head.
+        */
+       unsigned int                    or_trr_set:1;
+       /**
+        * Request offset ranges have been filled in with logical offset
+        * values.
+        */
+       unsigned int                    or_logical_set:1;
+       /**
+        * Request offset ranges have been filled in with physical offset
+        * values.
+        */
+       unsigned int                    or_physical_set:1;
+};
+
+/** @} ORR/TRR */
+
+/**
+ * NRS request
+ *
+ * Instances of this object exist embedded within ptlrpc_request; the main
+ * purpose of this object is to hold references to the request's resources
+ * for the lifetime of the request, and to hold properties that policies use
+ * for determining the request's scheduling priority.
+ */
+struct ptlrpc_nrs_request {
+       /**
+        * The request's resource hierarchy.
+        */
+       struct ptlrpc_nrs_resource     *nr_res_ptrs[NRS_RES_MAX];
+       /**
+        * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
+        * policy that was used to enqueue the request.
+        *
+        * \see nrs_request_enqueue()
+        */
+       unsigned                        nr_res_idx;
+       unsigned                        nr_initialized:1;
+       unsigned                        nr_enqueued:1;
+       unsigned                        nr_started:1;
+       unsigned                        nr_finalized:1;
+       cfs_binheap_node_t              nr_node;
+
+       /**
+        * Policy-specific fields, used for determining a request's scheduling
+        * priority, and other supporting functionality.
+        */
+       union {
+               /**
+                * Fields for the FIFO policy
+                */
+               struct nrs_fifo_req     fifo;
+               /**
+                * CRR-N request definition
+                */
+               struct nrs_crrn_req     crr;
+               /** ORR and TRR share the same request definition */
+               struct nrs_orr_req      orr;
+       } nr_u;
+       /**
+        * Externally-registering policies may want to use this to allocate
+        * their own request properties.
+        */
+       void                           *ext;
+};
+
+/** @} nrs */
+
+/**
+ * Basic request prioritization operations structure.
+ * The whole idea is centered around locks and RPCs that might affect locks.
+ * When a lock is contended we try to give priority to RPCs that might lead
+ * to the fastest release of that lock.
+ * Currently implemented only for OSTs, in a way that gives all IO and truncate
+ * RPCs coming from a region under a contended lock priority over other
+ * requests.
+ */
+struct ptlrpc_hpreq_ops {
+       /**
+        * Check if the lock handle of the given lock is the same as
+        * taken from the request.
+        */
+       int  (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *);
+       /**
+        * Check if the request is a high priority one.
+        */
+       int  (*hpreq_check)(struct ptlrpc_request *);
+       /**
+        * Called after the request has been handled.
+        */
+       void (*hpreq_fini)(struct ptlrpc_request *);
+};
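+
+/*
+ * For illustration only: a service's high-priority request handler would
+ * typically attach an instance of these operations to the request; the
+ * example_* names below are hypothetical:
+ *
+ *     static struct ptlrpc_hpreq_ops example_hpreq_ops = {
+ *             .hpreq_lock_match = example_hpreq_lock_match,
+ *             .hpreq_check      = example_hpreq_check,
+ *     };
+ *
+ *     req->rq_ops = &example_hpreq_ops;
+ */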
+
+/**
+ * Represents a remote procedure call.
+ *
+ * This is a staple structure used by everybody wanting to send a request
+ * in Lustre.
+ */
+struct ptlrpc_request {
+       /* Request type: one of PTL_RPC_MSG_* */
+       int rq_type;
+       /** Result of request processing */
+       int rq_status;
+       /**
+        * Linkage item through which this request is included into
+        * sending/delayed lists on client and into rqbd list on server
+        */
+       struct list_head rq_list;
+       /**
+        * Server side list of incoming unserved requests sorted by arrival
+        * time.  Traversed from time to time to notice about-to-expire
+        * requests and send back "early replies" to clients to let them
+        * know the server is alive and well, just too busy to service their
+        * requests in time.
+        */
+       struct list_head rq_timed_list;
+       /** server-side history, used for debugging purposes. */
+       struct list_head rq_history_list;
+       /** server-side per-export list */
+       struct list_head rq_exp_list;
+       /** server-side hp handlers */
+       struct ptlrpc_hpreq_ops *rq_ops;
+
+       /** initial thread servicing this request */
+       struct ptlrpc_thread *rq_svc_thread;
+
+       /** history sequence # */
+       __u64 rq_history_seq;
+       /** \addtogroup  nrs
+        * @{
+        */
+       /** stub for NRS request */
+       struct ptlrpc_nrs_request rq_nrq;
+       /** @} nrs */
+       /** the index of service's srv_at_array into which request is linked */
+       time_t rq_at_index;
+       /** Lock to protect request flags and some other important bits, like
+        * rq_list
+        */
+       spinlock_t rq_lock;
+       /** client-side flags are serialized by rq_lock */
+       unsigned int rq_intr:1, rq_replied:1, rq_err:1,
+               rq_timedout:1, rq_resend:1, rq_restart:1,
+               /**
+                * when ->rq_replay is set, request is kept by the client even
+                * after server commits corresponding transaction. This is
+                * used for operations that require sequence of multiple
+                * requests to be replayed. The only example currently is file
+                * open/close. When last request in such a sequence is
+                * committed, ->rq_replay is cleared on all requests in the
+                * sequence.
+                */
+               rq_replay:1,
+               rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
+               rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
+               rq_early:1, rq_must_unlink:1,
+               rq_memalloc:1,      /* req originated from "kswapd" */
+               /* server-side flags */
+               rq_packed_final:1,  /* packed final reply */
+               rq_hp:1,            /* high priority RPC */
+               rq_at_linked:1,     /* link into service's srv_at_array */
+               rq_reply_truncate:1,
+               rq_committed:1,
+               /* whether the "rq_set" is a valid one */
+               rq_invalid_rqset:1,
+               rq_generation_set:1,
+               /* do not resend request on -EINPROGRESS */
+               rq_no_retry_einprogress:1,
+               /* allow the req to be sent if the import is in recovery
+                * status */
+               rq_allow_replay:1,
+               /* bulk request, sent to server, but uncommitted */
+               rq_unstable:1;
+
+       unsigned int rq_nr_resend;
+
+       enum rq_phase rq_phase; /* one of RQ_PHASE_* */
+       enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
+       atomic_t rq_refcount;/* client-side refcount for SENT race,
+                                   server-side refcount for multiple replies */
+
+       /** Portal to which this request would be sent */
+       short rq_request_portal;  /* XXX FIXME bug 249 */
+       /** Portal where to wait for reply and where reply would be sent */
+       short rq_reply_portal;    /* XXX FIXME bug 249 */
+
+       /**
+        * client-side:
+        * !rq_truncate : # reply bytes actually received,
+        *  rq_truncate : required repbuf_len for resend
+        */
+       int rq_nob_received;
+       /** Request length */
+       int rq_reqlen;
+       /** Reply length */
+       int rq_replen;
+       /** Request message - what client sent */
+       struct lustre_msg *rq_reqmsg;
+       /** Reply message - server response */
+       struct lustre_msg *rq_repmsg;
+       /** Transaction number */
+       __u64 rq_transno;
+       /** xid */
+       __u64 rq_xid;
+       /**
+        * List item for the replay list. Not yet committed requests get linked
+        * there.
+        * Also see \a rq_replay comment above.
+        */
+       struct list_head rq_replay_list;
+
+       /**
+        * security and encryption data
+        * @{ */
+       struct ptlrpc_cli_ctx   *rq_cli_ctx;     /**< client's half ctx */
+       struct ptlrpc_svc_ctx   *rq_svc_ctx;     /**< server's half ctx */
+       struct list_head               rq_ctx_chain;   /**< link to waited ctx */
+
+       struct sptlrpc_flavor    rq_flvr;       /**< for client & server */
+       enum lustre_sec_part     rq_sp_from;
+
+       /* client/server security flags */
+       unsigned int
+                                rq_ctx_init:1,      /* context initiation */
+                                rq_ctx_fini:1,      /* context destroy */
+                                rq_bulk_read:1,     /* request bulk read */
+                                rq_bulk_write:1,    /* request bulk write */
+                                /* server authentication flags */
+                                rq_auth_gss:1,      /* authenticated by gss */
+                                rq_auth_remote:1,   /* authed as remote user */
+                                rq_auth_usr_root:1, /* authed as root */
+                                rq_auth_usr_mdt:1,  /* authed as mdt */
+                                rq_auth_usr_ost:1,  /* authed as ost */
+                                /* security tfm flags */
+                                rq_pack_udesc:1,
+                                rq_pack_bulk:1,
+                                /* doesn't expect reply FIXME */
+                                rq_no_reply:1,
+                                rq_pill_init:1;     /* pill initialized */
+
+       uid_t               rq_auth_uid;        /* authed uid */
+       uid_t               rq_auth_mapped_uid; /* authed uid mapped to */
+
+       /* (server side), pointed directly into req buffer */
+       struct ptlrpc_user_desc *rq_user_desc;
+
+       /* various buffer pointers */
+       struct lustre_msg       *rq_reqbuf;      /* req wrapper */
+       char                *rq_repbuf;      /* rep buffer */
+       struct lustre_msg       *rq_repdata;     /* rep wrapper msg */
+       struct lustre_msg       *rq_clrbuf;      /* only in priv mode */
+       int                   rq_reqbuf_len;  /* req wrapper buf len */
+       int                   rq_reqdata_len; /* req wrapper msg len */
+       int                   rq_repbuf_len;  /* rep buffer len */
+       int                   rq_repdata_len; /* rep wrapper msg len */
+       int                   rq_clrbuf_len;  /* only in priv mode */
+       int                   rq_clrdata_len; /* only in priv mode */
+
+       /** early replies go to offset 0, regular replies go after that */
+       unsigned int         rq_reply_off;
+
+       /** @} */
+
+       /** Fields that help to see if request and reply were swabbed or not */
+       __u32 rq_req_swab_mask;
+       __u32 rq_rep_swab_mask;
+
+       /** What was import generation when this request was sent */
+       int rq_import_generation;
+       enum lustre_imp_state rq_send_state;
+
+       /** how many early replies (for stats) */
+       int rq_early_count;
+
+       /** client+server request */
+       lnet_handle_md_t     rq_req_md_h;
+       struct ptlrpc_cb_id  rq_req_cbid;
+       /** optional time limit for send attempts */
+       cfs_duration_t       rq_delay_limit;
+       /** time request was first queued */
+       cfs_time_t         rq_queued_time;
+
+       /* server-side... */
+       /** request arrival time */
+       struct timeval       rq_arrival_time;
+       /** separated reply state */
+       struct ptlrpc_reply_state *rq_reply_state;
+       /** incoming request buffer */
+       struct ptlrpc_request_buffer_desc *rq_rqbd;
+
+       /** client-only incoming reply */
+       lnet_handle_md_t     rq_reply_md_h;
+       wait_queue_head_t         rq_reply_waitq;
+       struct ptlrpc_cb_id  rq_reply_cbid;
+
+       /** our LNet NID */
+       lnet_nid_t         rq_self;
+       /** Peer description (the other side) */
+       lnet_process_id_t    rq_peer;
+       /** Server-side, export on which request was received */
+       struct obd_export   *rq_export;
+       /** Client side, import where request is being sent */
+       struct obd_import   *rq_import;
+
+       /** Replay callback, called after request is replayed at recovery */
+       void (*rq_replay_cb)(struct ptlrpc_request *);
+       /**
+        * Commit callback, called when request is committed and about to be
+        * freed.
+        */
+       void (*rq_commit_cb)(struct ptlrpc_request *);
+       /** Opaque data for replay and commit callbacks. */
+       void  *rq_cb_data;
+
+       /** For bulk requests on client only: bulk descriptor */
+       struct ptlrpc_bulk_desc *rq_bulk;
+
+       /** client outgoing req */
+       /**
+        * when request/reply sent (secs), or time when request should be sent
+        */
+       time_t rq_sent;
+       /** time for request really sent out */
+       time_t rq_real_sent;
+
+       /** when request must finish. volatile
+        * so that servers' early reply updates to the deadline aren't
+        * kept in per-cpu cache */
+       volatile time_t rq_deadline;
+       /** when req reply unlink must finish. */
+       time_t rq_reply_deadline;
+       /** when req bulk unlink must finish. */
+       time_t rq_bulk_deadline;
+       /**
+        * service time estimate (secs)
+        * If the request is not served by this time, it is marked as timed out.
+        */
+       int    rq_timeout;
+
+       /** Multi-rpc bits */
+       /** Per-request waitq introduced by bug 21938 for recovery waiting */
+       wait_queue_head_t rq_set_waitq;
+       /** Link item for request set lists */
+       struct list_head  rq_set_chain;
+       /** Link back to the request set */
+       struct ptlrpc_request_set *rq_set;
+       /** Async completion handler, called when reply is received */
+       ptlrpc_interpterer_t rq_interpret_reply;
+       /** Async completion context */
+       union ptlrpc_async_args rq_async_args;
+
+       /** Pool if request is from preallocated list */
+       struct ptlrpc_request_pool *rq_pool;
+
+       struct lu_context          rq_session;
+       struct lu_context          rq_recov_session;
+
+       /** request format description */
+       struct req_capsule        rq_pill;
+};
+
+/**
+ * Call completion handler for rpc if any, return its status or original
+ * rc if there was no handler defined for this request.
+ */
+static inline int ptlrpc_req_interpret(const struct lu_env *env,
+                                      struct ptlrpc_request *req, int rc)
+{
+       if (req->rq_interpret_reply != NULL) {
+               req->rq_status = req->rq_interpret_reply(env, req,
+                                                        &req->rq_async_args,
+                                                        rc);
+               return req->rq_status;
+       }
+       return rc;
+}
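+
+/*
+ * For illustration only: a caller can hook reply interpretation by setting
+ * rq_interpret_reply before sending the request; example_interpret below is
+ * hypothetical and assumes the callback signature implied by the call above:
+ *
+ *     static int example_interpret(const struct lu_env *env,
+ *                                  struct ptlrpc_request *req,
+ *                                  void *args, int rc)
+ *     {
+ *             return rc;
+ *     }
+ *
+ *     req->rq_interpret_reply = example_interpret;
+ */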
+
+/** \addtogroup  nrs
+ * @{
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf);
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf);
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req);
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+                               struct ptlrpc_nrs_pol_info *info);
+
+/*
+ * Can the request be moved from the regular NRS head to the high-priority NRS
+ * head (of the same PTLRPC service partition), if any?
+ *
+ * For a reliable result, this should be checked under svcpt->scp_req_lock.
+ */
+static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req)
+{
+       struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+
+       /**
+        * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the
+        * request has been enqueued first, and ptlrpc_nrs_request::nr_started
+        * to make sure it has not been scheduled yet (analogous to previous
+        * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list)).
+        */
+       return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp;
+}
+/** @} nrs */
+
+/**
+ * Returns 1 if request buffer at offset \a index was already swabbed
+ */
+static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index)
+{
+       LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+       return req->rq_req_swab_mask & (1 << index);
+}
+
+/**
+ * Returns 1 if request reply buffer at offset \a index was already swabbed
+ */
+static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+       LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+       return req->rq_rep_swab_mask & (1 << index);
+}
+
+/**
+ * Returns 1 if request needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req)
+{
+       return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Returns 1 if request reply needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req)
+{
+       return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Mark request buffer at offset \a index that it was already swabbed
+ */
+static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index)
+{
+       LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+       LASSERT((req->rq_req_swab_mask & (1 << index)) == 0);
+       req->rq_req_swab_mask |= 1 << index;
+}
+
+/**
+ * Mark request reply buffer at offset \a index that it was already swabbed
+ */
+static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+       LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+       LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0);
+       req->rq_rep_swab_mask |= 1 << index;
+}
+
+/**
+ * Convert numerical request phase value \a phase into text string description
+ */
+static inline const char *
+ptlrpc_phase2str(enum rq_phase phase)
+{
+       switch (phase) {
+       case RQ_PHASE_NEW:
+               return "New";
+       case RQ_PHASE_RPC:
+               return "Rpc";
+       case RQ_PHASE_BULK:
+               return "Bulk";
+       case RQ_PHASE_INTERPRET:
+               return "Interpret";
+       case RQ_PHASE_COMPLETE:
+               return "Complete";
+       case RQ_PHASE_UNREGISTERING:
+               return "Unregistering";
+       default:
+               return "?Phase?";
+       }
+}
+
+/**
+ * Convert numerical request phase of the request \a req into text string
+ * description
+ */
+static inline const char *
+ptlrpc_rqphase2str(struct ptlrpc_request *req)
+{
+       return ptlrpc_phase2str(req->rq_phase);
+}
+
+/**
+ * Debugging functions and helpers to print request structure into debug log
+ * @{
+ */
+/* Spare the preprocessor, spoil the bugs. */
+#define FLAG(field, str) (field ? str : "")
+
+/** Convert bit flags into a string */
+#define DEBUG_REQ_FLAGS(req)                                               \
+       ptlrpc_rqphase2str(req),                                                \
+       FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"),                \
+       FLAG(req->rq_err, "E"),                                          \
+       FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"),   \
+       FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"),            \
+       FLAG(req->rq_no_resend, "N"),                                      \
+       FLAG(req->rq_waiting, "W"),                                          \
+       FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"),                  \
+       FLAG(req->rq_committed, "M")
+
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s"
+
+void _debug_req(struct ptlrpc_request *req,
+               struct libcfs_debug_msg_data *data, const char *fmt, ...)
+       __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Helper that decides if we need to print request according to current debug
+ * level settings
+ */
+#define debug_req(msgdata, mask, cdls, req, fmt, a...)                 \
+do {                                                                     \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                            \
+                                                                             \
+       if (((mask) & D_CANTMASK) != 0 ||                                    \
+           ((libcfs_debug & (mask)) != 0 &&                              \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))          \
+               _debug_req((req), msgdata, fmt, ##a);                    \
+} while (0)
+
+/**
+ * This is the debug print function you need to use to print request structure
+ * content into the lustre debug log.
+ * For most callers (level is a constant) this is resolved at compile time */
+#define DEBUG_REQ(level, req, fmt, args...)                               \
+do {                                                                     \
+       if ((level) & (D_ERROR | D_WARNING)) {                          \
+               static cfs_debug_limit_state_t cdls;                      \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);          \
+               debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\
+       } else {                                                              \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);            \
+               debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \
+       }                                                                    \
+} while (0)
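+
+/*
+ * For illustration only, a typical call site might look like:
+ *
+ *     DEBUG_REQ(D_ERROR, req, "processing failed: rc = %d", rc);
+ */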
+/** @} */
+
+/**
+ * Structure that defines a single page of a bulk transfer
+ */
+struct ptlrpc_bulk_page {
+       /** Linkage to list of pages in a bulk */
+       struct list_head       bp_link;
+       /**
+        * Number of bytes in a page to transfer starting from \a bp_pageoffset
+        */
+       int           bp_buflen;
+       /** offset within a page */
+       int           bp_pageoffset;
+       /** The page itself */
+       struct page     *bp_page;
+};
+
+#define BULK_GET_SOURCE   0
+#define BULK_PUT_SINK     1
+#define BULK_GET_SINK     2
+#define BULK_PUT_SOURCE   3
+
+/**
+ * Definition of bulk descriptor.
+ * Bulks are special "two phase" RPCs where the initial request message
+ * is sent first and is followed by a transfer (or receiving) of a large
+ * amount of data to be settled into pages referenced from the bulk descriptors.
+ * Bulk transfers (the actual data following the small requests) are done
+ * on separate LNet portals.
+ * In Lustre we use bulk transfers for READ and WRITE transfers from/to OSTs.
+ * Another user is readpage for MDT.
+ */
+struct ptlrpc_bulk_desc {
+       /** completed with failure */
+       unsigned long bd_failure:1;
+       /** {put,get}{source,sink} */
+       unsigned long bd_type:2;
+       /** client side */
+       unsigned long bd_registered:1;
+       /** For serialization with callback */
+       spinlock_t bd_lock;
+       /** Import generation when request for this bulk was sent */
+       int bd_import_generation;
+       /** LNet portal for this bulk */
+       __u32 bd_portal;
+       /** Server side - export this bulk created for */
+       struct obd_export *bd_export;
+       /** Client side - import this bulk was sent on */
+       struct obd_import *bd_import;
+       /** Back pointer to the request */
+       struct ptlrpc_request *bd_req;
+       wait_queue_head_t           bd_waitq;   /* server side only WQ */
+       int                 bd_iov_count;    /* # entries in bd_iov */
+       int                 bd_max_iov;      /* allocated size of bd_iov */
+       int                 bd_nob;       /* # bytes covered */
+       int                 bd_nob_transferred; /* # bytes GOT/PUT */
+
+       __u64             bd_last_xid;
+
+       struct ptlrpc_cb_id    bd_cbid;  /* network callback info */
+       lnet_nid_t           bd_sender;       /* stash event::sender */
+       int                     bd_md_count;    /* # valid entries in bd_mds */
+       int                     bd_md_max_brw;  /* max entries in bd_mds */
+       /** array of associated MDs */
+       lnet_handle_md_t        bd_mds[PTLRPC_BULK_OPS_COUNT];
+
+       /*
+        * encrypt iov, size is either 0 or bd_iov_count.
+        */
+       lnet_kiov_t        *bd_enc_iov;
+
+       lnet_kiov_t         bd_iov[0];
+};
+
+enum {
+       SVC_STOPPED     = 1 << 0,
+       SVC_STOPPING    = 1 << 1,
+       SVC_STARTING    = 1 << 2,
+       SVC_RUNNING     = 1 << 3,
+       SVC_EVENT       = 1 << 4,
+       SVC_SIGNAL      = 1 << 5,
+};
+
+#define PTLRPC_THR_NAME_LEN            32
+/**
+ * Definition of server service thread structure
+ */
+struct ptlrpc_thread {
+       /**
+        * List of active threads in svc->srv_threads
+        */
+       struct list_head t_link;
+       /**
+        * thread-private data (preallocated memory)
+        */
+       void *t_data;
+       __u32 t_flags;
+       /**
+        * service thread index, from ptlrpc_start_threads
+        */
+       unsigned int t_id;
+       /**
+        * service thread pid
+        */
+       pid_t t_pid;
+       /**
+        * put watchdog in the structure per thread b=14840
+        */
+       struct lc_watchdog *t_watchdog;
+       /**
+        * the svc this thread belonged to b=18582
+        */
+       struct ptlrpc_service_part      *t_svcpt;
+       wait_queue_head_t                       t_ctl_waitq;
+       struct lu_env                   *t_env;
+       char                            t_name[PTLRPC_THR_NAME_LEN];
+};
+
+static inline int thread_is_init(struct ptlrpc_thread *thread)
+{
+       return thread->t_flags == 0;
+}
+
+static inline int thread_is_stopped(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_STOPPED);
+}
+
+static inline int thread_is_stopping(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_STOPPING);
+}
+
+static inline int thread_is_starting(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_STARTING);
+}
+
+static inline int thread_is_running(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_RUNNING);
+}
+
+static inline int thread_is_event(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_EVENT);
+}
+
+static inline int thread_is_signal(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_SIGNAL);
+}
+
+static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+       thread->t_flags &= ~flags;
+}
+
+static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+       thread->t_flags = flags;
+}
+
+static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+       thread->t_flags |= flags;
+}
+
+static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread,
+                                             __u32 flags)
+{
+       if (thread->t_flags & flags) {
+               thread->t_flags &= ~flags;
+               return 1;
+       }
+       return 0;
+}
+
+/**
+ * Request buffer descriptor structure.
+ * This structure holds one posted request buffer for a service.
+ * Once data lands in the buffer, the event callback creates the actual request
+ * and wakes one of the service threads to process the new incoming request.
+ * More than one request can fit into the buffer.
+ */
+struct ptlrpc_request_buffer_desc {
+       /** Link item for rqbds on a service */
+       struct list_head             rqbd_list;
+       /** History of requests for this buffer */
+       struct list_head             rqbd_reqs;
+       /** Back pointer to service for which this buffer is registered */
+       struct ptlrpc_service_part *rqbd_svcpt;
+       /** LNet descriptor */
+       lnet_handle_md_t       rqbd_md_h;
+       int                 rqbd_refcount;
+       /** The buffer itself */
+       char              *rqbd_buffer;
+       struct ptlrpc_cb_id    rqbd_cbid;
+       /**
+        * This "embedded" request structure is only used for the
+        * last request to fit into the buffer
+        */
+       struct ptlrpc_request  rqbd_req;
+};
+
+typedef int  (*svc_handler_t)(struct ptlrpc_request *req);
+
+struct ptlrpc_service_ops {
+       /**
+        * if non-NULL called during thread creation (ptlrpc_start_thread())
+        * to initialize service specific per-thread state.
+        */
+       int             (*so_thr_init)(struct ptlrpc_thread *thr);
+       /**
+        * if non-NULL called during thread shutdown (ptlrpc_main()) to
+        * destruct state created by ->srv_init().
+        */
+       void            (*so_thr_done)(struct ptlrpc_thread *thr);
+       /**
+        * Handler function for incoming requests for this service
+        */
+       int             (*so_req_handler)(struct ptlrpc_request *req);
+       /**
+        * function to determine priority of the request, it's called
+        * on every new request
+        */
+       int             (*so_hpreq_handler)(struct ptlrpc_request *);
+       /**
+        * service-specific print fn
+        */
+       void            (*so_req_printer)(void *, struct ptlrpc_request *);
+};
+
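For illustration only (not part of the patch), a service could wire up this ops table roughly as follows; the handler name is made up.

static int example_req_handler(struct ptlrpc_request *req)
{
        /* a real handler unpacks the request, does the work, packs a reply */
        return 0;
}

static struct ptlrpc_service_ops example_svc_ops = {
        .so_req_handler   = example_req_handler,
        .so_hpreq_handler = NULL,       /* no high-priority classification */
        .so_req_printer   = NULL,       /* no service-specific printer */
};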
+#ifndef __cfs_cacheline_aligned
+/* NB: put it here to reduce patch dependencies */
+# define __cfs_cacheline_aligned
+#endif
+
+/**
+ * How many high priority requests to serve before serving one normal
+ * priority request
+ */
+#define PTLRPC_SVC_HP_RATIO 10
+
+/**
+ * Definition of PortalRPC service.
+ * The service listens on a particular portal (like a tcp port) and performs
+ * actions for a specific server, such as the IO service for an OST or the
+ * general metadata service for an MDS.
+ */
+struct ptlrpc_service {
+       /** serialize /proc operations */
+       spinlock_t                      srv_lock;
+       /** most often accessed fields */
+       /** chain thru all services */
+       struct list_head                      srv_list;
+       /** service operations table */
+       struct ptlrpc_service_ops       srv_ops;
+       /** only statically allocated strings here; we don't clean them */
+       char                       *srv_name;
+       /** only statically allocated strings here; we don't clean them */
+       char                       *srv_thread_name;
+       /** service thread list */
+       struct list_head                      srv_threads;
+       /** # of threads to create for each partition on initialization */
+       int                             srv_nthrs_cpt_init;
+       /** upper limit on the number of threads for each partition */
+       int                             srv_nthrs_cpt_limit;
+       /** Root of /proc dir tree for this service */
+       proc_dir_entry_t           *srv_procroot;
+       /** Pointer to statistic data for this service */
+       struct lprocfs_stats       *srv_stats;
+       /** # hp per lp reqs to handle */
+       int                          srv_hpreq_ratio;
+       /** biggest request to receive */
+       int                          srv_max_req_size;
+       /** biggest reply to send */
+       int                          srv_max_reply_size;
+       /** size of individual buffers */
+       int                          srv_buf_size;
+       /** # buffers to allocate in 1 group */
+       int                          srv_nbuf_per_group;
+       /** Local portal on which to receive requests */
+       __u32                      srv_req_portal;
+       /** Portal on the client to send replies to */
+       __u32                      srv_rep_portal;
+       /**
+        * Tags for lu_context associated with this thread, see struct
+        * lu_context.
+        */
+       __u32                      srv_ctx_tags;
+       /** soft watchdog timeout multiplier */
+       int                          srv_watchdog_factor;
+       /** under unregister_service */
+       unsigned                        srv_is_stopping:1;
+
+       /** max # request buffers in history per partition */
+       int                             srv_hist_nrqbds_cpt_max;
+       /** number of CPTs this service bound on */
+       int                             srv_ncpts;
+       /** CPTs array this service bound on */
+       __u32                           *srv_cpts;
+       /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+       int                             srv_cpt_bits;
+       /** CPT table this service is running over */
+       struct cfs_cpt_table            *srv_cptable;
+       /**
+        * partition data for ptlrpc service
+        */
+       struct ptlrpc_service_part      *srv_parts[0];
+};
+
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service only has one instance of it right now, we will have
+ * multiple instances very soon (one instance per CPT).
+ *
+ * it has four locks:
+ * \a scp_lock
+ *    serialize operations on rqbd and requests waiting for preprocess
+ * \a scp_req_lock
+ *    serialize operations on active requests sent to this portal
+ * \a scp_at_lock
+ *    serialize adaptive timeout stuff
+ * \a scp_rep_lock
+ *    serialize operations on RS list (reply states)
+ *
+ * We don't have any use-case to take two or more locks at the same time
+ * for now, so there is no lock order issue.
+ */
+struct ptlrpc_service_part {
+       /** back reference to owner */
+       struct ptlrpc_service           *scp_service __cfs_cacheline_aligned;
+       /* CPT id, reserved */
+       int                             scp_cpt;
+       /** always increasing number */
+       int                             scp_thr_nextid;
+       /** # of starting threads */
+       int                             scp_nthrs_starting;
+       /** # of stopping threads, reserved for shrinking threads */
+       int                             scp_nthrs_stopping;
+       /** # running threads */
+       int                             scp_nthrs_running;
+       /** service threads list */
+       struct list_head                        scp_threads;
+
+       /**
+        * serialize the following fields, used for protecting
+        * rqbd list and incoming requests waiting for preprocess,
+        * threads starting & stopping are also protected by this lock.
+        */
+       spinlock_t                      scp_lock  __cfs_cacheline_aligned;
+       /** total # req buffer descs allocated */
+       int                             scp_nrqbds_total;
+       /** # posted request buffers for receiving */
+       int                             scp_nrqbds_posted;
+       /** rqbd allocation in progress */
+       int                             scp_rqbd_allocating;
+       /** # incoming reqs */
+       int                             scp_nreqs_incoming;
+       /** request buffers to be reposted */
+       struct list_head                        scp_rqbd_idle;
+       /** req buffers receiving */
+       struct list_head                        scp_rqbd_posted;
+       /** incoming reqs */
+       struct list_head                        scp_req_incoming;
+       /** timeout before re-posting reqs, in ticks */
+       cfs_duration_t                  scp_rqbd_timeout;
+       /**
+        * all threads sleep on this. This wait-queue is signalled when a new
+        * incoming request arrives and when a difficult reply has to be handled.
+        */
+       wait_queue_head_t                       scp_waitq;
+
+       /** request history */
+       struct list_head                        scp_hist_reqs;
+       /** request buffer history */
+       struct list_head                        scp_hist_rqbds;
+       /** # request buffers in history */
+       int                             scp_hist_nrqbds;
+       /** sequence number for request */
+       __u64                           scp_hist_seq;
+       /** highest seq culled from history */
+       __u64                           scp_hist_seq_culled;
+
+       /**
+        * serialize the following fields, used for processing requests
+        * sent to this portal
+        */
+       spinlock_t                      scp_req_lock __cfs_cacheline_aligned;
+       /** # reqs in either of the NRS heads below */
+       /** # reqs being served */
+       int                             scp_nreqs_active;
+       /** # HPreqs being served */
+       int                             scp_nhreqs_active;
+       /** # hp requests handled */
+       int                             scp_hreq_count;
+
+       /** NRS head for regular requests */
+       struct ptlrpc_nrs               scp_nrs_reg;
+       /** NRS head for HP requests; this is only valid for services that can
+        *  handle HP requests */
+       struct ptlrpc_nrs              *scp_nrs_hp;
+
+       /** AT stuff */
+       /** @{ */
+       /**
+        * serialize the following fields, used for changes on
+        * adaptive timeout
+        */
+       spinlock_t                      scp_at_lock __cfs_cacheline_aligned;
+       /** estimated rpc service time */
+       struct adaptive_timeout         scp_at_estimate;
+       /** reqs waiting for replies */
+       struct ptlrpc_at_array          scp_at_array;
+       /** early reply timer */
+       timer_list_t                    scp_at_timer;
+       /** debug */
+       cfs_time_t                      scp_at_checktime;
+       /** check early replies */
+       unsigned                        scp_at_check;
+       /** @} */
+
+       /**
+        * serialize the following fields, used for processing
+        * replies for this portal
+        */
+       spinlock_t                      scp_rep_lock __cfs_cacheline_aligned;
+       /** all the active replies */
+       struct list_head                        scp_rep_active;
+       /** List of free reply_states */
+       struct list_head                        scp_rep_idle;
+       /** waitq to run, when adding stuff to srv_free_rs_list */
+       wait_queue_head_t                       scp_rep_waitq;
+       /** # 'difficult' replies */
+       atomic_t                        scp_nreps_difficult;
+};
+
+#define ptlrpc_service_for_each_part(part, i, svc)                     \
+       for (i = 0;                                                     \
+            i < (svc)->srv_ncpts &&                                    \
+            (svc)->srv_parts != NULL &&                                \
+            ((part) = (svc)->srv_parts[i]) != NULL; i++)
+
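A small illustrative use of the iterator macro above (not part of the patch): summing the running threads over all partitions of a service.

static int example_count_running_threads(struct ptlrpc_service *svc)
{
        struct ptlrpc_service_part *part;
        int total = 0;
        int i;

        ptlrpc_service_for_each_part(part, i, svc)
                total += part->scp_nthrs_running;

        return total;
}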
+/**
+ * Declaration of ptlrpcd control structure
+ */
+struct ptlrpcd_ctl {
+       /**
+        * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
+        */
+       unsigned long                   pc_flags;
+       /**
+        * Thread lock protecting structure fields.
+        */
+       spinlock_t                      pc_lock;
+       /**
+        * Start completion.
+        */
+       struct completion               pc_starting;
+       /**
+        * Stop completion.
+        */
+       struct completion               pc_finishing;
+       /**
+        * Thread requests set.
+        */
+       struct ptlrpc_request_set  *pc_set;
+       /**
+        * Thread name used in cfs_daemonize()
+        */
+       char                    pc_name[16];
+       /**
+        * Environment for request interpreters to run in.
+        */
+       struct lu_env          pc_env;
+       /**
+        * Index of ptlrpcd thread in the array.
+        */
+       int                      pc_index;
+       /**
+        * Number of the ptlrpcd's partners.
+        */
+       int                      pc_npartners;
+       /**
+        * Pointer to the array of partners' ptlrpcd_ctl structure.
+        */
+       struct ptlrpcd_ctl      **pc_partners;
+       /**
+        * Record the partner index to be processed next.
+        */
+       int                      pc_cursor;
+};
+
+/* Bits for pc_flags */
+enum ptlrpcd_ctl_flags {
+       /**
+        * Ptlrpc thread start flag.
+        */
+       LIOD_START       = 1 << 0,
+       /**
+        * Ptlrpc thread stop flag.
+        */
+       LIOD_STOP       = 1 << 1,
+       /**
+        * Ptlrpc thread force flag (only forced stop so far).
+        * When set together with LIOD_STOP, any inflight RPCs handled
+        * by the thread will be aborted.
+        */
+       LIOD_FORCE       = 1 << 2,
+       /**
+        * This is a recovery ptlrpc thread.
+        */
+       LIOD_RECOVERY    = 1 << 3,
+       /**
+        * The ptlrpcd is bound to some CPU core.
+        */
+       LIOD_BIND       = 1 << 4,
+};
+
+/**
+ * \addtogroup nrs
+ * @{
+ *
+ * Service compatibility function; the policy is compatible with all services.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc,
+                                        const struct ptlrpc_nrs_pol_desc *desc)
+{
+       return true;
+}
+
+/**
+ * Service compatibility function; the policy is compatible only with a
+ * specific service, identified by its human-readable name in
+ * ptlrpc_service::srv_name.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval false The policy is not compatible with the service
+ * \retval true         The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc,
+                                        const struct ptlrpc_nrs_pol_desc *desc)
+{
+       LASSERT(desc->pd_compat_svc_name != NULL);
+       return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0;
+}
+
+/** @} nrs */
+
+/* ptlrpc/events.c */
+extern lnet_handle_eq_t ptlrpc_eq_h;
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+                              lnet_process_id_t *peer, lnet_nid_t *self);
+/**
+ * These callbacks are invoked by LNet when something happens to the
+ * underlying buffer
+ * @{
+ */
+extern void request_out_callback(lnet_event_t *ev);
+extern void reply_in_callback(lnet_event_t *ev);
+extern void client_bulk_callback(lnet_event_t *ev);
+extern void request_in_callback(lnet_event_t *ev);
+extern void reply_out_callback(lnet_event_t *ev);
+/** @} */
+
+/* ptlrpc/connection.c */
+struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer,
+                                               lnet_nid_t self,
+                                               struct obd_uuid *uuid);
+int ptlrpc_connection_put(struct ptlrpc_connection *c);
+struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
+int ptlrpc_connection_init(void);
+void ptlrpc_connection_fini(void);
+extern lnet_pid_t ptl_get_pid(void);
+
+/* ptlrpc/niobuf.c */
+/**
+ * Actual interfacing with LNet to put/get/register/unregister stuff
+ * @{
+ */
+
+int ptlrpc_register_bulk(struct ptlrpc_request *req);
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async);
+
+static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req)
+{
+       struct ptlrpc_bulk_desc *desc;
+       int                   rc;
+
+       LASSERT(req != NULL);
+       desc = req->rq_bulk;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+           req->rq_bulk_deadline > cfs_time_current_sec())
+               return 1;
+
+       if (!desc)
+               return 0;
+
+       spin_lock(&desc->bd_lock);
+       rc = desc->bd_md_count;
+       spin_unlock(&desc->bd_lock);
+       return rc;
+}
+
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY        0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
+int ptlrpc_reply(struct ptlrpc_request *req);
+int ptlrpc_send_error(struct ptlrpc_request *req, int difficult);
+int ptlrpc_error(struct ptlrpc_request *req);
+void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd);
+/** @} */
+
+/* ptlrpc/client.c */
+/**
+ * Client-side portals API. Everything to send requests, receive replies,
+ * request queues, request management, etc.
+ * @{
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+                       struct ptlrpc_client *);
+void ptlrpc_cleanup_client(struct obd_import *imp);
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
+
+int ptlrpc_queue_wait(struct ptlrpc_request *req);
+int ptlrpc_replay_req(struct ptlrpc_request *req);
+int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async);
+void ptlrpc_restart_req(struct ptlrpc_request *req);
+void ptlrpc_abort_inflight(struct obd_import *imp);
+void ptlrpc_cleanup_imp(struct obd_import *imp);
+void ptlrpc_abort_set(struct ptlrpc_request_set *set);
+
+struct ptlrpc_request_set *ptlrpc_prep_set(void);
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+                                            void *arg);
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+                     set_interpreter_func fn, void *data);
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set);
+int ptlrpc_set_wait(struct ptlrpc_request_set *);
+int ptlrpc_expired_set(void *data);
+void ptlrpc_interrupted_set(void *data);
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req);
+void ptlrpc_set_destroy(struct ptlrpc_request_set *);
+void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+                           struct ptlrpc_request *req);
+
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
+
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int, int,
+                   void (*populate_pool)(struct ptlrpc_request_pool *, int));
+
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+                                           const struct req_format *format);
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+                                           struct ptlrpc_request_pool *,
+                                           const struct req_format *format);
+void ptlrpc_request_free(struct ptlrpc_request *request);
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+                       __u32 version, int opcode);
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+                                               const struct req_format *format,
+                                               __u32 version, int opcode);
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+                            __u32 version, int opcode, char **bufs,
+                            struct ptlrpc_cli_ctx *ctx);
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
+                                      int opcode, int count, __u32 *lengths,
+                                      char **bufs);
+struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp,
+                                            __u32 version, int opcode,
+                                           int count, __u32 *lengths, char **bufs,
+                                           struct ptlrpc_request_pool *pool);
+void ptlrpc_req_finished(struct ptlrpc_request *request);
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+                                             unsigned npages, unsigned max_brw,
+                                             unsigned type, unsigned portal);
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin);
+static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk)
+{
+       __ptlrpc_free_bulk(bulk, 1);
+}
+static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk)
+{
+       __ptlrpc_free_bulk(bulk, 0);
+}
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+                            struct page *page, int pageoffset, int len, int);
+static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc,
+                                            struct page *page, int pageoffset,
+                                            int len)
+{
+       __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1);
+}
+
+static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc,
+                                              struct page *page, int pageoffset,
+                                              int len)
+{
+       __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0);
+}
+
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+                                     struct obd_import *imp);
+__u64 ptlrpc_next_xid(void);
+__u64 ptlrpc_sample_next_xid(void);
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
+
+/* Set of routines to run a function in ptlrpcd context */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+                        int (*cb)(const struct lu_env *, void *), void *data);
+void ptlrpcd_destroy_work(void *handler);
+int ptlrpcd_queue_work(void *handler);
+
+/** @} */
+struct ptlrpc_service_buf_conf {
+       /* number of buffers to allocate when growing the pool */
+       unsigned int                    bc_nbufs;
+       /* buffer size to post */
+       unsigned int                    bc_buf_size;
+       /* portal to listen for requests on */
+       unsigned int                    bc_req_portal;
+       /* portal to send replies to */
+       unsigned int                    bc_rep_portal;
+       /* maximum request size to be accepted for this service */
+       unsigned int                    bc_req_max_size;
+       /* maximum reply size this service can ever send */
+       unsigned int                    bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+       /* threadname should be 8 characters or less - 6 will be added on */
+       char                            *tc_thr_name;
+       /* factor by which the thread count increases per CPU */
+       unsigned int                    tc_thr_factor;
+       /* # of service threads to start on each partition during initialization */
+       unsigned int                    tc_nthrs_init;
+       /*
+        * lower bound of the per-partition thread-count upper limit while
+        * running; service availability may be impacted if the number of
+        * threads falls below this value. It can be ZERO if the service
+        * doesn't require CPU affinity or there is only one partition.
+        */
+       unsigned int                    tc_nthrs_base;
+       /* "soft" limit for total threads number */
+       unsigned int                    tc_nthrs_max;
+       /* user-specified number of threads, which will be validated against
+        * the other members of this structure. */
+       unsigned int                    tc_nthrs_user;
+       /* set NUMA node affinity for service threads */
+       unsigned int                    tc_cpu_affinity;
+       /* Tags for lu_context associated with service thread */
+       __u32                           tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+       struct cfs_cpt_table            *cc_cptable;
+       /* string pattern to describe CPTs for a service */
+       char                            *cc_pattern;
+};
+
+struct ptlrpc_service_conf {
+       /* service name */
+       char                            *psc_name;
+       /* soft watchdog timeout multiplier to print stuck service traces */
+       unsigned int                    psc_watchdog_factor;
+       /* buffer information */
+       struct ptlrpc_service_buf_conf  psc_buf;
+       /* thread information */
+       struct ptlrpc_service_thr_conf  psc_thr;
+       /* CPU partition information */
+       struct ptlrpc_service_cpt_conf  psc_cpt;
+       /* function table */
+       struct ptlrpc_service_ops       psc_ops;
+};
+
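A hedged sketch (not part of the patch) of how a server component might fill in this configuration; every name and value below is illustrative, and real portal numbers and sizes are service-specific. Such a conf would then be handed to ptlrpc_register_service(), declared just below.

static int example_handle_req(struct ptlrpc_request *req)
{
        return 0;       /* real services unpack the request and send a reply */
}

static struct ptlrpc_service_conf example_svc_conf = {
        .psc_name               = "example_svc",
        .psc_watchdog_factor    = 1,
        .psc_buf = {
                .bc_nbufs        = 64,
                .bc_buf_size     = 4096,
                .bc_req_max_size = 4096,
                .bc_rep_max_size = 4096,
                .bc_req_portal   = 0,   /* illustrative, not a real portal */
                .bc_rep_portal   = 0,
        },
        .psc_thr = {
                .tc_thr_name    = "example",
                .tc_nthrs_init  = 2,
                .tc_nthrs_max   = 8,
        },
        .psc_ops = {
                .so_req_handler = example_handle_req,
        },
};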
+/* ptlrpc/service.c */
+/**
+ * Server-side services API. Register/unregister service, request state
+ * management, service thread management
+ *
+ * @{
+ */
+void ptlrpc_save_lock(struct ptlrpc_request *req,
+                     struct lustre_handle *lock, int mode, int no_ack);
+void ptlrpc_commit_replies(struct obd_export *exp);
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs);
+void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs);
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req);
+struct ptlrpc_service *ptlrpc_register_service(
+                               struct ptlrpc_service_conf *conf,
+                               struct proc_dir_entry *proc_entry);
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc);
+int ptlrpc_unregister_service(struct ptlrpc_service *service);
+int liblustre_check_services(void *arg);
+void ptlrpc_daemonize(char *name);
+int ptlrpc_service_health_check(struct ptlrpc_service *);
+void ptlrpc_server_drop_request(struct ptlrpc_request *req);
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+                                 struct obd_export *export);
+
+int ptlrpc_hr_init(void);
+void ptlrpc_hr_fini(void);
+
+/** @} */
+
+/* ptlrpc/import.c */
+/**
+ * Import API
+ * @{
+ */
+int ptlrpc_connect_import(struct obd_import *imp);
+int ptlrpc_init_import(struct obd_import *imp);
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+void deuuidify(char *uuid, const char *prefix, char **uuid_start,
+              int *uuid_len);
+
+/* ptlrpc/pack_generic.c */
+int ptlrpc_reconnect_import(struct obd_import *imp);
+/** @} */
+
+/**
+ * ptlrpc msg buffer and swab interface
+ *
+ * @{
+ */
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+                        int index);
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+                               int index);
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len);
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len);
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+                       char **bufs);
+int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count,
+                       __u32 *lens, char **bufs);
+int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens,
+                     char **bufs);
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+                        __u32 *lens, char **bufs, int flags);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens,
+                           char **bufs, int flags);
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+                     unsigned int newlen, int move_data);
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
+int __lustre_unpack_msg(struct lustre_msg *m, int len);
+int lustre_msg_hdr_size(__u32 magic, int count);
+int lustre_msg_size(__u32 magic, int count, __u32 *lengths);
+int lustre_msg_size_v2(int count, __u32 *lengths);
+int lustre_packed_msg_size(struct lustre_msg *msg);
+int lustre_msg_early_size(void);
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size);
+void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
+int lustre_msg_buflen(struct lustre_msg *m, int n);
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len);
+int lustre_msg_bufcount(struct lustre_msg *m);
+char *lustre_msg_string(struct lustre_msg *m, int n, int max_len);
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg);
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags);
+__u32 lustre_msg_get_flags(struct lustre_msg *msg);
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags);
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg);
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags);
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg);
+__u32 lustre_msg_get_type(struct lustre_msg *msg);
+__u32 lustre_msg_get_version(struct lustre_msg *msg);
+void lustre_msg_add_version(struct lustre_msg *msg, int version);
+__u32 lustre_msg_get_opc(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg);
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg);
+__u64 lustre_msg_get_transno(struct lustre_msg *msg);
+__u64 lustre_msg_get_slv(struct lustre_msg *msg);
+__u32 lustre_msg_get_limit(struct lustre_msg *msg);
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv);
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit);
+int lustre_msg_get_status(struct lustre_msg *msg);
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
+__u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+char *lustre_msg_get_jobid(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18);
+#else
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
+#endif
+void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle);
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid);
+void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed);
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions);
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes);
+void ptlrpc_request_set_replen(struct ptlrpc_request *req);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
+
+static inline void
+lustre_shrink_reply(struct ptlrpc_request *req, int segment,
+                   unsigned int newlen, int move_data)
+{
+       LASSERT(req->rq_reply_state);
+       LASSERT(req->rq_repmsg);
+       req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment,
+                                          newlen, move_data);
+}
+/** @} */
+
+/** Change request phase of \a req to \a new_phase */
+static inline void
+ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase)
+{
+       if (req->rq_phase == new_phase)
+               return;
+
+       if (new_phase == RQ_PHASE_UNREGISTERING) {
+               req->rq_next_phase = req->rq_phase;
+               if (req->rq_import)
+                       atomic_inc(&req->rq_import->imp_unregistering);
+       }
+
+       if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+               if (req->rq_import)
+                       atomic_dec(&req->rq_import->imp_unregistering);
+       }
+
+       DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"",
+                 ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase));
+
+       req->rq_phase = new_phase;
+}
+
+/**
+ * Returns true if request \a req got early reply and hard deadline is not met
+ */
+static inline int
+ptlrpc_client_early(struct ptlrpc_request *req)
+{
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           req->rq_reply_deadline > cfs_time_current_sec())
+               return 0;
+       return req->rq_early;
+}
+
+/**
+ * Returns true if we got real reply from server for this request
+ */
+static inline int
+ptlrpc_client_replied(struct ptlrpc_request *req)
+{
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           req->rq_reply_deadline > cfs_time_current_sec())
+               return 0;
+       return req->rq_replied;
+}
+
+/** Returns true if request \a req is in process of receiving server reply */
+static inline int
+ptlrpc_client_recv(struct ptlrpc_request *req)
+{
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           req->rq_reply_deadline > cfs_time_current_sec())
+               return 1;
+       return req->rq_receiving_reply;
+}
+
+static inline int
+ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
+{
+       int rc;
+
+       spin_lock(&req->rq_lock);
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           req->rq_reply_deadline > cfs_time_current_sec()) {
+               spin_unlock(&req->rq_lock);
+               return 1;
+       }
+       rc = req->rq_receiving_reply || req->rq_must_unlink;
+       spin_unlock(&req->rq_lock);
+       return rc;
+}
+
+static inline void
+ptlrpc_client_wake_req(struct ptlrpc_request *req)
+{
+       if (req->rq_set == NULL)
+               wake_up(&req->rq_reply_waitq);
+       else
+               wake_up(&req->rq_set->set_waitq);
+}
+
+static inline void
+ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
+{
+       LASSERT(atomic_read(&rs->rs_refcount) > 0);
+       atomic_inc(&rs->rs_refcount);
+}
+
+static inline void
+ptlrpc_rs_decref(struct ptlrpc_reply_state *rs)
+{
+       LASSERT(atomic_read(&rs->rs_refcount) > 0);
+       if (atomic_dec_and_test(&rs->rs_refcount))
+               lustre_free_reply_state(rs);
+}
+
+/* Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+       if (req->rq_reply_state == NULL)
+               return; /* shouldn't occur */
+       ptlrpc_rs_decref(req->rq_reply_state);
+       req->rq_reply_state = NULL;
+       req->rq_repmsg = NULL;
+}
+
+static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
+{
+       return lustre_msg_get_magic(req->rq_reqmsg);
+}
+
+static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
+{
+       switch (req->rq_reqmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return req->rq_reqmsg->lm_repsize;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n",
+                        req->rq_reqmsg->lm_magic);
+               return -EFAULT;
+       }
+}
+
+static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req)
+{
+       if (req->rq_delay_limit != 0 &&
+           cfs_time_before(cfs_time_add(req->rq_queued_time,
+                                        cfs_time_seconds(req->rq_delay_limit)),
+                           cfs_time_current())) {
+               return 1;
+       }
+       return 0;
+}
+
+static inline int ptlrpc_no_resend(struct ptlrpc_request *req)
+{
+       if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) {
+               spin_lock(&req->rq_lock);
+               req->rq_no_resend = 1;
+               spin_unlock(&req->rq_lock);
+       }
+       return req->rq_no_resend;
+}
+
+static inline int
+ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
+{
+       int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate);
+
+       return svcpt->scp_service->srv_watchdog_factor *
+              max_t(int, at, obd_timeout);
+}
+
+static inline struct ptlrpc_service *
+ptlrpc_req2svc(struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_rqbd != NULL);
+       return req->rq_rqbd->rqbd_svcpt->scp_service;
+}
+
+/* ldlm/ldlm_lib.c */
+/**
+ * Target client logic
+ * @{
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+int client_obd_cleanup(struct obd_device *obddev);
+int client_connect_import(const struct lu_env *env,
+                         struct obd_export **exp, struct obd_device *obd,
+                         struct obd_uuid *cluuid, struct obd_connect_data *,
+                         void *localdata);
+int client_disconnect_export(struct obd_export *exp);
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+                          int priority);
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid);
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+                           struct obd_uuid *uuid);
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid);
+void client_destroy_import(struct obd_import *imp);
+/** @} */
+
+
+/* ptlrpc/pinger.c */
+/**
+ * Pinger API (client side only)
+ * @{
+ */
+enum timeout_event {
+       TIMEOUT_GRANT = 1
+};
+struct timeout_item;
+typedef int (*timeout_cb_t)(struct timeout_item *, void *);
+int ptlrpc_pinger_add_import(struct obd_import *imp);
+int ptlrpc_pinger_del_import(struct obd_import *imp);
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+                             timeout_cb_t cb, void *data,
+                             struct list_head *obd_list);
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+                             enum timeout_event event);
+struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp);
+int ptlrpc_obd_ping(struct obd_device *obd);
+cfs_time_t ptlrpc_suspend_wakeup_time(void);
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req);
+void ptlrpc_pinger_ir_up(void);
+void ptlrpc_pinger_ir_down(void);
+/** @} */
+int ptlrpc_pinger_suppress_pings(void);
+
+/* ptlrpc daemon bind policy */
+typedef enum {
+       /* all ptlrpcd threads are free mode */
+       PDB_POLICY_NONE   = 1,
+       /* all ptlrpcd threads are bound mode */
+       PDB_POLICY_FULL   = 2,
+       /* <free1 bound1> <free2 bound2> ... <freeN boundN> */
+       PDB_POLICY_PAIR   = 3,
+       /* <free1 bound1> <bound1 free2> ... <freeN boundN> <boundN free1>,
+        * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1].
+        * If the kernel supports NUMA, ptlrpcd threads are bound and
+        * grouped by NUMA node */
+       PDB_POLICY_NEIGHBOR      = 4,
+} pdb_policy_t;
+
+/* ptlrpc daemon load policy
+ * It is the caller's duty to specify how to push the async RPC into a ptlrpcd
+ * queue, but this is not enforced and is affected by "ptlrpcd_bind_policy".
+ * If the bind policy is "PDB_POLICY_FULL", the RPC will be processed by the
+ * selected ptlrpcd; otherwise, the RPC may be processed by the selected
+ * ptlrpcd or its partner, depending on which is scheduled first, in order to
+ * accelerate RPC processing. */
+typedef enum {
+       /* on the same CPU core as the caller */
+       PDL_POLICY_SAME  = 1,
+       /* within the same CPU partition, but not the same core as the caller */
+       PDL_POLICY_LOCAL        = 2,
+       /* round-robin on all CPU cores, but not the same core as the caller */
+       PDL_POLICY_ROUND        = 3,
+       /* the specified CPU core is preferred, but not enforced */
+       PDL_POLICY_PREFERRED    = 4,
+} pdl_policy_t;
+
+/* ptlrpc/ptlrpcd.c */
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
+void ptlrpcd_free(struct ptlrpcd_ctl *pc);
+void ptlrpcd_wake(struct ptlrpc_request *req);
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx);
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);
+int ptlrpcd_addref(void);
+void ptlrpcd_decref(void);
+
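As a hypothetical usage note (not part of the patch), an already-prepared asynchronous request could be queued to a ptlrpcd daemon like this; the -1 index is assumed to mean "let the policy pick the thread".

static void example_queue_async_rpc(struct ptlrpc_request *req)
{
        /* prefer a daemon in the caller's CPU partition, no fixed index */
        ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
}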
+/* ptlrpc/lproc_ptlrpc.c */
+/**
+ * procfs output related functions
+ * @{
+ */
+const char* ll_opcode2str(__u32 opcode);
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes);
+#else
+static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {}
+#endif
+/** @} */
+
+/* ptlrpc/llog_server.c */
+int llog_origin_handle_open(struct ptlrpc_request *req);
+int llog_origin_handle_destroy(struct ptlrpc_request *req);
+int llog_origin_handle_prev_block(struct ptlrpc_request *req);
+int llog_origin_handle_next_block(struct ptlrpc_request *req);
+int llog_origin_handle_read_header(struct ptlrpc_request *req);
+int llog_origin_handle_close(struct ptlrpc_request *req);
+int llog_origin_handle_cancel(struct ptlrpc_request *req);
+
+/* ptlrpc/llog_client.c */
+extern struct llog_operations llog_client_ops;
+
+/** @} net */
+
+#endif
+/** @} PtlRPC */
diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h b/drivers/staging/lustre/lustre/include/lustre_param.h
new file mode 100644 (file)
index 0000000..ed65468
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_param.h
+ *
+ * User-settable parameter keys
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_PARAM_H
+#define _LUSTRE_PARAM_H
+
+/** \defgroup param param
+ *
+ * @{
+ */
+
+/* For interoperability */
+struct cfg_interop_param {
+       char *old_param;
+       char *new_param;
+};
+
+/* obd_config.c */
+int class_find_param(char *buf, char *key, char **valp);
+struct cfg_interop_param *class_find_old_param(const char *param,
+                                              struct cfg_interop_param *ptr);
+int class_get_next_param(char **params, char *copy);
+int class_match_param(char *buf, char *key, char **valp);
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_net(char *buf, __u32 *net, char **endh);
+int class_match_nid(char *buf, char *key, lnet_nid_t nid);
+int class_match_net(char *buf, char *key, __u32 net);
+/* obd_mount.c */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+           char *s1, char *s2, char *s3, char *s4);
+
+
+
+/****************** User-settable parameter keys *********************/
+/* e.g.
+       tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda
+       lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0
+                   ... testfs-MDT0000.lov.stripesize=4M
+                   ... testfs-OST0000.ost.client_cache_seconds=15
+                   ... testfs.sys.timeout=<secs>
+                   ... testfs.llite.max_read_ahead_mb=16
+*/
+
+/* System global or special params not handled in obd's proc
+ * See mgs_write_log_sys()
+ */
+#define PARAM_TIMEOUT        "timeout="          /* global */
+#define PARAM_LDLM_TIMEOUT      "ldlm_timeout="     /* global */
+#define PARAM_AT_MIN          "at_min="           /* global */
+#define PARAM_AT_MAX          "at_max="           /* global */
+#define PARAM_AT_EXTRA      "at_extra="         /* global */
+#define PARAM_AT_EARLY_MARGIN      "at_early_margin="  /* global */
+#define PARAM_AT_HISTORY          "at_history="       /* global */
+#define PARAM_JOBID_VAR                   "jobid_var="        /* global */
+#define PARAM_MGSNODE        "mgsnode="          /* only at mounttime */
+#define PARAM_FAILNODE      "failover.node="    /* add failover nid */
+#define PARAM_FAILMODE      "failover.mode="    /* initial mount only */
+#define PARAM_ACTIVE          "active="           /* activate/deactivate */
+#define PARAM_NETWORK        "network="          /* bind on nid */
+#define PARAM_ID_UPCALL                "identity_upcall="  /* identity upcall */
+
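For illustration only (not part of the patch): looking up one of the keys above in a parameter string with class_find_param(); the return-value convention (0 when the key is found) is an assumption here.

static int example_parse_failnode(char *params)
{
        char *val = NULL;
        int rc;

        rc = class_find_param(params, PARAM_FAILNODE, &val);
        if (rc != 0)
                return rc;      /* assumed: key not present in params */

        /* val is assumed to point just past "failover.node=" inside params */
        return 0;
}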
+/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */
+#define PARAM_OST                "ost."
+#define PARAM_OSC                "osc."
+#define PARAM_MDT                "mdt."
+#define PARAM_MDD                "mdd."
+#define PARAM_MDC                "mdc."
+#define PARAM_LLITE            "llite."
+#define PARAM_LOV                "lov."
+#define PARAM_LOD              "lod."
+#define PARAM_OSP              "osp."
+#define PARAM_SYS                "sys."              /* global */
+#define PARAM_SRPC              "srpc."
+#define PARAM_SRPC_FLVR            "srpc.flavor."
+#define PARAM_SRPC_UDESC          "srpc.udesc.cli2mdt"
+#define PARAM_SEC                "security."
+#define PARAM_QUOTA            "quota."            /* global */
+
+/** @} param */
+
+#endif /* _LUSTRE_PARAM_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_quota.h b/drivers/staging/lustre/lustre/include/lustre_quota.h
new file mode 100644 (file)
index 0000000..1c3041f
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LUSTRE_QUOTA_H
+#define _LUSTRE_QUOTA_H
+
+/** \defgroup quota quota
+ *
+ */
+
+#include <linux/lustre_quota.h>
+
+#include <dt_object.h>
+#include <lustre_fid.h>
+#include <lustre_dlm.h>
+
+#ifndef MAX_IQ_TIME
+#define MAX_IQ_TIME  604800     /* (7*24*60*60) 1 week */
+#endif
+
+#ifndef MAX_DQ_TIME
+#define MAX_DQ_TIME  604800     /* (7*24*60*60) 1 week */
+#endif
+
+struct lquota_id_info;
+struct lquota_trans;
+
+/* Gather all quota record types in a union that can be used to read any record
+ * from disk. All fields of these records must be 64-bit aligned, otherwise the
+ * OSD layer may swab them incorrectly. */
+union lquota_rec {
+       struct lquota_glb_rec   lqr_glb_rec;
+       struct lquota_slv_rec   lqr_slv_rec;
+       struct lquota_acct_rec  lqr_acct_rec;
+};
+
+/* Index features supported by the global index objects
+ * Only used for migration purposes and should be removed once on-disk migration
+ * is no longer needed */
+extern struct dt_index_features dt_quota_iusr_features;
+extern struct dt_index_features dt_quota_busr_features;
+extern struct dt_index_features dt_quota_igrp_features;
+extern struct dt_index_features dt_quota_bgrp_features;
+
+/* Name used in the configuration logs to identify the default metadata pool
+ * (composed of all the MDTs, with pool ID 0) and the default data pool (all
+ * the OSTs, with pool ID 0 too). */
+#define QUOTA_METAPOOL_NAME   "mdt="
+#define QUOTA_DATAPOOL_NAME   "ost="
+
+/*
+ * Quota Master Target support
+ */
+
+/* Request handlers for quota master operations.
+ * This is used by the MDT to pass quota/lock requests to the quota master
+ * target. This won't be needed any more once the QMT is a real target and
+ * no longer relies on the MDT service threads and namespace. */
+struct qmt_handlers {
+       /* Handle quotactl request from client. */
+       int (*qmth_quotactl)(const struct lu_env *, struct lu_device *,
+                            struct obd_quotactl *);
+
+       /* Handle dqacq/dqrel request from slave. */
+       int (*qmth_dqacq)(const struct lu_env *, struct lu_device *,
+                         struct ptlrpc_request *);
+
+       /* LDLM intent policy associated with quota locks */
+       int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *,
+                                 struct ptlrpc_request *, struct ldlm_lock **,
+                                 int);
+
+       /* Initialize LVB of ldlm resource associated with quota objects */
+       int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *);
+
+       /* Update LVB of ldlm resource associated with quota objects */
+       int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *,
+                               struct ptlrpc_request *, int);
+
+       /* Return size of LVB to be packed in ldlm message */
+       int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *);
+
+       /* Fill request buffer with lvb */
+       int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *,
+                             int);
+
+       /* Free lvb associated with ldlm resource */
+       int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *);
+};
+
+/* actual handlers are defined in lustre/quota/qmt_handler.c */
+extern struct qmt_handlers qmt_hdls;
+
+/*
+ * Quota enforcement support on slaves
+ */
+
+struct qsd_instance;
+
+/* The quota slave feature is implemented in the form of a library.
+ * The API is the following:
+ *
+ * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd
+ *            instance via qsd_init(). This creates all required structures
+ *            to manage quota enforcement for this target and performs all
+ *            low-level initialization which does not involve any lustre
+ *            object. qsd_init() should typically be called when the OSD
+ *            is being set up.
+ *
+ * - qsd_prepare(): This sets up on-disk objects associated with the quota slave
+ *               feature and initiates the quota reintegration procedure if
+ *               needed. qsd_prepare() should typically be called when
+ *               ->ldo_prepare is invoked.
+ *
+ * - qsd_start(): a qsd instance should be started once recovery is completed
+ *             (i.e. when ->ldo_recovery_complete is called). This is used
+ *             to notify the qsd layer that quota should now be enforced
+ *             again via the qsd_op_begin/end functions. The last step of the
+ *             reintegration procedure (namely usage reconciliation) will be
+ *             completed during start.
+ *
+ * - qsd_fini(): is used to release a qsd_instance structure allocated with
+ *            qsd_init(). This releases all quota slave objects and frees the
+ *            structures associated with the qsd_instance.
+ *
+ * - qsd_op_begin(): is used to enforce quota, it must be called in the
+ *                declaration of each operation. qsd_op_end() should then be
+ *                invoked later once all operations have been completed in
+ *                order to release/adjust the quota space.
+ *                Running qsd_op_begin() before qsd_start() isn't fatal and
+ *                will return success.
+ *                Once qsd_start() has been run, qsd_op_begin() will block
+ *                until the reintegration procedure is completed.
+ *
+ * - qsd_op_end(): performs the post operation quota processing. This must be
+ *              called after the operation transaction stopped.
+ *              While qsd_op_begin() must be invoked each time a new
+ *              operation is declared, qsd_op_end() should be called only
+ *              once for the whole transaction.
+ *
+ * - qsd_op_adjust(): triggers pre-acquire/release if necessary.
+ *
+ * Below are the function prototypes to be used by OSD layer to manage quota
+ * enforcement. Arguments are documented where each function is defined.  */
+
+struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *,
+                             proc_dir_entry_t *);
+int qsd_prepare(const struct lu_env *, struct qsd_instance *);
+int qsd_start(const struct lu_env *, struct qsd_instance *);
+void qsd_fini(const struct lu_env *, struct qsd_instance *);
+int qsd_op_begin(const struct lu_env *, struct qsd_instance *,
+                struct lquota_trans *, struct lquota_id_info *, int *);
+void qsd_op_end(const struct lu_env *, struct qsd_instance *,
+               struct lquota_trans *);
+void qsd_op_adjust(const struct lu_env *, struct qsd_instance *,
+                  union lquota_id *, int);
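A minimal sketch (not part of the patch) of the setup half of the lifecycle described above, as an OSD-like caller might drive it; the target name is made up and qsd_init() is assumed to return an ERR_PTR-encoded pointer on failure.

static int example_osd_quota_setup(const struct lu_env *env,
                                   struct dt_device *dev,
                                   struct qsd_instance **qsd)
{
        /* allocate the slave instance early in OSD setup */
        *qsd = qsd_init(env, "example-OST0000", dev, NULL);
        if (IS_ERR(*qsd))
                return PTR_ERR(*qsd);

        /* create on-disk objects; qsd_start() runs later, after recovery */
        return qsd_prepare(env, *qsd);
}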
+/* This is exported for the ldiskfs quota migration only,
+ * see convert_quota_file() */
+int lquota_disk_write_glb(const struct lu_env *, struct dt_object *,
+                         __u64, struct lquota_glb_rec *);
+
+/*
+ * Quota information attached to a transaction
+ */
+
+struct lquota_entry;
+
+struct lquota_id_info {
+       /* quota identifier */
+       union lquota_id          lqi_id;
+
+       /* USRQUOTA or GRPQUOTA for now, could be expanded for
+        * directory quota or other types later.  */
+       int                      lqi_type;
+
+       /* inodes or kbytes to be consumed or released, it could
+        * be negative when releasing space.  */
+       long long                lqi_space;
+
+       /* quota slave entry structure associated with this ID */
+       struct lquota_entry     *lqi_qentry;
+
+       /* whether we are reporting blocks or inodes */
+       bool                     lqi_is_blk;
+};
+
+/* Since we enforce only inode quota in meta pool (MDTs), and block quota in
+ * data pool (OSTs), there are at most 4 quota ids being enforced in a single
+ * transaction, namely a chown transaction:
+ * the original uid and gid, and the new uid and gid.
+ *
+ * This value might need to be revised when directory quota is added.  */
+#define QUOTA_MAX_TRANSIDS    4
+
+/* all qids involved in a single transaction */
+struct lquota_trans {
+       unsigned short          lqt_id_cnt;
+       struct lquota_id_info   lqt_ids[QUOTA_MAX_TRANSIDS];
+};
+
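Illustrative only (not part of the patch): recording one user-ID block change in a transaction; the qid_uid member of union lquota_id is assumed from the wire protocol headers.

static int example_declare_quota(struct lquota_trans *trans, __u32 uid,
                                 long long kbytes)
{
        struct lquota_id_info *info;

        if (trans->lqt_id_cnt >= QUOTA_MAX_TRANSIDS)
                return -EOVERFLOW;      /* no room left in this transaction */

        info = &trans->lqt_ids[trans->lqt_id_cnt++];
        info->lqi_id.qid_uid = uid;     /* assumed member of union lquota_id */
        info->lqi_type       = USRQUOTA;
        info->lqi_space      = kbytes;  /* negative when releasing space */
        info->lqi_is_blk     = true;
        return 0;
}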
+/* flags for quota local enforcement */
+#define QUOTA_FL_OVER_USRQUOTA  0x01
+#define QUOTA_FL_OVER_GRPQUOTA  0x02
+#define QUOTA_FL_SYNC     0x04
+
+#define IS_LQUOTA_RES(res)                                             \
+       (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA ||   \
+        res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB)
+
+/* helper function used by MDT & OFD to retrieve quota accounting information
+ * on slave */
+int lquotactl_slv(const struct lu_env *, struct dt_device *,
+                 struct obd_quotactl *);
+/** @} quota */
+#endif /* _LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
new file mode 100644 (file)
index 0000000..f4d3820
--- /dev/null
@@ -0,0 +1,334 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_req_layout.h
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_REQ_LAYOUT_H__
+#define _LUSTRE_REQ_LAYOUT_H__
+
+/** \defgroup req_layout req_layout
+ *
+ * @{
+ */
+
+struct req_msg_field;
+struct req_format;
+struct req_capsule;
+
+struct ptlrpc_request;
+
+enum req_location {
+       RCL_CLIENT,
+       RCL_SERVER,
+       RCL_NR
+};
+
+/* Maximal number of fields (buffers) in a request message. */
+#define REQ_MAX_FIELD_NR  9
+
+struct req_capsule {
+       struct ptlrpc_request   *rc_req;
+       const struct req_format *rc_fmt;
+       enum req_location       rc_loc;
+       __u32               rc_area[RCL_NR][REQ_MAX_FIELD_NR];
+};
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_net.h>
+
+void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req,
+                     enum req_location location);
+void req_capsule_fini(struct req_capsule *pill);
+
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt);
+void req_capsule_client_dump(struct req_capsule *pill);
+void req_capsule_server_dump(struct req_capsule *pill);
+void req_capsule_init_area(struct req_capsule *pill);
+int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc);
+int  req_capsule_server_pack(struct req_capsule *pill);
+
+void *req_capsule_client_get(struct req_capsule *pill,
+                            const struct req_msg_field *field);
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 void *swabber);
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field,
+                                  int len);
+void *req_capsule_server_get(struct req_capsule *pill,
+                            const struct req_msg_field *field);
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field,
+                                  int len);
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 void *swabber);
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+                                       const struct req_msg_field *field,
+                                       int len, void *swabber);
+const void *req_capsule_other_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field);
+
+void req_capsule_set_size(struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc, int size);
+int req_capsule_get_size(const struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc);
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc);
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+                        enum req_location loc);
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt);
+
+int req_capsule_has_field(const struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc);
+int req_capsule_field_present(const struct req_capsule *pill,
+                             const struct req_msg_field *field,
+                             enum req_location loc);
+void req_capsule_shrink(struct req_capsule *pill,
+                       const struct req_msg_field *field,
+                       unsigned int newlen,
+                       enum req_location loc);
+int req_capsule_server_grow(struct req_capsule *pill,
+                           const struct req_msg_field *field,
+                           unsigned int newlen);
+int  req_layout_init(void);
+void req_layout_fini(void);
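+
+/* Typical handler-side usage of the capsule API declared above (a sketch
+ * only; the surrounding handler code and error handling are omitted, the
+ * names "pill", "body", "repbody", "md_size" and "rc" are placeholders, and
+ * the request format/fields referenced are the ones declared below in this
+ * header):
+ *
+ *     req_capsule_set(pill, &RQF_MDS_GETATTR);
+ *     body = req_capsule_client_get(pill, &RMF_MDT_BODY);
+ *     req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER, md_size);
+ *     rc = req_capsule_server_pack(pill);
+ *     repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
+ */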
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+
+extern struct req_format RQF_OBD_PING;
+extern struct req_format RQF_OBD_SET_INFO;
+extern struct req_format RQF_SEC_CTX;
+extern struct req_format RQF_OBD_IDX_READ;
+/* MGS req_format */
+extern struct req_format RQF_MGS_TARGET_REG;
+extern struct req_format RQF_MGS_SET_INFO;
+extern struct req_format RQF_MGS_CONFIG_READ;
+/* fid/fld req_format */
+extern struct req_format RQF_SEQ_QUERY;
+extern struct req_format RQF_FLD_QUERY;
+/* MDS req_format */
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_STATFS;
+extern struct req_format RQF_MDS_GETSTATUS;
+extern struct req_format RQF_MDS_SYNC;
+extern struct req_format RQF_MDS_GETXATTR;
+extern struct req_format RQF_MDS_GETATTR;
+extern struct req_format RQF_UPDATE_OBJ;
+
+/*
+ * This is the format of a direct (non-intent) MDS_GETATTR_NAME request.
+ */
+extern struct req_format RQF_MDS_GETATTR_NAME;
+extern struct req_format RQF_MDS_CLOSE;
+extern struct req_format RQF_MDS_PIN;
+extern struct req_format RQF_MDS_UNPIN;
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_GET_INFO;
+extern struct req_format RQF_MDS_READPAGE;
+extern struct req_format RQF_MDS_WRITEPAGE;
+extern struct req_format RQF_MDS_IS_SUBDIR;
+extern struct req_format RQF_MDS_DONE_WRITING;
+extern struct req_format RQF_MDS_REINT;
+extern struct req_format RQF_MDS_REINT_CREATE;
+extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL;
+extern struct req_format RQF_MDS_REINT_CREATE_SLAVE;
+extern struct req_format RQF_MDS_REINT_CREATE_SYM;
+extern struct req_format RQF_MDS_REINT_OPEN;
+extern struct req_format RQF_MDS_REINT_UNLINK;
+extern struct req_format RQF_MDS_REINT_LINK;
+extern struct req_format RQF_MDS_REINT_RENAME;
+extern struct req_format RQF_MDS_REINT_SETATTR;
+extern struct req_format RQF_MDS_REINT_SETXATTR;
+extern struct req_format RQF_MDS_QUOTACHECK;
+extern struct req_format RQF_MDS_QUOTACTL;
+extern struct req_format RQF_QC_CALLBACK;
+extern struct req_format RQF_QUOTA_DQACQ;
+extern struct req_format RQF_MDS_SWAP_LAYOUTS;
+/* MDS hsm formats */
+extern struct req_format RQF_MDS_HSM_STATE_GET;
+extern struct req_format RQF_MDS_HSM_STATE_SET;
+extern struct req_format RQF_MDS_HSM_ACTION;
+extern struct req_format RQF_MDS_HSM_PROGRESS;
+extern struct req_format RQF_MDS_HSM_CT_REGISTER;
+extern struct req_format RQF_MDS_HSM_CT_UNREGISTER;
+extern struct req_format RQF_MDS_HSM_REQUEST;
+/* OST req_format */
+extern struct req_format RQF_OST_CONNECT;
+extern struct req_format RQF_OST_DISCONNECT;
+extern struct req_format RQF_OST_QUOTACHECK;
+extern struct req_format RQF_OST_QUOTACTL;
+extern struct req_format RQF_OST_GETATTR;
+extern struct req_format RQF_OST_SETATTR;
+extern struct req_format RQF_OST_CREATE;
+extern struct req_format RQF_OST_PUNCH;
+extern struct req_format RQF_OST_SYNC;
+extern struct req_format RQF_OST_DESTROY;
+extern struct req_format RQF_OST_BRW_READ;
+extern struct req_format RQF_OST_BRW_WRITE;
+extern struct req_format RQF_OST_STATFS;
+extern struct req_format RQF_OST_SET_GRANT_INFO;
+extern struct req_format RQF_OST_GET_INFO_GENERIC;
+extern struct req_format RQF_OST_GET_INFO_LAST_ID;
+extern struct req_format RQF_OST_GET_INFO_LAST_FID;
+extern struct req_format RQF_OST_SET_INFO_LAST_FID;
+extern struct req_format RQF_OST_GET_INFO_FIEMAP;
+
+/* LDLM req_format */
+extern struct req_format RQF_LDLM_ENQUEUE;
+extern struct req_format RQF_LDLM_ENQUEUE_LVB;
+extern struct req_format RQF_LDLM_CONVERT;
+extern struct req_format RQF_LDLM_INTENT;
+extern struct req_format RQF_LDLM_INTENT_BASIC;
+extern struct req_format RQF_LDLM_INTENT_LAYOUT;
+extern struct req_format RQF_LDLM_INTENT_GETATTR;
+extern struct req_format RQF_LDLM_INTENT_OPEN;
+extern struct req_format RQF_LDLM_INTENT_CREATE;
+extern struct req_format RQF_LDLM_INTENT_UNLINK;
+extern struct req_format RQF_LDLM_INTENT_QUOTA;
+extern struct req_format RQF_LDLM_CANCEL;
+extern struct req_format RQF_LDLM_CALLBACK;
+extern struct req_format RQF_LDLM_CP_CALLBACK;
+extern struct req_format RQF_LDLM_BL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_DESC_CALLBACK;
+/* LOG req_format */
+extern struct req_format RQF_LOG_CANCEL;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER;
+extern struct req_format RQF_LLOG_ORIGIN_CONNECT;
+
+extern struct req_msg_field RMF_GENERIC_DATA;
+extern struct req_msg_field RMF_PTLRPC_BODY;
+extern struct req_msg_field RMF_MDT_BODY;
+extern struct req_msg_field RMF_MDT_EPOCH;
+extern struct req_msg_field RMF_OBD_STATFS;
+extern struct req_msg_field RMF_NAME;
+extern struct req_msg_field RMF_SYMTGT;
+extern struct req_msg_field RMF_TGTUUID;
+extern struct req_msg_field RMF_CLUUID;
+extern struct req_msg_field RMF_SETINFO_VAL;
+extern struct req_msg_field RMF_SETINFO_KEY;
+extern struct req_msg_field RMF_GETINFO_VAL;
+extern struct req_msg_field RMF_GETINFO_VALLEN;
+extern struct req_msg_field RMF_GETINFO_KEY;
+extern struct req_msg_field RMF_IDX_INFO;
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ */
+extern struct req_msg_field RMF_CONN;
+extern struct req_msg_field RMF_CONNECT_DATA;
+extern struct req_msg_field RMF_DLM_REQ;
+extern struct req_msg_field RMF_DLM_REP;
+extern struct req_msg_field RMF_DLM_LVB;
+extern struct req_msg_field RMF_DLM_GL_DESC;
+extern struct req_msg_field RMF_LDLM_INTENT;
+extern struct req_msg_field RMF_LAYOUT_INTENT;
+extern struct req_msg_field RMF_MDT_MD;
+extern struct req_msg_field RMF_REC_REINT;
+extern struct req_msg_field RMF_EADATA;
+extern struct req_msg_field RMF_ACL;
+extern struct req_msg_field RMF_LOGCOOKIES;
+extern struct req_msg_field RMF_CAPA1;
+extern struct req_msg_field RMF_CAPA2;
+extern struct req_msg_field RMF_OBD_QUOTACHECK;
+extern struct req_msg_field RMF_OBD_QUOTACTL;
+extern struct req_msg_field RMF_QUOTA_BODY;
+extern struct req_msg_field RMF_STRING;
+extern struct req_msg_field RMF_SWAP_LAYOUTS;
+extern struct req_msg_field RMF_MDS_HSM_PROGRESS;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+extern struct req_msg_field RMF_MDS_HSM_USER_ITEM;
+extern struct req_msg_field RMF_MDS_HSM_ARCHIVE;
+extern struct req_msg_field RMF_HSM_USER_STATE;
+extern struct req_msg_field RMF_HSM_STATE_SET;
+extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+
+/* seq-mgr fields */
+extern struct req_msg_field RMF_SEQ_OPC;
+extern struct req_msg_field RMF_SEQ_RANGE;
+extern struct req_msg_field RMF_FID_SPACE;
+
+/* FLD fields */
+extern struct req_msg_field RMF_FLD_OPC;
+extern struct req_msg_field RMF_FLD_MDFLD;
+
+extern struct req_msg_field RMF_LLOGD_BODY;
+extern struct req_msg_field RMF_LLOG_LOG_HDR;
+extern struct req_msg_field RMF_LLOGD_CONN_BODY;
+
+extern struct req_msg_field RMF_MGS_TARGET_INFO;
+extern struct req_msg_field RMF_MGS_SEND_PARAM;
+
+extern struct req_msg_field RMF_OST_BODY;
+extern struct req_msg_field RMF_OBD_IOOBJ;
+extern struct req_msg_field RMF_OBD_ID;
+extern struct req_msg_field RMF_FID;
+extern struct req_msg_field RMF_NIOBUF_REMOTE;
+extern struct req_msg_field RMF_RCS;
+extern struct req_msg_field RMF_FIEMAP_KEY;
+extern struct req_msg_field RMF_FIEMAP_VAL;
+extern struct req_msg_field RMF_OST_ID;
+
+/* MGS config read message format */
+extern struct req_msg_field RMF_MGS_CONFIG_BODY;
+extern struct req_msg_field RMF_MGS_CONFIG_RES;
+
+/* generic uint32 */
+extern struct req_msg_field RMF_U32;
+
+/* OBJ update format */
+extern struct req_msg_field RMF_UPDATE;
+extern struct req_msg_field RMF_UPDATE_REPLY;
+/** @} req_layout */
+
+#endif /* _LUSTRE_REQ_LAYOUT_H__ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h
new file mode 100644 (file)
index 0000000..9e0908e
--- /dev/null
@@ -0,0 +1,1145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_SEC_H_
+#define _LUSTRE_SEC_H_
+
+/** \defgroup sptlrpc sptlrpc
+ *
+ * @{
+ */
+
+/*
+ * to avoid include
+ */
+struct obd_import;
+struct obd_export;
+struct ptlrpc_request;
+struct ptlrpc_reply_state;
+struct ptlrpc_bulk_desc;
+struct brw_page;
+/* Linux specific */
+struct key;
+struct seq_file;
+
+/*
+ * forward declaration
+ */
+struct ptlrpc_sec_policy;
+struct ptlrpc_sec_cops;
+struct ptlrpc_sec_sops;
+struct ptlrpc_sec;
+struct ptlrpc_svc_ctx;
+struct ptlrpc_cli_ctx;
+struct ptlrpc_ctx_ops;
+
+/**
+ * \addtogroup flavor flavor
+ *
+ * An RPC flavor is represented by a 32-bit integer. The high 12 bits are
+ * currently unused and must be set to 0 to allow future expansion.
+ * <pre>
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * </pre>
+ *
+ * @{
+ */
+
+/*
+ * flavor constants
+ */
+enum sptlrpc_policy {
+       SPTLRPC_POLICY_NULL          = 0,
+       SPTLRPC_POLICY_PLAIN        = 1,
+       SPTLRPC_POLICY_GSS            = 2,
+       SPTLRPC_POLICY_MAX,
+};
+
+enum sptlrpc_mech_null {
+       SPTLRPC_MECH_NULL              = 0,
+       SPTLRPC_MECH_NULL_MAX,
+};
+
+enum sptlrpc_mech_plain {
+       SPTLRPC_MECH_PLAIN            = 0,
+       SPTLRPC_MECH_PLAIN_MAX,
+};
+
+enum sptlrpc_mech_gss {
+       SPTLRPC_MECH_GSS_NULL      = 0,
+       SPTLRPC_MECH_GSS_KRB5      = 1,
+       SPTLRPC_MECH_GSS_MAX,
+};
+
+enum sptlrpc_service_type {
+       SPTLRPC_SVC_NULL                = 0,    /**< no security */
+       SPTLRPC_SVC_AUTH                = 1,    /**< authentication only */
+       SPTLRPC_SVC_INTG                = 2,    /**< integrity */
+       SPTLRPC_SVC_PRIV                = 3,    /**< privacy */
+       SPTLRPC_SVC_MAX,
+};
+
+enum sptlrpc_bulk_type {
+       SPTLRPC_BULK_DEFAULT        = 0,    /**< follow rpc flavor */
+       SPTLRPC_BULK_HASH              = 1,    /**< hash integrity */
+       SPTLRPC_BULK_MAX,
+};
+
+enum sptlrpc_bulk_service {
+       SPTLRPC_BULK_SVC_NULL      = 0,    /**< no security */
+       SPTLRPC_BULK_SVC_AUTH      = 1,    /**< authentication only */
+       SPTLRPC_BULK_SVC_INTG      = 2,    /**< integrity */
+       SPTLRPC_BULK_SVC_PRIV      = 3,    /**< privacy */
+       SPTLRPC_BULK_SVC_MAX,
+};
+
+/*
+ * compose/extract macros
+ */
+#define FLVR_POLICY_OFFSET           (0)
+#define FLVR_MECH_OFFSET               (4)
+#define FLVR_SVC_OFFSET                 (8)
+#define FLVR_BULK_TYPE_OFFSET     (12)
+#define FLVR_BULK_SVC_OFFSET       (16)
+
+#define MAKE_FLVR(policy, mech, svc, btype, bsvc)                     \
+       (((__u32)(policy) << FLVR_POLICY_OFFSET) |                    \
+        ((__u32)(mech) << FLVR_MECH_OFFSET) |                    \
+        ((__u32)(svc) << FLVR_SVC_OFFSET) |                        \
+        ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) |                \
+        ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET))
+
+/*
+ * extraction
+ */
+#define SPTLRPC_FLVR_POLICY(flavor)                                 \
+       ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_MECH(flavor)                                     \
+       ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_SVC(flavor)                                       \
+       ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_TYPE(flavor)                           \
+       ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_SVC(flavor)                             \
+       ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF)
+
+#define SPTLRPC_FLVR_BASE(flavor)                                     \
+       ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF)
+#define SPTLRPC_FLVR_BASE_SUB(flavor)                             \
+       ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF)
+
+/*
+ * gss subflavors
+ */
+#define MAKE_BASE_SUBFLVR(mech, svc)                               \
+       ((__u32)(mech) |                                                \
+        ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET)))
+
+#define SPTLRPC_SUBFLVR_KRB5N                                     \
+       MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL)
+#define SPTLRPC_SUBFLVR_KRB5A                                     \
+       MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH)
+#define SPTLRPC_SUBFLVR_KRB5I                                     \
+       MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG)
+#define SPTLRPC_SUBFLVR_KRB5P                                     \
+       MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV)
+
+/*
+ * "end user" flavors
+ */
+#define SPTLRPC_FLVR_NULL                             \
+       MAKE_FLVR(SPTLRPC_POLICY_NULL,            \
+                 SPTLRPC_MECH_NULL,                \
+                 SPTLRPC_SVC_NULL,                  \
+                 SPTLRPC_BULK_DEFAULT,          \
+                 SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_PLAIN                           \
+       MAKE_FLVR(SPTLRPC_POLICY_PLAIN,          \
+                 SPTLRPC_MECH_PLAIN,              \
+                 SPTLRPC_SVC_NULL,                  \
+                 SPTLRPC_BULK_HASH,                \
+                 SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5N                           \
+       MAKE_FLVR(SPTLRPC_POLICY_GSS,              \
+                 SPTLRPC_MECH_GSS_KRB5,                \
+                 SPTLRPC_SVC_NULL,                  \
+                 SPTLRPC_BULK_DEFAULT,          \
+                 SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5A                           \
+       MAKE_FLVR(SPTLRPC_POLICY_GSS,              \
+                 SPTLRPC_MECH_GSS_KRB5,                \
+                 SPTLRPC_SVC_AUTH,                  \
+                 SPTLRPC_BULK_DEFAULT,          \
+                 SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5I                           \
+       MAKE_FLVR(SPTLRPC_POLICY_GSS,              \
+                 SPTLRPC_MECH_GSS_KRB5,                \
+                 SPTLRPC_SVC_INTG,                  \
+                 SPTLRPC_BULK_DEFAULT,          \
+                 SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5P                           \
+       MAKE_FLVR(SPTLRPC_POLICY_GSS,              \
+                 SPTLRPC_MECH_GSS_KRB5,                \
+                 SPTLRPC_SVC_PRIV,                  \
+                 SPTLRPC_BULK_DEFAULT,          \
+                 SPTLRPC_BULK_SVC_PRIV)
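+
+/* Worked example of the encoding above: SPTLRPC_FLVR_KRB5I is
+ * MAKE_FLVR(SPTLRPC_POLICY_GSS = 2, SPTLRPC_MECH_GSS_KRB5 = 1,
+ * SPTLRPC_SVC_INTG = 2, SPTLRPC_BULK_DEFAULT = 0, SPTLRPC_BULK_SVC_INTG = 2),
+ * i.e. 0x00020212; SPTLRPC_FLVR_POLICY(), SPTLRPC_FLVR_MECH() and
+ * SPTLRPC_FLVR_SVC() extract 2, 1 and 2 respectively, and the matching base
+ * subflavor SPTLRPC_SUBFLVR_KRB5I is 0x21. */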
+
+#define SPTLRPC_FLVR_DEFAULT       SPTLRPC_FLVR_NULL
+
+#define SPTLRPC_FLVR_INVALID       ((__u32) 0xFFFFFFFF)
+#define SPTLRPC_FLVR_ANY               ((__u32) 0xFFF00000)
+
+/**
+ * extract the useful part from wire flavor
+ */
+#define WIRE_FLVR(wflvr)               (((__u32) (wflvr)) & 0x000FFFFF)
+
+/** @} flavor */
+
+static inline void flvr_set_svc(__u32 *flvr, __u32 svc)
+{
+       LASSERT(svc < SPTLRPC_SVC_MAX);
+       *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+                         SPTLRPC_FLVR_MECH(*flvr),
+                         svc,
+                         SPTLRPC_FLVR_BULK_TYPE(*flvr),
+                         SPTLRPC_FLVR_BULK_SVC(*flvr));
+}
+
+static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc)
+{
+       LASSERT(svc < SPTLRPC_BULK_SVC_MAX);
+       *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+                         SPTLRPC_FLVR_MECH(*flvr),
+                         SPTLRPC_FLVR_SVC(*flvr),
+                         SPTLRPC_FLVR_BULK_TYPE(*flvr),
+                         svc);
+}
+
+struct bulk_spec_hash {
+       __u8    hash_alg;
+};
+
+/**
+ * Full description of the flavors being used on a ptlrpc connection,
+ * including both regular RPC and bulk transfer parts.
+ */
+struct sptlrpc_flavor {
+       /**
+        * wire flavor, should be renamed to sf_wire.
+        */
+       __u32   sf_rpc;
+       /**
+        * general flags of PTLRPC_SEC_FL_*
+        */
+       __u32   sf_flags;
+       /**
+        * rpc flavor specification
+        */
+       union {
+               /* nothing for now */
+       } u_rpc;
+       /**
+        * bulk flavor specification
+        */
+       union {
+               struct bulk_spec_hash hash;
+       } u_bulk;
+};
+
+/**
+ * Identifies which part of Lustre generated the RPC. It is encoded into
+ * RPC requests and checked by the ptlrpc service.
+ */
+enum lustre_sec_part {
+       LUSTRE_SP_CLI      = 0,
+       LUSTRE_SP_MDT,
+       LUSTRE_SP_OST,
+       LUSTRE_SP_MGC,
+       LUSTRE_SP_MGS,
+       LUSTRE_SP_ANY      = 0xFF
+};
+
+const char *sptlrpc_part2name(enum lustre_sec_part sp);
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd);
+
+/**
+ * A rule specifies a flavor to be used by a ptlrpc connection between
+ * two Lustre parts.
+ */
+struct sptlrpc_rule {
+       __u32              sr_netid;   /* LNET network ID */
+       __u8                sr_from;    /* sec_part */
+       __u8                sr_to;      /* sec_part */
+       __u16              sr_padding;
+       struct sptlrpc_flavor   sr_flvr;
+};
+
+/**
+ * A set of rules in memory.
+ *
+ * Rules are generated and stored on MGS, and propagated to MDT, OST,
+ * and client when needed.
+ */
+struct sptlrpc_rule_set {
+       int                  srs_nslot;
+       int                  srs_nrule;
+       struct sptlrpc_rule    *srs_rules;
+};
+
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr);
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr);
+
+static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set)
+{
+       memset(set, 0, sizeof(*set));
+}
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set);
+int  sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set);
+int  sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set,
+                           struct sptlrpc_rule *rule);
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+                           enum lustre_sec_part from,
+                           enum lustre_sec_part to,
+                           lnet_nid_t nid,
+                           struct sptlrpc_flavor *sf);
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set);
+
+int  sptlrpc_process_config(struct lustre_cfg *lcfg);
+void sptlrpc_conf_log_start(const char *logname);
+void sptlrpc_conf_log_stop(const char *logname);
+void sptlrpc_conf_log_update_begin(const char *logname);
+void sptlrpc_conf_log_update_end(const char *logname);
+void sptlrpc_conf_client_adapt(struct obd_device *obd);
+int  sptlrpc_conf_target_get_rules(struct obd_device *obd,
+                                  struct sptlrpc_rule_set *rset,
+                                  int initial);
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+                                 enum lustre_sec_part from,
+                                 lnet_nid_t nid,
+                                 struct sptlrpc_flavor *flavor);
+
+/* The maximum length of the security payload. 1024 is enough for Kerberos 5
+ * and is expected to suffice for future mechanisms, though this is not
+ * guaranteed. Only used by the pre-allocated request/reply pool.
+ */
+#define SPTLRPC_MAX_PAYLOAD     (1024)
+
+
+struct vfs_cred {
+       uint32_t        vc_uid;
+       uint32_t        vc_gid;
+};
+
+struct ptlrpc_ctx_ops {
+       /**
+        * To determine whether it's suitable to use the \a ctx for \a vcred.
+        */
+       int     (*match)       (struct ptlrpc_cli_ctx *ctx,
+                               struct vfs_cred *vcred);
+
+       /**
+        * To bring the \a ctx uptodate.
+        */
+       int     (*refresh)     (struct ptlrpc_cli_ctx *ctx);
+
+       /**
+        * Validate the \a ctx.
+        */
+       int     (*validate)    (struct ptlrpc_cli_ctx *ctx);
+
+       /**
+        * Force the \a ctx to die.
+        */
+       void    (*die)   (struct ptlrpc_cli_ctx *ctx,
+                               int grace);
+       int     (*display)     (struct ptlrpc_cli_ctx *ctx,
+                               char *buf, int bufsize);
+
+       /**
+        * Sign the request message using \a ctx.
+        *
+        * \pre req->rq_reqmsg points to the request message.
+        * \pre req->rq_reqlen is the request message length.
+        * \post req->rq_reqbuf points to the request message with signature.
+        * \post req->rq_reqdata_len is set to the final request message size.
+        *
+        * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign().
+        */
+       int     (*sign) (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req);
+
+       /**
+        * Verify the reply message using \a ctx.
+        *
+        * \pre req->rq_repdata points to the reply message with signature.
+        * \pre req->rq_repdata_len is the total reply message length.
+        * \post req->rq_repmsg points to the reply message without signature.
+        * \post req->rq_replen is the reply message length.
+        *
+        * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify().
+        */
+       int     (*verify)      (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req);
+
+       /**
+        * Encrypt the request message using \a ctx.
+        *
+        * \pre req->rq_reqmsg points to the request message in clear text.
+        * \pre req->rq_reqlen is the request message length.
+        * \post req->rq_reqbuf points to the request message.
+        * \post req->rq_reqdata_len is set to the final request message size.
+        *
+        * \see gss_cli_ctx_seal().
+        */
+       int     (*seal) (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req);
+
+       /**
+        * Decrypt the reply message using \a ctx.
+        *
+        * \pre req->rq_repdata points to the encrypted reply message.
+        * \pre req->rq_repdata_len is the total cipher text length.
+        * \post req->rq_repmsg points to the reply message in clear text.
+        * \post req->rq_replen is the reply message length in clear text.
+        *
+        * \see gss_cli_ctx_unseal().
+        */
+       int     (*unseal)      (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req);
+
+       /**
+        * Wrap bulk request data. This is called before wrapping RPC
+        * request message.
+        *
+        * \pre the bulk buffer is described by desc->bd_iov and
+        * desc->bd_iov_count. Note that for a read it is just a buffer and
+        * no data needs to be sent; for a write it contains data in clear
+        * text.
+        * \post when necessary, ptlrpc_bulk_sec_desc has been properly
+        * prepared (usually inside the RPC request message).
+        * - encryption: the cipher text bulk buffer is described by
+        *   desc->bd_enc_iov and desc->bd_iov_count (currently the iov
+        *   count is assumed to remain the same).
+        * - otherwise: the bulk buffer is still desc->bd_iov and
+        *   desc->bd_iov_count.
+        *
+        * \return 0: success.
+        * \return -ev: error code.
+        *
+        * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk().
+        */
+       int     (*wrap_bulk)   (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req,
+                               struct ptlrpc_bulk_desc *desc);
+
+       /**
+        * Unwrap bulk reply data. This is called after wrapping RPC
+        * reply message.
+        *
+        * \pre the bulk buffer is described by desc->bd_iov/desc->bd_enc_iov
+        * and desc->bd_iov_count, according to wrap_bulk().
+        * \post final bulk data in clear text is placed in buffer described
+        * by desc->bd_iov and desc->bd_iov_count.
+        * \return +ve nob of actual bulk data in clear text.
+        * \return -ve error code.
+        *
+        * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk().
+        */
+       int     (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req,
+                               struct ptlrpc_bulk_desc *desc);
+};
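+
+/* On the client path these hooks are normally exercised in the following
+ * order (informational summary of the comments above): a context is obtained
+ * and brought up to date via match()/refresh(), the outgoing request is
+ * protected with sign() or seal(), bulk data is protected with wrap_bulk(),
+ * and the reply is checked with verify() or unseal() and unwrap_bulk(). */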
+
+#define PTLRPC_CTX_NEW_BIT          (0)  /* newly created */
+#define PTLRPC_CTX_UPTODATE_BIT        (1)  /* uptodate */
+#define PTLRPC_CTX_DEAD_BIT        (2)  /* mark expired gracefully */
+#define PTLRPC_CTX_ERROR_BIT      (3)  /* fatal error (refresh, etc.) */
+#define PTLRPC_CTX_CACHED_BIT    (8)  /* in ctx cache (hash etc.) */
+#define PTLRPC_CTX_ETERNAL_BIT  (9)  /* always valid */
+
+#define PTLRPC_CTX_NEW          (1 << PTLRPC_CTX_NEW_BIT)
+#define PTLRPC_CTX_UPTODATE        (1 << PTLRPC_CTX_UPTODATE_BIT)
+#define PTLRPC_CTX_DEAD                (1 << PTLRPC_CTX_DEAD_BIT)
+#define PTLRPC_CTX_ERROR              (1 << PTLRPC_CTX_ERROR_BIT)
+#define PTLRPC_CTX_CACHED            (1 << PTLRPC_CTX_CACHED_BIT)
+#define PTLRPC_CTX_ETERNAL          (1 << PTLRPC_CTX_ETERNAL_BIT)
+
+#define PTLRPC_CTX_STATUS_MASK  (PTLRPC_CTX_NEW_BIT    |       \
+                                       PTLRPC_CTX_UPTODATE   |       \
+                                       PTLRPC_CTX_DEAD       |       \
+                                       PTLRPC_CTX_ERROR)
+
+struct ptlrpc_cli_ctx {
+       struct hlist_node       cc_cache;      /* linked into ctx cache */
+       atomic_t            cc_refcount;
+       struct ptlrpc_sec      *cc_sec;
+       struct ptlrpc_ctx_ops  *cc_ops;
+       cfs_time_t            cc_expire;     /* in seconds */
+       unsigned int        cc_early_expire:1;
+       unsigned long      cc_flags;
+       struct vfs_cred  cc_vcred;
+       spinlock_t              cc_lock;
+       struct list_head              cc_req_list;   /* waiting reqs linked here */
+       struct list_head              cc_gc_chain;   /* linked to gc chain */
+};
+
+/**
+ * client side policy operation vector.
+ */
+struct ptlrpc_sec_cops {
+       /**
+        * Given an \a imp, create and initialize a ptlrpc_sec structure.
+        * \param ctx service context:
+        * - regular import: \a ctx should be NULL;
+        * - reverse import: \a ctx is obtained from incoming request.
+        * \param flavor specifies which flavor to use.
+        *
+        * When necessary, the policy module is responsible for taking a
+        * reference on the import.
+        *
+        * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr().
+        */
+       struct ptlrpc_sec *     (*create_sec)  (struct obd_import *imp,
+                                               struct ptlrpc_svc_ctx *ctx,
+                                               struct sptlrpc_flavor *flavor);
+
+       /**
+        * Destructor of ptlrpc_sec. When called, the refcount has dropped
+        * to 0 and all contexts have been destroyed.
+        *
+        * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr().
+        */
+       void                (*destroy_sec) (struct ptlrpc_sec *sec);
+
+       /**
+        * Notify that this ptlrpc_sec is going to die. The policy module is
+        * expected to set sec->ps_dying and take whatever actions are
+        * necessary.
+        *
+        * \see plain_kill_sec(), gss_sec_kill().
+        */
+       void                (*kill_sec)    (struct ptlrpc_sec *sec);
+
+       /**
+        * Given \a vcred, lookup and/or create its context. The policy module
+        * is supposed to maintain its own context cache.
+        * XXX currently \a create and \a remove_dead are always 1, perhaps
+        * they should be removed completely.
+        *
+        * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr().
+        */
+       struct ptlrpc_cli_ctx * (*lookup_ctx)  (struct ptlrpc_sec *sec,
+                                               struct vfs_cred *vcred,
+                                               int create,
+                                               int remove_dead);
+
+       /**
+        * Called when the reference count of \a ctx has dropped to 0. The
+        * policy module is supposed to destroy this context, or do whatever
+        * else its cache maintenance mechanism requires.
+        *
+        * \param sync if zero, we should not wait for the context to be
+        * destroyed completely.
+        *
+        * \see plain_release_ctx(), gss_sec_release_ctx_kr().
+        */
+       void                (*release_ctx) (struct ptlrpc_sec *sec,
+                                               struct ptlrpc_cli_ctx *ctx,
+                                               int sync);
+
+       /**
+        * Flush the context cache.
+        *
+        * \param uid the user whose contexts to flush; -1 means all contexts.
+        * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected
+        * contexts should be cleared immediately.
+        * \param force if zero, only idle contexts will be flushed.
+        *
+        * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr().
+        */
+       int                  (*flush_ctx_cache)
+                                              (struct ptlrpc_sec *sec,
+                                               uid_t uid,
+                                               int grace,
+                                               int force);
+
+       /**
+        * Called periodically by garbage collector to remove dead contexts
+        * from cache.
+        *
+        * \see gss_sec_gc_ctx_kr().
+        */
+       void                (*gc_ctx)      (struct ptlrpc_sec *sec);
+
+       /**
+        * Given a context \a ctx, install a corresponding reverse service
+        * context on the client side.
+        * XXX currently it's only used by GSS module, maybe we should remove
+        * this from general API.
+        */
+       int                  (*install_rctx)(struct obd_import *imp,
+                                               struct ptlrpc_sec *sec,
+                                               struct ptlrpc_cli_ctx *ctx);
+
+       /**
+        * To allocate request buffer for \a req.
+        *
+        * \pre req->rq_reqmsg == NULL.
+        * \pre req->rq_reqbuf == NULL; otherwise it is pre-allocated and
+        * we are not supposed to free it.
+        * \post on success, req->rq_reqmsg points to a buffer of size
+        * at least \a lustre_msg_size.
+        *
+        * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf().
+        */
+       int                  (*alloc_reqbuf)(struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req,
+                                               int lustre_msg_size);
+
+       /**
+        * To free request buffer for \a req.
+        *
+        * \pre req->rq_reqbuf != NULL.
+        *
+        * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf().
+        */
+       void                (*free_reqbuf) (struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req);
+
+       /**
+        * To allocate reply buffer for \a req.
+        *
+        * \pre req->rq_repbuf == NULL.
+        * \post on success, req->rq_repbuf points to a buffer of size
+        * req->rq_repbuf_len, which should be large enough to receive a
+        * reply transformed from \a lustre_msg_size bytes of clear text.
+        *
+        * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf().
+        */
+       int                  (*alloc_repbuf)(struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req,
+                                               int lustre_msg_size);
+
+       /**
+        * To free reply buffer for \a req.
+        *
+        * \pre req->rq_repbuf != NULL.
+        * \post req->rq_repbuf == NULL.
+        * \post req->rq_repbuf_len == 0.
+        *
+        * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf().
+        */
+       void                (*free_repbuf) (struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req);
+
+       /**
+        * To expand the request buffer of \a req so that the \a segment in
+        * the request message pointed to by req->rq_reqmsg can accommodate
+        * at least \a newsize bytes of data.
+        *
+        * \pre req->rq_reqmsg->lm_buflens[segment] < newsize.
+        *
+        * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(),
+        * gss_enlarge_reqbuf().
+        */
+       int                  (*enlarge_reqbuf)
+                                              (struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req,
+                                               int segment, int newsize);
+       /*
+        * misc
+        */
+       int                  (*display)     (struct ptlrpc_sec *sec,
+                                               struct seq_file *seq);
+};
+
+/**
+ * server side policy operation vector.
+ */
+struct ptlrpc_sec_sops {
+       /**
+        * Verify an incoming request.
+        *
+        * \pre the request message is pointed to by req->rq_reqbuf, its size
+        * is req->rq_reqdata_len, and the message has been unpacked to
+        * host byte order.
+        *
+        * \retval SECSVC_OK success, req->rq_reqmsg points to the request
+        * message in clear text, its size is req->rq_reqlen; req->rq_svc_ctx
+        * is set; req->rq_sp_from is decoded from the request.
+        * \retval SECSVC_COMPLETE success, the request has been fully
+        * processed and the reply message has been prepared; req->rq_sp_from
+        * is decoded from the request.
+        * \retval SECSVC_DROP failed, this request should be dropped.
+        *
+        * \see null_accept(), plain_accept(), gss_svc_accept_kr().
+        */
+       int                  (*accept)      (struct ptlrpc_request *req);
+
+       /**
+        * Perform security transformation upon reply message.
+        *
+        * \pre the reply message is pointed to by req->rq_reply_state->rs_msg
+        * and its size is req->rq_replen.
+        * \post req->rs_repdata_len is the final message size.
+        * \post req->rq_reply_off is set.
+        *
+        * \see null_authorize(), plain_authorize(), gss_svc_authorize().
+        */
+       int                  (*authorize)   (struct ptlrpc_request *req);
+
+       /**
+        * Invalidate server context \a ctx.
+        *
+        * \see gss_svc_invalidate_ctx().
+        */
+       void                (*invalidate_ctx)
+                                              (struct ptlrpc_svc_ctx *ctx);
+
+       /**
+        * Allocate a ptlrpc_reply_state.
+        *
+        * \param msgsize size of the reply message in clear text.
+        * \pre if req->rq_reply_state != NULL, then it is pre-allocated and we
+        * should simply use it; otherwise we are responsible for allocating
+        * a new one.
+        * \post req->rq_reply_state != NULL;
+        * \post req->rq_reply_state->rs_msg != NULL;
+        *
+        * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs().
+        */
+       int                  (*alloc_rs)    (struct ptlrpc_request *req,
+                                               int msgsize);
+
+       /**
+        * Free a ptlrpc_reply_state.
+        */
+       void                (*free_rs)     (struct ptlrpc_reply_state *rs);
+
+       /**
+        * Release the server context \a ctx.
+        *
+        * \see gss_svc_free_ctx().
+        */
+       void                (*free_ctx)    (struct ptlrpc_svc_ctx *ctx);
+
+       /**
+        * Install a reverse context based on the server context \a ctx.
+        *
+        * \see gss_svc_install_rctx_kr().
+        */
+       int                  (*install_rctx)(struct obd_import *imp,
+                                               struct ptlrpc_svc_ctx *ctx);
+
+       /**
+        * Prepare buffer for incoming bulk write.
+        *
+        * \pre desc->bd_iov and desc->bd_iov_count describe the buffer
+        * intended to receive the write.
+        *
+        * \see gss_svc_prep_bulk().
+        */
+       int                  (*prep_bulk)   (struct ptlrpc_request *req,
+                                               struct ptlrpc_bulk_desc *desc);
+
+       /**
+        * Unwrap the bulk write data.
+        *
+        * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk().
+        */
+       int                  (*unwrap_bulk) (struct ptlrpc_request *req,
+                                               struct ptlrpc_bulk_desc *desc);
+
+       /**
+        * Wrap the bulk read data.
+        *
+        * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk().
+        */
+       int                  (*wrap_bulk)   (struct ptlrpc_request *req,
+                                               struct ptlrpc_bulk_desc *desc);
+};
+
+struct ptlrpc_sec_policy {
+       module_t                   *sp_owner;
+       char                       *sp_name;
+       __u16                      sp_policy; /* policy number */
+       struct ptlrpc_sec_cops   *sp_cops;   /* client ops */
+       struct ptlrpc_sec_sops   *sp_sops;   /* server ops */
+};
+
+#define PTLRPC_SEC_FL_REVERSE     0x0001 /* reverse sec */
+#define PTLRPC_SEC_FL_ROOTONLY   0x0002 /* treat everyone as root */
+#define PTLRPC_SEC_FL_UDESC         0x0004 /* ship udesc */
+#define PTLRPC_SEC_FL_BULK           0x0008 /* intensive bulk i/o expected */
+#define PTLRPC_SEC_FL_PAG             0x0010 /* PAG mode */
+
+/**
+ * The ptlrpc_sec represents the client side ptlrpc security facilities;
+ * each obd_import (both regular and reverse) must be associated with
+ * a ptlrpc_sec.
+ *
+ * \see sptlrpc_import_sec_adapt().
+ */
+struct ptlrpc_sec {
+       struct ptlrpc_sec_policy       *ps_policy;
+       atomic_t                    ps_refcount;
+       /** statistic only */
+       atomic_t                    ps_nctx;
+       /** unique identifier */
+       int                          ps_id;
+       struct sptlrpc_flavor      ps_flvr;
+       enum lustre_sec_part        ps_part;
+       /** after set, no more new context will be created */
+       unsigned int                ps_dying:1;
+       /** owning import */
+       struct obd_import             *ps_import;
+       spinlock_t                      ps_lock;
+
+       /*
+        * garbage collection
+        */
+       struct list_head                      ps_gc_list;
+       cfs_time_t                    ps_gc_interval; /* in seconds */
+       cfs_time_t                    ps_gc_next;     /* in seconds */
+};
+
+static inline int sec_is_reverse(struct ptlrpc_sec *sec)
+{
+       return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE);
+}
+
+static inline int sec_is_rootonly(struct ptlrpc_sec *sec)
+{
+       return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY);
+}
+
+
+struct ptlrpc_svc_ctx {
+       atomic_t                    sc_refcount;
+       struct ptlrpc_sec_policy       *sc_policy;
+};
+
+/*
+ * user identity descriptor
+ */
+#define LUSTRE_MAX_GROUPS             (128)
+
+struct ptlrpc_user_desc {
+       __u32      pud_uid;
+       __u32      pud_gid;
+       __u32      pud_fsuid;
+       __u32      pud_fsgid;
+       __u32      pud_cap;
+       __u32      pud_ngroups;
+       __u32      pud_groups[0];
+};
+
+/*
+ * bulk flavors
+ */
+enum sptlrpc_bulk_hash_alg {
+       BULK_HASH_ALG_NULL      = 0,
+       BULK_HASH_ALG_ADLER32,
+       BULK_HASH_ALG_CRC32,
+       BULK_HASH_ALG_MD5,
+       BULK_HASH_ALG_SHA1,
+       BULK_HASH_ALG_SHA256,
+       BULK_HASH_ALG_SHA384,
+       BULK_HASH_ALG_SHA512,
+       BULK_HASH_ALG_MAX
+};
+
+const char * sptlrpc_get_hash_name(__u8 hash_alg);
+__u8 sptlrpc_get_hash_alg(const char *algname);
+
+enum {
+       BSD_FL_ERR      = 1,
+};
+
+struct ptlrpc_bulk_sec_desc {
+       __u8        bsd_version;    /* 0 */
+       __u8        bsd_type;       /* SPTLRPC_BULK_XXX */
+       __u8        bsd_svc;    /* SPTLRPC_BULK_SVC_XXXX */
+       __u8        bsd_flags;      /* flags */
+       __u32      bsd_nob;     /* nob of bulk data */
+       __u8        bsd_data[0];    /* policy-specific token */
+};
+
+
+/*
+ * lprocfs
+ */
+struct proc_dir_entry;
+extern struct proc_dir_entry *sptlrpc_proc_root;
+
+/*
+ * Round size up to the next power of 2, for slab allocation.
+ * @size must be sane (must not overflow after rounding up).
+ */
+static inline int size_roundup_power2(int size)
+{
+       size--;
+       size |= size >> 1;
+       size |= size >> 2;
+       size |= size >> 4;
+       size |= size >> 8;
+       size |= size >> 16;
+       size++;
+       return size;
+}
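+/* e.g. size_roundup_power2(1000) returns 1024; an exact power of 2 is
+ * returned unchanged. */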
+
+/*
+ * internal support libraries
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+                                 int segment, int newsize);
+
+/*
+ * security policies
+ */
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy);
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy);
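+
+/* A policy module typically fills a struct ptlrpc_sec_policy and registers
+ * it from its module init (a sketch only; the "example_*" names are
+ * placeholders and a real policy uses its own entry in enum sptlrpc_policy):
+ *
+ *     static struct ptlrpc_sec_policy example_policy = {
+ *             .sp_owner  = THIS_MODULE,
+ *             .sp_name   = "example",
+ *             .sp_policy = SPTLRPC_POLICY_PLAIN,
+ *             .sp_cops   = &example_cops,
+ *             .sp_sops   = &example_sops,
+ *     };
+ *
+ *     rc = sptlrpc_register_policy(&example_policy);
+ *     ...
+ *     rc = sptlrpc_unregister_policy(&example_policy);
+ */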
+
+__u32 sptlrpc_name2flavor_base(const char *name);
+const char *sptlrpc_flavor2name_base(__u32 flvr);
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+                              char *buf, int bufsize);
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize);
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize);
+
+static inline
+struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy)
+{
+       __module_get(policy->sp_owner);
+       return policy;
+}
+
+static inline
+void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy)
+{
+       module_put(policy->sp_owner);
+}
+
+/*
+ * client credential
+ */
+static inline
+unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx)
+{
+       return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK);
+}
+
+static inline
+int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx)
+{
+       return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE);
+}
+
+static inline
+int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx)
+{
+       return (cli_ctx_status(ctx) != 0);
+}
+
+static inline
+int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx)
+{
+       return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0);
+}
+
+static inline
+int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx)
+{
+       return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0);
+}
+
+static inline
+int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx)
+{
+       return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0);
+}
+
+static inline
+int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx)
+{
+       return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0);
+}
+
+/*
+ * sec get/put
+ */
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec);
+void sptlrpc_sec_put(struct ptlrpc_sec *sec);
+
+/*
+ * internal APIs used only by policy implementations
+ */
+int  sptlrpc_get_next_secid(void);
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec);
+
+/*
+ * exported client context api
+ */
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync);
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx);
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+
+/*
+ * exported client context wrap/buffers
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req);
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+                              int segment, int newsize);
+int  sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+                                   struct ptlrpc_request **req_ret);
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req);
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req);
+
+/*
+ * exported higher interface of import & request
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+                            struct ptlrpc_svc_ctx *ctx,
+                            struct sptlrpc_flavor *flvr);
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp);
+void sptlrpc_import_sec_put(struct obd_import *imp);
+
+int  sptlrpc_import_check_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp);
+int  sptlrpc_req_get_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync);
+int  sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout);
+int  sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode);
+
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule);
+
+/* gc */
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx);
+
+/* misc */
+const char * sec2target_str(struct ptlrpc_sec *sec);
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev);
+
+/*
+ * server side
+ */
+enum secsvc_accept_res {
+       SECSVC_OK       = 0,
+       SECSVC_COMPLETE,
+       SECSVC_DROP,
+};
+
+int  sptlrpc_svc_unwrap_request(struct ptlrpc_request *req);
+int  sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int  sptlrpc_svc_wrap_reply(struct ptlrpc_request *req);
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs);
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req);
+
+int  sptlrpc_target_export_check(struct obd_export *exp,
+                                struct ptlrpc_request *req);
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+                                     struct sptlrpc_rule_set *rset);
+
+/*
+ * reverse context
+ */
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+                               struct ptlrpc_svc_ctx *ctx);
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+                               struct ptlrpc_cli_ctx *ctx);
+
+/* bulk security api */
+int sptlrpc_enc_pool_add_user(void);
+int sptlrpc_enc_pool_del_user(void);
+int  sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc);
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc);
+
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc);
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+                                struct ptlrpc_bulk_desc *desc,
+                                int nob);
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+                                 struct ptlrpc_bulk_desc *desc);
+
+/* bulk helpers (internal use only by policies) */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+                             void *buf, int buflen);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed);
+
+/* user descriptor helpers */
+static inline int sptlrpc_user_desc_size(int ngroups)
+{
+       return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32);
+}
+
+int sptlrpc_current_user_desc_size(void);
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset);
+int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed);
+
+
+#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN)
+#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE)
+
+enum {
+       LUSTRE_SEC_NONE  = 0,
+       LUSTRE_SEC_REMOTE       = 1,
+       LUSTRE_SEC_SPECIFY      = 2,
+       LUSTRE_SEC_ALL    = 3
+};
+
+/** @} sptlrpc */
+
+#endif /* _LUSTRE_SEC_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_update.h b/drivers/staging/lustre/lustre/include/lustre_update.h
new file mode 100644 (file)
index 0000000..84defce
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.htm
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre_update.h
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#ifndef _LUSTRE_UPDATE_H
+#define _LUSTRE_UPDATE_H
+
+#define UPDATE_BUFFER_SIZE     8192
+struct update_request {
+       struct dt_device        *ur_dt;
+       struct list_head                ur_list;    /* attached itself to thandle */
+       int                     ur_flags;
+       int                     ur_rc;      /* request result */
+       int                     ur_batchid; /* Current batch(trans) id */
+       struct update_buf       *ur_buf;   /* Holding the update req */
+};
+
+static inline unsigned long update_size(struct update *update)
+{
+       unsigned long size;
+       int        i;
+
+       size = cfs_size_round(offsetof(struct update, u_bufs[0]));
+       for (i = 0; i < UPDATE_BUF_COUNT; i++)
+               size += cfs_size_round(update->u_lens[i]);
+
+       return size;
+}
+
+static inline void *update_param_buf(struct update *update, int index,
+                                    int *size)
+{
+       int     i;
+       void    *ptr;
+
+       if (index >= UPDATE_BUF_COUNT)
+               return NULL;
+
+       ptr = (char *)update + cfs_size_round(offsetof(struct update,
+                                                      u_bufs[0]));
+       for (i = 0; i < index; i++) {
+               LASSERT(update->u_lens[i] > 0);
+               ptr += cfs_size_round(update->u_lens[i]);
+       }
+
+       if (size != NULL)
+               *size = update->u_lens[index];
+
+       return ptr;
+}
+
+static inline unsigned long update_buf_size(struct update_buf *buf)
+{
+       unsigned long size;
+       int        i = 0;
+
+       size = cfs_size_round(offsetof(struct update_buf, ub_bufs[0]));
+       for (i = 0; i < buf->ub_count; i++) {
+               struct update *update;
+
+               update = (struct update *)((char *)buf + size);
+               size += update_size(update);
+       }
+       LASSERT(size <= UPDATE_BUFFER_SIZE);
+       return size;
+}
+
+static inline void *update_buf_get(struct update_buf *buf, int index, int *size)
+{
+       int     count = buf->ub_count;
+       void    *ptr;
+       int     i = 0;
+
+       if (index >= count)
+               return NULL;
+
+       ptr = (char *)buf + cfs_size_round(offsetof(struct update_buf,
+                                                   ub_bufs[0]));
+       for (i = 0; i < index; i++)
+               ptr += update_size((struct update *)ptr);
+
+       if (size != NULL)
+               *size = update_size((struct update *)ptr);
+
+       return ptr;
+}
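
update_size(), update_param_buf() and update_buf_get() all walk a packed sequence of variable-length records, advancing by the rounded size of each preceding record. The sketch below reproduces that walk with a simplified record type; size_round() is a stand-in for cfs_size_round(), assumed here to round up to an 8-byte boundary.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Stand-in for cfs_size_round(): assumed to round up to 8 bytes. */
static size_t size_round(size_t v)
{
	return (v + 7) & ~(size_t)7;
}

/* Simplified record: a length followed by its payload, padded to the
 * rounded size, mimicking how update_buf_get() skips earlier updates. */
struct rec {
	uint32_t len;
	char	 data[0];
};

static struct rec *rec_get(char *buf, int index)
{
	char *ptr = buf;
	int i;

	for (i = 0; i < index; i++) {
		struct rec *r = (struct rec *)ptr;

		ptr += size_round(sizeof(*r) + r->len);
	}
	return (struct rec *)ptr;
}

int main(void)
{
	uint64_t storage[16] = { 0 };	/* 8-byte aligned backing buffer */
	char *buf = (char *)storage;
	const char *words[] = { "one", "three", "seventeen" };
	char *p = buf;
	int i;

	for (i = 0; i < 3; i++) {
		struct rec *r = (struct rec *)p;

		r->len = strlen(words[i]) + 1;
		memcpy(r->data, words[i], r->len);
		p += size_round(sizeof(*r) + r->len);
	}
	for (i = 0; i < 3; i++)
		printf("record %d: %s\n", i, rec_get(buf, i)->data);
	return 0;
}
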
+
+static inline void update_init_reply_buf(struct update_reply *reply, int count)
+{
+       reply->ur_version = UPDATE_REPLY_V1;
+       reply->ur_count = count;
+}
+
+static inline void *update_get_buf_internal(struct update_reply *reply,
+                                           int index, int *size)
+{
+       char *ptr;
+       int count = reply->ur_count;
+       int i;
+
+       if (index >= count)
+               return NULL;
+
+       ptr = (char *)reply + cfs_size_round(offsetof(struct update_reply,
+                                            ur_lens[count]));
+       for (i = 0; i < index; i++) {
+               LASSERT(reply->ur_lens[i] > 0);
+               ptr += cfs_size_round(reply->ur_lens[i]);
+       }
+
+       if (size != NULL)
+               *size = reply->ur_lens[index];
+
+       return ptr;
+}
+
+static inline void update_insert_reply(struct update_reply *reply, void *data,
+                                      int data_len, int index, int rc)
+{
+       char *ptr;
+
+       ptr = update_get_buf_internal(reply, index, NULL);
+       LASSERT(ptr != NULL);
+
+       *(int *)ptr = cpu_to_le32(rc);
+       ptr += sizeof(int);
+       if (data_len > 0) {
+               LASSERT(data != NULL);
+               memcpy(ptr, data, data_len);
+       }
+       reply->ur_lens[index] = data_len + sizeof(int);
+}
+
+static inline int update_get_reply_buf(struct update_reply *reply, void **buf,
+                                      int index)
+{
+       char *ptr;
+       int  size = 0;
+       int  result;
+
+       ptr = update_get_buf_internal(reply, index, &size);
+       LASSERT(ptr != NULL && size >= sizeof(int));
+
+       result = *(int *)ptr;
+       if (result < 0)
+               return result;
+
+       *buf = ptr + sizeof(int);
+       return size - sizeof(int);
+}
+
+static inline int update_get_reply_result(struct update_reply *reply,
+                                         void **buf, int index)
+{
+       void *ptr;
+       int  size;
+
+       ptr = update_get_buf_internal(reply, index, &size);
+       LASSERT(ptr != NULL && size > sizeof(int));
+       return *(int *)ptr;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h
new file mode 100644 (file)
index 0000000..dc187b8
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef _LUSTRE_VER_H_
+#define _LUSTRE_VER_H_
+/* This file is automatically generated from lustre/include/lustre_ver.h.in,
+ * based on parameters in lustre/autoconf/lustre-version.ac.
+ * Changes made directly to this file will be lost. */
+
+#define LUSTRE_MAJOR 2
+#define LUSTRE_MINOR 3
+#define LUSTRE_PATCH 64
+#define LUSTRE_FIX 0
+#define LUSTRE_VERSION_STRING "2.3.64"
+
+#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX)
+
+/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches
+ * by at most this amount (set in lustre/autoconf/lustre-version.ac). */
+#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32)
+
+/* If the Lustre version of a client differs from that of the servers it
+ * connects to by more than this amount, the client will issue a warning.
+ * (set in lustre/autoconf/lustre-version.ac) */
+#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0)
+
+#endif
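
LUSTRE_VERSION_CODE packs the four components through OBD_OCD_VERSION(), which is defined elsewhere in the headers; the sketch below assumes the conventional 8-bits-per-component packing to show how a version difference compares against LUSTRE_VERSION_OFFSET_WARN. The macro name and values here are illustrative only.

#include <stdio.h>
#include <stdint.h>

/* Assumed packing for OBD_OCD_VERSION(): 8 bits per component. */
#define DEMO_OCD_VERSION(major, minor, patch, fix) \
	(((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix))

int main(void)
{
	uint32_t client = DEMO_OCD_VERSION(2, 3, 64, 0);	/* "2.3.64" */
	uint32_t server = DEMO_OCD_VERSION(2, 8, 0, 0);
	uint32_t warn   = DEMO_OCD_VERSION(0, 4, 0, 0);	/* OFFSET_WARN */
	uint32_t diff   = server > client ? server - client : client - server;

	printf("client=0x%08x server=0x%08x diff=0x%08x\n", client, server, diff);
	if (diff > warn)
		printf("versions differ by more than 0.4.0.0: warn\n");
	return 0;
}
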
diff --git a/drivers/staging/lustre/lustre/include/lvfs.h b/drivers/staging/lustre/lustre/include/lvfs.h
new file mode 100644 (file)
index 0000000..28f1a6b
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LVFS_H__
+#define __LVFS_H__
+
+#define LL_FID_NAMELEN (16 + 1 + 8 + 1)
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lvfs.h>
+
+#include <linux/libcfs/lucache.h>
+
+
+/* lvfs_common.c */
+struct dentry *lvfs_fid2dentry(struct lvfs_run_ctxt *, __u64, __u32, __u64, void *data);
+
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+              struct lvfs_ucred *cred);
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+             struct lvfs_ucred *cred);
+#endif
diff --git a/drivers/staging/lustre/lustre/include/md_object.h b/drivers/staging/lustre/lustre/include/md_object.h
new file mode 100644 (file)
index 0000000..eefa0f1
--- /dev/null
@@ -0,0 +1,946 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/md_object.h
+ *
+ * Extension of lu_object.h for metadata objects
+ */
+
+#ifndef _LUSTRE_MD_OBJECT_H
+#define _LUSTRE_MD_OBJECT_H
+
+/** \defgroup md md
+ * Sub-class of lu_object with methods common for "meta-data" objects in MDT
+ * stack.
+ *
+ * Meta-data objects implement namespace operations: you can link, unlink
+ * them, and treat them as directories.
+ *
+ * Examples: mdt, cmm, and mdd are implementations of the md interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <dt_object.h>
+
+struct md_device;
+struct md_device_operations;
+struct md_object;
+struct obd_export;
+
+enum {
+       UCRED_INVALID   = -1,
+       UCRED_INIT      = 0,
+       UCRED_OLD       = 1,
+       UCRED_NEW       = 2
+};
+
+enum {
+       MD_CAPAINFO_MAX = 5
+};
+
+/** There are at most 5 FIDs in one operation (see rename); NOTE the last one
+ * is a temporary FID used for is_subdir() */
+struct md_capainfo {
+       __u32              mc_auth;
+       __u32              mc_padding;
+       struct lu_fid      mc_fid[MD_CAPAINFO_MAX];
+       struct lustre_capa     *mc_capa[MD_CAPAINFO_MAX];
+};
+
+struct md_quota {
+       struct obd_export       *mq_exp;
+};
+
+/**
+ * Implemented in mdd/mdd_handler.c.
+ *
+ * XXX should be moved into separate .h/.c together with all md security
+ * related definitions.
+ */
+struct md_capainfo *md_capainfo(const struct lu_env *env);
+struct md_quota *md_quota(const struct lu_env *env);
+
+/** metadata attributes */
+enum ma_valid {
+       MA_INODE     = (1 << 0),
+       MA_LOV       = (1 << 1),
+       MA_COOKIE    = (1 << 2),
+       MA_FLAGS     = (1 << 3),
+       MA_LMV       = (1 << 4),
+       MA_ACL_DEF   = (1 << 5),
+       MA_LOV_DEF   = (1 << 6),
+       MA_LAY_GEN   = (1 << 7),
+       MA_HSM       = (1 << 8),
+       MA_SOM       = (1 << 9),
+       MA_PFID      = (1 << 10)
+};
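
ma_valid and ma_need in struct md_attr further down are bitmasks built from these flags; a trivial, self-contained sketch of the test/set/clear pattern (flag values copied from the enum above):

#include <stdio.h>

/* Values copied from enum ma_valid above. */
#define MA_INODE (1 << 0)
#define MA_LOV   (1 << 1)
#define MA_SOM   (1 << 9)

int main(void)
{
	unsigned long long valid = 0;

	valid |= MA_INODE | MA_LOV;	/* attributes that were filled in */
	if (valid & MA_LOV)
		printf("LOV EA is valid\n");
	if (!(valid & MA_SOM))
		printf("SOM attributes were not fetched\n");
	valid &= ~(unsigned long long)MA_LOV;	/* drop the LOV EA again */
	return 0;
}
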
+
+typedef enum {
+       MDL_MINMODE  = 0,
+       MDL_EX       = 1,
+       MDL_PW       = 2,
+       MDL_PR       = 4,
+       MDL_CW       = 8,
+       MDL_CR       = 16,
+       MDL_NL       = 32,
+       MDL_GROUP    = 64,
+       MDL_MAXMODE
+} mdl_mode_t;
+
+typedef enum {
+       MDT_NUL_LOCK = 0,
+       MDT_REG_LOCK = (1 << 0),
+       MDT_PDO_LOCK = (1 << 1)
+} mdl_type_t;
+
+/* memory structure for HSM attributes;
+ * for a description of the fields see the on-disk structure hsm_attrs,
+ * which is defined in lustre_idl.h
+ */
+struct md_hsm {
+       __u32   mh_compat;
+       __u32   mh_flags;
+       __u64   mh_arch_id;
+       __u64   mh_arch_ver;
+};
+
+#define IOEPOCH_INVAL 0
+
+/* memory structure for SOM attributes;
+ * for a description of the fields see the on-disk structure som_attrs,
+ * which is defined in lustre_idl.h
+ */
+struct md_som_data {
+       __u32   msd_compat;
+       __u32   msd_incompat;
+       __u64   msd_ioepoch;
+       __u64   msd_size;
+       __u64   msd_blocks;
+       __u64   msd_mountid;
+};
+
+struct md_attr {
+       __u64              ma_valid;
+       __u64              ma_need;
+       __u64              ma_attr_flags;
+       struct lu_attr    ma_attr;
+       struct lu_fid      ma_pfid;
+       struct md_hsm      ma_hsm;
+       struct lov_mds_md      *ma_lmm;
+       struct lmv_stripe_md   *ma_lmv;
+       void               *ma_acl;
+       struct llog_cookie     *ma_cookie;
+       struct lustre_capa     *ma_capa;
+       struct md_som_data     *ma_som;
+       int                  ma_lmm_size;
+       int                  ma_lmv_size;
+       int                  ma_acl_size;
+       int                  ma_cookie_size;
+       __u16              ma_layout_gen;
+};
+
+/** Additional parameters for create */
+struct md_op_spec {
+       union {
+               /** symlink target */
+               const char             *sp_symname;
+               /** parent FID for cross-ref mkdir */
+               const struct lu_fid      *sp_pfid;
+               /** eadata for regular files */
+               struct md_spec_reg {
+                       /** lov objs exist already */
+                       const struct lu_fid   *fid;
+                       const void *eadata;
+                       int  eadatalen;
+               } sp_ea;
+       } u;
+
+       /** Create flag from client: such as MDS_OPEN_CREAT, and others. */
+       __u64      sp_cr_flags;
+
+       /** don't create lov objects or llog cookies - this is replay */
+       unsigned int no_create:1,
+                    sp_cr_lookup:1, /* do lookup sanity check or not. */
+                    sp_rm_entry:1;  /* only remove name entry */
+
+       /** Current lock mode for parent dir where create is performing. */
+       mdl_mode_t sp_cr_mode;
+
+       /** to create directory */
+       const struct dt_index_features *sp_feat;
+};
+
+/**
+ * Operations implemented for each md object (both directory and leaf).
+ */
+struct md_object_operations {
+       int (*moo_permission)(const struct lu_env *env,
+                             struct md_object *pobj, struct md_object *cobj,
+                             struct md_attr *attr, int mask);
+
+       int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj,
+                           struct md_attr *attr);
+
+       int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj,
+                           const struct md_attr *attr);
+
+       int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj,
+                            struct lu_buf *buf, const char *name);
+
+       int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj,
+                             struct lu_buf *buf);
+
+       int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj,
+                            const struct lu_buf *buf, const char *name,
+                            int fl);
+
+       int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj,
+                            const char *name);
+
+       /** This method is used to swap the layouts between 2 objects */
+       int (*moo_swap_layouts)(const struct lu_env *env,
+                              struct md_object *obj1, struct md_object *obj2,
+                              __u64 flags);
+
+       /** \retval number of bytes actually read upon success */
+       int (*moo_readpage)(const struct lu_env *env, struct md_object *obj,
+                           const struct lu_rdpg *rdpg);
+
+       int (*moo_readlink)(const struct lu_env *env, struct md_object *obj,
+                           struct lu_buf *buf);
+       int (*moo_changelog)(const struct lu_env *env,
+                            enum changelog_rec_type type, int flags,
+                            struct md_object *obj);
+       /** part of cross-ref operation */
+       int (*moo_object_create)(const struct lu_env *env,
+                                struct md_object *obj,
+                                const struct md_op_spec *spec,
+                                struct md_attr *ma);
+
+       int (*moo_ref_add)(const struct lu_env *env,
+                          struct md_object *obj,
+                          const struct md_attr *ma);
+
+       int (*moo_ref_del)(const struct lu_env *env,
+                          struct md_object *obj,
+                          struct md_attr *ma);
+
+       int (*moo_open)(const struct lu_env *env,
+                       struct md_object *obj, int flag);
+
+       int (*moo_close)(const struct lu_env *env, struct md_object *obj,
+                        struct md_attr *ma, int mode);
+
+       int (*moo_capa_get)(const struct lu_env *, struct md_object *,
+                           struct lustre_capa *, int renewal);
+
+       int (*moo_object_sync)(const struct lu_env *, struct md_object *);
+
+       int (*moo_file_lock)(const struct lu_env *env, struct md_object *obj,
+                            struct lov_mds_md *lmm, struct ldlm_extent *extent,
+                            struct lustre_handle *lockh);
+       int (*moo_file_unlock)(const struct lu_env *env, struct md_object *obj,
+                              struct lov_mds_md *lmm,
+                              struct lustre_handle *lockh);
+       int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj,
+                              struct lustre_handle *lh,
+                              struct ldlm_enqueue_info *einfo,
+                              void *policy);
+};
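
Concrete layers implement this interface by filling in a static operations vector, and the inline mo_*() wrappers further down simply LASSERT that a method exists and dispatch through it. Below is a stripped-down user-space sketch of that ops-vector pattern using hypothetical stand-in types (the real vectors are provided by the mdd layer).

#include <stdio.h>
#include <assert.h>

/* Stand-ins for md_object / md_object_operations. */
struct demo_obj;

struct demo_obj_ops {
	int (*moo_attr_get)(struct demo_obj *obj, int *attr_out);
	int (*moo_open)(struct demo_obj *obj, int flags);
};

struct demo_obj {
	const struct demo_obj_ops *mo_ops;
	int                        attr;
};

/* A concrete layer fills in the vector... */
static int demo_attr_get(struct demo_obj *obj, int *attr_out)
{
	*attr_out = obj->attr;
	return 0;
}

static const struct demo_obj_ops demo_ops = {
	.moo_attr_get = demo_attr_get,
	/* .moo_open intentionally left NULL */
};

/* ...and callers dispatch through thin wrappers, as mo_attr_get() does. */
static int mo_attr_get_demo(struct demo_obj *obj, int *attr_out)
{
	assert(obj->mo_ops->moo_attr_get);	/* plays the role of LASSERT() */
	return obj->mo_ops->moo_attr_get(obj, attr_out);
}

int main(void)
{
	struct demo_obj o = { .mo_ops = &demo_ops, .attr = 42 };
	int attr;

	mo_attr_get_demo(&o, &attr);
	printf("attr = %d\n", attr);
	return 0;
}
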
+
+/**
+ * Operations implemented for each directory object.
+ */
+struct md_dir_operations {
+       int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj,
+                             const struct lu_fid *fid, struct lu_fid *sfid);
+
+       int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj,
+                         const struct lu_name *lname, struct lu_fid *fid,
+                         struct md_op_spec *spec);
+
+       mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env,
+                                   struct md_object *obj,
+                                   mdl_mode_t mode);
+
+       int (*mdo_create)(const struct lu_env *env, struct md_object *pobj,
+                         const struct lu_name *lname, struct md_object *child,
+                         struct md_op_spec *spec,
+                         struct md_attr *ma);
+
+       /** This method is used for creating the data object for this meta object */
+       int (*mdo_create_data)(const struct lu_env *env, struct md_object *p,
+                              struct md_object *o,
+                              const struct md_op_spec *spec,
+                              struct md_attr *ma);
+
+       int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj,
+                         struct md_object *tpobj, const struct lu_fid *lf,
+                         const struct lu_name *lsname, struct md_object *tobj,
+                         const struct lu_name *ltname, struct md_attr *ma);
+
+       int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj,
+                       struct md_object *src_obj, const struct lu_name *lname,
+                       struct md_attr *ma);
+
+       int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj,
+                         struct md_object *cobj, const struct lu_name *lname,
+                         struct md_attr *ma, int no_name);
+
+       /** This method is used to compare a requested layout to an existing
+        * layout (struct lov_user_md_v1/3 vs struct lov_mds_md_v1/3) */
+       int (*mdo_lum_lmm_cmp)(const struct lu_env *env,
+                              struct md_object *cobj,
+                              const struct md_op_spec *spec,
+                              struct md_attr *ma);
+
+       /** partial ops for cross-ref case */
+       int (*mdo_name_insert)(const struct lu_env *env,
+                              struct md_object *obj,
+                              const struct lu_name *lname,
+                              const struct lu_fid *fid,
+                              const struct md_attr *ma);
+
+       int (*mdo_name_remove)(const struct lu_env *env,
+                              struct md_object *obj,
+                              const struct lu_name *lname,
+                              const struct md_attr *ma);
+
+       int (*mdo_rename_tgt)(const struct lu_env *env, struct md_object *pobj,
+                             struct md_object *tobj, const struct lu_fid *fid,
+                             const struct lu_name *lname, struct md_attr *ma);
+};
+
+struct md_device_operations {
+       /** meta-data device related handlers. */
+       int (*mdo_root_get)(const struct lu_env *env, struct md_device *m,
+                           struct lu_fid *f);
+
+       int (*mdo_maxsize_get)(const struct lu_env *env, struct md_device *m,
+                              int *md_size, int *cookie_size);
+
+       int (*mdo_statfs)(const struct lu_env *env, struct md_device *m,
+                         struct obd_statfs *sfs);
+
+       int (*mdo_init_capa_ctxt)(const struct lu_env *env, struct md_device *m,
+                                 int mode, unsigned long timeout, __u32 alg,
+                                 struct lustre_capa_key *keys);
+
+       int (*mdo_update_capa_key)(const struct lu_env *env,
+                                  struct md_device *m,
+                                  struct lustre_capa_key *key);
+
+       int (*mdo_llog_ctxt_get)(const struct lu_env *env,
+                                struct md_device *m, int idx, void **h);
+
+       int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m,
+                            unsigned int cmd, int len, void *data);
+};
+
+enum md_upcall_event {
+       /** Sync the md layer */
+       MD_LOV_SYNC = (1 << 0),
+       /** Just for split; no transaction needed, for replay */
+       MD_NO_TRANS = (1 << 1),
+       MD_LOV_CONFIG = (1 << 2),
+       /** Trigger quota recovery */
+       MD_LOV_QUOTA = (1 << 3)
+};
+
+struct md_upcall {
+       /** This lock protects the upcall against its removal.
+        * The read lock is for using the upcall, the write lock for init/fini. */
+       struct rw_semaphore     mu_upcall_sem;
+       /** device to call, upper layer normally */
+       struct md_device       *mu_upcall_dev;
+       /** upcall function */
+       int (*mu_upcall)(const struct lu_env *env, struct md_device *md,
+                        enum md_upcall_event ev, void *data);
+};
+
+struct md_device {
+       struct lu_device                   md_lu_dev;
+       const struct md_device_operations *md_ops;
+       struct md_upcall                   md_upcall;
+};
+
+static inline void md_upcall_init(struct md_device *m, void *upcl)
+{
+       init_rwsem(&m->md_upcall.mu_upcall_sem);
+       m->md_upcall.mu_upcall_dev = NULL;
+       m->md_upcall.mu_upcall = upcl;
+}
+
+static inline void md_upcall_dev_set(struct md_device *m, struct md_device *up)
+{
+       down_write(&m->md_upcall.mu_upcall_sem);
+       m->md_upcall.mu_upcall_dev = up;
+       up_write(&m->md_upcall.mu_upcall_sem);
+}
+
+static inline void md_upcall_fini(struct md_device *m)
+{
+       down_write(&m->md_upcall.mu_upcall_sem);
+       m->md_upcall.mu_upcall_dev = NULL;
+       m->md_upcall.mu_upcall = NULL;
+       up_write(&m->md_upcall.mu_upcall_sem);
+}
+
+static inline int md_do_upcall(const struct lu_env *env, struct md_device *m,
+                               enum md_upcall_event ev, void *data)
+{
+       int rc = 0;
+       down_read(&m->md_upcall.mu_upcall_sem);
+       if (m->md_upcall.mu_upcall_dev != NULL &&
+           m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall != NULL) {
+               rc = m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall(env,
+                                             m->md_upcall.mu_upcall_dev,
+                                             ev, data);
+       }
+       up_read(&m->md_upcall.mu_upcall_sem);
+       return rc;
+}
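
Taken together, md_upcall_init(), md_upcall_dev_set() and md_do_upcall() implement a small callback-registration protocol guarded by a rw-semaphore: readers dispatch the upcall of the registered upper device, writers install or remove it. Below is a stripped-down user-space sketch of the same pattern, with a pthread rwlock standing in for the kernel rw_semaphore and simplified types instead of struct md_device and struct lu_env.

#include <stdio.h>
#include <pthread.h>

/* Simplified stand-ins for struct md_device and its embedded md_upcall. */
struct demo_dev;

struct demo_upcall {
	pthread_rwlock_t  mu_sem;	/* plays the role of mu_upcall_sem */
	struct demo_dev  *mu_dev;	/* device to call, upper layer normally */
	int             (*mu_upcall)(struct demo_dev *md, int event, void *data);
};

struct demo_dev {
	const char         *name;
	struct demo_upcall  upcall;
};

static int upper_upcall(struct demo_dev *md, int event, void *data)
{
	printf("%s: got event %d\n", md->name, event);
	return 0;
}

/* Mirrors md_do_upcall(): take the read lock, dispatch if registered. */
static int do_upcall(struct demo_dev *m, int event, void *data)
{
	int rc = 0;

	pthread_rwlock_rdlock(&m->upcall.mu_sem);
	if (m->upcall.mu_dev && m->upcall.mu_dev->upcall.mu_upcall)
		rc = m->upcall.mu_dev->upcall.mu_upcall(m->upcall.mu_dev,
							event, data);
	pthread_rwlock_unlock(&m->upcall.mu_sem);
	return rc;
}

int main(void)
{
	struct demo_dev upper = { .name = "upper" };
	struct demo_dev lower = { .name = "lower" };

	pthread_rwlock_init(&upper.upcall.mu_sem, NULL);
	pthread_rwlock_init(&lower.upcall.mu_sem, NULL);
	upper.upcall.mu_upcall = upper_upcall;

	/* like md_upcall_dev_set(): lower layer points at the upper device */
	pthread_rwlock_wrlock(&lower.upcall.mu_sem);
	lower.upcall.mu_dev = &upper;
	pthread_rwlock_unlock(&lower.upcall.mu_sem);

	return do_upcall(&lower, 1 /* e.g. MD_LOV_SYNC */, NULL);
}
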
+
+struct md_object {
+       struct lu_object                   mo_lu;
+       const struct md_object_operations *mo_ops;
+       const struct md_dir_operations    *mo_dir_ops;
+};
+
+/**
+ * seq-server site.
+ */
+struct seq_server_site {
+       struct lu_site       *ss_lu;
+       /**
+        * mds number of this site.
+        */
+       mdsno_t        ss_node_id;
+       /**
+        * Fid location database
+        */
+       struct lu_server_fld *ss_server_fld;
+       struct lu_client_fld *ss_client_fld;
+
+       /**
+        * Server Seq Manager
+        */
+       struct lu_server_seq *ss_server_seq;
+
+       /**
+        * Controller Seq Manager
+        */
+       struct lu_server_seq *ss_control_seq;
+       struct obd_export    *ss_control_exp;
+
+       /**
+        * Client Seq Manager
+        */
+       struct lu_client_seq *ss_client_seq;
+};
+
+static inline struct md_device *lu2md_dev(const struct lu_device *d)
+{
+       LASSERT(IS_ERR(d) || lu_device_is_md(d));
+       return container_of0(d, struct md_device, md_lu_dev);
+}
+
+static inline struct lu_device *md2lu_dev(struct md_device *d)
+{
+       return &d->md_lu_dev;
+}
+
+static inline struct md_object *lu2md(const struct lu_object *o)
+{
+       LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev));
+       return container_of0(o, struct md_object, mo_lu);
+}
+
+static inline struct md_object *md_object_next(const struct md_object *obj)
+{
+       return (obj ? lu2md(lu_object_next(&obj->mo_lu)) : NULL);
+}
+
+static inline struct md_device *md_obj2dev(const struct md_object *o)
+{
+       LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->mo_lu.lo_dev));
+       return container_of0(o->mo_lu.lo_dev, struct md_device, md_lu_dev);
+}
+
+static inline struct seq_server_site *lu_site2seq(const struct lu_site *s)
+{
+       return s->ld_seq_site;
+}
+
+static inline int md_device_init(struct md_device *md, struct lu_device_type *t)
+{
+       return lu_device_init(&md->md_lu_dev, t);
+}
+
+static inline void md_device_fini(struct md_device *md)
+{
+       lu_device_fini(&md->md_lu_dev);
+}
+
+static inline struct md_object *md_object_find_slice(const struct lu_env *env,
+                                                    struct md_device *md,
+                                                    const struct lu_fid *f)
+{
+       return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL));
+}
+
+
+/** md operations */
+static inline int mo_permission(const struct lu_env *env,
+                               struct md_object *p,
+                               struct md_object *c,
+                               struct md_attr *at,
+                               int mask)
+{
+       LASSERT(c->mo_ops->moo_permission);
+       return c->mo_ops->moo_permission(env, p, c, at, mask);
+}
+
+static inline int mo_attr_get(const struct lu_env *env,
+                             struct md_object *m,
+                             struct md_attr *at)
+{
+       LASSERT(m->mo_ops->moo_attr_get);
+       return m->mo_ops->moo_attr_get(env, m, at);
+}
+
+static inline int mo_readlink(const struct lu_env *env,
+                             struct md_object *m,
+                             struct lu_buf *buf)
+{
+       LASSERT(m->mo_ops->moo_readlink);
+       return m->mo_ops->moo_readlink(env, m, buf);
+}
+
+static inline int mo_changelog(const struct lu_env *env,
+                              enum changelog_rec_type type,
+                              int flags, struct md_object *m)
+{
+       LASSERT(m->mo_ops->moo_changelog);
+       return m->mo_ops->moo_changelog(env, type, flags, m);
+}
+
+static inline int mo_attr_set(const struct lu_env *env,
+                             struct md_object *m,
+                             const struct md_attr *at)
+{
+       LASSERT(m->mo_ops->moo_attr_set);
+       return m->mo_ops->moo_attr_set(env, m, at);
+}
+
+static inline int mo_xattr_get(const struct lu_env *env,
+                              struct md_object *m,
+                              struct lu_buf *buf,
+                              const char *name)
+{
+       LASSERT(m->mo_ops->moo_xattr_get);
+       return m->mo_ops->moo_xattr_get(env, m, buf, name);
+}
+
+static inline int mo_xattr_del(const struct lu_env *env,
+                              struct md_object *m,
+                              const char *name)
+{
+       LASSERT(m->mo_ops->moo_xattr_del);
+       return m->mo_ops->moo_xattr_del(env, m, name);
+}
+
+static inline int mo_xattr_set(const struct lu_env *env,
+                              struct md_object *m,
+                              const struct lu_buf *buf,
+                              const char *name,
+                              int flags)
+{
+       LASSERT(m->mo_ops->moo_xattr_set);
+       return m->mo_ops->moo_xattr_set(env, m, buf, name, flags);
+}
+
+static inline int mo_xattr_list(const struct lu_env *env,
+                               struct md_object *m,
+                               struct lu_buf *buf)
+{
+       LASSERT(m->mo_ops->moo_xattr_list);
+       return m->mo_ops->moo_xattr_list(env, m, buf);
+}
+
+static inline int mo_swap_layouts(const struct lu_env *env,
+                                 struct md_object *o1,
+                                 struct md_object *o2, __u64 flags)
+{
+       LASSERT(o1->mo_ops->moo_swap_layouts);
+       LASSERT(o2->mo_ops->moo_swap_layouts);
+       if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts)
+               return -EPERM;
+       return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags);
+}
+
+static inline int mo_open(const struct lu_env *env,
+                         struct md_object *m,
+                         int flags)
+{
+       LASSERT(m->mo_ops->moo_open);
+       return m->mo_ops->moo_open(env, m, flags);
+}
+
+static inline int mo_close(const struct lu_env *env,
+                          struct md_object *m,
+                          struct md_attr *ma,
+                          int mode)
+{
+       LASSERT(m->mo_ops->moo_close);
+       return m->mo_ops->moo_close(env, m, ma, mode);
+}
+
+static inline int mo_readpage(const struct lu_env *env,
+                             struct md_object *m,
+                             const struct lu_rdpg *rdpg)
+{
+       LASSERT(m->mo_ops->moo_readpage);
+       return m->mo_ops->moo_readpage(env, m, rdpg);
+}
+
+static inline int mo_object_create(const struct lu_env *env,
+                                  struct md_object *m,
+                                  const struct md_op_spec *spc,
+                                  struct md_attr *at)
+{
+       LASSERT(m->mo_ops->moo_object_create);
+       return m->mo_ops->moo_object_create(env, m, spc, at);
+}
+
+static inline int mo_ref_add(const struct lu_env *env,
+                            struct md_object *m,
+                            const struct md_attr *ma)
+{
+       LASSERT(m->mo_ops->moo_ref_add);
+       return m->mo_ops->moo_ref_add(env, m, ma);
+}
+
+static inline int mo_ref_del(const struct lu_env *env,
+                            struct md_object *m,
+                            struct md_attr *ma)
+{
+       LASSERT(m->mo_ops->moo_ref_del);
+       return m->mo_ops->moo_ref_del(env, m, ma);
+}
+
+static inline int mo_capa_get(const struct lu_env *env,
+                             struct md_object *m,
+                             struct lustre_capa *c,
+                             int renewal)
+{
+       LASSERT(m->mo_ops->moo_capa_get);
+       return m->mo_ops->moo_capa_get(env, m, c, renewal);
+}
+
+static inline int mo_object_sync(const struct lu_env *env, struct md_object *m)
+{
+       LASSERT(m->mo_ops->moo_object_sync);
+       return m->mo_ops->moo_object_sync(env, m);
+}
+
+static inline int mo_file_lock(const struct lu_env *env, struct md_object *m,
+                              struct lov_mds_md *lmm,
+                              struct ldlm_extent *extent,
+                              struct lustre_handle *lockh)
+{
+       LASSERT(m->mo_ops->moo_file_lock);
+       return m->mo_ops->moo_file_lock(env, m, lmm, extent, lockh);
+}
+
+static inline int mo_file_unlock(const struct lu_env *env, struct md_object *m,
+                                struct lov_mds_md *lmm,
+                                struct lustre_handle *lockh)
+{
+       LASSERT(m->mo_ops->moo_file_unlock);
+       return m->mo_ops->moo_file_unlock(env, m, lmm, lockh);
+}
+
+static inline int mo_object_lock(const struct lu_env *env,
+                                struct md_object *m,
+                                struct lustre_handle *lh,
+                                struct ldlm_enqueue_info *einfo,
+                                void *policy)
+{
+       LASSERT(m->mo_ops->moo_object_lock);
+       return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy);
+}
+
+static inline int mdo_lookup(const struct lu_env *env,
+                            struct md_object *p,
+                            const struct lu_name *lname,
+                            struct lu_fid *f,
+                            struct md_op_spec *spec)
+{
+       LASSERT(p->mo_dir_ops->mdo_lookup);
+       return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec);
+}
+
+static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env,
+                                      struct md_object *mo,
+                                      mdl_mode_t lm)
+{
+       if (mo->mo_dir_ops->mdo_lock_mode == NULL)
+               return MDL_MINMODE;
+       return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm);
+}
+
+static inline int mdo_create(const struct lu_env *env,
+                            struct md_object *p,
+                            const struct lu_name *lchild_name,
+                            struct md_object *c,
+                            struct md_op_spec *spc,
+                            struct md_attr *at)
+{
+       LASSERT(p->mo_dir_ops->mdo_create);
+       return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at);
+}
+
+static inline int mdo_create_data(const struct lu_env *env,
+                                 struct md_object *p,
+                                 struct md_object *c,
+                                 const struct md_op_spec *spec,
+                                 struct md_attr *ma)
+{
+       LASSERT(c->mo_dir_ops->mdo_create_data);
+       return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma);
+}
+
+static inline int mdo_rename(const struct lu_env *env,
+                            struct md_object *sp,
+                            struct md_object *tp,
+                            const struct lu_fid *lf,
+                            const struct lu_name *lsname,
+                            struct md_object *t,
+                            const struct lu_name *ltname,
+                            struct md_attr *ma)
+{
+       LASSERT(tp->mo_dir_ops->mdo_rename);
+       return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname,
+                                         ma);
+}
+
+static inline int mdo_is_subdir(const struct lu_env *env,
+                               struct md_object *mo,
+                               const struct lu_fid *fid,
+                               struct lu_fid *sfid)
+{
+       LASSERT(mo->mo_dir_ops->mdo_is_subdir);
+       return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid);
+}
+
+static inline int mdo_link(const struct lu_env *env,
+                          struct md_object *p,
+                          struct md_object *s,
+                          const struct lu_name *lname,
+                          struct md_attr *ma)
+{
+       LASSERT(s->mo_dir_ops->mdo_link);
+       return s->mo_dir_ops->mdo_link(env, p, s, lname, ma);
+}
+
+static inline int mdo_unlink(const struct lu_env *env,
+                            struct md_object *p,
+                            struct md_object *c,
+                            const struct lu_name *lname,
+                            struct md_attr *ma, int no_name)
+{
+       LASSERT(p->mo_dir_ops->mdo_unlink);
+       return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name);
+}
+
+static inline int mdo_lum_lmm_cmp(const struct lu_env *env,
+                                 struct md_object *c,
+                                 const struct md_op_spec *spec,
+                                 struct md_attr *ma)
+{
+       LASSERT(c->mo_dir_ops->mdo_lum_lmm_cmp);
+       return c->mo_dir_ops->mdo_lum_lmm_cmp(env, c, spec, ma);
+}
+
+static inline int mdo_name_insert(const struct lu_env *env,
+                                 struct md_object *p,
+                                 const struct lu_name *lname,
+                                 const struct lu_fid *f,
+                                 const struct md_attr *ma)
+{
+       LASSERT(p->mo_dir_ops->mdo_name_insert);
+       return p->mo_dir_ops->mdo_name_insert(env, p, lname, f, ma);
+}
+
+static inline int mdo_name_remove(const struct lu_env *env,
+                                 struct md_object *p,
+                                 const struct lu_name *lname,
+                                 const struct md_attr *ma)
+{
+       LASSERT(p->mo_dir_ops->mdo_name_remove);
+       return p->mo_dir_ops->mdo_name_remove(env, p, lname, ma);
+}
+
+static inline int mdo_rename_tgt(const struct lu_env *env,
+                                struct md_object *p,
+                                struct md_object *t,
+                                const struct lu_fid *lf,
+                                const struct lu_name *lname,
+                                struct md_attr *ma)
+{
+       if (t) {
+               LASSERT(t->mo_dir_ops->mdo_rename_tgt);
+               return t->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+       } else {
+               LASSERT(p->mo_dir_ops->mdo_rename_tgt);
+               return p->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+       }
+}
+
+/**
+ * Used in MDD/OUT layer for object lock rule
+ **/
+enum mdd_object_role {
+       MOR_SRC_PARENT,
+       MOR_SRC_CHILD,
+       MOR_TGT_PARENT,
+       MOR_TGT_CHILD,
+       MOR_TGT_ORPHAN
+};
+
+struct dt_device;
+/**
+ * Structure to hold object information. This is used to create object
+ * \pre llod_dir exist
+ */
+struct lu_local_obj_desc {
+       const char                    *llod_dir;
+       const char                    *llod_name;
+       __u32                       llod_oid;
+       int                           llod_is_index;
+       const struct dt_index_features  *llod_feat;
+       struct list_head                       llod_linkage;
+};
+
+struct md_object *llo_store_resolve(const struct lu_env *env,
+                                   struct md_device *md,
+                                   struct dt_device *dt,
+                                   const char *path,
+                                   struct lu_fid *fid);
+
+struct md_object *llo_store_open(const struct lu_env *env,
+                                struct md_device *md,
+                                struct dt_device *dt,
+                                const char *dirname,
+                                const char *objname,
+                                struct lu_fid *fid);
+
+struct md_object *llo_store_create_index(const struct lu_env *env,
+                                        struct md_device *md,
+                                        struct dt_device *dt,
+                                        const char *dirname,
+                                        const char *objname,
+                                        const struct lu_fid *fid,
+                                        const struct dt_index_features *feat);
+
+struct md_object *llo_store_create(const struct lu_env *env,
+                                  struct md_device *md,
+                                  struct dt_device *dt,
+                                  const char *dirname,
+                                  const char *objname,
+                                  const struct lu_fid *fid);
+
+void llo_local_obj_register(struct lu_local_obj_desc *);
+void llo_local_obj_unregister(struct lu_local_obj_desc *);
+
+int llo_local_objects_setup(const struct lu_env *env,
+                            struct md_device * md,
+                            struct dt_device * dt);
+
+int llo_global_init(void);
+void llo_global_fini(void);
+
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd);
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh);
+void lustre_hsm2buf(void *buf, struct md_hsm *mh);
+
+struct lu_ucred {
+       __u32          uc_valid;
+       __u32          uc_o_uid;
+       __u32          uc_o_gid;
+       __u32          uc_o_fsuid;
+       __u32          uc_o_fsgid;
+       __u32          uc_uid;
+       __u32          uc_gid;
+       __u32          uc_fsuid;
+       __u32          uc_fsgid;
+       __u32          uc_suppgids[2];
+       cfs_cap_t          uc_cap;
+       __u32          uc_umask;
+       group_info_t   *uc_ginfo;
+       struct md_identity *uc_identity;
+};
+
+struct lu_ucred *lu_ucred(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_check(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env);
+
+int lu_ucred_global_init(void);
+
+void lu_ucred_global_fini(void);
+
+#define md_cap_t(x) (x)
+
+#define MD_CAP_TO_MASK(x) (1 << (x))
+
+#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag))
+
+/* capable() is copied from linux kernel! */
+static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap)
+{
+       if (md_cap_raised(uc->uc_cap, cap))
+               return 1;
+       return 0;
+}
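
MD_CAP_TO_MASK() and md_cap_raised() are single-bit tests over the cfs_cap_t word; a tiny sketch follows (the capability numbers below are illustrative, the real ones are the CFS_CAP_* constants from libcfs):

#include <stdio.h>
#include <stdint.h>

#define DEMO_CAP_TO_MASK(x)       (1 << (x))
#define demo_cap_raised(c, flag)  ((c) & DEMO_CAP_TO_MASK(flag))

/* Illustrative capability numbers, standing in for CFS_CAP_CHOWN etc. */
enum { DEMO_CAP_CHOWN = 0, DEMO_CAP_SYS_RESOURCE = 24 };

int main(void)
{
	uint32_t cap = DEMO_CAP_TO_MASK(DEMO_CAP_CHOWN);

	printf("chown allowed: %d\n",
	       demo_cap_raised(cap, DEMO_CAP_CHOWN) != 0);
	printf("sys_resource allowed: %d\n",
	       demo_cap_raised(cap, DEMO_CAP_SYS_RESOURCE) != 0);
	return 0;
}
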
+
+/** @} md */
+#endif /* _LUSTRE_MD_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h
new file mode 100644 (file)
index 0000000..dade2fd
--- /dev/null
@@ -0,0 +1,1683 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_H
+#define __OBD_H
+
+#include <linux/obd.h>
+
+#define IOC_OSC_TYPE    'h'
+#define IOC_OSC_MIN_NR       20
+#define IOC_OSC_SET_ACTIVE   _IOWR(IOC_OSC_TYPE, 21, struct obd_device *)
+#define IOC_OSC_MAX_NR       50
+
+#define IOC_MDC_TYPE    'i'
+#define IOC_MDC_MIN_NR       20
+#define IOC_MDC_MAX_NR       50
+
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+#include <lustre_lib.h>
+#include <lustre_export.h>
+#include <lustre_fld.h>
+#include <lustre_capa.h>
+
+#include <linux/libcfs/bitmap.h>
+
+
+#define MAX_OBD_DEVICES 8192
+
+struct osc_async_rc {
+       int     ar_rc;
+       int     ar_force_sync;
+       __u64   ar_min_xid;
+};
+
+struct lov_oinfo {              /* per-stripe data structure */
+       struct ost_id   loi_oi;    /* object ID/Sequence on the target OST */
+       int loi_ost_idx;           /* OST stripe index in lov_tgt_desc->tgts */
+       int loi_ost_gen;           /* generation of this loi_ost_idx */
+
+       unsigned long loi_kms_valid:1;
+       __u64 loi_kms;       /* known minimum size */
+       struct ost_lvb loi_lvb;
+       struct osc_async_rc     loi_ar;
+};
+
+static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms)
+{
+       oinfo->loi_kms = kms;
+       oinfo->loi_kms_valid = 1;
+}
+
+static inline void loi_init(struct lov_oinfo *loi)
+{
+}
+
+struct lov_stripe_md {
+       atomic_t     lsm_refc;
+       spinlock_t      lsm_lock;
+       pid_t       lsm_lock_owner; /* debugging */
+
+       /* maximum possible file size, might change as OSTs status changes,
+        * e.g. disconnected, deactivated */
+       __u64       lsm_maxbytes;
+       struct {
+               /* Public members. */
+               struct ost_id lw_object_oi; /* lov object id/seq */
+
+               /* LOV-private members start here -- only for use in lov/. */
+               __u32 lw_magic;
+               __u32 lw_stripe_size;      /* size of the stripe */
+               __u32 lw_pattern;         /* striping pattern (RAID0, RAID1) */
+               __u16 lw_stripe_count;  /* number of objects being striped over */
+               __u16 lw_layout_gen;       /* generation of the layout */
+               char  lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+       } lsm_wire;
+
+       struct lov_oinfo *lsm_oinfo[0];
+};
+
+#define lsm_oi          lsm_wire.lw_object_oi
+#define lsm_magic      lsm_wire.lw_magic
+#define lsm_layout_gen   lsm_wire.lw_layout_gen
+#define lsm_stripe_size  lsm_wire.lw_stripe_size
+#define lsm_pattern      lsm_wire.lw_pattern
+#define lsm_stripe_count lsm_wire.lw_stripe_count
+#define lsm_pool_name    lsm_wire.lw_pool_name
+
+struct obd_info;
+
+typedef int (*obd_enqueue_update_f)(void *cookie, int rc);
+
+/* obd info for a particular level (lov, osc). */
+struct obd_info {
+       /* Lock policy. It keeps an extent which is specific to a particular
+        * OSC (e.g. lov_prep_enqueue_set initialises the extent of the policy,
+        * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue). */
+       ldlm_policy_data_t      oi_policy;
+       /* Flags used to set request-specific flags:
+          - during lock handling, the flags obtained on the enqueue
+            request are set here;
+          - during stats, the flags used to control delay/resend;
+          - during setattr, the flags used to distinguish the punch operation.
+        */
+       __u64              oi_flags;
+       /* Lock handle specific for every OSC lock. */
+       struct lustre_handle   *oi_lockh;
+       /* lsm data specific for every OSC. */
+       struct lov_stripe_md   *oi_md;
+       /* obdo data specific for every OSC, if needed at all. */
+       struct obdo         *oi_oa;
+       /* statfs data specific for every OSC, if needed at all. */
+       struct obd_statfs      *oi_osfs;
+       /* An update callback which is called to update some data at the upper
+        * level. E.g. it is used to update lsm->lsm_oinfo at every received
+        * request at the osc level for enqueue requests. It is also possible
+        * to update some caller data from the LOV layer if needed. */
+       obd_enqueue_update_f    oi_cb_up;
+       /* OSS capability; its type is obd_capa on the client to avoid a copy,
+        * whereas on the OSS its type is lustre_capa. */
+       void               *oi_capa;
+       /* transfer jobid from ost_sync() to filter_sync()... */
+       char               *oi_jobid;
+};
+
+/* compare all relevant fields. */
+static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
+                                   struct lov_stripe_md *m2)
+{
+       /*
+        * ->lsm_wire contains padding, but it should be zeroed out during
+        * allocation.
+        */
+       return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof m1->lsm_wire);
+}
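
lov_stripe_md_cmp() relies on two things visible above: the wire-visible fields are grouped into lsm_wire so a single memcmp() covers them all, and the #define aliases keep the familiar lsm_* names working; the comment also notes that any padding inside lsm_wire must be zeroed at allocation time for the memcmp() to be meaningful. A simplified sketch of that layout, with hypothetical field names and magic value:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Simplified version of the lsm_wire grouping: wire-visible fields live in
 * one sub-struct so they can be compared with a single memcmp(), while
 * aliases keep the usual field names working. */
struct demo_md {
	int refc;			/* not part of the comparison */
	struct {
		uint32_t lw_magic;
		uint32_t lw_stripe_size;
		uint16_t lw_stripe_count;
		/* 2 bytes of padding here -- must be zeroed for memcmp() */
	} wire;
};

#define dm_magic	wire.lw_magic
#define dm_stripe_size	wire.lw_stripe_size
#define dm_stripe_count	wire.lw_stripe_count

int main(void)
{
	struct demo_md a, b;

	memset(&a, 0, sizeof(a));	/* zeroing covers the padding bytes */
	memset(&b, 0, sizeof(b));
	a.dm_magic = b.dm_magic = 0x0BD10BD0;	/* illustrative magic */
	a.dm_stripe_size = 1048576;
	b.dm_stripe_size = 1048576;
	a.dm_stripe_count = 2;
	b.dm_stripe_count = 4;

	printf("equal: %d\n", memcmp(&a.wire, &b.wire, sizeof(a.wire)) == 0);
	return 0;
}
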
+
+static inline int lov_lum_lsm_cmp(struct lov_user_md *lum,
+                                 struct lov_stripe_md  *lsm)
+{
+       if (lsm->lsm_magic != lum->lmm_magic)
+               return 1;
+       if ((lsm->lsm_stripe_count != 0) && (lum->lmm_stripe_count != 0) &&
+           (lsm->lsm_stripe_count != lum->lmm_stripe_count))
+               return 2;
+       if ((lsm->lsm_stripe_size != 0) && (lum->lmm_stripe_size != 0) &&
+           (lsm->lsm_stripe_size != lum->lmm_stripe_size))
+               return 3;
+       if ((lsm->lsm_pattern != 0) && (lum->lmm_pattern != 0) &&
+           (lsm->lsm_pattern != lum->lmm_pattern))
+               return 4;
+       if ((lsm->lsm_magic == LOV_MAGIC_V3) &&
+           (strncmp(lsm->lsm_pool_name,
+                    ((struct lov_user_md_v3 *)lum)->lmm_pool_name,
+                    LOV_MAXPOOLNAME) != 0))
+               return 5;
+       return 0;
+}
+
+static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3,
+                                        int *lmm_magic,
+                                        struct lov_user_md *lum)
+{
+       if (lum && copy_from_user(lumv3, lum, sizeof(struct lov_user_md_v1)))
+               return -EFAULT;
+
+       *lmm_magic = lumv3->lmm_magic;
+
+       if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
+               lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3);
+               *lmm_magic = LOV_USER_MAGIC_V1;
+       } else if (*lmm_magic == LOV_USER_MAGIC_V3) {
+               if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+                       return -EFAULT;
+       } else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
+               if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+                       return -EFAULT;
+               lustre_swab_lov_user_md_v3(lumv3);
+               *lmm_magic = LOV_USER_MAGIC_V3;
+       } else if (*lmm_magic != LOV_USER_MAGIC_V1) {
+               CDEBUG(D_IOCTL,
+                      "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+                      *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3);
+               return -EINVAL;
+       }
+       return 0;
+}
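
lov_lum_swab_if_needed() recognises a byte-swapped user buffer by comparing the stored magic against __swab32() of the expected constants. A self-contained sketch of that detection, with a local byte-swap helper in place of the kernel's __swab32() and an illustrative magic value:

#include <stdio.h>
#include <stdint.h>

#define DEMO_MAGIC_V1 0x0BD10BD0U	/* illustrative, not LOV_USER_MAGIC_V1 */

static uint32_t swab32(uint32_t v)
{
	return ((v & 0x000000ffU) << 24) | ((v & 0x0000ff00U) << 8) |
	       ((v & 0x00ff0000U) >> 8)  | ((v & 0xff000000U) >> 24);
}

/* Decide whether a buffer written by a peer needs byte swapping. */
static int magic_needs_swab(uint32_t stored_magic, int *swabbed)
{
	if (stored_magic == DEMO_MAGIC_V1) {
		*swabbed = 0;
		return 0;
	}
	if (stored_magic == swab32(DEMO_MAGIC_V1)) {
		*swabbed = 1;		/* peer had the opposite endianness */
		return 0;
	}
	return -1;			/* corrupt or unknown layout magic */
}

int main(void)
{
	int swabbed;

	if (magic_needs_swab(swab32(DEMO_MAGIC_V1), &swabbed) == 0)
		printf("recognised magic, swab needed: %d\n", swabbed);
	return 0;
}
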
+
+void lov_stripe_lock(struct lov_stripe_md *md);
+void lov_stripe_unlock(struct lov_stripe_md *md);
+
+struct obd_type {
+       struct list_head typ_chain;
+       struct obd_ops *typ_dt_ops;
+       struct md_ops *typ_md_ops;
+       proc_dir_entry_t *typ_procroot;
+       char *typ_name;
+       int  typ_refcnt;
+       struct lu_device_type *typ_lu;
+       spinlock_t obd_type_lock;
+};
+
+struct brw_page {
+       obd_off  off;
+       struct page *pg;
+       int count;
+       obd_flag flag;
+};
+
+/* Individual type definitions */
+
+struct ost_server_data;
+
+struct osd_properties {
+       size_t osd_max_ea_size;
+};
+
+#define OBT_MAGIC       0xBDDECEAE
+/* hold common fields for "target" device */
+struct obd_device_target {
+       __u32                obt_magic;
+       __u32                obt_instance;
+       struct super_block       *obt_sb;
+       /** last_rcvd file */
+       struct file           *obt_rcvd_filp;
+       __u64                obt_mount_count;
+       struct rw_semaphore       obt_rwsem;
+       struct vfsmount   *obt_vfsmnt;
+       struct file           *obt_health_check_filp;
+       struct osd_properties     obt_osd_properties;
+       struct obd_job_stats      obt_jobstats;
+};
+
+/* llog contexts */
+enum llog_ctxt_id {
+       LLOG_CONFIG_ORIG_CTXT  =  0,
+       LLOG_CONFIG_REPL_CTXT,
+       LLOG_MDS_OST_ORIG_CTXT,
+       LLOG_MDS_OST_REPL_CTXT,
+       LLOG_SIZE_ORIG_CTXT,
+       LLOG_SIZE_REPL_CTXT,
+       LLOG_RD1_ORIG_CTXT,
+       LLOG_RD1_REPL_CTXT,
+       LLOG_TEST_ORIG_CTXT,
+       LLOG_TEST_REPL_CTXT,
+       LLOG_LOVEA_ORIG_CTXT,
+       LLOG_LOVEA_REPL_CTXT,
+       LLOG_CHANGELOG_ORIG_CTXT,      /**< changelog generation on mdd */
+       LLOG_CHANGELOG_REPL_CTXT,      /**< changelog access on clients */
+       LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */
+       LLOG_MAX_CTXTS
+};
+
+#define FILTER_SUBDIR_COUNT      32        /* set to zero for no subdirs */
+
+struct filter_subdirs {
+       struct dentry *dentry[FILTER_SUBDIR_COUNT];
+};
+
+
+struct filter_ext {
+       __u64           fe_start;
+       __u64           fe_end;
+};
+
+struct filter_obd {
+       /* NB this field MUST be first */
+       struct obd_device_target fo_obt;
+       const char              *fo_fstype;
+
+       int                     fo_group_count;
+       struct dentry           *fo_dentry_O;
+       struct dentry           **fo_dentry_O_groups;
+       struct filter_subdirs   *fo_dentry_O_sub;
+       struct mutex            fo_init_lock;   /* group initialization lock*/
+       int                     fo_committed_group;
+
+       spinlock_t              fo_objidlock;   /* protect fo_lastobjid */
+
+       unsigned long           fo_destroys_in_progress;
+       struct mutex            fo_create_locks[FILTER_SUBDIR_COUNT];
+
+       struct list_head fo_export_list;
+       int               fo_subdir_count;
+
+       obd_size             fo_tot_dirty;      /* protected by obd_osfs_lock */
+       obd_size             fo_tot_granted;    /* all values in bytes */
+       obd_size             fo_tot_pending;
+       int               fo_tot_granted_clients;
+
+       obd_size             fo_readcache_max_filesize;
+       spinlock_t              fo_flags_lock;
+       unsigned int     fo_read_cache:1,   /**< enable read-only cache */
+                            fo_writethrough_cache:1,/**< read cache writes */
+                            fo_mds_ost_sync:1, /**< MDS-OST orphan recovery*/
+                            fo_raid_degraded:1;/**< RAID device degraded */
+
+       struct obd_import   *fo_mdc_imp;
+       struct obd_uuid      fo_mdc_uuid;
+       struct lustre_handle fo_mdc_conn;
+       struct file     **fo_last_objid_files;
+       __u64          *fo_last_objids; /* last created objid for groups,
+                                             * protected by fo_objidlock */
+
+       struct mutex            fo_alloc_lock;
+
+       atomic_t         fo_r_in_flight;
+       atomic_t         fo_w_in_flight;
+
+       /*
+        * per-filter pool of kiobuf's allocated by filter_common_setup() and
+        * torn down by filter_cleanup().
+        *
+        * This pool contains kiobuf used by
+        * filter_{prep,commit}rw_{read,write}() and is shared by all OST
+        * threads.
+        *
+        * Locking: protected by internal lock of cfs_hash, pool can be
+        * found from this hash table by t_id of ptlrpc_thread.
+        */
+       struct cfs_hash         *fo_iobuf_hash;
+
+       struct brw_stats         fo_filter_stats;
+
+       int                   fo_fmd_max_num; /* per exp filter_mod_data */
+       int                   fo_fmd_max_age; /* jiffies to fmd expiry */
+       unsigned long       fo_syncjournal:1, /* sync journal on writes */
+                                fo_sync_lock_cancel:2;/* sync on lock cancel */
+
+
+       /* sptlrpc stuff */
+       rwlock_t                fo_sptlrpc_lock;
+       struct sptlrpc_rule_set  fo_sptlrpc_rset;
+
+       /* capability related */
+       unsigned int         fo_fl_oss_capa;
+       struct list_head               fo_capa_keys;
+       struct hlist_head       *fo_capa_hash;
+       int                   fo_sec_level;
+};
+
+struct timeout_item {
+       enum timeout_event ti_event;
+       cfs_time_t       ti_timeout;
+       timeout_cb_t       ti_cb;
+       void          *ti_cb_data;
+       struct list_head         ti_obd_list;
+       struct list_head         ti_chain;
+};
+
+#define OSC_MAX_RIF_DEFAULT       8
+#define MDS_OSC_MAX_RIF_DEFAULT   50
+#define OSC_MAX_RIF_MAX         256
+#define OSC_MAX_DIRTY_DEFAULT  (OSC_MAX_RIF_DEFAULT * 4)
+#define OSC_MAX_DIRTY_MB_MAX   2048     /* arbitrary, but < MAX_LONG bytes */
+#define OSC_DEFAULT_RESENDS      10
+
+/* possible values for fo_sync_lock_cancel */
+enum {
+       NEVER_SYNC_ON_CANCEL = 0,
+       BLOCKING_SYNC_ON_CANCEL = 1,
+       ALWAYS_SYNC_ON_CANCEL = 2,
+       NUM_SYNC_ON_CANCEL_STATES
+};
+
+#define MDC_MAX_RIF_DEFAULT       8
+#define MDC_MAX_RIF_MAX         512
+
+struct mdc_rpc_lock;
+struct obd_import;
+struct client_obd {
+       struct rw_semaphore  cl_sem;
+       struct obd_uuid   cl_target_uuid;
+       struct obd_import       *cl_import; /* ptlrpc connection state */
+       int                   cl_conn_count;
+       /* max_mds_easize is purely a performance thing so we don't have to
+        * call obd_size_diskmd() all the time. */
+       int                   cl_default_mds_easize;
+       int                   cl_max_mds_easize;
+       int                   cl_max_mds_cookiesize;
+
+       enum lustre_sec_part     cl_sp_me;
+       enum lustre_sec_part     cl_sp_to;
+       struct sptlrpc_flavor    cl_flvr_mgc;   /* fixed flavor of mgc->mgs */
+
+       /* the grant values are protected by loi_list_lock below */
+       long                 cl_dirty;   /* all _dirty_ in bytes */
+       long                 cl_dirty_max;     /* allowed w/o rpc */
+       long                 cl_dirty_transit; /* dirty synchronous */
+       long                 cl_avail_grant;   /* bytes of credit for ost */
+       long                 cl_lost_grant;    /* lost credits (trunc) */
+
+       /* since we allocate grant by blocks, we don't know how much grant will
+        * be used to add a page into cache. As a solution, we reserve maximum
+        * grant before trying to dirty a page and unreserve the rest.
+        * See osc_{reserve|unreserve}_grant for details. */
+       long             cl_reserved_grant;
+       struct list_head           cl_cache_waiters; /* waiting for cache/grant */
+       cfs_time_t         cl_next_shrink_grant;   /* jiffies */
+       struct list_head           cl_grant_shrink_list;  /* Timeout event list */
+       int               cl_grant_shrink_interval; /* seconds */
+
+       /* A chunk is an optimal size used by osc_extent to determine
+        * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */
+       int               cl_chunkbits;
+       int               cl_chunk;
+       int               cl_extent_tax; /* extent overhead, by bytes */
+
+       /* keep track of objects that have lois that contain pages which
+        * have been queued for async brw.  this lock also protects the
+        * lists of osc_client_pages that hang off of the loi */
+       /*
+        * ->cl_loi_list_lock protects consistency of
+        * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and
+        * ->ap_completion() call-backs are executed under this lock. As we
+        * cannot guarantee that these call-backs never block on all platforms
+        * (as a matter of fact they do block on Mac OS X), type of
+        * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux
+        * and blocking mutex on Mac OS X. (Alternative is to make this lock
+        * blocking everywhere, but we don't want to slow down fast-path of
+        * our main platform.)
+        *
+        * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together
+        * with client_obd_list_{un,}lock() and
+        * client_obd_list_lock_{init,done}() functions.
+        *
+        * NB by Jinshan: though the field names are still _loi_, the lists
+        * actually contain osc_object{}s.
+        */
+       client_obd_lock_t       cl_loi_list_lock;
+       struct list_head               cl_loi_ready_list;
+       struct list_head               cl_loi_hp_ready_list;
+       struct list_head               cl_loi_write_list;
+       struct list_head               cl_loi_read_list;
+       int                   cl_r_in_flight;
+       int                   cl_w_in_flight;
+       /* just a sum of the loi/lop pending numbers to be exported by /proc */
+       atomic_t             cl_pending_w_pages;
+       atomic_t             cl_pending_r_pages;
+       __u32                    cl_max_pages_per_rpc;
+       int                   cl_max_rpcs_in_flight;
+       struct obd_histogram     cl_read_rpc_hist;
+       struct obd_histogram     cl_write_rpc_hist;
+       struct obd_histogram     cl_read_page_hist;
+       struct obd_histogram     cl_write_page_hist;
+       struct obd_histogram     cl_read_offset_hist;
+       struct obd_histogram     cl_write_offset_hist;
+
+       /* lru for osc caching pages */
+       struct cl_client_cache  *cl_cache;
+       struct list_head                 cl_lru_osc; /* member of cl_cache->ccc_lru */
+       atomic_t                *cl_lru_left;
+       atomic_t                 cl_lru_busy;
+       atomic_t                 cl_lru_shrinkers;
+       atomic_t                 cl_lru_in_list;
+       struct list_head                 cl_lru_list; /* lru page list */
+       client_obd_lock_t        cl_lru_list_lock; /* page list protector */
+
+       /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */
+       atomic_t             cl_destroy_in_flight;
+       wait_queue_head_t             cl_destroy_waitq;
+
+       struct mdc_rpc_lock     *cl_rpc_lock;
+       struct mdc_rpc_lock     *cl_close_lock;
+
+       /* mgc datastruct */
+       struct semaphore         cl_mgc_sem;
+       struct vfsmount  *cl_mgc_vfsmnt;
+       struct dentry      *cl_mgc_configs_dir;
+       atomic_t             cl_mgc_refcount;
+       struct obd_export       *cl_mgc_mgsexp;
+
+       /* checksumming for data sent over the network */
+       unsigned int         cl_checksum:1; /* 0 = disabled, 1 = enabled */
+       /* supported checksum types that are worked out at connect time */
+       __u32               cl_supp_cksum_types;
+       /* checksum algorithm to be used */
+       cksum_type_t         cl_cksum_type;
+
+       /* also protected by the poorly named _loi_list_lock lock above */
+       struct osc_async_rc      cl_ar;
+
+       /* used by quotacheck when the servers are older than 2.4 */
+       int                   cl_qchk_stat; /* quotacheck stat of the peer */
+#define CL_NOT_QUOTACHECKED 1   /* client->cl_qchk_stat init value */
+#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0)
+#warning "please consider removing quotacheck compatibility code"
+#endif
+
+       /* sequence manager */
+       struct lu_client_seq    *cl_seq;
+
+       atomic_t             cl_resends; /* resend count */
+
+       /* ptlrpc work for writeback in ptlrpcd context */
+       void                *cl_writeback_work;
+       /* hash tables for osc_quota_info */
+       cfs_hash_t            *cl_quota_hash[MAXQUOTAS];
+};
+#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
+
+struct obd_id_info {
+       __u32   idx;
+       obd_id  *data;
+};
+
+/* */
+
+struct echo_obd {
+       struct obd_device_target eo_obt;
+       struct obdo             eo_oa;
+       spinlock_t               eo_lock;
+       __u64                    eo_lastino;
+       struct lustre_handle    eo_nl_lock;
+       atomic_t                eo_prep;
+};
+
+struct ost_obd {
+       struct ptlrpc_service   *ost_service;
+       struct ptlrpc_service   *ost_create_service;
+       struct ptlrpc_service   *ost_io_service;
+       struct ptlrpc_service   *ost_seq_service;
+       struct mutex            ost_health_mutex;
+};
+
+struct echo_client_obd {
+       struct obd_export       *ec_exp;   /* the local connection to osc/lov */
+       spinlock_t              ec_lock;
+       struct list_head           ec_objects;
+       struct list_head           ec_locks;
+       int               ec_nstripes;
+       __u64           ec_unique;
+};
+
+struct lov_qos_oss {
+       struct obd_uuid     lqo_uuid;       /* ptlrpc's c_remote_uuid */
+       struct list_head          lqo_oss_list;   /* link to lov_qos */
+       __u64          lqo_bavail;     /* total bytes avail on OSS */
+       __u64          lqo_penalty;    /* current penalty */
+       __u64          lqo_penalty_per_obj;/* penalty decrease every obj*/
+       time_t        lqo_used;       /* last used time, seconds */
+       __u32          lqo_ost_count;  /* number of osts on this oss */
+};
+
+struct ltd_qos {
+       struct lov_qos_oss *ltq_oss;     /* oss info */
+       __u64          ltq_penalty;     /* current penalty */
+       __u64          ltq_penalty_per_obj; /* penalty decrease every obj*/
+       __u64          ltq_weight;      /* net weighting */
+       time_t        ltq_used; /* last used time, seconds */
+       unsigned int    ltq_usable:1;    /* usable for striping */
+};
+
+/* Generic subset of OSTs */
+struct ost_pool {
+       __u32         *op_array;      /* array of indices into
+                                                  lov_obd->lov_tgts */
+       unsigned int    op_count;      /* number of OSTs in the array */
+       unsigned int    op_size;       /* allocated size of op_array */
+       struct rw_semaphore op_rw_sem;     /* to protect ost_pool use */
+};
+
+/* Round-robin allocator data */
+struct lov_qos_rr {
+       __u32          lqr_start_idx;   /* start index of new inode */
+       __u32          lqr_offset_idx;  /* aliasing for start_idx  */
+       int              lqr_start_count; /* reseed counter */
+       struct ost_pool     lqr_pool;   /* round-robin optimized list */
+       unsigned long       lqr_dirty:1;     /* recalc round-robin list */
+};
+
+/* allow statfs data caching for 1 second */
+#define OBD_STATFS_CACHE_SECONDS 1
+
+struct lov_statfs_data {
+       struct obd_info   lsd_oi;
+       struct obd_statfs lsd_statfs;
+};
+/* Stripe placement optimization */
+struct lov_qos {
+       struct list_head          lq_oss_list; /* list of OSSs that targets use */
+       struct rw_semaphore lq_rw_sem;
+       __u32          lq_active_oss_count;
+       unsigned int    lq_prio_free;   /* priority for free space */
+       unsigned int    lq_threshold_rr;/* priority for rr */
+       struct lov_qos_rr   lq_rr;        /* round robin qos data */
+       unsigned long       lq_dirty:1,     /* recalc qos data */
+                           lq_same_space:1,/* the OSTs all have approx.
+                                              the same space avail */
+                           lq_reset:1,     /* zero current penalties */
+                           lq_statfs_in_progress:1; /* statfs op in
+                                                       progress */
+       /* qos statfs data */
+       struct lov_statfs_data *lq_statfs_data;
+       wait_queue_head_t        lq_statfs_waitq; /* waitqueue to notify statfs
+                                             * requests completion */
+};
+
+struct lov_tgt_desc {
+       struct list_head          ltd_kill;
+       struct obd_uuid     ltd_uuid;
+       struct obd_device  *ltd_obd;
+       struct obd_export  *ltd_exp;
+       struct ltd_qos      ltd_qos;     /* qos info per target */
+       __u32          ltd_gen;
+       __u32          ltd_index;   /* index in lov_obd->tgts */
+       unsigned long       ltd_active:1,/* is this target up for requests */
+                           ltd_activate:1,/* should target be activated */
+                           ltd_reap:1;  /* should this target be deleted */
+};
+
+/* Pool metadata */
+#define pool_tgt_size(_p)   _p->pool_obds.op_size
+#define pool_tgt_count(_p)  _p->pool_obds.op_count
+#define pool_tgt_array(_p)  _p->pool_obds.op_array
+#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem
+
+struct pool_desc {
+       char              pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */
+       struct ost_pool       pool_obds;              /* pool members */
+       atomic_t          pool_refcount;          /* pool ref. counter */
+       struct lov_qos_rr     pool_rr;          /* round robin qos */
+       struct hlist_node      pool_hash;             /* access by poolname */
+       struct list_head            pool_list;        /* serial access */
+       proc_dir_entry_t *pool_proc_entry;      /* file in /proc */
+       struct obd_device    *pool_lobd;              /* obd of the lov/lod to which
+                                                      * this pool belongs */
+};
+
+struct lov_obd {
+       struct lov_desc  desc;
+       struct lov_tgt_desc   **lov_tgts;             /* sparse array */
+       struct ost_pool  lov_packed;        /* all OSTs in a packed
+                                                         array */
+       struct mutex            lov_lock;
+       struct obd_connect_data lov_ocd;
+       atomic_t            lov_refcount;
+       __u32              lov_tgt_count;        /* how many OBD's */
+       __u32              lov_active_tgt_count;  /* how many active */
+       __u32              lov_death_row;/* tgts scheduled to be deleted */
+       __u32              lov_tgt_size;   /* size of tgts array */
+       int                  lov_connects;
+       int                  lov_pool_count;
+       cfs_hash_t           *lov_pools_hash_body; /* used for key access */
+       struct list_head              lov_pool_list; /* used for sequential access */
+       proc_dir_entry_t   *lov_pool_proc_entry;
+       enum lustre_sec_part    lov_sp_me;
+
+       /* Cached LRU and unstable data from upper layer */
+       void                   *lov_cache;
+
+       struct rw_semaphore     lov_notify_lock;
+};
+
+struct lmv_tgt_desc {
+       struct obd_uuid         ltd_uuid;
+       struct obd_export       *ltd_exp;
+       int                     ltd_idx;
+       struct mutex            ltd_fid_mutex;
+       unsigned long           ltd_active:1; /* target up for requests */
+};
+
+enum placement_policy {
+       PLACEMENT_CHAR_POLICY   = 0,
+       PLACEMENT_NID_POLICY    = 1,
+       PLACEMENT_INVAL_POLICY  = 2,
+       PLACEMENT_MAX_POLICY
+};
+
+typedef enum placement_policy placement_policy_t;
+
+struct lmv_obd {
+       int                     refcount;
+       struct lu_client_fld    lmv_fld;
+       spinlock_t              lmv_lock;
+       placement_policy_t      lmv_placement;
+       struct lmv_desc         desc;
+       struct obd_uuid         cluuid;
+       struct obd_export       *exp;
+
+       struct mutex            init_mutex;
+       int                     connected;
+       int                     max_easize;
+       int                     max_def_easize;
+       int                     max_cookiesize;
+       int                     server_timeout;
+
+       int                     tgts_size; /* size of tgts array */
+       struct lmv_tgt_desc     **tgts;
+
+       struct obd_connect_data conn_data;
+};
+
+struct niobuf_local {
+       __u64           lnb_file_offset;
+       __u32           lnb_page_offset;
+       __u32           len;
+       __u32           flags;
+       struct page     *page;
+       struct dentry   *dentry;
+       int             lnb_grant_used;
+       int             rc;
+};
+
+#define LUSTRE_FLD_NAME         "fld"
+#define LUSTRE_SEQ_NAME         "seq"
+
+#define LUSTRE_MDD_NAME         "mdd"
+#define LUSTRE_OSD_LDISKFS_NAME        "osd-ldiskfs"
+#define LUSTRE_OSD_ZFS_NAME     "osd-zfs"
+#define LUSTRE_VVP_NAME         "vvp"
+#define LUSTRE_LMV_NAME         "lmv"
+#define LUSTRE_SLP_NAME         "slp"
+#define LUSTRE_LOD_NAME                "lod"
+#define LUSTRE_OSP_NAME                "osp"
+#define LUSTRE_LWP_NAME                "lwp"
+
+/* obd device type names */
+ /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */
+#define LUSTRE_MDS_NAME         "mds"
+#define LUSTRE_MDT_NAME         "mdt"
+#define LUSTRE_MDC_NAME         "mdc"
+#define LUSTRE_OSS_NAME         "ost"       /* FIXME change name to oss */
+#define LUSTRE_OST_NAME         "obdfilter" /* FIXME change name to ost */
+#define LUSTRE_OSC_NAME         "osc"
+#define LUSTRE_LOV_NAME         "lov"
+#define LUSTRE_MGS_NAME         "mgs"
+#define LUSTRE_MGC_NAME         "mgc"
+
+#define LUSTRE_ECHO_NAME       "obdecho"
+#define LUSTRE_ECHO_CLIENT_NAME "echo_client"
+#define LUSTRE_QMT_NAME         "qmt"
+
+/* Constant obd names (post-rename) */
+#define LUSTRE_MDS_OBDNAME "MDS"
+#define LUSTRE_OSS_OBDNAME "OSS"
+#define LUSTRE_MGS_OBDNAME "MGS"
+#define LUSTRE_MGC_OBDNAME "MGC"
+
+static inline int is_osp_on_mdt(char *name)
+{
+       char   *ptr;
+
+       ptr = strrchr(name, '-');
+       if (ptr == NULL) {
+               CERROR("%s is not an obdname\n", name);
+               return 0;
+       }
+
+       /* 1.8 OSC/OSP name on MDT is fsname-OSTxxxx-osc */
+       if (strncmp(ptr + 1, "osc", 3) == 0)
+               return 1;
+
+       if (strncmp(ptr + 1, "MDT", 3) != 0)
+               return 0;
+
+       while (*(--ptr) != '-' && ptr != name);
+
+       if (ptr == name)
+               return 0;
+
+       if (strncmp(ptr + 1, LUSTRE_OSP_NAME, strlen(LUSTRE_OSP_NAME)) != 0 &&
+           strncmp(ptr + 1, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME)) != 0)
+               return 0;
+
+       return 1;
+}
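+
+/*
+ * Illustrative sketch (editor's note, not part of the original patch):
+ * expected behaviour of is_osp_on_mdt() for a few hypothetical obd names.
+ *
+ *      is_osp_on_mdt("lustre-OST0001-osc-MDT0000");    returns 1 (OSP/OSC on MDT)
+ *      is_osp_on_mdt("lustre-OST0001-osc");            returns 1 (1.8-style name)
+ *      is_osp_on_mdt("lustre-OST0001-osc-ffff8801");   returns 0 (client-side OSC)
+ *      is_osp_on_mdt("badname");                       returns 0 and logs CERROR
+ */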
+
+/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */
+#define N_LOCAL_TEMP_PAGE 0x10000000
+
+struct obd_trans_info {
+       __u64               oti_transno;
+       __u64               oti_xid;
+       /* Only used on the server side for tracking acks. */
+       struct oti_req_ack_lock {
+               struct lustre_handle lock;
+               __u32           mode;
+       }                       oti_ack_locks[4];
+       void                *oti_handle;
+       struct llog_cookie       oti_onecookie;
+       struct llog_cookie      *oti_logcookies;
+       int                   oti_numcookies;
+       /** synchronous write is needed */
+       unsigned long            oti_sync_write:1;
+
+       /* initial thread handling transaction */
+       struct ptlrpc_thread *   oti_thread;
+       __u32               oti_conn_cnt;
+       /** VBR: versions */
+       __u64               oti_pre_version;
+       /** JobID */
+       char                *oti_jobid;
+
+       struct obd_uuid  *oti_ost_uuid;
+};
+
+static inline void oti_init(struct obd_trans_info *oti,
+                           struct ptlrpc_request *req)
+{
+       if (oti == NULL)
+               return;
+       memset(oti, 0, sizeof(*oti));
+
+       if (req == NULL)
+               return;
+
+       oti->oti_xid = req->rq_xid;
+       /** VBR: take versions from request */
+       if (req->rq_reqmsg != NULL &&
+           lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+               __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg);
+               oti->oti_pre_version = pre_version ? pre_version[0] : 0;
+               oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg);
+       }
+
+       /** called from mds_create_objects */
+       if (req->rq_repmsg != NULL)
+               oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+       oti->oti_thread = req->rq_svc_thread;
+       if (req->rq_reqmsg != NULL)
+               oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+}
+
+static inline void oti_alloc_cookies(struct obd_trans_info *oti,int num_cookies)
+{
+       if (!oti)
+               return;
+
+       if (num_cookies == 1)
+               oti->oti_logcookies = &oti->oti_onecookie;
+       else
+               OBD_ALLOC_LARGE(oti->oti_logcookies,
+                               num_cookies * sizeof(oti->oti_onecookie));
+
+       oti->oti_numcookies = num_cookies;
+}
+
+static inline void oti_free_cookies(struct obd_trans_info *oti)
+{
+       if (!oti || !oti->oti_logcookies)
+               return;
+
+       if (oti->oti_logcookies == &oti->oti_onecookie)
+               LASSERT(oti->oti_numcookies == 1);
+       else
+               OBD_FREE_LARGE(oti->oti_logcookies,
+                              oti->oti_numcookies*sizeof(oti->oti_onecookie));
+       oti->oti_logcookies = NULL;
+       oti->oti_numcookies = 0;
+}
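+
+/*
+ * Illustrative sketch (editor's note, not part of the original patch):
+ * the helpers above are meant to be used as a pair; a single cookie is kept
+ * inline in oti_onecookie, larger arrays are allocated with OBD_ALLOC_LARGE.
+ * "stripe_count" is a hypothetical variable.
+ *
+ *      struct obd_trans_info oti;
+ *
+ *      oti_init(&oti, req);
+ *      oti_alloc_cookies(&oti, stripe_count);  inline if stripe_count == 1
+ *      ... fill oti.oti_logcookies[0 .. stripe_count - 1] ...
+ *      oti_free_cookies(&oti);                 frees only the heap-allocated case
+ */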
+
+/*
+ * Events signalled through obd_notify() upcall-chain.
+ */
+enum obd_notify_event {
+       /* target added */
+       OBD_NOTIFY_CREATE,
+       /* Device connect start */
+       OBD_NOTIFY_CONNECT,
+       /* Device activated */
+       OBD_NOTIFY_ACTIVE,
+       /* Device deactivated */
+       OBD_NOTIFY_INACTIVE,
+       /* Device disconnected */
+       OBD_NOTIFY_DISCON,
+       /* Connect data for the import has changed */
+       OBD_NOTIFY_OCD,
+       /* Sync request */
+       OBD_NOTIFY_SYNC_NONBLOCK,
+       OBD_NOTIFY_SYNC,
+       /* Configuration event */
+       OBD_NOTIFY_CONFIG,
+       /* Administratively deactivate/activate event */
+       OBD_NOTIFY_DEACTIVATE,
+       OBD_NOTIFY_ACTIVATE
+};
+
+/* bit-mask flags for config events */
+enum config_flags {
+       CONFIG_LOG      = 0x1,  /* finished processing config log */
+       CONFIG_SYNC     = 0x2,  /* mdt synced 1 ost */
+       CONFIG_TARGET   = 0x4   /* one target is added */
+};
+
+/*
+ * Data structure used to pass obd_notify()-event to non-obd listeners (llite
+ * and liblustre being main examples).
+ */
+struct obd_notify_upcall {
+       int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
+                         enum obd_notify_event ev, void *owner, void *data);
+       /* Opaque datum supplied by upper layer listener */
+       void *onu_owner;
+};
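+
+/*
+ * Illustrative sketch (editor's note, not part of the original patch): a
+ * hypothetical listener wiring itself into the obd_notify() upcall chain;
+ * the callback name and registration spot are assumptions for illustration.
+ *
+ *      static int my_notify(struct obd_device *host, struct obd_device *watched,
+ *                           enum obd_notify_event ev, void *owner, void *data)
+ *      {
+ *              if (ev == OBD_NOTIFY_ACTIVE)
+ *                      CDEBUG(D_CONFIG, "%s became active\n", watched->obd_name);
+ *              return 0;
+ *      }
+ *
+ *      obd->obd_upcall.onu_upcall = my_notify;
+ *      obd->obd_upcall.onu_owner  = my_private_state;
+ */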
+
+struct target_recovery_data {
+       svc_handler_t           trd_recovery_handler;
+       pid_t                   trd_processing_task;
+       struct completion       trd_starting;
+       struct completion       trd_finishing;
+};
+
+struct obd_llog_group {
+       int             olg_seq;
+       struct llog_ctxt  *olg_ctxts[LLOG_MAX_CTXTS];
+       wait_queue_head_t       olg_waitq;
+       spinlock_t         olg_lock;
+       struct mutex       olg_cat_processing;
+};
+
+/* corresponds to one of the obd's */
+#define OBD_DEVICE_MAGIC       0XAB5CD6EF
+#define OBD_DEV_BY_DEVNAME      0xffffd0de
+
+struct obd_device {
+       struct obd_type *obd_type;
+       __u32              obd_magic;
+
+       /* common and UUID name of this device */
+       char                obd_name[MAX_OBD_NAME];
+       struct obd_uuid  obd_uuid;
+
+       struct lu_device       *obd_lu_dev;
+
+       int                  obd_minor;
+       /* bitfield modification is protected by obd_dev_lock */
+       unsigned long obd_attached:1,      /* finished attach */
+                     obd_set_up:1,     /* finished setup */
+                     obd_recovering:1,    /* there are recoverable clients */
+                     obd_abort_recovery:1,/* recovery expired */
+                     obd_version_recov:1, /* obd uses version checking */
+                     obd_replayable:1,    /* recovery is enabled; inform clients */
+                     obd_no_transno:1,    /* no committed-transno notification */
+                     obd_no_recov:1,      /* fail instead of retry messages */
+                     obd_stopping:1,      /* started cleanup */
+                     obd_starting:1,      /* started setup */
+                     obd_force:1,       /* cleanup with > 0 obd refcount */
+                     obd_fail:1,         /* cleanup with failover */
+                     obd_async_recov:1,   /* allow asynchronous orphan cleanup */
+                     obd_no_conn:1,       /* deny new connections */
+                     obd_inactive:1,      /* device active/inactive
+                                          * (for /proc/status only!!) */
+                     obd_no_ir:1,       /* no imperative recovery. */
+                     obd_process_conf:1;  /* device is processing mgs config */
+       /* use a separate field as it is set in interrupt context, to avoid
+        * messing with the protection of the other bits using a _bh lock */
+       unsigned long obd_recovery_expired:1;
+       /* uuid-export hash body */
+       cfs_hash_t           *obd_uuid_hash;
+       /* nid-export hash body */
+       cfs_hash_t           *obd_nid_hash;
+       /* nid stats body */
+       cfs_hash_t           *obd_nid_stats_hash;
+       struct list_head              obd_nid_stats;
+       atomic_t            obd_refcount;
+       wait_queue_head_t            obd_refcount_waitq;
+       struct list_head              obd_exports;
+       struct list_head              obd_unlinked_exports;
+       struct list_head              obd_delayed_exports;
+       int                  obd_num_exports;
+       spinlock_t              obd_nid_lock;
+       struct ldlm_namespace  *obd_namespace;
+       struct ptlrpc_client    obd_ldlm_client; /* XXX OST/MDS only */
+       /* a spinlock is OK for what we do now, may need a semaphore later */
+       spinlock_t              obd_dev_lock; /* protect OBD bitfield above */
+       struct mutex            obd_dev_mutex;
+       __u64                   obd_last_committed;
+       struct fsfilt_operations *obd_fsops;
+       spinlock_t              obd_osfs_lock;
+       struct obd_statfs       obd_osfs;       /* locked by obd_osfs_lock */
+       __u64                   obd_osfs_age;
+       struct lvfs_run_ctxt    obd_lvfs_ctxt;
+       struct obd_llog_group   obd_olg;        /* default llog group */
+       struct obd_device       *obd_observer;
+       struct rw_semaphore     obd_observer_link_sem;
+       struct obd_notify_upcall obd_upcall;
+       struct obd_export       *obd_self_export;
+       /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
+       struct list_head              obd_exports_timed;
+       time_t            obd_eviction_timer; /* for ping evictor */
+
+       int                           obd_max_recoverable_clients;
+       atomic_t                     obd_connected_clients;
+       int                           obd_stale_clients;
+       int                           obd_delayed_clients;
+       /* this lock protects all recovery list_heads, timer and
+        * obd_next_recovery_transno value */
+       spinlock_t                       obd_recovery_task_lock;
+       __u64                       obd_next_recovery_transno;
+       int                           obd_replayed_requests;
+       int                           obd_requests_queued_for_recovery;
+       wait_queue_head_t                     obd_next_transno_waitq;
+       /* protected by obd_recovery_task_lock */
+       timer_list_t                  obd_recovery_timer;
+       time_t                     obd_recovery_start; /* seconds */
+       time_t                     obd_recovery_end; /* seconds, for lprocfs_status */
+       int                           obd_recovery_time_hard;
+       int                           obd_recovery_timeout;
+       int                           obd_recovery_ir_factor;
+
+       /* new recovery stuff from CMD2 */
+       struct target_recovery_data      obd_recovery_data;
+       int                           obd_replayed_locks;
+       atomic_t                     obd_req_replay_clients;
+       atomic_t                     obd_lock_replay_clients;
+       /* all lists are protected by obd_recovery_task_lock */
+       struct list_head                       obd_req_replay_queue;
+       struct list_head                       obd_lock_replay_queue;
+       struct list_head                       obd_final_req_queue;
+       int                           obd_recovery_stage;
+
+       union {
+               struct obd_device_target obt;
+               struct filter_obd filter;
+               struct client_obd cli;
+               struct ost_obd ost;
+               struct echo_client_obd echo_client;
+               struct echo_obd echo;
+               struct lov_obd lov;
+               struct lmv_obd lmv;
+       } u;
+       /* Fields used by LProcFS */
+       unsigned int       obd_cntr_base;
+       struct lprocfs_stats  *obd_stats;
+
+       unsigned int       md_cntr_base;
+       struct lprocfs_stats  *md_stats;
+
+       proc_dir_entry_t  *obd_proc_entry;
+       proc_dir_entry_t  *obd_proc_exports_entry;
+       proc_dir_entry_t  *obd_svc_procroot;
+       struct lprocfs_stats  *obd_svc_stats;
+       atomic_t           obd_evict_inprogress;
+       wait_queue_head_t           obd_evict_inprogress_waitq;
+       struct list_head             obd_evict_list; /* protected with pet_lock */
+
+       /**
+        * Ldlm pool part. Save last calculated SLV and Limit.
+        */
+       rwlock_t                obd_pool_lock;
+       int                 obd_pool_limit;
+       __u64             obd_pool_slv;
+
+       /**
+        * A list of outstanding class_incref()'s against this obd. For
+        * debugging.
+        */
+       struct lu_ref     obd_reference;
+
+       int                    obd_conn_inprogress;
+};
+
+#define OBD_LLOG_FL_SENDNOW     0x0001
+#define OBD_LLOG_FL_EXIT       0x0002
+
+enum obd_cleanup_stage {
+/* Special case hack for MDS LOVs */
+       OBD_CLEANUP_EARLY,
+/* can be directly mapped to .ldto_device_fini() */
+       OBD_CLEANUP_EXPORTS,
+};
+
+/* get/set_info keys */
+#define KEY_ASYNC             "async"
+#define KEY_BLOCKSIZE_BITS      "blocksize_bits"
+#define KEY_BLOCKSIZE     "blocksize"
+#define KEY_CAPA_KEY       "capa_key"
+#define KEY_CHANGELOG_CLEAR     "changelog_clear"
+#define KEY_FID2PATH       "fid2path"
+#define KEY_CHECKSUM       "checksum"
+#define KEY_CLEAR_FS       "clear_fs"
+#define KEY_CONN_DATA     "conn_data"
+#define KEY_EVICT_BY_NID       "evict_by_nid"
+#define KEY_FIEMAP           "fiemap"
+#define KEY_FLUSH_CTX     "flush_ctx"
+#define KEY_GRANT_SHRINK       "grant_shrink"
+#define KEY_HSM_COPYTOOL_SEND   "hsm_send"
+#define KEY_INIT_RECOV_BACKUP   "init_recov_bk"
+#define KEY_INIT_RECOV   "initial_recov"
+#define KEY_INTERMDS       "inter_mds"
+#define KEY_LAST_ID         "last_id"
+#define KEY_LAST_FID           "last_fid"
+#define KEY_LOCK_TO_STRIPE      "lock_to_stripe"
+#define KEY_LOVDESC         "lovdesc"
+#define KEY_LOV_IDX         "lov_idx"
+#define KEY_MAX_EASIZE   "max_easize"
+#define KEY_MDS_CONN       "mds_conn"
+#define KEY_MGSSEC           "mgssec"
+#define KEY_NEXT_ID         "next_id"
+#define KEY_READ_ONLY     "read-only"
+#define KEY_REGISTER_TARGET     "register_target"
+#define KEY_SET_FS           "set_fs"
+#define KEY_TGT_COUNT     "tgt_count"
+/*      KEY_SET_INFO in lustre_idl.h */
+#define KEY_SPTLRPC_CONF       "sptlrpc_conf"
+#define KEY_CONNECT_FLAG       "connect_flags"
+#define KEY_SYNC_LOCK_CANCEL    "sync_lock_cancel"
+
+#define KEY_CACHE_SET          "cache_set"
+#define KEY_CACHE_LRU_SHRINK   "cache_lru_shrink"
+#define KEY_CHANGELOG_INDEX    "changelog_index"
+
+struct lu_context;
+
+/* /!\ must be coherent with include/linux/namei.h on patched kernel */
+#define IT_OPEN     (1 << 0)
+#define IT_CREAT    (1 << 1)
+#define IT_READDIR  (1 << 2)
+#define IT_GETATTR  (1 << 3)
+#define IT_LOOKUP   (1 << 4)
+#define IT_UNLINK   (1 << 5)
+#define IT_TRUNC    (1 << 6)
+#define IT_GETXATTR (1 << 7)
+#define IT_EXEC     (1 << 8)
+#define IT_PIN      (1 << 9)
+#define IT_LAYOUT   (1 << 10)
+#define IT_QUOTA_DQACQ (1 << 11)
+#define IT_QUOTA_CONN  (1 << 12)
+
+static inline int it_to_lock_mode(struct lookup_intent *it)
+{
+       /* CREAT needs to be tested before open (both could be set) */
+       if (it->it_op & IT_CREAT)
+               return LCK_CW;
+       else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP |
+                             IT_LAYOUT))
+               return LCK_CR;
+
+       LASSERTF(0, "Invalid it_op: %d\n", it->it_op);
+       return -EINVAL;
+}
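+
+/*
+ * Illustrative sketch (editor's note, not part of the original patch):
+ * how a few intents map to lock modes through it_to_lock_mode() above.
+ *
+ *      it.it_op = IT_OPEN | IT_CREAT;  CREAT wins, returns LCK_CW
+ *      it.it_op = IT_GETATTR;          returns LCK_CR
+ *      it.it_op = IT_UNLINK;           not handled, LASSERTF() fires
+ */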
+
+struct md_op_data {
+       struct lu_fid      op_fid1; /* operation fid1 (usually parent) */
+       struct lu_fid      op_fid2; /* operation fid2 (usually child) */
+       struct lu_fid      op_fid3; /* 2 extra fids to find conflicting */
+       struct lu_fid      op_fid4; /* to the operation locks. */
+       mdsno_t          op_mds;  /* what mds server open will go to */
+       struct lustre_handle    op_handle;
+       obd_time                op_mod_time;
+       const char           *op_name;
+       int                  op_namelen;
+       __u32              op_mode;
+       struct lmv_stripe_md   *op_mea1;
+       struct lmv_stripe_md   *op_mea2;
+       __u32              op_suppgids[2];
+       __u32              op_fsuid;
+       __u32              op_fsgid;
+       cfs_cap_t              op_cap;
+       void               *op_data;
+
+       /* iattr fields and blocks. */
+       struct iattr        op_attr;
+       unsigned int        op_attr_flags;
+       __u64              op_valid;
+       loff_t            op_attr_blocks;
+
+       /* Size-on-MDS epoch and flags. */
+       __u64              op_ioepoch;
+       __u32              op_flags;
+
+       /* Capa fields */
+       struct obd_capa *op_capa1;
+       struct obd_capa *op_capa2;
+
+       /* Various operation flags. */
+       __u32              op_bias;
+
+       /* Operation type */
+       __u32              op_opc;
+
+       /* Used by readdir */
+       __u64              op_offset;
+
+       /* Used by readdir */
+       __u32              op_npages;
+
+       /* used to transfer info between the stacks of the MD client;
+        * see enum op_cli_flags */
+       __u32                   op_cli_flags;
+};
+
+enum op_cli_flags {
+       CLI_SET_MEA     = 1 << 0,
+       CLI_RM_ENTRY    = 1 << 1,
+};
+
+struct md_enqueue_info;
+/* metadata stat-ahead */
+typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req,
+                               struct md_enqueue_info *minfo,
+                               int rc);
+
+/* seq client type */
+enum lu_cli_type {
+       LUSTRE_SEQ_METADATA = 1,
+       LUSTRE_SEQ_DATA
+};
+
+struct md_enqueue_info {
+       struct md_op_data       mi_data;
+       struct lookup_intent    mi_it;
+       struct lustre_handle    mi_lockh;
+       struct inode       *mi_dir;
+       md_enqueue_cb_t  mi_cb;
+       __u64              mi_cbdata;
+       unsigned int        mi_generation;
+};
+
+struct obd_ops {
+       module_t *o_owner;
+       int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
+                          void *karg, void *uarg);
+       int (*o_get_info)(const struct lu_env *env, struct obd_export *,
+                         __u32 keylen, void *key, __u32 *vallen, void *val,
+                         struct lov_stripe_md *lsm);
+       int (*o_set_info_async)(const struct lu_env *, struct obd_export *,
+                               __u32 keylen, void *key,
+                               __u32 vallen, void *val,
+                               struct ptlrpc_request_set *set);
+       int (*o_attach)(struct obd_device *dev, obd_count len, void *data);
+       int (*o_detach)(struct obd_device *dev);
+       int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg);
+       int (*o_precleanup)(struct obd_device *dev,
+                           enum obd_cleanup_stage cleanup_stage);
+       int (*o_cleanup)(struct obd_device *dev);
+       int (*o_process_config)(struct obd_device *dev, obd_count len,
+                               void *data);
+       int (*o_postrecov)(struct obd_device *dev);
+       int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid,
+                         int priority);
+       int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid);
+       /* connect to the target device with given connection
+        * data. @ocd->ocd_connect_flags is modified to reflect flags actually
+        * granted by the target, which are guaranteed to be a subset of flags
+        * asked for. If @ocd == NULL, use default parameters. */
+       int (*o_connect)(const struct lu_env *env,
+                        struct obd_export **exp, struct obd_device *src,
+                        struct obd_uuid *cluuid, struct obd_connect_data *ocd,
+                        void *localdata);
+       int (*o_reconnect)(const struct lu_env *env,
+                          struct obd_export *exp, struct obd_device *src,
+                          struct obd_uuid *cluuid,
+                          struct obd_connect_data *ocd,
+                          void *localdata);
+       int (*o_disconnect)(struct obd_export *exp);
+
+       /* Initialize/finalize fids infrastructure. */
+       int (*o_fid_init)(struct obd_device *obd,
+                         struct obd_export *exp, enum lu_cli_type type);
+       int (*o_fid_fini)(struct obd_device *obd);
+
+       /* Allocate new fid according to passed @hint. */
+       int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid,
+                          struct md_op_data *op_data);
+
+       /*
+        * Object with @fid is getting deleted, we may want to do something
+        * about this.
+        */
+       int (*o_statfs)(const struct lu_env *, struct obd_export *exp,
+                       struct obd_statfs *osfs, __u64 max_age, __u32 flags);
+       int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo,
+                             __u64 max_age, struct ptlrpc_request_set *set);
+       int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt,
+                       struct lov_stripe_md *mem_src);
+       int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt,
+                         struct lov_mds_md *disk_src, int disk_len);
+       int (*o_preallocate)(struct lustre_handle *, obd_count *req,
+                            obd_id *ids);
+       /* FIXME: add fid capability support for create & destroy! */
+       int (*o_precreate)(struct obd_export *exp);
+       int (*o_create)(const struct lu_env *env, struct obd_export *exp,
+                       struct obdo *oa, struct lov_stripe_md **ea,
+                       struct obd_trans_info *oti);
+       int (*o_create_async)(struct obd_export *exp,  struct obd_info *oinfo,
+                             struct lov_stripe_md **ea,
+                             struct obd_trans_info *oti);
+       int (*o_destroy)(const struct lu_env *env, struct obd_export *exp,
+                        struct obdo *oa, struct lov_stripe_md *ea,
+                        struct obd_trans_info *oti, struct obd_export *md_exp,
+                        void *capa);
+       int (*o_setattr)(const struct lu_env *, struct obd_export *exp,
+                        struct obd_info *oinfo, struct obd_trans_info *oti);
+       int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+                              struct obd_trans_info *oti,
+                              struct ptlrpc_request_set *rqset);
+       int (*o_getattr)(const struct lu_env *env, struct obd_export *exp,
+                        struct obd_info *oinfo);
+       int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+                              struct ptlrpc_request_set *set);
+       int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo,
+                    obd_count oa_bufs, struct brw_page *pgarr,
+                    struct obd_trans_info *oti);
+       int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
+                          struct ost_lvb *lvb, int kms_only);
+       int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
+                           obd_off size, int shrink);
+       int (*o_punch)(const struct lu_env *, struct obd_export *exp,
+                      struct obd_info *oinfo, struct obd_trans_info *oti,
+                      struct ptlrpc_request_set *rqset);
+       int (*o_sync)(const struct lu_env *env, struct obd_export *exp,
+                     struct obd_info *oinfo, obd_size start, obd_size end,
+                     struct ptlrpc_request_set *set);
+       int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst,
+                        struct lov_stripe_md *src, obd_size start,
+                        obd_size end, struct obd_trans_info *oti);
+       int (*o_copy)(struct lustre_handle *dstconn, struct lov_stripe_md *dst,
+                     struct lustre_handle *srconn, struct lov_stripe_md *src,
+                     obd_size start, obd_size end, struct obd_trans_info *);
+       int (*o_iterate)(struct lustre_handle *conn,
+                        int (*)(obd_id, obd_seq, void *),
+                        obd_id *startid, obd_seq seq, void *data);
+       int (*o_preprw)(const struct lu_env *env, int cmd,
+                       struct obd_export *exp, struct obdo *oa, int objcount,
+                       struct obd_ioobj *obj, struct niobuf_remote *remote,
+                       int *nr_pages, struct niobuf_local *local,
+                       struct obd_trans_info *oti, struct lustre_capa *capa);
+       int (*o_commitrw)(const struct lu_env *env, int cmd,
+                         struct obd_export *exp, struct obdo *oa,
+                         int objcount, struct obd_ioobj *obj,
+                         struct niobuf_remote *remote, int pages,
+                         struct niobuf_local *local,
+                         struct obd_trans_info *oti, int rc);
+       int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo,
+                        struct ldlm_enqueue_info *einfo,
+                        struct ptlrpc_request_set *rqset);
+       int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *,
+                              ldlm_iterator_t it, void *data);
+       int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *,
+                            ldlm_iterator_t it, void *data);
+       int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md,
+                       __u32 mode, struct lustre_handle *);
+       int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *,
+                              ldlm_cancel_flags_t flags, void *opaque);
+       int (*o_init_export)(struct obd_export *exp);
+       int (*o_destroy_export)(struct obd_export *exp);
+       int (*o_extent_calc)(struct obd_export *, struct lov_stripe_md *,
+                            int cmd, obd_off *);
+
+       /* llog related obd_methods */
+       int (*o_llog_init)(struct obd_device *obd, struct obd_llog_group *grp,
+                          struct obd_device *disk_obd, int *idx);
+       int (*o_llog_finish)(struct obd_device *obd, int count);
+       int (*o_llog_connect)(struct obd_export *, struct llogd_conn_body *);
+
+       /* metadata-only methods */
+       int (*o_pin)(struct obd_export *, const struct lu_fid *fid,
+                    struct obd_capa *, struct obd_client_handle *, int flag);
+       int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int);
+
+       int (*o_import_event)(struct obd_device *, struct obd_import *,
+                             enum obd_import_event);
+
+       int (*o_notify)(struct obd_device *obd, struct obd_device *watched,
+                       enum obd_notify_event ev, void *data);
+
+       int (*o_health_check)(const struct lu_env *env, struct obd_device *);
+       struct obd_uuid *(*o_get_uuid) (struct obd_export *exp);
+
+       /* quota methods */
+       int (*o_quotacheck)(struct obd_device *, struct obd_export *,
+                           struct obd_quotactl *);
+       int (*o_quotactl)(struct obd_device *, struct obd_export *,
+                         struct obd_quotactl *);
+
+       int (*o_ping)(const struct lu_env *, struct obd_export *exp);
+
+       /* pools methods */
+       int (*o_pool_new)(struct obd_device *obd, char *poolname);
+       int (*o_pool_del)(struct obd_device *obd, char *poolname);
+       int (*o_pool_add)(struct obd_device *obd, char *poolname,
+                         char *ostname);
+       int (*o_pool_rem)(struct obd_device *obd, char *poolname,
+                         char *ostname);
+       void (*o_getref)(struct obd_device *obd);
+       void (*o_putref)(struct obd_device *obd);
+       /*
+        * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
+        * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
+        * Also, add a wrapper function in include/linux/obd_class.h. */
+};
+
+enum {
+       LUSTRE_OPC_MKDIR    = (1 << 0),
+       LUSTRE_OPC_SYMLINK  = (1 << 1),
+       LUSTRE_OPC_MKNOD    = (1 << 2),
+       LUSTRE_OPC_CREATE   = (1 << 3),
+       LUSTRE_OPC_ANY      = (1 << 4)
+};
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32        0x7fffffffUL
+#define MAX_HASH_SIZE      0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+struct lustre_md {
+       struct mdt_body  *body;
+       struct lov_stripe_md    *lsm;
+       struct lmv_stripe_md    *mea;
+#ifdef CONFIG_FS_POSIX_ACL
+       struct posix_acl        *posix_acl;
+#endif
+       struct mdt_remote_perm  *remote_perm;
+       struct obd_capa  *mds_capa;
+       struct obd_capa  *oss_capa;
+};
+
+struct md_open_data {
+       struct obd_client_handle *mod_och;
+       struct ptlrpc_request    *mod_open_req;
+       struct ptlrpc_request    *mod_close_req;
+       atomic_t              mod_refcount;
+};
+
+struct lookup_intent;
+
+struct md_ops {
+       int (*m_getstatus)(struct obd_export *, struct lu_fid *,
+                          struct obd_capa **);
+       int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
+       int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *,
+                            ldlm_iterator_t, void *);
+       int (*m_close)(struct obd_export *, struct md_op_data *,
+                      struct md_open_data *, struct ptlrpc_request **);
+       int (*m_create)(struct obd_export *, struct md_op_data *,
+                       const void *, int, int, __u32, __u32, cfs_cap_t,
+                       __u64, struct ptlrpc_request **);
+       int (*m_done_writing)(struct obd_export *, struct md_op_data  *,
+                             struct md_open_data *);
+       int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *,
+                        struct lookup_intent *, struct md_op_data *,
+                        struct lustre_handle *, void *, int,
+                        struct ptlrpc_request **, __u64);
+       int (*m_getattr)(struct obd_export *, struct md_op_data *,
+                        struct ptlrpc_request **);
+       int (*m_getattr_name)(struct obd_export *, struct md_op_data *,
+                             struct ptlrpc_request **);
+       int (*m_intent_lock)(struct obd_export *, struct md_op_data *,
+                            void *, int, struct lookup_intent *, int,
+                            struct ptlrpc_request **,
+                            ldlm_blocking_callback, __u64);
+       int (*m_link)(struct obd_export *, struct md_op_data *,
+                     struct ptlrpc_request **);
+       int (*m_rename)(struct obd_export *, struct md_op_data *,
+                       const char *, int, const char *, int,
+                       struct ptlrpc_request **);
+       int (*m_is_subdir)(struct obd_export *, const struct lu_fid *,
+                          const struct lu_fid *,
+                          struct ptlrpc_request **);
+       int (*m_setattr)(struct obd_export *, struct md_op_data *, void *,
+                        int , void *, int, struct ptlrpc_request **,
+                        struct md_open_data **mod);
+       int (*m_sync)(struct obd_export *, const struct lu_fid *,
+                     struct obd_capa *, struct ptlrpc_request **);
+       int (*m_readpage)(struct obd_export *, struct md_op_data *,
+                         struct page **, struct ptlrpc_request **);
+
+       int (*m_unlink)(struct obd_export *, struct md_op_data *,
+                       struct ptlrpc_request **);
+
+       int (*m_setxattr)(struct obd_export *, const struct lu_fid *,
+                         struct obd_capa *, obd_valid, const char *,
+                         const char *, int, int, int, __u32,
+                         struct ptlrpc_request **);
+
+       int (*m_getxattr)(struct obd_export *, const struct lu_fid *,
+                         struct obd_capa *, obd_valid, const char *,
+                         const char *, int, int, int,
+                         struct ptlrpc_request **);
+
+       int (*m_init_ea_size)(struct obd_export *, int, int, int);
+
+       int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *,
+                              struct obd_export *, struct obd_export *,
+                              struct lustre_md *);
+
+       int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *);
+
+       int (*m_set_open_replay_data)(struct obd_export *,
+                                     struct obd_client_handle *,
+                                     struct ptlrpc_request *);
+       int (*m_clear_open_replay_data)(struct obd_export *,
+                                       struct obd_client_handle *);
+       int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *);
+
+       ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64,
+                                   const struct lu_fid *, ldlm_type_t,
+                                   ldlm_policy_data_t *, ldlm_mode_t,
+                                   struct lustre_handle *);
+
+       int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *,
+                              ldlm_policy_data_t *, ldlm_mode_t,
+                              ldlm_cancel_flags_t flags, void *opaque);
+       int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc,
+                           renew_capa_cb_t cb);
+       int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *,
+                            const struct req_msg_field *, struct obd_capa **);
+
+       int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *,
+                                struct obd_capa *, __u32,
+                                struct ptlrpc_request **);
+
+       int (*m_intent_getattr_async)(struct obd_export *,
+                                     struct md_enqueue_info *,
+                                     struct ldlm_enqueue_info *);
+
+       int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *,
+                                struct lu_fid *, __u64 *bits);
+
+       /*
+        * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to
+        * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a
+        * wrapper function in include/linux/obd_class.h.
+        */
+};
+
+struct lsm_operations {
+       void (*lsm_free)(struct lov_stripe_md *);
+       int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
+                          struct obd_export *md_exp);
+       void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
+                                   obd_off *);
+       void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
+                                    obd_off *);
+       int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes,
+                              __u16 *stripe_count);
+       int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm,
+                            struct lov_mds_md *lmm);
+};
+
+extern const struct lsm_operations lsm_v1_ops;
+extern const struct lsm_operations lsm_v3_ops;
+static inline const struct lsm_operations *lsm_op_find(int magic)
+{
+       switch(magic) {
+       case LOV_MAGIC_V1:
+              return &lsm_v1_ops;
+       case LOV_MAGIC_V3:
+              return &lsm_v3_ops;
+       default:
+              CERROR("Cannot recognize lsm_magic %08x\n", magic);
+              return NULL;
+       }
+}
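+
+/*
+ * Illustrative sketch (editor's note, not part of the original patch):
+ * typical dispatch through lsm_op_find(); the lsm_magic field is assumed to
+ * hold one of the LOV_MAGIC_* values.
+ *
+ *      const struct lsm_operations *op = lsm_op_find(lsm->lsm_magic);
+ *
+ *      if (op == NULL)
+ *              return -EINVAL;
+ *      op->lsm_free(lsm);
+ */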
+
+/* Requests for obd_extent_calc() */
+#define OBD_CALC_STRIPE_START   1
+#define OBD_CALC_STRIPE_END     2
+
+static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo)
+{
+       return oinfo->oi_capa;
+}
+
+static inline struct md_open_data *obd_mod_alloc(void)
+{
+       struct md_open_data *mod;
+       OBD_ALLOC_PTR(mod);
+       if (mod == NULL)
+               return NULL;
+       atomic_set(&mod->mod_refcount, 1);
+       return mod;
+}
+
+#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount)
+#define obd_mod_put(mod)                                       \
+({                                                           \
+       if (atomic_dec_and_test(&(mod)->mod_refcount)) {          \
+               if ((mod)->mod_open_req)                          \
+                       ptlrpc_req_finished((mod)->mod_open_req);   \
+               OBD_FREE_PTR(mod);                            \
+       }                                                      \
+})
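+
+/*
+ * Illustrative sketch (editor's note, not part of the original patch):
+ * intended life cycle of struct md_open_data with the helpers above.
+ *
+ *      struct md_open_data *mod = obd_mod_alloc();     refcount == 1
+ *
+ *      if (mod == NULL)
+ *              return -ENOMEM;
+ *      obd_mod_get(mod);       extra reference, e.g. for the close path
+ *      ...
+ *      obd_mod_put(mod);       drops the extra reference
+ *      obd_mod_put(mod);       last put finishes mod_open_req and frees mod
+ */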
+
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent);
+
+/* return 1 if the client should resend the request */
+static inline int client_should_resend(int resend, struct client_obd *cli)
+{
+       return atomic_read(&cli->cl_resends) ?
+              atomic_read(&cli->cl_resends) > resend : 1;
+}
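+
+/*
+ * Illustrative sketch (editor's note, not part of the original patch):
+ * cl_resends == 0 means resend without limit, otherwise the current attempt
+ * count is checked against the limit; "attempts" is a hypothetical counter.
+ *
+ *      if (client_should_resend(attempts, &obd->u.cli))
+ *              resend the request;
+ *      else
+ *              rc = -EIO;
+ */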
+
+/**
+ * Return device name for this device
+ *
+ * XXX: lu_device is declared before obd_device, while lu_device holds a
+ * pointer back to obd_device, so this helper function is defined here
+ * instead of in lu_object.h
+ */
+static inline const char *lu_dev_name(const struct lu_device *lu_dev)
+{
+       return lu_dev->ld_obd->obd_name;
+}
+
+static inline bool filename_is_volatile(const char *name, int namelen, int *idx)
+{
+       const char      *start;
+       char            *end;
+
+       if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0)
+               return false;
+
+       /* the caller does not care about idx */
+       if (idx == NULL)
+               return true;
+
+       /* volatile file, the MDT can be set from name */
+       /* name format is LUSTRE_VOLATILE_HDR:[idx]: */
+       /* if no MDT is specified, use std way */
+       if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2)
+               goto bad_format;
+       /* test for no MDT idx case */
+       if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') &&
+           (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) {
+               *idx = -1;
+               return true;
+       }
+       /* we have an idx, read it */
+       start = name + LUSTRE_VOLATILE_HDR_LEN + 1;
+       *idx = strtoul(start, &end, 0);
+       /* error cases:
+        * no digit, no trailing :, negative value
+        */
+       if (((*idx == 0) && (end == start)) ||
+           (*end != ':') || (*idx < 0))
+               goto bad_format;
+
+       return true;
+bad_format:
+       /* bad format of MDT idx: we cannot return an error
+        * to the caller, so fall back to the hash algorithm */
+       CERROR("Bad volatile file name format: %s\n",
+              name + LUSTRE_VOLATILE_HDR_LEN);
+       return false;
+}
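+
+/*
+ * Illustrative sketch (editor's note, not part of the original patch):
+ * expected results for a few hypothetical names, where LUSTRE_VOLATILE_HDR
+ * is the volatile-file prefix tested above.
+ *
+ *      filename_is_volatile("foo.txt", 7, &idx);                       false
+ *      filename_is_volatile(LUSTRE_VOLATILE_HDR"::x", len, &idx);      true, idx == -1
+ *      filename_is_volatile(LUSTRE_VOLATILE_HDR":0002:x", len, &idx);  true, idx == 2
+ *      filename_is_volatile(LUSTRE_VOLATILE_HDR":zz:x", len, &idx);    false, logs CERROR
+ */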
+
+static inline int cli_brw_size(struct obd_device *obd)
+{
+       LASSERT(obd != NULL);
+       return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+}
+
+#endif /* __OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_cache.h b/drivers/staging/lustre/lustre/include/obd_cache.h
new file mode 100644 (file)
index 0000000..c8249fb
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_CACHE_H__
+#define _OBD_CACHE_H__
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h
new file mode 100644 (file)
index 0000000..5f740f1
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_CKSUM
+#define __OBD_CKSUM
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type)
+{
+       switch (cksum_type) {
+       case OBD_CKSUM_CRC32:
+               return CFS_HASH_ALG_CRC32;
+       case OBD_CKSUM_ADLER:
+               return CFS_HASH_ALG_ADLER32;
+       case OBD_CKSUM_CRC32C:
+               return CFS_HASH_ALG_CRC32C;
+       default:
+               CERROR("Unknown checksum type (%x)!!!\n", cksum_type);
+               LBUG();
+       }
+       return 0;
+}
+
+/* The OBD_FL_CKSUM_* flags are packed into 5 bits of o_flags, since there can
+ * only be a single checksum type per RPC.
+ *
+ * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask
+ * since they need to represent the full range of checksum algorithms that
+ * both the client and server can understand.
+ *
+ * In case of unsupported types/flags we fall back to ADLER,
+ * because it has been supported by all clients since 1.8.
+ *
+ * In case multiple algorithms are supported the best one is used. */
+static inline obd_flag cksum_type_pack(cksum_type_t cksum_type)
+{
+       unsigned int    performance = 0, tmp;
+       obd_flag        flag = OBD_FL_CKSUM_ADLER;
+
+       if (cksum_type & OBD_CKSUM_CRC32) {
+               tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32));
+               if (tmp > performance) {
+                       performance = tmp;
+                       flag = OBD_FL_CKSUM_CRC32;
+               }
+       }
+       if (cksum_type & OBD_CKSUM_CRC32C) {
+               tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C));
+               if (tmp > performance) {
+                       performance = tmp;
+                       flag = OBD_FL_CKSUM_CRC32C;
+               }
+       }
+       if (cksum_type & OBD_CKSUM_ADLER) {
+               tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER));
+               if (tmp > performance) {
+                       performance = tmp;
+                       flag = OBD_FL_CKSUM_ADLER;
+               }
+       }
+       if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C |
+                                                  OBD_CKSUM_CRC32 |
+                                                  OBD_CKSUM_ADLER))))
+               CWARN("unknown cksum type %x\n", cksum_type);
+
+       return flag;
+}
+
+static inline cksum_type_t cksum_type_unpack(obd_flag o_flags)
+{
+       switch (o_flags & OBD_FL_CKSUM_ALL) {
+       case OBD_FL_CKSUM_CRC32C:
+               return OBD_CKSUM_CRC32C;
+       case OBD_FL_CKSUM_CRC32:
+               return OBD_CKSUM_CRC32;
+       default:
+               break;
+       }
+
+       return OBD_CKSUM_ADLER;
+}
+
+/* Return a bitmask of the checksum types supported on this system.
+ * ADLER has been supported since 1.8; it is the baseline and does not
+ * depend on hardware. The client uses all locally available algorithms.
+ */
+static inline cksum_type_t cksum_types_supported_client(void)
+{
+       cksum_type_t ret = OBD_CKSUM_ADLER;
+
+       CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+       if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0)
+               ret |= OBD_CKSUM_CRC32C;
+       if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0)
+               ret |= OBD_CKSUM_CRC32;
+
+       return ret;
+}
+
+/* The server uses algorithms that perform at 50% or better of the Adler speed */
+static inline cksum_type_t cksum_types_supported_server(void)
+{
+       int          base_speed;
+       cksum_type_t    ret = OBD_CKSUM_ADLER;
+
+       CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+       base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2;
+
+       if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >=
+           base_speed)
+               ret |= OBD_CKSUM_CRC32C;
+       if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >=
+           base_speed)
+               ret |= OBD_CKSUM_CRC32;
+
+       return ret;
+}
+
+
+/* Select the best checksum algorithm among those supplied in the cksum_types
+ * input.
+ *
+ * Currently, calling cksum_type_pack() with a mask will return the fastest
+ * checksum type due to its benchmarking at libcfs module load.
+ * Caution is advised, however, since what is fastest on a single client may
+ * not be the fastest or most efficient algorithm on the server.  */
+static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types)
+{
+       return cksum_type_unpack(cksum_type_pack(cksum_types));
+}
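+
+/*
+ * Usage sketch: a client can intersect its local capabilities with the
+ * mask the server advertised in ocd_cksum_types (the "data" pointer to
+ * struct obd_connect_data is assumed here) and let the load-time
+ * benchmark pick the winner:
+ *
+ *     cksum_type_t common = cksum_types_supported_client() &
+ *                           data->ocd_cksum_types;
+ *     cksum_type_t best   = cksum_type_select(common);
+ *
+ * cksum_type_pack(best) then encodes the choice into the OBD_FL_CKSUM_*
+ * bits of o_flags for the wire.
+ */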
+
+/* Checksum algorithm names. Must be defined in the same order as the
+ * OBD_CKSUM_* flags. */
+#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"}
+
+#endif /* __OBD_CKSUM */
diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h
new file mode 100644 (file)
index 0000000..de5c585
--- /dev/null
@@ -0,0 +1,2281 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef __CLASS_OBD_H
+#define __CLASS_OBD_H
+
+
+#include <obd_support.h>
+#include <lustre_import.h>
+#include <lustre_net.h>
+#include <obd.h>
+#include <lustre_lib.h>
+#include <lustre/lustre_idl.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_class.h>
+
+#define OBD_STATFS_NODELAY      0x0001  /* requests should be sent without delay
+                                        * and without resends, to avoid deadlocks */
+#define OBD_STATFS_FROM_CACHE   0x0002  /* the statfs callback should not update
+                                        * obd_osfs_age */
+#define OBD_STATFS_PTLRPCD      0x0004  /* requests will be sent via ptlrpcd
+                                        * instead of a specific set. This
+                                        * means that we cannot rely on the set
+                                        * interpret routine to be called.
+                                        * lov_statfs_fini() must thus be called
+                                        * by the request interpret routine */
+#define OBD_STATFS_FOR_MDT0    0x0008  /* The statfs is only for retrieving
+                                        * information from MDT0. */
+#define OBD_FL_PUNCH    0x00000001      /* To indicate it is punch operation */
+
+/* OBD Device Declarations */
+extern struct obd_device *obd_devs[MAX_OBD_DEVICES];
+extern rwlock_t obd_dev_lock;
+
+/* OBD Operations Declarations */
+extern struct obd_device *class_conn2obd(struct lustre_handle *);
+extern struct obd_device *class_exp2obd(struct obd_export *);
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
+extern int lustre_get_jobid(char *jobid);
+
+struct lu_device_type;
+
+/* genops.c */
+struct obd_export *class_conn2export(struct lustre_handle *);
+int class_register_type(struct obd_ops *, struct md_ops *,
+                       struct lprocfs_vars *, const char *nm,
+                       struct lu_device_type *ldt);
+int class_unregister_type(const char *nm);
+
+struct obd_device *class_newdev(const char *type_name, const char *name);
+void class_release_dev(struct obd_device *obd);
+
+int class_name2dev(const char *name);
+struct obd_device *class_name2obd(const char *name);
+int class_uuid2dev(struct obd_uuid *uuid);
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
+void class_obd_list(void);
+struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
+                                         const char * typ_name,
+                                         struct obd_uuid *grp_uuid);
+struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid,
+                                          int *next);
+struct obd_device * class_num2obd(int num);
+int get_devices_count(void);
+
+int class_notify_sptlrpc_conf(const char *fsname, int namelen);
+
+char *obd_export_nid2str(struct obd_export *exp);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid);
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid);
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep);
+
+int obd_zombie_impexp_init(void);
+void obd_zombie_impexp_stop(void);
+void obd_zombie_impexp_cull(void);
+void obd_zombie_barrier(void);
+void obd_exports_barrier(struct obd_device *obd);
+int kuc_len(int payload_len);
+struct kuc_hdr * kuc_ptr(void *p);
+int kuc_ispayload(void *p);
+void *kuc_alloc(int payload_len, int transport, int type);
+void kuc_free(void *p, int payload_len);
+
+struct llog_handle;
+struct llog_rec_hdr;
+typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *,
+                        struct llog_rec_hdr *, void *);
+/* obd_config.c */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+                                    const char *new_name);
+int class_process_config(struct lustre_cfg *lcfg);
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+                            struct lustre_cfg *lcfg, void *data);
+int class_attach(struct lustre_cfg *lcfg);
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
+struct obd_device *class_incref(struct obd_device *obd,
+                               const char *scope, const void *source);
+void class_decref(struct obd_device *obd,
+                 const char *scope, const void *source);
+void dump_exports(struct obd_device *obd, int locks);
+int class_config_llog_handler(const struct lu_env *env,
+                             struct llog_handle *handle,
+                             struct llog_rec_hdr *rec, void *data);
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_add_uuid(const char *uuid, __u64 nid);
+
+/*obdecho*/
+#ifdef LPROCFS
+extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+#define CFG_F_START     0x01   /* Set when we start updating from a log */
+#define CFG_F_MARKER    0x02   /* We are within a marker */
+#define CFG_F_SKIP      0x04   /* We should ignore this cfg command */
+#define CFG_F_COMPAT146 0x08   /* Allow old-style logs */
+#define CFG_F_EXCLUDE   0x10   /* OST exclusion list */
+
+/* Passed as data param to class_config_parse_llog */
+struct config_llog_instance {
+       char           *cfg_obdname;
+       void           *cfg_instance;
+       struct super_block *cfg_sb;
+       struct obd_uuid     cfg_uuid;
+       llog_cb_t           cfg_callback;
+       int              cfg_last_idx; /* for partial llog processing */
+       int              cfg_flags;
+};
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+                           char *name, struct config_llog_instance *cfg);
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+                          char *name, struct config_llog_instance *cfg);
+
+enum {
+       CONFIG_T_CONFIG  = 0,
+       CONFIG_T_SPTLRPC = 1,
+       CONFIG_T_RECOVER = 2,
+       CONFIG_T_MAX     = 3
+};
+
+/* list of active configuration logs  */
+struct config_llog_data {
+       struct ldlm_res_id        cld_resid;
+       struct config_llog_instance cld_cfg;
+       struct list_head                  cld_list_chain;
+       atomic_t                cld_refcount;
+       struct config_llog_data    *cld_sptlrpc;/* dependent sptlrpc log */
+       struct config_llog_data    *cld_recover;    /* imperative recover log */
+       struct obd_export         *cld_mgcexp;
+       struct mutex                cld_lock;
+       int                      cld_type;
+       unsigned int            cld_stopping:1, /* we were told to stop
+                                                    * watching */
+                                   cld_lostlock:1; /* lock not requeued */
+       char                    cld_logname[0];
+};
+
+struct lustre_profile {
+       struct list_head       lp_list;
+       char        *lp_profile;
+       char        *lp_dt;
+       char        *lp_md;
+};
+
+struct lustre_profile *class_get_profile(const char * prof);
+void class_del_profile(const char *prof);
+void class_del_profiles(void);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *);
+void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *);
+extern void (*class_export_dump_hook)(struct obd_export *);
+
+#else
+
+#define __class_export_add_lock_ref(exp, lock)      do {} while(0)
+#define __class_export_del_lock_ref(exp, lock)      do {} while(0)
+
+#endif
+
+#define class_export_rpc_inc(exp)                                     \
+({                                                                   \
+       atomic_inc(&(exp)->exp_rpc_count);                        \
+       CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n",    \
+              (exp), atomic_read(&(exp)->exp_rpc_count));        \
+})
+
+#define class_export_rpc_dec(exp)                                     \
+({                                                                   \
+       LASSERT_ATOMIC_POS(&exp->exp_rpc_count);                        \
+       atomic_dec(&(exp)->exp_rpc_count);                        \
+       CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n",    \
+              (exp), atomic_read(&(exp)->exp_rpc_count));        \
+})
+
+#define class_export_lock_get(exp, lock)                               \
+({                                                                   \
+       atomic_inc(&(exp)->exp_locks_count);                    \
+       __class_export_add_lock_ref(exp, lock);                  \
+       CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \
+              (exp), atomic_read(&(exp)->exp_locks_count));    \
+       class_export_get(exp);                                    \
+})
+
+#define class_export_lock_put(exp, lock)                               \
+({                                                                   \
+       LASSERT_ATOMIC_POS(&exp->exp_locks_count);                    \
+       atomic_dec(&(exp)->exp_locks_count);                    \
+       __class_export_del_lock_ref(exp, lock);                  \
+       CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \
+              (exp), atomic_read(&(exp)->exp_locks_count));    \
+       class_export_put(exp);                                    \
+})
+
+#define class_export_cb_get(exp)                                       \
+({                                                                   \
+       atomic_inc(&(exp)->exp_cb_count);                          \
+       CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\
+              (exp), atomic_read(&(exp)->exp_cb_count));          \
+       class_export_get(exp);                                    \
+})
+
+#define class_export_cb_put(exp)                                       \
+({                                                                   \
+       LASSERT_ATOMIC_POS(&exp->exp_cb_count);                  \
+       atomic_dec(&(exp)->exp_cb_count);                          \
+       CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\
+              (exp), atomic_read(&(exp)->exp_cb_count));          \
+       class_export_put(exp);                                    \
+})
+
+/* genops.c */
+struct obd_export *class_export_get(struct obd_export *exp);
+void class_export_put(struct obd_export *exp);
+struct obd_export *class_new_export(struct obd_device *obddev,
+                                   struct obd_uuid *cluuid);
+void class_unlink_export(struct obd_export *exp);
+
+struct obd_import *class_import_get(struct obd_import *);
+void class_import_put(struct obd_import *);
+struct obd_import *class_new_import(struct obd_device *obd);
+void class_destroy_import(struct obd_import *exp);
+
+struct obd_type *class_search_type(const char *name);
+struct obd_type *class_get_type(const char *name);
+void class_put_type(struct obd_type *type);
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+                 struct obd_uuid *cluuid);
+int class_disconnect(struct obd_export *exp);
+void class_fail_export(struct obd_export *exp);
+int class_connected_export(struct obd_export *exp);
+void class_disconnect_exports(struct obd_device *obddev);
+int class_manual_cleanup(struct obd_device *obd);
+void class_disconnect_stale_exports(struct obd_device *,
+                                   int (*test_export)(struct obd_export *));
+static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
+{
+       return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
+               (obd->obd_force ? OBD_OPT_FORCE : 0) |
+               (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) |
+               0);
+}
+
+
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr,
+                    unsigned int ia_valid);
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid);
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid);
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+                 unsigned int valid);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo);
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo);
+
+#define OBT(dev)       (dev)->obd_type
+#define OBP(dev, op)    (dev)->obd_type->typ_dt_ops->o_ ## op
+#define MDP(dev, op)    (dev)->obd_type->typ_md_ops->m_ ## op
+#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op
+
+/* Ensure obd_setup: used for cleanup which must be called
+   while obd is stopping */
+#define OBD_CHECK_DEV(obd)                                   \
+do {                                                       \
+       if (!(obd)) {                                      \
+               CERROR("NULL device\n");                        \
+               RETURN(-ENODEV);                                \
+       }                                                      \
+} while (0)
+
+/* ensure obd_setup and !obd_stopping */
+#define OBD_CHECK_DEV_ACTIVE(obd)                             \
+do {                                                       \
+       OBD_CHECK_DEV(obd);                                  \
+       if (!(obd)->obd_set_up || (obd)->obd_stopping) {        \
+               CERROR("Device %d not setup\n",          \
+                      (obd)->obd_minor);                      \
+               RETURN(-ENODEV);                                \
+       }                                                      \
+} while (0)
+
+
+#ifdef LPROCFS
+#define OBD_COUNTER_OFFSET(op)                           \
+       ((offsetof(struct obd_ops, o_ ## op) -            \
+         offsetof(struct obd_ops, o_iocontrol))                \
+        / sizeof(((struct obd_ops *)(0))->o_iocontrol))
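+
+/* For example, OBD_COUNTER_OFFSET(getattr) above evaluates to the index
+ * of o_getattr counted from o_iocontrol in struct obd_ops, relying on
+ * the operation members being contiguous, equally-sized function
+ * pointers. */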
+
+#define OBD_COUNTER_INCREMENT(obdx, op)                           \
+       if ((obdx)->obd_stats != NULL) {                          \
+               unsigned int coffset;                        \
+               coffset = (unsigned int)((obdx)->obd_cntr_base) + \
+                       OBD_COUNTER_OFFSET(op);            \
+               LASSERT(coffset < (obdx)->obd_stats->ls_num);     \
+               lprocfs_counter_incr((obdx)->obd_stats, coffset); \
+       }
+
+#define EXP_COUNTER_INCREMENT(export, op)                                  \
+       if ((export)->exp_obd->obd_stats != NULL) {                       \
+               unsigned int coffset;                                   \
+               coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \
+                       OBD_COUNTER_OFFSET(op);                       \
+               LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num);     \
+               lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \
+               if ((export)->exp_nid_stats != NULL &&                 \
+                   (export)->exp_nid_stats->nid_stats != NULL)       \
+                       lprocfs_counter_incr(                           \
+                               (export)->exp_nid_stats->nid_stats, coffset);\
+       }
+
+#define MD_COUNTER_OFFSET(op)                             \
+       ((offsetof(struct md_ops, m_ ## op) -              \
+         offsetof(struct md_ops, m_getstatus))          \
+        / sizeof(((struct md_ops *)(0))->m_getstatus))
+
+#define MD_COUNTER_INCREMENT(obdx, op)                    \
+       if ((obdx)->md_stats != NULL) {                    \
+               unsigned int coffset;                       \
+               coffset = (unsigned int)((obdx)->md_cntr_base) + \
+                       MD_COUNTER_OFFSET(op);             \
+               LASSERT(coffset < (obdx)->md_stats->ls_num);     \
+               lprocfs_counter_incr((obdx)->md_stats, coffset); \
+       }
+
+#define EXP_MD_COUNTER_INCREMENT(export, op)                            \
+       if ((export)->exp_obd->obd_stats != NULL) {                       \
+               unsigned int coffset;                                   \
+               coffset = (unsigned int)((export)->exp_obd->md_cntr_base) +  \
+                       MD_COUNTER_OFFSET(op);                         \
+               LASSERT(coffset < (export)->exp_obd->md_stats->ls_num);      \
+               lprocfs_counter_incr((export)->exp_obd->md_stats, coffset);  \
+               if ((export)->exp_md_stats != NULL)                       \
+                       lprocfs_counter_incr(                           \
+                               (export)->exp_md_stats, coffset);           \
+       }
+
+#else
+#define OBD_COUNTER_OFFSET(op)
+#define OBD_COUNTER_INCREMENT(obd, op)
+#define EXP_COUNTER_INCREMENT(exp, op)
+#define MD_COUNTER_INCREMENT(obd, op)
+#define EXP_MD_COUNTER_INCREMENT(exp, op)
+#endif
+
+static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp)
+{
+       /* Always add in ldlm_stats */
+       tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+                                                 LPROCFS_STATS_FLAG_NOPERCPU);
+       if (tmp->nid_ldlm_stats == NULL)
+               return -ENOMEM;
+
+       lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+       return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+                                     tmp->nid_ldlm_stats);
+}
+
+#define OBD_CHECK_MD_OP(obd, op, err)                     \
+do {                                                       \
+       if (!OBT(obd) || !MDP((obd), op)) {                  \
+               if (err)                                        \
+                       CERROR("md_" #op ": dev %s/%d no operation\n", \
+                              obd->obd_name, obd->obd_minor);  \
+               RETURN(err);                                \
+       }                                                      \
+} while (0)
+
+#define EXP_CHECK_MD_OP(exp, op)                               \
+do {                                                       \
+       if ((exp) == NULL) {                                \
+               CERROR("obd_" #op ": NULL export\n");      \
+               RETURN(-ENODEV);                                \
+       }                                                      \
+       if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) {   \
+               CERROR("obd_" #op ": cleaned up obd\n");        \
+               RETURN(-EOPNOTSUPP);                        \
+       }                                                      \
+       if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \
+               CERROR("obd_" #op ": dev %s/%d no operation\n", \
+                      (exp)->exp_obd->obd_name,                \
+                      (exp)->exp_obd->obd_minor);            \
+               RETURN(-EOPNOTSUPP);                        \
+       }                                                      \
+} while (0)
+
+
+#define OBD_CHECK_DT_OP(obd, op, err)                     \
+do {                                                       \
+       if (!OBT(obd) || !OBP((obd), op)) {                  \
+               if (err)                                        \
+                       CERROR("obd_" #op ": dev %d no operation\n",    \
+                              obd->obd_minor);          \
+               RETURN(err);                                \
+       }                                                      \
+} while (0)
+
+#define EXP_CHECK_DT_OP(exp, op)                               \
+do {                                                       \
+       if ((exp) == NULL) {                                \
+               CERROR("obd_" #op ": NULL export\n");      \
+               RETURN(-ENODEV);                                \
+       }                                                      \
+       if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) {   \
+               CERROR("obd_" #op ": cleaned up obd\n");        \
+               RETURN(-EOPNOTSUPP);                        \
+       }                                                      \
+       if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \
+               CERROR("obd_" #op ": dev %d no operation\n",    \
+                      (exp)->exp_obd->obd_minor);            \
+               RETURN(-EOPNOTSUPP);                        \
+       }                                                      \
+} while (0)
+
+#define CTXT_CHECK_OP(ctxt, op, err)                            \
+do {                                                            \
+       if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) {             \
+               if (err)                                             \
+                       CERROR("lop_" #op ": dev %d no operation\n", \
+                              ctxt->loc_obd->obd_minor);           \
+               RETURN(err);                                     \
+       }                                                           \
+} while (0)
+
+static inline int class_devno_max(void)
+{
+       return MAX_OBD_DEVICES;
+}
+
+static inline int obd_get_info(const struct lu_env *env,
+                              struct obd_export *exp, __u32 keylen,
+                              void *key, __u32 *vallen, void *val,
+                              struct lov_stripe_md *lsm)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, get_info);
+       EXP_COUNTER_INCREMENT(exp, get_info);
+
+       rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val,
+                                        lsm);
+       RETURN(rc);
+}
+
+static inline int obd_set_info_async(const struct lu_env *env,
+                                    struct obd_export *exp, obd_count keylen,
+                                    void *key, obd_count vallen, void *val,
+                                    struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, set_info_async);
+       EXP_COUNTER_INCREMENT(exp, set_info_async);
+
+       rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen,
+                                              val, set);
+       RETURN(rc);
+}
+
+/*
+ * obd-lu integration.
+ *
+ * Functionality is being moved into new lu_device-based layering, but some
+ * pieces of configuration process are still based on obd devices.
+ *
+ * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully
+ * subsume ->o_setup() methods of obd devices they replace. The same for
+ * lu_device_operations::ldo_process_config() and ->o_process_config(). As a
+ * result, obd_setup() and obd_process_config() branch and call one XOR
+ * another.
+ *
+ * Yet neither lu_device_type_operations::ldto_device_fini() nor
+ * lu_device_type_operations::ldto_device_free() fully implement the
+ * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence,
+ * obd_precleanup() and obd_cleanup() call both lu_device and obd operations.
+ */
+
+#define DECLARE_LU_VARS(ldt, d)                 \
+       struct lu_device_type *ldt;       \
+       struct lu_device *d
+
+static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+       int rc;
+       DECLARE_LU_VARS(ldt, d);
+       ENTRY;
+
+       ldt = obd->obd_type->typ_lu;
+       if (ldt != NULL) {
+               struct lu_context  session_ctx;
+               struct lu_env env;
+               lu_context_init(&session_ctx, LCT_SESSION);
+               session_ctx.lc_thread = NULL;
+               lu_context_enter(&session_ctx);
+
+               rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+               if (rc == 0) {
+                       env.le_ses = &session_ctx;
+                       d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
+                       lu_env_fini(&env);
+                       if (!IS_ERR(d)) {
+                               obd->obd_lu_dev = d;
+                               d->ld_obd = obd;
+                               rc = 0;
+                       } else
+                               rc = PTR_ERR(d);
+               }
+               lu_context_exit(&session_ctx);
+               lu_context_fini(&session_ctx);
+
+       } else {
+               OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP);
+               OBD_COUNTER_INCREMENT(obd, setup);
+               rc = OBP(obd, setup)(obd, cfg);
+       }
+       RETURN(rc);
+}
+
+static inline int obd_precleanup(struct obd_device *obd,
+                                enum obd_cleanup_stage cleanup_stage)
+{
+       int rc;
+       DECLARE_LU_VARS(ldt, d);
+       ENTRY;
+
+       OBD_CHECK_DEV(obd);
+       ldt = obd->obd_type->typ_lu;
+       d = obd->obd_lu_dev;
+       if (ldt != NULL && d != NULL) {
+               if (cleanup_stage == OBD_CLEANUP_EXPORTS) {
+                       struct lu_env env;
+
+                       rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+                       if (rc == 0) {
+                               ldt->ldt_ops->ldto_device_fini(&env, d);
+                               lu_env_fini(&env);
+                       }
+               }
+       }
+       OBD_CHECK_DT_OP(obd, precleanup, 0);
+       OBD_COUNTER_INCREMENT(obd, precleanup);
+
+       rc = OBP(obd, precleanup)(obd, cleanup_stage);
+       RETURN(rc);
+}
+
+static inline int obd_cleanup(struct obd_device *obd)
+{
+       int rc;
+       DECLARE_LU_VARS(ldt, d);
+       ENTRY;
+
+       OBD_CHECK_DEV(obd);
+
+       ldt = obd->obd_type->typ_lu;
+       d = obd->obd_lu_dev;
+       if (ldt != NULL && d != NULL) {
+               struct lu_env env;
+
+               rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+               if (rc == 0) {
+                       ldt->ldt_ops->ldto_device_free(&env, d);
+                       lu_env_fini(&env);
+                       obd->obd_lu_dev = NULL;
+               }
+       }
+       OBD_CHECK_DT_OP(obd, cleanup, 0);
+       OBD_COUNTER_INCREMENT(obd, cleanup);
+
+       rc = OBP(obd, cleanup)(obd);
+       RETURN(rc);
+}
+
+static inline void obd_cleanup_client_import(struct obd_device *obd)
+{
+       ENTRY;
+
+       /* If we set up but never connected, the
+          client import will not have been cleaned. */
+       down_write(&obd->u.cli.cl_sem);
+       if (obd->u.cli.cl_import) {
+               struct obd_import *imp;
+               imp = obd->u.cli.cl_import;
+               CDEBUG(D_CONFIG, "%s: client import never connected\n",
+                      obd->obd_name);
+               ptlrpc_invalidate_import(imp);
+               if (imp->imp_rq_pool) {
+                       ptlrpc_free_rq_pool(imp->imp_rq_pool);
+                       imp->imp_rq_pool = NULL;
+               }
+               client_destroy_import(imp);
+               obd->u.cli.cl_import = NULL;
+       }
+       up_write(&obd->u.cli.cl_sem);
+
+       EXIT;
+}
+
+static inline int
+obd_process_config(struct obd_device *obd, int datalen, void *data)
+{
+       int rc;
+       DECLARE_LU_VARS(ldt, d);
+       ENTRY;
+
+       OBD_CHECK_DEV(obd);
+
+       obd->obd_process_conf = 1;
+       ldt = obd->obd_type->typ_lu;
+       d = obd->obd_lu_dev;
+       if (ldt != NULL && d != NULL) {
+               struct lu_env env;
+
+               rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+               if (rc == 0) {
+                       rc = d->ld_ops->ldo_process_config(&env, d, data);
+                       lu_env_fini(&env);
+               }
+       } else {
+               OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP);
+               rc = OBP(obd, process_config)(obd, datalen, data);
+       }
+       OBD_COUNTER_INCREMENT(obd, process_config);
+       obd->obd_process_conf = 0;
+
+       RETURN(rc);
+}
+
+/* Pack an in-memory MD struct for storage on disk.
+ * Returns +ve size of packed MD (0 for free), or -ve error.
+ *
+ * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL).
+ * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed.
+ * If @*disk_tgt == NULL, it will be allocated
+ */
+static inline int obd_packmd(struct obd_export *exp,
+                            struct lov_mds_md **disk_tgt,
+                            struct lov_stripe_md *mem_src)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, packmd);
+       EXP_COUNTER_INCREMENT(exp, packmd);
+
+       rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src);
+       RETURN(rc);
+}
+
+static inline int obd_size_diskmd(struct obd_export *exp,
+                                 struct lov_stripe_md *mem_src)
+{
+       return obd_packmd(exp, NULL, mem_src);
+}
+
+/* helper functions */
+static inline int obd_alloc_diskmd(struct obd_export *exp,
+                                  struct lov_mds_md **disk_tgt)
+{
+       LASSERT(disk_tgt);
+       LASSERT(*disk_tgt == NULL);
+       return obd_packmd(exp, disk_tgt, NULL);
+}
+
+static inline int obd_free_diskmd(struct obd_export *exp,
+                                 struct lov_mds_md **disk_tgt)
+{
+       LASSERT(disk_tgt);
+       LASSERT(*disk_tgt);
+       /*
+        * LU-2590: for the caller's convenience, *disk_tgt may be in host
+        * endianness, so it needs to be swabbed to LE if necessary.  Only
+        * the lov_mds_md header needs swabbing here, to figure out how
+        * much memory has to be freed.
+        */
+       if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+           (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) ||
+            ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3)))
+               lustre_swab_lov_mds_md(*disk_tgt);
+       return obd_packmd(exp, disk_tgt, NULL);
+}
+
+/* Unpack an MD struct from disk to in-memory format.
+ * Returns +ve size of unpacked MD (0 for free), or -ve error.
+ *
+ * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL).
+ * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed.
+ * If @*mem_tgt == NULL, it will be allocated
+ */
+static inline int obd_unpackmd(struct obd_export *exp,
+                              struct lov_stripe_md **mem_tgt,
+                              struct lov_mds_md *disk_src,
+                              int disk_len)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, unpackmd);
+       EXP_COUNTER_INCREMENT(exp, unpackmd);
+
+       rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len);
+       RETURN(rc);
+}
+
+/* helper functions */
+static inline int obd_alloc_memmd(struct obd_export *exp,
+                                 struct lov_stripe_md **mem_tgt)
+{
+       LASSERT(mem_tgt);
+       LASSERT(*mem_tgt == NULL);
+       return obd_unpackmd(exp, mem_tgt, NULL, 0);
+}
+
+static inline int obd_free_memmd(struct obd_export *exp,
+                                struct lov_stripe_md **mem_tgt)
+{
+       int rc;
+
+       LASSERT(mem_tgt);
+       LASSERT(*mem_tgt);
+       rc = obd_unpackmd(exp, mem_tgt, NULL, 0);
+       *mem_tgt = NULL;
+       return rc;
+}
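+
+/*
+ * Typical usage sketch for the helpers above (lsm is a hypothetical
+ * local variable):
+ *
+ *     struct lov_stripe_md *lsm = NULL;
+ *     int rc = obd_alloc_memmd(exp, &lsm);
+ *     if (rc < 0)
+ *             return rc;
+ *     ... use lsm ...
+ *     obd_free_memmd(exp, &lsm);      (lsm is reset to NULL)
+ */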
+
+static inline int obd_precreate(struct obd_export *exp)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, precreate);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, precreate);
+
+       rc = OBP(exp->exp_obd, precreate)(exp);
+       RETURN(rc);
+}
+
+static inline int obd_create_async(struct obd_export *exp,
+                                  struct obd_info *oinfo,
+                                  struct lov_stripe_md **ea,
+                                  struct obd_trans_info *oti)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, create_async);
+       EXP_COUNTER_INCREMENT(exp, create_async);
+
+       rc = OBP(exp->exp_obd, create_async)(exp, oinfo, ea, oti);
+       RETURN(rc);
+}
+
+static inline int obd_create(const struct lu_env *env, struct obd_export *exp,
+                            struct obdo *obdo, struct lov_stripe_md **ea,
+                            struct obd_trans_info *oti)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, create);
+       EXP_COUNTER_INCREMENT(exp, create);
+
+       rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti);
+       RETURN(rc);
+}
+
+static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp,
+                             struct obdo *obdo, struct lov_stripe_md *ea,
+                             struct obd_trans_info *oti,
+                             struct obd_export *md_exp, void *capa)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, destroy);
+       EXP_COUNTER_INCREMENT(exp, destroy);
+
+       rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa);
+       RETURN(rc);
+}
+
+static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp,
+                             struct obd_info *oinfo)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, getattr);
+       EXP_COUNTER_INCREMENT(exp, getattr);
+
+       rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo);
+       RETURN(rc);
+}
+
+static inline int obd_getattr_async(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, getattr_async);
+       EXP_COUNTER_INCREMENT(exp, getattr_async);
+
+       rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set);
+       RETURN(rc);
+}
+
+static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp,
+                             struct obd_info *oinfo,
+                             struct obd_trans_info *oti)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, setattr);
+       EXP_COUNTER_INCREMENT(exp, setattr);
+
+       rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti);
+       RETURN(rc);
+}
+
+/* This performs all the requests set init/wait/destroy actions. */
+static inline int obd_setattr_rqset(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct obd_trans_info *oti)
+{
+       struct ptlrpc_request_set *set = NULL;
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, setattr_async);
+       EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+       set =  ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+/* This adds all the requests into @set if @set != NULL, otherwise
+   all requests are sent asynchronously without waiting for response. */
+static inline int obd_setattr_async(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct obd_trans_info *oti,
+                                   struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, setattr_async);
+       EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+       rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+       RETURN(rc);
+}
+
+static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+                              int priority)
+{
+       struct obd_device *obd = imp->imp_obd;
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DEV_ACTIVE(obd);
+       OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, add_conn);
+
+       rc = OBP(obd, add_conn)(imp, uuid, priority);
+       RETURN(rc);
+}
+
+static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+       struct obd_device *obd = imp->imp_obd;
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DEV_ACTIVE(obd);
+       OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, del_conn);
+
+       rc = OBP(obd, del_conn)(imp, uuid);
+       RETURN(rc);
+}
+
+static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp)
+{
+       struct obd_uuid *uuid;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL);
+       EXP_COUNTER_INCREMENT(exp, get_uuid);
+
+       uuid = OBP(exp->exp_obd, get_uuid)(exp);
+       RETURN(uuid);
+}
+
+/** Create a new \a exp on device \a obd for the uuid \a cluuid
+ * @param exp New export handle
+ * @param data Connect data; supported flags are set, and flags also
+ *    understood by the obd are returned.
+ */
+static inline int obd_connect(const struct lu_env *env,
+                             struct obd_export **exp,struct obd_device *obd,
+                             struct obd_uuid *cluuid,
+                             struct obd_connect_data *data,
+                             void *localdata)
+{
+       int rc;
+       __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition
+                                                  * check */
+       ENTRY;
+
+       OBD_CHECK_DEV_ACTIVE(obd);
+       OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, connect);
+
+       rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata);
+       /* check that only subset is granted */
+       LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) ==
+                                   data->ocd_connect_flags));
+       RETURN(rc);
+}
+
+static inline int obd_reconnect(const struct lu_env *env,
+                               struct obd_export *exp,
+                               struct obd_device *obd,
+                               struct obd_uuid *cluuid,
+                               struct obd_connect_data *d,
+                               void *localdata)
+{
+       int rc;
+       __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition
+                                                  * check */
+
+       ENTRY;
+
+       OBD_CHECK_DEV_ACTIVE(obd);
+       OBD_CHECK_DT_OP(obd, reconnect, 0);
+       OBD_COUNTER_INCREMENT(obd, reconnect);
+
+       rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata);
+       /* check that only subset is granted */
+       LASSERT(ergo(d != NULL,
+                    (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
+       RETURN(rc);
+}
+
+static inline int obd_disconnect(struct obd_export *exp)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, disconnect);
+       EXP_COUNTER_INCREMENT(exp, disconnect);
+
+       rc = OBP(exp->exp_obd, disconnect)(exp);
+       RETURN(rc);
+}
+
+static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp,
+                              enum lu_cli_type type)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, fid_init, 0);
+       OBD_COUNTER_INCREMENT(obd, fid_init);
+
+       rc = OBP(obd, fid_init)(obd, exp, type);
+       RETURN(rc);
+}
+
+static inline int obd_fid_fini(struct obd_device *obd)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, fid_fini, 0);
+       OBD_COUNTER_INCREMENT(obd, fid_fini);
+
+       rc = OBP(obd, fid_fini)(obd);
+       RETURN(rc);
+}
+
+static inline int obd_fid_alloc(struct obd_export *exp,
+                               struct lu_fid *fid,
+                               struct md_op_data *op_data)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, fid_alloc);
+       EXP_COUNTER_INCREMENT(exp, fid_alloc);
+
+       rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data);
+       RETURN(rc);
+}
+
+static inline int obd_ping(const struct lu_env *env, struct obd_export *exp)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, ping, 0);
+       EXP_COUNTER_INCREMENT(exp, ping);
+
+       rc = OBP(exp->exp_obd, ping)(env, exp);
+       RETURN(rc);
+}
+
+static inline int obd_pool_new(struct obd_device *obd, char *poolname)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, pool_new);
+
+       rc = OBP(obd, pool_new)(obd, poolname);
+       RETURN(rc);
+}
+
+static inline int obd_pool_del(struct obd_device *obd, char *poolname)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, pool_del);
+
+       rc = OBP(obd, pool_del)(obd, poolname);
+       RETURN(rc);
+}
+
+static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, pool_add);
+
+       rc = OBP(obd, pool_add)(obd, poolname, ostname);
+       RETURN(rc);
+}
+
+static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, pool_rem);
+
+       rc = OBP(obd, pool_rem)(obd, poolname, ostname);
+       RETURN(rc);
+}
+
+static inline void obd_getref(struct obd_device *obd)
+{
+       ENTRY;
+       if (OBT(obd) && OBP(obd, getref)) {
+               OBD_COUNTER_INCREMENT(obd, getref);
+               OBP(obd, getref)(obd);
+       }
+       EXIT;
+}
+
+static inline void obd_putref(struct obd_device *obd)
+{
+       ENTRY;
+       if (OBT(obd) && OBP(obd, putref)) {
+               OBD_COUNTER_INCREMENT(obd, putref);
+               OBP(obd, putref)(obd);
+       }
+       EXIT;
+}
+
+static inline int obd_init_export(struct obd_export *exp)
+{
+       int rc = 0;
+
+       ENTRY;
+       if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) &&
+           OBP((exp)->exp_obd, init_export))
+               rc = OBP(exp->exp_obd, init_export)(exp);
+       RETURN(rc);
+}
+
+static inline int obd_destroy_export(struct obd_export *exp)
+{
+       ENTRY;
+       if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) &&
+           OBP((exp)->exp_obd, destroy_export))
+               OBP(exp->exp_obd, destroy_export)(exp);
+       RETURN(0);
+}
+
+static inline int obd_extent_calc(struct obd_export *exp,
+                                 struct lov_stripe_md *md,
+                                 int cmd, obd_off *offset)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_DT_OP(exp, extent_calc);
+       rc = OBP(exp->exp_obd, extent_calc)(exp, md, cmd, offset);
+       RETURN(rc);
+}
+
+static inline struct dentry *
+obd_lvfs_fid2dentry(struct obd_export *exp, struct ost_id *oi, __u32 gen)
+{
+       struct lvfs_run_ctxt *ctxt = &exp->exp_obd->obd_lvfs_ctxt;
+       LASSERT(exp->exp_obd);
+
+       return ctxt->cb_ops.l_fid2dentry(ostid_id(oi), gen, ostid_seq(oi),
+                                        exp->exp_obd);
+}
+
+/* @max_age is the oldest time in jiffies for which we accept using cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target.  Use a value of "cfs_time_current() + HZ" to guarantee freshness. */
+static inline int obd_statfs_async(struct obd_export *exp,
+                                  struct obd_info *oinfo,
+                                  __u64 max_age,
+                                  struct ptlrpc_request_set *rqset)
+{
+       int rc = 0;
+       struct obd_device *obd;
+       ENTRY;
+
+       if (exp == NULL || exp->exp_obd == NULL)
+               RETURN(-EINVAL);
+
+       obd = exp->exp_obd;
+       OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, statfs);
+
+       CDEBUG(D_SUPER, "%s: osfs %p age "LPU64", max_age "LPU64"\n",
+              obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age);
+       if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+               rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset);
+       } else {
+               CDEBUG(D_SUPER,"%s: use %p cache blocks "LPU64"/"LPU64
+                      " objects "LPU64"/"LPU64"\n",
+                      obd->obd_name, &obd->obd_osfs,
+                      obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+                      obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+               spin_lock(&obd->obd_osfs_lock);
+               memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
+               spin_unlock(&obd->obd_osfs_lock);
+               oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
+               if (oinfo->oi_cb_up)
+                       oinfo->oi_cb_up(oinfo, 0);
+       }
+       RETURN(rc);
+}
+
+static inline int obd_statfs_rqset(struct obd_export *exp,
+                                  struct obd_statfs *osfs, __u64 max_age,
+                                  __u32 flags)
+{
+       struct ptlrpc_request_set *set = NULL;
+       struct obd_info oinfo = { { { 0 } } };
+       int rc = 0;
+       ENTRY;
+
+       set =  ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       oinfo.oi_osfs = osfs;
+       oinfo.oi_flags = flags;
+       rc = obd_statfs_async(exp, &oinfo, max_age, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+/* @max_age is the oldest time in jiffies for which we accept using cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target.  Use a value of "cfs_time_current() + HZ" to guarantee freshness. */
+static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp,
+                            struct obd_statfs *osfs, __u64 max_age,
+                            __u32 flags)
+{
+       int rc = 0;
+       struct obd_device *obd = exp->exp_obd;
+       ENTRY;
+
+       if (obd == NULL)
+               RETURN(-EINVAL);
+
+       OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, statfs);
+
+       CDEBUG(D_SUPER, "osfs "LPU64", max_age "LPU64"\n",
+              obd->obd_osfs_age, max_age);
+       if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+               rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags);
+               if (rc == 0) {
+                       spin_lock(&obd->obd_osfs_lock);
+                       memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs));
+                       obd->obd_osfs_age = cfs_time_current_64();
+                       spin_unlock(&obd->obd_osfs_lock);
+               }
+       } else {
+               CDEBUG(D_SUPER, "%s: use %p cache blocks "LPU64"/"LPU64
+                      " objects "LPU64"/"LPU64"\n",
+                      obd->obd_name, &obd->obd_osfs,
+                      obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+                      obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+               spin_lock(&obd->obd_osfs_lock);
+               memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
+               spin_unlock(&obd->obd_osfs_lock);
+       }
+       RETURN(rc);
+}
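+
+/*
+ * Callers trade freshness for RPC cost via @max_age.  A sketch (the
+ * one-second window is illustrative):
+ *
+ *     rc = obd_statfs(env, exp, &osfs, cfs_time_current_64() - HZ, 0);
+ *
+ * accepts cached data up to roughly one second old, while passing
+ * "cfs_time_current_64() + HZ" forces a fresh request as noted above.
+ */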
+
+static inline int obd_sync_rqset(struct obd_export *exp, struct obd_info *oinfo,
+                                obd_size start, obd_size end)
+{
+       struct ptlrpc_request_set *set = NULL;
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+       EXP_COUNTER_INCREMENT(exp, sync);
+
+       set =  ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       rc = OBP(exp->exp_obd, sync)(NULL, exp, oinfo, start, end, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+static inline int obd_sync(const struct lu_env *env, struct obd_export *exp,
+                          struct obd_info *oinfo, obd_size start, obd_size end,
+                          struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+       EXP_COUNTER_INCREMENT(exp, sync);
+
+       rc = OBP(exp->exp_obd, sync)(env, exp, oinfo, start, end, set);
+       RETURN(rc);
+}
+
+static inline int obd_punch_rqset(struct obd_export *exp,
+                                 struct obd_info *oinfo,
+                                 struct obd_trans_info *oti)
+{
+       struct ptlrpc_request_set *set = NULL;
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, punch);
+       EXP_COUNTER_INCREMENT(exp, punch);
+
+       set =  ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       rc = OBP(exp->exp_obd, punch)(NULL, exp, oinfo, oti, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+static inline int obd_punch(const struct lu_env *env, struct obd_export *exp,
+                           struct obd_info *oinfo, struct obd_trans_info *oti,
+                           struct ptlrpc_request_set *rqset)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, punch);
+       EXP_COUNTER_INCREMENT(exp, punch);
+
+       rc = OBP(exp->exp_obd, punch)(env, exp, oinfo, oti, rqset);
+       RETURN(rc);
+}
+
+static inline int obd_brw(int cmd, struct obd_export *exp,
+                         struct obd_info *oinfo, obd_count oa_bufs,
+                         struct brw_page *pg, struct obd_trans_info *oti)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, brw);
+       EXP_COUNTER_INCREMENT(exp, brw);
+
+       if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) {
+               CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, "
+                      "or OBD_BRW_CHECK\n");
+               LBUG();
+       }
+
+       rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti);
+       RETURN(rc);
+}
+
+static inline int obd_preprw(const struct lu_env *env, int cmd,
+                            struct obd_export *exp, struct obdo *oa,
+                            int objcount, struct obd_ioobj *obj,
+                            struct niobuf_remote *remote, int *pages,
+                            struct niobuf_local *local,
+                            struct obd_trans_info *oti,
+                            struct lustre_capa *capa)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, preprw);
+       EXP_COUNTER_INCREMENT(exp, preprw);
+
+       rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote,
+                                      pages, local, oti, capa);
+       RETURN(rc);
+}
+
+static inline int obd_commitrw(const struct lu_env *env, int cmd,
+                              struct obd_export *exp, struct obdo *oa,
+                              int objcount, struct obd_ioobj *obj,
+                              struct niobuf_remote *rnb, int pages,
+                              struct niobuf_local *local,
+                              struct obd_trans_info *oti, int rc)
+{
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, commitrw);
+       EXP_COUNTER_INCREMENT(exp, commitrw);
+
+       rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj,
+                                        rnb, pages, local, oti, rc);
+       RETURN(rc);
+}
+
+static inline int obd_merge_lvb(struct obd_export *exp,
+                               struct lov_stripe_md *lsm,
+                               struct ost_lvb *lvb, int kms_only)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, merge_lvb);
+       EXP_COUNTER_INCREMENT(exp, merge_lvb);
+
+       rc = OBP(exp->exp_obd, merge_lvb)(exp, lsm, lvb, kms_only);
+       RETURN(rc);
+}
+
+static inline int obd_adjust_kms(struct obd_export *exp,
+                                struct lov_stripe_md *lsm, obd_off size,
+                                int shrink)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, adjust_kms);
+       EXP_COUNTER_INCREMENT(exp, adjust_kms);
+
+       rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink);
+       RETURN(rc);
+}
+
+static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp,
+                               int len, void *karg, void *uarg)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, iocontrol);
+       EXP_COUNTER_INCREMENT(exp, iocontrol);
+
+       rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg);
+       RETURN(rc);
+}
+
+static inline int obd_enqueue_rqset(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct ldlm_enqueue_info *einfo)
+{
+       struct ptlrpc_request_set *set = NULL;
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, enqueue);
+       EXP_COUNTER_INCREMENT(exp, enqueue);
+
+       set = ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+static inline int obd_enqueue(struct obd_export *exp,
+                             struct obd_info *oinfo,
+                             struct ldlm_enqueue_info *einfo,
+                             struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, enqueue);
+       EXP_COUNTER_INCREMENT(exp, enqueue);
+
+       rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+       RETURN(rc);
+}
+
+static inline int obd_change_cbdata(struct obd_export *exp,
+                                   struct lov_stripe_md *lsm,
+                                   ldlm_iterator_t it, void *data)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, change_cbdata);
+       EXP_COUNTER_INCREMENT(exp, change_cbdata);
+
+       rc = OBP(exp->exp_obd, change_cbdata)(exp, lsm, it, data);
+       RETURN(rc);
+}
+
+static inline int obd_find_cbdata(struct obd_export *exp,
+                                 struct lov_stripe_md *lsm,
+                                 ldlm_iterator_t it, void *data)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, find_cbdata);
+       EXP_COUNTER_INCREMENT(exp, find_cbdata);
+
+       rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data);
+       RETURN(rc);
+}
+
+static inline int obd_cancel(struct obd_export *exp,
+                            struct lov_stripe_md *ea, __u32 mode,
+                            struct lustre_handle *lockh)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, cancel);
+       EXP_COUNTER_INCREMENT(exp, cancel);
+
+       rc = OBP(exp->exp_obd, cancel)(exp, ea, mode, lockh);
+       RETURN(rc);
+}
+
+static inline int obd_cancel_unused(struct obd_export *exp,
+                                   struct lov_stripe_md *ea,
+                                   ldlm_cancel_flags_t flags,
+                                   void *opaque)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, cancel_unused);
+       EXP_COUNTER_INCREMENT(exp, cancel_unused);
+
+       rc = OBP(exp->exp_obd, cancel_unused)(exp, ea, flags, opaque);
+       RETURN(rc);
+}
+
+static inline int obd_pin(struct obd_export *exp, const struct lu_fid *fid,
+                         struct obd_capa *oc, struct obd_client_handle *handle,
+                         int flag)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, pin);
+       EXP_COUNTER_INCREMENT(exp, pin);
+
+       rc = OBP(exp->exp_obd, pin)(exp, fid, oc, handle, flag);
+       RETURN(rc);
+}
+
+static inline int obd_unpin(struct obd_export *exp,
+                           struct obd_client_handle *handle, int flag)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, unpin);
+       EXP_COUNTER_INCREMENT(exp, unpin);
+
+       rc = OBP(exp->exp_obd, unpin)(exp, handle, flag);
+       RETURN(rc);
+}
+
+
+static inline void obd_import_event(struct obd_device *obd,
+                                   struct obd_import *imp,
+                                   enum obd_import_event event)
+{
+       ENTRY;
+       if (!obd) {
+               CERROR("NULL device\n");
+               EXIT;
+               return;
+       }
+       if (obd->obd_set_up && OBP(obd, import_event)) {
+               OBD_COUNTER_INCREMENT(obd, import_event);
+               OBP(obd, import_event)(obd, imp, event);
+       }
+       EXIT;
+}
+
+static inline int obd_llog_connect(struct obd_export *exp,
+                                  struct llogd_conn_body *body)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, llog_connect, 0);
+       EXP_COUNTER_INCREMENT(exp, llog_connect);
+
+       rc = OBP(exp->exp_obd, llog_connect)(exp, body);
+       RETURN(rc);
+}
+
+
+static inline int obd_notify(struct obd_device *obd,
+                            struct obd_device *watched,
+                            enum obd_notify_event ev,
+                            void *data)
+{
+       int rc;
+       ENTRY;
+       OBD_CHECK_DEV(obd);
+
+       /* the check for async_recov is a complete hack - I'm hereby
+          overloading the meaning to also mean "this was called from
+          mds_postsetup".  I know that my mds is able to handle notifies
+          by this point, and it needs to get them to execute mds_postrecov. */
+       if (!obd->obd_set_up && !obd->obd_async_recov) {
+               CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name);
+               RETURN(-EINVAL);
+       }
+
+       if (!OBP(obd, notify)) {
+               CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name);
+               RETURN(-ENOSYS);
+       }
+
+       OBD_COUNTER_INCREMENT(obd, notify);
+       rc = OBP(obd, notify)(obd, watched, ev, data);
+       RETURN(rc);
+}
+
+static inline int obd_notify_observer(struct obd_device *observer,
+                                     struct obd_device *observed,
+                                     enum obd_notify_event ev,
+                                     void *data)
+{
+       int rc1;
+       int rc2;
+
+       struct obd_notify_upcall *onu;
+
+       if (observer->obd_observer)
+               rc1 = obd_notify(observer->obd_observer, observed, ev, data);
+       else
+               rc1 = 0;
+       /*
+        * Also call the non-obd listener, if any.
+        */
+       onu = &observer->obd_upcall;
+       if (onu->onu_upcall != NULL)
+               rc2 = onu->onu_upcall(observer, observed, ev,
+                                     onu->onu_owner, NULL);
+       else
+               rc2 = 0;
+
+       return rc1 ? rc1 : rc2;
+}
+
+static inline int obd_quotacheck(struct obd_export *exp,
+                                struct obd_quotactl *oqctl)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, quotacheck);
+       EXP_COUNTER_INCREMENT(exp, quotacheck);
+
+       rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl);
+       RETURN(rc);
+}
+
+static inline int obd_quotactl(struct obd_export *exp,
+                              struct obd_quotactl *oqctl)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, quotactl);
+       EXP_COUNTER_INCREMENT(exp, quotactl);
+
+       rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl);
+       RETURN(rc);
+}
+
+static inline int obd_health_check(const struct lu_env *env,
+                                  struct obd_device *obd)
+{
+       /* returns: 0 on healthy
+        *       >0 on unhealthy + reason code/flag
+        *          however the only supported reason == 1 right now
+        *          We'll need to define some better reasons
+        *          or flags in the future.
+        *       <0 on error
+        */
+       int rc;
+       ENTRY;
+
+       /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */
+       if (obd == NULL || !OBT(obd)) {
+               CERROR("cleaned up obd\n");
+               RETURN(-EOPNOTSUPP);
+       }
+       if (!obd->obd_set_up || obd->obd_stopping)
+               RETURN(0);
+       if (!OBP(obd, health_check))
+               RETURN(0);
+
+       rc = OBP(obd, health_check)(env, obd);
+       RETURN(rc);
+}
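+
+/* Illustrative (hypothetical) caller following the convention above:
+ *
+ *     rc = obd_health_check(env, obd);
+ *     if (rc < 0)
+ *             CERROR("%s: health check error: rc = %d\n", obd->obd_name, rc);
+ *     else if (rc > 0)
+ *             CWARN("%s: not healthy\n", obd->obd_name);
+ */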
+
+static inline int obd_register_observer(struct obd_device *obd,
+                                       struct obd_device *observer)
+{
+       ENTRY;
+       OBD_CHECK_DEV(obd);
+       down_write(&obd->obd_observer_link_sem);
+       if (obd->obd_observer && observer) {
+               up_write(&obd->obd_observer_link_sem);
+               RETURN(-EALREADY);
+       }
+       obd->obd_observer = observer;
+       up_write(&obd->obd_observer_link_sem);
+       RETURN(0);
+}
+
+static inline int obd_pin_observer(struct obd_device *obd,
+                                  struct obd_device **observer)
+{
+       ENTRY;
+       down_read(&obd->obd_observer_link_sem);
+       if (!obd->obd_observer) {
+               *observer = NULL;
+               up_read(&obd->obd_observer_link_sem);
+               RETURN(-ENOENT);
+       }
+       *observer = obd->obd_observer;
+       RETURN(0);
+}
+
+static inline int obd_unpin_observer(struct obd_device *obd)
+{
+       ENTRY;
+       up_read(&obd->obd_observer_link_sem);
+       RETURN(0);
+}
+
+#if 0
+static inline int obd_register_page_removal_cb(struct obd_export *exp,
+                                              obd_page_removal_cb_t cb,
+                                              obd_pin_extent_cb pin_cb)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb);
+
+       rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb);
+       RETURN(rc);
+}
+
+static inline int obd_unregister_page_removal_cb(struct obd_export *exp,
+                                                obd_page_removal_cb_t cb)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb);
+
+       rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb);
+       RETURN(rc);
+}
+
+static inline int obd_register_lock_cancel_cb(struct obd_export *exp,
+                                             obd_lock_cancel_cb cb)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb);
+
+       rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb);
+       RETURN(rc);
+}
+
+static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp,
+                                                obd_lock_cancel_cb cb)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb);
+
+       rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb);
+       RETURN(rc);
+}
+#endif
+
+/* metadata helpers */
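+/* Each md_*() helper below is a thin dispatcher: it checks that the export
+ * provides the corresponding md_ops method via EXP_CHECK_MD_OP(), bumps the
+ * per-export stats counter, and forwards the call through MDP() to the
+ * active metadata client (typically the MDC or LMV layer). */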
+static inline int md_getstatus(struct obd_export *exp,
+                              struct lu_fid *fid, struct obd_capa **pc)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_MD_OP(exp, getstatus);
+       EXP_MD_COUNTER_INCREMENT(exp, getstatus);
+       rc = MDP(exp->exp_obd, getstatus)(exp, fid, pc);
+       RETURN(rc);
+}
+
+static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data,
+                            struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, getattr);
+       EXP_MD_COUNTER_INCREMENT(exp, getattr);
+       rc = MDP(exp->exp_obd, getattr)(exp, op_data, request);
+       RETURN(rc);
+}
+
+static inline int md_null_inode(struct obd_export *exp,
+                                  const struct lu_fid *fid)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, null_inode);
+       EXP_MD_COUNTER_INCREMENT(exp, null_inode);
+       rc = MDP(exp->exp_obd, null_inode)(exp, fid);
+       RETURN(rc);
+}
+
+static inline int md_find_cbdata(struct obd_export *exp,
+                                const struct lu_fid *fid,
+                                ldlm_iterator_t it, void *data)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, find_cbdata);
+       EXP_MD_COUNTER_INCREMENT(exp, find_cbdata);
+       rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data);
+       RETURN(rc);
+}
+
+static inline int md_close(struct obd_export *exp, struct md_op_data *op_data,
+                          struct md_open_data *mod,
+                          struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, close);
+       EXP_MD_COUNTER_INCREMENT(exp, close);
+       rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request);
+       RETURN(rc);
+}
+
+static inline int md_create(struct obd_export *exp, struct md_op_data *op_data,
+                           const void *data, int datalen, int mode, __u32 uid,
+                           __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+                           struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, create);
+       EXP_MD_COUNTER_INCREMENT(exp, create);
+       rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode,
+                                      uid, gid, cap_effective, rdev, request);
+       RETURN(rc);
+}
+
+static inline int md_done_writing(struct obd_export *exp,
+                                 struct md_op_data *op_data,
+                                 struct md_open_data *mod)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, done_writing);
+       EXP_MD_COUNTER_INCREMENT(exp, done_writing);
+       rc = MDP(exp->exp_obd, done_writing)(exp, op_data, mod);
+       RETURN(rc);
+}
+
+static inline int md_enqueue(struct obd_export *exp,
+                            struct ldlm_enqueue_info *einfo,
+                            struct lookup_intent *it,
+                            struct md_op_data *op_data,
+                            struct lustre_handle *lockh,
+                            void *lmm, int lmmsize,
+                            struct ptlrpc_request **req,
+                            int extra_lock_flags)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, enqueue);
+       EXP_MD_COUNTER_INCREMENT(exp, enqueue);
+       rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh,
+                                       lmm, lmmsize, req, extra_lock_flags);
+       RETURN(rc);
+}
+
+static inline int md_getattr_name(struct obd_export *exp,
+                                 struct md_op_data *op_data,
+                                 struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, getattr_name);
+       EXP_MD_COUNTER_INCREMENT(exp, getattr_name);
+       rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request);
+       RETURN(rc);
+}
+
+static inline int md_intent_lock(struct obd_export *exp,
+                                struct md_op_data *op_data, void *lmm,
+                                int lmmsize, struct lookup_intent *it,
+                                int lookup_flags, struct ptlrpc_request **reqp,
+                                ldlm_blocking_callback cb_blocking,
+                                __u64 extra_lock_flags)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, intent_lock);
+       EXP_MD_COUNTER_INCREMENT(exp, intent_lock);
+       rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize,
+                                           it, lookup_flags, reqp, cb_blocking,
+                                           extra_lock_flags);
+       RETURN(rc);
+}
+
+static inline int md_link(struct obd_export *exp, struct md_op_data *op_data,
+                         struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, link);
+       EXP_MD_COUNTER_INCREMENT(exp, link);
+       rc = MDP(exp->exp_obd, link)(exp, op_data, request);
+       RETURN(rc);
+}
+
+static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data,
+                           const char *old, int oldlen, const char *new,
+                           int newlen, struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, rename);
+       EXP_MD_COUNTER_INCREMENT(exp, rename);
+       rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new,
+                                      newlen, request);
+       RETURN(rc);
+}
+
+static inline int md_is_subdir(struct obd_export *exp,
+                              const struct lu_fid *pfid,
+                              const struct lu_fid *cfid,
+                              struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, is_subdir);
+       EXP_MD_COUNTER_INCREMENT(exp, is_subdir);
+       rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request);
+       RETURN(rc);
+}
+
+static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data,
+                            void *ea, int ealen, void *ea2, int ea2len,
+                            struct ptlrpc_request **request,
+                            struct md_open_data **mod)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, setattr);
+       EXP_MD_COUNTER_INCREMENT(exp, setattr);
+       rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen,
+                                       ea2, ea2len, request, mod);
+       RETURN(rc);
+}
+
+static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid,
+                         struct obd_capa *oc, struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, sync);
+       EXP_MD_COUNTER_INCREMENT(exp, sync);
+       rc = MDP(exp->exp_obd, sync)(exp, fid, oc, request);
+       RETURN(rc);
+}
+
+static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata,
+                             struct page **pages,
+                             struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, readpage);
+       EXP_MD_COUNTER_INCREMENT(exp, readpage);
+       rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request);
+       RETURN(rc);
+}
+
+static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data,
+                           struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, unlink);
+       EXP_MD_COUNTER_INCREMENT(exp, unlink);
+       rc = MDP(exp->exp_obd, unlink)(exp, op_data, request);
+       RETURN(rc);
+}
+
+static inline int md_get_lustre_md(struct obd_export *exp,
+                                  struct ptlrpc_request *req,
+                                  struct obd_export *dt_exp,
+                                  struct obd_export *md_exp,
+                                  struct lustre_md *md)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, get_lustre_md);
+       EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md);
+       RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md));
+}
+
+static inline int md_free_lustre_md(struct obd_export *exp,
+                                   struct lustre_md *md)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, free_lustre_md);
+       EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md);
+       RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md));
+}
+
+static inline int md_setxattr(struct obd_export *exp,
+                             const struct lu_fid *fid, struct obd_capa *oc,
+                             obd_valid valid, const char *name,
+                             const char *input, int input_size,
+                             int output_size, int flags, __u32 suppgid,
+                             struct ptlrpc_request **request)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, setxattr);
+       EXP_MD_COUNTER_INCREMENT(exp, setxattr);
+       RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input,
+                                          input_size, output_size, flags,
+                                          suppgid, request));
+}
+
+static inline int md_getxattr(struct obd_export *exp,
+                             const struct lu_fid *fid, struct obd_capa *oc,
+                             obd_valid valid, const char *name,
+                             const char *input, int input_size,
+                             int output_size, int flags,
+                             struct ptlrpc_request **request)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, getxattr);
+       EXP_MD_COUNTER_INCREMENT(exp, getxattr);
+       RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input,
+                                          input_size, output_size, flags,
+                                          request));
+}
+
+static inline int md_set_open_replay_data(struct obd_export *exp,
+                                         struct obd_client_handle *och,
+                                         struct ptlrpc_request *open_req)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, set_open_replay_data);
+       EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
+       RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req));
+}
+
+static inline int md_clear_open_replay_data(struct obd_export *exp,
+                                           struct obd_client_handle *och)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, clear_open_replay_data);
+       EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data);
+       RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och));
+}
+
+static inline int md_set_lock_data(struct obd_export *exp,
+                                  __u64 *lockh, void *data, __u64 *bits)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, set_lock_data);
+       EXP_MD_COUNTER_INCREMENT(exp, set_lock_data);
+       RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits));
+}
+
+static inline int md_cancel_unused(struct obd_export *exp,
+                                  const struct lu_fid *fid,
+                                  ldlm_policy_data_t *policy,
+                                  ldlm_mode_t mode,
+                                  ldlm_cancel_flags_t flags,
+                                  void *opaque)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_MD_OP(exp, cancel_unused);
+       EXP_MD_COUNTER_INCREMENT(exp, cancel_unused);
+
+       rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode,
+                                             flags, opaque);
+       RETURN(rc);
+}
+
+static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags,
+                                       const struct lu_fid *fid,
+                                       ldlm_type_t type,
+                                       ldlm_policy_data_t *policy,
+                                       ldlm_mode_t mode,
+                                       struct lustre_handle *lockh)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, lock_match);
+       EXP_MD_COUNTER_INCREMENT(exp, lock_match);
+       RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type,
+                                            policy, mode, lockh));
+}
+
+static inline int md_init_ea_size(struct obd_export *exp, int easize,
+                                 int def_asize, int cookiesize)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, init_ea_size);
+       EXP_MD_COUNTER_INCREMENT(exp, init_ea_size);
+       RETURN(MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize,
+                                              cookiesize));
+}
+
+static inline int md_get_remote_perm(struct obd_export *exp,
+                                    const struct lu_fid *fid,
+                                    struct obd_capa *oc, __u32 suppgid,
+                                    struct ptlrpc_request **request)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, get_remote_perm);
+       EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm);
+       RETURN(MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid,
+                                                 request));
+}
+
+static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa,
+                               renew_capa_cb_t cb)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, renew_capa);
+       EXP_MD_COUNTER_INCREMENT(exp, renew_capa);
+       rc = MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb);
+       RETURN(rc);
+}
+
+static inline int md_unpack_capa(struct obd_export *exp,
+                                struct ptlrpc_request *req,
+                                const struct req_msg_field *field,
+                                struct obd_capa **oc)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, unpack_capa);
+       EXP_MD_COUNTER_INCREMENT(exp, unpack_capa);
+       rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc);
+       RETURN(rc);
+}
+
+static inline int md_intent_getattr_async(struct obd_export *exp,
+                                         struct md_enqueue_info *minfo,
+                                         struct ldlm_enqueue_info *einfo)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, intent_getattr_async);
+       EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async);
+       rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo);
+       RETURN(rc);
+}
+
+static inline int md_revalidate_lock(struct obd_export *exp,
+                                    struct lookup_intent *it,
+                                    struct lu_fid *fid, __u64 *bits)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, revalidate_lock);
+       EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock);
+       rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits);
+       RETURN(rc);
+}
+
+
+/* OBD Metadata Support */
+
+extern int obd_init_caches(void);
+extern void obd_cleanup_caches(void);
+
+/* support routines */
+extern struct kmem_cache *obdo_cachep;
+
+#define OBDO_ALLOC(ptr)                                                        \
+do {                                                                           \
+       OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, __GFP_IO);                  \
+} while (0)
+
+#define OBDO_FREE(ptr)                                                         \
+do {                                                                           \
+       OBD_SLAB_FREE_PTR((ptr), obdo_cachep);                                 \
+} while (0)
+
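+/* Usage sketch (illustrative): allocate a temporary obdo from obdo_cachep,
+ * check the pointer, and release it with the matching macro:
+ *
+ *     struct obdo *oa;
+ *
+ *     OBDO_ALLOC(oa);
+ *     if (oa == NULL)
+ *             return -ENOMEM;
+ *     ...
+ *     OBDO_FREE(oa);
+ */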
+
+static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid)
+{
+       /* something here */
+}
+
+static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa)
+{
+       /* something here */
+}
+
+typedef int (*register_lwp_cb)(void *data);
+
+struct lwp_register_item {
+       struct obd_export **lri_exp;
+       register_lwp_cb     lri_cb_func;
+       void               *lri_cb_data;
+       struct list_head            lri_list;
+       char                lri_name[MTI_NAME_MAXLEN];
+};
+
+/* I'm as embarrassed about this as you are.
+ *
+ * <shaver> // XXX do not look into _superhack with remaining eye
+ * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
+extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+
+/* obd_mount.c */
+
+/* sysctl.c */
+extern void obd_sysctl_init(void);
+extern void obd_sysctl_clean(void);
+
+/* uuid.c  */
+typedef __u8 class_uuid_t[16];
+void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
+
+/* lustre_peer.c    */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index);
+int class_add_uuid(const char *uuid, __u64 nid);
+int class_del_uuid(const char *uuid);
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid);
+void class_init_uuidlist(void);
+void class_exit_uuidlist(void);
+
+/* mea.c */
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen);
+int raw_name2idx(int hashtype, int count, const char *name, int namelen);
+
+/* prng.c */
+#define ll_generate_random_uuid(uuid_out) \
+       cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t))
+
+#endif /* __LINUX_OBD_CLASS_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_lov.h b/drivers/staging/lustre/lustre/include/obd_lov.h
new file mode 100644 (file)
index 0000000..d82f334
--- /dev/null
@@ -0,0 +1,126 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_LOV_H__
+#define _OBD_LOV_H__
+
+#define LOV_DEFAULT_STRIPE_SIZE (1 << LNET_MTU_BITS)
+
+static inline int lov_stripe_md_size(__u16 stripes)
+{
+       return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo*);
+}
+
+static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic)
+{
+       if (lmm_magic == LOV_MAGIC_V3)
+               return sizeof(struct lov_mds_md_v3) +
+                       stripes * sizeof(struct lov_ost_data_v1);
+       else
+               return sizeof(struct lov_mds_md_v1) +
+                       stripes * sizeof(struct lov_ost_data_v1);
+}
+
+struct lov_version_size {
+       __u32   lvs_magic;
+       size_t  lvs_lmm_size;
+       size_t  lvs_lod_size;
+};
+
+static inline __u32 lov_mds_md_stripecnt(int ea_size, __u32 lmm_magic)
+{
+       static const struct lov_version_size lmm_ver_size[] = {
+                       { .lvs_magic = LOV_MAGIC_V3,
+                         .lvs_lmm_size = sizeof(struct lov_mds_md_v3),
+                         .lvs_lod_size = sizeof(struct lov_ost_data_v1) },
+                       { .lvs_magic = LOV_MAGIC_V1,
+                         .lvs_lmm_size = sizeof(struct lov_mds_md_v1),
+                         .lvs_lod_size = sizeof(struct lov_ost_data_v1)} };
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(lmm_ver_size); i++) {
+               if (lmm_magic == lmm_ver_size[i].lvs_magic) {
+                       if (ea_size <= lmm_ver_size[i].lvs_lmm_size)
+                               return 0;
+                       return (ea_size - lmm_ver_size[i].lvs_lmm_size) /
+                               lmm_ver_size[i].lvs_lod_size;
+               }
+       }
+
+       /* Invalid LOV magic, so no stripes could fit */
+       return 0;
+}
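+
+/* Worked example: with ea_size == lov_mds_md_size(4, LOV_MAGIC_V1), the V1
+ * header is subtracted and the remaining bytes hold exactly
+ * 4 * sizeof(struct lov_ost_data_v1), so lov_mds_md_stripecnt() returns 4.
+ * An ea_size no larger than the bare header yields 0. */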
+
+/* lov_do_div64(a, b) returns a % b, and a = a / b.
+ * The 32-bit code is LOV-specific due to knowing about stripe limits in
+ * order to reduce the divisor to a 32-bit number.  If the divisor is
+ * already a 32-bit value the compiler handles this directly. */
+#if BITS_PER_LONG > 32
+# define lov_do_div64(n,base) ({                                       \
+       uint64_t __base = (base);                                       \
+       uint64_t __rem;                                                 \
+       __rem = ((uint64_t)(n)) % __base;                               \
+       (n) = ((uint64_t)(n)) / __base;                                 \
+       __rem;                                                          \
+  })
+#else
+# define lov_do_div64(n,base) ({                                       \
+       uint64_t __rem;                                                 \
+       if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) {  \
+               int __remainder;                                              \
+               LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \
+                        "division %llu / %llu\n", (n), (uint64_t)(base));    \
+               __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1);          \
+               (n) >>= LOV_MIN_STRIPE_BITS;                            \
+               __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS);       \
+               __rem <<= LOV_MIN_STRIPE_BITS;                          \
+               __rem += __remainder;                                   \
+       } else {                                                        \
+               __rem = do_div(n, base);                                \
+       }                                                               \
+       __rem;                                                          \
+  })
+#endif
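+
+/* Numeric example (illustrative): lov_do_div64() updates its first argument
+ * in place and returns the remainder:
+ *
+ *     uint64_t n = 10 * 1024 * 1024 + 7;
+ *     uint64_t rem = lov_do_div64(n, 1 << 20);
+ *
+ * afterwards rem == 7 and n == 10.
+ */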
+
+#define IOC_LOV_TYPE              'g'
+#define IOC_LOV_MIN_NR          50
+#define IOC_LOV_SET_OSC_ACTIVE  _IOWR('g', 50, long)
+#define IOC_LOV_MAX_NR          50
+
+#define QOS_DEFAULT_THRESHOLD     10 /* MB */
+#define QOS_DEFAULT_MAXAGE           5  /* Seconds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_ost.h b/drivers/staging/lustre/lustre/include/obd_ost.h
new file mode 100644 (file)
index 0000000..af89843
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/obd_ost.h
+ *
+ * Data structures for object storage targets and client: OST & OSC's
+ *
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_OST_H
+#define _LUSTRE_OST_H
+
+#include <obd_class.h>
+
+struct osc_brw_async_args {
+       struct obdo       *aa_oa;
+       int             aa_requested_nob;
+       int             aa_nio_count;
+       obd_count         aa_page_count;
+       int             aa_resends;
+       struct brw_page  **aa_ppga;
+       struct client_obd *aa_cli;
+       struct list_head         aa_oaps;
+       struct list_head         aa_exts;
+       struct obd_capa   *aa_ocapa;
+       struct cl_req     *aa_clerq;
+};
+
+#define osc_grant_args osc_brw_async_args
+struct osc_async_args {
+       struct obd_info   *aa_oi;
+};
+
+struct osc_setattr_args {
+       struct obdo      *sa_oa;
+       obd_enqueue_update_f sa_upcall;
+       void            *sa_cookie;
+};
+
+struct osc_fsync_args {
+       struct obd_info     *fa_oi;
+       obd_enqueue_update_f fa_upcall;
+       void            *fa_cookie;
+};
+
+struct osc_enqueue_args {
+       struct obd_export       *oa_exp;
+       __u64               *oa_flags;
+       obd_enqueue_update_f      oa_upcall;
+       void                 *oa_cookie;
+       struct ost_lvb     *oa_lvb;
+       struct lustre_handle     *oa_lockh;
+       struct ldlm_enqueue_info *oa_ei;
+       unsigned int          oa_agl:1;
+};
+
+#if 0
+int osc_extent_blocking_cb(struct ldlm_lock *lock,
+                          struct ldlm_lock_desc *new, void *data,
+                          int flag);
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h
new file mode 100644 (file)
index 0000000..5f2b4e8
--- /dev/null
@@ -0,0 +1,853 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_SUPPORT
+#define _OBD_SUPPORT
+
+#include <linux/libcfs/libcfs.h>
+#include <lvfs.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_support.h>
+
+/* global variables */
+extern struct lprocfs_stats *obd_memory;
+enum {
+       OBD_MEMORY_STAT = 0,
+       OBD_MEMORY_PAGES_STAT = 1,
+       OBD_STATS_NUM,
+};
+
+extern unsigned int obd_debug_peer_on_timeout;
+extern unsigned int obd_dump_on_timeout;
+extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for
+   networking / disk / timings affected by load (use Adaptive Timeouts) */
+extern unsigned int obd_timeout;         /* seconds */
+extern unsigned int ldlm_timeout;       /* seconds */
+extern unsigned int obd_timeout_set;
+extern unsigned int ldlm_timeout_set;
+extern unsigned int at_min;
+extern unsigned int at_max;
+extern unsigned int at_history;
+extern int at_early_margin;
+extern int at_extra;
+extern unsigned int obd_sync_filter;
+extern unsigned int obd_max_dirty_pages;
+extern atomic_t obd_unstable_pages;
+extern atomic_t obd_dirty_pages;
+extern atomic_t obd_dirty_transit_pages;
+extern unsigned int obd_alloc_fail_rate;
+extern char obd_jobid_var[];
+
+/* lvfs.c */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+                  size_t size, const char *file, int line);
+
+/* Some hash init argument constants */
+#define HASH_POOLS_BKT_BITS 3
+#define HASH_POOLS_CUR_BITS 3
+#define HASH_POOLS_MAX_BITS 7
+#define HASH_UUID_BKT_BITS 5
+#define HASH_UUID_CUR_BITS 7
+#define HASH_UUID_MAX_BITS 12
+#define HASH_NID_BKT_BITS 5
+#define HASH_NID_CUR_BITS 7
+#define HASH_NID_MAX_BITS 12
+#define HASH_NID_STATS_BKT_BITS 5
+#define HASH_NID_STATS_CUR_BITS 7
+#define HASH_NID_STATS_MAX_BITS 12
+#define HASH_LQE_BKT_BITS 5
+#define HASH_LQE_CUR_BITS 7
+#define HASH_LQE_MAX_BITS 12
+#define HASH_CONN_BKT_BITS 5
+#define HASH_CONN_CUR_BITS 5
+#define HASH_CONN_MAX_BITS 15
+#define HASH_EXP_LOCK_BKT_BITS  5
+#define HASH_EXP_LOCK_CUR_BITS  7
+#define HASH_EXP_LOCK_MAX_BITS  16
+#define HASH_CL_ENV_BKT_BITS    5
+#define HASH_CL_ENV_BITS       10
+#define HASH_JOB_STATS_BKT_BITS 5
+#define HASH_JOB_STATS_CUR_BITS 7
+#define HASH_JOB_STATS_MAX_BITS 12
+
+/* Timeout definitions */
+#define OBD_TIMEOUT_DEFAULT         100
+#define LDLM_TIMEOUT_DEFAULT       20
+#define MDS_LDLM_TIMEOUT_DEFAULT       6
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD   (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_TIME_SOFT   (obd_timeout * 3)
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* a bit more than maximal journal commit time in seconds */
+#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U)
+/* Client may skip 1 ping; we must wait at least 2.5. But for multiple
+ * failover targets the client only pings one server at a time, and pings
+ * can be lost on a loaded network. Since eviction has serious consequences,
+ * and there's no urgent need to evict a client just because it's idle, we
+ * should be very conservative here. */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+#define DISK_TIMEOUT 50          /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
+ /* Max connect interval for nonresponsive servers; ~50s to avoid building up
+    connect requests in the LND queues, but within obd_timeout so we don't
+    miss the recovery window */
+#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN, obd_timeout))
+#define CONNECTION_SWITCH_INC 5  /* Connection timeout backoff */
+/* In general this should be low to have quick detection of a system
+   running on a backup server. (If it's too low, import_select_connection
+   will increase the timeout anyhow.)  */
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout / 20)
+/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */
+#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \
+                            INITIAL_CONNECT_TIMEOUT)
+/* The min time a target should wait for clients to reconnect in recovery */
+#define OBD_RECOVERY_TIME_MIN    (2*RECONNECT_DELAY_MAX)
+#define OBD_IR_FACTOR_MIN       1
+#define OBD_IR_FACTOR_MAX       10
+#define OBD_IR_FACTOR_DEFAULT    (OBD_IR_FACTOR_MAX/2)
+/* default timeout for the MGS to become IR_FULL */
+#define OBD_IR_MGS_TIMEOUT       (4*obd_timeout)
+#define LONG_UNLINK 300          /* Unlink should happen before now */
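+
+/* Worked example, assuming obd_timeout == OBD_TIMEOUT_DEFAULT (100s):
+ * PING_INTERVAL is 25s and PING_EVICT_TIMEOUT is 150s; CONNECTION_SWITCH_MAX
+ * is 50s and INITIAL_CONNECT_TIMEOUT is 5s, so RECONNECT_DELAY_MAX is 60s and
+ * OBD_RECOVERY_TIME_MIN is 120s, well within OBD_RECOVERY_TIME_SOFT (300s)
+ * and OBD_RECOVERY_TIME_HARD (900s). */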
+
+/**
+ * Grant shrink interval: if the client is "idle" for more than this interval,
+ * the ll_grant thread will return the requested grant space to the filter.
+ */
+#define GRANT_SHRINK_INTERVAL      1200 /* 20 minutes */
+
+#define OBD_FAIL_MDS                0x100
+#define OBD_FAIL_MDS_HANDLE_UNPACK       0x101
+#define OBD_FAIL_MDS_GETATTR_NET        0x102
+#define OBD_FAIL_MDS_GETATTR_PACK      0x103
+#define OBD_FAIL_MDS_READPAGE_NET      0x104
+#define OBD_FAIL_MDS_READPAGE_PACK       0x105
+#define OBD_FAIL_MDS_SENDPAGE      0x106
+#define OBD_FAIL_MDS_REINT_NET    0x107
+#define OBD_FAIL_MDS_REINT_UNPACK      0x108
+#define OBD_FAIL_MDS_REINT_SETATTR       0x109
+#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a
+#define OBD_FAIL_MDS_REINT_CREATE      0x10b
+#define OBD_FAIL_MDS_REINT_CREATE_WRITE  0x10c
+#define OBD_FAIL_MDS_REINT_UNLINK      0x10d
+#define OBD_FAIL_MDS_REINT_UNLINK_WRITE  0x10e
+#define OBD_FAIL_MDS_REINT_LINK          0x10f
+#define OBD_FAIL_MDS_REINT_LINK_WRITE    0x110
+#define OBD_FAIL_MDS_REINT_RENAME      0x111
+#define OBD_FAIL_MDS_REINT_RENAME_WRITE  0x112
+#define OBD_FAIL_MDS_OPEN_NET      0x113
+#define OBD_FAIL_MDS_OPEN_PACK    0x114
+#define OBD_FAIL_MDS_CLOSE_NET    0x115
+#define OBD_FAIL_MDS_CLOSE_PACK          0x116
+#define OBD_FAIL_MDS_CONNECT_NET        0x117
+#define OBD_FAIL_MDS_CONNECT_PACK      0x118
+#define OBD_FAIL_MDS_REINT_NET_REP       0x119
+#define OBD_FAIL_MDS_DISCONNECT_NET      0x11a
+#define OBD_FAIL_MDS_GETSTATUS_NET       0x11b
+#define OBD_FAIL_MDS_GETSTATUS_PACK      0x11c
+#define OBD_FAIL_MDS_STATFS_PACK        0x11d
+#define OBD_FAIL_MDS_STATFS_NET          0x11e
+#define OBD_FAIL_MDS_GETATTR_NAME_NET    0x11f
+#define OBD_FAIL_MDS_PIN_NET        0x120
+#define OBD_FAIL_MDS_UNPIN_NET    0x121
+#define OBD_FAIL_MDS_ALL_REPLY_NET       0x122
+#define OBD_FAIL_MDS_ALL_REQUEST_NET     0x123
+#define OBD_FAIL_MDS_SYNC_NET      0x124
+#define OBD_FAIL_MDS_SYNC_PACK    0x125
+#define OBD_FAIL_MDS_DONE_WRITING_NET    0x126
+#define OBD_FAIL_MDS_DONE_WRITING_PACK   0x127
+#define OBD_FAIL_MDS_ALLOC_OBDO          0x128
+#define OBD_FAIL_MDS_PAUSE_OPEN          0x129
+#define OBD_FAIL_MDS_STATFS_LCW_SLEEP    0x12a
+#define OBD_FAIL_MDS_OPEN_CREATE        0x12b
+#define OBD_FAIL_MDS_OST_SETATTR        0x12c
+#define OBD_FAIL_MDS_QUOTACHECK_NET      0x12d
+#define OBD_FAIL_MDS_QUOTACTL_NET      0x12e
+#define OBD_FAIL_MDS_CLIENT_ADD          0x12f
+#define OBD_FAIL_MDS_GETXATTR_NET      0x130
+#define OBD_FAIL_MDS_GETXATTR_PACK       0x131
+#define OBD_FAIL_MDS_SETXATTR_NET      0x132
+#define OBD_FAIL_MDS_SETXATTR      0x133
+#define OBD_FAIL_MDS_SETXATTR_WRITE      0x134
+#define OBD_FAIL_MDS_FS_SETUP      0x135
+#define OBD_FAIL_MDS_RESEND          0x136
+#define OBD_FAIL_MDS_LLOG_CREATE_FAILED  0x137
+#define OBD_FAIL_MDS_LOV_SYNC_RACE       0x138
+#define OBD_FAIL_MDS_OSC_PRECREATE       0x139
+#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT   0x13a
+#define OBD_FAIL_MDS_CLOSE_NET_REP       0x13b
+#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ     0x13c
+#define OBD_FAIL_MDS_DROP_QUOTA_REQ      0x13d
+#define OBD_FAIL_MDS_REMOVE_COMMON_EA    0x13e
+#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING   0x13f
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD    0x140
+#define OBD_FAIL_MDS_LOV_PREP_CREATE     0x141
+#define OBD_FAIL_MDS_REINT_DELAY        0x142
+#define OBD_FAIL_MDS_READLINK_EPROTO     0x143
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE    0x144
+#define OBD_FAIL_MDS_PDO_LOCK      0x145
+#define OBD_FAIL_MDS_PDO_LOCK2    0x146
+#define OBD_FAIL_MDS_OSC_CREATE_FAIL     0x147
+#define OBD_FAIL_MDS_NEGATIVE_POSITIVE  0x148
+#define OBD_FAIL_MDS_HSM_STATE_GET_NET         0x149
+#define OBD_FAIL_MDS_HSM_STATE_SET_NET         0x14a
+#define OBD_FAIL_MDS_HSM_PROGRESS_NET          0x14b
+#define OBD_FAIL_MDS_HSM_REQUEST_NET           0x14c
+#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET       0x14d
+#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET     0x14e
+#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET          0x14f
+#define OBD_FAIL_MDS_HSM_ACTION_NET            0x150
+#define OBD_FAIL_MDS_CHANGELOG_INIT            0x151
+
+/* layout lock */
+#define OBD_FAIL_MDS_NO_LL_GETATTR      0x170
+#define OBD_FAIL_MDS_NO_LL_OPEN                 0x171
+#define OBD_FAIL_MDS_LL_BLOCK           0x172
+
+/* CMD */
+#define OBD_FAIL_MDS_IS_SUBDIR_NET       0x180
+#define OBD_FAIL_MDS_IS_SUBDIR_PACK      0x181
+#define OBD_FAIL_MDS_SET_INFO_NET      0x182
+#define OBD_FAIL_MDS_WRITEPAGE_NET       0x183
+#define OBD_FAIL_MDS_WRITEPAGE_PACK      0x184
+#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185
+#define OBD_FAIL_MDS_GET_INFO_NET      0x186
+#define OBD_FAIL_MDS_DQACQ_NET    0x187
+
+/* OI scrub */
+#define OBD_FAIL_OSD_SCRUB_DELAY                       0x190
+#define OBD_FAIL_OSD_SCRUB_CRASH                       0x191
+#define OBD_FAIL_OSD_SCRUB_FATAL                       0x192
+#define OBD_FAIL_OSD_FID_MAPPING                       0x193
+#define OBD_FAIL_OSD_LMA_INCOMPAT                      0x194
+
+#define OBD_FAIL_OST                0x200
+#define OBD_FAIL_OST_CONNECT_NET        0x201
+#define OBD_FAIL_OST_DISCONNECT_NET      0x202
+#define OBD_FAIL_OST_GET_INFO_NET      0x203
+#define OBD_FAIL_OST_CREATE_NET          0x204
+#define OBD_FAIL_OST_DESTROY_NET        0x205
+#define OBD_FAIL_OST_GETATTR_NET        0x206
+#define OBD_FAIL_OST_SETATTR_NET        0x207
+#define OBD_FAIL_OST_OPEN_NET      0x208
+#define OBD_FAIL_OST_CLOSE_NET    0x209
+#define OBD_FAIL_OST_BRW_NET        0x20a
+#define OBD_FAIL_OST_PUNCH_NET    0x20b
+#define OBD_FAIL_OST_STATFS_NET          0x20c
+#define OBD_FAIL_OST_HANDLE_UNPACK       0x20d
+#define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
+#define OBD_FAIL_OST_BRW_READ_BULK       0x20f
+#define OBD_FAIL_OST_SYNC_NET      0x210
+#define OBD_FAIL_OST_ALL_REPLY_NET       0x211
+#define OBD_FAIL_OST_ALL_REQUEST_NET     0x212
+#define OBD_FAIL_OST_LDLM_REPLY_NET      0x213
+#define OBD_FAIL_OST_BRW_PAUSE_BULK      0x214
+#define OBD_FAIL_OST_ENOSPC          0x215
+#define OBD_FAIL_OST_EROFS            0x216
+#define OBD_FAIL_OST_ENOENT          0x217
+#define OBD_FAIL_OST_QUOTACHECK_NET      0x218
+#define OBD_FAIL_OST_QUOTACTL_NET      0x219
+#define OBD_FAIL_OST_CHECKSUM_RECEIVE    0x21a
+#define OBD_FAIL_OST_CHECKSUM_SEND       0x21b
+#define OBD_FAIL_OST_BRW_SIZE      0x21c
+#define OBD_FAIL_OST_DROP_REQ      0x21d
+#define OBD_FAIL_OST_SETATTR_CREDITS     0x21e
+#define OBD_FAIL_OST_HOLD_WRITE_RPC      0x21f
+#define OBD_FAIL_OST_BRW_WRITE_BULK2     0x220
+#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
+#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE      0x223
+#define OBD_FAIL_OST_BRW_PAUSE_PACK      0x224
+#define OBD_FAIL_OST_CONNECT_NET2      0x225
+#define OBD_FAIL_OST_NOMEM            0x226
+#define OBD_FAIL_OST_BRW_PAUSE_BULK2     0x227
+#define OBD_FAIL_OST_MAPBLK_ENOSPC       0x228
+#define OBD_FAIL_OST_ENOINO          0x229
+#define OBD_FAIL_OST_DQACQ_NET    0x230
+#define OBD_FAIL_OST_STATFS_EINPROGRESS  0x231
+
+#define OBD_FAIL_LDLM              0x300
+#define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
+#define OBD_FAIL_LDLM_ENQUEUE_NET                      0x302
+#define OBD_FAIL_LDLM_CONVERT_NET                      0x303
+#define OBD_FAIL_LDLM_CANCEL_NET                       0x304
+#define OBD_FAIL_LDLM_BL_CALLBACK_NET                  0x305
+#define OBD_FAIL_LDLM_CP_CALLBACK_NET                  0x306
+#define OBD_FAIL_LDLM_GL_CALLBACK_NET                  0x307
+#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
+#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309
+#define OBD_FAIL_LDLM_CREATE_RESOURCE    0x30a
+#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED    0x30b
+#define OBD_FAIL_LDLM_REPLY          0x30c
+#define OBD_FAIL_LDLM_RECOV_CLIENTS      0x30d
+#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
+#define OBD_FAIL_LDLM_GLIMPSE      0x30f
+#define OBD_FAIL_LDLM_CANCEL_RACE      0x310
+#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE  0x311
+#define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
+#define OBD_FAIL_LDLM_CLOSE_THREAD       0x313
+#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE  0x314
+#define OBD_FAIL_LDLM_CP_CB_WAIT        0x315
+#define OBD_FAIL_LDLM_OST_FAIL_RACE      0x316
+#define OBD_FAIL_LDLM_INTR_CP_AST      0x317
+#define OBD_FAIL_LDLM_CP_BL_RACE        0x318
+#define OBD_FAIL_LDLM_NEW_LOCK    0x319
+#define OBD_FAIL_LDLM_AGL_DELAY          0x31a
+#define OBD_FAIL_LDLM_AGL_NOLOCK        0x31b
+#define OBD_FAIL_LDLM_OST_LVB           0x31c
+
+/* LOCKLESS IO */
+#define OBD_FAIL_LDLM_SET_CONTENTION     0x385
+
+#define OBD_FAIL_OSC                0x400
+#define OBD_FAIL_OSC_BRW_READ_BULK       0x401
+#define OBD_FAIL_OSC_BRW_WRITE_BULK      0x402
+#define OBD_FAIL_OSC_LOCK_BL_AST        0x403
+#define OBD_FAIL_OSC_LOCK_CP_AST        0x404
+#define OBD_FAIL_OSC_MATCH            0x405
+#define OBD_FAIL_OSC_BRW_PREP_REQ      0x406
+#define OBD_FAIL_OSC_SHUTDOWN      0x407
+#define OBD_FAIL_OSC_CHECKSUM_RECEIVE    0x408
+#define OBD_FAIL_OSC_CHECKSUM_SEND       0x409
+#define OBD_FAIL_OSC_BRW_PREP_REQ2       0x40a
+#define OBD_FAIL_OSC_CONNECT_CKSUM       0x40b
+#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY    0x40c
+#define OBD_FAIL_OSC_DIO_PAUSE    0x40d
+#define OBD_FAIL_OSC_OBJECT_CONTENTION   0x40e
+#define OBD_FAIL_OSC_CP_CANCEL_RACE      0x40f
+#define OBD_FAIL_OSC_CP_ENQ_RACE        0x410
+#define OBD_FAIL_OSC_NO_GRANT      0x411
+#define OBD_FAIL_OSC_DELAY_SETTIME      0x412
+
+#define OBD_FAIL_PTLRPC                  0x500
+#define OBD_FAIL_PTLRPC_ACK          0x501
+#define OBD_FAIL_PTLRPC_RQBD        0x502
+#define OBD_FAIL_PTLRPC_BULK_GET_NET     0x503
+#define OBD_FAIL_PTLRPC_BULK_PUT_NET     0x504
+#define OBD_FAIL_PTLRPC_DROP_RPC        0x505
+#define OBD_FAIL_PTLRPC_DELAY_SEND       0x506
+#define OBD_FAIL_PTLRPC_DELAY_RECOV      0x507
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ      0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP      0x50c
+#define OBD_FAIL_PTLRPC_IMP_DEACTIVE     0x50d
+#define OBD_FAIL_PTLRPC_DUMP_LOG        0x50e
+#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510
+#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT    0x511
+#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
+#define OBD_FAIL_PTLRPC_DROP_REQ_OPC     0x513
+#define OBD_FAIL_PTLRPC_FINISH_REPLAY    0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
+#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL   0x516
+#define OBD_FAIL_PTLRPC_CANCEL_RESEND    0x517
+
+#define OBD_FAIL_OBD_PING_NET      0x600
+#define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
+#define OBD_FAIL_OBD_LOGD_NET      0x602
+#define OBD_FAIL_OBD_QC_CALLBACK_NET     0x603
+#define OBD_FAIL_OBD_DQACQ            0x604
+#define OBD_FAIL_OBD_LLOG_SETUP          0x605
+#define OBD_FAIL_OBD_LOG_CANCEL_REP      0x606
+#define OBD_FAIL_OBD_IDX_READ_NET      0x607
+#define OBD_FAIL_OBD_IDX_READ_BREAK     0x608
+#define OBD_FAIL_OBD_NO_LRU             0x609
+
+#define OBD_FAIL_TGT_REPLY_NET    0x700
+#define OBD_FAIL_TGT_CONN_RACE    0x701
+#define OBD_FAIL_TGT_FORCE_RECONNECT     0x702
+#define OBD_FAIL_TGT_DELAY_CONNECT       0x703
+#define OBD_FAIL_TGT_DELAY_RECONNECT     0x704
+#define OBD_FAIL_TGT_DELAY_PRECREATE     0x705
+#define OBD_FAIL_TGT_TOOMANY_THREADS     0x706
+#define OBD_FAIL_TGT_REPLAY_DROP        0x707
+#define OBD_FAIL_TGT_FAKE_EXP      0x708
+#define OBD_FAIL_TGT_REPLAY_DELAY      0x709
+#define OBD_FAIL_TGT_LAST_REPLAY        0x710
+#define OBD_FAIL_TGT_CLIENT_ADD          0x711
+#define OBD_FAIL_TGT_RCVG_FLAG    0x712
+
+#define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
+#define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
+#define OBD_FAIL_MDC_OLD_EXT_FLAGS       0x802
+#define OBD_FAIL_MDC_GETATTR_ENQUEUE     0x803
+#define OBD_FAIL_MDC_RPCS_SEM           0x804
+#define OBD_FAIL_MDC_LIGHTWEIGHT        0x805
+
+#define OBD_FAIL_MGS                0x900
+#define OBD_FAIL_MGS_ALL_REQUEST_NET     0x901
+#define OBD_FAIL_MGS_ALL_REPLY_NET       0x902
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG   0x903
+#define OBD_FAIL_MGS_PAUSE_REQ    0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG    0x905
+
+#define OBD_FAIL_QUOTA_DQACQ_NET                       0xA01
+#define OBD_FAIL_QUOTA_EDQUOT      0xA02
+#define OBD_FAIL_QUOTA_DELAY_REINT       0xA03
+#define OBD_FAIL_QUOTA_RECOVERABLE_ERR   0xA04
+
+#define OBD_FAIL_LPROC_REMOVE      0xB00
+
+#define OBD_FAIL_GENERAL_ALLOC    0xC00
+
+#define OBD_FAIL_SEQ                0x1000
+#define OBD_FAIL_SEQ_QUERY_NET    0x1001
+#define OBD_FAIL_SEQ_EXHAUST            0x1002
+
+#define OBD_FAIL_FLD                0x1100
+#define OBD_FAIL_FLD_QUERY_NET    0x1101
+
+#define OBD_FAIL_SEC_CTX                0x1200
+#define OBD_FAIL_SEC_CTX_INIT_NET      0x1201
+#define OBD_FAIL_SEC_CTX_INIT_CONT_NET   0x1202
+#define OBD_FAIL_SEC_CTX_FINI_NET      0x1203
+#define OBD_FAIL_SEC_CTX_HDL_PAUSE       0x1204
+
+#define OBD_FAIL_LLOG                         0x1300
+#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET           0x1301
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET      0x1302
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET     0x1303
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET  0x1305
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET  0x1306
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET   0x1307
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET       0x1308
+#define OBD_FAIL_LLOG_CATINFO_NET                 0x1309
+#define OBD_FAIL_MDS_SYNC_CAPA_SL                 0x1310
+#define OBD_FAIL_SEQ_ALLOC                       0x1311
+
+#define OBD_FAIL_LLITE                       0x1400
+#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE             0x1401
+#define OBD_FAIL_LOCK_STATE_WAIT_INTR         0x1402
+#define OBD_FAIL_LOV_INIT                          0x1403
+#define OBD_FAIL_GLIMPSE_DELAY                     0x1404
+
+#define OBD_FAIL_FID_INDIR     0x1501
+#define OBD_FAIL_FID_INLMA     0x1502
+#define OBD_FAIL_FID_IGIF      0x1504
+#define OBD_FAIL_FID_LOOKUP    0x1505
+#define OBD_FAIL_FID_NOLMA     0x1506
+
+/* LFSCK */
+#define OBD_FAIL_LFSCK_DELAY1          0x1600
+#define OBD_FAIL_LFSCK_DELAY2          0x1601
+#define OBD_FAIL_LFSCK_DELAY3          0x1602
+#define OBD_FAIL_LFSCK_LINKEA_CRASH    0x1603
+#define OBD_FAIL_LFSCK_LINKEA_MORE     0x1604
+#define OBD_FAIL_LFSCK_FATAL1          0x1608
+#define OBD_FAIL_LFSCK_FATAL2          0x1609
+#define OBD_FAIL_LFSCK_CRASH           0x160a
+#define OBD_FAIL_LFSCK_NO_AUTO         0x160b
+#define OBD_FAIL_LFSCK_NO_DOUBLESCAN   0x160c
+
+/* UPDATE */
+#define OBD_FAIL_UPDATE_OBJ_NET                        0x1700
+#define OBD_FAIL_UPDATE_OBJ_NET_REP            0x1701
+
+
+/* Assign references to moved code to reduce code changes */
+#define OBD_FAIL_PRECHECK(id)             CFS_FAIL_PRECHECK(id)
+#define OBD_FAIL_CHECK(id)                   CFS_FAIL_CHECK(id)
+#define OBD_FAIL_CHECK_VALUE(id, value)         CFS_FAIL_CHECK_VALUE(id, value)
+#define OBD_FAIL_CHECK_ORSET(id, value)         CFS_FAIL_CHECK_ORSET(id, value)
+#define OBD_FAIL_CHECK_RESET(id, value)         CFS_FAIL_CHECK_RESET(id, value)
+#define OBD_FAIL_RETURN(id, ret)               CFS_FAIL_RETURN(id, ret)
+#define OBD_FAIL_TIMEOUT(id, secs)           CFS_FAIL_TIMEOUT(id, secs)
+#define OBD_FAIL_TIMEOUT_MS(id, ms)         CFS_FAIL_TIMEOUT_MS(id, ms)
+#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs)
+#define OBD_RACE(id)                       CFS_RACE(id)
+#define OBD_FAIL_ONCE                     CFS_FAIL_ONCE
+#define OBD_FAILED                           CFS_FAILED
+
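For orientation, a minimal usage sketch (illustrative only, not lines from the patch; the function name is made up): a fail point is compiled into a code path with OBD_FAIL_CHECK() or OBD_FAIL_TIMEOUT(), and armed at run time by writing the corresponding value into the libcfs fail_loc tunable (typically lctl set_param fail_loc=0x405).

        /* sketch only -- not part of the patch */
        static int example_osc_path(void)
        {
                /* bail out early when fail point 0x405 is armed */
                if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
                        return -EIO;

                /* stall for two seconds when a delay-style point is armed */
                OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
                return 0;
        }
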
+extern atomic_t libcfs_kmemory;
+
+#ifdef LPROCFS
+#define obd_memory_add(size)                                             \
+       lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sub(size)                                             \
+       lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sum()                                                     \
+       lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT,              \
+                               LPROCFS_FIELDS_FLAGS_SUM)
+#define obd_pages_add(order)                                             \
+       lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT,          \
+                           (long)(1 << (order)))
+#define obd_pages_sub(order)                                             \
+       lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT,          \
+                           (long)(1 << (order)))
+#define obd_pages_sum()                                                       \
+       lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT,          \
+                               LPROCFS_FIELDS_FLAGS_SUM)
+
+extern void obd_update_maxusage(void);
+extern __u64 obd_memory_max(void);
+extern __u64 obd_pages_max(void);
+
+#else
+
+extern __u64 obd_alloc;
+extern __u64 obd_pages;
+
+extern __u64 obd_max_alloc;
+extern __u64 obd_max_pages;
+
+static inline void obd_memory_add(long size)
+{
+       obd_alloc += size;
+       if (obd_alloc > obd_max_alloc)
+               obd_max_alloc = obd_alloc;
+}
+
+static inline void obd_memory_sub(long size)
+{
+       obd_alloc -= size;
+}
+
+static inline void obd_pages_add(int order)
+{
+       obd_pages += 1 << order;
+       if (obd_pages > obd_max_pages)
+               obd_max_pages = obd_pages;
+}
+
+static inline void obd_pages_sub(int order)
+{
+       obd_pages -= 1 << order;
+}
+
+#define obd_memory_sum() (obd_alloc)
+#define obd_pages_sum()  (obd_pages)
+
+#define obd_memory_max() (obd_max_alloc)
+#define obd_pages_max() (obd_max_pages)
+
+#endif
+
+#define OBD_DEBUG_MEMUSAGE (1)
+
+#if OBD_DEBUG_MEMUSAGE
+#define OBD_ALLOC_POST(ptr, size, name)                                 \
+               obd_memory_add(size);                              \
+               CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n",       \
+                      (int)(size), ptr)
+
+#define OBD_FREE_PRE(ptr, size, name)                             \
+       LASSERT(ptr);                                              \
+       obd_memory_sub(size);                                      \
+       CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n",              \
+              (int)(size), ptr);                                      \
+       POISON(ptr, 0x5a, size)
+
+#else /* !OBD_DEBUG_MEMUSAGE */
+
+#define OBD_ALLOC_POST(ptr, size, name) ((void)0)
+#define OBD_FREE_PRE(ptr, size, name)   ((void)0)
+
+#endif /* !OBD_DEBUG_MEMUSAGE */
+
+#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC)
+
+#define OBD_ALLOC_FAIL_BITS 24
+#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1)
+#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100)
+
+#if defined(LUSTRE_UTILS) /* this version is for utils only */
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags)                   \
+do {                                                                         \
+       (ptr) = (cptab) == NULL ?                                             \
+               kmalloc(size, flags) :                                \
+               cfs_cpt_malloc(cptab, cpt, size, flags);                      \
+       if (unlikely((ptr) == NULL)) {                                  \
+               CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n",  \
+                      (int)(size), __FILE__, __LINE__);                      \
+       } else {                                                              \
+               memset(ptr, 0, size);                                         \
+               CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n",          \
+                      (int)(size), ptr);                                     \
+       }                                                                     \
+} while (0)
+
+#else /* this version is for the kernel and liblustre */
+#define OBD_FREE_RTN0(ptr)                                                 \
+({                                                                         \
+       kfree(ptr);                                                     \
+       (ptr) = NULL;                                                    \
+       0;                                                                  \
+})
+
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags)                   \
+do {                                                                         \
+       (ptr) = (cptab) == NULL ?                                             \
+               kmalloc(size, flags) :                                \
+               cfs_cpt_malloc(cptab, cpt, size, flags);                      \
+       if (likely((ptr) != NULL &&                                        \
+                  (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 ||       \
+                   !obd_alloc_fail(ptr, #ptr, "km", size,                  \
+                                   __FILE__, __LINE__) ||                  \
+                   OBD_FREE_RTN0(ptr)))){                                  \
+               memset(ptr, 0, size);                                    \
+               OBD_ALLOC_POST(ptr, size, "kmalloced");                \
+       }                                                                    \
+} while (0)
+#endif
+
+#define OBD_ALLOC_GFP(ptr, size, gfp_mask)                                   \
+       __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask)
+
+#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, __GFP_IO)
+#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_IOFS)
+#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr))
+#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr))
+
+#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask)                   \
+       __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask)
+
+#define OBD_CPT_ALLOC(ptr, cptab, cpt, size)                                 \
+       OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt)                                   \
+       OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr))
+
+# define __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size)                        \
+do {                                                                         \
+       (ptr) = cptab == NULL ?                                               \
+               vmalloc(size) :                                       \
+               cfs_cpt_vmalloc(cptab, cpt, size);                            \
+       if (unlikely((ptr) == NULL)) {                                  \
+               CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n",        \
+                      (int)(size));                                      \
+               CERROR(LPU64" total bytes allocated by Lustre, %d by LNET\n", \
+                      obd_memory_sum(), atomic_read(&libcfs_kmemory));   \
+       } else {                                                              \
+               memset(ptr, 0, size);                                    \
+               OBD_ALLOC_POST(ptr, size, "vmalloced");                \
+       }                                                                    \
+} while(0)
+
+# define OBD_VMALLOC(ptr, size)                                                      \
+        __OBD_VMALLOC_VEROBSE(ptr, NULL, 0, size)
+# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size)                                      \
+        __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size)
+
+
+/* Allocations above this size are considered too big and cannot be done
+ * atomically.
+ *
+ * Be very careful when changing this value, especially when decreasing it,
+ * since vmalloc in Linux doesn't perform well on multi-core systems and
+ * calling vmalloc in a critical path would hurt performance badly. See LU-66.
+ */
+#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE)
+
+#define OBD_ALLOC_LARGE(ptr, size)                                         \
+do {                                                                     \
+       if (size > OBD_ALLOC_BIG)                                            \
+               OBD_VMALLOC(ptr, size);                                \
+       else                                                              \
+               OBD_ALLOC(ptr, size);                                    \
+} while (0)
+
+#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size)                           \
+do {                                                                         \
+       if (size > OBD_ALLOC_BIG)                                             \
+               OBD_CPT_VMALLOC(ptr, cptab, cpt, size);                       \
+       else                                                                  \
+               OBD_CPT_ALLOC(ptr, cptab, cpt, size);                         \
+} while (0)
+
+#define OBD_FREE_LARGE(ptr, size)                                           \
+do {                                                                     \
+       if (size > OBD_ALLOC_BIG)                                            \
+               OBD_VFREE(ptr, size);                                    \
+       else                                                              \
+               OBD_FREE(ptr, size);                                      \
+} while (0)
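
A minimal pairing sketch (illustrative, not from the patch; buffer name and size are made up): the size handed to OBD_FREE_LARGE() must be the same one given to OBD_ALLOC_LARGE(), because the size alone selects the kmalloc or vmalloc path on both sides.

        /* sketch only -- assumes 4 KiB pages, so 64 KiB > OBD_ALLOC_BIG */
        static int example_large_buffer(void)
        {
                void *buf;
                int buf_size = 64 * 1024;

                OBD_ALLOC_LARGE(buf, buf_size);         /* vmalloc path */
                if (buf == NULL)
                        return -ENOMEM;
                /* ... use buf ... */
                OBD_FREE_LARGE(buf, buf_size);          /* matching vfree path */
                return 0;
        }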
+
+
+#ifdef CONFIG_DEBUG_SLAB
+#define POISON(ptr, c, s) do {} while (0)
+#define POISON_PTR(ptr)  ((void)0)
+#else
+#define POISON(ptr, c, s) memset(ptr, c, s)
+#define POISON_PTR(ptr)  (ptr) = (void *)0xdeadbeef
+#endif
+
+#ifdef POISON_BULK
+#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE);   \
+                                   kunmap(page); } while (0)
+#else
+#define POISON_PAGE(page, val) do { } while (0)
+#endif
+
+#define OBD_FREE(ptr, size)                                               \
+do {                                                                     \
+       OBD_FREE_PRE(ptr, size, "kfreed");                                  \
+       kfree(ptr);                                                     \
+       POISON_PTR(ptr);                                                      \
+} while(0)
+
+
+#define OBD_FREE_RCU(ptr, size, handle)                                              \
+do {                                                                         \
+       struct portals_handle *__h = (handle);                                \
+                                                                             \
+       LASSERT(handle != NULL);                                              \
+       __h->h_cookie = (unsigned long)(ptr);                                 \
+       __h->h_size = (size);                                                 \
+       call_rcu(&__h->h_rcu, class_handle_free_cb);                          \
+       POISON_PTR(ptr);                                                      \
+} while(0)
+
+
+#define OBD_VFREE(ptr, size)                           \
+       do {                                            \
+               OBD_FREE_PRE(ptr, size, "vfreed");      \
+               vfree(ptr);                     \
+               POISON_PTR(ptr);                        \
+       } while (0)
+
+/* we memset() the slab object to 0 when allocation succeeds, so DO NOT
+ * HAVE A CTOR THAT DOES ANYTHING.  its work will be cleared here.  we'd
+ * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */
+#define OBD_SLAB_FREE_RTN0(ptr, slab)                                   \
+({                                                                         \
+       kmem_cache_free((slab), (ptr));                             \
+       (ptr) = NULL;                                                    \
+       0;                                                                  \
+})
+
+#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type)          \
+do {                                                                         \
+       LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt()));         \
+       (ptr) = (cptab) == NULL ?                                             \
+               kmem_cache_alloc(slab, type) :                        \
+               cfs_mem_cache_cpt_alloc(slab, cptab, cpt, type);              \
+       if (likely((ptr) != NULL &&                                        \
+                  (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 ||       \
+                   !obd_alloc_fail(ptr, #ptr, "slab-", size,            \
+                                   __FILE__, __LINE__) ||                  \
+                   OBD_SLAB_FREE_RTN0(ptr, slab)))) {                  \
+               memset(ptr, 0, size);                                    \
+               OBD_ALLOC_POST(ptr, size, "slab-alloced");                  \
+       }                                                                    \
+} while(0)
+
+#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags)                           \
+       __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags)
+#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags)           \
+       __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags)
+
+#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr))
+
+#define OBD_SLAB_FREE(ptr, slab, size)                                 \
+do {                                                                     \
+       OBD_FREE_PRE(ptr, size, "slab-freed");                          \
+       kmem_cache_free(slab, ptr);                                     \
+       POISON_PTR(ptr);                                                      \
+} while(0)
+
+#define OBD_SLAB_ALLOC(ptr, slab, size)                                              \
+       OBD_SLAB_ALLOC_GFP(ptr, slab, size, __GFP_IO)
+
+#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size)                              \
+       OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, __GFP_IO)
+
+#define OBD_SLAB_ALLOC_PTR(ptr, slab)                                        \
+       OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr))
+
+#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt)                        \
+       OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr))
+
+#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags)                             \
+       OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags)
+
+#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags)                     \
+       OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags)
+
+#define OBD_SLAB_FREE_PTR(ptr, slab)                                         \
+       OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr))
+
+#define KEY_IS(str) \
+       (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0)
+
+/* Wrapper for contiguous page frame allocation */
+#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)                  \
+do {                                                                         \
+       (ptr) = (cptab) == NULL ?                                             \
+               alloc_page(gfp_mask) :                                \
+               cfs_page_cpt_alloc(cptab, cpt, gfp_mask);                     \
+       if (unlikely((ptr) == NULL)) {                                  \
+               CERROR("alloc_pages of '" #ptr "' %d page(s) / "LPU64" bytes "\
+                      "failed\n", (int)1,                                  \
+                      (__u64)(1 << PAGE_CACHE_SHIFT));                  \
+               CERROR(LPU64" total bytes and "LPU64" total pages "        \
+                      "("LPU64" bytes) allocated by Lustre, "          \
+                      "%d total bytes by LNET\n",                          \
+                      obd_memory_sum(),                                      \
+                      obd_pages_sum() << PAGE_CACHE_SHIFT,                  \
+                      obd_pages_sum(),                                \
+                      atomic_read(&libcfs_kmemory));                \
+       } else {                                                              \
+               obd_pages_add(0);                                            \
+               CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / "      \
+                      LPU64" bytes at %p.\n",                          \
+                      (int)1,                                          \
+                      (__u64)(1 << PAGE_CACHE_SHIFT), ptr);                \
+       }                                                                    \
+} while (0)
+
+#define OBD_PAGE_ALLOC(ptr, gfp_mask)                                        \
+       __OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask)
+#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask)                        \
+       __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)
+
+#define OBD_PAGE_FREE(ptr)                                                 \
+do {                                                                     \
+       LASSERT(ptr);                                                    \
+       obd_pages_sub(0);                                                    \
+       CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / "LPU64" bytes " \
+              "at %p.\n",                                                  \
+              (int)1, (__u64)(1 << PAGE_CACHE_SHIFT),                    \
+              ptr);                                                      \
+       __free_page(ptr);                                                  \
+       (ptr) = (void *)0xdeadbeef;                                        \
+} while (0)
+
+#endif
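
Closing the header, the basic pattern everything above supports, as a minimal sketch (illustrative only; the structure used here is just an example): each OBD_ALLOC* call is paired with an OBD_FREE* of the same size so the obd_memory accounting stays balanced.

        /* sketch only -- not part of the patch */
        static int example_alloc_pattern(void)
        {
                struct obd_statfs *osfs;

                OBD_ALLOC_PTR(osfs);    /* zeroed, accounted, fail-injectable */
                if (osfs == NULL)
                        return -ENOMEM;
                /* ... fill in and use osfs ... */
                OBD_FREE_PTR(osfs);     /* un-accounts and poisons the pointer */
                return 0;
        }
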
diff --git a/drivers/staging/lustre/lustre/lclient/glimpse.c b/drivers/staging/lustre/lustre/lclient/glimpse.c
new file mode 100644 (file)
index 0000000..7f3974b
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * glimpse code shared between vvp and liblustre (and other Lustre clients in
+ * the future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+
+# include <lustre_dlm.h>
+# include <lustre_lite.h>
+# include <lustre_mdc.h>
+# include <linux/pagemap.h>
+# include <linux/file.h>
+
+#include "cl_object.h"
+#include "lclient.h"
+# include "../llite/llite_internal.h"
+
+static const struct cl_lock_descr whole_file = {
+       .cld_start = 0,
+       .cld_end   = CL_PAGE_EOF,
+       .cld_mode  = CLM_READ
+};
+
+/*
+ * Check whether the file has possibly unwritten pages.
+ *
+ * \retval 1    file is mmap-ed or has dirty pages
+ * \retval 0    otherwise
+ */
+blkcnt_t dirty_cnt(struct inode *inode)
+{
+       blkcnt_t cnt = 0;
+       struct ccc_object *vob = cl_inode2ccc(inode);
+       void          *results[1];
+
+       if (inode->i_mapping != NULL)
+               cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree,
+                                                 results, 0, 1,
+                                                 PAGECACHE_TAG_DIRTY);
+       if (cnt == 0 && atomic_read(&vob->cob_mmap_cnt) > 0)
+               cnt = 1;
+
+       return (cnt > 0) ? 1 : 0;
+}
+
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+                   struct inode *inode, struct cl_object *clob, int agl)
+{
+       struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr;
+       struct cl_inode_info *lli   = cl_i2info(inode);
+       const struct lu_fid  *fid   = lu_object_fid(&clob->co_lu);
+       struct ccc_io   *cio   = ccc_env_io(env);
+       struct cl_lock       *lock;
+       int result;
+
+       ENTRY;
+       result = 0;
+       if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) {
+               CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid));
+               if (lli->lli_has_smd) {
+                       /* NOTE: this looks like a DLM lock request, but it
+                        *       may not be one. Due to the CEF_ASYNC flag
+                        *       (translated to LDLM_FL_HAS_INTENT by osc),
+                        *       this is a glimpse request that won't revoke
+                        *       any conflicting DLM locks held. Instead,
+                        *       ll_glimpse_callback() will be called on each
+                        *       client holding a DLM lock against this file,
+                        *       and the resulting size will be returned for
+                        *       each stripe. A DLM lock on [0, EOF] is
+                        *       acquired only if there were no conflicting
+                        *       locks. If there were conflicting locks,
+                        *       enqueuing or waiting fails with -ENAVAIL, but
+                        *       valid inode attributes are returned anyway. */
+                       *descr = whole_file;
+                       descr->cld_obj   = clob;
+                       descr->cld_mode  = CLM_PHANTOM;
+                       descr->cld_enq_flags = CEF_ASYNC | CEF_MUST;
+                       if (agl)
+                               descr->cld_enq_flags |= CEF_AGL;
+                       cio->cui_glimpse = 1;
+                       /*
+                        * CEF_ASYNC is used because glimpse sub-locks cannot
+                        * deadlock (because they never conflict with other
+                        * locks) and, hence, can be enqueued out-of-order.
+                        *
+                        * CEF_MUST protects glimpse lock from conversion into
+                        * a lockless mode.
+                        */
+                       lock = cl_lock_request(env, io, descr, "glimpse",
+                                              current);
+                       cio->cui_glimpse = 0;
+
+                       if (lock == NULL)
+                               RETURN(0);
+
+                       if (IS_ERR(lock))
+                               RETURN(PTR_ERR(lock));
+
+                       LASSERT(agl == 0);
+                       result = cl_wait(env, lock);
+                       if (result == 0) {
+                               cl_merge_lvb(env, inode);
+                               if (cl_isize_read(inode) > 0 &&
+                                   inode->i_blocks == 0) {
+                                       /*
+                                        * LU-417: Add the dirty page block
+                                        * count lest i_blocks report 0;
+                                        * otherwise some "cp" or "tar" may
+                                        * think it is a completely sparse
+                                        * file and skip it.
+                                        */
+                                       inode->i_blocks = dirty_cnt(inode);
+                               }
+                               cl_unuse(env, lock);
+                       }
+                       cl_lock_release(env, lock, "glimpse", current);
+               } else {
+                       CDEBUG(D_DLMTRACE, "No objects for inode\n");
+                       cl_merge_lvb(env, inode);
+               }
+       }
+
+       RETURN(result);
+}
+
+static int cl_io_get(struct inode *inode, struct lu_env **envout,
+                    struct cl_io **ioout, int *refcheck)
+{
+       struct lu_env     *env;
+       struct cl_io       *io;
+       struct cl_inode_info   *lli = cl_i2info(inode);
+       struct cl_object       *clob = lli->lli_clob;
+       int result;
+
+       if (S_ISREG(cl_inode_mode(inode))) {
+               env = cl_env_get(refcheck);
+               if (!IS_ERR(env)) {
+                       io = ccc_env_thread_io(env);
+                       io->ci_obj = clob;
+                       *envout = env;
+                       *ioout  = io;
+                       result = +1;
+               } else
+                       result = PTR_ERR(env);
+       } else
+               result = 0;
+       return result;
+}
+
+int cl_glimpse_size0(struct inode *inode, int agl)
+{
+       /*
+        * We don't need ast_flags argument to cl_glimpse_size(), because
+        * osc_lock_enqueue() takes care of the possible deadlock that said
+        * argument was introduced to avoid.
+        */
+       /*
+        * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to
+        * cl_glimpse_size(), which doesn't make sense: glimpse locks are not
+        * blocking anyway.
+        */
+       struct lu_env     *env = NULL;
+       struct cl_io       *io  = NULL;
+       int                  result;
+       int                  refcheck;
+
+       ENTRY;
+
+       result = cl_io_get(inode, &env, &io, &refcheck);
+       if (result > 0) {
+       again:
+               io->ci_verify_layout = 1;
+               result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+               if (result > 0)
+                       /*
+                        * nothing to do for this io. This currently happens
+                        * when stripe sub-objects are not yet created.
+                        */
+                       result = io->ci_result;
+               else if (result == 0)
+                       result = cl_glimpse_lock(env, io, inode, io->ci_obj,
+                                                agl);
+
+               OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
+               cl_io_fini(env, io);
+               if (unlikely(io->ci_need_restart))
+                       goto again;
+               cl_env_put(env, &refcheck);
+       }
+       RETURN(result);
+}
+
+int cl_local_size(struct inode *inode)
+{
+       struct lu_env      *env = NULL;
+       struct cl_io        *io  = NULL;
+       struct ccc_thread_info  *cti;
+       struct cl_object        *clob;
+       struct cl_lock_descr    *descr;
+       struct cl_lock    *lock;
+       int                   result;
+       int                   refcheck;
+
+       ENTRY;
+
+       if (!cl_i2info(inode)->lli_has_smd)
+               RETURN(0);
+
+       result = cl_io_get(inode, &env, &io, &refcheck);
+       if (result <= 0)
+               RETURN(result);
+
+       clob = io->ci_obj;
+       result = cl_io_init(env, io, CIT_MISC, clob);
+       if (result > 0)
+               result = io->ci_result;
+       else if (result == 0) {
+               cti = ccc_env_info(env);
+               descr = &cti->cti_descr;
+
+               *descr = whole_file;
+               descr->cld_obj = clob;
+               lock = cl_lock_peek(env, io, descr, "localsize", current);
+               if (lock != NULL) {
+                       cl_merge_lvb(env, inode);
+                       cl_unuse(env, lock);
+                       cl_lock_release(env, lock, "localsize", current);
+                       result = 0;
+               } else
+                       result = -ENODATA;
+       }
+       cl_io_fini(env, io);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
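
A minimal caller sketch (illustrative only; the wrapper name is made up): stat-like paths refresh i_size through cl_glimpse_size0() before reporting attributes, with agl == 0 requesting a plain, synchronous glimpse.

        /* sketch only -- not part of the patch */
        static int example_refresh_size(struct inode *inode)
        {
                int rc;

                rc = cl_glimpse_size0(inode, 0);        /* agl == 0: wait for size */
                if (rc != 0)
                        return rc;
                /* i_size/i_blocks now reflect at least the cluster-wide KMS */
                return 0;
        }
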
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
new file mode 100644 (file)
index 0000000..4a01666
--- /dev/null
@@ -0,0 +1,1325 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/fs.h>
+# include <linux/sched.h>
+# include <linux/mm.h>
+# include <linux/quotaops.h>
+# include <linux/highmem.h>
+# include <linux/pagemap.h>
+# include <linux/rbtree.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_mdc.h>
+#include <cl_object.h>
+
+#include <lclient.h>
+
+#include "../llite/llite_internal.h"
+
+const struct cl_req_operations ccc_req_ops;
+
+/*
+ * ccc_ prefix stands for "Common Client Code".
+ */
+
+static struct kmem_cache *ccc_lock_kmem;
+static struct kmem_cache *ccc_object_kmem;
+static struct kmem_cache *ccc_thread_kmem;
+static struct kmem_cache *ccc_session_kmem;
+static struct kmem_cache *ccc_req_kmem;
+
+static struct lu_kmem_descr ccc_caches[] = {
+       {
+               .ckd_cache = &ccc_lock_kmem,
+               .ckd_name  = "ccc_lock_kmem",
+               .ckd_size  = sizeof (struct ccc_lock)
+       },
+       {
+               .ckd_cache = &ccc_object_kmem,
+               .ckd_name  = "ccc_object_kmem",
+               .ckd_size  = sizeof (struct ccc_object)
+       },
+       {
+               .ckd_cache = &ccc_thread_kmem,
+               .ckd_name  = "ccc_thread_kmem",
+               .ckd_size  = sizeof (struct ccc_thread_info),
+       },
+       {
+               .ckd_cache = &ccc_session_kmem,
+               .ckd_name  = "ccc_session_kmem",
+               .ckd_size  = sizeof (struct ccc_session)
+       },
+       {
+               .ckd_cache = &ccc_req_kmem,
+               .ckd_name  = "ccc_req_kmem",
+               .ckd_size  = sizeof (struct ccc_req)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+void *ccc_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+       struct ccc_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+void ccc_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct ccc_thread_info *info = data;
+       OBD_SLAB_FREE_PTR(info, ccc_thread_kmem);
+}
+
+void *ccc_session_key_init(const struct lu_context *ctx,
+                                 struct lu_context_key *key)
+{
+       struct ccc_session *session;
+
+       OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, __GFP_IO);
+       if (session == NULL)
+               session = ERR_PTR(-ENOMEM);
+       return session;
+}
+
+void ccc_session_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+       struct ccc_session *session = data;
+       OBD_SLAB_FREE_PTR(session, ccc_session_kmem);
+}
+
+struct lu_context_key ccc_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = ccc_key_init,
+       .lct_fini = ccc_key_fini
+};
+
+struct lu_context_key ccc_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = ccc_session_key_init,
+       .lct_fini = ccc_session_key_fini
+};
+
+
+/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). */
+// LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key);
+
+int ccc_device_init(const struct lu_env *env, struct lu_device *d,
+                          const char *name, struct lu_device *next)
+{
+       struct ccc_device  *vdv;
+       int rc;
+       ENTRY;
+
+       vdv = lu2ccc_dev(d);
+       vdv->cdv_next = lu2cl_dev(next);
+
+       LASSERT(d->ld_site != NULL && next->ld_type != NULL);
+       next->ld_site = d->ld_site;
+       rc = next->ld_type->ldt_ops->ldto_device_init(
+                       env, next, next->ld_type->ldt_name, NULL);
+       if (rc == 0) {
+               lu_device_get(next);
+               lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+       }
+       RETURN(rc);
+}
+
+struct lu_device *ccc_device_fini(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       return cl2lu_dev(lu2ccc_dev(d)->cdv_next);
+}
+
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+                                  struct lu_device_type *t,
+                                  struct lustre_cfg *cfg,
+                                  const struct lu_device_operations *luops,
+                                  const struct cl_device_operations *clops)
+{
+       struct ccc_device *vdv;
+       struct lu_device  *lud;
+       struct cl_site    *site;
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(vdv);
+       if (vdv == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       lud = &vdv->cdv_cl.cd_lu_dev;
+       cl_device_init(&vdv->cdv_cl, t);
+       ccc2lu_dev(vdv)->ld_ops = luops;
+       vdv->cdv_cl.cd_ops = clops;
+
+       OBD_ALLOC_PTR(site);
+       if (site != NULL) {
+               rc = cl_site_init(site, &vdv->cdv_cl);
+               if (rc == 0)
+                       rc = lu_site_init_finish(&site->cs_lu);
+               else {
+                       LASSERT(lud->ld_site == NULL);
+                       CERROR("Cannot init lu_site, rc %d.\n", rc);
+                       OBD_FREE_PTR(site);
+               }
+       } else
+               rc = -ENOMEM;
+       if (rc != 0) {
+               ccc_device_free(env, lud);
+               lud = ERR_PTR(rc);
+       }
+       RETURN(lud);
+}
+
+struct lu_device *ccc_device_free(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct ccc_device *vdv  = lu2ccc_dev(d);
+       struct cl_site    *site = lu2cl_site(d->ld_site);
+       struct lu_device  *next = cl2lu_dev(vdv->cdv_next);
+
+       if (d->ld_site != NULL) {
+               cl_site_fini(site);
+               OBD_FREE_PTR(site);
+       }
+       cl_device_fini(lu2cl_dev(d));
+       OBD_FREE_PTR(vdv);
+       return next;
+}
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+                       struct cl_req *req)
+{
+       struct ccc_req *vrq;
+       int result;
+
+       OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, __GFP_IO);
+       if (vrq != NULL) {
+               cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+/**
+ * An `emergency' environment used by ccc_inode_fini() when cl_env_get()
+ * fails. Access to this environment is serialized by ccc_inode_fini_guard
+ * mutex.
+ */
+static struct lu_env *ccc_inode_fini_env = NULL;
+
+/**
+ * A mutex serializing calls to slp_inode_fini() under extreme memory
+ * pressure, when environments cannot be allocated.
+ */
+static DEFINE_MUTEX(ccc_inode_fini_guard);
+static int dummy_refcheck;
+
+int ccc_global_init(struct lu_device_type *device_type)
+{
+       int result;
+
+       result = lu_kmem_init(ccc_caches);
+       if (result)
+               return result;
+
+       result = lu_device_type_init(device_type);
+       if (result)
+               goto out_kmem;
+
+       ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
+                                         LCT_REMEMBER|LCT_NOREF);
+       if (IS_ERR(ccc_inode_fini_env)) {
+               result = PTR_ERR(ccc_inode_fini_env);
+               goto out_device;
+       }
+
+       ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+       return 0;
+out_device:
+       lu_device_type_fini(device_type);
+out_kmem:
+       lu_kmem_fini(ccc_caches);
+       return result;
+}
+
+void ccc_global_fini(struct lu_device_type *device_type)
+{
+       if (ccc_inode_fini_env != NULL) {
+               cl_env_put(ccc_inode_fini_env, &dummy_refcheck);
+               ccc_inode_fini_env = NULL;
+       }
+       lu_device_type_fini(device_type);
+       lu_kmem_fini(ccc_caches);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *unused,
+                                  struct lu_device *dev,
+                                  const struct cl_object_operations *clops,
+                                  const struct lu_object_operations *luops)
+{
+       struct ccc_object *vob;
+       struct lu_object  *obj;
+
+       OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, __GFP_IO);
+       if (vob != NULL) {
+               struct cl_object_header *hdr;
+
+               obj = ccc2lu(vob);
+               hdr = &vob->cob_header;
+               cl_object_header_init(hdr);
+               lu_object_init(obj, &hdr->coh_lu, dev);
+               lu_object_add_top(&hdr->coh_lu, obj);
+
+               vob->cob_cl.co_ops = clops;
+               obj->lo_ops = luops;
+       } else
+               obj = NULL;
+       return obj;
+}
+
+int ccc_object_init0(const struct lu_env *env,
+                           struct ccc_object *vob,
+                           const struct cl_object_conf *conf)
+{
+       vob->cob_inode = conf->coc_inode;
+       vob->cob_transient_pages = 0;
+       cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page));
+       return 0;
+}
+
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+                          const struct lu_object_conf *conf)
+{
+       struct ccc_device *dev = lu2ccc_dev(obj->lo_dev);
+       struct ccc_object *vob = lu2ccc(obj);
+       struct lu_object  *below;
+       struct lu_device  *under;
+       int result;
+
+       under = &dev->cdv_next->cd_lu_dev;
+       below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+       if (below != NULL) {
+               const struct cl_object_conf *cconf;
+
+               cconf = lu2cl_conf(conf);
+               INIT_LIST_HEAD(&vob->cob_pending_list);
+               lu_object_add(obj, below);
+               result = ccc_object_init0(env, vob, cconf);
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct ccc_object *vob = lu2ccc(obj);
+
+       lu_object_fini(obj);
+       lu_object_header_fini(obj->lo_header);
+       OBD_SLAB_FREE_PTR(vob, ccc_object_kmem);
+}
+
+int ccc_lock_init(const struct lu_env *env,
+                 struct cl_object *obj, struct cl_lock *lock,
+                 const struct cl_io *unused,
+                 const struct cl_lock_operations *lkops)
+{
+       struct ccc_lock *clk;
+       int result;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, __GFP_IO);
+       if (clk != NULL) {
+               cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_attr *attr, unsigned valid)
+{
+       return 0;
+}
+
+int ccc_object_glimpse(const struct lu_env *env,
+                      const struct cl_object *obj, struct ost_lvb *lvb)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       ENTRY;
+       lvb->lvb_mtime = cl_inode_mtime(inode);
+       lvb->lvb_atime = cl_inode_atime(inode);
+       lvb->lvb_ctime = cl_inode_ctime(inode);
+       /*
+        * LU-417: Add the dirty page block count lest i_blocks report 0;
+        * otherwise some "cp" or "tar" on a remote node may think it is a
+        * completely sparse file and skip it.
+        */
+       if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0)
+               lvb->lvb_blocks = dirty_cnt(inode);
+       RETURN(0);
+}
+
+
+
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_object_conf *conf)
+{
+       /* TODO: destroy all pages attached to this object. */
+       return 0;
+}
+
+static void ccc_object_size_lock(struct cl_object *obj)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       cl_isize_lock(inode);
+       cl_object_attr_lock(obj);
+}
+
+static void ccc_object_size_unlock(struct cl_object *obj)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       cl_object_attr_unlock(obj);
+       cl_isize_unlock(inode);
+}
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+struct page *ccc_page_vmpage(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+       return cl2vm_page(slice);
+}
+
+int ccc_page_is_under_lock(const struct lu_env *env,
+                          const struct cl_page_slice *slice,
+                          struct cl_io *io)
+{
+       struct ccc_io   *cio  = ccc_env_io(env);
+       struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr;
+       struct cl_page       *page = slice->cpl_page;
+
+       int result;
+
+       ENTRY;
+
+       if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+           io->ci_type == CIT_FAULT) {
+               if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+                       result = -EBUSY;
+               else {
+                       desc->cld_start = page->cp_index;
+                       desc->cld_end   = page->cp_index;
+                       desc->cld_obj   = page->cp_obj;
+                       desc->cld_mode  = CLM_READ;
+                       result = cl_queue_match(&io->ci_lockset.cls_done,
+                                               desc) ? -EBUSY : 0;
+               }
+       } else
+               result = 0;
+       RETURN(result);
+}
+
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice)
+{
+       /*
+        * Cached read?
+        */
+       LBUG();
+       return 0;
+}
+
+void ccc_transient_page_verify(const struct cl_page *page)
+{
+}
+
+int ccc_transient_page_own(const struct lu_env *env,
+                                  const struct cl_page_slice *slice,
+                                  struct cl_io *unused,
+                                  int nonblock)
+{
+       ccc_transient_page_verify(slice->cpl_page);
+       return 0;
+}
+
+void ccc_transient_page_assume(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *unused)
+{
+       ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_unassume(const struct lu_env *env,
+                                       const struct cl_page_slice *slice,
+                                       struct cl_io *unused)
+{
+       ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_disown(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *unused)
+{
+       ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_discard(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *unused)
+{
+       struct cl_page *page = slice->cpl_page;
+
+       ccc_transient_page_verify(slice->cpl_page);
+
+       /*
+        * For transient pages, remove the page from the radix tree.
+        */
+       cl_page_delete(env, page);
+}
+
+int ccc_transient_page_prep(const struct lu_env *env,
+                                  const struct cl_page_slice *slice,
+                                  struct cl_io *unused)
+{
+       ENTRY;
+       /* transient page should always be sent. */
+       RETURN(0);
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+void ccc_lock_delete(const struct lu_env *env,
+                    const struct cl_lock_slice *slice)
+{
+       CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+}
+
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice)
+{
+       struct ccc_lock *clk = cl2ccc_lock(slice);
+       OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem);
+}
+
+int ccc_lock_enqueue(const struct lu_env *env,
+                    const struct cl_lock_slice *slice,
+                    struct cl_io *unused, __u32 enqflags)
+{
+       CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+       return 0;
+}
+
+int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+       CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+       return 0;
+}
+
+int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+       CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+       return 0;
+}
+
+/**
+ * Implementation of the cl_lock_operations::clo_fits_into() method for the
+ * ccc layer. This function is executed every time an io finds an existing
+ * lock in the lock cache while creating a new lock. It has to decide whether
+ * the cached lock "fits" into the io.
+ *
+ * \param slice lock to be checked
+ * \param io    IO that wants a lock.
+ *
+ * \see lov_lock_fits_into().
+ */
+int ccc_lock_fits_into(const struct lu_env *env,
+                      const struct cl_lock_slice *slice,
+                      const struct cl_lock_descr *need,
+                      const struct cl_io *io)
+{
+       const struct cl_lock       *lock  = slice->cls_lock;
+       const struct cl_lock_descr *descr = &lock->cll_descr;
+       const struct ccc_io     *cio   = ccc_env_io(env);
+       int                      result;
+
+       ENTRY;
+       /*
+        * Work around a DLM peculiarity: it assumes that a glimpse
+        * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns a read lock
+        * when asked for an LCK_PW lock with the LDLM_FL_HAS_INTENT flag set.
+        * Make sure that glimpse doesn't get a CLM_WRITE top-lock, so that it
+        * doesn't enqueue CLM_WRITE sub-locks.
+        */
+       if (cio->cui_glimpse)
+               result = descr->cld_mode != CLM_WRITE;
+
+       /*
+        * Also, don't match incomplete write locks for read, otherwise read
+        * would enqueue missing sub-locks in the write mode.
+        */
+       else if (need->cld_mode != descr->cld_mode)
+               result = lock->cll_state >= CLS_ENQUEUED;
+       else
+               result = 1;
+       RETURN(result);
+}
+
+/**
+ * Implements the cl_lock_operations::clo_state() method for the ccc layer,
+ * invoked whenever the lock state changes. Transfers into the inode any
+ * object attributes that might have been updated as a result of acquiring
+ * the lock.
+ */
+void ccc_lock_state(const struct lu_env *env,
+                   const struct cl_lock_slice *slice,
+                   enum cl_lock_state state)
+{
+       struct cl_lock *lock = slice->cls_lock;
+       ENTRY;
+
+       /*
+        * Refresh inode attributes when the lock is moving into CLS_HELD
+        * state, and only when this is a result of real enqueue, rather than
+        * of finding lock in the cache.
+        */
+       if (state == CLS_HELD && lock->cll_state < CLS_HELD) {
+               struct cl_object *obj;
+               struct inode     *inode;
+
+               obj   = slice->cls_obj;
+               inode = ccc_object_inode(obj);
+
+               /* vmtruncate() sets the i_size
+                * under both a DLM lock and the
+                * ll_inode_size_lock().  If we don't get the
+                * ll_inode_size_lock() here we can match the DLM lock and
+                * reset i_size.  generic_file_write can then trust the
+                * stale i_size when doing appending writes and effectively
+                * cancel the result of the truncate.  Getting the
+                * ll_inode_size_lock() after the enqueue maintains the DLM
+                * -> ll_inode_size_lock() acquiring order. */
+               if (lock->cll_descr.cld_start == 0 &&
+                   lock->cll_descr.cld_end == CL_PAGE_EOF)
+                       cl_merge_lvb(env, inode);
+       }
+       EXIT;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       struct cl_io *io = ios->cis_io;
+
+       CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+}
+
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+                         __u32 enqflags, enum cl_lock_mode mode,
+                         pgoff_t start, pgoff_t end)
+{
+       struct ccc_io     *cio   = ccc_env_io(env);
+       struct cl_lock_descr   *descr = &cio->cui_link.cill_descr;
+       struct cl_object       *obj   = io->ci_obj;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end);
+
+       memset(&cio->cui_link, 0, sizeof cio->cui_link);
+
+       if (cio->cui_fd && (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+               descr->cld_mode = CLM_GROUP;
+               descr->cld_gid  = cio->cui_fd->fd_grouplock.cg_gid;
+       } else {
+               descr->cld_mode  = mode;
+       }
+       descr->cld_obj   = obj;
+       descr->cld_start = start;
+       descr->cld_end   = end;
+       descr->cld_enq_flags = enqflags;
+
+       cl_io_lock_add(env, io, &cio->cui_link);
+       RETURN(0);
+}
+
+void ccc_io_update_iov(const struct lu_env *env,
+                      struct ccc_io *cio, struct cl_io *io)
+{
+       int i;
+       size_t size = io->u.ci_rw.crw_count;
+
+       cio->cui_iov_olen = 0;
+       if (!cl_is_normalio(env, io) || cio->cui_tot_nrsegs == 0)
+               return;
+
+       for (i = 0; i < cio->cui_tot_nrsegs; i++) {
+               struct iovec *iv = &cio->cui_iov[i];
+
+               if (iv->iov_len < size)
+                       size -= iv->iov_len;
+               else {
+                       if (iv->iov_len > size) {
+                               cio->cui_iov_olen = iv->iov_len;
+                               iv->iov_len = size;
+                       }
+                       break;
+               }
+       }
+
+       cio->cui_nrsegs = i + 1;
+       LASSERTF(cio->cui_tot_nrsegs >= cio->cui_nrsegs,
+                "tot_nrsegs: %lu, nrsegs: %lu\n",
+                cio->cui_tot_nrsegs, cio->cui_nrsegs);
+}
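
ccc_io_update_iov() above clips the user iovec so that it covers exactly io->u.ci_rw.crw_count bytes, remembering the clipped segment's original length in cui_iov_olen so that ccc_io_advance() can restore it later. Below is a minimal standalone sketch of the same trimming arithmetic as a hypothetical userspace helper (not part of this patch), assuming the byte budget never exceeds the total length of the iovec:

#include <stddef.h>
#include <sys/uio.h>

/* Clip 'iov' so that it covers exactly 'budget' bytes; remember the clipped
 * segment's original length in *clipped_len so the caller can restore it.
 * Returns the number of segments actually needed. */
static size_t trim_iov_to_budget(struct iovec *iov, size_t nsegs,
                                 size_t budget, size_t *clipped_len)
{
        size_t i;

        *clipped_len = 0;
        for (i = 0; i < nsegs; i++) {
                if (iov[i].iov_len < budget) {
                        budget -= iov[i].iov_len;
                } else {
                        if (iov[i].iov_len > budget) {
                                *clipped_len = iov[i].iov_len;
                                iov[i].iov_len = budget;
                        }
                        break;
                }
        }
        return i + 1;
}

int main(void)
{
        char a[4], b[8];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        size_t clipped, used;

        used = trim_iov_to_budget(iov, 2, 6, &clipped);
        /* used == 2, iov[1].iov_len == 2, clipped == 8 */
        return (used == 2 && iov[1].iov_len == 2 && clipped == 8) ? 0 : 1;
}
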
+
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+                   __u32 enqflags, enum cl_lock_mode mode,
+                   loff_t start, loff_t end)
+{
+       struct cl_object *obj = io->ci_obj;
+       return ccc_io_one_lock_index(env, io, enqflags, mode,
+                                    cl_index(obj, start), cl_index(obj, end));
+}
+
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       CLOBINVRNT(env, ios->cis_io->ci_obj,
+                  ccc_object_invariant(ios->cis_io->ci_obj));
+}
+
+void ccc_io_advance(const struct lu_env *env,
+                   const struct cl_io_slice *ios,
+                   size_t nob)
+{
+       struct ccc_io    *cio = cl2ccc_io(env, ios);
+       struct cl_io     *io  = ios->cis_io;
+       struct cl_object *obj = ios->cis_io->ci_obj;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       if (!cl_is_normalio(env, io))
+               return;
+
+       LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs);
+       LASSERT(cio->cui_tot_count  >= nob);
+
+       cio->cui_iov    += cio->cui_nrsegs;
+       cio->cui_tot_nrsegs -= cio->cui_nrsegs;
+       cio->cui_tot_count  -= nob;
+
+       /* update the iov */
+       if (cio->cui_iov_olen > 0) {
+               struct iovec *iv;
+
+               cio->cui_iov--;
+               cio->cui_tot_nrsegs++;
+               iv = &cio->cui_iov[0];
+               if (io->ci_continue) {
+                       iv->iov_base += iv->iov_len;
+                       LASSERT(cio->cui_iov_olen > iv->iov_len);
+                       iv->iov_len = cio->cui_iov_olen - iv->iov_len;
+               } else {
+                       /* restore the iov_len, in case of restart io. */
+                       iv->iov_len = cio->cui_iov_olen;
+               }
+               cio->cui_iov_olen = 0;
+       }
+}
+
+/**
+ * Helper function that, if necessary, adjusts the file size (inode->i_size)
+ * when the position at offset \a pos is accessed. The file size can be
+ * arbitrarily stale on a Lustre client, but the client at least knows the
+ * KMS (known minimum size). If the accessed area is inside [0, KMS], set the
+ * file size to KMS; otherwise glimpse the file size.
+ *
+ * Locking: cl_isize_lock is used to serialize changes to inode size and to
+ * protect consistency between inode size and cl_object
+ * attributes. cl_object_size_lock() protects consistency between cl_attr's of
+ * top-object and sub-objects.
+ */
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_io *io, loff_t start, size_t count, int *exceed)
+{
+       struct cl_attr *attr  = ccc_env_thread_attr(env);
+       struct inode   *inode = ccc_object_inode(obj);
+       loff_t    pos   = start + count - 1;
+       loff_t kms;
+       int result;
+
+       /*
+        * Consistency guarantees: following possibilities exist for the
+        * relation between region being accessed and real file size at this
+        * moment:
+        *
+        *  (A): the region is completely inside of the file;
+        *
+        *  (B-x): x bytes of region are inside of the file, the rest is
+        *  outside;
+        *
+        *  (C): the region is completely outside of the file.
+        *
+        * This classification is stable under DLM lock already acquired by
+        * the caller, because to change the class, other client has to take
+        * DLM lock conflicting with our lock. Also, any updates to ->i_size
+        * by other threads on this client are serialized by
+        * ll_inode_size_lock(). This guarantees that short reads are handled
+        * correctly in the face of concurrent writes and truncates.
+        */
+       ccc_object_size_lock(obj);
+       result = cl_object_attr_get(env, obj, attr);
+       if (result == 0) {
+               kms = attr->cat_kms;
+               if (pos > kms) {
+                       /*
+                        * A glimpse is necessary to determine whether we
+                        * return a short read (B) or some zeroes at the end
+                        * of the buffer (C)
+                        */
+                       ccc_object_size_unlock(obj);
+                       result = cl_glimpse_lock(env, io, inode, obj, 0);
+                       if (result == 0 && exceed != NULL) {
+                               /* If the accessed page index exceeds the
+                                * end-of-file page index, return directly.
+                                * Do not expect the kernel to check such a
+                                * case correctly; linux-2.6.18-128.1.1
+                                * misses doing that. --bug 17336 */
+                               loff_t size = cl_isize_read(inode);
+                               unsigned long cur_index = start >> PAGE_CACHE_SHIFT;
+
+                               if ((size == 0 && cur_index != 0) ||
+                                   (((size - 1) >> PAGE_CACHE_SHIFT) < cur_index))
+                                       *exceed = 1;
+                       }
+                       return result;
+               } else {
+                       /*
+                        * region is within kms and, hence, within real file
+                        * size (A). We need to increase i_size to cover the
+                        * read region so that generic_file_read() will do its
+                        * job, but that doesn't mean the kms size is
+                        * _correct_, it is only the _minimum_ size. If
+                        * someone does a stat they will get the correct size
+                        * which will always be >= the kms value here.
+                        * b=11081
+                        */
+                       if (cl_isize_read(inode) < kms) {
+                               cl_isize_write_nolock(inode, kms);
+                               CDEBUG(D_VFSTRACE,
+                                      DFID" updating i_size "LPU64"\n",
+                                      PFID(lu_object_fid(&obj->co_lu)),
+                                      (__u64)cl_isize_read(inode));
+
+                       }
+               }
+       }
+       ccc_object_size_unlock(obj);
+       return result;
+}
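
The exceed check above only compares page indices after the glimpse: the read is reported as past EOF when its starting page index lies beyond the page index of the last byte of the file. A minimal standalone sketch of that index arithmetic (hypothetical helper, 4 KiB pages assumed for the example):

#include <stdio.h>

#define EX_PAGE_SHIFT 12        /* 4 KiB pages, assumed for illustration */

static int read_exceeds_eof(unsigned long long size, unsigned long long start)
{
        unsigned long long cur_index = start >> EX_PAGE_SHIFT;
        unsigned long long eof_index = size ? (size - 1) >> EX_PAGE_SHIFT : 0;

        return (size == 0 && cur_index != 0) || eof_index < cur_index;
}

int main(void)
{
        /* A 5000-byte file occupies pages 0 and 1; a read starting at
         * offset 16384 (page 4) is entirely past EOF. */
        printf("%d\n", read_exceeds_eof(5000, 16384));  /* prints 1 */
        printf("%d\n", read_exceeds_eof(5000, 4096));   /* prints 0 */
        return 0;
}
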
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+void ccc_req_completion(const struct lu_env *env,
+                       const struct cl_req_slice *slice, int ioret)
+{
+       struct ccc_req *vrq;
+
+       if (ioret > 0)
+               cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret);
+
+       vrq = cl2ccc_req(slice);
+       OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for ccc
+ * layer. ccc is responsible for
+ *
+ *    - o_[mac]time
+ *
+ *    - o_mode
+ *
+ *    - o_parent_seq
+ *
+ *    - o_[ug]id
+ *
+ *    - o_parent_oid
+ *
+ *    - o_parent_ver
+ *
+ *    - o_ioepoch,
+ *
+ *  and capability.
+ */
+void ccc_req_attr_set(const struct lu_env *env,
+                     const struct cl_req_slice *slice,
+                     const struct cl_object *obj,
+                     struct cl_req_attr *attr, obd_valid flags)
+{
+       struct inode *inode;
+       struct obdo  *oa;
+       obd_flag      valid_flags;
+
+       oa = attr->cra_oa;
+       inode = ccc_object_inode(obj);
+       valid_flags = OBD_MD_FLTYPE;
+
+       if ((flags & OBD_MD_FLOSSCAPA) != 0) {
+               LASSERT(attr->cra_capa == NULL);
+               attr->cra_capa = cl_capa_lookup(inode,
+                                               slice->crs_req->crq_type);
+       }
+
+       if (slice->crs_req->crq_type == CRT_WRITE) {
+               if (flags & OBD_MD_FLEPOCH) {
+                       oa->o_valid |= OBD_MD_FLEPOCH;
+                       oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch;
+                       valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+                                      OBD_MD_FLUID | OBD_MD_FLGID;
+               }
+       }
+       obdo_from_inode(oa, inode, valid_flags & flags);
+       obdo_set_parent_fid(oa, &cl_i2info(inode)->lli_fid);
+       memcpy(attr->cra_jobid, cl_i2info(inode)->lli_jobid,
+              JOBSTATS_JOBID_SIZE);
+}
+
+const struct cl_req_operations ccc_req_ops = {
+       .cro_attr_set   = ccc_req_attr_set,
+       .cro_completion = ccc_req_completion
+};
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+                  struct obd_capa *capa)
+{
+       struct lu_env *env;
+       struct cl_io  *io;
+       int         result;
+       int         refcheck;
+
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = cl_i2info(inode)->lli_clob;
+
+       io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
+       io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime);
+       io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime);
+       io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size;
+       io->u.ci_setattr.sa_valid = attr->ia_valid;
+       io->u.ci_setattr.sa_capa = capa;
+
+again:
+       if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) {
+               struct ccc_io *cio = ccc_env_io(env);
+
+               if (attr->ia_valid & ATTR_FILE)
+                       /* populate the file descriptor for ftruncate to honor
+                        * group lock - see LU-787 */
+                       cio->cui_fd = cl_iattr2fd(inode, attr);
+
+               result = cl_io_loop(env, io);
+       } else {
+               result = io->ci_result;
+       }
+       cl_io_fini(env, io);
+       if (unlikely(io->ci_need_restart))
+               goto again;
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
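
For illustration, this is roughly how a truncate-style size change could be routed through cl_setattr_ost(). A hypothetical caller sketch, not part of this patch, with the capability argument left NULL and error handling trimmed; real callers typically also carry the ATTR_*TIME bits:

static int example_truncate_ost(struct inode *inode, loff_t new_size)
{
        struct iattr attr = { 0 };

        attr.ia_valid = ATTR_SIZE;      /* only the size is being changed */
        attr.ia_size  = new_size;

        return cl_setattr_ost(inode, &attr, NULL /* no capability */);
}
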
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+struct lu_device *ccc2lu_dev(struct ccc_device *vdv)
+{
+       return &vdv->cdv_cl.cd_lu_dev;
+}
+
+struct ccc_device *lu2ccc_dev(const struct lu_device *d)
+{
+       return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev);
+}
+
+struct ccc_device *cl2ccc_dev(const struct cl_device *d)
+{
+       return container_of0(d, struct ccc_device, cdv_cl);
+}
+
+struct lu_object *ccc2lu(struct ccc_object *vob)
+{
+       return &vob->cob_cl.co_lu;
+}
+
+struct ccc_object *lu2ccc(const struct lu_object *obj)
+{
+       return container_of0(obj, struct ccc_object, cob_cl.co_lu);
+}
+
+struct ccc_object *cl2ccc(const struct cl_object *obj)
+{
+       return container_of0(obj, struct ccc_object, cob_cl);
+}
+
+struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice)
+{
+       return container_of(slice, struct ccc_lock, clk_cl);
+}
+
+struct ccc_io *cl2ccc_io(const struct lu_env *env,
+                        const struct cl_io_slice *slice)
+{
+       struct ccc_io *cio;
+
+       cio = container_of(slice, struct ccc_io, cui_cl);
+       LASSERT(cio == ccc_env_io(env));
+       return cio;
+}
+
+struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice)
+{
+       return container_of0(slice, struct ccc_req, crq_cl);
+}
+
+struct page *cl2vm_page(const struct cl_page_slice *slice)
+{
+       return cl2ccc_page(slice)->cpg_page;
+}
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+int ccc_object_invariant(const struct cl_object *obj)
+{
+       struct inode     *inode = ccc_object_inode(obj);
+       struct cl_inode_info *lli   = cl_i2info(inode);
+
+       return (S_ISREG(cl_inode_mode(inode)) ||
+               /* i_mode of unlinked inode is zeroed. */
+               cl_inode_mode(inode) == 0) && lli->lli_clob == obj;
+}
+
+struct inode *ccc_object_inode(const struct cl_object *obj)
+{
+       return cl2ccc(obj)->cob_inode;
+}
+
+/**
+ * Returns a pointer to cl_page associated with \a vmpage, without acquiring
+ * additional reference to the resulting page. This is an unsafe version of
+ * cl_vmpage_page() that can only be used under vmpage lock.
+ */
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage)
+{
+       KLASSERT(PageLocked(vmpage));
+       return (struct cl_page *)vmpage->private;
+}
+
+/**
+ * Initialize or update CLIO structures for regular files when new
+ * meta-data arrives from the server.
+ *
+ * \param inode regular file inode
+ * \param md    new file metadata from MDS
+ * - allocates the cl_object if necessary,
+ * - updates the layout, if the object was already there.
+ */
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
+{
+       struct lu_env   *env;
+       struct cl_inode_info *lli;
+       struct cl_object     *clob;
+       struct lu_site       *site;
+       struct lu_fid   *fid;
+       struct cl_object_conf conf = {
+               .coc_inode = inode,
+               .u = {
+                       .coc_md    = md
+               }
+       };
+       int result = 0;
+       int refcheck;
+
+       LASSERT(md->body->valid & OBD_MD_FLID);
+       LASSERT(S_ISREG(cl_inode_mode(inode)));
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return PTR_ERR(env);
+
+       site = cl_i2sbi(inode)->ll_site;
+       lli  = cl_i2info(inode);
+       fid  = &lli->lli_fid;
+       LASSERT(fid_is_sane(fid));
+
+       if (lli->lli_clob == NULL) {
+               /* The clob is a slave of the inode: an empty lli_clob means
+                * this is a new inode, so there is no clob in the cache with
+                * the given fid and there is no need for the usual
+                * lookup-alloc-lookup-insert dance; just alloc and insert
+                * directly. */
+               LASSERT(inode->i_state & I_NEW);
+               conf.coc_lu.loc_flags = LOC_F_NEW;
+               clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
+                                     fid, &conf);
+               if (!IS_ERR(clob)) {
+                       /*
+                        * No locking is necessary, as new inode is
+                        * locked by I_NEW bit.
+                        */
+                       lli->lli_clob = clob;
+                       lli->lli_has_smd = md->lsm != NULL;
+                       lu_object_ref_add(&clob->co_lu, "inode", inode);
+               } else
+                       result = PTR_ERR(clob);
+       } else {
+               result = cl_conf_set(env, lli->lli_clob, &conf);
+       }
+
+       cl_env_put(env, &refcheck);
+
+       if (result != 0)
+               CERROR("Failure to initialize cl object "DFID": %d\n",
+                      PFID(fid), result);
+       return result;
+}
+
+/**
+ * Wait for others to drop their references to the object first, then drop
+ * the last one ourselves, which causes the object to be destroyed
+ * immediately. Must be called after cl_object_kill() against this object.
+ *
+ * The reason for this is that destroying the top object waits for its sub
+ * objects to be destroyed first, so we can't let the bottom layer (e.g. from
+ * ASTs) initiate top object destruction, which may deadlock. See bz22520.
+ */
+static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
+{
+       struct lu_object_header *header = obj->co_lu.lo_header;
+       wait_queue_t       waiter;
+
+       if (unlikely(atomic_read(&header->loh_ref) != 1)) {
+               struct lu_site *site = obj->co_lu.lo_dev->ld_site;
+               struct lu_site_bkt_data *bkt;
+
+               bkt = lu_site_bkt_from_fid(site, &header->loh_fid);
+
+               init_waitqueue_entry_current(&waiter);
+               add_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+
+               while (1) {
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       if (atomic_read(&header->loh_ref) == 1)
+                               break;
+                       waitq_wait(&waiter, TASK_UNINTERRUPTIBLE);
+               }
+
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+       }
+
+       cl_object_put(env, obj);
+}
+
+void cl_inode_fini(struct inode *inode)
+{
+       struct lu_env      *env;
+       struct cl_inode_info    *lli  = cl_i2info(inode);
+       struct cl_object        *clob = lli->lli_clob;
+       int refcheck;
+       int emergency;
+
+       if (clob != NULL) {
+               void                *cookie;
+
+               cookie = cl_env_reenter();
+               env = cl_env_get(&refcheck);
+               emergency = IS_ERR(env);
+               if (emergency) {
+                       mutex_lock(&ccc_inode_fini_guard);
+                       LASSERT(ccc_inode_fini_env != NULL);
+                       cl_env_implant(ccc_inode_fini_env, &refcheck);
+                       env = ccc_inode_fini_env;
+               }
+               /*
+                * The cl_object cache is a slave to the inode cache (which,
+                * in turn, is a slave to the dentry cache); don't keep a
+                * cl_object in memory when its master is evicted.
+                */
+               cl_object_kill(env, clob);
+               lu_object_ref_del(&clob->co_lu, "inode", inode);
+               cl_object_put_last(env, clob);
+               lli->lli_clob = NULL;
+               if (emergency) {
+                       cl_env_unplant(ccc_inode_fini_env, &refcheck);
+                       mutex_unlock(&ccc_inode_fini_guard);
+               } else
+                       cl_env_put(env, &refcheck);
+               cl_env_reexit(cookie);
+       }
+}
+
+/**
+ * Return the IF_* type for a given lu_dirent entry.
+ * The IF_* flag should be converted to the particular OS file type in the
+ * platform llite module.
+ */
+__u16 ll_dirent_type_get(struct lu_dirent *ent)
+{
+       __u16 type = 0;
+       struct luda_type *lt;
+       int len = 0;
+
+       if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) {
+               const unsigned align = sizeof(struct luda_type) - 1;
+
+               len = le16_to_cpu(ent->lde_namelen);
+               len = (len + align) & ~align;
+               lt = (void *)ent->lde_name + len;
+               type = IFTODT(le16_to_cpu(lt->lt_type));
+       }
+       return type;
+}
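
The (len + align) & ~align expression above rounds the name length up to the size of struct luda_type, so that the lt_type field stored after the name is naturally aligned. A tiny standalone sketch of that power-of-two round-up (hypothetical helper and values):

#include <stdio.h>

static unsigned int round_up_pow2(unsigned int len, unsigned int size)
{
        unsigned int align = size - 1;  /* size must be a power of two */

        return (len + align) & ~align;
}

int main(void)
{
        printf("%u\n", round_up_pow2(5, 2));    /* prints 6 */
        printf("%u\n", round_up_pow2(5, 8));    /* prints 8 */
        return 0;
}
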
+
+/**
+ * Build the inode number from the passed @fid.
+ */
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32)
+{
+       if (BITS_PER_LONG == 32 || api32)
+               RETURN(fid_flatten32(fid));
+       else
+               RETURN(fid_flatten(fid));
+}
+
+/**
+ * Build the inode generation from the passed @fid.  If our FID overflows the
+ * 32-bit inode number then return a non-zero generation to distinguish them.
+ */
+__u32 cl_fid_build_gen(const struct lu_fid *fid)
+{
+       __u32 gen;
+       ENTRY;
+
+       if (fid_is_igif(fid)) {
+               gen = lu_igif_gen(fid);
+               RETURN(gen);
+       }
+
+       gen = (fid_flatten(fid) >> 32);
+       RETURN(gen);
+}
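
cl_fid_build_gen() simply reports the bits of the flattened FID that lie above bit 31, so a FID whose flattened value does not fit in 32 bits gets a non-zero generation to keep the (ino, gen) pair unique. A standalone sketch with made-up flatten values (fid_flatten() itself is defined elsewhere and is not shown in this hunk):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t small_flat = 0x00001234ULL;            /* fits in 32 bits */
        uint64_t large_flat = 0x0000000500001234ULL;    /* does not fit    */

        printf("gen=%u\n", (unsigned)(small_flat >> 32));   /* prints gen=0 */
        printf("gen=%u\n", (unsigned)(large_flat >> 32));   /* prints gen=5 */
        return 0;
}
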
+
+/* The lsm is unreliable after the HSM implementation, as the layout can be
+ * changed at any time. This is only to support old, non-clio-ized interfaces.
+ * It will cause a deadlock if clio operations are called with this extra
+ * layout refcount, because if the layout changes during the IO,
+ * ll_layout_refresh() will have to wait for the refcount to become zero to
+ * destroy the older layout.
+ *
+ * Note that the lsm returned by this function may not be valid unless it is
+ * called under the layout lock - MDS_INODELOCK_LAYOUT. */
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode)
+{
+       return lov_lsm_get(cl_i2info(inode)->lli_clob);
+}
+
+inline void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm)
+{
+       lov_lsm_put(cl_i2info(inode)->lli_clob, lsm);
+}
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_misc.c b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c
new file mode 100644 (file)
index 0000000..8ecbef9
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <cl_object.h>
+#include <lclient.h>
+
+#include <lustre_lite.h>
+
+
+/* Initialize the default and maximum LOV EA and cookie sizes.  This allows
+ * us to make MDS RPCs with large enough reply buffers to hold the
+ * maximum-sized (= maximum striped) EA and cookie without having to
+ * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp)
+{
+       struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 };
+       __u32 valsize = sizeof(struct lov_desc);
+       int rc, easize, def_easize, cookiesize;
+       struct lov_desc desc;
+       __u16 stripes;
+       ENTRY;
+
+       rc = obd_get_info(NULL, dt_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC,
+                         &valsize, &desc, NULL);
+       if (rc)
+               RETURN(rc);
+
+       stripes = min(desc.ld_tgt_count, (__u32)LOV_MAX_STRIPE_COUNT);
+       lsm.lsm_stripe_count = stripes;
+       easize = obd_size_diskmd(dt_exp, &lsm);
+
+       lsm.lsm_stripe_count = desc.ld_default_stripe_count;
+       def_easize = obd_size_diskmd(dt_exp, &lsm);
+
+       cookiesize = stripes * sizeof(struct llog_cookie);
+
+       CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
+              easize, cookiesize);
+
+       rc = md_init_ea_size(md_exp, easize, def_easize, cookiesize);
+       RETURN(rc);
+}
+
+/**
+ * This function is used as an upcall-callback, hooked by liblustre and llite
+ * clients into the obd_notify() listeners chain, to handle notifications
+ * about changes of the import connect_flags. See llu_fsswop_mount() and
+ * lustre_common_fill_super().
+ */
+int cl_ocd_update(struct obd_device *host,
+                 struct obd_device *watched,
+                 enum obd_notify_event ev, void *owner, void *data)
+{
+       struct lustre_client_ocd *lco;
+       struct client_obd       *cli;
+       __u64 flags;
+       int   result;
+
+       ENTRY;
+       if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+               cli = &watched->u.cli;
+               lco = owner;
+               flags = cli->cl_import->imp_connect_data.ocd_connect_flags;
+               CDEBUG(D_SUPER, "Changing connect_flags: "LPX64" -> "LPX64"\n",
+                      lco->lco_flags, flags);
+               mutex_lock(&lco->lco_lock);
+               lco->lco_flags &= flags;
+               /* for each osc event update ea size */
+               if (lco->lco_dt_exp)
+                       cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp);
+
+               mutex_unlock(&lco->lco_lock);
+               result = 0;
+       } else {
+               CERROR("unexpected notification from %s %s!\n",
+                      watched->obd_type->typ_name,
+                      watched->obd_name);
+               result = -EINVAL;
+       }
+       RETURN(result);
+}
+
+#define GROUPLOCK_SCOPE "grouplock"
+
+int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+                    struct ccc_grouplock *cg)
+{
+       struct lu_env     *env;
+       struct cl_io       *io;
+       struct cl_lock   *lock;
+       struct cl_lock_descr   *descr;
+       __u32              enqflags;
+       int                  refcheck;
+       int                  rc;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return PTR_ERR(env);
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = obj;
+       io->ci_ignore_layout = 1;
+
+       rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+       if (rc) {
+               LASSERT(rc < 0);
+               cl_env_put(env, &refcheck);
+               return rc;
+       }
+
+       descr = &ccc_env_info(env)->cti_descr;
+       descr->cld_obj = obj;
+       descr->cld_start = 0;
+       descr->cld_end = CL_PAGE_EOF;
+       descr->cld_gid = gid;
+       descr->cld_mode = CLM_GROUP;
+
+       enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0);
+       descr->cld_enq_flags = enqflags;
+
+       lock = cl_lock_request(env, io, descr, GROUPLOCK_SCOPE, current);
+       if (IS_ERR(lock)) {
+               cl_io_fini(env, io);
+               cl_env_put(env, &refcheck);
+               return PTR_ERR(lock);
+       }
+
+       cg->cg_env  = cl_env_get(&refcheck);
+       cg->cg_io   = io;
+       cg->cg_lock = lock;
+       cg->cg_gid  = gid;
+       LASSERT(cg->cg_env == env);
+
+       cl_env_unplant(env, &refcheck);
+       return 0;
+}
+
+void cl_put_grouplock(struct ccc_grouplock *cg)
+{
+       struct lu_env  *env  = cg->cg_env;
+       struct cl_io   *io   = cg->cg_io;
+       struct cl_lock *lock = cg->cg_lock;
+       int          refcheck;
+
+       LASSERT(cg->cg_env);
+       LASSERT(cg->cg_gid);
+
+       cl_env_implant(env, &refcheck);
+       cl_env_put(env, &refcheck);
+
+       cl_unuse(env, lock);
+       cl_lock_release(env, lock, GROUPLOCK_SCOPE, current);
+       cl_io_fini(env, io);
+       cl_env_put(env, NULL);
+}
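
A hypothetical caller sketch (not part of this patch) showing the intended pairing of the two helpers above around group-locked I/O; the gid must be non-zero, and a zero nonblock argument means the enqueue may block:

static int example_with_grouplock(struct cl_object *obj, unsigned long gid)
{
        struct ccc_grouplock cg;
        int rc;

        rc = cl_get_grouplock(obj, gid, 0 /* blocking enqueue */, &cg);
        if (rc != 0)
                return rc;

        /* ... perform I/O covered by the group lock ... */

        cl_put_grouplock(&cg);
        return 0;
}
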
diff --git a/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/drivers/staging/lustre/lustre/ldlm/interval_tree.c
new file mode 100644 (file)
index 0000000..ce90c7e
--- /dev/null
@@ -0,0 +1,764 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/interval_tree.c
+ *
+ * Interval tree library used by ldlm extent lock code
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <interval_tree.h>
+
+enum {
+       INTERVAL_RED = 0,
+       INTERVAL_BLACK = 1
+};
+
+static inline int node_is_left_child(struct interval_node *node)
+{
+       LASSERT(node->in_parent != NULL);
+       return node == node->in_parent->in_left;
+}
+
+static inline int node_is_right_child(struct interval_node *node)
+{
+       LASSERT(node->in_parent != NULL);
+       return node == node->in_parent->in_right;
+}
+
+static inline int node_is_red(struct interval_node *node)
+{
+       return node->in_color == INTERVAL_RED;
+}
+
+static inline int node_is_black(struct interval_node *node)
+{
+       return node->in_color == INTERVAL_BLACK;
+}
+
+static inline int extent_compare(struct interval_node_extent *e1,
+                                struct interval_node_extent *e2)
+{
+       int rc;
+       if (e1->start == e2->start) {
+               if (e1->end < e2->end)
+                       rc = -1;
+               else if (e1->end > e2->end)
+                       rc = 1;
+               else
+                       rc = 0;
+       } else {
+               if (e1->start < e2->start)
+                       rc = -1;
+               else
+                       rc = 1;
+       }
+       return rc;
+}
+
+static inline int extent_equal(struct interval_node_extent *e1,
+                              struct interval_node_extent *e2)
+{
+       return (e1->start == e2->start) && (e1->end == e2->end);
+}
+
+static inline int extent_overlapped(struct interval_node_extent *e1,
+                                   struct interval_node_extent *e2)
+{
+       return (e1->start <= e2->end) && (e2->start <= e1->end);
+}
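
extent_overlapped() above is the standard closed-interval test, so extents that merely touch at an endpoint count as overlapping. A standalone sketch with hypothetical values:

#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t start, end; };    /* both ends inclusive */

static int ext_overlapped(const struct ext *a, const struct ext *b)
{
        return a->start <= b->end && b->start <= a->end;
}

int main(void)
{
        struct ext a = { 0, 5 }, b = { 5, 9 }, c = { 6, 9 };

        printf("%d %d\n", ext_overlapped(&a, &b),   /* 1: share offset 5 */
               ext_overlapped(&a, &c));             /* 0: disjoint       */
        return 0;
}
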
+
+static inline int node_compare(struct interval_node *n1,
+                              struct interval_node *n2)
+{
+       return extent_compare(&n1->in_extent, &n2->in_extent);
+}
+
+static inline int node_equal(struct interval_node *n1,
+                            struct interval_node *n2)
+{
+       return extent_equal(&n1->in_extent, &n2->in_extent);
+}
+
+static inline __u64 max_u64(__u64 x, __u64 y)
+{
+       return x > y ? x : y;
+}
+
+static inline __u64 min_u64(__u64 x, __u64 y)
+{
+       return x < y ? x : y;
+}
+
+#define interval_for_each(node, root)                  \
+       for (node = interval_first(root); node != NULL; \
+            node = interval_next(node))
+
+#define interval_for_each_reverse(node, root)          \
+       for (node = interval_last(root); node != NULL;  \
+            node = interval_prev(node))
+
+static struct interval_node *interval_first(struct interval_node *node)
+{
+       ENTRY;
+
+       if (!node)
+               RETURN(NULL);
+       while (node->in_left)
+               node = node->in_left;
+       RETURN(node);
+}
+
+static struct interval_node *interval_last(struct interval_node *node)
+{
+       ENTRY;
+
+       if (!node)
+               RETURN(NULL);
+       while (node->in_right)
+               node = node->in_right;
+       RETURN(node);
+}
+
+static struct interval_node *interval_next(struct interval_node *node)
+{
+       ENTRY;
+
+       if (!node)
+               RETURN(NULL);
+       if (node->in_right)
+               RETURN(interval_first(node->in_right));
+       while (node->in_parent && node_is_right_child(node))
+               node = node->in_parent;
+       RETURN(node->in_parent);
+}
+
+static struct interval_node *interval_prev(struct interval_node *node)
+{
+       ENTRY;
+
+       if (!node)
+               RETURN(NULL);
+
+       if (node->in_left)
+               RETURN(interval_last(node->in_left));
+
+       while (node->in_parent && node_is_left_child(node))
+               node = node->in_parent;
+
+       RETURN(node->in_parent);
+}
+
+enum interval_iter interval_iterate(struct interval_node *root,
+                                   interval_callback_t func,
+                                   void *data)
+{
+       struct interval_node *node;
+       enum interval_iter rc = INTERVAL_ITER_CONT;
+       ENTRY;
+
+       interval_for_each(node, root) {
+               rc = func(node, data);
+               if (rc == INTERVAL_ITER_STOP)
+                       break;
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate);
+
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+                                           interval_callback_t func,
+                                           void *data)
+{
+       struct interval_node *node;
+       enum interval_iter rc = INTERVAL_ITER_CONT;
+       ENTRY;
+
+       interval_for_each_reverse(node, root) {
+               rc = func(node, data);
+               if (rc == INTERVAL_ITER_STOP)
+                       break;
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate_reverse);
+
+/* Try to find a node with the same interval in the tree; if found, return
+ * the pointer to the node, otherwise return NULL. */
+struct interval_node *interval_find(struct interval_node *root,
+                                   struct interval_node_extent *ex)
+{
+       struct interval_node *walk = root;
+       int rc;
+       ENTRY;
+
+       while (walk) {
+               rc = extent_compare(ex, &walk->in_extent);
+               if (rc == 0)
+                       break;
+               else if (rc < 0)
+                       walk = walk->in_left;
+               else
+                       walk = walk->in_right;
+       }
+
+       RETURN(walk);
+}
+EXPORT_SYMBOL(interval_find);
+
+static void __rotate_change_maxhigh(struct interval_node *node,
+                                   struct interval_node *rotate)
+{
+       __u64 left_max, right_max;
+
+       rotate->in_max_high = node->in_max_high;
+       left_max = node->in_left ? node->in_left->in_max_high : 0;
+       right_max = node->in_right ? node->in_right->in_max_high : 0;
+       node->in_max_high = max_u64(interval_high(node),
+                                   max_u64(left_max, right_max));
+}
+
+/* The left rotation "pivots" around the link from node to node->right, and
+ * - node will be linked to node->right's left child, and
+ * - node->right's left child will be linked to node's right child.  */
+static void __rotate_left(struct interval_node *node,
+                         struct interval_node **root)
+{
+       struct interval_node *right = node->in_right;
+       struct interval_node *parent = node->in_parent;
+
+       node->in_right = right->in_left;
+       if (node->in_right)
+               right->in_left->in_parent = node;
+
+       right->in_left = node;
+       right->in_parent = parent;
+       if (parent) {
+               if (node_is_left_child(node))
+                       parent->in_left = right;
+               else
+                       parent->in_right = right;
+       } else {
+               *root = right;
+       }
+       node->in_parent = right;
+
+       /* update max_high for node and right */
+       __rotate_change_maxhigh(node, right);
+}
+
+/* The right rotation "pivots" around the link from node to node->left, and
+ * - node will be linked to node->left's right child, and
+ * - node->left's right child will be linked to node's left child.  */
+static void __rotate_right(struct interval_node *node,
+                          struct interval_node **root)
+{
+       struct interval_node *left = node->in_left;
+       struct interval_node *parent = node->in_parent;
+
+       node->in_left = left->in_right;
+       if (node->in_left)
+               left->in_right->in_parent = node;
+       left->in_right = node;
+
+       left->in_parent = parent;
+       if (parent) {
+               if (node_is_right_child(node))
+                       parent->in_right = left;
+               else
+                       parent->in_left = left;
+       } else {
+               *root = left;
+       }
+       node->in_parent = left;
+
+       /* update max_high for node and left */
+       __rotate_change_maxhigh(node, left);
+}
+
+#define interval_swap(a, b) do {                       \
+       struct interval_node *c = a; a = b; b = c;      \
+} while (0)
+
+/*
+ * Operations INSERT and DELETE, when run on a tree with n keys,
+ * take O(log n) time. Because they modify the tree, the result
+ * may violate the red-black properties. To restore these properties,
+ * we must change the colors of some of the nodes in the tree
+ * and also change the pointer structure.
+ */
+static void interval_insert_color(struct interval_node *node,
+                                 struct interval_node **root)
+{
+       struct interval_node *parent, *gparent;
+       ENTRY;
+
+       while ((parent = node->in_parent) && node_is_red(parent)) {
+               gparent = parent->in_parent;
+               /* Parent is RED, so gparent must not be NULL */
+               if (node_is_left_child(parent)) {
+                       struct interval_node *uncle;
+                       uncle = gparent->in_right;
+                       if (uncle && node_is_red(uncle)) {
+                               uncle->in_color = INTERVAL_BLACK;
+                               parent->in_color = INTERVAL_BLACK;
+                               gparent->in_color = INTERVAL_RED;
+                               node = gparent;
+                               continue;
+                       }
+
+                       if (parent->in_right == node) {
+                               __rotate_left(parent, root);
+                               interval_swap(node, parent);
+                       }
+
+                       parent->in_color = INTERVAL_BLACK;
+                       gparent->in_color = INTERVAL_RED;
+                       __rotate_right(gparent, root);
+               } else {
+                       struct interval_node *uncle;
+                       uncle = gparent->in_left;
+                       if (uncle && node_is_red(uncle)) {
+                               uncle->in_color = INTERVAL_BLACK;
+                               parent->in_color = INTERVAL_BLACK;
+                               gparent->in_color = INTERVAL_RED;
+                               node = gparent;
+                               continue;
+                       }
+
+                       if (node_is_left_child(node)) {
+                               __rotate_right(parent, root);
+                               interval_swap(node, parent);
+                       }
+
+                       parent->in_color = INTERVAL_BLACK;
+                       gparent->in_color = INTERVAL_RED;
+                       __rotate_left(gparent, root);
+               }
+       }
+
+       (*root)->in_color = INTERVAL_BLACK;
+       EXIT;
+}
+
+struct interval_node *interval_insert(struct interval_node *node,
+                                     struct interval_node **root)
+
+{
+       struct interval_node **p, *parent = NULL;
+       ENTRY;
+
+       LASSERT(!interval_is_intree(node));
+       p = root;
+       while (*p) {
+               parent = *p;
+               if (node_equal(parent, node))
+                       RETURN(parent);
+
+               /* max_high field must be updated after each iteration */
+               if (parent->in_max_high < interval_high(node))
+                       parent->in_max_high = interval_high(node);
+
+               if (node_compare(node, parent) < 0)
+                       p = &parent->in_left;
+               else
+                       p = &parent->in_right;
+       }
+
+       /* link node into the tree */
+       node->in_parent = parent;
+       node->in_color = INTERVAL_RED;
+       node->in_left = node->in_right = NULL;
+       *p = node;
+
+       interval_insert_color(node, root);
+       node->in_intree = 1;
+
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(interval_insert);
+
+static inline int node_is_black_or_0(struct interval_node *node)
+{
+       return !node || node_is_black(node);
+}
+
+static void interval_erase_color(struct interval_node *node,
+                                struct interval_node *parent,
+                                struct interval_node **root)
+{
+       struct interval_node *tmp;
+       ENTRY;
+
+       while (node_is_black_or_0(node) && node != *root) {
+               if (parent->in_left == node) {
+                       tmp = parent->in_right;
+                       if (node_is_red(tmp)) {
+                               tmp->in_color = INTERVAL_BLACK;
+                               parent->in_color = INTERVAL_RED;
+                               __rotate_left(parent, root);
+                               tmp = parent->in_right;
+                       }
+                       if (node_is_black_or_0(tmp->in_left) &&
+                           node_is_black_or_0(tmp->in_right)) {
+                               tmp->in_color = INTERVAL_RED;
+                               node = parent;
+                               parent = node->in_parent;
+                       } else {
+                               if (node_is_black_or_0(tmp->in_right)) {
+                                       struct interval_node *o_left;
+                                       if ((o_left = tmp->in_left))
+                                            o_left->in_color = INTERVAL_BLACK;
+                                       tmp->in_color = INTERVAL_RED;
+                                       __rotate_right(tmp, root);
+                                       tmp = parent->in_right;
+                               }
+                               tmp->in_color = parent->in_color;
+                               parent->in_color = INTERVAL_BLACK;
+                               if (tmp->in_right)
+                                   tmp->in_right->in_color = INTERVAL_BLACK;
+                               __rotate_left(parent, root);
+                               node = *root;
+                               break;
+                       }
+               } else {
+                       tmp = parent->in_left;
+                       if (node_is_red(tmp)) {
+                               tmp->in_color = INTERVAL_BLACK;
+                               parent->in_color = INTERVAL_RED;
+                               __rotate_right(parent, root);
+                               tmp = parent->in_left;
+                       }
+                       if (node_is_black_or_0(tmp->in_left) &&
+                           node_is_black_or_0(tmp->in_right)) {
+                               tmp->in_color = INTERVAL_RED;
+                               node = parent;
+                               parent = node->in_parent;
+                       } else {
+                               if (node_is_black_or_0(tmp->in_left)) {
+                                       struct interval_node *o_right;
+                                       if ((o_right = tmp->in_right))
+                                           o_right->in_color = INTERVAL_BLACK;
+                                       tmp->in_color = INTERVAL_RED;
+                                       __rotate_left(tmp, root);
+                                       tmp = parent->in_left;
+                               }
+                               tmp->in_color = parent->in_color;
+                               parent->in_color = INTERVAL_BLACK;
+                               if (tmp->in_left)
+                                       tmp->in_left->in_color = INTERVAL_BLACK;
+                               __rotate_right(parent, root);
+                               node = *root;
+                               break;
+                       }
+               }
+       }
+       if (node)
+               node->in_color = INTERVAL_BLACK;
+       EXIT;
+}
+
+/*
+ * If the @max_high value of @node is changed, this function traverses a path
+ * from node up to the root to update max_high for the whole tree.
+ */
+static void update_maxhigh(struct interval_node *node,
+                          __u64  old_maxhigh)
+{
+       __u64 left_max, right_max;
+       ENTRY;
+
+       while (node) {
+               left_max = node->in_left ? node->in_left->in_max_high : 0;
+               right_max = node->in_right ? node->in_right->in_max_high : 0;
+               node->in_max_high = max_u64(interval_high(node),
+                                           max_u64(left_max, right_max));
+
+               if (node->in_max_high >= old_maxhigh)
+                       break;
+               node = node->in_parent;
+       }
+       EXIT;
+}
+
+void interval_erase(struct interval_node *node,
+                   struct interval_node **root)
+{
+       struct interval_node *child, *parent;
+       int color;
+       ENTRY;
+
+       LASSERT(interval_is_intree(node));
+       node->in_intree = 0;
+       if (!node->in_left) {
+               child = node->in_right;
+       } else if (!node->in_right) {
+               child = node->in_left;
+       } else { /* Both left and right child are not NULL */
+               struct interval_node *old = node;
+
+               node = interval_next(node);
+               child = node->in_right;
+               parent = node->in_parent;
+               color = node->in_color;
+
+               if (child)
+                       child->in_parent = parent;
+               if (parent == old)
+                       parent->in_right = child;
+               else
+                       parent->in_left = child;
+
+               node->in_color = old->in_color;
+               node->in_right = old->in_right;
+               node->in_left = old->in_left;
+               node->in_parent = old->in_parent;
+
+               if (old->in_parent) {
+                       if (node_is_left_child(old))
+                               old->in_parent->in_left = node;
+                       else
+                               old->in_parent->in_right = node;
+               } else {
+                       *root = node;
+               }
+
+               old->in_left->in_parent = node;
+               if (old->in_right)
+                       old->in_right->in_parent = node;
+               update_maxhigh(child ? : parent, node->in_max_high);
+               update_maxhigh(node, old->in_max_high);
+               if (parent == old)
+                        parent = node;
+               goto color;
+       }
+       parent = node->in_parent;
+       color = node->in_color;
+
+       if (child)
+               child->in_parent = parent;
+       if (parent) {
+               if (node_is_left_child(node))
+                       parent->in_left = child;
+               else
+                       parent->in_right = child;
+       } else {
+               *root = child;
+       }
+
+       update_maxhigh(child ? : parent, node->in_max_high);
+
+color:
+       if (color == INTERVAL_BLACK)
+               interval_erase_color(child, parent, root);
+       EXIT;
+}
+EXPORT_SYMBOL(interval_erase);
+
+static inline int interval_may_overlap(struct interval_node *node,
+                                         struct interval_node_extent *ext)
+{
+       return (ext->start <= node->in_max_high &&
+               ext->end >= interval_low(node));
+}
+
+/*
+ * This function finds all intervals that overlap the interval ext,
+ * and calls func to handle the resulting intervals one by one.
+ * In Lustre, this function will find all conflicting locks in
+ * the granted queue and add these locks to the ast work list.
+ *
+ * The equivalent recursive version is:
+ *
+ * {
+ *       if (node == NULL)
+ *            return 0;
+ *       if (ext->end < interval_low(node)) {
+ *            interval_search(node->in_left, ext, func, data);
+ *       } else if (interval_may_overlap(node, ext)) {
+ *            if (extent_overlapped(ext, &node->in_extent))
+ *                    func(node, data);
+ *            interval_search(node->in_left, ext, func, data);
+ *            interval_search(node->in_right, ext, func, data);
+ *       }
+ *       return 0;
+ * }
+ *
+ */
+enum interval_iter interval_search(struct interval_node *node,
+                                  struct interval_node_extent *ext,
+                                  interval_callback_t func,
+                                  void *data)
+{
+       struct interval_node *parent;
+       enum interval_iter rc = INTERVAL_ITER_CONT;
+
+       LASSERT(ext != NULL);
+       LASSERT(func != NULL);
+
+       while (node) {
+               if (ext->end < interval_low(node)) {
+                       if (node->in_left) {
+                               node = node->in_left;
+                               continue;
+                       }
+               } else if (interval_may_overlap(node, ext)) {
+                       if (extent_overlapped(ext, &node->in_extent)) {
+                               rc = func(node, data);
+                               if (rc == INTERVAL_ITER_STOP)
+                                       break;
+                       }
+
+                       if (node->in_left) {
+                               node = node->in_left;
+                               continue;
+                       }
+                       if (node->in_right) {
+                               node = node->in_right;
+                               continue;
+                       }
+               }
+
+               parent = node->in_parent;
+               while (parent) {
+                       if (node_is_left_child(node) &&
+                           parent->in_right) {
+                               /* If we ever went left, it means that the
+                                * parent satisfied ext->end < interval_low(parent)
+                                * or may_overlap(parent). If the former is true,
+                                * we needn't go back up, so stop early and check
+                                * may_overlap(parent) after this loop. */
+                               node = parent->in_right;
+                               break;
+                       }
+                       node = parent;
+                       parent = parent->in_parent;
+               }
+               if (parent == NULL || !interval_may_overlap(parent, ext))
+                       break;
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(interval_search);
+
+static enum interval_iter interval_overlap_cb(struct interval_node *n,
+                                             void *args)
+{
+       *(int *)args = 1;
+       return INTERVAL_ITER_STOP;
+}
+
+int interval_is_overlapped(struct interval_node *root,
+                          struct interval_node_extent *ext)
+{
+       int has = 0;
+       (void)interval_search(root, ext, interval_overlap_cb, &has);
+       return has;
+}
+EXPORT_SYMBOL(interval_is_overlapped);
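
interval_is_overlapped() shows the general pattern: any query over the tree can be phrased as a small callback plus a data pointer handed to interval_search(). As a hypothetical extension (not part of this patch), counting overlaps needs only a callback that keeps iterating instead of stopping at the first hit:

static enum interval_iter interval_count_cb(struct interval_node *node,
                                            void *args)
{
        (*(int *)args)++;
        return INTERVAL_ITER_CONT;      /* keep visiting further overlaps */
}

static int interval_count_overlapped(struct interval_node *root,
                                     struct interval_node_extent *ext)
{
        int count = 0;

        (void)interval_search(root, ext, interval_count_cb, &count);
        return count;
}
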
+
+/* Don't expand the low end. Expanding downwards is expensive, and meaningless
+ * for some extents, because programs seldom do IO backward.
+ *
+ * The recursive algorithm of expanding low:
+ * expand_low {
+ *     struct interval_node *tmp;
+ *     static __u64 res = 0;
+ *
+ *     if (root == NULL)
+ *             return res;
+ *     if (root->in_max_high < low) {
+ *             res = max_u64(root->in_max_high + 1, res);
+ *             return res;
+ *     } else if (low < interval_low(root)) {
+ *             interval_expand_low(root->in_left, low);
+ *             return res;
+ *     }
+ *
+ *     if (interval_high(root) < low)
+ *             res = max_u64(interval_high(root) + 1, res);
+ *     interval_expand_low(root->in_left, low);
+ *     interval_expand_low(root->in_right, low);
+ *
+ *     return res;
+ * }
+ *
+ * It's easy to eliminate the recursion; see interval_search for
+ * an example. -jay
+ */
+static inline __u64 interval_expand_low(struct interval_node *root, __u64 low)
+{
+       /* We only handle the empty-tree case right now. */
+       if (root == NULL)
+               return 0;
+       return low;
+}
+
+static inline __u64 interval_expand_high(struct interval_node *node, __u64 high)
+{
+       __u64 result = ~0;
+
+       while (node != NULL) {
+               if (node->in_max_high < high)
+                       break;
+
+               if (interval_low(node) > high) {
+                       result = interval_low(node) - 1;
+                       node = node->in_left;
+               } else {
+                       node = node->in_right;
+               }
+       }
+
+       return result;
+}
+
+/* expanding the extent based on @ext. */
+void interval_expand(struct interval_node *root,
+                    struct interval_node_extent *ext,
+                    struct interval_node_extent *limiter)
+{
+       /* The assertion on interval_is_overlapped() is expensive because we
+        * may traverse many nodes to find the overlapped one. */
+       LASSERT(interval_is_overlapped(root, ext) == 0);
+       if (!limiter || limiter->start < ext->start)
+               ext->start = interval_expand_low(root, ext->start);
+       if (!limiter || limiter->end > ext->end)
+               ext->end = interval_expand_high(root, ext->end);
+       LASSERT(interval_is_overlapped(root, ext) == 0);
+}
+EXPORT_SYMBOL(interval_expand);
diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c b/drivers/staging/lustre/lustre/ldlm/l_lock.c
new file mode 100644 (file)
index 0000000..853409a
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+#include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <lustre_lib.h>
+
+/**
+ * Lock a lock and its resource.
+ *
+ * LDLM locking uses the resource to serialize access to locks,
+ * but there is a case where we change the resource of a lock upon
+ * enqueue reply. We rely on lock->l_resource = new_res
+ * being an atomic operation.
+ */
+struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock)
+{
+       /* on server-side resource of lock doesn't change */
+       if (!lock->l_ns_srv)
+               spin_lock(&lock->l_lock);
+
+       lock_res(lock->l_resource);
+
+       lock->l_res_locked = 1;
+       return lock->l_resource;
+}
+EXPORT_SYMBOL(lock_res_and_lock);
+
+/**
+ * Unlock a lock and its resource previously locked with lock_res_and_lock
+ */
+void unlock_res_and_lock(struct ldlm_lock *lock)
+{
+       /* on server-side resource of lock doesn't change */
+       lock->l_res_locked = 0;
+
+       unlock_res(lock->l_resource);
+       if (!lock->l_ns_srv)
+               spin_unlock(&lock->l_lock);
+}
+EXPORT_SYMBOL(unlock_res_and_lock);
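
A hypothetical caller sketch (not part of this patch): code that needs a stable view of lock->l_resource brackets its work with the pair above, holding the resource lock in between:

static void example_inspect_resource(struct ldlm_lock *lock)
{
        struct ldlm_resource *res;

        res = lock_res_and_lock(lock);
        /* here lock->l_resource cannot change and *res is locked */
        (void)res;
        unlock_res_and_lock(lock);
}
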
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
new file mode 100644 (file)
index 0000000..f7432f7
--- /dev/null
@@ -0,0 +1,242 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_extent.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains the implementation of the EXTENT lock type.
+ *
+ * The EXTENT lock type is for locking a contiguous range of values, represented
+ * by 64-bit starting and ending offsets (inclusive). There are several extent
+ * lock modes, some of which may be mutually incompatible. Extent locks are
+ * considered incompatible if their modes are incompatible and their extents
+ * intersect.  See the lock mode compatibility matrix in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+/* When a lock is cancelled by a client, the KMS (known minimum size) may
+ * change if this was the "highest lock".  This function returns the new KMS
+ * value.  The caller must already hold lr_lock.
+ *
+ * NB: A lock on [x, y] protects a KMS of up to y + 1 bytes! */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
+{
+       struct ldlm_resource *res = lock->l_resource;
+       struct list_head *tmp;
+       struct ldlm_lock *lck;
+       __u64 kms = 0;
+       ENTRY;
+
+       /* don't let another thread in ldlm_extent_shift_kms race in
+        * just after we finish and take our lock into account in its
+        * calculation of the kms */
+       lock->l_flags |= LDLM_FL_KMS_IGNORE;
+
+       list_for_each(tmp, &res->lr_granted) {
+               lck = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (lck->l_flags & LDLM_FL_KMS_IGNORE)
+                       continue;
+
+               if (lck->l_policy_data.l_extent.end >= old_kms)
+                       RETURN(old_kms);
+
+               /* This extent _has_ to be smaller than old_kms (checked above)
+                * so kms can only ever be smaller or the same as old_kms. */
+               if (lck->l_policy_data.l_extent.end + 1 > kms)
+                       kms = lck->l_policy_data.l_extent.end + 1;
+       }
+       LASSERTF(kms <= old_kms, "kms "LPU64" old_kms "LPU64"\n", kms, old_kms);
+
+       RETURN(kms);
+}
+EXPORT_SYMBOL(ldlm_extent_shift_kms);
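A standalone sketch of the KMS rule above, following the comment's statement that a lock on [x, y] protects a KMS of up to y + 1 bytes; the extents and the old KMS value are invented:

    #include <stdint.h>
    #include <stdio.h>

    struct extent { uint64_t start, end; };

    /* Recompute the KMS over the extents that remain granted, mirroring the
     * loop in ldlm_extent_shift_kms() above. */
    static uint64_t shift_kms(const struct extent *granted, int n, uint64_t old_kms)
    {
            uint64_t kms = 0;
            int i;

            for (i = 0; i < n; i++) {
                    if (granted[i].end >= old_kms)
                            return old_kms;           /* another lock still covers it */
                    if (granted[i].end + 1 > kms)
                            kms = granted[i].end + 1; /* [x, y] protects y + 1 bytes */
            }
            return kms;
    }

    int main(void)
    {
            /* locks on [0,4095] and [4096,8191] remain; the lock that set the
             * old KMS of 16384 has just been cancelled */
            struct extent remaining[] = { { 0, 4095 }, { 4096, 8191 } };

            printf("new kms = %llu\n",
                   (unsigned long long)shift_kms(remaining, 2, 16384));
            return 0;       /* prints: new kms = 8192 */
    }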
+
+struct kmem_cache *ldlm_interval_slab;
+struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock)
+{
+       struct ldlm_interval *node;
+       ENTRY;
+
+       LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
+       OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+       if (node == NULL)
+               RETURN(NULL);
+
+       INIT_LIST_HEAD(&node->li_group);
+       ldlm_interval_attach(node, lock);
+       RETURN(node);
+}
+
+void ldlm_interval_free(struct ldlm_interval *node)
+{
+       if (node) {
+               LASSERT(list_empty(&node->li_group));
+               LASSERT(!interval_is_intree(&node->li_node));
+               OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+       }
+}
+
+/* interval tree, for LDLM_EXTENT. */
+void ldlm_interval_attach(struct ldlm_interval *n,
+                         struct ldlm_lock *l)
+{
+       LASSERT(l->l_tree_node == NULL);
+       LASSERT(l->l_resource->lr_type == LDLM_EXTENT);
+
+       list_add_tail(&l->l_sl_policy, &n->li_group);
+       l->l_tree_node = n;
+}
+
+struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l)
+{
+       struct ldlm_interval *n = l->l_tree_node;
+
+       if (n == NULL)
+               return NULL;
+
+       LASSERT(!list_empty(&n->li_group));
+       l->l_tree_node = NULL;
+       list_del_init(&l->l_sl_policy);
+
+       return (list_empty(&n->li_group) ? n : NULL);
+}
+
+static inline int lock_mode_to_index(ldlm_mode_t mode)
+{
+       int index;
+
+       LASSERT(mode != 0);
+       LASSERT(IS_PO2(mode));
+       for (index = -1; mode; index++, mode >>= 1) ;
+       LASSERT(index < LCK_MODE_NUM);
+       return index;
+}
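The loop above simply computes log2 of a power-of-two mode. As a hedged, standalone cross-check (the values are illustrative, and ffs() here is the C library routine, not a suggestion for the kernel code):

    #include <stdio.h>
    #include <strings.h>

    /* log2 of a power-of-two lock mode: same result as lock_mode_to_index() */
    static int mode_to_index(unsigned int mode)
    {
            return ffs((int)mode) - 1;
    }

    int main(void)
    {
            /* mode 1 -> index 0, mode 2 -> index 1, mode 32 -> index 5 */
            printf("%d %d %d\n", mode_to_index(1), mode_to_index(2),
                   mode_to_index(32));
            return 0;
    }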
+
+/** Add newly granted lock into interval tree for the resource. */
+void ldlm_extent_add_lock(struct ldlm_resource *res,
+                         struct ldlm_lock *lock)
+{
+       struct interval_node *found, **root;
+       struct ldlm_interval *node;
+       struct ldlm_extent *extent;
+       int idx;
+
+       LASSERT(lock->l_granted_mode == lock->l_req_mode);
+
+       node = lock->l_tree_node;
+       LASSERT(node != NULL);
+       LASSERT(!interval_is_intree(&node->li_node));
+
+       idx = lock_mode_to_index(lock->l_granted_mode);
+       LASSERT(lock->l_granted_mode == 1 << idx);
+       LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode);
+
+       /* node extent initialize */
+       extent = &lock->l_policy_data.l_extent;
+       interval_set(&node->li_node, extent->start, extent->end);
+
+       root = &res->lr_itree[idx].lit_root;
+       found = interval_insert(&node->li_node, root);
+       if (found) { /* The policy group found. */
+               struct ldlm_interval *tmp = ldlm_interval_detach(lock);
+               LASSERT(tmp != NULL);
+               ldlm_interval_free(tmp);
+               ldlm_interval_attach(to_ldlm_interval(found), lock);
+       }
+       res->lr_itree[idx].lit_size++;
+
+       /* even though we use an interval tree to manage the extent locks, we
+        * also add them to the granted list, for debugging purposes */
+       ldlm_resource_add_lock(res, &res->lr_granted, lock);
+}
+
+/** Remove cancelled lock from resource interval tree. */
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
+{
+       struct ldlm_resource *res = lock->l_resource;
+       struct ldlm_interval *node = lock->l_tree_node;
+       struct ldlm_interval_tree *tree;
+       int idx;
+
+       if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */
+               return;
+
+       idx = lock_mode_to_index(lock->l_granted_mode);
+       LASSERT(lock->l_granted_mode == 1 << idx);
+       tree = &res->lr_itree[idx];
+
+       LASSERT(tree->lit_root != NULL); /* ensure the tree is not empty */
+
+       tree->lit_size--;
+       node = ldlm_interval_detach(lock);
+       if (node) {
+               interval_erase(&node->li_node, &tree->lit_root);
+               ldlm_interval_free(node);
+       }
+}
+
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy)
+{
+       memset(lpolicy, 0, sizeof(*lpolicy));
+       lpolicy->l_extent.start = wpolicy->l_extent.start;
+       lpolicy->l_extent.end = wpolicy->l_extent.end;
+       lpolicy->l_extent.gid = wpolicy->l_extent.gid;
+}
+
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy)
+{
+       memset(wpolicy, 0, sizeof(*wpolicy));
+       wpolicy->l_extent.start = lpolicy->l_extent.start;
+       wpolicy->l_extent.end = lpolicy->l_extent.end;
+       wpolicy->l_extent.gid = lpolicy->l_extent.gid;
+}
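To illustrate the conflict rule from the header comment of this file (extent locks conflict only when their modes are incompatible and their extents intersect), a small standalone sketch with invented ranges:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct ext { uint64_t start, end; };   /* inclusive range, as above */

    static bool extents_intersect(struct ext a, struct ext b)
    {
            return a.start <= b.end && b.start <= a.end;
    }

    int main(void)
    {
            struct ext a = { 0, 4095 }, b = { 4096, 8191 }, c = { 1000, 5000 };

            /* two incompatible modes only conflict when the extents overlap */
            printf("%d %d\n", extents_intersect(a, b), extents_intersect(a, c));
            return 0;   /* prints: 0 1 */
    }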
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
new file mode 100644 (file)
index 0000000..f100a84
--- /dev/null
@@ -0,0 +1,849 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003 Hewlett-Packard Development Company LP.
+ * Developed under the sponsorship of the US Government under
+ * Subcontract No. B514193
+ *
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file implements the POSIX lock type for Lustre.
+ * Its policy properties are the start and end of the extent and the PID.
+ *
+ * These locks are only handled through the MDS because POSIX semantics
+ * require, for example, that a lock may be only partially released and
+ * therefore split into two parts, and that two adjacent locks from the
+ * same process may be merged into a single wider lock.
+ *
+ * Lock modes are mapped as follows:
+ * PR and PW for READ and WRITE locks
+ * NL to request the release of a portion of a lock
+ *
+ * These flock locks never time out.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <linux/list.h>
+
+#include "ldlm_internal.h"
+
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                           void *data, int flag);
+
+/**
+ * list_for_remaining_safe - iterate over the remaining entries in a list
+ *           and safeguard against removal of a list entry.
+ * \param pos   the &struct list_head to use as a loop counter. pos MUST
+ *           have been initialized prior to using it in this macro.
+ * \param n     another &struct list_head to use as temporary storage
+ * \param head  the head for your list.
+ */
+#define list_for_remaining_safe(pos, n, head) \
+       for (n = pos->next; pos != (head); pos = n, n = pos->next)
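A hedged usage sketch for the macro above, assuming <linux/list.h>; struct my_item and drop_negatives() are illustrative names only:

    struct my_item {
            struct list_head link;
            int val;
    };

    /* Continue a scan from a previously found position 'pos', staying safe
     * against deletion of the entry currently being visited. */
    static void drop_negatives(struct list_head *pos, struct list_head *head)
    {
            struct list_head *n;

            list_for_remaining_safe(pos, n, head) {
                    struct my_item *it = list_entry(pos, struct my_item, link);

                    if (it->val < 0)
                            list_del_init(pos);  /* safe: 'n' already points past it */
            }
    }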
+
+static inline int
+ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+       return((new->l_policy_data.l_flock.owner ==
+               lock->l_policy_data.l_flock.owner) &&
+              (new->l_export == lock->l_export));
+}
+
+static inline int
+ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+       return((new->l_policy_data.l_flock.start <=
+               lock->l_policy_data.l_flock.end) &&
+              (new->l_policy_data.l_flock.end >=
+               lock->l_policy_data.l_flock.start));
+}
+
+static inline int ldlm_flock_blocking_link(struct ldlm_lock *req,
+                                          struct ldlm_lock *lock)
+{
+       int rc = 0;
+
+       /* For server only */
+       if (req->l_export == NULL)
+               return 0;
+
+       if (unlikely(req->l_export->exp_flock_hash == NULL)) {
+               rc = ldlm_init_flock_export(req->l_export);
+               if (rc)
+                       goto error;
+       }
+
+       LASSERT(hlist_unhashed(&req->l_exp_flock_hash));
+
+       req->l_policy_data.l_flock.blocking_owner =
+               lock->l_policy_data.l_flock.owner;
+       req->l_policy_data.l_flock.blocking_export =
+               lock->l_export;
+       req->l_policy_data.l_flock.blocking_refs = 0;
+
+       cfs_hash_add(req->l_export->exp_flock_hash,
+                    &req->l_policy_data.l_flock.owner,
+                    &req->l_exp_flock_hash);
+error:
+       return rc;
+}
+
+static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req)
+{
+       /* For server only */
+       if (req->l_export == NULL)
+               return;
+
+       check_res_locked(req->l_resource);
+       if (req->l_export->exp_flock_hash != NULL &&
+           !hlist_unhashed(&req->l_exp_flock_hash))
+               cfs_hash_del(req->l_export->exp_flock_hash,
+                            &req->l_policy_data.l_flock.owner,
+                            &req->l_exp_flock_hash);
+}
+
+static inline void
+ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, __u64 flags)
+{
+       ENTRY;
+
+       LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%llx)",
+                  mode, flags);
+
+       /* Safe to not lock here, since it should be empty anyway */
+       LASSERT(hlist_unhashed(&lock->l_exp_flock_hash));
+
+       list_del_init(&lock->l_res_link);
+       if (flags == LDLM_FL_WAIT_NOREPROC &&
+           !(lock->l_flags & LDLM_FL_FAILED)) {
+               /* client side - set a flag to prevent sending a CANCEL */
+               lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
+
+               /* when we get here we are under lock_res_and_lock(), so we
+                * must call the nolock version of ldlm_lock_decref_internal() */
+               ldlm_lock_decref_internal_nolock(lock, mode);
+       }
+
+       ldlm_lock_destroy_nolock(lock);
+       EXIT;
+}
+
+/**
+ * POSIX locks deadlock detection code.
+ *
+ * Given a new lock \a req and an existing lock \a bl_lock it conflicts
+ * with, we need to iterate through all blocked POSIX locks for this
+ * export and see if a deadlock condition arises (i.e. one client holds
+ * a lock on something and wants a lock on something else, while at the
+ * same time another client is in the opposite situation).
+ */
+static int
+ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock)
+{
+       struct obd_export *req_exp = req->l_export;
+       struct obd_export *bl_exp = bl_lock->l_export;
+       __u64 req_owner = req->l_policy_data.l_flock.owner;
+       __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner;
+
+       /* For server only */
+       if (req_exp == NULL)
+               return 0;
+
+       class_export_get(bl_exp);
+       while (1) {
+               struct obd_export *bl_exp_new;
+               struct ldlm_lock *lock = NULL;
+               struct ldlm_flock *flock;
+
+               if (bl_exp->exp_flock_hash != NULL)
+                       lock = cfs_hash_lookup(bl_exp->exp_flock_hash,
+                                              &bl_owner);
+               if (lock == NULL)
+                       break;
+
+               flock = &lock->l_policy_data.l_flock;
+               LASSERT(flock->owner == bl_owner);
+               bl_owner = flock->blocking_owner;
+               bl_exp_new = class_export_get(flock->blocking_export);
+               class_export_put(bl_exp);
+
+               cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash);
+               bl_exp = bl_exp_new;
+
+               if (bl_owner == req_owner && bl_exp == req_exp) {
+                       class_export_put(bl_exp);
+                       return 1;
+               }
+       }
+       class_export_put(bl_exp);
+
+       return 0;
+}
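The walk above follows the blocking_owner/blocking_export chain until it either ends or leads back to the requester. A simplified standalone sketch of the same idea (it ignores the export part of the identity check; struct owner and its fields are invented):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Each owner may be blocked by at most one other owner; a deadlock exists
     * when the chain starting at the blocker reaches the requester again. */
    struct owner {
            int id;
            struct owner *blocked_by;   /* NULL when not waiting */
    };

    static bool deadlock(const struct owner *req, const struct owner *blocker)
    {
            const struct owner *o;

            for (o = blocker; o != NULL; o = o->blocked_by)
                    if (o == req)
                            return true;
            return false;
    }

    int main(void)
    {
            struct owner a = { 1, NULL }, b = { 2, NULL };

            b.blocked_by = &a;                  /* B is already waiting for A ...   */
            printf("%d\n", deadlock(&a, &b));   /* ... so A waiting for B is a
                                                 * deadlock: prints 1 */
            return 0;
    }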
+
+/**
+ * Process a granting attempt for a flock lock.
+ * Must be called with the ns lock held.
+ *
+ * This function looks for any conflicts for \a lock in the granted or
+ * waiting queues. The lock is granted if no conflicts are found in
+ * either queue.
+ *
+ * It is also responsible for splitting a lock if a portion of the lock
+ * is released.
+ *
+ * If \a first_enq is 0 (i.e. called from ldlm_reprocess_queue):
+ *   - blocking ASTs have already been sent
+ *
+ * If \a first_enq is 1 (i.e. called from ldlm_lock_enqueue):
+ *   - blocking ASTs have not been sent yet, so the list of conflicting
+ *     locks is collected and ASTs are sent.
+ */
+int
+ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int first_enq,
+                       ldlm_error_t *err, struct list_head *work_list)
+{
+       struct ldlm_resource *res = req->l_resource;
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+       struct list_head *tmp;
+       struct list_head *ownlocks = NULL;
+       struct ldlm_lock *lock = NULL;
+       struct ldlm_lock *new = req;
+       struct ldlm_lock *new2 = NULL;
+       ldlm_mode_t mode = req->l_req_mode;
+       int local = ns_is_client(ns);
+       int added = (mode == LCK_NL);
+       int overlaps = 0;
+       int splitted = 0;
+       const struct ldlm_callback_suite null_cbs = { NULL };
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_DLMTRACE, "flags %#llx owner "LPU64" pid %u mode %u start "
+              LPU64" end "LPU64"\n", *flags,
+              new->l_policy_data.l_flock.owner,
+              new->l_policy_data.l_flock.pid, mode,
+              req->l_policy_data.l_flock.start,
+              req->l_policy_data.l_flock.end);
+
+       *err = ELDLM_OK;
+
+       if (local) {
+               /* No blocking ASTs are sent to the clients for
+                * POSIX file & record locks */
+               req->l_blocking_ast = NULL;
+       } else {
+               /* Called on the server for lock cancels. */
+               req->l_blocking_ast = ldlm_flock_blocking_ast;
+       }
+
+reprocess:
+       if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) {
+               /* This loop determines where this process's locks start
+                * in the resource's lr_granted list. */
+               list_for_each(tmp, &res->lr_granted) {
+                       lock = list_entry(tmp, struct ldlm_lock,
+                                             l_res_link);
+                       if (ldlm_same_flock_owner(lock, req)) {
+                               ownlocks = tmp;
+                               break;
+                       }
+               }
+       } else {
+               lockmode_verify(mode);
+
+               /* This loop determines if there are existing locks
+                * that conflict with the new lock request. */
+               list_for_each(tmp, &res->lr_granted) {
+                       lock = list_entry(tmp, struct ldlm_lock,
+                                             l_res_link);
+
+                       if (ldlm_same_flock_owner(lock, req)) {
+                               if (!ownlocks)
+                                       ownlocks = tmp;
+                               continue;
+                       }
+
+                       /* locks are compatible, overlap doesn't matter */
+                       if (lockmode_compat(lock->l_granted_mode, mode))
+                               continue;
+
+                       if (!ldlm_flocks_overlap(lock, req))
+                               continue;
+
+                       if (!first_enq)
+                               RETURN(LDLM_ITER_CONTINUE);
+
+                       if (*flags & LDLM_FL_BLOCK_NOWAIT) {
+                               ldlm_flock_destroy(req, mode, *flags);
+                               *err = -EAGAIN;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+
+                       if (*flags & LDLM_FL_TEST_LOCK) {
+                               ldlm_flock_destroy(req, mode, *flags);
+                               req->l_req_mode = lock->l_granted_mode;
+                               req->l_policy_data.l_flock.pid =
+                                       lock->l_policy_data.l_flock.pid;
+                               req->l_policy_data.l_flock.start =
+                                       lock->l_policy_data.l_flock.start;
+                               req->l_policy_data.l_flock.end =
+                                       lock->l_policy_data.l_flock.end;
+                               *flags |= LDLM_FL_LOCK_CHANGED;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+
+                       if (ldlm_flock_deadlock(req, lock)) {
+                               ldlm_flock_destroy(req, mode, *flags);
+                               *err = -EDEADLK;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+
+                       rc = ldlm_flock_blocking_link(req, lock);
+                       if (rc) {
+                               ldlm_flock_destroy(req, mode, *flags);
+                               *err = rc;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+                       ldlm_resource_add_lock(res, &res->lr_waiting, req);
+                       *flags |= LDLM_FL_BLOCK_GRANTED;
+                       RETURN(LDLM_ITER_STOP);
+               }
+       }
+
+       if (*flags & LDLM_FL_TEST_LOCK) {
+               ldlm_flock_destroy(req, mode, *flags);
+               req->l_req_mode = LCK_NL;
+               *flags |= LDLM_FL_LOCK_CHANGED;
+               RETURN(LDLM_ITER_STOP);
+       }
+
+       /* If we had slept on this lock request, take it off the
+        * deadlock detection hash list. */
+       ldlm_flock_blocking_unlink(req);
+
+       /* Scan the locks owned by this process that overlap this request.
+        * We may have to merge or split existing locks. */
+
+       if (!ownlocks)
+               ownlocks = &res->lr_granted;
+
+       list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) {
+               lock = list_entry(ownlocks, struct ldlm_lock, l_res_link);
+
+               if (!ldlm_same_flock_owner(lock, new))
+                       break;
+
+               if (lock->l_granted_mode == mode) {
+                       /* If the modes are the same then we need to process
+                        * locks that overlap OR adjoin the new lock. The extra
+                        * conditions are necessary to deal with arithmetic
+                        * overflow and underflow. */
+                       if ((new->l_policy_data.l_flock.start >
+                            (lock->l_policy_data.l_flock.end + 1))
+                           && (lock->l_policy_data.l_flock.end !=
+                               OBD_OBJECT_EOF))
+                               continue;
+
+                       if ((new->l_policy_data.l_flock.end <
+                            (lock->l_policy_data.l_flock.start - 1))
+                           && (lock->l_policy_data.l_flock.start != 0))
+                               break;
+
+                       if (new->l_policy_data.l_flock.start <
+                           lock->l_policy_data.l_flock.start) {
+                               lock->l_policy_data.l_flock.start =
+                                       new->l_policy_data.l_flock.start;
+                       } else {
+                               new->l_policy_data.l_flock.start =
+                                       lock->l_policy_data.l_flock.start;
+                       }
+
+                       if (new->l_policy_data.l_flock.end >
+                           lock->l_policy_data.l_flock.end) {
+                               lock->l_policy_data.l_flock.end =
+                                       new->l_policy_data.l_flock.end;
+                       } else {
+                               new->l_policy_data.l_flock.end =
+                                       lock->l_policy_data.l_flock.end;
+                       }
+
+                       if (added) {
+                               ldlm_flock_destroy(lock, mode, *flags);
+                       } else {
+                               new = lock;
+                               added = 1;
+                       }
+                       continue;
+               }
+
+               if (new->l_policy_data.l_flock.start >
+                   lock->l_policy_data.l_flock.end)
+                       continue;
+
+               if (new->l_policy_data.l_flock.end <
+                   lock->l_policy_data.l_flock.start)
+                       break;
+
+               ++overlaps;
+
+               if (new->l_policy_data.l_flock.start <=
+                   lock->l_policy_data.l_flock.start) {
+                       if (new->l_policy_data.l_flock.end <
+                           lock->l_policy_data.l_flock.end) {
+                               lock->l_policy_data.l_flock.start =
+                                       new->l_policy_data.l_flock.end + 1;
+                               break;
+                       }
+                       ldlm_flock_destroy(lock, lock->l_req_mode, *flags);
+                       continue;
+               }
+               if (new->l_policy_data.l_flock.end >=
+                   lock->l_policy_data.l_flock.end) {
+                       lock->l_policy_data.l_flock.end =
+                               new->l_policy_data.l_flock.start - 1;
+                       continue;
+               }
+
+               /* split the existing lock into two locks */
+
+               /* if this is an F_UNLCK operation then we could avoid
+                * allocating a new lock and use the req lock passed in
+                * with the request but this would complicate the reply
+                * processing since updates to req get reflected in the
+                * reply. The client side replays the lock request so
+                * it must see the original lock data in the reply. */
+
+               /* XXX - if ldlm_lock_new() can sleep we should
+                * release the lr_lock, allocate the new lock,
+                * and restart processing this lock. */
+               if (!new2) {
+                       unlock_res_and_lock(req);
+                       new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK,
+                                               lock->l_granted_mode, &null_cbs,
+                                               NULL, 0, LVB_T_NONE);
+                       lock_res_and_lock(req);
+                       if (!new2) {
+                               ldlm_flock_destroy(req, lock->l_granted_mode,
+                                                  *flags);
+                               *err = -ENOLCK;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+                       goto reprocess;
+               }
+
+               splitted = 1;
+
+               new2->l_granted_mode = lock->l_granted_mode;
+               new2->l_policy_data.l_flock.pid =
+                       new->l_policy_data.l_flock.pid;
+               new2->l_policy_data.l_flock.owner =
+                       new->l_policy_data.l_flock.owner;
+               new2->l_policy_data.l_flock.start =
+                       lock->l_policy_data.l_flock.start;
+               new2->l_policy_data.l_flock.end =
+                       new->l_policy_data.l_flock.start - 1;
+               lock->l_policy_data.l_flock.start =
+                       new->l_policy_data.l_flock.end + 1;
+               new2->l_conn_export = lock->l_conn_export;
+               if (lock->l_export != NULL) {
+                       new2->l_export = class_export_lock_get(lock->l_export, new2);
+                       if (new2->l_export->exp_lock_hash &&
+                           hlist_unhashed(&new2->l_exp_hash))
+                               cfs_hash_add(new2->l_export->exp_lock_hash,
+                                            &new2->l_remote_handle,
+                                            &new2->l_exp_hash);
+               }
+               if (*flags == LDLM_FL_WAIT_NOREPROC)
+                       ldlm_lock_addref_internal_nolock(new2,
+                                                        lock->l_granted_mode);
+
+               /* insert new2 at lock */
+               ldlm_resource_add_lock(res, ownlocks, new2);
+               LDLM_LOCK_RELEASE(new2);
+               break;
+       }
+
+       /* if new2 was created but never used, destroy it */
+       if (splitted == 0 && new2 != NULL)
+               ldlm_lock_destroy_nolock(new2);
+
+       /* At this point we're granting the lock request. */
+       req->l_granted_mode = req->l_req_mode;
+
+       /* Add req to the granted queue before calling ldlm_reprocess_all(). */
+       if (!added) {
+               list_del_init(&req->l_res_link);
+               /* insert new lock before ownlocks in list. */
+               ldlm_resource_add_lock(res, ownlocks, req);
+       }
+
+       if (*flags != LDLM_FL_WAIT_NOREPROC) {
+               /* The only possible case of a client-side call into the flock
+                * policy function is ldlm_flock_completion_ast(), which always
+                * carries the LDLM_FL_WAIT_NOREPROC flag. */
+               CERROR("Illegal parameter for client-side-only module.\n");
+               LBUG();
+       }
+
+       /* In case we're reprocessing the requested lock we can't destroy
+        * it until after calling ldlm_add_ast_work_item() above so that laawi()
+        * can bump the reference count on \a req. Otherwise \a req
+        * could be freed before the completion AST can be sent.  */
+       if (added)
+               ldlm_flock_destroy(req, mode, *flags);
+
+       ldlm_resource_dump(D_INFO, res);
+       RETURN(LDLM_ITER_CONTINUE);
+}
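To make the split arithmetic above concrete: when an existing granted lock must be split around a narrower range from the same owner, the newly allocated piece keeps the left part and the original lock keeps the right part. A standalone sketch with invented values:

    #include <stdint.h>
    #include <stdio.h>

    struct range { uint64_t start, end; };

    /* Split 'old' around 'new_r' (new_r strictly inside old), as in the code
     * above: the left piece keeps old->start, old itself keeps old->end. */
    static void split(struct range *old, const struct range *new_r,
                      struct range *left)
    {
            left->start = old->start;
            left->end = new_r->start - 1;
            old->start = new_r->end + 1;
    }

    int main(void)
    {
            struct range existing = { 0, 99 }, unlock = { 40, 59 }, left;

            split(&existing, &unlock, &left);
            printf("[%llu,%llu] and [%llu,%llu]\n",
                   (unsigned long long)left.start, (unsigned long long)left.end,
                   (unsigned long long)existing.start,
                   (unsigned long long)existing.end);
            return 0;   /* prints: [0,39] and [60,99] */
    }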
+
+struct ldlm_flock_wait_data {
+       struct ldlm_lock *fwd_lock;
+       int            fwd_generation;
+};
+
+static void
+ldlm_flock_interrupted_wait(void *data)
+{
+       struct ldlm_lock *lock;
+       ENTRY;
+
+       lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock;
+
+       /* take lock off the deadlock detection hash list. */
+       lock_res_and_lock(lock);
+       ldlm_flock_blocking_unlink(lock);
+
+       /* client side - set flag to prevent lock from being put on LRU list */
+       lock->l_flags |= LDLM_FL_CBPENDING;
+       unlock_res_and_lock(lock);
+
+       EXIT;
+}
+
+/**
+ * Flock completion callback function.
+ *
+ * \param lock [in,out]: A lock to be handled
+ * \param flags    [in]: flags
+ * \param data     [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg
+ *
+ * \retval 0    : success
+ * \retval <0   : failure
+ */
+int
+ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+       struct file_lock                *getlk = lock->l_ast_data;
+       struct obd_device             *obd;
+       struct obd_import             *imp = NULL;
+       struct ldlm_flock_wait_data     fwd;
+       struct l_wait_info            lwi;
+       ldlm_error_t                err;
+       int                          rc = 0;
+       ENTRY;
+
+       CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n",
+              flags, data, getlk);
+
+       /* Import invalidation. We need to actually release the lock
+        * references being held, so that the lock can go away. There is no
+        * point in holding the lock even if the app still believes it has it,
+        * since the server has already dropped it anyway. This applies only to
+        * granted locks. */
+       if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) ==
+           (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) {
+               if (lock->l_req_mode == lock->l_granted_mode &&
+                   lock->l_granted_mode != LCK_NL &&
+                   NULL == data)
+                       ldlm_lock_decref_internal(lock, lock->l_req_mode);
+
+               /* Need to wake up the waiter if we were evicted */
+               wake_up(&lock->l_waitq);
+               RETURN(0);
+       }
+
+       LASSERT(flags != LDLM_FL_WAIT_NOREPROC);
+
+       if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+                      LDLM_FL_BLOCK_CONV))) {
+               if (NULL == data)
+                       /* mds granted the lock in the reply */
+                       goto granted;
+               /* CP AST RPC: the lock got granted, wake it up */
+               wake_up(&lock->l_waitq);
+               RETURN(0);
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+                  "sleeping");
+       fwd.fwd_lock = lock;
+       obd = class_exp2obd(lock->l_conn_export);
+
+       /* if this is a local lock, there is no import */
+       if (NULL != obd)
+               imp = obd->u.cli.cl_import;
+
+       if (NULL != imp) {
+               spin_lock(&imp->imp_lock);
+               fwd.fwd_generation = imp->imp_generation;
+               spin_unlock(&imp->imp_lock);
+       }
+
+       lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd);
+
+       /* Go to sleep until the lock is granted. */
+       rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi);
+
+       if (rc) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+                          rc);
+               RETURN(rc);
+       }
+
+granted:
+       OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10);
+
+       if (lock->l_destroyed) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed");
+               RETURN(0);
+       }
+
+       if (lock->l_flags & LDLM_FL_FAILED) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: failed");
+               RETURN(-EIO);
+       }
+
+       if (rc) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+                          rc);
+               RETURN(rc);
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue granted");
+
+       lock_res_and_lock(lock);
+
+       /* take lock off the deadlock detection hash list. */
+       ldlm_flock_blocking_unlink(lock);
+
+       /* ldlm_lock_enqueue() has already placed lock on the granted list. */
+       list_del_init(&lock->l_res_link);
+
+       if (flags & LDLM_FL_TEST_LOCK) {
+               /* fcntl(F_GETLK) request */
+               /* The old mode was saved in getlk->fl_type so that if the mode
+                * in the lock changes we can decref the appropriate refcount. */
+               ldlm_flock_destroy(lock, flock_type(getlk),
+                                  LDLM_FL_WAIT_NOREPROC);
+               switch (lock->l_granted_mode) {
+               case LCK_PR:
+                       flock_set_type(getlk, F_RDLCK);
+                       break;
+               case LCK_PW:
+                       flock_set_type(getlk, F_WRLCK);
+                       break;
+               default:
+                       flock_set_type(getlk, F_UNLCK);
+               }
+               flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid);
+               flock_set_start(getlk,
+                               (loff_t)lock->l_policy_data.l_flock.start);
+               flock_set_end(getlk,
+                             (loff_t)lock->l_policy_data.l_flock.end);
+       } else {
+               __u64 noreproc = LDLM_FL_WAIT_NOREPROC;
+
+               /* We need to reprocess the lock to do merges or splits
+                * with existing locks owned by this process. */
+               ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL);
+       }
+       unlock_res_and_lock(lock);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_flock_completion_ast);
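The F_GETLK branch above maps the granted LDLM mode back to an fcntl lock type. A hedged standalone sketch of that mapping (the EX_LCK_* values are invented stand-ins for the real LCK_* constants):

    #include <fcntl.h>
    #include <stdio.h>

    enum { EX_LCK_PR = 1, EX_LCK_PW = 2, EX_LCK_NL = 3 };

    /* Granted-mode -> fcntl type mapping used by the F_GETLK path above. */
    static short mode_to_fcntl_type(int mode)
    {
            switch (mode) {
            case EX_LCK_PR: return F_RDLCK;
            case EX_LCK_PW: return F_WRLCK;
            default:        return F_UNLCK;
            }
    }

    int main(void)
    {
            printf("%d %d %d\n", mode_to_fcntl_type(EX_LCK_PR),
                   mode_to_fcntl_type(EX_LCK_PW), mode_to_fcntl_type(EX_LCK_NL));
            return 0;
    }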
+
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                           void *data, int flag)
+{
+       ENTRY;
+
+       LASSERT(lock);
+       LASSERT(flag == LDLM_CB_CANCELING);
+
+       /* take lock off the deadlock detection hash list. */
+       lock_res_and_lock(lock);
+       ldlm_flock_blocking_unlink(lock);
+       unlock_res_and_lock(lock);
+       RETURN(0);
+}
+
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                      ldlm_policy_data_t *lpolicy)
+{
+       memset(lpolicy, 0, sizeof(*lpolicy));
+       lpolicy->l_flock.start = wpolicy->l_flock.lfw_start;
+       lpolicy->l_flock.end = wpolicy->l_flock.lfw_end;
+       lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid;
+       /* Compatibility code: old clients had no idea about the owner field
+        * and relied solely on the pid for ownership. The owner field was
+        * introduced in LU-104 (2.1, April 2011). */
+       lpolicy->l_flock.owner = wpolicy->l_flock.lfw_pid;
+}
+
+
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                      ldlm_policy_data_t *lpolicy)
+{
+       memset(lpolicy, 0, sizeof(*lpolicy));
+       lpolicy->l_flock.start = wpolicy->l_flock.lfw_start;
+       lpolicy->l_flock.end = wpolicy->l_flock.lfw_end;
+       lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid;
+       lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner;
+}
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy)
+{
+       memset(wpolicy, 0, sizeof(*wpolicy));
+       wpolicy->l_flock.lfw_start = lpolicy->l_flock.start;
+       wpolicy->l_flock.lfw_end = lpolicy->l_flock.end;
+       wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid;
+       wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner;
+}
+
+/*
+ * Export handle<->flock hash operations.
+ */
+static unsigned
+ldlm_export_flock_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_u64_hash(*(__u64 *)key, mask);
+}
+
+static void *
+ldlm_export_flock_key(struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+       return &lock->l_policy_data.l_flock.owner;
+}
+
+static int
+ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode)
+{
+       return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64));
+}
+
+static void *
+ldlm_export_flock_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+}
+
+static void
+ldlm_export_flock_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+       struct ldlm_flock *flock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+       LDLM_LOCK_GET(lock);
+
+       flock = &lock->l_policy_data.l_flock;
+       LASSERT(flock->blocking_export != NULL);
+       class_export_get(flock->blocking_export);
+       flock->blocking_refs++;
+}
+
+static void
+ldlm_export_flock_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+       struct ldlm_flock *flock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+       LDLM_LOCK_RELEASE(lock);
+
+       flock = &lock->l_policy_data.l_flock;
+       LASSERT(flock->blocking_export != NULL);
+       class_export_put(flock->blocking_export);
+       if (--flock->blocking_refs == 0) {
+               flock->blocking_owner = 0;
+               flock->blocking_export = NULL;
+       }
+}
+
+static cfs_hash_ops_t ldlm_export_flock_ops = {
+       .hs_hash        = ldlm_export_flock_hash,
+       .hs_key  = ldlm_export_flock_key,
+       .hs_keycmp      = ldlm_export_flock_keycmp,
+       .hs_object      = ldlm_export_flock_object,
+       .hs_get  = ldlm_export_flock_get,
+       .hs_put  = ldlm_export_flock_put,
+       .hs_put_locked  = ldlm_export_flock_put,
+};
+
+int ldlm_init_flock_export(struct obd_export *exp)
+{
+       exp->exp_flock_hash =
+               cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+                               HASH_EXP_LOCK_CUR_BITS,
+                               HASH_EXP_LOCK_MAX_BITS,
+                               HASH_EXP_LOCK_BKT_BITS, 0,
+                               CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+                               &ldlm_export_flock_ops,
+                               CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE);
+       if (!exp->exp_flock_hash)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_init_flock_export);
+
+void ldlm_destroy_flock_export(struct obd_export *exp)
+{
+       ENTRY;
+       if (exp->exp_flock_hash) {
+               cfs_hash_putref(exp->exp_flock_hash);
+               exp->exp_flock_hash = NULL;
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_flock_export);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c
new file mode 100644 (file)
index 0000000..574b2ff
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_inodebits.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains the implementation of the IBITS lock type.
+ *
+ * An IBITS lock contains a bit mask determining various properties of an
+ * object. The meanings of the specific bits are defined by the caller and
+ * are opaque to the LDLM code.
+ *
+ * Locks with intersecting bitmasks and conflicting lock modes (e.g.  LCK_PW)
+ * are considered conflicting.  See the lock mode compatibility matrix
+ * in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy)
+{
+       memset(lpolicy, 0, sizeof(*lpolicy));
+       lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits;
+}
+
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy)
+{
+       memset(wpolicy, 0, sizeof(*wpolicy));
+       wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits;
+}
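A standalone sketch of the IBITS conflict rule described in the header comment of this file; mode_compat() here is a toy stand-in for the real compatibility matrix in lustre_dlm.h, and the bit and mode values are invented:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Two IBITS locks conflict only when their bit masks intersect and their
     * modes are incompatible. */
    static bool ibits_conflict(uint64_t bits_a, int mode_a,
                               uint64_t bits_b, int mode_b,
                               bool (*mode_compat)(int, int))
    {
            return (bits_a & bits_b) != 0 && !mode_compat(mode_a, mode_b);
    }

    /* toy matrix: mode 0 = reader, 1 = writer; only two readers are compatible */
    static bool both_readers(int a, int b) { return a == 0 && b == 0; }

    int main(void)
    {
            /* same bit + one writer -> conflict; disjoint bits -> no conflict */
            printf("%d %d\n",
                   ibits_conflict(0x1, 1, 0x1, 0, both_readers),
                   ibits_conflict(0x1, 1, 0x2, 1, both_readers));
            return 0;   /* prints: 1 0 */
    }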
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h
new file mode 100644 (file)
index 0000000..a08e6d9
--- /dev/null
@@ -0,0 +1,277 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define MAX_STRING_SIZE 128
+
+extern atomic_t ldlm_srv_namespace_nr;
+extern atomic_t ldlm_cli_namespace_nr;
+extern struct mutex ldlm_srv_namespace_lock;
+extern struct list_head ldlm_srv_namespace_list;
+extern struct mutex ldlm_cli_namespace_lock;
+extern struct list_head ldlm_cli_namespace_list;
+
+static inline atomic_t *ldlm_namespace_nr(ldlm_side_t client)
+{
+       return client == LDLM_NAMESPACE_SERVER ?
+               &ldlm_srv_namespace_nr : &ldlm_cli_namespace_nr;
+}
+
+static inline struct list_head *ldlm_namespace_list(ldlm_side_t client)
+{
+       return client == LDLM_NAMESPACE_SERVER ?
+               &ldlm_srv_namespace_list : &ldlm_cli_namespace_list;
+}
+
+static inline struct mutex *ldlm_namespace_lock(ldlm_side_t client)
+{
+       return client == LDLM_NAMESPACE_SERVER ?
+               &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock;
+}
+
+/* ldlm_request.c */
+/* LRU cancel flags; they select which locks get cancelled. */
+enum {
+       LDLM_CANCEL_AGED   = 1 << 0, /* Cancel aged locks (non lru resize). */
+       LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */
+       LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */
+       LDLM_CANCEL_LRUR   = 1 << 3, /* Cancel locks from lru resize. */
+       LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither
+                                     * sending nor waiting for any rpcs) */
+};
+
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+                   ldlm_cancel_flags_t sync, int flags);
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns,
+                         struct list_head *cancels, int count, int max,
+                         ldlm_cancel_flags_t cancel_flags, int flags);
+extern int ldlm_enqueue_min;
+int ldlm_get_enq_timeout(struct ldlm_lock *lock);
+
+/* ldlm_resource.c */
+int ldlm_resource_putref_locked(struct ldlm_resource *res);
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+                                    struct ldlm_lock *new);
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+                              struct obd_import *imp, int force);
+void ldlm_namespace_free_post(struct ldlm_namespace *ns);
+/* ldlm_lock.c */
+
+struct ldlm_cb_set_arg {
+       struct ptlrpc_request_set       *set;
+       int                              type; /* LDLM_{CP,BL,GL}_CALLBACK */
+       atomic_t                         restart;
+       struct list_head                        *list;
+       union ldlm_gl_desc              *gl_desc; /* glimpse AST descriptor */
+};
+
+typedef enum {
+       LDLM_WORK_BL_AST,
+       LDLM_WORK_CP_AST,
+       LDLM_WORK_REVOKE_AST,
+       LDLM_WORK_GL_AST
+} ldlm_desc_ast_t;
+
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list);
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+                 enum req_location loc, void *data, int size);
+struct ldlm_lock *
+ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *,
+                ldlm_type_t type, ldlm_mode_t,
+                const struct ldlm_callback_suite *cbs,
+                void *data, __u32 lvb_len, enum lvb_type lvb_type);
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **,
+                              void *cookie, __u64 *flags);
+void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+                           struct list_head *work_list);
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+                     ldlm_desc_ast_t ast_type);
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock);
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock);
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock);
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock);
+
+void ldlm_cancel_locks_for_export(struct obd_export *export);
+
+/* ldlm_lockd.c */
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+                          struct ldlm_lock *lock);
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns,
+                          struct ldlm_lock_desc *ld,
+                          struct list_head *cancels, int count,
+                          ldlm_cancel_flags_t cancel_flags);
+
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+                            struct ldlm_lock_desc *ld, struct ldlm_lock *lock);
+
+
+/* ldlm_extent.c */
+void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock);
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock);
+
+/* ldlm_flock.c */
+int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags,
+                           int first_enq, ldlm_error_t *err,
+                           struct list_head *work_list);
+int ldlm_init_flock_export(struct obd_export *exp);
+void ldlm_destroy_flock_export(struct obd_export *exp);
+
+/* l_lock.c */
+void l_check_ns_lock(struct ldlm_namespace *ns);
+void l_check_no_ns_lock(struct ldlm_namespace *ns);
+
+extern proc_dir_entry_t *ldlm_svc_proc_dir;
+extern proc_dir_entry_t *ldlm_type_proc_dir;
+
+struct ldlm_state {
+       struct ptlrpc_service *ldlm_cb_service;
+       struct ptlrpc_service *ldlm_cancel_service;
+       struct ptlrpc_client *ldlm_client;
+       struct ptlrpc_connection *ldlm_server_conn;
+       struct ldlm_bl_pool *ldlm_bl_pool;
+};
+
+/* interval tree, for LDLM_EXTENT. */
+extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */
+extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock);
+extern void ldlm_interval_free(struct ldlm_interval *node);
+/* this function must be called with res lock held */
+static inline struct ldlm_extent *
+ldlm_interval_extent(struct ldlm_interval *node)
+{
+       struct ldlm_lock *lock;
+       LASSERT(!list_empty(&node->li_group));
+
+       lock = list_entry(node->li_group.next, struct ldlm_lock,
+                             l_sl_policy);
+       return &lock->l_policy_data.l_extent;
+}
+
+int ldlm_init(void);
+void ldlm_exit(void);
+
+enum ldlm_policy_res {
+       LDLM_POLICY_CANCEL_LOCK,
+       LDLM_POLICY_KEEP_LOCK,
+       LDLM_POLICY_SKIP_LOCK
+};
+
+typedef enum ldlm_policy_res ldlm_policy_res_t;
+
+#define LDLM_POOL_PROC_READER(var, type)                                   \
+       static int lprocfs_rd_##var(char *page, char **start, off_t off,    \
+                                   int count, int *eof, void *data)    \
+       {                                                                  \
+               struct ldlm_pool *pl = data;                            \
+               type tmp;                                                  \
+                                                                           \
+               spin_lock(&pl->pl_lock);                                    \
+               tmp = pl->pl_##var;                                         \
+               spin_unlock(&pl->pl_lock);                                  \
+                                                                           \
+               return lprocfs_rd_uint(page, start, off, count, eof, &tmp); \
+       }                                                                  \
+       struct __##var##__dummy_read {;} /* semicolon catcher */
+
+#define LDLM_POOL_PROC_WRITER(var, type)                                   \
+       int lprocfs_wr_##var(struct file *file, const char *buffer,      \
+                            unsigned long count, void *data)          \
+       {                                                                  \
+               struct ldlm_pool *pl = data;                            \
+               type tmp;                                                  \
+               int rc;                                              \
+                                                                           \
+               rc = lprocfs_wr_uint(file, buffer, count, &tmp);            \
+               if (rc < 0) {                                          \
+                       CERROR("Can't parse user input, rc = %d\n", rc);    \
+                       return rc;                                        \
+               }                                                          \
+                                                                           \
+               spin_lock(&pl->pl_lock);                                    \
+               pl->pl_##var = tmp;                                         \
+               spin_unlock(&pl->pl_lock);                                  \
+                                                                           \
+               return rc;                                                \
+       }                                                                  \
+       struct __##var##__dummy_write {;} /* semicolon catcher */
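A hedged usage sketch for the two macros above: each expansion defines one /proc read or write handler for a struct ldlm_pool field. The field name "granted" is hypothetical here (it assumes a pl_granted member); the real instantiations live elsewhere in the ldlm code:

    LDLM_POOL_PROC_READER(granted, int);  /* defines lprocfs_rd_granted()  */
    LDLM_POOL_PROC_WRITER(granted, int);  /* defines lprocfs_wr_granted()  */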
+
+static inline int is_granted_or_cancelled(struct ldlm_lock *lock)
+{
+       int ret = 0;
+
+       lock_res_and_lock(lock);
+       if (((lock->l_req_mode == lock->l_granted_mode) &&
+            !(lock->l_flags & LDLM_FL_CP_REQD)) ||
+           (lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCEL)))
+               ret = 1;
+       unlock_res_and_lock(lock);
+
+       return ret;
+}
+
+typedef void (*ldlm_policy_wire_to_local_t)(const ldlm_wire_policy_data_t *,
+                                           ldlm_policy_data_t *);
+
+typedef void (*ldlm_policy_local_to_wire_t)(const ldlm_policy_data_t *,
+                                           ldlm_wire_policy_data_t *);
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy);
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy);
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy);
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
new file mode 100644 (file)
index 0000000..42df530
--- /dev/null
@@ -0,0 +1,868 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file deals with various client/target related logic including recovery.
+ *
+ * TODO: This code more logically belongs in the ptlrpc module than in ldlm and
+ * should be moved.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+#include "ldlm_internal.h"
+
+/* @priority: If non-zero, move the selected connection to the list head.
+ * @create: If zero, only search in existing connections.
+ */
+static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid,
+                          int priority, int create)
+{
+       struct ptlrpc_connection *ptlrpc_conn;
+       struct obd_import_conn *imp_conn = NULL, *item;
+       int rc = 0;
+       ENTRY;
+
+       if (!create && !priority) {
+               CDEBUG(D_HA, "Nothing to do\n");
+               RETURN(-EINVAL);
+       }
+
+       ptlrpc_conn = ptlrpc_uuid_to_connection(uuid);
+       if (!ptlrpc_conn) {
+               CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid);
+               RETURN (-ENOENT);
+       }
+
+       if (create) {
+               OBD_ALLOC(imp_conn, sizeof(*imp_conn));
+               if (!imp_conn) {
+                       GOTO(out_put, rc = -ENOMEM);
+               }
+       }
+
+       spin_lock(&imp->imp_lock);
+       list_for_each_entry(item, &imp->imp_conn_list, oic_item) {
+               if (obd_uuid_equals(uuid, &item->oic_uuid)) {
+                       if (priority) {
+                               list_del(&item->oic_item);
+                               list_add(&item->oic_item,
+                                            &imp->imp_conn_list);
+                               item->oic_last_attempt = 0;
+                       }
+                       CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n",
+                              imp, imp->imp_obd->obd_name, uuid->uuid,
+                              (priority ? ", moved to head" : ""));
+                       spin_unlock(&imp->imp_lock);
+                       GOTO(out_free, rc = 0);
+               }
+       }
+       /* No existing import connection found for \a uuid. */
+       if (create) {
+               imp_conn->oic_conn = ptlrpc_conn;
+               imp_conn->oic_uuid = *uuid;
+               imp_conn->oic_last_attempt = 0;
+               if (priority)
+                       list_add(&imp_conn->oic_item, &imp->imp_conn_list);
+               else
+                       list_add_tail(&imp_conn->oic_item,
+                                         &imp->imp_conn_list);
+               CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n",
+                      imp, imp->imp_obd->obd_name, uuid->uuid,
+                      (priority ? "head" : "tail"));
+       } else {
+               spin_unlock(&imp->imp_lock);
+               GOTO(out_free, rc = -ENOENT);
+       }
+
+       spin_unlock(&imp->imp_lock);
+       RETURN(0);
+out_free:
+       if (imp_conn)
+               OBD_FREE(imp_conn, sizeof(*imp_conn));
+out_put:
+       ptlrpc_connection_put(ptlrpc_conn);
+       RETURN(rc);
+}
+
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid)
+{
+       return import_set_conn(imp, uuid, 1, 0);
+}
+
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+                          int priority)
+{
+       return import_set_conn(imp, uuid, priority, 1);
+}
+EXPORT_SYMBOL(client_import_add_conn);
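For readability, the two wrappers above map onto import_set_conn() as follows (summary only, inferred from the code above):

        /* import_set_conn_priority(imp, uuid) == import_set_conn(imp, uuid, 1, 0):
         *     move an existing connection to the head of imp->imp_conn_list;
         *     fails with -ENOENT if the connection does not exist.
         * client_import_add_conn(imp, uuid, prio) == import_set_conn(imp, uuid, prio, 1):
         *     add the connection (allocating a new obd_import_conn) or, if it
         *     already exists, optionally move it to the head of the list. */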
+
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+       struct obd_import_conn *imp_conn;
+       struct obd_export *dlmexp;
+       int rc = -ENOENT;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       if (list_empty(&imp->imp_conn_list)) {
+               LASSERT(!imp->imp_connection);
+               GOTO(out, rc);
+       }
+
+       list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) {
+               if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid))
+                       continue;
+               LASSERT(imp_conn->oic_conn);
+
+               if (imp_conn == imp->imp_conn_current) {
+                       LASSERT(imp_conn->oic_conn == imp->imp_connection);
+
+                       if (imp->imp_state != LUSTRE_IMP_CLOSED &&
+                           imp->imp_state != LUSTRE_IMP_DISCON) {
+                               CERROR("can't remove current connection\n");
+                               GOTO(out, rc = -EBUSY);
+                       }
+
+                       ptlrpc_connection_put(imp->imp_connection);
+                       imp->imp_connection = NULL;
+
+                       dlmexp = class_conn2export(&imp->imp_dlm_handle);
+                       if (dlmexp && dlmexp->exp_connection) {
+                               LASSERT(dlmexp->exp_connection ==
+                                       imp_conn->oic_conn);
+                               ptlrpc_connection_put(dlmexp->exp_connection);
+                               dlmexp->exp_connection = NULL;
+                       }
+               }
+
+               list_del(&imp_conn->oic_item);
+               ptlrpc_connection_put(imp_conn->oic_conn);
+               OBD_FREE(imp_conn, sizeof(*imp_conn));
+               CDEBUG(D_HA, "imp %p@%s: remove connection %s\n",
+                      imp, imp->imp_obd->obd_name, uuid->uuid);
+               rc = 0;
+               break;
+       }
+out:
+       spin_unlock(&imp->imp_lock);
+       if (rc == -ENOENT)
+               CERROR("connection %s not found\n", uuid->uuid);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(client_import_del_conn);
+
+/**
+ * Find conn UUID by peer NID. \a peer is a server NID. This function is used
+ * to find a conn uuid of \a imp which can reach \a peer.
+ */
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+                           struct obd_uuid *uuid)
+{
+       struct obd_import_conn *conn;
+       int rc = -ENOENT;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+               /* Check whether this conn UUID has the given peer NID. */
+               if (class_check_uuid(&conn->oic_uuid, peer)) {
+                       *uuid = conn->oic_uuid;
+                       rc = 0;
+                       break;
+               }
+       }
+       spin_unlock(&imp->imp_lock);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(client_import_find_conn);
+
+void client_destroy_import(struct obd_import *imp)
+{
+       /* Drop security policy instance after all RPCs have finished/aborted
+        * to let all busy contexts be released. */
+       class_import_get(imp);
+       class_destroy_import(imp);
+       sptlrpc_import_sec_put(imp);
+       class_import_put(imp);
+}
+EXPORT_SYMBOL(client_destroy_import);
+
+/**
+ * Check whether the OSC is on the MDT, judged by the device name format
+ * in the config log:
+ * osc on MDT:
+ *     setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID
+ * osc on client:
+ *     setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID
+ */
+static int osc_on_mdt(char *obdname)
+{
+       char *ptr;
+
+       ptr = strrchr(obdname, '-');
+       if (ptr == NULL)
+               return 0;
+
+       if (strncmp(ptr + 1, "MDT", 3) == 0)
+               return 1;
+
+       return 0;
+}
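A short illustration of the name check above, using hypothetical device names that follow the config-log patterns quoted in the comment:

        /* Hypothetical examples:
         *   osc_on_mdt("lustre-OST0000-osc-MDT0000") -> 1  (text after the last
         *                                                   '-' starts with "MDT")
         *   osc_on_mdt("lustre-OST0000-osc")         -> 0  (text after it is "osc")
         */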
+
+/* Configure an RPC client OBD device.
+ *
+ * lcfg parameters:
+ * 1 - client UUID
+ * 2 - server UUID
+ * 3 - inactive-on-startup
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+       struct client_obd *cli = &obddev->u.cli;
+       struct obd_import *imp;
+       struct obd_uuid server_uuid;
+       int rq_portal, rp_portal, connect_op;
+       char *name = obddev->obd_type->typ_name;
+       ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
+       int rc;
+       char    *cli_name = lustre_cfg_buf(lcfg, 0);
+       ENTRY;
+
+       /* In a more perfect world, we would hang a ptlrpc_client off of
+        * obd_type and just use the values from there. */
+       if (!strcmp(name, LUSTRE_OSC_NAME) ||
+           (!(strcmp(name, LUSTRE_OSP_NAME)) &&
+            (is_osp_on_mdt(cli_name) &&
+              strstr(lustre_cfg_buf(lcfg, 1), "OST") != NULL))) {
+               /* OSC or OSP_on_MDT for OSTs */
+               rq_portal = OST_REQUEST_PORTAL;
+               rp_portal = OSC_REPLY_PORTAL;
+               connect_op = OST_CONNECT;
+               cli->cl_sp_me = LUSTRE_SP_CLI;
+               cli->cl_sp_to = LUSTRE_SP_OST;
+               ns_type = LDLM_NS_TYPE_OSC;
+       } else if (!strcmp(name, LUSTRE_MDC_NAME) ||
+                  !strcmp(name, LUSTRE_LWP_NAME) ||
+                  (!strcmp(name, LUSTRE_OSP_NAME) &&
+                   (is_osp_on_mdt(cli_name) &&
+                    strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL))) {
+               /* MDC or OSP_on_MDT for other MDTs */
+               rq_portal = MDS_REQUEST_PORTAL;
+               rp_portal = MDC_REPLY_PORTAL;
+               connect_op = MDS_CONNECT;
+               cli->cl_sp_me = LUSTRE_SP_CLI;
+               cli->cl_sp_to = LUSTRE_SP_MDT;
+               ns_type = LDLM_NS_TYPE_MDC;
+       } else if (!strcmp(name, LUSTRE_MGC_NAME)) {
+               rq_portal = MGS_REQUEST_PORTAL;
+               rp_portal = MGC_REPLY_PORTAL;
+               connect_op = MGS_CONNECT;
+               cli->cl_sp_me = LUSTRE_SP_MGC;
+               cli->cl_sp_to = LUSTRE_SP_MGS;
+               cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID;
+               ns_type = LDLM_NS_TYPE_MGC;
+       } else {
+               CERROR("unknown client OBD type \"%s\", can't setup\n",
+                      name);
+               RETURN(-EINVAL);
+       }
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+               CERROR("requires a TARGET UUID\n");
+               RETURN(-EINVAL);
+       }
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) {
+               CERROR("client UUID must be less than 38 characters\n");
+               RETURN(-EINVAL);
+       }
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) {
+               CERROR("setup requires a SERVER UUID\n");
+               RETURN(-EINVAL);
+       }
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) {
+               CERROR("target UUID must be less than 38 characters\n");
+               RETURN(-EINVAL);
+       }
+
+       init_rwsem(&cli->cl_sem);
+       sema_init(&cli->cl_mgc_sem, 1);
+       cli->cl_conn_count = 0;
+       memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2),
+              min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2),
+                    sizeof(server_uuid)));
+
+       cli->cl_dirty = 0;
+       cli->cl_avail_grant = 0;
+       /* FIXME: Should limit this for the sum of all cl_dirty_max. */
+       cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024;
+       if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > num_physpages / 8)
+               cli->cl_dirty_max = num_physpages << (PAGE_CACHE_SHIFT - 3);
+       INIT_LIST_HEAD(&cli->cl_cache_waiters);
+       INIT_LIST_HEAD(&cli->cl_loi_ready_list);
+       INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
+       INIT_LIST_HEAD(&cli->cl_loi_write_list);
+       INIT_LIST_HEAD(&cli->cl_loi_read_list);
+       client_obd_list_lock_init(&cli->cl_loi_list_lock);
+       atomic_set(&cli->cl_pending_w_pages, 0);
+       atomic_set(&cli->cl_pending_r_pages, 0);
+       cli->cl_r_in_flight = 0;
+       cli->cl_w_in_flight = 0;
+
+       spin_lock_init(&cli->cl_read_rpc_hist.oh_lock);
+       spin_lock_init(&cli->cl_write_rpc_hist.oh_lock);
+       spin_lock_init(&cli->cl_read_page_hist.oh_lock);
+       spin_lock_init(&cli->cl_write_page_hist.oh_lock);
+       spin_lock_init(&cli->cl_read_offset_hist.oh_lock);
+       spin_lock_init(&cli->cl_write_offset_hist.oh_lock);
+
+       /* lru for osc. */
+       INIT_LIST_HEAD(&cli->cl_lru_osc);
+       atomic_set(&cli->cl_lru_shrinkers, 0);
+       atomic_set(&cli->cl_lru_busy, 0);
+       atomic_set(&cli->cl_lru_in_list, 0);
+       INIT_LIST_HEAD(&cli->cl_lru_list);
+       client_obd_list_lock_init(&cli->cl_lru_list_lock);
+
+       init_waitqueue_head(&cli->cl_destroy_waitq);
+       atomic_set(&cli->cl_destroy_in_flight, 0);
+       /* Turn on checksumming by default. */
+       cli->cl_checksum = 1;
+       /*
+        * The supported checksum types will be worked out at connect time.
+        * Set cl_cksum* to CRC32 for now to avoid returning bogus values
+        * through procfs.
+        */
+       cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
+       atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
+
+       /* This value may be reduced at connect time in
+        * ptlrpc_connect_interpret(). We initialize it to only
+        * 1MB until we know what the performance looks like.
+        * In the future this should likely be increased. LU-1431 */
+       cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
+                                         LNET_MTU >> PAGE_CACHE_SHIFT);
+
+       if (!strcmp(name, LUSTRE_MDC_NAME)) {
+               cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT;
+       } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) {
+               cli->cl_max_rpcs_in_flight = 2;
+       } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) {
+               cli->cl_max_rpcs_in_flight = 3;
+       } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) {
+               cli->cl_max_rpcs_in_flight = 4;
+       } else {
+               if (osc_on_mdt(obddev->obd_name))
+                       cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT;
+               else
+                       cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
+       }
+       rc = ldlm_get_ref();
+       if (rc) {
+               CERROR("ldlm_get_ref failed: %d\n", rc);
+               GOTO(err, rc);
+       }
+
+       ptlrpc_init_client(rq_portal, rp_portal, name,
+                          &obddev->obd_ldlm_client);
+
+       imp = class_new_import(obddev);
+       if (imp == NULL)
+               GOTO(err_ldlm, rc = -ENOENT);
+       imp->imp_client = &obddev->obd_ldlm_client;
+       imp->imp_connect_op = connect_op;
+       memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
+              LUSTRE_CFG_BUFLEN(lcfg, 1));
+       class_import_put(imp);
+
+       rc = client_import_add_conn(imp, &server_uuid, 1);
+       if (rc) {
+               CERROR("can't add initial connection\n");
+               GOTO(err_import, rc);
+       }
+
+       cli->cl_import = imp;
+       /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
+       cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
+       cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
+               if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
+                       CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
+                              name, obddev->obd_name,
+                              cli->cl_target_uuid.uuid);
+                       spin_lock(&imp->imp_lock);
+                       imp->imp_deactive = 1;
+                       spin_unlock(&imp->imp_lock);
+               }
+       }
+
+       obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name,
+                                                  LDLM_NAMESPACE_CLIENT,
+                                                  LDLM_NAMESPACE_GREEDY,
+                                                  ns_type);
+       if (obddev->obd_namespace == NULL) {
+               CERROR("Unable to create client namespace - %s\n",
+                      obddev->obd_name);
+               GOTO(err_import, rc = -ENOMEM);
+       }
+
+       cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
+
+       RETURN(rc);
+
+err_import:
+       class_destroy_import(imp);
+err_ldlm:
+       ldlm_put_ref();
+err:
+       RETURN(rc);
+
+}
+EXPORT_SYMBOL(client_obd_setup);
+
+int client_obd_cleanup(struct obd_device *obddev)
+{
+       ENTRY;
+
+       ldlm_namespace_free_post(obddev->obd_namespace);
+       obddev->obd_namespace = NULL;
+
+       LASSERT(obddev->u.cli.cl_import == NULL);
+
+       ldlm_put_ref();
+       RETURN(0);
+}
+EXPORT_SYMBOL(client_obd_cleanup);
+
+/* ->o_connect() method for client side (OSC and MDC and MGC) */
+int client_connect_import(const struct lu_env *env,
+                         struct obd_export **exp,
+                         struct obd_device *obd, struct obd_uuid *cluuid,
+                         struct obd_connect_data *data, void *localdata)
+{
+       struct client_obd       *cli    = &obd->u.cli;
+       struct obd_import       *imp    = cli->cl_import;
+       struct obd_connect_data *ocd;
+       struct lustre_handle    conn    = { 0 };
+       int                  rc;
+       ENTRY;
+
+       *exp = NULL;
+       down_write(&cli->cl_sem);
+       if (cli->cl_conn_count > 0 )
+               GOTO(out_sem, rc = -EALREADY);
+
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc)
+               GOTO(out_sem, rc);
+
+       cli->cl_conn_count++;
+       *exp = class_conn2export(&conn);
+
+       LASSERT(obd->obd_namespace);
+
+       imp->imp_dlm_handle = conn;
+       rc = ptlrpc_init_import(imp);
+       if (rc != 0)
+               GOTO(out_ldlm, rc);
+
+       ocd = &imp->imp_connect_data;
+       if (data) {
+               *ocd = *data;
+               imp->imp_connect_flags_orig = data->ocd_connect_flags;
+       }
+
+       rc = ptlrpc_connect_import(imp);
+       if (rc != 0) {
+               LASSERT (imp->imp_state == LUSTRE_IMP_DISCON);
+               GOTO(out_ldlm, rc);
+       }
+       LASSERT((*exp)->exp_connection);
+
+       if (data) {
+               LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) ==
+                        ocd->ocd_connect_flags, "old "LPX64", new "LPX64"\n",
+                        data->ocd_connect_flags, ocd->ocd_connect_flags);
+               data->ocd_connect_flags = ocd->ocd_connect_flags;
+       }
+
+       ptlrpc_pinger_add_import(imp);
+
+       EXIT;
+
+       if (rc) {
+out_ldlm:
+               cli->cl_conn_count--;
+               class_disconnect(*exp);
+               *exp = NULL;
+       }
+out_sem:
+       up_write(&cli->cl_sem);
+
+       return rc;
+}
+EXPORT_SYMBOL(client_connect_import);
+
+int client_disconnect_export(struct obd_export *exp)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct client_obd *cli;
+       struct obd_import *imp;
+       int rc = 0, err;
+       ENTRY;
+
+       if (!obd) {
+               CERROR("invalid export for disconnect: exp %p cookie "LPX64"\n",
+                      exp, exp ? exp->exp_handle.h_cookie : -1);
+               RETURN(-EINVAL);
+       }
+
+       cli = &obd->u.cli;
+       imp = cli->cl_import;
+
+       down_write(&cli->cl_sem);
+       CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name,
+              cli->cl_conn_count);
+
+       if (!cli->cl_conn_count) {
+               CERROR("disconnecting disconnected device (%s)\n",
+                      obd->obd_name);
+               GOTO(out_disconnect, rc = -EINVAL);
+       }
+
+       cli->cl_conn_count--;
+       if (cli->cl_conn_count)
+               GOTO(out_disconnect, rc = 0);
+
+       /* Mark import deactivated now, so we don't try to reconnect if any
+        * of the cleanup RPCs fails (e.g. LDLM cancel, etc).  We don't
+        * fully deactivate the import, or that would drop all requests. */
+       spin_lock(&imp->imp_lock);
+       imp->imp_deactive = 1;
+       spin_unlock(&imp->imp_lock);
+
+       /* Some non-replayable imports (MDS's OSCs) are pinged, so just
+        * delete it regardless.  (It's safe to delete an import that was
+        * never added.) */
+       (void)ptlrpc_pinger_del_import(imp);
+
+       if (obd->obd_namespace != NULL) {
+               /* obd_force == local only */
+               ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
+                                      obd->obd_force ? LCF_LOCAL : 0, NULL);
+               ldlm_namespace_free_prior(obd->obd_namespace, imp, obd->obd_force);
+       }
+
+       /* There's no need to hold sem while disconnecting an import,
+        * and it may actually cause deadlock in GSS. */
+       up_write(&cli->cl_sem);
+       rc = ptlrpc_disconnect_import(imp, 0);
+       down_write(&cli->cl_sem);
+
+       ptlrpc_invalidate_import(imp);
+
+       EXIT;
+
+out_disconnect:
+       /* Use server style - class_disconnect should always be called for
+        * o_disconnect. */
+       err = class_disconnect(exp);
+       if (!rc && err)
+               rc = err;
+
+       up_write(&cli->cl_sem);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(client_disconnect_export);
+
+
+/**
+ * Packs current SLV and Limit into \a req.
+ */
+int target_pack_pool_reply(struct ptlrpc_request *req)
+{
+       struct obd_device *obd;
+       ENTRY;
+
+       /* Check that we still have all structures alive as this may
+        * be some late RPC at shutdown time. */
+       if (unlikely(!req->rq_export || !req->rq_export->exp_obd ||
+                    !exp_connect_lru_resize(req->rq_export))) {
+               lustre_msg_set_slv(req->rq_repmsg, 0);
+               lustre_msg_set_limit(req->rq_repmsg, 0);
+               RETURN(0);
+       }
+
+       /* OBD is alive here as export is alive, which we checked above. */
+       obd = req->rq_export->exp_obd;
+
+       read_lock(&obd->obd_pool_lock);
+       lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv);
+       lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit);
+       read_unlock(&obd->obd_pool_lock);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(target_pack_pool_reply);
+
+int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id)
+{
+       if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
+               DEBUG_REQ(D_ERROR, req, "dropping reply");
+               return (-ECOMM);
+       }
+
+       if (unlikely(rc)) {
+               DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
+               req->rq_status = rc;
+               return (ptlrpc_send_error(req, 1));
+       } else {
+               DEBUG_REQ(D_NET, req, "sending reply");
+       }
+
+       return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT));
+}
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+{
+       struct ptlrpc_service_part *svcpt;
+       int                     netrc;
+       struct ptlrpc_reply_state *rs;
+       struct obd_export        *exp;
+       ENTRY;
+
+       if (req->rq_no_reply) {
+               EXIT;
+               return;
+       }
+
+       svcpt = req->rq_rqbd->rqbd_svcpt;
+       rs = req->rq_reply_state;
+       if (rs == NULL || !rs->rs_difficult) {
+               /* no notifiers */
+               target_send_reply_msg (req, rc, fail_id);
+               EXIT;
+               return;
+       }
+
+       /* must be an export if locks saved */
+       LASSERT (req->rq_export != NULL);
+       /* req/reply consistent */
+       LASSERT(rs->rs_svcpt == svcpt);
+
+       /* "fresh" reply */
+       LASSERT (!rs->rs_scheduled);
+       LASSERT (!rs->rs_scheduled_ever);
+       LASSERT (!rs->rs_handled);
+       LASSERT (!rs->rs_on_net);
+       LASSERT (rs->rs_export == NULL);
+       LASSERT (list_empty(&rs->rs_obd_list));
+       LASSERT (list_empty(&rs->rs_exp_list));
+
+       exp = class_export_get (req->rq_export);
+
+       /* disable reply scheduling while I'm setting up */
+       rs->rs_scheduled = 1;
+       rs->rs_on_net    = 1;
+       rs->rs_xid       = req->rq_xid;
+       rs->rs_transno   = req->rq_transno;
+       rs->rs_export    = exp;
+       rs->rs_opc       = lustre_msg_get_opc(req->rq_reqmsg);
+
+       spin_lock(&exp->exp_uncommitted_replies_lock);
+       CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n",
+              rs->rs_transno, exp->exp_last_committed);
+       if (rs->rs_transno > exp->exp_last_committed) {
+               /* not committed already */
+               list_add_tail(&rs->rs_obd_list,
+                                 &exp->exp_uncommitted_replies);
+       }
+       spin_unlock(&exp->exp_uncommitted_replies_lock);
+
+       spin_lock(&exp->exp_lock);
+       list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies);
+       spin_unlock(&exp->exp_lock);
+
+       netrc = target_send_reply_msg(req, rc, fail_id);
+
+       spin_lock(&svcpt->scp_rep_lock);
+
+       atomic_inc(&svcpt->scp_nreps_difficult);
+
+       if (netrc != 0) {
+               /* error sending: reply is off the net.  Also we need +1
+                * reply ref until ptlrpc_handle_rs() is done
+                * with the reply state (if the send was successful, there
+                * would have been +1 ref for the net, which
+                * reply_out_callback leaves alone) */
+               rs->rs_on_net = 0;
+               ptlrpc_rs_addref(rs);
+       }
+
+       spin_lock(&rs->rs_lock);
+       if (rs->rs_transno <= exp->exp_last_committed ||
+           (!rs->rs_on_net && !rs->rs_no_ack) ||
+           list_empty(&rs->rs_exp_list) ||     /* completed already */
+           list_empty(&rs->rs_obd_list)) {
+               CDEBUG(D_HA, "Schedule reply immediately\n");
+               ptlrpc_dispatch_difficult_reply(rs);
+       } else {
+               list_add(&rs->rs_list, &svcpt->scp_rep_active);
+               rs->rs_scheduled = 0;   /* allow notifier to schedule */
+       }
+       spin_unlock(&rs->rs_lock);
+       spin_unlock(&svcpt->scp_rep_lock);
+       EXIT;
+}
+EXPORT_SYMBOL(target_send_reply);
+
+ldlm_mode_t lck_compat_array[] = {
+       [LCK_EX] LCK_COMPAT_EX,
+       [LCK_PW] LCK_COMPAT_PW,
+       [LCK_PR] LCK_COMPAT_PR,
+       [LCK_CW] LCK_COMPAT_CW,
+       [LCK_CR] LCK_COMPAT_CR,
+       [LCK_NL] LCK_COMPAT_NL,
+       [LCK_GROUP] LCK_COMPAT_GROUP,
+       [LCK_COS] LCK_COMPAT_COS,
+};
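A hedged sketch of how a table like this is consulted: lock modes are single bits, so compatibility between a granted mode and a requested mode reduces to a bitmask test. The helper name below is hypothetical; only lck_compat_array itself comes from this patch.

        /* Sketch: non-zero means the two modes can coexist on one resource. */
        static inline int ldlm_modes_compat(ldlm_mode_t granted, ldlm_mode_t requested)
        {
                return lck_compat_array[granted] & requested;
        }
        /* e.g. ldlm_modes_compat(LCK_PR, LCK_CR) != 0, while
         *      ldlm_modes_compat(LCK_PW, LCK_PR) == 0. */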
+
+/**
+ * Rather arbitrary mapping from LDLM error codes to errno values. This should
+ * not escape to the user level.
+ */
+int ldlm_error2errno(ldlm_error_t error)
+{
+       int result;
+
+       switch (error) {
+       case ELDLM_OK:
+               result = 0;
+               break;
+       case ELDLM_LOCK_CHANGED:
+               result = -ESTALE;
+               break;
+       case ELDLM_LOCK_ABORTED:
+               result = -ENAVAIL;
+               break;
+       case ELDLM_LOCK_REPLACED:
+               result = -ESRCH;
+               break;
+       case ELDLM_NO_LOCK_DATA:
+               result = -ENOENT;
+               break;
+       case ELDLM_NAMESPACE_EXISTS:
+               result = -EEXIST;
+               break;
+       case ELDLM_BAD_NAMESPACE:
+               result = -EBADF;
+               break;
+       default:
+               if (((int)error) < 0)  /* cast to signed type */
+                       result = error; /* as ldlm_error_t can be unsigned */
+               else {
+                       CERROR("Invalid DLM result code: %d\n", error);
+                       result = -EPROTO;
+               }
+       }
+       return result;
+}
+EXPORT_SYMBOL(ldlm_error2errno);
+
+/**
+ * Dual to ldlm_error2errno(): maps errno values back to ldlm_error_t.
+ */
+ldlm_error_t ldlm_errno2error(int err_no)
+{
+       int error;
+
+       switch (err_no) {
+       case 0:
+               error = ELDLM_OK;
+               break;
+       case -ESTALE:
+               error = ELDLM_LOCK_CHANGED;
+               break;
+       case -ENAVAIL:
+               error = ELDLM_LOCK_ABORTED;
+               break;
+       case -ESRCH:
+               error = ELDLM_LOCK_REPLACED;
+               break;
+       case -ENOENT:
+               error = ELDLM_NO_LOCK_DATA;
+               break;
+       case -EEXIST:
+               error = ELDLM_NAMESPACE_EXISTS;
+               break;
+       case -EBADF:
+               error = ELDLM_BAD_NAMESPACE;
+               break;
+       default:
+               error = err_no;
+       }
+       return error;
+}
+EXPORT_SYMBOL(ldlm_errno2error);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp)
+{
+       spin_lock(&exp->exp_locks_list_guard);
+       if (!list_empty(&exp->exp_locks_list)) {
+               struct ldlm_lock *lock;
+
+               CERROR("dumping locks for export %p, "
+                      "ignore if the unmount doesn't hang\n", exp);
+               list_for_each_entry(lock, &exp->exp_locks_list,
+                                       l_exp_refs_link)
+                       LDLM_ERROR(lock, "lock:");
+       }
+       spin_unlock(&exp->exp_locks_list_guard);
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c
new file mode 100644 (file)
index 0000000..bd39e1c
--- /dev/null
@@ -0,0 +1,2443 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lock.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/lustre_intent.h>
+
+#include <obd_class.h>
+#include "ldlm_internal.h"
+
+/* lock types */
+char *ldlm_lockname[] = {
+       [0] "--",
+       [LCK_EX] "EX",
+       [LCK_PW] "PW",
+       [LCK_PR] "PR",
+       [LCK_CW] "CW",
+       [LCK_CR] "CR",
+       [LCK_NL] "NL",
+       [LCK_GROUP] "GROUP",
+       [LCK_COS] "COS"
+};
+EXPORT_SYMBOL(ldlm_lockname);
+
+char *ldlm_typename[] = {
+       [LDLM_PLAIN] "PLN",
+       [LDLM_EXTENT] "EXT",
+       [LDLM_FLOCK] "FLK",
+       [LDLM_IBITS] "IBT",
+};
+EXPORT_SYMBOL(ldlm_typename);
+
+static ldlm_policy_wire_to_local_t ldlm_policy_wire18_to_local[] = {
+       [LDLM_PLAIN - LDLM_MIN_TYPE] ldlm_plain_policy_wire_to_local,
+       [LDLM_EXTENT - LDLM_MIN_TYPE] ldlm_extent_policy_wire_to_local,
+       [LDLM_FLOCK - LDLM_MIN_TYPE] ldlm_flock_policy_wire18_to_local,
+       [LDLM_IBITS - LDLM_MIN_TYPE] ldlm_ibits_policy_wire_to_local,
+};
+
+static ldlm_policy_wire_to_local_t ldlm_policy_wire21_to_local[] = {
+       [LDLM_PLAIN - LDLM_MIN_TYPE] ldlm_plain_policy_wire_to_local,
+       [LDLM_EXTENT - LDLM_MIN_TYPE] ldlm_extent_policy_wire_to_local,
+       [LDLM_FLOCK - LDLM_MIN_TYPE] ldlm_flock_policy_wire21_to_local,
+       [LDLM_IBITS - LDLM_MIN_TYPE] ldlm_ibits_policy_wire_to_local,
+};
+
+static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = {
+       [LDLM_PLAIN - LDLM_MIN_TYPE] ldlm_plain_policy_local_to_wire,
+       [LDLM_EXTENT - LDLM_MIN_TYPE] ldlm_extent_policy_local_to_wire,
+       [LDLM_FLOCK - LDLM_MIN_TYPE] ldlm_flock_policy_local_to_wire,
+       [LDLM_IBITS - LDLM_MIN_TYPE] ldlm_ibits_policy_local_to_wire,
+};
+
+/**
+ * Converts lock policy from the local format to the on-the-wire lock_desc format
+ */
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+                                const ldlm_policy_data_t *lpolicy,
+                                ldlm_wire_policy_data_t *wpolicy)
+{
+       ldlm_policy_local_to_wire_t convert;
+
+       convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE];
+
+       convert(lpolicy, wpolicy);
+}
+
+/**
+ * Converts lock policy from the on-the-wire lock_desc format to the local format
+ */
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+                                 const ldlm_wire_policy_data_t *wpolicy,
+                                 ldlm_policy_data_t *lpolicy)
+{
+       ldlm_policy_wire_to_local_t convert;
+       int new_client;
+
+       /** some badness for 2.0.0 clients, but 2.0.0 isn't supported */
+       new_client = (exp_connect_flags(exp) & OBD_CONNECT_FULL20) != 0;
+       if (new_client)
+               convert = ldlm_policy_wire21_to_local[type - LDLM_MIN_TYPE];
+       else
+               convert = ldlm_policy_wire18_to_local[type - LDLM_MIN_TYPE];
+
+       convert(wpolicy, lpolicy);
+}
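Concretely, the table-driven dispatch above resolves as follows (explanatory summary only, read off the arrays defined earlier in this file):

        /* Example dispatch:
         *   extent lock, client with OBD_CONNECT_FULL20 set:
         *     ldlm_policy_wire21_to_local[LDLM_EXTENT - LDLM_MIN_TYPE]
         *       -> ldlm_extent_policy_wire_to_local()
         *   flock lock, pre-2.1 client (flag clear):
         *     ldlm_policy_wire18_to_local[LDLM_FLOCK - LDLM_MIN_TYPE]
         *       -> ldlm_flock_policy_wire18_to_local()
         */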
+
+char *ldlm_it2str(int it)
+{
+       switch (it) {
+       case IT_OPEN:
+               return "open";
+       case IT_CREAT:
+               return "creat";
+       case (IT_OPEN | IT_CREAT):
+               return "open|creat";
+       case IT_READDIR:
+               return "readdir";
+       case IT_GETATTR:
+               return "getattr";
+       case IT_LOOKUP:
+               return "lookup";
+       case IT_UNLINK:
+               return "unlink";
+       case IT_GETXATTR:
+               return "getxattr";
+       case IT_LAYOUT:
+               return "layout";
+       default:
+               CERROR("Unknown intent %d\n", it);
+               return "UNKNOWN";
+       }
+}
+EXPORT_SYMBOL(ldlm_it2str);
+
+extern struct kmem_cache *ldlm_lock_slab;
+
+
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg)
+{
+       ns->ns_policy = arg;
+}
+EXPORT_SYMBOL(ldlm_register_intent);
+
+/*
+ * REFCOUNTED LOCK OBJECTS
+ */
+
+
+/**
+ * Get a reference on a lock.
+ *
+ * Lock refcounts, during creation:
+ *   - one special one for allocation, dec'd only once in destroy
+ *   - one for being a lock that's in-use
+ *   - one for the addref associated with a new lock
+ */
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock)
+{
+       atomic_inc(&lock->l_refc);
+       return lock;
+}
+EXPORT_SYMBOL(ldlm_lock_get);
+
+/**
+ * Release lock reference.
+ *
+ * Also frees the lock if this was the last reference.
+ */
+void ldlm_lock_put(struct ldlm_lock *lock)
+{
+       ENTRY;
+
+       LASSERT(lock->l_resource != LP_POISON);
+       LASSERT(atomic_read(&lock->l_refc) > 0);
+       if (atomic_dec_and_test(&lock->l_refc)) {
+               struct ldlm_resource *res;
+
+               LDLM_DEBUG(lock,
+                          "final lock_put on destroyed lock, freeing it.");
+
+               res = lock->l_resource;
+               LASSERT(lock->l_destroyed);
+               LASSERT(list_empty(&lock->l_res_link));
+               LASSERT(list_empty(&lock->l_pending_chain));
+
+               lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats,
+                                    LDLM_NSS_LOCKS);
+               lu_ref_del(&res->lr_reference, "lock", lock);
+               ldlm_resource_putref(res);
+               lock->l_resource = NULL;
+               if (lock->l_export) {
+                       class_export_lock_put(lock->l_export, lock);
+                       lock->l_export = NULL;
+               }
+
+               if (lock->l_lvb_data != NULL)
+                       OBD_FREE(lock->l_lvb_data, lock->l_lvb_len);
+
+               ldlm_interval_free(ldlm_interval_detach(lock));
+               lu_ref_fini(&lock->l_reference);
+               OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle);
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_put);
+
+/**
+ * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked.
+ */
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock)
+{
+       int rc = 0;
+       if (!list_empty(&lock->l_lru)) {
+               struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+               LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+               list_del_init(&lock->l_lru);
+               if (lock->l_flags & LDLM_FL_SKIPPED)
+                       lock->l_flags &= ~LDLM_FL_SKIPPED;
+               LASSERT(ns->ns_nr_unused > 0);
+               ns->ns_nr_unused--;
+               rc = 1;
+       }
+       return rc;
+}
+
+/**
+ * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first.
+ */
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+       int rc;
+
+       ENTRY;
+       if (lock->l_ns_srv) {
+               LASSERT(list_empty(&lock->l_lru));
+               RETURN(0);
+       }
+
+       spin_lock(&ns->ns_lock);
+       rc = ldlm_lock_remove_from_lru_nolock(lock);
+       spin_unlock(&ns->ns_lock);
+       EXIT;
+       return rc;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked.
+ */
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       lock->l_last_used = cfs_time_current();
+       LASSERT(list_empty(&lock->l_lru));
+       LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+       list_add_tail(&lock->l_lru, &ns->ns_unused_list);
+       LASSERT(ns->ns_nr_unused >= 0);
+       ns->ns_nr_unused++;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks
+ * first.
+ */
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       ENTRY;
+       spin_lock(&ns->ns_lock);
+       ldlm_lock_add_to_lru_nolock(lock);
+       spin_unlock(&ns->ns_lock);
+       EXIT;
+}
+
+/**
+ * Moves LDLM lock \a lock that is already in namespace LRU to the tail of
+ * the LRU. Performs necessary LRU locking
+ */
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       ENTRY;
+       if (lock->l_ns_srv) {
+               LASSERT(list_empty(&lock->l_lru));
+               EXIT;
+               return;
+       }
+
+       spin_lock(&ns->ns_lock);
+       if (!list_empty(&lock->l_lru)) {
+               ldlm_lock_remove_from_lru_nolock(lock);
+               ldlm_lock_add_to_lru_nolock(lock);
+       }
+       spin_unlock(&ns->ns_lock);
+       EXIT;
+}
+
+/**
+ * Helper to destroy a locked lock.
+ *
+ * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock
+ * Must be called with l_lock and lr_lock held.
+ *
+ * Does not actually free the lock data, but rather marks the lock as
+ * destroyed by setting the l_destroyed field in the lock to 1.  Also
+ * destroys the handle->lock association, so that the lock can no longer be
+ * found, and removes the lock from the LRU list.  The actual freeing occurs
+ * when the last lock reference goes away.
+ *
+ * Original comment (of some historical value):
+ * This used to have a 'strict' flag, which recovery would use to mark an
+ * in-use lock as needing-to-die.  Lest I am ever tempted to put it back, I
+ * shall explain why it's gone: with the new hash table scheme, once you call
+ * ldlm_lock_destroy, you can never drop your final references on this lock.
+ * Because it's not in the hash table anymore.  -phil
+ */
+int ldlm_lock_destroy_internal(struct ldlm_lock *lock)
+{
+       ENTRY;
+
+       if (lock->l_readers || lock->l_writers) {
+               LDLM_ERROR(lock, "lock still has references");
+               LBUG();
+       }
+
+       if (!list_empty(&lock->l_res_link)) {
+               LDLM_ERROR(lock, "lock still on resource");
+               LBUG();
+       }
+
+       if (lock->l_destroyed) {
+               LASSERT(list_empty(&lock->l_lru));
+               EXIT;
+               return 0;
+       }
+       lock->l_destroyed = 1;
+
+       if (lock->l_export && lock->l_export->exp_lock_hash) {
+               /* NB: it's safe to call cfs_hash_del() even if the lock isn't
+                * in exp_lock_hash. */
+               /* In the function below, .hs_keycmp resolves to
+                * ldlm_export_lock_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               cfs_hash_del(lock->l_export->exp_lock_hash,
+                            &lock->l_remote_handle, &lock->l_exp_hash);
+       }
+
+       ldlm_lock_remove_from_lru(lock);
+       class_handle_unhash(&lock->l_handle);
+
+#if 0
+       /* Wake anyone waiting for this lock */
+       /* FIXME: I should probably add yet another flag, instead of using
+        * l_export to only call this on clients */
+       if (lock->l_export)
+               class_export_put(lock->l_export);
+       lock->l_export = NULL;
+       if (lock->l_export && lock->l_completion_ast)
+               lock->l_completion_ast(lock, 0);
+#endif
+       EXIT;
+       return 1;
+}
+
+/**
+ * Destroys a LDLM lock \a lock. Performs necessary locking first.
+ */
+void ldlm_lock_destroy(struct ldlm_lock *lock)
+{
+       int first;
+       ENTRY;
+       lock_res_and_lock(lock);
+       first = ldlm_lock_destroy_internal(lock);
+       unlock_res_and_lock(lock);
+
+       /* drop reference from hashtable only for first destroy */
+       if (first) {
+               lu_ref_del(&lock->l_reference, "hash", lock);
+               LDLM_LOCK_RELEASE(lock);
+       }
+       EXIT;
+}
+
+/**
+ * Destroys a LDLM lock \a lock that is already locked.
+ */
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock)
+{
+       int first;
+       ENTRY;
+       first = ldlm_lock_destroy_internal(lock);
+       /* drop reference from hashtable only for first destroy */
+       if (first) {
+               lu_ref_del(&lock->l_reference, "hash", lock);
+               LDLM_LOCK_RELEASE(lock);
+       }
+       EXIT;
+}
+
+/* this is called by portals_handle2object with the handle lock taken */
+static void lock_handle_addref(void *lock)
+{
+       LDLM_LOCK_GET((struct ldlm_lock *)lock);
+}
+
+static void lock_handle_free(void *lock, int size)
+{
+       LASSERT(size == sizeof(struct ldlm_lock));
+       OBD_SLAB_FREE(lock, ldlm_lock_slab, size);
+}
+
+struct portals_handle_ops lock_handle_ops = {
+       .hop_addref = lock_handle_addref,
+       .hop_free   = lock_handle_free,
+};
+
+/**
+ * Allocate and initialize a new lock structure.
+ *
+ * Usage: pass in a resource on which you have done ldlm_resource_get();
+ *     the new lock will take over the refcount.
+ * Returns: lock with refcount 2 - one for the current caller and one for remote
+ */
+static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource)
+{
+       struct ldlm_lock *lock;
+       ENTRY;
+
+       if (resource == NULL)
+               LBUG();
+
+       OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, __GFP_IO);
+       if (lock == NULL)
+               RETURN(NULL);
+
+       spin_lock_init(&lock->l_lock);
+       lock->l_resource = resource;
+       lu_ref_add(&resource->lr_reference, "lock", lock);
+
+       atomic_set(&lock->l_refc, 2);
+       INIT_LIST_HEAD(&lock->l_res_link);
+       INIT_LIST_HEAD(&lock->l_lru);
+       INIT_LIST_HEAD(&lock->l_pending_chain);
+       INIT_LIST_HEAD(&lock->l_bl_ast);
+       INIT_LIST_HEAD(&lock->l_cp_ast);
+       INIT_LIST_HEAD(&lock->l_rk_ast);
+       init_waitqueue_head(&lock->l_waitq);
+       lock->l_blocking_lock = NULL;
+       INIT_LIST_HEAD(&lock->l_sl_mode);
+       INIT_LIST_HEAD(&lock->l_sl_policy);
+       INIT_HLIST_NODE(&lock->l_exp_hash);
+       INIT_HLIST_NODE(&lock->l_exp_flock_hash);
+
+       lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats,
+                            LDLM_NSS_LOCKS);
+       INIT_LIST_HEAD(&lock->l_handle.h_link);
+       class_handle_hash(&lock->l_handle, &lock_handle_ops);
+
+       lu_ref_init(&lock->l_reference);
+       lu_ref_add(&lock->l_reference, "hash", lock);
+       lock->l_callback_timeout = 0;
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       INIT_LIST_HEAD(&lock->l_exp_refs_link);
+       lock->l_exp_refs_nr = 0;
+       lock->l_exp_refs_target = NULL;
+#endif
+       INIT_LIST_HEAD(&lock->l_exp_list);
+
+       RETURN(lock);
+}
+
+/**
+ * Moves LDLM lock \a lock to another resource.
+ * This is used on the client when the server returns a lock other than the
+ * one requested (typically as a result of an intent operation).
+ */
+int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+                             const struct ldlm_res_id *new_resid)
+{
+       struct ldlm_resource *oldres = lock->l_resource;
+       struct ldlm_resource *newres;
+       int type;
+       ENTRY;
+
+       LASSERT(ns_is_client(ns));
+
+       lock_res_and_lock(lock);
+       if (memcmp(new_resid, &lock->l_resource->lr_name,
+                  sizeof(lock->l_resource->lr_name)) == 0) {
+               /* Nothing to do */
+               unlock_res_and_lock(lock);
+               RETURN(0);
+       }
+
+       LASSERT(new_resid->name[0] != 0);
+
+       /* This function assumes that the lock isn't on any lists */
+       LASSERT(list_empty(&lock->l_res_link));
+
+       type = oldres->lr_type;
+       unlock_res_and_lock(lock);
+
+       newres = ldlm_resource_get(ns, NULL, new_resid, type, 1);
+       if (newres == NULL)
+               RETURN(-ENOMEM);
+
+       lu_ref_add(&newres->lr_reference, "lock", lock);
+       /*
+        * To flip the lock from the old to the new resource, lock, oldres and
+        * newres have to be locked. Resource spin-locks are nested within
+        * lock->l_lock, and are taken in the memory address order to avoid
+        * dead-locks.
+        */
+       spin_lock(&lock->l_lock);
+       oldres = lock->l_resource;
+       if (oldres < newres) {
+               lock_res(oldres);
+               lock_res_nested(newres, LRT_NEW);
+       } else {
+               lock_res(newres);
+               lock_res_nested(oldres, LRT_NEW);
+       }
+       LASSERT(memcmp(new_resid, &oldres->lr_name,
+                      sizeof oldres->lr_name) != 0);
+       lock->l_resource = newres;
+       unlock_res(oldres);
+       unlock_res_and_lock(lock);
+
+       /* ...and the flowers are still standing! */
+       lu_ref_del(&oldres->lr_reference, "lock", lock);
+       ldlm_resource_putref(oldres);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_lock_change_resource);
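The address-ordered double locking used in ldlm_lock_change_resource() is a generic deadlock-avoidance idiom; a minimal standalone sketch follows (the helper name is hypothetical):

        /* Sketch: always take the lower-addressed resource lock first, so two
         * threads moving locks between the same pair of resources cannot each
         * hold one lock while waiting for the other. */
        static void lock_two_resources(struct ldlm_resource *a, struct ldlm_resource *b)
        {
                if (a < b) {
                        lock_res(a);
                        lock_res_nested(b, LRT_NEW);
                } else {
                        lock_res(b);
                        lock_res_nested(a, LRT_NEW);
                }
        }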
+
+/** \defgroup ldlm_handles LDLM HANDLES
+ * Ways to get hold of locks without any addresses.
+ * @{
+ */
+
+/**
+ * Fills in handle for LDLM lock \a lock into supplied \a lockh
+ * Does not take any references.
+ */
+void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh)
+{
+       lockh->cookie = lock->l_handle.h_cookie;
+}
+EXPORT_SYMBOL(ldlm_lock2handle);
+
+/**
+ * Obtain a lock reference by handle.
+ *
+ * If \a flags is non-zero: atomically get the lock and set the flags.
+ *           Return NULL if any of the flags is already set.
+ */
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle,
+                                    __u64 flags)
+{
+       struct ldlm_lock *lock;
+       ENTRY;
+
+       LASSERT(handle);
+
+       lock = class_handle2object(handle->cookie);
+       if (lock == NULL)
+               RETURN(NULL);
+
+       /* It's unlikely but possible that someone marked the lock as
+        * destroyed after we did handle2object on it */
+       if (flags == 0 && !lock->l_destroyed) {
+               lu_ref_add(&lock->l_reference, "handle", current);
+               RETURN(lock);
+       }
+
+       lock_res_and_lock(lock);
+
+       LASSERT(lock->l_resource != NULL);
+
+       lu_ref_add_atomic(&lock->l_reference, "handle", current);
+       if (unlikely(lock->l_destroyed)) {
+               unlock_res_and_lock(lock);
+               CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock);
+               LDLM_LOCK_PUT(lock);
+               RETURN(NULL);
+       }
+
+       if (flags && (lock->l_flags & flags)) {
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_PUT(lock);
+               RETURN(NULL);
+       }
+
+       if (flags)
+               lock->l_flags |= flags;
+
+       unlock_res_and_lock(lock);
+       RETURN(lock);
+}
+EXPORT_SYMBOL(__ldlm_handle2lock);
+/** @} ldlm_handles */
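A hedged sketch of the handle round-trip these helpers provide; the variable names are illustrative, and the zero-flags lookup mirrors the flags == 0 path above:

        /* Publish a lock as an opaque handle, then look it up again later.
         * The lookup takes a reference, so it must be balanced with LDLM_LOCK_PUT(). */
        struct lustre_handle lockh;
        struct ldlm_lock *found;

        ldlm_lock2handle(lock, &lockh);           /* no reference taken          */
        found = __ldlm_handle2lock(&lockh, 0);    /* NULL if already destroyed   */
        if (found != NULL) {
                /* ... use the lock ... */
                LDLM_LOCK_PUT(found);
        }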
+
+/**
+ * Fill in "on the wire" representation for given LDLM lock into supplied
+ * lock descriptor \a desc structure.
+ */
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc)
+{
+       struct obd_export *exp = lock->l_export ?: lock->l_conn_export;
+
+       /* INODEBITS_INTEROP: If the other side does not support
+        * inodebits, reply with a plain lock descriptor. */
+       if ((lock->l_resource->lr_type == LDLM_IBITS) &&
+           (exp && !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) {
+               /* Make sure all the right bits are set in this lock we
+                  are going to pass to client */
+               LASSERTF(lock->l_policy_data.l_inodebits.bits ==
+                        (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+                         MDS_INODELOCK_LAYOUT),
+                        "Inappropriate inode lock bits during "
+                        "conversion " LPU64 "\n",
+                        lock->l_policy_data.l_inodebits.bits);
+
+               ldlm_res2desc(lock->l_resource, &desc->l_resource);
+               desc->l_resource.lr_type = LDLM_PLAIN;
+
+               /* Convert "new" lock mode to something old client can
+                  understand */
+               if ((lock->l_req_mode == LCK_CR) ||
+                   (lock->l_req_mode == LCK_CW))
+                       desc->l_req_mode = LCK_PR;
+               else
+                       desc->l_req_mode = lock->l_req_mode;
+               if ((lock->l_granted_mode == LCK_CR) ||
+                   (lock->l_granted_mode == LCK_CW)) {
+                       desc->l_granted_mode = LCK_PR;
+               } else {
+                       /* We never grant PW/EX locks to clients */
+                       LASSERT((lock->l_granted_mode != LCK_PW) &&
+                               (lock->l_granted_mode != LCK_EX));
+                       desc->l_granted_mode = lock->l_granted_mode;
+               }
+
+               /* We do not copy policy here, because there is no
+                  policy for plain locks */
+       } else {
+               ldlm_res2desc(lock->l_resource, &desc->l_resource);
+               desc->l_req_mode = lock->l_req_mode;
+               desc->l_granted_mode = lock->l_granted_mode;
+               ldlm_convert_policy_to_wire(lock->l_resource->lr_type,
+                                           &lock->l_policy_data,
+                                           &desc->l_policy_data);
+       }
+}
+EXPORT_SYMBOL(ldlm_lock2desc);
+
+/**
+ * Add a lock to list of conflicting locks to send AST to.
+ *
+ * Only add if we have not sent a blocking AST to the lock yet.
+ */
+void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+                          struct list_head *work_list)
+{
+       if ((lock->l_flags & LDLM_FL_AST_SENT) == 0) {
+               LDLM_DEBUG(lock, "lock incompatible; sending blocking AST.");
+               lock->l_flags |= LDLM_FL_AST_SENT;
+               /* If the enqueuing client said so, tell the AST recipient to
+                * discard dirty data, rather than writing back. */
+               if (new->l_flags & LDLM_AST_DISCARD_DATA)
+                       lock->l_flags |= LDLM_FL_DISCARD_DATA;
+               LASSERT(list_empty(&lock->l_bl_ast));
+               list_add(&lock->l_bl_ast, work_list);
+               LDLM_LOCK_GET(lock);
+               LASSERT(lock->l_blocking_lock == NULL);
+               lock->l_blocking_lock = LDLM_LOCK_GET(new);
+       }
+}
+
+/**
+ * Add a lock to list of just granted locks to send completion AST to.
+ */
+void ldlm_add_cp_work_item(struct ldlm_lock *lock, struct list_head *work_list)
+{
+       if ((lock->l_flags & LDLM_FL_CP_REQD) == 0) {
+               lock->l_flags |= LDLM_FL_CP_REQD;
+               LDLM_DEBUG(lock, "lock granted; sending completion AST.");
+               LASSERT(list_empty(&lock->l_cp_ast));
+               list_add(&lock->l_cp_ast, work_list);
+               LDLM_LOCK_GET(lock);
+       }
+}
+
+/**
+ * Aggregator function to add AST work items into a list. Determines
+ * what sort of an AST work needs to be done and calls the proper
+ * adding function.
+ * Must be called with lr_lock held.
+ */
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+                           struct list_head *work_list)
+{
+       ENTRY;
+       check_res_locked(lock->l_resource);
+       if (new)
+               ldlm_add_bl_work_item(lock, new, work_list);
+       else
+               ldlm_add_cp_work_item(lock, work_list);
+       EXIT;
+}
+
+/**
+ * Add specified reader/writer reference to LDLM lock with handle \a lockh.
+ * r/w reference type is determined by \a mode
+ * Calls ldlm_lock_addref_internal.
+ */
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode)
+{
+       struct ldlm_lock *lock;
+
+       lock = ldlm_handle2lock(lockh);
+       LASSERT(lock != NULL);
+       ldlm_lock_addref_internal(lock, mode);
+       LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_addref);
+
+/**
+ * Helper function.
+ * Add specified reader/writer reference to LDLM lock \a lock.
+ * r/w reference type is determined by \a mode
+ * Removes lock from LRU if it is there.
+ * Assumes the LDLM lock is already locked.
+ */
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+       ldlm_lock_remove_from_lru(lock);
+       if (mode & (LCK_NL | LCK_CR | LCK_PR)) {
+               lock->l_readers++;
+               lu_ref_add_atomic(&lock->l_reference, "reader", lock);
+       }
+       if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) {
+               lock->l_writers++;
+               lu_ref_add_atomic(&lock->l_reference, "writer", lock);
+       }
+       LDLM_LOCK_GET(lock);
+       lu_ref_add_atomic(&lock->l_reference, "user", lock);
+       LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]);
+}
+
+/**
+ * Attempts to add reader/writer reference to a lock with handle \a lockh, and
+ * fails if lock is already LDLM_FL_CBPENDING or destroyed.
+ *
+ * \retval 0 success, lock was addref-ed
+ *
+ * \retval -EAGAIN lock is being canceled.
+ */
+int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode)
+{
+       struct ldlm_lock *lock;
+       int            result;
+
+       result = -EAGAIN;
+       lock = ldlm_handle2lock(lockh);
+       if (lock != NULL) {
+               lock_res_and_lock(lock);
+               if (lock->l_readers != 0 || lock->l_writers != 0 ||
+                   !(lock->l_flags & LDLM_FL_CBPENDING)) {
+                       ldlm_lock_addref_internal_nolock(lock, mode);
+                       result = 0;
+               }
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_PUT(lock);
+       }
+       return result;
+}
+EXPORT_SYMBOL(ldlm_lock_addref_try);
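An illustrative (hypothetical) caller of the try-addref primitive above, e.g. when probing a cached lock handle:

        /* Sketch: opportunistically reuse a cached handle; fall back to a fresh
         * enqueue when the lock is already being cancelled. */
        if (ldlm_lock_addref_try(&lockh, LCK_PR) == 0) {
                /* reference held; the lock is usable until ldlm_lock_decref() */
        } else {
                /* -EAGAIN: lock is CBPENDING or destroyed, enqueue a new one */
        }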
+
+/**
+ * Add specified reader/writer reference to LDLM lock \a lock.
+ * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work.
+ * Only called for local locks.
+ */
+void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+       lock_res_and_lock(lock);
+       ldlm_lock_addref_internal_nolock(lock, mode);
+       unlock_res_and_lock(lock);
+}
+
+/**
+ * Removes reader/writer reference for LDLM lock \a lock.
+ * Assumes LDLM lock is already locked.
+ * Only called in ldlm_flock_destroy and for local locks.
+ * Does NOT add the lock to the LRU if no r/w references are left, to
+ * accommodate flock locks that cannot be placed in the LRU.
+ */
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+       LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+       if (mode & (LCK_NL | LCK_CR | LCK_PR)) {
+               LASSERT(lock->l_readers > 0);
+               lu_ref_del(&lock->l_reference, "reader", lock);
+               lock->l_readers--;
+       }
+       if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) {
+               LASSERT(lock->l_writers > 0);
+               lu_ref_del(&lock->l_reference, "writer", lock);
+               lock->l_writers--;
+       }
+
+       lu_ref_del(&lock->l_reference, "user", lock);
+       LDLM_LOCK_RELEASE(lock);    /* matches the LDLM_LOCK_GET() in addref */
+}
+
+/**
+ * Removes reader/writer reference for LDLM lock \a lock.
+ * Locks LDLM lock first.
+ * If the lock is a client lock, its r/w refcount drops to zero and the lock
+ * is not blocked, the lock is added to the LRU list of the namespace.
+ * For blocked LDLM locks, if the r/w count drops to zero, blocking_ast is
+ * called.
+ */
+void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+       struct ldlm_namespace *ns;
+       ENTRY;
+
+       lock_res_and_lock(lock);
+
+       ns = ldlm_lock_to_ns(lock);
+
+       ldlm_lock_decref_internal_nolock(lock, mode);
+
+       /* release lvb data for layout lock */
+       if (ns_is_client(ns) && !lock->l_readers && !lock->l_writers &&
+           ldlm_has_layout(lock) && lock->l_flags & LDLM_FL_LVB_READY) {
+               /* this is the last user of a layout lock and stripe has
+                * been set up, lvb is no longer used.
+                * This may be a large amount of memory, so we should free it
+                * when possible. */
+               if (lock->l_lvb_data != NULL) {
+                       OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
+                       lock->l_lvb_data = NULL;
+                       lock->l_lvb_len = 0;
+               }
+       }
+
+       if (lock->l_flags & LDLM_FL_LOCAL &&
+           !lock->l_readers && !lock->l_writers) {
+               /* If this is a local lock on a server namespace and this was
+                * the last reference, cancel the lock. */
+               CDEBUG(D_INFO, "forcing cancel of local lock\n");
+               lock->l_flags |= LDLM_FL_CBPENDING;
+       }
+
+       if (!lock->l_readers && !lock->l_writers &&
+           (lock->l_flags & LDLM_FL_CBPENDING)) {
+               /* If we received a blocking AST and this was the last
+                * reference, run the callback. */
+               if (lock->l_ns_srv && lock->l_export)
+                       CERROR("FL_CBPENDING set on non-local lock--just a "
+                              "warning\n");
+
+               LDLM_DEBUG(lock, "final decref done on cbpending lock");
+
+               LDLM_LOCK_GET(lock); /* dropped by bl thread */
+               ldlm_lock_remove_from_lru(lock);
+               unlock_res_and_lock(lock);
+
+               if (lock->l_flags & LDLM_FL_FAIL_LOC)
+                       OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+               if ((lock->l_flags & LDLM_FL_ATOMIC_CB) ||
+                   ldlm_bl_to_thread_lock(ns, NULL, lock) != 0)
+                       ldlm_handle_bl_callback(ns, NULL, lock);
+       } else if (ns_is_client(ns) &&
+                  !lock->l_readers && !lock->l_writers &&
+                  !(lock->l_flags & LDLM_FL_NO_LRU) &&
+                  !(lock->l_flags & LDLM_FL_BL_AST)) {
+
+               LDLM_DEBUG(lock, "add lock into lru list");
+
+               /* If this is a client-side namespace and this was the last
+                * reference, put it on the LRU. */
+               ldlm_lock_add_to_lru(lock);
+               unlock_res_and_lock(lock);
+
+               if (lock->l_flags & LDLM_FL_FAIL_LOC)
+                       OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+               /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE
+                * are not supported by the server, otherwise, it is done on
+                * enqueue. */
+               if (!exp_connect_cancelset(lock->l_conn_export) &&
+                   !ns_connect_lru_resize(ns))
+                       ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0);
+       } else {
+               LDLM_DEBUG(lock, "do not add lock into lru list");
+               unlock_res_and_lock(lock);
+       }
+
+       EXIT;
+}
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle \a lockh
+ */
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode)
+{
+       struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+       LASSERTF(lock != NULL, "Non-existing lock: "LPX64"\n", lockh->cookie);
+       ldlm_lock_decref_internal(lock, mode);
+       LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref);
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle
+ * \a lockh and mark it for subsequent cancellation once r/w refcount
+ * drops to zero instead of putting into LRU.
+ *
+ * Typical usage is for GROUP locks which we cannot allow to be cached.
+ */
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode)
+{
+       struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+       ENTRY;
+
+       LASSERT(lock != NULL);
+
+       LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+       lock_res_and_lock(lock);
+       lock->l_flags |= LDLM_FL_CBPENDING;
+       unlock_res_and_lock(lock);
+       ldlm_lock_decref_internal(lock, mode);
+       LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref_and_cancel);
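
Illustrative only: for modes that must not be cached (the comment above names GROUP locks), the final reference is dropped with the cancelling variant instead of plain ldlm_lock_decref(). The helper below is a hypothetical caller, not code from this patch.

        /* Hypothetical: release a GROUP lock and ensure it is not cached. */
        static void example_release_group_lock(struct lustre_handle *lockh)
        {
                /* Marks the lock LDLM_FL_CBPENDING before the decref, so it
                 * is cancelled once the r/w refcount reaches zero rather
                 * than being parked on the LRU. */
                ldlm_lock_decref_and_cancel(lockh, LCK_GROUP);
        }
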
+
+struct sl_insert_point {
+       struct list_head *res_link;
+       struct list_head *mode_link;
+       struct list_head *policy_link;
+};
+
+/**
+ * Finds a position to insert the new lock into granted lock list.
+ *
+ * Used for locks eligible for skiplist optimization.
+ *
+ * Parameters:
+ *      queue [input]:  the granted list where search acts on;
+ *      req [input]:    the lock whose position to be located;
+ *      prev [output]:  positions within 3 lists to insert @req to
+ * Return Value:
+ *      filled @prev
+ * NOTE: called by
+ *  - ldlm_grant_lock_with_skiplist
+ */
+static void search_granted_lock(struct list_head *queue,
+                               struct ldlm_lock *req,
+                               struct sl_insert_point *prev)
+{
+       struct list_head *tmp;
+       struct ldlm_lock *lock, *mode_end, *policy_end;
+       ENTRY;
+
+       list_for_each(tmp, queue) {
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               mode_end = list_entry(lock->l_sl_mode.prev,
+                                         struct ldlm_lock, l_sl_mode);
+
+               if (lock->l_req_mode != req->l_req_mode) {
+                       /* jump to last lock of mode group */
+                       tmp = &mode_end->l_res_link;
+                       continue;
+               }
+
+               /* suitable mode group is found */
+               if (lock->l_resource->lr_type == LDLM_PLAIN) {
+                       /* insert point is last lock of the mode group */
+                       prev->res_link = &mode_end->l_res_link;
+                       prev->mode_link = &mode_end->l_sl_mode;
+                       prev->policy_link = &req->l_sl_policy;
+                       EXIT;
+                       return;
+               } else if (lock->l_resource->lr_type == LDLM_IBITS) {
+                       for (;;) {
+                               policy_end =
+                                       list_entry(lock->l_sl_policy.prev,
+                                                      struct ldlm_lock,
+                                                      l_sl_policy);
+
+                               if (lock->l_policy_data.l_inodebits.bits ==
+                                   req->l_policy_data.l_inodebits.bits) {
+                                       /* insert point is last lock of
+                                        * the policy group */
+                                       prev->res_link =
+                                               &policy_end->l_res_link;
+                                       prev->mode_link =
+                                               &policy_end->l_sl_mode;
+                                       prev->policy_link =
+                                               &policy_end->l_sl_policy;
+                                       EXIT;
+                                       return;
+                               }
+
+                               if (policy_end == mode_end)
+                                       /* done with mode group */
+                                       break;
+
+                               /* go to next policy group within mode group */
+                               tmp = policy_end->l_res_link.next;
+                               lock = list_entry(tmp, struct ldlm_lock,
+                                                     l_res_link);
+                       }  /* loop over policy groups within the mode group */
+
+                       /* insert point is last lock of the mode group,
+                        * new policy group is started */
+                       prev->res_link = &mode_end->l_res_link;
+                       prev->mode_link = &mode_end->l_sl_mode;
+                       prev->policy_link = &req->l_sl_policy;
+                       EXIT;
+                       return;
+               } else {
+                       LDLM_ERROR(lock,"is not LDLM_PLAIN or LDLM_IBITS lock");
+                       LBUG();
+               }
+       }
+
+       /* insert point is last lock on the queue,
+        * new mode group and new policy group are started */
+       prev->res_link = queue->prev;
+       prev->mode_link = &req->l_sl_mode;
+       prev->policy_link = &req->l_sl_policy;
+       EXIT;
+       return;
+}
+
+/**
+ * Add a lock into resource granted list after a position described by
+ * \a prev.
+ */
+static void ldlm_granted_list_add_lock(struct ldlm_lock *lock,
+                                      struct sl_insert_point *prev)
+{
+       struct ldlm_resource *res = lock->l_resource;
+       ENTRY;
+
+       check_res_locked(res);
+
+       ldlm_resource_dump(D_INFO, res);
+       LDLM_DEBUG(lock, "About to add lock:");
+
+       if (lock->l_destroyed) {
+               CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+               return;
+       }
+
+       LASSERT(list_empty(&lock->l_res_link));
+       LASSERT(list_empty(&lock->l_sl_mode));
+       LASSERT(list_empty(&lock->l_sl_policy));
+
+       /*
+        * lock->link == prev->link means lock is the first lock in the group.
+        * Don't re-add it to itself, to suppress kernel warnings.
+        */
+       if (&lock->l_res_link != prev->res_link)
+               list_add(&lock->l_res_link, prev->res_link);
+       if (&lock->l_sl_mode != prev->mode_link)
+               list_add(&lock->l_sl_mode, prev->mode_link);
+       if (&lock->l_sl_policy != prev->policy_link)
+               list_add(&lock->l_sl_policy, prev->policy_link);
+
+       EXIT;
+}
+
+/**
+ * Add a lock to granted list on a resource maintaining skiplist
+ * correctness.
+ */
+static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
+{
+       struct sl_insert_point prev;
+       ENTRY;
+
+       LASSERT(lock->l_req_mode == lock->l_granted_mode);
+
+       search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
+       ldlm_granted_list_add_lock(lock, &prev);
+       EXIT;
+}
+
+/**
+ * Perform lock granting bookkeeping.
+ *
+ * Includes putting the lock into granted list and updating lock mode.
+ * NOTE: called by
+ *  - ldlm_lock_enqueue
+ *  - ldlm_reprocess_queue
+ *  - ldlm_lock_convert
+ *
+ * must be called with lr_lock held
+ */
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
+{
+       struct ldlm_resource *res = lock->l_resource;
+       ENTRY;
+
+       check_res_locked(res);
+
+       lock->l_granted_mode = lock->l_req_mode;
+       if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS)
+               ldlm_grant_lock_with_skiplist(lock);
+       else if (res->lr_type == LDLM_EXTENT)
+               ldlm_extent_add_lock(res, lock);
+       else
+               ldlm_resource_add_lock(res, &res->lr_granted, lock);
+
+       if (lock->l_granted_mode < res->lr_most_restr)
+               res->lr_most_restr = lock->l_granted_mode;
+
+       if (work_list && lock->l_completion_ast != NULL)
+               ldlm_add_ast_work_item(lock, NULL, work_list);
+
+       ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock);
+       EXIT;
+}
+
+/**
+ * Search for a lock with given properties in a queue.
+ *
+ * \retval a referenced lock or NULL.  See the flag descriptions in the
+ * comment above ldlm_lock_match() below.
+ */
+static struct ldlm_lock *search_queue(struct list_head *queue,
+                                     ldlm_mode_t *mode,
+                                     ldlm_policy_data_t *policy,
+                                     struct ldlm_lock *old_lock,
+                                     __u64 flags, int unref)
+{
+       struct ldlm_lock *lock;
+       struct list_head       *tmp;
+
+       list_for_each(tmp, queue) {
+               ldlm_mode_t match;
+
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (lock == old_lock)
+                       break;
+
+               /* llite sometimes wants to match locks that will be
+                * canceled when their users drop, but we allow it to match
+                * if it passes in CBPENDING and the lock still has users.
+                * this is generally only going to be used by children
+                * whose parents already hold a lock so forward progress
+                * can still happen. */
+               if (lock->l_flags & LDLM_FL_CBPENDING &&
+                   !(flags & LDLM_FL_CBPENDING))
+                       continue;
+               if (!unref && lock->l_flags & LDLM_FL_CBPENDING &&
+                   lock->l_readers == 0 && lock->l_writers == 0)
+                       continue;
+
+               if (!(lock->l_req_mode & *mode))
+                       continue;
+               match = lock->l_req_mode;
+
+               if (lock->l_resource->lr_type == LDLM_EXTENT &&
+                   (lock->l_policy_data.l_extent.start >
+                    policy->l_extent.start ||
+                    lock->l_policy_data.l_extent.end < policy->l_extent.end))
+                       continue;
+
+               if (unlikely(match == LCK_GROUP) &&
+                   lock->l_resource->lr_type == LDLM_EXTENT &&
+                   lock->l_policy_data.l_extent.gid != policy->l_extent.gid)
+                       continue;
+
+               /* We match if we have an existing lock with the same or a
+                * wider set of bits. */
+               if (lock->l_resource->lr_type == LDLM_IBITS &&
+                    ((lock->l_policy_data.l_inodebits.bits &
+                     policy->l_inodebits.bits) !=
+                     policy->l_inodebits.bits))
+                       continue;
+
+               if (!unref &&
+                   (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED ||
+                    lock->l_failed))
+                       continue;
+
+               if ((flags & LDLM_FL_LOCAL_ONLY) &&
+                   !(lock->l_flags & LDLM_FL_LOCAL))
+                       continue;
+
+               if (flags & LDLM_FL_TEST_LOCK) {
+                       LDLM_LOCK_GET(lock);
+                       ldlm_lock_touch_in_lru(lock);
+               } else {
+                       ldlm_lock_addref_internal_nolock(lock, match);
+               }
+               *mode = match;
+               return lock;
+       }
+
+       return NULL;
+}
+
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock)
+{
+       if (!lock->l_failed) {
+               lock->l_failed = 1;
+               wake_up_all(&lock->l_waitq);
+       }
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match_locked);
+
+void ldlm_lock_fail_match(struct ldlm_lock *lock)
+{
+       lock_res_and_lock(lock);
+       ldlm_lock_fail_match_locked(lock);
+       unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match);
+
+/**
+ * Mark lock as "matchable" by OST.
+ *
+ * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB
+ * is not yet valid.
+ * Assumes LDLM lock is already locked.
+ */
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock)
+{
+       lock->l_flags |= LDLM_FL_LVB_READY;
+       wake_up_all(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match_locked);
+
+/**
+ * Mark lock as "matchable" by OST.
+ * Locks the lock and then \see ldlm_lock_allow_match_locked
+ */
+void ldlm_lock_allow_match(struct ldlm_lock *lock)
+{
+       lock_res_and_lock(lock);
+       ldlm_lock_allow_match_locked(lock);
+       unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match);
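
A hedged sketch of the pattern the comment describes: the enqueue completion path publishes the LVB first and only then marks the lock matchable, so concurrent ldlm_lock_match(..., LDLM_FL_LVB_READY, ...) callers never see a half-initialized LVB. The wrapper function is invented for illustration; only ldlm_lock_allow_match() comes from the code above.

        /* Hypothetical completion path: publish LVB, then allow matching. */
        static void example_finish_enqueue(struct ldlm_lock *lock)
        {
                /* ... copy the server-provided LVB into lock->l_lvb_data ... */

                /* Wake up waiters in ldlm_lock_match() that are blocked on
                 * LDLM_FL_LVB_READY. */
                ldlm_lock_allow_match(lock);
        }
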
+
+/**
+ * Attempt to find a lock with specified properties.
+ *
+ * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is
+ * set in \a flags
+ *
+ * Can be called in two ways:
+ *
+ * If 'ns' is NULL, then lockh describes an existing lock that we want to look
+ * for a duplicate of.
+ *
+ * Otherwise, all of the fields must be filled in, to match against.
+ *
+ * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the
+ *     server (ie, connh is NULL)
+ * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted
+ *     list will be considered
+ * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked
+ *     to be canceled can still be matched as long as they still have reader
+ *     or writer references
+ * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock,
+ *     just tell us if we would have matched.
+ *
+ * \retval 1 if it finds an already-existing lock that is compatible; in this
+ * case, lockh is filled in with an addref()ed lock
+ *
+ * We also check security context, and if that fails we simply return 0 (to
+ * keep caller code unchanged), the context failure will be discovered by
+ * caller sometime later.
+ */
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+                           const struct ldlm_res_id *res_id, ldlm_type_t type,
+                           ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                           struct lustre_handle *lockh, int unref)
+{
+       struct ldlm_resource *res;
+       struct ldlm_lock *lock, *old_lock = NULL;
+       int rc = 0;
+       ENTRY;
+
+       if (ns == NULL) {
+               old_lock = ldlm_handle2lock(lockh);
+               LASSERT(old_lock);
+
+               ns = ldlm_lock_to_ns(old_lock);
+               res_id = &old_lock->l_resource->lr_name;
+               type = old_lock->l_resource->lr_type;
+               mode = old_lock->l_req_mode;
+       }
+
+       res = ldlm_resource_get(ns, NULL, res_id, type, 0);
+       if (res == NULL) {
+               LASSERT(old_lock == NULL);
+               RETURN(0);
+       }
+
+       LDLM_RESOURCE_ADDREF(res);
+       lock_res(res);
+
+       lock = search_queue(&res->lr_granted, &mode, policy, old_lock,
+                           flags, unref);
+       if (lock != NULL)
+               GOTO(out, rc = 1);
+       if (flags & LDLM_FL_BLOCK_GRANTED)
+               GOTO(out, rc = 0);
+       lock = search_queue(&res->lr_converting, &mode, policy, old_lock,
+                           flags, unref);
+       if (lock != NULL)
+               GOTO(out, rc = 1);
+       lock = search_queue(&res->lr_waiting, &mode, policy, old_lock,
+                           flags, unref);
+       if (lock != NULL)
+               GOTO(out, rc = 1);
+
+       EXIT;
+ out:
+       unlock_res(res);
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+
+       if (lock) {
+               ldlm_lock2handle(lock, lockh);
+               if ((flags & LDLM_FL_LVB_READY) &&
+                   (!(lock->l_flags & LDLM_FL_LVB_READY))) {
+                       struct l_wait_info lwi;
+                       if (lock->l_completion_ast) {
+                               int err = lock->l_completion_ast(lock,
+                                                         LDLM_FL_WAIT_NOREPROC,
+                                                                NULL);
+                               if (err) {
+                                       if (flags & LDLM_FL_TEST_LOCK)
+                                               LDLM_LOCK_RELEASE(lock);
+                                       else
+                                               ldlm_lock_decref_internal(lock,
+                                                                         mode);
+                                       rc = 0;
+                                       goto out2;
+                               }
+                       }
+
+                       lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+                                              NULL, LWI_ON_SIGNAL_NOOP, NULL);
+
+                       /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */
+                       l_wait_event(lock->l_waitq,
+                                    lock->l_flags & LDLM_FL_LVB_READY ||
+                                    lock->l_destroyed || lock->l_failed,
+                                    &lwi);
+                       if (!(lock->l_flags & LDLM_FL_LVB_READY)) {
+                               if (flags & LDLM_FL_TEST_LOCK)
+                                       LDLM_LOCK_RELEASE(lock);
+                               else
+                                       ldlm_lock_decref_internal(lock, mode);
+                               rc = 0;
+                       }
+               }
+       }
+ out2:
+       if (rc) {
+               LDLM_DEBUG(lock, "matched ("LPU64" "LPU64")",
+                          (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                               res_id->name[2] : policy->l_extent.start,
+                          (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                               res_id->name[3] : policy->l_extent.end);
+
+               /* check user's security context */
+               if (lock->l_conn_export &&
+                   sptlrpc_import_check_ctx(
+                               class_exp2cliimp(lock->l_conn_export))) {
+                       if (!(flags & LDLM_FL_TEST_LOCK))
+                               ldlm_lock_decref_internal(lock, mode);
+                       rc = 0;
+               }
+
+               if (flags & LDLM_FL_TEST_LOCK)
+                       LDLM_LOCK_RELEASE(lock);
+
+       } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/
+               LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res "
+                                 LPU64"/"LPU64" ("LPU64" "LPU64")", ns,
+                                 type, mode, res_id->name[0], res_id->name[1],
+                                 (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                                       res_id->name[2] :policy->l_extent.start,
+                                 (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                                       res_id->name[3] : policy->l_extent.end);
+       }
+       if (old_lock)
+               LDLM_LOCK_PUT(old_lock);
+
+       return rc ? mode : 0;
+}
+EXPORT_SYMBOL(ldlm_lock_match);
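
A hedged sketch of a match call for an inodebits lock. The namespace pointer, resource id and the single placeholder bit are assumptions for illustration; the function names, flags, types and the addref/decref balancing follow the code above.

        /* Hypothetical: look for an already granted PR or PW IBITS lock. */
        static ldlm_mode_t example_match(struct ldlm_namespace *ns,
                                         const struct ldlm_res_id *res_id,
                                         struct lustre_handle *lockh)
        {
                ldlm_policy_data_t policy = {
                        .l_inodebits = { .bits = 1ULL << 0 }    /* placeholder bit */
                };
                ldlm_mode_t mode;

                /* LDLM_FL_LVB_READY makes the call wait until the LVB is
                 * valid; on success the matched mode is returned and *lockh
                 * references the lock, already addref-ed for that mode. */
                mode = ldlm_lock_match(ns, LDLM_FL_LVB_READY, res_id,
                                       LDLM_IBITS, &policy, LCK_PR | LCK_PW,
                                       lockh, 0);
                if (mode != 0) {
                        /* ... use the lock ... */
                        ldlm_lock_decref(lockh, mode);
                }
                return mode;
        }
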
+
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+                                       __u64 *bits)
+{
+       struct ldlm_lock *lock;
+       ldlm_mode_t mode = 0;
+       ENTRY;
+
+       lock = ldlm_handle2lock(lockh);
+       if (lock != NULL) {
+               lock_res_and_lock(lock);
+               if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED ||
+                   lock->l_failed)
+                       GOTO(out, mode);
+
+               if (lock->l_flags & LDLM_FL_CBPENDING &&
+                   lock->l_readers == 0 && lock->l_writers == 0)
+                       GOTO(out, mode);
+
+               if (bits)
+                       *bits = lock->l_policy_data.l_inodebits.bits;
+               mode = lock->l_granted_mode;
+               ldlm_lock_addref_internal_nolock(lock, mode);
+       }
+
+       EXIT;
+
+out:
+       if (lock != NULL) {
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_PUT(lock);
+       }
+       return mode;
+}
+EXPORT_SYMBOL(ldlm_revalidate_lock_handle);
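
Hedged sketch: revalidation returns the granted mode (and optionally the inodebits) and takes a reference in that mode, so a successful return must be balanced with a decref. Everything except the two exported functions used below is assumed for illustration.

        /* Hypothetical: check whether a cached handle still covers bits. */
        static int example_revalidate(struct lustre_handle *lockh)
        {
                __u64 bits = 0;
                ldlm_mode_t mode;

                mode = ldlm_revalidate_lock_handle(lockh, &bits);
                if (mode == 0)
                        return 0;       /* lock gone, cancelled or failed */

                /* ... compare 'bits' against what the caller needs ... */

                ldlm_lock_decref(lockh, mode);
                return 1;
        }
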
+
+/** The caller must guarantee that the buffer is large enough. */
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+                 enum req_location loc, void *data, int size)
+{
+       void *lvb;
+       ENTRY;
+
+       LASSERT(data != NULL);
+       LASSERT(size >= 0);
+
+       switch (lock->l_lvb_type) {
+       case LVB_T_OST:
+               if (size == sizeof(struct ost_lvb)) {
+                       if (loc == RCL_CLIENT)
+                               lvb = req_capsule_client_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_ost_lvb);
+                       else
+                               lvb = req_capsule_server_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_ost_lvb);
+                       if (unlikely(lvb == NULL)) {
+                               LDLM_ERROR(lock, "no LVB");
+                               RETURN(-EPROTO);
+                       }
+
+                       memcpy(data, lvb, size);
+               } else if (size == sizeof(struct ost_lvb_v1)) {
+                       struct ost_lvb *olvb = data;
+
+                       if (loc == RCL_CLIENT)
+                               lvb = req_capsule_client_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_ost_lvb_v1);
+                       else
+                               lvb = req_capsule_server_sized_swab_get(pill,
+                                               &RMF_DLM_LVB, size,
+                                               lustre_swab_ost_lvb_v1);
+                       if (unlikely(lvb == NULL)) {
+                               LDLM_ERROR(lock, "no LVB");
+                               RETURN(-EPROTO);
+                       }
+
+                       memcpy(data, lvb, size);
+                       olvb->lvb_mtime_ns = 0;
+                       olvb->lvb_atime_ns = 0;
+                       olvb->lvb_ctime_ns = 0;
+               } else {
+                       LDLM_ERROR(lock, "Replied unexpected ost LVB size %d",
+                                  size);
+                       RETURN(-EINVAL);
+               }
+               break;
+       case LVB_T_LQUOTA:
+               if (size == sizeof(struct lquota_lvb)) {
+                       if (loc == RCL_CLIENT)
+                               lvb = req_capsule_client_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_lquota_lvb);
+                       else
+                               lvb = req_capsule_server_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_lquota_lvb);
+                       if (unlikely(lvb == NULL)) {
+                               LDLM_ERROR(lock, "no LVB");
+                               RETURN(-EPROTO);
+                       }
+
+                       memcpy(data, lvb, size);
+               } else {
+                       LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d",
+                                  size);
+                       RETURN(-EINVAL);
+               }
+               break;
+       case LVB_T_LAYOUT:
+               if (size == 0)
+                       break;
+
+               if (loc == RCL_CLIENT)
+                       lvb = req_capsule_client_get(pill, &RMF_DLM_LVB);
+               else
+                       lvb = req_capsule_server_get(pill, &RMF_DLM_LVB);
+               if (unlikely(lvb == NULL)) {
+                       LDLM_ERROR(lock, "no LVB");
+                       RETURN(-EPROTO);
+               }
+
+               memcpy(data, lvb, size);
+               break;
+       default:
+               LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type);
+               libcfs_debug_dumpstack(NULL);
+               RETURN(-EINVAL);
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Create and fill in new LDLM lock with specified properties.
+ * Returns a referenced lock
+ */
+struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
+                                  const struct ldlm_res_id *res_id,
+                                  ldlm_type_t type,
+                                  ldlm_mode_t mode,
+                                  const struct ldlm_callback_suite *cbs,
+                                  void *data, __u32 lvb_len,
+                                  enum lvb_type lvb_type)
+{
+       struct ldlm_lock *lock;
+       struct ldlm_resource *res;
+       ENTRY;
+
+       res = ldlm_resource_get(ns, NULL, res_id, type, 1);
+       if (res == NULL)
+               RETURN(NULL);
+
+       lock = ldlm_lock_new(res);
+
+       if (lock == NULL)
+               RETURN(NULL);
+
+       lock->l_req_mode = mode;
+       lock->l_ast_data = data;
+       lock->l_pid = current_pid();
+       lock->l_ns_srv = !!ns_is_server(ns);
+       if (cbs) {
+               lock->l_blocking_ast = cbs->lcs_blocking;
+               lock->l_completion_ast = cbs->lcs_completion;
+               lock->l_glimpse_ast = cbs->lcs_glimpse;
+               lock->l_weigh_ast = cbs->lcs_weigh;
+       }
+
+       lock->l_tree_node = NULL;
+       /* if this is the extent lock, allocate the interval tree node */
+       if (type == LDLM_EXTENT) {
+               if (ldlm_interval_alloc(lock) == NULL)
+                       GOTO(out, 0);
+       }
+
+       if (lvb_len) {
+               lock->l_lvb_len = lvb_len;
+               OBD_ALLOC(lock->l_lvb_data, lvb_len);
+               if (lock->l_lvb_data == NULL)
+                       GOTO(out, 0);
+       }
+
+       lock->l_lvb_type = lvb_type;
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK))
+               GOTO(out, 0);
+
+       RETURN(lock);
+
+out:
+       ldlm_lock_destroy(lock);
+       LDLM_LOCK_RELEASE(lock);
+       return NULL;
+}
+
+/**
+ * Enqueue (request) a lock.
+ *
+ * Does not block. As a result of the enqueue the lock is put
+ * on the granted or waiting list.
+ *
+ * If the namespace has an intent policy set and the lock has the
+ * LDLM_FL_HAS_INTENT flag set, skip all the enqueueing and delegate lock
+ * processing to the intent policy function.
+ */
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
+                              struct ldlm_lock **lockp,
+                              void *cookie, __u64 *flags)
+{
+       struct ldlm_lock *lock = *lockp;
+       struct ldlm_resource *res = lock->l_resource;
+       int local = ns_is_client(ldlm_res_to_ns(res));
+       ldlm_error_t rc = ELDLM_OK;
+       struct ldlm_interval *node = NULL;
+       ENTRY;
+
+       lock->l_last_activity = cfs_time_current_sec();
+       /* policies are not executed on the client or during replay */
+       if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT
+           && !local && ns->ns_policy) {
+               rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags,
+                                  NULL);
+               if (rc == ELDLM_LOCK_REPLACED) {
+                       /* The lock that was returned has already been granted,
+                        * and placed into lockp.  If it's not the same as the
+                        * one we passed in, then destroy the old one and our
+                        * work here is done. */
+                       if (lock != *lockp) {
+                               ldlm_lock_destroy(lock);
+                               LDLM_LOCK_RELEASE(lock);
+                       }
+                       *flags |= LDLM_FL_LOCK_CHANGED;
+                       RETURN(0);
+               } else if (rc != ELDLM_OK ||
+                          (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) {
+                       ldlm_lock_destroy(lock);
+                       RETURN(rc);
+               }
+       }
+
+       /* For a replaying lock, it might be already in granted list. So
+        * unlinking the lock will cause the interval node to be freed, we
+        * have to allocate the interval node early otherwise we can't regrant
+        * this lock in the future. - jay */
+       if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT)
+               OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+
+       lock_res_and_lock(lock);
+       if (local && lock->l_req_mode == lock->l_granted_mode) {
+               /* The server returned a blocked lock, but it was granted
+                * before we got a chance to actually enqueue it.  We don't
+                * need to do anything else. */
+               *flags &= ~(LDLM_FL_BLOCK_GRANTED |
+                           LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT);
+               GOTO(out, ELDLM_OK);
+       }
+
+       ldlm_resource_unlink_lock(lock);
+       if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) {
+               if (node == NULL) {
+                       ldlm_lock_destroy_nolock(lock);
+                       GOTO(out, rc = -ENOMEM);
+               }
+
+               INIT_LIST_HEAD(&node->li_group);
+               ldlm_interval_attach(node, lock);
+               node = NULL;
+       }
+
+       /* Some flags from the enqueue want to make it into the AST, via the
+        * lock's l_flags. */
+       lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA;
+
+       /* This distinction between local lock trees is very important; a client
+        * namespace only has information about locks taken by that client, and
+        * thus doesn't have enough information to decide for itself if it can
+        * be granted (below).  In this case, we do exactly what the server
+        * tells us to do, as dictated by the 'flags'.
+        *
+        * We do exactly the same thing during recovery, when the server is
+        * more or less trusting the clients not to lie.
+        *
+        * FIXME (bug 268): Detect obvious lies by checking compatibility in
+        * granted/converting queues. */
+       if (local) {
+               if (*flags & LDLM_FL_BLOCK_CONV)
+                       ldlm_resource_add_lock(res, &res->lr_converting, lock);
+               else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED))
+                       ldlm_resource_add_lock(res, &res->lr_waiting, lock);
+               else
+                       ldlm_grant_lock(lock, NULL);
+               GOTO(out, ELDLM_OK);
+       } else {
+               CERROR("This is client-side-only module, cannot handle "
+                      "LDLM_NAMESPACE_SERVER resource type lock.\n");
+               LBUG();
+       }
+
+out:
+       unlock_res_and_lock(lock);
+       if (node)
+               OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+       return rc;
+}
+
+
+/**
+ * Process a call to blocking AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+       struct ldlm_cb_set_arg *arg = opaq;
+       struct ldlm_lock_desc   d;
+       int                  rc;
+       struct ldlm_lock       *lock;
+       ENTRY;
+
+       if (list_empty(arg->list))
+               RETURN(-ENOENT);
+
+       lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast);
+
+       /* nobody should touch l_bl_ast */
+       lock_res_and_lock(lock);
+       list_del_init(&lock->l_bl_ast);
+
+       LASSERT(lock->l_flags & LDLM_FL_AST_SENT);
+       LASSERT(lock->l_bl_ast_run == 0);
+       LASSERT(lock->l_blocking_lock);
+       lock->l_bl_ast_run++;
+       unlock_res_and_lock(lock);
+
+       ldlm_lock2desc(lock->l_blocking_lock, &d);
+
+       rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING);
+       LDLM_LOCK_RELEASE(lock->l_blocking_lock);
+       lock->l_blocking_lock = NULL;
+       LDLM_LOCK_RELEASE(lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Process a call to completion AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+       struct ldlm_cb_set_arg  *arg = opaq;
+       int                   rc = 0;
+       struct ldlm_lock        *lock;
+       ldlm_completion_callback completion_callback;
+       ENTRY;
+
+       if (list_empty(arg->list))
+               RETURN(-ENOENT);
+
+       lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast);
+
+       /* It's possible to receive a completion AST before we've set
+        * the l_completion_ast pointer: either because the AST arrived
+        * before the reply, or simply because there's a small race
+        * window between receiving the reply and finishing the local
+        * enqueue. (bug 842)
+        *
+        * This can't happen with the blocking_ast, however, because we
+        * will never call the local blocking_ast until we drop our
+        * reader/writer reference, which we won't do until we get the
+        * reply and finish enqueueing. */
+
+       /* nobody should touch l_cp_ast */
+       lock_res_and_lock(lock);
+       list_del_init(&lock->l_cp_ast);
+       LASSERT(lock->l_flags & LDLM_FL_CP_REQD);
+       /* save l_completion_ast since it can be changed by
+        * mds_intent_policy(), see bug 14225 */
+       completion_callback = lock->l_completion_ast;
+       lock->l_flags &= ~LDLM_FL_CP_REQD;
+       unlock_res_and_lock(lock);
+
+       if (completion_callback != NULL)
+               rc = completion_callback(lock, 0, (void *)arg);
+       LDLM_LOCK_RELEASE(lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Process a call to revocation AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+       struct ldlm_cb_set_arg *arg = opaq;
+       struct ldlm_lock_desc   desc;
+       int                  rc;
+       struct ldlm_lock       *lock;
+       ENTRY;
+
+       if (list_empty(arg->list))
+               RETURN(-ENOENT);
+
+       lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast);
+       list_del_init(&lock->l_rk_ast);
+
+       /* the desc just pretends to be exclusive */
+       ldlm_lock2desc(lock, &desc);
+       desc.l_req_mode = LCK_EX;
+       desc.l_granted_mode = 0;
+
+       rc = lock->l_blocking_ast(lock, &desc, (void*)arg, LDLM_CB_BLOCKING);
+       LDLM_LOCK_RELEASE(lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Process a call to glimpse AST callback for a lock in ast_work list
+ */
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+       struct ldlm_cb_set_arg          *arg = opaq;
+       struct ldlm_glimpse_work        *gl_work;
+       struct ldlm_lock                *lock;
+       int                              rc = 0;
+       ENTRY;
+
+       if (list_empty(arg->list))
+               RETURN(-ENOENT);
+
+       gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work,
+                                gl_list);
+       list_del_init(&gl_work->gl_list);
+
+       lock = gl_work->gl_lock;
+
+       /* transfer the glimpse descriptor to ldlm_cb_set_arg */
+       arg->gl_desc = gl_work->gl_desc;
+
+       /* invoke the actual glimpse callback */
+       if (lock->l_glimpse_ast(lock, (void*)arg) == 0)
+               rc = 1;
+
+       LDLM_LOCK_RELEASE(lock);
+
+       if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0)
+               OBD_FREE_PTR(gl_work);
+
+       RETURN(rc);
+}
+
+/**
+ * Process list of locks in need of ASTs being sent.
+ *
+ * Used on server to send multiple ASTs together instead of sending one by
+ * one.
+ */
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+                     ldlm_desc_ast_t ast_type)
+{
+       struct ldlm_cb_set_arg *arg;
+       set_producer_func       work_ast_lock;
+       int                  rc;
+
+       if (list_empty(rpc_list))
+               RETURN(0);
+
+       OBD_ALLOC_PTR(arg);
+       if (arg == NULL)
+               RETURN(-ENOMEM);
+
+       atomic_set(&arg->restart, 0);
+       arg->list = rpc_list;
+
+       switch (ast_type) {
+               case LDLM_WORK_BL_AST:
+                       arg->type = LDLM_BL_CALLBACK;
+                       work_ast_lock = ldlm_work_bl_ast_lock;
+                       break;
+               case LDLM_WORK_CP_AST:
+                       arg->type = LDLM_CP_CALLBACK;
+                       work_ast_lock = ldlm_work_cp_ast_lock;
+                       break;
+               case LDLM_WORK_REVOKE_AST:
+                       arg->type = LDLM_BL_CALLBACK;
+                       work_ast_lock = ldlm_work_revoke_ast_lock;
+                       break;
+               case LDLM_WORK_GL_AST:
+                       arg->type = LDLM_GL_CALLBACK;
+                       work_ast_lock = ldlm_work_gl_ast_lock;
+                       break;
+               default:
+                       LBUG();
+       }
+
+       /* We create a ptlrpc request set with flow control extension.
+        * This request set will use the work_ast_lock function to produce new
+        * requests and will send a new request each time one completes in order
+        * to keep the number of requests in flight to ns_max_parallel_ast */
+       arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX,
+                                    work_ast_lock, arg);
+       if (arg->set == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       ptlrpc_set_wait(arg->set);
+       ptlrpc_set_destroy(arg->set);
+
+       rc = atomic_read(&arg->restart) ? -ERESTART : 0;
+       GOTO(out, rc);
+out:
+       OBD_FREE_PTR(arg);
+       return rc;
+}
+
+static int reprocess_one_queue(struct ldlm_resource *res, void *closure)
+{
+       ldlm_reprocess_all(res);
+       return LDLM_ITER_CONTINUE;
+}
+
+static int ldlm_reprocess_res(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                             struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+       int    rc;
+
+       rc = reprocess_one_queue(res, arg);
+
+       return rc == LDLM_ITER_STOP;
+}
+
+/**
+ * Iterate through all resources on a namespace attempting to grant waiting
+ * locks.
+ */
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns)
+{
+       ENTRY;
+
+       if (ns != NULL) {
+               cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                        ldlm_reprocess_res, NULL);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_reprocess_all_ns);
+
+/**
+ * Try to grant all waiting locks on a resource.
+ *
+ * Calls ldlm_reprocess_queue on converting and waiting queues.
+ *
+ * Typically called after some resource locks are cancelled to see
+ * if anything could be granted as a result of the cancellation.
+ */
+void ldlm_reprocess_all(struct ldlm_resource *res)
+{
+       LIST_HEAD(rpc_list);
+
+       ENTRY;
+       if (!ns_is_client(ldlm_res_to_ns(res))) {
+               CERROR("This is client-side-only module, cannot handle "
+                      "LDLM_NAMESPACE_SERVER resource type lock.\n");
+               LBUG();
+       }
+       EXIT;
+}
+
+/**
+ * Helper function to call blocking AST for LDLM lock \a lock in a
+ * "cancelling" mode.
+ */
+void ldlm_cancel_callback(struct ldlm_lock *lock)
+{
+       check_res_locked(lock->l_resource);
+       if (!(lock->l_flags & LDLM_FL_CANCEL)) {
+               lock->l_flags |= LDLM_FL_CANCEL;
+               if (lock->l_blocking_ast) {
+                       unlock_res_and_lock(lock);
+                       lock->l_blocking_ast(lock, NULL, lock->l_ast_data,
+                                            LDLM_CB_CANCELING);
+                       lock_res_and_lock(lock);
+               } else {
+                       LDLM_DEBUG(lock, "no blocking ast");
+               }
+       }
+       lock->l_flags |= LDLM_FL_BL_DONE;
+}
+
+/**
+ * Remove skiplist-enabled LDLM lock \a req from granted list
+ */
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req)
+{
+       if (req->l_resource->lr_type != LDLM_PLAIN &&
+           req->l_resource->lr_type != LDLM_IBITS)
+               return;
+
+       list_del_init(&req->l_sl_policy);
+       list_del_init(&req->l_sl_mode);
+}
+
+/**
+ * Attempts to cancel LDLM lock \a lock that has no reader/writer references.
+ */
+void ldlm_lock_cancel(struct ldlm_lock *lock)
+{
+       struct ldlm_resource *res;
+       struct ldlm_namespace *ns;
+       ENTRY;
+
+       lock_res_and_lock(lock);
+
+       res = lock->l_resource;
+       ns  = ldlm_res_to_ns(res);
+
+       /* Please do not, no matter how tempting, remove this LBUG without
+        * talking to me first. -phik */
+       if (lock->l_readers || lock->l_writers) {
+               LDLM_ERROR(lock, "lock still has references");
+               LBUG();
+       }
+
+       if (lock->l_waited)
+               ldlm_del_waiting_lock(lock);
+
+       /* Releases cancel callback. */
+       ldlm_cancel_callback(lock);
+
+       /* Yes, second time, just in case it was added again while we were
+          running with no res lock in ldlm_cancel_callback */
+       if (lock->l_waited)
+               ldlm_del_waiting_lock(lock);
+
+       ldlm_resource_unlink_lock(lock);
+       ldlm_lock_destroy_nolock(lock);
+
+       if (lock->l_granted_mode == lock->l_req_mode)
+               ldlm_pool_del(&ns->ns_pool, lock);
+
+       /* Make sure we will not be called again for the same lock, which is
+        * possible if we do not zero out lock->l_granted_mode */
+       lock->l_granted_mode = LCK_MINMODE;
+       unlock_res_and_lock(lock);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_cancel);
+
+/**
+ * Set opaque data into the lock that only makes sense to upper layer.
+ */
+int ldlm_lock_set_data(struct lustre_handle *lockh, void *data)
+{
+       struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+       int rc = -EINVAL;
+       ENTRY;
+
+       if (lock) {
+               if (lock->l_ast_data == NULL)
+                       lock->l_ast_data = data;
+               if (lock->l_ast_data == data)
+                       rc = 0;
+               LDLM_LOCK_PUT(lock);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_lock_set_data);
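
A minimal sketch of attaching upper-layer data to a lock; the cookie pointer is just an example payload. As the function above shows, the call only fails (-EINVAL) when a different cookie is already attached.

        /* Hypothetical: remember which object a lock protects. */
        static int example_attach_data(struct lustre_handle *lockh,
                                       void *my_cookie)
        {
                int rc;

                /* Succeeds if l_ast_data is unset or already equals
                 * my_cookie; returns -EINVAL otherwise. */
                rc = ldlm_lock_set_data(lockh, my_cookie);
                if (rc != 0)
                        CDEBUG(D_INFO, "lock already carries other ast_data\n");
                return rc;
        }
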
+
+struct export_cl_data {
+       struct obd_export       *ecl_exp;
+       int                     ecl_loop;
+};
+
+/**
+ * Iterator function for ldlm_cancel_locks_for_export.
+ * Cancels passed locks.
+ */
+int ldlm_cancel_locks_for_export_cb(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                   struct hlist_node *hnode, void *data)
+
+{
+       struct export_cl_data   *ecl = (struct export_cl_data *)data;
+       struct obd_export       *exp  = ecl->ecl_exp;
+       struct ldlm_lock     *lock = cfs_hash_object(hs, hnode);
+       struct ldlm_resource *res;
+
+       res = ldlm_resource_getref(lock->l_resource);
+       LDLM_LOCK_GET(lock);
+
+       LDLM_DEBUG(lock, "export %p", exp);
+       ldlm_res_lvbo_update(res, NULL, 1);
+       ldlm_lock_cancel(lock);
+       ldlm_reprocess_all(res);
+       ldlm_resource_putref(res);
+       LDLM_LOCK_RELEASE(lock);
+
+       ecl->ecl_loop++;
+       if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) {
+               CDEBUG(D_INFO,
+                      "Cancel lock %p for export %p (loop %d), still have "
+                      "%d locks left on hash table.\n",
+                      lock, exp, ecl->ecl_loop,
+                      atomic_read(&hs->hs_count));
+       }
+
+       return 0;
+}
+
+/**
+ * Cancel all locks for given export.
+ *
+ * Typically called on client disconnection/eviction
+ */
+void ldlm_cancel_locks_for_export(struct obd_export *exp)
+{
+       struct export_cl_data   ecl = {
+               .ecl_exp        = exp,
+               .ecl_loop       = 0,
+       };
+
+       cfs_hash_for_each_empty(exp->exp_lock_hash,
+                               ldlm_cancel_locks_for_export_cb, &ecl);
+}
+
+/**
+ * Downgrade an exclusive lock.
+ *
+ * A fast variant of ldlm_lock_convert for conversion of exclusive
+ * locks. The conversion is always successful.
+ * Used by Commit on Sharing (COS) code.
+ *
+ * \param lock A lock to convert
+ * \param new_mode new lock mode
+ */
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode)
+{
+       ENTRY;
+
+       LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX));
+       LASSERT(new_mode == LCK_COS);
+
+       lock_res_and_lock(lock);
+       ldlm_resource_unlink_lock(lock);
+       /*
+        * Remove the lock from pool as it will be added again in
+        * ldlm_grant_lock() called below.
+        */
+       ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock);
+
+       lock->l_req_mode = new_mode;
+       ldlm_grant_lock(lock, NULL);
+       unlock_res_and_lock(lock);
+       ldlm_reprocess_all(lock->l_resource);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_downgrade);
+
+/**
+ * Attempt to convert already granted lock to a different mode.
+ *
+ * While lock conversion is not currently used, future client-side
+ * optimizations could take advantage of it to avoid discarding cached
+ * pages on a file.
+ */
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+                                       __u32 *flags)
+{
+       LIST_HEAD(rpc_list);
+       struct ldlm_resource *res;
+       struct ldlm_namespace *ns;
+       int granted = 0;
+       struct ldlm_interval *node;
+       ENTRY;
+
+       /* Just return if mode is unchanged. */
+       if (new_mode == lock->l_granted_mode) {
+               *flags |= LDLM_FL_BLOCK_GRANTED;
+               RETURN(lock->l_resource);
+       }
+
+       /* I can't check the type of lock here because the bitlock of lock
+        * is not held here, so do the allocation blindly. -jay */
+       OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+       if (node == NULL)  /* Actually, this causes EDEADLOCK to be returned */
+               RETURN(NULL);
+
+       LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR),
+                "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode);
+
+       lock_res_and_lock(lock);
+
+       res = lock->l_resource;
+       ns  = ldlm_res_to_ns(res);
+
+       lock->l_req_mode = new_mode;
+       if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) {
+               ldlm_resource_unlink_lock(lock);
+       } else {
+               ldlm_resource_unlink_lock(lock);
+               if (res->lr_type == LDLM_EXTENT) {
+                       /* FIXME: ugly code, I have to attach the lock to an
+                        * interval node again since perhaps it will be granted
+                        * soon */
+                       INIT_LIST_HEAD(&node->li_group);
+                       ldlm_interval_attach(node, lock);
+                       node = NULL;
+               }
+       }
+
+       /*
+        * Remove old lock from the pool before adding the lock with new
+        * mode below in ->policy()
+        */
+       ldlm_pool_del(&ns->ns_pool, lock);
+
+       /* If this is a local resource, put it on the appropriate list. */
+       if (ns_is_client(ldlm_res_to_ns(res))) {
+               if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) {
+                       ldlm_resource_add_lock(res, &res->lr_converting, lock);
+               } else {
+                       /* This should never happen, because of the way the
+                        * server handles conversions. */
+                       LDLM_ERROR(lock, "Erroneous flags %x on local lock\n",
+                                  *flags);
+                       LBUG();
+
+                       ldlm_grant_lock(lock, &rpc_list);
+                       granted = 1;
+                       /* FIXME: completion handling not with lr_lock held ! */
+                       if (lock->l_completion_ast)
+                               lock->l_completion_ast(lock, 0, NULL);
+               }
+       } else {
+               CERROR("This is client-side-only module, cannot handle "
+                      "LDLM_NAMESPACE_SERVER resource type lock.\n");
+               LBUG();
+       }
+       unlock_res_and_lock(lock);
+
+       if (granted)
+               ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST);
+       if (node)
+               OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+       RETURN(res);
+}
+EXPORT_SYMBOL(ldlm_lock_convert);
+
+/**
+ * Print lock with lock handle \a lockh description into debug log.
+ *
+ * Used when printing all locks on a resource for debug purposes.
+ */
+void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
+{
+       struct ldlm_lock *lock;
+
+       if (!((libcfs_debug | D_ERROR) & level))
+               return;
+
+       lock = ldlm_handle2lock(lockh);
+       if (lock == NULL)
+               return;
+
+       LDLM_DEBUG_LIMIT(level, lock, "###");
+
+       LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_dump_handle);
+
+/**
+ * Print lock information with custom message into debug log.
+ * Helper function.
+ */
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+                     struct libcfs_debug_msg_data *msgdata,
+                     const char *fmt, ...)
+{
+       va_list args;
+       struct obd_export *exp = lock->l_export;
+       struct ldlm_resource *resource = lock->l_resource;
+       char *nid = "local";
+
+       va_start(args, fmt);
+
+       if (exp && exp->exp_connection) {
+               nid = libcfs_nid2str(exp->exp_connection->c_peer.nid);
+       } else if (exp && exp->exp_obd != NULL) {
+               struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
+               nid = libcfs_nid2str(imp->imp_connection->c_peer.nid);
+       }
+
+       if (resource == NULL) {
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: \?\? rrc=\?\? type: \?\?\? flags: "LPX64" nid: %s "
+                      "remote: "LPX64" expref: %d pid: %u timeout: %lu "
+                      "lvb_type: %d\n",
+                      lock,
+                      lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+               va_end(args);
+               return;
+       }
+
+       switch (resource->lr_type) {
+       case LDLM_EXTENT:
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64
+                      "] (req "LPU64"->"LPU64") flags: "LPX64" nid: %s remote:"
+                      " "LPX64" expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+                      ldlm_lock_to_ns_name(lock), lock,
+                      lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      resource->lr_name.name[0],
+                      resource->lr_name.name[1],
+                      atomic_read(&resource->lr_refcount),
+                      ldlm_typename[resource->lr_type],
+                      lock->l_policy_data.l_extent.start,
+                      lock->l_policy_data.l_extent.end,
+                      lock->l_req_extent.start, lock->l_req_extent.end,
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+               break;
+
+       case LDLM_FLOCK:
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: "LPU64"/"LPU64" rrc: %d type: %s pid: %d "
+                      "["LPU64"->"LPU64"] flags: "LPX64" nid: %s remote: "LPX64
+                      " expref: %d pid: %u timeout: %lu\n",
+                      ldlm_lock_to_ns_name(lock), lock,
+                      lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      resource->lr_name.name[0],
+                      resource->lr_name.name[1],
+                      atomic_read(&resource->lr_refcount),
+                      ldlm_typename[resource->lr_type],
+                      lock->l_policy_data.l_flock.pid,
+                      lock->l_policy_data.l_flock.start,
+                      lock->l_policy_data.l_flock.end,
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout);
+               break;
+
+       case LDLM_IBITS:
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s "
+                      "flags: "LPX64" nid: %s remote: "LPX64" expref: %d "
+                      "pid: %u timeout: %lu lvb_type: %d\n",
+                      ldlm_lock_to_ns_name(lock),
+                      lock, lock->l_handle.h_cookie,
+                      atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      resource->lr_name.name[0],
+                      resource->lr_name.name[1],
+                      lock->l_policy_data.l_inodebits.bits,
+                      atomic_read(&resource->lr_refcount),
+                      ldlm_typename[resource->lr_type],
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+               break;
+
+       default:
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: "LPU64"/"LPU64" rrc: %d type: %s flags: "LPX64" "
+                      "nid: %s remote: "LPX64" expref: %d pid: %u timeout: %lu"
+                      "lvb_type: %d\n",
+                      ldlm_lock_to_ns_name(lock),
+                      lock, lock->l_handle.h_cookie,
+                      atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      resource->lr_name.name[0],
+                      resource->lr_name.name[1],
+                      atomic_read(&resource->lr_refcount),
+                      ldlm_typename[resource->lr_type],
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+               break;
+       }
+       va_end(args);
+}
+EXPORT_SYMBOL(_ldlm_lock_debug);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
new file mode 100644 (file)
index 0000000..324d5e4
--- /dev/null
@@ -0,0 +1,1238 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lockd.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <linux/list.h>
+#include "ldlm_internal.h"
+
+static int ldlm_num_threads;
+CFS_MODULE_PARM(ldlm_num_threads, "i", int, 0444,
+               "number of DLM service threads to start");
+
+static char *ldlm_cpts;
+CFS_MODULE_PARM(ldlm_cpts, "s", charp, 0444,
+               "CPU partitions ldlm threads should run on");
+
+extern struct kmem_cache *ldlm_resource_slab;
+extern struct kmem_cache *ldlm_lock_slab;
+static struct mutex    ldlm_ref_mutex;
+static int ldlm_refcount;
+
+struct ldlm_cb_async_args {
+       struct ldlm_cb_set_arg *ca_set_arg;
+       struct ldlm_lock       *ca_lock;
+};
+
+/* LDLM state */
+
+static struct ldlm_state *ldlm_state;
+
+inline cfs_time_t round_timeout(cfs_time_t timeout)
+{
+       return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
+}
+
+/* timeout for initial callback (AST) reply (bz10399) */
+static inline unsigned int ldlm_get_rq_timeout(void)
+{
+       /* Non-AT value */
+       unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
+
+       return timeout < 1 ? 1 : timeout;
+}
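As a worked illustration (the tunable defaults are an assumption here, not set in this file): with the commonly used obd_timeout = 100 s and ldlm_timeout = 20 s, ldlm_get_rq_timeout() yields min(20, 100 / 3) = min(20, 33) = 20 seconds; the final clamp to 1 only matters when both tunables are configured very low.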
+
+#define ELT_STOPPED   0
+#define ELT_READY     1
+#define ELT_TERMINATE 2
+
+struct ldlm_bl_pool {
+       spinlock_t              blp_lock;
+
+       /*
+        * blp_prio_list is used for callbacks that should be handled
+        * as a priority. It is used for LDLM_FL_DISCARD_DATA requests.
+        * see bug 13843
+        */
+       struct list_head              blp_prio_list;
+
+       /*
+        * blp_list is used for all other callbacks which are likely
+        * to take longer to process.
+        */
+       struct list_head              blp_list;
+
+       wait_queue_head_t            blp_waitq;
+       struct completion       blp_comp;
+       atomic_t            blp_num_threads;
+       atomic_t            blp_busy_threads;
+       int                  blp_min_threads;
+       int                  blp_max_threads;
+};
+
+struct ldlm_bl_work_item {
+       struct list_head              blwi_entry;
+       struct ldlm_namespace  *blwi_ns;
+       struct ldlm_lock_desc   blwi_ld;
+       struct ldlm_lock       *blwi_lock;
+       struct list_head              blwi_head;
+       int                  blwi_count;
+       struct completion       blwi_comp;
+       ldlm_cancel_flags_t     blwi_flags;
+       int                  blwi_mem_pressure;
+};
+
+
+int ldlm_del_waiting_lock(struct ldlm_lock *lock)
+{
+       RETURN(0);
+}
+
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+{
+       RETURN(0);
+}
+
+
+
+/**
+ * Callback handler for receiving incoming blocking ASTs.
+ *
+ * This can only happen on client side.
+ */
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+                            struct ldlm_lock_desc *ld, struct ldlm_lock *lock)
+{
+       int do_ast;
+       ENTRY;
+
+       LDLM_DEBUG(lock, "client blocking AST callback handler");
+
+       lock_res_and_lock(lock);
+       lock->l_flags |= LDLM_FL_CBPENDING;
+
+       if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)
+               lock->l_flags |= LDLM_FL_CANCEL;
+
+       do_ast = (!lock->l_readers && !lock->l_writers);
+       unlock_res_and_lock(lock);
+
+       if (do_ast) {
+               CDEBUG(D_DLMTRACE, "Lock %p already unused, calling callback (%p)\n",
+                      lock, lock->l_blocking_ast);
+               if (lock->l_blocking_ast != NULL)
+                       lock->l_blocking_ast(lock, ld, lock->l_ast_data,
+                                            LDLM_CB_BLOCKING);
+       } else {
+               CDEBUG(D_DLMTRACE, "Lock %p is referenced, will be cancelled later\n",
+                      lock);
+       }
+
+       LDLM_DEBUG(lock, "client blocking callback handler END");
+       LDLM_LOCK_RELEASE(lock);
+       EXIT;
+}
+
+/**
+ * Callback handler for receiving incoming completion ASTs.
+ *
+ * This can only happen on the client side.
+ */
+static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
+                                   struct ldlm_namespace *ns,
+                                   struct ldlm_request *dlm_req,
+                                   struct ldlm_lock *lock)
+{
+       int lvb_len;
+       LIST_HEAD(ast_list);
+       int rc = 0;
+       ENTRY;
+
+       LDLM_DEBUG(lock, "client completion callback handler START");
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
+               int to = cfs_time_seconds(1);
+               while (to > 0) {
+                       schedule_timeout_and_set_state(
+                               TASK_INTERRUPTIBLE, to);
+                       if (lock->l_granted_mode == lock->l_req_mode ||
+                           lock->l_destroyed)
+                               break;
+               }
+       }
+
+       lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT);
+       if (lvb_len < 0) {
+               LDLM_ERROR(lock, "Failed to get lvb_len, rc = %d", lvb_len);
+               GOTO(out, rc = lvb_len);
+       } else if (lvb_len > 0) {
+               if (lock->l_lvb_len > 0) {
+                       /* for extent lock, lvb contains ost_lvb{}. */
+                       LASSERT(lock->l_lvb_data != NULL);
+
+                       if (unlikely(lock->l_lvb_len < lvb_len)) {
+                               LDLM_ERROR(lock, "Replied LVB is larger than "
+                                          "expectation, expected = %d, "
+                                          "replied = %d",
+                                          lock->l_lvb_len, lvb_len);
+                               GOTO(out, rc = -EINVAL);
+                       }
+               } else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has
+                                                    * variable length */
+                       void *lvb_data;
+
+                       OBD_ALLOC(lvb_data, lvb_len);
+                       if (lvb_data == NULL) {
+                               LDLM_ERROR(lock, "No memory: %d.\n", lvb_len);
+                               GOTO(out, rc = -ENOMEM);
+                       }
+
+                       lock_res_and_lock(lock);
+                       LASSERT(lock->l_lvb_data == NULL);
+                       lock->l_lvb_data = lvb_data;
+                       lock->l_lvb_len = lvb_len;
+                       unlock_res_and_lock(lock);
+               }
+       }
+
+       lock_res_and_lock(lock);
+       if (lock->l_destroyed ||
+           lock->l_granted_mode == lock->l_req_mode) {
+               /* bug 11300: the lock has already been granted */
+               unlock_res_and_lock(lock);
+               LDLM_DEBUG(lock, "Double grant race happened");
+               GOTO(out, rc = 0);
+       }
+
+       /* If we receive the completion AST before the actual enqueue returned,
+        * then we might need to switch lock modes, resources, or extents. */
+       if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) {
+               lock->l_req_mode = dlm_req->lock_desc.l_granted_mode;
+               LDLM_DEBUG(lock, "completion AST, new lock mode");
+       }
+
+       if (lock->l_resource->lr_type != LDLM_PLAIN) {
+               ldlm_convert_policy_to_local(req->rq_export,
+                                         dlm_req->lock_desc.l_resource.lr_type,
+                                         &dlm_req->lock_desc.l_policy_data,
+                                         &lock->l_policy_data);
+               LDLM_DEBUG(lock, "completion AST, new policy data");
+       }
+
+       ldlm_resource_unlink_lock(lock);
+       if (memcmp(&dlm_req->lock_desc.l_resource.lr_name,
+                  &lock->l_resource->lr_name,
+                  sizeof(lock->l_resource->lr_name)) != 0) {
+               unlock_res_and_lock(lock);
+               rc = ldlm_lock_change_resource(ns, lock,
+                               &dlm_req->lock_desc.l_resource.lr_name);
+               if (rc < 0) {
+                       LDLM_ERROR(lock, "Failed to allocate resource");
+                       GOTO(out, rc);
+               }
+               LDLM_DEBUG(lock, "completion AST, new resource");
+               CERROR("change resource!\n");
+               lock_res_and_lock(lock);
+       }
+
+       if (dlm_req->lock_flags & LDLM_FL_AST_SENT) {
+               /* BL_AST locks are not needed in LRU.
+                * Let ldlm_cancel_lru() be fast. */
+               ldlm_lock_remove_from_lru(lock);
+               lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
+               LDLM_DEBUG(lock, "completion AST includes blocking AST");
+       }
+
+       if (lock->l_lvb_len > 0) {
+               rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT,
+                                  lock->l_lvb_data, lvb_len);
+               if (rc < 0) {
+                       unlock_res_and_lock(lock);
+                       GOTO(out, rc);
+               }
+       }
+
+       ldlm_grant_lock(lock, &ast_list);
+       unlock_res_and_lock(lock);
+
+       LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");
+
+       /* Let Enqueue call osc_lock_upcall() and initialize
+        * l_ast_data */
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2);
+
+       ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST);
+
+       LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)",
+                         lock);
+       GOTO(out, rc);
+
+out:
+       if (rc < 0) {
+               lock_res_and_lock(lock);
+               lock->l_flags |= LDLM_FL_FAILED;
+               unlock_res_and_lock(lock);
+               wake_up(&lock->l_waitq);
+       }
+       LDLM_LOCK_RELEASE(lock);
+}
+
+/**
+ * Callback handler for receiving incoming glimpse ASTs.
+ *
+ * This can only happen on the client side.  After handling the glimpse AST
+ * we also consider dropping the lock here if it is unused locally for a
+ * long time.
+ */
+static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
+                                   struct ldlm_namespace *ns,
+                                   struct ldlm_request *dlm_req,
+                                   struct ldlm_lock *lock)
+{
+       int rc = -ENOSYS;
+       ENTRY;
+
+       LDLM_DEBUG(lock, "client glimpse AST callback handler");
+
+       if (lock->l_glimpse_ast != NULL)
+               rc = lock->l_glimpse_ast(lock, req);
+
+       if (req->rq_repmsg != NULL) {
+               ptlrpc_reply(req);
+       } else {
+               req->rq_status = rc;
+               ptlrpc_error(req);
+       }
+
+       lock_res_and_lock(lock);
+       if (lock->l_granted_mode == LCK_PW &&
+           !lock->l_readers && !lock->l_writers &&
+           cfs_time_after(cfs_time_current(),
+                          cfs_time_add(lock->l_last_used,
+                                       cfs_time_seconds(10)))) {
+               unlock_res_and_lock(lock);
+               if (ldlm_bl_to_thread_lock(ns, NULL, lock))
+                       ldlm_handle_bl_callback(ns, NULL, lock);
+
+               EXIT;
+               return;
+       }
+       unlock_res_and_lock(lock);
+       LDLM_LOCK_RELEASE(lock);
+       EXIT;
+}
+
+static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
+{
+       if (req->rq_no_reply)
+               return 0;
+
+       req->rq_status = rc;
+       if (!req->rq_packed_final) {
+               rc = lustre_pack_reply(req, 1, NULL, NULL);
+               if (rc)
+                       return rc;
+       }
+       return ptlrpc_reply(req);
+}
+
+static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
+                              ldlm_cancel_flags_t cancel_flags)
+{
+       struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+       ENTRY;
+
+       spin_lock(&blp->blp_lock);
+       if (blwi->blwi_lock &&
+           blwi->blwi_lock->l_flags & LDLM_FL_DISCARD_DATA) {
+               /* add LDLM_FL_DISCARD_DATA requests to the priority list */
+               list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list);
+       } else {
+               /* other blocking callbacks are added to the regular list */
+               list_add_tail(&blwi->blwi_entry, &blp->blp_list);
+       }
+       spin_unlock(&blp->blp_lock);
+
+       wake_up(&blp->blp_waitq);
+
+       /* cannot check blwi->blwi_flags as blwi could already be freed in
+          LCF_ASYNC mode */
+       if (!(cancel_flags & LCF_ASYNC))
+               wait_for_completion(&blwi->blwi_comp);
+
+       RETURN(0);
+}
+
+static inline void init_blwi(struct ldlm_bl_work_item *blwi,
+                            struct ldlm_namespace *ns,
+                            struct ldlm_lock_desc *ld,
+                            struct list_head *cancels, int count,
+                            struct ldlm_lock *lock,
+                            ldlm_cancel_flags_t cancel_flags)
+{
+       init_completion(&blwi->blwi_comp);
+       INIT_LIST_HEAD(&blwi->blwi_head);
+
+       if (memory_pressure_get())
+               blwi->blwi_mem_pressure = 1;
+
+       blwi->blwi_ns = ns;
+       blwi->blwi_flags = cancel_flags;
+       if (ld != NULL)
+               blwi->blwi_ld = *ld;
+       if (count) {
+               list_add(&blwi->blwi_head, cancels);
+               list_del_init(cancels);
+               blwi->blwi_count = count;
+       } else {
+               blwi->blwi_lock = lock;
+       }
+}
+
+/**
+ * Queues a list of locks \a cancels containing \a count locks
+ * for later processing by a blocking thread.  If \a count is zero,
+ * then the lock referenced as \a lock is queued instead.
+ *
+ * The blocking thread will then call the ->l_blocking_ast callback on the
+ * lock.  If adding to the list fails, an error is returned and the caller is
+ * expected to call ->l_blocking_ast itself.
+ */
+static int ldlm_bl_to_thread(struct ldlm_namespace *ns,
+                            struct ldlm_lock_desc *ld,
+                            struct ldlm_lock *lock,
+                            struct list_head *cancels, int count,
+                            ldlm_cancel_flags_t cancel_flags)
+{
+       ENTRY;
+
+       if (cancels && count == 0)
+               RETURN(0);
+
+       if (cancel_flags & LCF_ASYNC) {
+               struct ldlm_bl_work_item *blwi;
+
+               OBD_ALLOC(blwi, sizeof(*blwi));
+               if (blwi == NULL)
+                       RETURN(-ENOMEM);
+               init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags);
+
+               RETURN(__ldlm_bl_to_thread(blwi, cancel_flags));
+       } else {
+               /* if it is a synchronous call, do minimal memory allocation,
+                * as it could be triggered from the kernel shrinker
+                */
+               struct ldlm_bl_work_item blwi;
+
+               memset(&blwi, 0, sizeof(blwi));
+               init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags);
+               RETURN(__ldlm_bl_to_thread(&blwi, cancel_flags));
+       }
+}
+
+
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+                          struct ldlm_lock *lock)
+{
+       return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC);
+}
+
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+                          struct list_head *cancels, int count,
+                          ldlm_cancel_flags_t cancel_flags)
+{
+       return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags);
+}
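To make the queuing API above concrete, the following minimal sketch shows how a caller can hand a single lock to the blocking threads; queue_blocking_ast() is a hypothetical helper invented for illustration, but the fallback pattern is the same one used by ldlm_handle_gl_callback() and ldlm_callback_handler() in this file:

static void queue_blocking_ast(struct ldlm_namespace *ns,
                               struct ldlm_lock_desc *ld,
                               struct ldlm_lock *lock)
{
        /* Defer the blocking AST to an ldlm_bl_* thread; if queuing fails,
         * run the blocking callback synchronously ourselves. */
        if (ldlm_bl_to_thread_lock(ns, ld, lock))
                ldlm_handle_bl_callback(ns, ld, lock);
}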
+
+/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */
+static int ldlm_handle_setinfo(struct ptlrpc_request *req)
+{
+       struct obd_device *obd = req->rq_export->exp_obd;
+       char *key;
+       void *val;
+       int keylen, vallen;
+       int rc = -ENOSYS;
+       ENTRY;
+
+       DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name);
+
+       req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO);
+
+       key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+       if (key == NULL) {
+               DEBUG_REQ(D_IOCTL, req, "no set_info key");
+               RETURN(-EFAULT);
+       }
+       keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                                     RCL_CLIENT);
+       val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+       if (val == NULL) {
+               DEBUG_REQ(D_IOCTL, req, "no set_info val");
+               RETURN(-EFAULT);
+       }
+       vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL,
+                                     RCL_CLIENT);
+
+       /* We are responsible for swabbing contents of val */
+
+       if (KEY_IS(KEY_HSM_COPYTOOL_SEND))
+               /* Pass it on to mdc (the "export" in this case) */
+               rc = obd_set_info_async(req->rq_svc_thread->t_env,
+                                       req->rq_export,
+                                       sizeof(KEY_HSM_COPYTOOL_SEND),
+                                       KEY_HSM_COPYTOOL_SEND,
+                                       vallen, val, NULL);
+       else
+               DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key);
+
+       return rc;
+}
+
+static inline void ldlm_callback_errmsg(struct ptlrpc_request *req,
+                                       const char *msg, int rc,
+                                       struct lustre_handle *handle)
+{
+       DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req,
+                 "%s: [nid %s] [rc %d] [lock "LPX64"]",
+                 msg, libcfs_id2str(req->rq_peer), rc,
+                 handle ? handle->cookie : 0);
+       if (req->rq_no_reply)
+               CWARN("No reply was sent, maybe the cause of bug 21636.\n");
+       else if (rc)
+               CWARN("Send reply failed, maybe the cause of bug 21636.\n");
+}
+
+static int ldlm_handle_qc_callback(struct ptlrpc_request *req)
+{
+       struct obd_quotactl *oqctl;
+       struct client_obd *cli = &req->rq_export->exp_obd->u.cli;
+
+       oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       if (oqctl == NULL) {
+               CERROR("Can't unpack obd_quotactl\n");
+               RETURN(-EPROTO);
+       }
+
+       cli->cl_qchk_stat = oqctl->qc_stat;
+       return 0;
+}
+
+/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */
+static int ldlm_callback_handler(struct ptlrpc_request *req)
+{
+       struct ldlm_namespace *ns;
+       struct ldlm_request *dlm_req;
+       struct ldlm_lock *lock;
+       int rc;
+       ENTRY;
+
+       /* Requests arrive in sender's byte order.  The ptlrpc service
+        * handler has already checked and, if necessary, byte-swapped the
+        * incoming request message body, but I am responsible for the
+        * message buffers. */
+
+       /* do nothing for sec context finalize */
+       if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI)
+               RETURN(0);
+
+       req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+
+       if (req->rq_export == NULL) {
+               rc = ldlm_callback_reply(req, -ENOTCONN);
+               ldlm_callback_errmsg(req, "Operate on unconnected server",
+                                    rc, NULL);
+               RETURN(0);
+       }
+
+       LASSERT(req->rq_export != NULL);
+       LASSERT(req->rq_export->exp_obd != NULL);
+
+       switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+       case LDLM_BL_CALLBACK:
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+                       RETURN(0);
+               break;
+       case LDLM_CP_CALLBACK:
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET))
+                       RETURN(0);
+               break;
+       case LDLM_GL_CALLBACK:
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET))
+                       RETURN(0);
+               break;
+       case LDLM_SET_INFO:
+               rc = ldlm_handle_setinfo(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */
+               CERROR("shouldn't be handling OBD_LOG_CANCEL on DLM thread\n");
+               req_capsule_set(&req->rq_pill, &RQF_LOG_CANCEL);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_cancel(req);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_REP))
+                       RETURN(0);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case LLOG_ORIGIN_HANDLE_CREATE:
+               req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_open(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
+               req_capsule_set(&req->rq_pill,
+                               &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_next_block(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case LLOG_ORIGIN_HANDLE_READ_HEADER:
+               req_capsule_set(&req->rq_pill,
+                               &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_read_header(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case LLOG_ORIGIN_HANDLE_CLOSE:
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_close(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case OBD_QC_CALLBACK:
+               req_capsule_set(&req->rq_pill, &RQF_QC_CALLBACK);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_QC_CALLBACK_NET))
+                       RETURN(0);
+               rc = ldlm_handle_qc_callback(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       default:
+               CERROR("unknown opcode %u\n",
+                      lustre_msg_get_opc(req->rq_reqmsg));
+               ldlm_callback_reply(req, -EPROTO);
+               RETURN(0);
+       }
+
+       ns = req->rq_export->exp_obd->obd_namespace;
+       LASSERT(ns != NULL);
+
+       req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK);
+
+       dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       if (dlm_req == NULL) {
+               rc = ldlm_callback_reply(req, -EPROTO);
+               ldlm_callback_errmsg(req, "Operate without parameter", rc,
+                                    NULL);
+               RETURN(0);
+       }
+
+       /* Force a known safe race, send a cancel to the server for a lock
+        * which the server has already started a blocking callback on. */
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) &&
+           lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+               rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0);
+               if (rc < 0)
+                       CERROR("ldlm_cli_cancel: %d\n", rc);
+       }
+
+       lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0);
+       if (!lock) {
+               CDEBUG(D_DLMTRACE, "callback on lock "LPX64" - lock "
+                      "disappeared\n", dlm_req->lock_handle[0].cookie);
+               rc = ldlm_callback_reply(req, -EINVAL);
+               ldlm_callback_errmsg(req, "Operate with invalid parameter", rc,
+                                    &dlm_req->lock_handle[0]);
+               RETURN(0);
+       }
+
+       if ((lock->l_flags & LDLM_FL_FAIL_LOC) &&
+           lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK)
+               OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+       /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */
+       lock_res_and_lock(lock);
+       lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags &
+                                             LDLM_AST_FLAGS);
+       if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+               /* If somebody cancels the lock and the cache is already
+                * dropped, or the lock failed before the cp_ast was received
+                * on the client, we can tell the server we have no lock.
+                * Otherwise, we should send the cancel after dropping the
+                * cache. */
+               if (((lock->l_flags & LDLM_FL_CANCELING) &&
+                   (lock->l_flags & LDLM_FL_BL_DONE)) ||
+                   (lock->l_flags & LDLM_FL_FAILED)) {
+                       LDLM_DEBUG(lock, "callback on lock "
+                                  LPX64" - lock disappeared\n",
+                                  dlm_req->lock_handle[0].cookie);
+                       unlock_res_and_lock(lock);
+                       LDLM_LOCK_RELEASE(lock);
+                       rc = ldlm_callback_reply(req, -EINVAL);
+                       ldlm_callback_errmsg(req, "Operate on stale lock", rc,
+                                            &dlm_req->lock_handle[0]);
+                       RETURN(0);
+               }
+               /* BL_AST locks are not needed in LRU.
+                * Let ldlm_cancel_lru() be fast. */
+               ldlm_lock_remove_from_lru(lock);
+               lock->l_flags |= LDLM_FL_BL_AST;
+       }
+       unlock_res_and_lock(lock);
+
+       /* We want the ost thread to get this reply so that it can respond
+        * to ost requests (write cache writeback) that might be triggered
+        * in the callback.
+        *
+        * But we'd also like to be able to indicate in the reply that we're
+        * cancelling right now, because it's unused, or have an intent result
+        * in the reply, so we might have to push the responsibility for sending
+        * the reply down into the AST handlers, alas. */
+
+       switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+       case LDLM_BL_CALLBACK:
+               CDEBUG(D_INODE, "blocking ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK);
+               if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)) {
+                       rc = ldlm_callback_reply(req, 0);
+                       if (req->rq_no_reply || rc)
+                               ldlm_callback_errmsg(req, "Normal process", rc,
+                                                    &dlm_req->lock_handle[0]);
+               }
+               if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock))
+                       ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
+               break;
+       case LDLM_CP_CALLBACK:
+               CDEBUG(D_INODE, "completion ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
+               ldlm_callback_reply(req, 0);
+               ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+               break;
+       case LDLM_GL_CALLBACK:
+               CDEBUG(D_INODE, "glimpse ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
+               ldlm_handle_gl_callback(req, ns, dlm_req, lock);
+               break;
+       default:
+               LBUG();                  /* checked above */
+       }
+
+       RETURN(0);
+}
+
+
+static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp)
+{
+       struct ldlm_bl_work_item *blwi = NULL;
+       static unsigned int num_bl = 0;
+
+       spin_lock(&blp->blp_lock);
+       /* process a request from blp_list at least once every
+        * blp_num_threads iterations */
+       if (!list_empty(&blp->blp_list) &&
+           (list_empty(&blp->blp_prio_list) || num_bl == 0))
+               blwi = list_entry(blp->blp_list.next,
+                                     struct ldlm_bl_work_item, blwi_entry);
+       else if (!list_empty(&blp->blp_prio_list))
+               blwi = list_entry(blp->blp_prio_list.next,
+                                 struct ldlm_bl_work_item,
+                                 blwi_entry);
+
+       if (blwi) {
+               if (++num_bl >= atomic_read(&blp->blp_num_threads))
+                       num_bl = 0;
+               list_del(&blwi->blwi_entry);
+       }
+       spin_unlock(&blp->blp_lock);
+
+       return blwi;
+}
+
+/* This only contains temporary data until the thread starts */
+struct ldlm_bl_thread_data {
+       char                    bltd_name[CFS_CURPROC_COMM_MAX];
+       struct ldlm_bl_pool     *bltd_blp;
+       struct completion       bltd_comp;
+       int                     bltd_num;
+};
+
+static int ldlm_bl_thread_main(void *arg);
+
+static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp)
+{
+       struct ldlm_bl_thread_data bltd = { .bltd_blp = blp };
+       task_t *task;
+
+       init_completion(&bltd.bltd_comp);
+       bltd.bltd_num = atomic_read(&blp->blp_num_threads);
+       snprintf(bltd.bltd_name, sizeof(bltd.bltd_name) - 1,
+               "ldlm_bl_%02d", bltd.bltd_num);
+       task = kthread_run(ldlm_bl_thread_main, &bltd, bltd.bltd_name);
+       if (IS_ERR(task)) {
+               CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n",
+                      atomic_read(&blp->blp_num_threads), PTR_ERR(task));
+               return PTR_ERR(task);
+       }
+       wait_for_completion(&bltd.bltd_comp);
+
+       return 0;
+}
+
+/**
+ * Main blocking requests processing thread.
+ *
+ * Callers put locks into its queue by calling ldlm_bl_to_thread.
+ * This thread in the end ends up doing actual call to ->l_blocking_ast
+ * for queued locks.
+ */
+static int ldlm_bl_thread_main(void *arg)
+{
+       struct ldlm_bl_pool *blp;
+       ENTRY;
+
+       {
+               struct ldlm_bl_thread_data *bltd = arg;
+
+               blp = bltd->bltd_blp;
+
+               atomic_inc(&blp->blp_num_threads);
+               atomic_inc(&blp->blp_busy_threads);
+
+               complete(&bltd->bltd_comp);
+               /* cannot use bltd after this, it is only on caller's stack */
+       }
+
+       while (1) {
+               struct l_wait_info lwi = { 0 };
+               struct ldlm_bl_work_item *blwi = NULL;
+               int busy;
+
+               blwi = ldlm_bl_get_work(blp);
+
+               if (blwi == NULL) {
+                       atomic_dec(&blp->blp_busy_threads);
+                       l_wait_event_exclusive(blp->blp_waitq,
+                                        (blwi = ldlm_bl_get_work(blp)) != NULL,
+                                        &lwi);
+                       busy = atomic_inc_return(&blp->blp_busy_threads);
+               } else {
+                       busy = atomic_read(&blp->blp_busy_threads);
+               }
+
+               if (blwi->blwi_ns == NULL)
+                       /* added by ldlm_cleanup() */
+                       break;
+
+               /* Not fatal if we race and end up with a few too many threads */
+               if (unlikely(busy < blp->blp_max_threads &&
+                            busy >= atomic_read(&blp->blp_num_threads) &&
+                            !blwi->blwi_mem_pressure))
+                       /* discard the return value, we tried */
+                       ldlm_bl_thread_start(blp);
+
+               if (blwi->blwi_mem_pressure)
+                       memory_pressure_set();
+
+               if (blwi->blwi_count) {
+                       int count;
+                       /* In the special case when we cancel LRU locks
+                        * asynchronously, the list of locks is passed here.
+                        * The locks are marked LDLM_FL_CANCELING, but NOT
+                        * canceled locally yet. */
+                       count = ldlm_cli_cancel_list_local(&blwi->blwi_head,
+                                                          blwi->blwi_count,
+                                                          LCF_BL_AST);
+                       ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL,
+                                            blwi->blwi_flags);
+               } else {
+                       ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld,
+                                               blwi->blwi_lock);
+               }
+               if (blwi->blwi_mem_pressure)
+                       memory_pressure_clr();
+
+               if (blwi->blwi_flags & LCF_ASYNC)
+                       OBD_FREE(blwi, sizeof(*blwi));
+               else
+                       complete(&blwi->blwi_comp);
+       }
+
+       atomic_dec(&blp->blp_busy_threads);
+       atomic_dec(&blp->blp_num_threads);
+       complete(&blp->blp_comp);
+       RETURN(0);
+}
+
+
+static int ldlm_setup(void);
+static int ldlm_cleanup(void);
+
+int ldlm_get_ref(void)
+{
+       int rc = 0;
+       ENTRY;
+       mutex_lock(&ldlm_ref_mutex);
+       if (++ldlm_refcount == 1) {
+               rc = ldlm_setup();
+               if (rc)
+                       ldlm_refcount--;
+       }
+       mutex_unlock(&ldlm_ref_mutex);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_get_ref);
+
+void ldlm_put_ref(void)
+{
+       ENTRY;
+       mutex_lock(&ldlm_ref_mutex);
+       if (ldlm_refcount == 1) {
+               int rc = ldlm_cleanup();
+               if (rc)
+                       CERROR("ldlm_cleanup failed: %d\n", rc);
+               else
+                       ldlm_refcount--;
+       } else {
+               ldlm_refcount--;
+       }
+       mutex_unlock(&ldlm_ref_mutex);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_put_ref);
+
+/*
+ * Export handle<->lock hash operations.
+ */
+static unsigned
+ldlm_export_lock_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask);
+}
+
+static void *
+ldlm_export_lock_key(struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+       return &lock->l_remote_handle;
+}
+
+static void
+ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key)
+{
+       struct ldlm_lock     *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+       lock->l_remote_handle = *(struct lustre_handle *)key;
+}
+
+static int
+ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode)
+{
+       return lustre_handle_equal(ldlm_export_lock_key(hnode), key);
+}
+
+static void *
+ldlm_export_lock_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+}
+
+static void
+ldlm_export_lock_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+       LDLM_LOCK_GET(lock);
+}
+
+static void
+ldlm_export_lock_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+       LDLM_LOCK_RELEASE(lock);
+}
+
+static cfs_hash_ops_t ldlm_export_lock_ops = {
+       .hs_hash        = ldlm_export_lock_hash,
+       .hs_key  = ldlm_export_lock_key,
+       .hs_keycmp      = ldlm_export_lock_keycmp,
+       .hs_keycpy      = ldlm_export_lock_keycpy,
+       .hs_object      = ldlm_export_lock_object,
+       .hs_get  = ldlm_export_lock_get,
+       .hs_put  = ldlm_export_lock_put,
+       .hs_put_locked  = ldlm_export_lock_put,
+};
+
+int ldlm_init_export(struct obd_export *exp)
+{
+       ENTRY;
+
+       exp->exp_lock_hash =
+               cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+                               HASH_EXP_LOCK_CUR_BITS,
+                               HASH_EXP_LOCK_MAX_BITS,
+                               HASH_EXP_LOCK_BKT_BITS, 0,
+                               CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+                               &ldlm_export_lock_ops,
+                               CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY |
+                               CFS_HASH_NBLK_CHANGE);
+
+       if (!exp->exp_lock_hash)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_init_export);
+
+void ldlm_destroy_export(struct obd_export *exp)
+{
+       ENTRY;
+       cfs_hash_putref(exp->exp_lock_hash);
+       exp->exp_lock_hash = NULL;
+
+       ldlm_destroy_flock_export(exp);
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_export);
+
+static int ldlm_setup(void)
+{
+       static struct ptlrpc_service_conf       conf;
+       struct ldlm_bl_pool                     *blp = NULL;
+       int rc = 0;
+       int i;
+       ENTRY;
+
+       if (ldlm_state != NULL)
+               RETURN(-EALREADY);
+
+       OBD_ALLOC(ldlm_state, sizeof(*ldlm_state));
+       if (ldlm_state == NULL)
+               RETURN(-ENOMEM);
+
+#ifdef LPROCFS
+       rc = ldlm_proc_setup();
+       if (rc != 0)
+               GOTO(out, rc);
+#endif
+
+       memset(&conf, 0, sizeof(conf));
+       conf = (typeof(conf)) {
+               .psc_name               = "ldlm_cbd",
+               .psc_watchdog_factor    = 2,
+               .psc_buf                = {
+                       .bc_nbufs               = LDLM_CLIENT_NBUFS,
+                       .bc_buf_size            = LDLM_BUFSIZE,
+                       .bc_req_max_size        = LDLM_MAXREQSIZE,
+                       .bc_rep_max_size        = LDLM_MAXREPSIZE,
+                       .bc_req_portal          = LDLM_CB_REQUEST_PORTAL,
+                       .bc_rep_portal          = LDLM_CB_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = "ldlm_cb",
+                       .tc_thr_factor          = LDLM_THR_FACTOR,
+                       .tc_nthrs_init          = LDLM_NTHRS_INIT,
+                       .tc_nthrs_base          = LDLM_NTHRS_BASE,
+                       .tc_nthrs_max           = LDLM_NTHRS_MAX,
+                       .tc_nthrs_user          = ldlm_num_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_MD_THREAD | LCT_DT_THREAD,
+               },
+               .psc_cpt                = {
+                       .cc_pattern             = ldlm_cpts,
+               },
+               .psc_ops                = {
+                       .so_req_handler         = ldlm_callback_handler,
+               },
+       };
+       ldlm_state->ldlm_cb_service =
+                       ptlrpc_register_service(&conf, ldlm_svc_proc_dir);
+       if (IS_ERR(ldlm_state->ldlm_cb_service)) {
+               CERROR("failed to start service\n");
+               rc = PTR_ERR(ldlm_state->ldlm_cb_service);
+               ldlm_state->ldlm_cb_service = NULL;
+               GOTO(out, rc);
+       }
+
+
+       OBD_ALLOC(blp, sizeof(*blp));
+       if (blp == NULL)
+               GOTO(out, rc = -ENOMEM);
+       ldlm_state->ldlm_bl_pool = blp;
+
+       spin_lock_init(&blp->blp_lock);
+       INIT_LIST_HEAD(&blp->blp_list);
+       INIT_LIST_HEAD(&blp->blp_prio_list);
+       init_waitqueue_head(&blp->blp_waitq);
+       atomic_set(&blp->blp_num_threads, 0);
+       atomic_set(&blp->blp_busy_threads, 0);
+
+       if (ldlm_num_threads == 0) {
+               blp->blp_min_threads = LDLM_NTHRS_INIT;
+               blp->blp_max_threads = LDLM_NTHRS_MAX;
+       } else {
+               blp->blp_min_threads = blp->blp_max_threads =
+                       min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT,
+                                                        ldlm_num_threads));
+       }
+
+       for (i = 0; i < blp->blp_min_threads; i++) {
+               rc = ldlm_bl_thread_start(blp);
+               if (rc < 0)
+                       GOTO(out, rc);
+       }
+
+
+       rc = ldlm_pools_init();
+       if (rc) {
+               CERROR("Failed to initialize LDLM pools: %d\n", rc);
+               GOTO(out, rc);
+       }
+       RETURN(0);
+
+ out:
+       ldlm_cleanup();
+       RETURN(rc);
+}
+
+static int ldlm_cleanup(void)
+{
+       ENTRY;
+
+       if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) ||
+           !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) {
+               CERROR("ldlm still has namespaces; clean these up first.\n");
+               ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+               ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+               RETURN(-EBUSY);
+       }
+
+       ldlm_pools_fini();
+
+       if (ldlm_state->ldlm_bl_pool != NULL) {
+               struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+
+               while (atomic_read(&blp->blp_num_threads) > 0) {
+                       struct ldlm_bl_work_item blwi = { .blwi_ns = NULL };
+
+                       init_completion(&blp->blp_comp);
+
+                       spin_lock(&blp->blp_lock);
+                       list_add_tail(&blwi.blwi_entry, &blp->blp_list);
+                       wake_up(&blp->blp_waitq);
+                       spin_unlock(&blp->blp_lock);
+
+                       wait_for_completion(&blp->blp_comp);
+               }
+
+               OBD_FREE(blp, sizeof(*blp));
+       }
+
+       if (ldlm_state->ldlm_cb_service != NULL)
+               ptlrpc_unregister_service(ldlm_state->ldlm_cb_service);
+
+       ldlm_proc_cleanup();
+
+
+       OBD_FREE(ldlm_state, sizeof(*ldlm_state));
+       ldlm_state = NULL;
+
+       RETURN(0);
+}
+
+int ldlm_init(void)
+{
+       mutex_init(&ldlm_ref_mutex);
+       mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER));
+       mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT));
+       ldlm_resource_slab = kmem_cache_create("ldlm_resources",
+                                              sizeof(struct ldlm_resource), 0,
+                                              SLAB_HWCACHE_ALIGN, NULL);
+       if (ldlm_resource_slab == NULL)
+               return -ENOMEM;
+
+       ldlm_lock_slab = kmem_cache_create("ldlm_locks",
+                             sizeof(struct ldlm_lock), 0,
+                             SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL);
+       if (ldlm_lock_slab == NULL) {
+               kmem_cache_destroy(ldlm_resource_slab);
+               return -ENOMEM;
+       }
+
+       ldlm_interval_slab = kmem_cache_create("interval_node",
+                                       sizeof(struct ldlm_interval),
+                                       0, SLAB_HWCACHE_ALIGN, NULL);
+       if (ldlm_interval_slab == NULL) {
+               kmem_cache_destroy(ldlm_resource_slab);
+               kmem_cache_destroy(ldlm_lock_slab);
+               return -ENOMEM;
+       }
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       class_export_dump_hook = ldlm_dump_export_locks;
+#endif
+       return 0;
+}
+
+void ldlm_exit(void)
+{
+       if (ldlm_refcount)
+               CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount);
+       kmem_cache_destroy(ldlm_resource_slab);
+       /* ldlm_lock_put() uses RCU to call ldlm_lock_free(), so we need to
+        * call synchronize_rcu() to wait for a grace period to elapse, so
+        * that ldlm_lock_free() gets a chance to be called. */
+       synchronize_rcu();
+       kmem_cache_destroy(ldlm_lock_slab);
+       kmem_cache_destroy(ldlm_interval_slab);
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c
new file mode 100644 (file)
index 0000000..ec29e28
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_plain.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of PLAIN lock type.
+ *
+ * PLAIN locks are the simplest form of LDLM locking, and are used when
+ * there only needs to be a single lock on a resource. This avoids some
+ * of the complexity of EXTENT and IBITS lock types, but doesn't allow
+ * different "parts" of a resource to be locked concurrently.  Example
+ * use cases for PLAIN locks include locking of MGS configuration logs
+ * and (as of Lustre 2.4) quota records.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy)
+{
+       /* No policy for plain locks */
+}
+
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy)
+{
+       /* No policy for plain locks */
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
new file mode 100644 (file)
index 0000000..0604295
--- /dev/null
@@ -0,0 +1,1406 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_pool.c
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+/*
+ * The idea of this code is rather simple. Each second, for each server
+ * namespace, we compute the SLV - server lock volume - from the current
+ * number of granted locks, the grant speed over the past period, etc; that
+ * is, from the locking load. For simplicity, the SLV may be thought of as a
+ * flow definition. It is sent to clients at every opportunity to let them
+ * know the current load situation on the server. Initially the server SLV
+ * is set to its maximum value, calculated so as to allow one client to hold
+ * all ->pl_limit locks for 10 hours.
+ *
+ * On clients, the number of cached locks is no longer limited artificially
+ * as it was before. Instead, the client calculates the CLV - client lock
+ * volume - for each lock and compares it with the last SLV received from
+ * the server. The CLV is calculated as the number of locks in the LRU
+ * multiplied by the lock live time in seconds. If CLV > SLV, the lock is
+ * canceled.
+ *
+ * The client also has an LVF - lock volume factor - which regulates how
+ * sensitive the client should be to the last SLV from the server. The
+ * higher the LVF, the more locks will be canceled on the client. The
+ * default value is 1; setting LVF to 2 means the client will cancel locks
+ * twice as fast.
+ *
+ * Locks on a client will be canceled more aggressively when:
+ * (1) the SLV is smaller, that is, the load is higher on the server;
+ * (2) the client holds a lot of locks (the more locks a client holds, the
+ *     bigger the chance that some of them should be canceled);
+ * (3) the client has old locks (taken some time ago).
+ *
+ * Thus, in the flow paradigm used here to explain the SLV, the CLV is the
+ * volume of a particle in the flow described by the SLV. If the flow gets
+ * thinner, more and more particles fall outside of it, and since particles
+ * are locks, they should be canceled.
+ *
+ * The general idea belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas
+ * Dilger (adilger@clusterfs.com) proposed several nice ideas such as the LVF
+ * and many cleanups. The flow definition, which makes the logic easier to
+ * understand, belongs to Nikita Danilov (nikita@clusterfs.com), along with
+ * many cleanups and fixes. The design and implementation are by Yury
+ * Umanets (umka@clusterfs.com).
+ *
+ * Glossary for terms used:
+ *
+ * pl_limit - Number of allowed locks in pool. Applies to server and client
+ * side (tunable);
+ *
+ * pl_granted - Number of granted locks (calculated);
+ * pl_grant_rate - Number of granted locks for last T (calculated);
+ * pl_cancel_rate - Number of canceled locks for last T (calculated);
+ * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
+ * pl_grant_plan - Planned number of granted locks for next T (calculated);
+ * pl_server_lock_volume - Current server lock volume (calculated);
+ *
+ * As can be seen from the list above, there are several tunables which can
+ * significantly affect behavior. All of them may be modified via proc.
+ * Together they also make it possible to construct a few pre-defined
+ * behavior policies. If none of the predefined policies suits the working
+ * pattern in use, a new one may be "constructed" via the proc tunables.
+ */
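The cancellation rule above can be restated as a small sketch (illustrative only; the helper and its parameter names are invented here and are not part of this patch):

/* CLV = locks in LRU * lock age in seconds, scaled by the client's LVF;
 * a lock becomes a cancel candidate once CLV exceeds the last SLV. */
static int clv_suggests_cancel(__u64 lru_locks, __u64 lock_age_sec,
                               __u64 lvf, __u64 last_slv)
{
        __u64 clv = lru_locks * lock_age_sec * lvf;

        return clv > last_slv;
}

For example, 1000 LRU locks and a lock that has lived 3600 seconds give CLV = 3,600,000 (with LVF = 1); if the last SLV received from the server is 2,000,000, the lock should be canceled.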
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <lustre_dlm.h>
+
+#include <cl_object.h>
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include "ldlm_internal.h"
+
+
+/*
+ * 50 ldlm locks for 1MB of RAM.
+ */
+#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50)
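For example, assuming 4 KiB pages (PAGE_CACHE_SHIFT = 12): a node whose page cache accounts for 1 GiB, i.e. NUM_CACHEPAGES = 262144, is shifted right by 20 - 12 = 8 to give 1024 MiB and then multiplied by 50, so LDLM_POOL_HOST_L works out to 51200 locks.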
+
+/*
+ * Maximal possible grant step plan in %.
+ */
+#define LDLM_POOL_MAX_GSP (30)
+
+/*
+ * Minimal possible grant step plan in %.
+ */
+#define LDLM_POOL_MIN_GSP (1)
+
+/*
+ * This controls the speed of reaching LDLM_POOL_MAX_GSP
+ * with increasing thread period.
+ */
+#define LDLM_POOL_GSP_STEP_SHIFT (2)
+
+/*
+ * LDLM_POOL_MAX_GSP% of all locks is the default GP (grant plan).
+ */
+#define LDLM_POOL_GP(L)   (((L) * LDLM_POOL_MAX_GSP) / 100)
+
+/*
+ * Max age for locks on clients.
+ */
+#define LDLM_POOL_MAX_AGE (36000)
+
+/*
+ * The granularity of SLV calculation.
+ */
+#define LDLM_POOL_SLV_SHIFT (10)
+
+extern proc_dir_entry_t *ldlm_ns_proc_dir;
+
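+/*
+ * Divide "val" by 2^shift, rounding up when "round_up" is set, e.g.
+ * dru(1025, 10, 1) == 2 while dru(1025, 10, 0) == 1.
+ */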
+static inline __u64 dru(__u64 val, __u32 shift, int round_up)
+{
+       return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift;
+}
+
+static inline __u64 ldlm_pool_slv_max(__u32 L)
+{
+       /*
+        * Allow one client to hold all locks for 10 hours.
+        * The formula is: limit * 10h / 1 client.
+        */
+       __u64 lim = (__u64)L *  LDLM_POOL_MAX_AGE / 1;
+       return lim;
+}
+
+static inline __u64 ldlm_pool_slv_min(__u32 L)
+{
+       return 1;
+}
+
+enum {
+       LDLM_POOL_FIRST_STAT = 0,
+       LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT,
+       LDLM_POOL_GRANT_STAT,
+       LDLM_POOL_CANCEL_STAT,
+       LDLM_POOL_GRANT_RATE_STAT,
+       LDLM_POOL_CANCEL_RATE_STAT,
+       LDLM_POOL_GRANT_PLAN_STAT,
+       LDLM_POOL_SLV_STAT,
+       LDLM_POOL_SHRINK_REQTD_STAT,
+       LDLM_POOL_SHRINK_FREED_STAT,
+       LDLM_POOL_RECALC_STAT,
+       LDLM_POOL_TIMING_STAT,
+       LDLM_POOL_LAST_STAT
+};
+
+static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
+{
+       return container_of(pl, struct ldlm_namespace, ns_pool);
+}
+
+/**
+ * Calculates the suggested grant_step in % of available locks for the passed
+ * period \a t. This is later used in grant_plan calculations.
+ */
+static inline int ldlm_pool_t2gsp(unsigned int t)
+{
+       /*
+        * This yields a 1% grant step for short thread periods and grows
+        * toward LDLM_POOL_MAX_GSP (30%) as the period increases.
+        *
+        * How this affects execution:
+        *
+        * - for a thread period of 1s we get a grant_step of 1%, which is
+        * good from the point of view of taking some load off the server
+        * and pushing it out to clients. A 1% grant_step means the server
+        * will not allow clients to grab lots of locks in a short period
+        * of time while keeping all their old locks cached. Clients always
+        * have to give some locks back if they want to take new ones;
+        *
+        * - for a thread period of 10s (the default) we get 23%, which
+        * means clients have enough room to take new locks without giving
+        * any back. All locks from this 23% which are not taken by clients
+        * in the current period contribute to SLV growth. SLV growth means
+        * more locks cached on clients, until the limit or the grant plan
+        * is reached.
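+        *
+        * Worked example (illustrative): with LDLM_POOL_GSP_STEP_SHIFT = 2,
+        * t = 1 gives 30 - (29 >> 0) = 1%, while the default t = 10 gives
+        * 30 - (29 >> 2) = 23%, matching the figures above.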
+        */
+       return LDLM_POOL_MAX_GSP -
+               ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >>
+                (t >> LDLM_POOL_GSP_STEP_SHIFT));
+}
+
+/**
+ * Recalculates next grant limit on passed \a pl.
+ *
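+ * The plan is granted plus grant_step% of the remaining (limit - granted)
+ * headroom, capped at 5/4 of the limit. Illustrative example: with
+ * limit = 1000, granted = 800 and the default 10s period (grant step 23%),
+ * the plan becomes 800 + (200 * 23) / 100 = 846, well below the 1250 cap.
+ *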
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
+{
+       int granted, grant_step, limit;
+
+       limit = ldlm_pool_get_limit(pl);
+       granted = atomic_read(&pl->pl_granted);
+
+       grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+       grant_step = ((limit - granted) * grant_step) / 100;
+       pl->pl_grant_plan = granted + grant_step;
+       limit = (limit * 5) >> 2;
+       if (pl->pl_grant_plan > limit)
+               pl->pl_grant_plan = limit;
+}
+
+/**
+ * Recalculates next SLV on passed \a pl.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
+{
+       int granted;
+       int grant_plan;
+       int round_up;
+       __u64 slv;
+       __u64 slv_factor;
+       __u64 grant_usage;
+       __u32 limit;
+
+       slv = pl->pl_server_lock_volume;
+       grant_plan = pl->pl_grant_plan;
+       limit = ldlm_pool_get_limit(pl);
+       granted = atomic_read(&pl->pl_granted);
+       round_up = granted < limit;
+
+       grant_usage = max_t(int, limit - (granted - grant_plan), 1);
+
+       /*
+        * Find out the SLV change factor, which is the ratio of grant usage
+        * to the limit. SLV changes as fast as the ratio of grant plan
+        * consumption. The more locks from the grant plan are left unconsumed
+        * by clients in the last interval (idle time), the faster SLV grows.
+        * Conversely, the more the grant plan is over-consumed (load time),
+        * the faster SLV drops.
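+        *
+        * Illustrative example: with limit = 1000, granted = 900 and
+        * grant_plan = 950, grant_usage = 1000 - (900 - 950) = 1050, so
+        * slv_factor = (1050 << 10) / 1000 = 1075 and SLV grows by roughly
+        * 5% over this interval.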
+        */
+       slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT);
+       do_div(slv_factor, limit);
+       slv = slv * slv_factor;
+       slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up);
+
+       if (slv > ldlm_pool_slv_max(limit)) {
+               slv = ldlm_pool_slv_max(limit);
+       } else if (slv < ldlm_pool_slv_min(limit)) {
+               slv = ldlm_pool_slv_min(limit);
+       }
+
+       pl->pl_server_lock_volume = slv;
+}
+
+/**
+ * Recalculates next stats on passed \a pl.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
+{
+       int grant_plan = pl->pl_grant_plan;
+       __u64 slv = pl->pl_server_lock_volume;
+       int granted = atomic_read(&pl->pl_granted);
+       int grant_rate = atomic_read(&pl->pl_grant_rate);
+       int cancel_rate = atomic_read(&pl->pl_cancel_rate);
+
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
+                           slv);
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
+                           granted);
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
+                           grant_rate);
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
+                           grant_plan);
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+                           cancel_rate);
+}
+
+/**
+ * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd.
+ */
+static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl)
+{
+       struct obd_device *obd;
+
+       /*
+        * Set the new SLV in the obd field so it can be used later without
+        * accessing the pool. This is required to avoid a race between
+        * sending a reply with the new SLV to a client and cleaning up the
+        * server stack, where we can't guarantee that the namespace is still
+        * alive. We only know that the obd is alive as long as a valid
+        * export is alive.
+        */
+       obd = ldlm_pl2ns(pl)->ns_obd;
+       LASSERT(obd != NULL);
+       write_lock(&obd->obd_pool_lock);
+       obd->obd_pool_slv = pl->pl_server_lock_volume;
+       write_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates all pool fields on passed \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
+{
+       time_t recalc_interval_sec;
+       ENTRY;
+
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec < pl->pl_recalc_period)
+               RETURN(0);
+
+       spin_lock(&pl->pl_lock);
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec < pl->pl_recalc_period) {
+               spin_unlock(&pl->pl_lock);
+               RETURN(0);
+       }
+       /*
+        * Recalc SLV after last period. This should be done
+        * _before_ recalculating new grant plan.
+        */
+       ldlm_pool_recalc_slv(pl);
+
+       /*
+        * Make sure that pool informed obd of last SLV changes.
+        */
+       ldlm_srv_pool_push_slv(pl);
+
+       /*
+        * Update grant_plan for new period.
+        */
+       ldlm_pool_recalc_grant_plan(pl);
+
+       pl->pl_recalc_time = cfs_time_current_sec();
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+                           recalc_interval_sec);
+       spin_unlock(&pl->pl_lock);
+       RETURN(0);
+}
+
+/**
+ * This function is used on the server side as the main entry point for
+ * memory pressure handling. It decreases the SLV of \a pl according to the
+ * passed \a nr and \a gfp_mask.
+ *
+ * The goal here is to decrease the SLV in such a way that clients hold
+ * roughly \a nr fewer locks over the next 10h.
+ */
+static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
+                               int nr, unsigned int gfp_mask)
+{
+       __u32 limit;
+
+       /*
+        * VM is asking how many entries may be potentially freed.
+        */
+       if (nr == 0)
+               return atomic_read(&pl->pl_granted);
+
+       /*
+        * Clients have already canceled their locks while the server is
+        * still in the shrinker and has nothing left to cancel. Catch this
+        * race.
+        */
+       if (atomic_read(&pl->pl_granted) == 0)
+               RETURN(0);
+
+       spin_lock(&pl->pl_lock);
+
+       /*
+        * We want the shrinker to cause cancellation of about @nr locks on
+        * clients, or to grant approximately @nr fewer locks in the next
+        * intervals.
+        *
+        * This is why we decrease SLV by @nr. The effect only lasts one
+        * recalc interval (1s these days), which should be enough to pass
+        * the decreased SLV to all clients. On the next recalc interval the
+        * pool will either increase SLV if the lock load is not high, keep
+        * it at the same level, or decrease it again; thus the
+        * shrinker-decreased SLV affects the next recalc intervals and
+        * lowers the locking load.
+        */
+       if (nr < pl->pl_server_lock_volume) {
+               pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr;
+       } else {
+               limit = ldlm_pool_get_limit(pl);
+               pl->pl_server_lock_volume = ldlm_pool_slv_min(limit);
+       }
+
+       /*
+        * Make sure that pool informed obd of last SLV changes.
+        */
+       ldlm_srv_pool_push_slv(pl);
+       spin_unlock(&pl->pl_lock);
+
+       /*
+        * We did not really free any memory here so far; it may only be
+        * freed later. Return 0 so as not to confuse the VM.
+        */
+       return 0;
+}
+
+/**
+ * Setup server side pool \a pl with passed \a limit.
+ */
+static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit)
+{
+       struct obd_device *obd;
+
+       obd = ldlm_pl2ns(pl)->ns_obd;
+       LASSERT(obd != NULL && obd != LP_POISON);
+       LASSERT(obd->obd_type != LP_POISON);
+       write_lock(&obd->obd_pool_lock);
+       obd->obd_pool_limit = limit;
+       write_unlock(&obd->obd_pool_lock);
+
+       ldlm_pool_set_limit(pl, limit);
+       return 0;
+}
+
+/**
+ * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd to passed \a pl.
+ */
+static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl)
+{
+       struct obd_device *obd;
+
+       /*
+        * Get the new SLV and Limit from the obd, which is updated by
+        * incoming RPCs.
+        */
+       obd = ldlm_pl2ns(pl)->ns_obd;
+       LASSERT(obd != NULL);
+       read_lock(&obd->obd_pool_lock);
+       pl->pl_server_lock_volume = obd->obd_pool_slv;
+       ldlm_pool_set_limit(pl, obd->obd_pool_limit);
+       read_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates client side pool \a pl according to the current SLV and Limit.
+ */
+static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
+{
+       time_t recalc_interval_sec;
+       ENTRY;
+
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec < pl->pl_recalc_period)
+               RETURN(0);
+
+       spin_lock(&pl->pl_lock);
+       /*
+        * Check if we need to recalc lists now.
+        */
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec < pl->pl_recalc_period) {
+               spin_unlock(&pl->pl_lock);
+               RETURN(0);
+       }
+
+       /*
+        * Make sure that pool knows last SLV and Limit from obd.
+        */
+       ldlm_cli_pool_pop_slv(pl);
+
+       pl->pl_recalc_time = cfs_time_current_sec();
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+                           recalc_interval_sec);
+       spin_unlock(&pl->pl_lock);
+
+       /*
+        * Do not cancel locks in case lru resize is disabled for this ns.
+        */
+       if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
+               RETURN(0);
+
+       /*
+        * When canceling locks on the client we do not need to maintain
+        * sharp timing; we only want to cancel locks ASAP according to the
+        * new SLV. This may be called when the SLV has changed a lot, which
+        * is why pl->pl_recalc_time is not taken into account here.
+        */
+       RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC,
+                              LDLM_CANCEL_LRUR));
+}
+
+/**
+ * This function is the main entry point for memory pressure handling on the
+ * client side. Its main goal is to cancel some number of locks in the passed
+ * \a pl according to \a nr and \a gfp_mask.
+ */
+static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
+                               int nr, unsigned int gfp_mask)
+{
+       struct ldlm_namespace *ns;
+       int canceled = 0, unused;
+
+       ns = ldlm_pl2ns(pl);
+
+       /*
+        * Do not cancel locks in case lru resize is disabled for this ns.
+        */
+       if (!ns_connect_lru_resize(ns))
+               RETURN(0);
+
+       /*
+        * Make sure that pool knows last SLV and Limit from obd.
+        */
+       ldlm_cli_pool_pop_slv(pl);
+
+       spin_lock(&ns->ns_lock);
+       unused = ns->ns_nr_unused;
+       spin_unlock(&ns->ns_lock);
+
+       if (nr) {
+               canceled = ldlm_cancel_lru(ns, nr, LCF_ASYNC,
+                                          LDLM_CANCEL_SHRINK);
+       }
+       /*
+        * Return the number of potentially reclaimable locks.
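+        * With the default sysctl_vfs_cache_pressure of 100 this is simply
+        * (unused - canceled) rounded down to a multiple of 100
+        * (illustrative: unused = 250, canceled = 50 gives 200).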
+        */
+       return ((unused - canceled) / 100) * sysctl_vfs_cache_pressure;
+}
+
+struct ldlm_pool_ops ldlm_srv_pool_ops = {
+       .po_recalc = ldlm_srv_pool_recalc,
+       .po_shrink = ldlm_srv_pool_shrink,
+       .po_setup  = ldlm_srv_pool_setup
+};
+
+struct ldlm_pool_ops ldlm_cli_pool_ops = {
+       .po_recalc = ldlm_cli_pool_recalc,
+       .po_shrink = ldlm_cli_pool_shrink
+};
+
+/**
+ * Pool recalc wrapper. Calls either the client or the server pool recalc
+ * callback depending on which kind of pool \a pl is.
+ */
+int ldlm_pool_recalc(struct ldlm_pool *pl)
+{
+       time_t recalc_interval_sec;
+       int count;
+
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec <= 0)
+               goto recalc;
+
+       spin_lock(&pl->pl_lock);
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec > 0) {
+               /*
+                * Update pool statistics every 1s.
+                */
+               ldlm_pool_recalc_stats(pl);
+
+               /*
+                * Zero out all rates and speed for the last period.
+                */
+               atomic_set(&pl->pl_grant_rate, 0);
+               atomic_set(&pl->pl_cancel_rate, 0);
+       }
+       spin_unlock(&pl->pl_lock);
+
+ recalc:
+       if (pl->pl_ops->po_recalc != NULL) {
+               count = pl->pl_ops->po_recalc(pl);
+               lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+                                   count);
+               return count;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(ldlm_pool_recalc);
+
+/**
+ * Pool shrink wrapper. Calls either the client or the server pool shrink
+ * callback depending on which kind of pool \a pl is.
+ */
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+                    unsigned int gfp_mask)
+{
+       int cancel = 0;
+
+       if (pl->pl_ops->po_shrink != NULL) {
+               cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask);
+               if (nr > 0) {
+                       lprocfs_counter_add(pl->pl_stats,
+                                           LDLM_POOL_SHRINK_REQTD_STAT,
+                                           nr);
+                       lprocfs_counter_add(pl->pl_stats,
+                                           LDLM_POOL_SHRINK_FREED_STAT,
+                                           cancel);
+                       CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, "
+                              "shrunk %d\n", pl->pl_name, nr, cancel);
+               }
+       }
+       return cancel;
+}
+EXPORT_SYMBOL(ldlm_pool_shrink);
+
+/**
+ * Pool setup wrapper. Calls either the client or the server pool setup
+ * callback depending on which kind of pool \a pl is.
+ *
+ * Sets passed \a limit into pool \a pl.
+ */
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit)
+{
+       if (pl->pl_ops->po_setup != NULL)
+               return(pl->pl_ops->po_setup(pl, limit));
+       return 0;
+}
+EXPORT_SYMBOL(ldlm_pool_setup);
+
+static int lprocfs_rd_pool_state(char *page, char **start, off_t off,
+                                int count, int *eof, void *data)
+{
+       int granted, grant_rate, cancel_rate, grant_step;
+       int nr = 0, grant_speed, grant_plan, lvf;
+       struct ldlm_pool *pl = data;
+       __u64 slv, clv;
+       __u32 limit;
+
+       spin_lock(&pl->pl_lock);
+       slv = pl->pl_server_lock_volume;
+       clv = pl->pl_client_lock_volume;
+       limit = ldlm_pool_get_limit(pl);
+       grant_plan = pl->pl_grant_plan;
+       granted = atomic_read(&pl->pl_granted);
+       grant_rate = atomic_read(&pl->pl_grant_rate);
+       cancel_rate = atomic_read(&pl->pl_cancel_rate);
+       grant_speed = grant_rate - cancel_rate;
+       lvf = atomic_read(&pl->pl_lock_volume_factor);
+       grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+       spin_unlock(&pl->pl_lock);
+
+       nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n",
+                      pl->pl_name);
+       nr += snprintf(page + nr, count - nr, "  SLV: "LPU64"\n", slv);
+       nr += snprintf(page + nr, count - nr, "  CLV: "LPU64"\n", clv);
+       nr += snprintf(page + nr, count - nr, "  LVF: %d\n", lvf);
+
+       if (ns_is_server(ldlm_pl2ns(pl))) {
+               nr += snprintf(page + nr, count - nr, "  GSP: %d%%\n",
+                              grant_step);
+               nr += snprintf(page + nr, count - nr, "  GP:  %d\n",
+                              grant_plan);
+       }
+       nr += snprintf(page + nr, count - nr, "  GR:  %d\n",
+                      grant_rate);
+       nr += snprintf(page + nr, count - nr, "  CR:  %d\n",
+                      cancel_rate);
+       nr += snprintf(page + nr, count - nr, "  GS:  %d\n",
+                      grant_speed);
+       nr += snprintf(page + nr, count - nr, "  G:   %d\n",
+                      granted);
+       nr += snprintf(page + nr, count - nr, "  L:   %d\n",
+                      limit);
+       return nr;
+}
+
+static int lprocfs_rd_grant_speed(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+       struct ldlm_pool *pl = data;
+       int            grant_speed;
+
+       spin_lock(&pl->pl_lock);
+       /* serialize with ldlm_pool_recalc */
+       grant_speed = atomic_read(&pl->pl_grant_rate) -
+                       atomic_read(&pl->pl_cancel_rate);
+       spin_unlock(&pl->pl_lock);
+       return lprocfs_rd_uint(page, start, off, count, eof, &grant_speed);
+}
+
+LDLM_POOL_PROC_READER(grant_plan, int);
+LDLM_POOL_PROC_READER(recalc_period, int);
+LDLM_POOL_PROC_WRITER(recalc_period, int);
+
+static int ldlm_pool_proc_init(struct ldlm_pool *pl)
+{
+       struct ldlm_namespace *ns = ldlm_pl2ns(pl);
+       struct proc_dir_entry *parent_ns_proc;
+       struct lprocfs_vars pool_vars[2];
+       char *var_name = NULL;
+       int rc = 0;
+       ENTRY;
+
+       OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
+       if (!var_name)
+               RETURN(-ENOMEM);
+
+       parent_ns_proc = lprocfs_srch(ldlm_ns_proc_dir,
+                                     ldlm_ns_name(ns));
+       if (parent_ns_proc == NULL) {
+               CERROR("%s: proc entry is not initialized\n",
+                      ldlm_ns_name(ns));
+               GOTO(out_free_name, rc = -EINVAL);
+       }
+       pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
+                                          NULL, NULL);
+       if (IS_ERR(pl->pl_proc_dir)) {
+               CERROR("LProcFS failed in ldlm-pool-init\n");
+               rc = PTR_ERR(pl->pl_proc_dir);
+               GOTO(out_free_name, rc);
+       }
+
+       var_name[MAX_STRING_SIZE] = '\0';
+       memset(pool_vars, 0, sizeof(pool_vars));
+       pool_vars[0].name = var_name;
+
+       snprintf(var_name, MAX_STRING_SIZE, "server_lock_volume");
+       pool_vars[0].data = &pl->pl_server_lock_volume;
+       pool_vars[0].read_fptr = lprocfs_rd_u64;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "limit");
+       pool_vars[0].data = &pl->pl_limit;
+       pool_vars[0].read_fptr = lprocfs_rd_atomic;
+       pool_vars[0].write_fptr = lprocfs_wr_atomic;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "granted");
+       pool_vars[0].data = &pl->pl_granted;
+       pool_vars[0].read_fptr = lprocfs_rd_atomic;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "grant_speed");
+       pool_vars[0].data = pl;
+       pool_vars[0].read_fptr = lprocfs_rd_grant_speed;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "cancel_rate");
+       pool_vars[0].data = &pl->pl_cancel_rate;
+       pool_vars[0].read_fptr = lprocfs_rd_atomic;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "grant_rate");
+       pool_vars[0].data = &pl->pl_grant_rate;
+       pool_vars[0].read_fptr = lprocfs_rd_atomic;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "grant_plan");
+       pool_vars[0].data = pl;
+       pool_vars[0].read_fptr = lprocfs_rd_grant_plan;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "recalc_period");
+       pool_vars[0].data = pl;
+       pool_vars[0].read_fptr = lprocfs_rd_recalc_period;
+       pool_vars[0].write_fptr = lprocfs_wr_recalc_period;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor");
+       pool_vars[0].data = &pl->pl_lock_volume_factor;
+       pool_vars[0].read_fptr = lprocfs_rd_atomic;
+       pool_vars[0].write_fptr = lprocfs_wr_atomic;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       snprintf(var_name, MAX_STRING_SIZE, "state");
+       pool_vars[0].data = pl;
+       pool_vars[0].read_fptr = lprocfs_rd_pool_state;
+       lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+
+       pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
+                                          LDLM_POOL_FIRST_STAT, 0);
+       if (!pl->pl_stats)
+               GOTO(out_free_name, rc = -ENOMEM);
+
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "granted", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "grant", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "cancel", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "grant_rate", "locks/s");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "cancel_rate", "locks/s");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "grant_plan", "locks/s");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "slv", "slv");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "shrink_request", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "shrink_freed", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "recalc_freed", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "recalc_timing", "sec");
+       rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);
+
+       EXIT;
+out_free_name:
+       OBD_FREE(var_name, MAX_STRING_SIZE + 1);
+       return rc;
+}
+
+static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
+{
+       if (pl->pl_stats != NULL) {
+               lprocfs_free_stats(&pl->pl_stats);
+               pl->pl_stats = NULL;
+       }
+       if (pl->pl_proc_dir != NULL) {
+               lprocfs_remove(&pl->pl_proc_dir);
+               pl->pl_proc_dir = NULL;
+       }
+}
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+                  int idx, ldlm_side_t client)
+{
+       int rc;
+       ENTRY;
+
+       spin_lock_init(&pl->pl_lock);
+       atomic_set(&pl->pl_granted, 0);
+       pl->pl_recalc_time = cfs_time_current_sec();
+       atomic_set(&pl->pl_lock_volume_factor, 1);
+
+       atomic_set(&pl->pl_grant_rate, 0);
+       atomic_set(&pl->pl_cancel_rate, 0);
+       pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L);
+
+       snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
+                ldlm_ns_name(ns), idx);
+
+       if (client == LDLM_NAMESPACE_SERVER) {
+               pl->pl_ops = &ldlm_srv_pool_ops;
+               ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
+               pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD;
+               pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L);
+       } else {
+               ldlm_pool_set_limit(pl, 1);
+               pl->pl_server_lock_volume = 0;
+               pl->pl_ops = &ldlm_cli_pool_ops;
+               pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD;
+       }
+       pl->pl_client_lock_volume = 0;
+       rc = ldlm_pool_proc_init(pl);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_pool_init);
+
+void ldlm_pool_fini(struct ldlm_pool *pl)
+{
+       ENTRY;
+       ldlm_pool_proc_fini(pl);
+
+       /*
+        * The pool should not be used after this point. We can't free it
+        * here as it lives in struct ldlm_namespace, but we are still
+        * interested in catching any abnormal use.
+        */
+       POISON(pl, 0x5a, sizeof(*pl));
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_pool_fini);
+
+/**
+ * Add new taken ldlm lock \a lock into pool \a pl accounting.
+ */
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+       /*
+        * FLOCK locks are special in the sense that they are almost never
+        * canceled; instead, a special kind of lock is used to drop them.
+        * Also, there is no LRU for flock locks, so there is no point in
+        * tracking them anyway.
+        */
+       if (lock->l_resource->lr_type == LDLM_FLOCK)
+               return;
+
+       atomic_inc(&pl->pl_granted);
+       atomic_inc(&pl->pl_grant_rate);
+       lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT);
+       /*
+        * Do not do a pool recalc on the client side, as all locks which
+        * may potentially be canceled have already been packed into the
+        * enqueue/cancel RPC. Also, we do not want to run out of stack
+        * with too-long call paths.
+        */
+       if (ns_is_server(ldlm_pl2ns(pl)))
+               ldlm_pool_recalc(pl);
+}
+EXPORT_SYMBOL(ldlm_pool_add);
+
+/**
+ * Remove ldlm lock \a lock from pool \a pl accounting.
+ */
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+       /*
+        * Filter out FLOCK locks. See the comment above in ldlm_pool_add().
+        */
+       if (lock->l_resource->lr_type == LDLM_FLOCK)
+               return;
+
+       LASSERT(atomic_read(&pl->pl_granted) > 0);
+       atomic_dec(&pl->pl_granted);
+       atomic_inc(&pl->pl_cancel_rate);
+
+       lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT);
+
+       if (ns_is_server(ldlm_pl2ns(pl)))
+               ldlm_pool_recalc(pl);
+}
+EXPORT_SYMBOL(ldlm_pool_del);
+
+/**
+ * Returns current \a pl SLV.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
+{
+       __u64 slv;
+       spin_lock(&pl->pl_lock);
+       slv = pl->pl_server_lock_volume;
+       spin_unlock(&pl->pl_lock);
+       return slv;
+}
+EXPORT_SYMBOL(ldlm_pool_get_slv);
+
+/**
+ * Sets passed \a slv to \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
+{
+       spin_lock(&pl->pl_lock);
+       pl->pl_server_lock_volume = slv;
+       spin_unlock(&pl->pl_lock);
+}
+EXPORT_SYMBOL(ldlm_pool_set_slv);
+
+/**
+ * Returns current \a pl CLV.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl)
+{
+       __u64 slv;
+       spin_lock(&pl->pl_lock);
+       slv = pl->pl_client_lock_volume;
+       spin_unlock(&pl->pl_lock);
+       return slv;
+}
+EXPORT_SYMBOL(ldlm_pool_get_clv);
+
+/**
+ * Sets passed \a clv to \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv)
+{
+       spin_lock(&pl->pl_lock);
+       pl->pl_client_lock_volume = clv;
+       spin_unlock(&pl->pl_lock);
+}
+EXPORT_SYMBOL(ldlm_pool_set_clv);
+
+/**
+ * Returns current \a pl limit.
+ */
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
+{
+       return atomic_read(&pl->pl_limit);
+}
+EXPORT_SYMBOL(ldlm_pool_get_limit);
+
+/**
+ * Sets passed \a limit to \a pl.
+ */
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
+{
+       atomic_set(&pl->pl_limit, limit);
+}
+EXPORT_SYMBOL(ldlm_pool_set_limit);
+
+/**
+ * Returns current LVF from \a pl.
+ */
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl)
+{
+       return atomic_read(&pl->pl_lock_volume_factor);
+}
+EXPORT_SYMBOL(ldlm_pool_get_lvf);
+
+static int ldlm_pool_granted(struct ldlm_pool *pl)
+{
+       return atomic_read(&pl->pl_granted);
+}
+
+static struct ptlrpc_thread *ldlm_pools_thread;
+static struct shrinker *ldlm_pools_srv_shrinker;
+static struct shrinker *ldlm_pools_cli_shrinker;
+static struct completion ldlm_pools_comp;
+
+/*
+ * Cancel \a nr locks from all namespaces (if possible). Returns the number of
+ * cached locks after the shrink is finished. All namespaces are asked to
+ * cancel an approximately equal number of locks to keep things balanced.
+ */
+static int ldlm_pools_shrink(ldlm_side_t client, int nr,
+                            unsigned int gfp_mask)
+{
+       int total = 0, cached = 0, nr_ns;
+       struct ldlm_namespace *ns;
+       void *cookie;
+
+       if (client == LDLM_NAMESPACE_CLIENT && nr != 0 &&
+           !(gfp_mask & __GFP_FS))
+               return -1;
+
+       CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n",
+              nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
+
+       cookie = cl_env_reenter();
+
+       /*
+        * Find out how many resources we may release.
+        */
+       for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+            nr_ns > 0; nr_ns--)
+       {
+               mutex_lock(ldlm_namespace_lock(client));
+               if (list_empty(ldlm_namespace_list(client))) {
+                       mutex_unlock(ldlm_namespace_lock(client));
+                       cl_env_reexit(cookie);
+                       return 0;
+               }
+               ns = ldlm_namespace_first_locked(client);
+               ldlm_namespace_get(ns);
+               ldlm_namespace_move_locked(ns, client);
+               mutex_unlock(ldlm_namespace_lock(client));
+               total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
+               ldlm_namespace_put(ns);
+       }
+
+       if (nr == 0 || total == 0) {
+               cl_env_reexit(cookie);
+               return total;
+       }
+
+       /*
+        * Shrink at least ldlm_namespace_nr(client) namespaces.
+        */
+       for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+            nr_ns > 0; nr_ns--)
+       {
+               int cancel, nr_locks;
+
+               /*
+                * Do not call shrink under ldlm_namespace_lock(client)
+                */
+               mutex_lock(ldlm_namespace_lock(client));
+               if (list_empty(ldlm_namespace_list(client))) {
+                       mutex_unlock(ldlm_namespace_lock(client));
+                       /*
+                        * If the list is empty, we can't return any
+                        * @cached > 0; that would probably cause a needless
+                        * shrinker call.
+                        */
+                       cached = 0;
+                       break;
+               }
+               ns = ldlm_namespace_first_locked(client);
+               ldlm_namespace_get(ns);
+               ldlm_namespace_move_locked(ns, client);
+               mutex_unlock(ldlm_namespace_lock(client));
+
+               nr_locks = ldlm_pool_granted(&ns->ns_pool);
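+               /*
+                * Ask each namespace for a share proportional to its granted
+                * locks (illustrative: nr = 100, total = 1000, a namespace
+                * holding 300 granted locks is asked to cancel
+                * 1 + 300 * 100 / 1000 = 31).
+                */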
+               cancel = 1 + nr_locks * nr / total;
+               ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
+               cached += ldlm_pool_granted(&ns->ns_pool);
+               ldlm_namespace_put(ns);
+       }
+       cl_env_reexit(cookie);
+       /* We only decrease the SLV in the server pools' shrinker; return -1
+        * to the kernel to avoid a needless loop. LU-1128 */
+       return (client == LDLM_NAMESPACE_SERVER) ? -1 : cached;
+}
+
+static int ldlm_pools_srv_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+       return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER,
+                                shrink_param(sc, nr_to_scan),
+                                shrink_param(sc, gfp_mask));
+}
+
+static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+       return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT,
+                                shrink_param(sc, nr_to_scan),
+                                shrink_param(sc, gfp_mask));
+}
+
+void ldlm_pools_recalc(ldlm_side_t client)
+{
+       __u32 nr_l = 0, nr_p = 0, l;
+       struct ldlm_namespace *ns;
+       int nr, equal = 0;
+
+       /*
+        * No need to setup pool limit for client pools.
+        */
+       if (client == LDLM_NAMESPACE_SERVER) {
+               /*
+                * Check all modest namespaces first.
+                */
+               mutex_lock(ldlm_namespace_lock(client));
+               list_for_each_entry(ns, ldlm_namespace_list(client),
+                                       ns_list_chain)
+               {
+                       if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
+                               continue;
+
+                       l = ldlm_pool_granted(&ns->ns_pool);
+                       if (l == 0)
+                               l = 1;
+
+                       /*
+                        * Set the modest pools limit equal to their avg granted
+                        * locks + ~6%.
+                        */
+                       l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0);
+                       ldlm_pool_setup(&ns->ns_pool, l);
+                       nr_l += l;
+                       nr_p++;
+               }
+
+               /*
+                * Make sure that modest namespaces did not eat more than 2/3
+                * of the limit.
+                */
+               if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
+                       CWARN("\"Modest\" pools eat out 2/3 of server locks "
+                             "limit (%d of %lu). This means that you have too "
+                             "many clients for this amount of server RAM. "
+                             "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L);
+                       equal = 1;
+               }
+
+               /*
+                * The rest is given to greedy namespaces.
+                */
+               list_for_each_entry(ns, ldlm_namespace_list(client),
+                                       ns_list_chain)
+               {
+                       if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
+                               continue;
+
+                       if (equal) {
+                               /*
+                                * In case 2/3 of the locks are eaten up by
+                                * modest pools, we re-set an equal limit
+                                * for _all_ pools.
+                                */
+                               l = LDLM_POOL_HOST_L /
+                                       atomic_read(
+                                               ldlm_namespace_nr(client));
+                       } else {
+                               /*
+                                * All the remaining greedy pools share the
+                                * remaining locks in equal parts.
+                                */
+                               l = (LDLM_POOL_HOST_L - nr_l) /
+                                       (atomic_read(
+                                               ldlm_namespace_nr(client)) -
+                                        nr_p);
+                       }
+                       ldlm_pool_setup(&ns->ns_pool, l);
+               }
+               mutex_unlock(ldlm_namespace_lock(client));
+       }
+
+       /*
+        * Recalc at least ldlm_namespace_nr(client) namespaces.
+        */
+       for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
+               int     skip;
+               /*
+                * Lock the list, get the first @ns in the list, take a
+                * reference, move it to the tail, unlock and call pool
+                * recalc. This way we avoid calling recalc under the @ns
+                * lock, which is really good as it removes a potential
+                * deadlock on client nodes when canceling locks
+                * synchronously.
+                */
+               mutex_lock(ldlm_namespace_lock(client));
+               if (list_empty(ldlm_namespace_list(client))) {
+                       mutex_unlock(ldlm_namespace_lock(client));
+                       break;
+               }
+               ns = ldlm_namespace_first_locked(client);
+
+               spin_lock(&ns->ns_lock);
+               /*
+                * Skip an ns which is being freed; we don't want to increase
+                * its refcount again, not even temporarily. bz21519 & LU-499.
+                */
+               if (ns->ns_stopping) {
+                       skip = 1;
+               } else {
+                       skip = 0;
+                       ldlm_namespace_get(ns);
+               }
+               spin_unlock(&ns->ns_lock);
+
+               ldlm_namespace_move_locked(ns, client);
+               mutex_unlock(ldlm_namespace_lock(client));
+
+               /*
+                * After setup is done, recalc the pool.
+                */
+               if (!skip) {
+                       ldlm_pool_recalc(&ns->ns_pool);
+                       ldlm_namespace_put(ns);
+               }
+       }
+}
+EXPORT_SYMBOL(ldlm_pools_recalc);
+
+static int ldlm_pools_thread_main(void *arg)
+{
+       struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+       ENTRY;
+
+       thread_set_flags(thread, SVC_RUNNING);
+       wake_up(&thread->t_ctl_waitq);
+
+       CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
+               "ldlm_poold", current_pid());
+
+       while (1) {
+               struct l_wait_info lwi;
+
+               /*
+                * Recalc all pools on this tick.
+                */
+               ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
+               ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
+
+               /*
+                * Wait until the next check time, or until we're
+                * stopped.
+                */
+               lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
+                                 NULL, NULL);
+               l_wait_event(thread->t_ctl_waitq,
+                            thread_is_stopping(thread) ||
+                            thread_is_event(thread),
+                            &lwi);
+
+               if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+                       break;
+               else
+                       thread_test_and_clear_flags(thread, SVC_EVENT);
+       }
+
+       thread_set_flags(thread, SVC_STOPPED);
+       wake_up(&thread->t_ctl_waitq);
+
+       CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
+               "ldlm_poold", current_pid());
+
+       complete_and_exit(&ldlm_pools_comp, 0);
+}
+
+static int ldlm_pools_thread_start(void)
+{
+       struct l_wait_info lwi = { 0 };
+       task_t *task;
+       ENTRY;
+
+       if (ldlm_pools_thread != NULL)
+               RETURN(-EALREADY);
+
+       OBD_ALLOC_PTR(ldlm_pools_thread);
+       if (ldlm_pools_thread == NULL)
+               RETURN(-ENOMEM);
+
+       init_completion(&ldlm_pools_comp);
+       init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq);
+
+       task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread,
+                          "ldlm_poold");
+       if (IS_ERR(task)) {
+               CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task));
+               OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
+               ldlm_pools_thread = NULL;
+               RETURN(PTR_ERR(task));
+       }
+       l_wait_event(ldlm_pools_thread->t_ctl_waitq,
+                    thread_is_running(ldlm_pools_thread), &lwi);
+       RETURN(0);
+}
+
+static void ldlm_pools_thread_stop(void)
+{
+       ENTRY;
+
+       if (ldlm_pools_thread == NULL) {
+               EXIT;
+               return;
+       }
+
+       thread_set_flags(ldlm_pools_thread, SVC_STOPPING);
+       wake_up(&ldlm_pools_thread->t_ctl_waitq);
+
+       /*
+        * Make sure that the pools thread has finished before freeing
+        * @thread. This fixes a possible race and oops caused by accessing
+        * freed memory from the pools thread.
+        */
+       wait_for_completion(&ldlm_pools_comp);
+       OBD_FREE_PTR(ldlm_pools_thread);
+       ldlm_pools_thread = NULL;
+       EXIT;
+}
+
+int ldlm_pools_init(void)
+{
+       int rc;
+       ENTRY;
+
+       rc = ldlm_pools_thread_start();
+       if (rc == 0) {
+               ldlm_pools_srv_shrinker =
+                       set_shrinker(DEFAULT_SEEKS,
+                                        ldlm_pools_srv_shrink);
+               ldlm_pools_cli_shrinker =
+                       set_shrinker(DEFAULT_SEEKS,
+                                        ldlm_pools_cli_shrink);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_pools_init);
+
+void ldlm_pools_fini(void)
+{
+       if (ldlm_pools_srv_shrinker != NULL) {
+               remove_shrinker(ldlm_pools_srv_shrinker);
+               ldlm_pools_srv_shrinker = NULL;
+       }
+       if (ldlm_pools_cli_shrinker != NULL) {
+               remove_shrinker(ldlm_pools_cli_shrinker);
+               ldlm_pools_cli_shrinker = NULL;
+       }
+       ldlm_pools_thread_stop();
+}
+EXPORT_SYMBOL(ldlm_pools_fini);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
new file mode 100644 (file)
index 0000000..1a690ed
--- /dev/null
@@ -0,0 +1,2333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/**
+ * This file contains Asynchronous System Trap (AST) handlers and related
+ * LDLM request-processing routines.
+ *
+ * An AST is a callback issued on a lock when its state is changed. There are
+ * several different types of ASTs (callbacks) registered for each lock:
+ *
+ * - completion AST: when a lock is enqueued by some process, but cannot be
+ *   granted immediately due to other conflicting locks on the same resource,
+ *   the completion AST is sent to notify the caller when the lock is
+ *   eventually granted
+ *
+ * - blocking AST: when a lock is granted to some process, if another process
+ *   enqueues a conflicting (blocking) lock on a resource, a blocking AST is
+ *   sent to notify the holder(s) of the lock(s) of the conflicting lock
+ *   request. The lock holder(s) must release their lock(s) on that resource in
+ *   a timely manner or be evicted by the server.
+ *
+ * - glimpse AST: this is used when a process wants information about a lock
+ *   (i.e. the lock value block (LVB)) but does not necessarily require holding
+ *   the lock. If the resource is locked, the lock holder(s) are sent glimpse
+ *   ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL
+ *   their lock(s) if they are idle. If the resource is not locked, the server
+ *   may grant the lock.
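+ *
+ * As a rough sketch (using names that appear later in this file), a caller
+ * supplies these handlers through a struct ldlm_callback_suite, e.g. the
+ * .lcs_completion, .lcs_blocking and .lcs_glimpse fields, when enqueuing a
+ * lock via a helper such as ldlm_cli_enqueue_local().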
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <obd.h>
+
+#include "ldlm_internal.h"
+
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+               "lock enqueue timeout minimum");
+
+/* On the client side, whether cached locks will be canceled before replay */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
+static void interrupted_completion_wait(void *data)
+{
+}
+
+struct lock_wait_data {
+       struct ldlm_lock *lwd_lock;
+       __u32        lwd_conn_cnt;
+};
+
+struct ldlm_async_args {
+       struct lustre_handle lock_handle;
+};
+
+int ldlm_expired_completion_wait(void *data)
+{
+       struct lock_wait_data *lwd = data;
+       struct ldlm_lock *lock = lwd->lwd_lock;
+       struct obd_import *imp;
+       struct obd_device *obd;
+
+       ENTRY;
+       if (lock->l_conn_export == NULL) {
+               static cfs_time_t next_dump = 0, last_dump = 0;
+
+               if (ptlrpc_check_suspend())
+                       RETURN(0);
+
+               LCONSOLE_WARN("lock timed out (enqueued at "CFS_TIME_T", "
+                             CFS_DURATION_T"s ago)\n",
+                             lock->l_last_activity,
+                             cfs_time_sub(cfs_time_current_sec(),
+                                          lock->l_last_activity));
+               LDLM_DEBUG(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+                          CFS_DURATION_T"s ago); not entering recovery in "
+                          "server code, just going back to sleep",
+                          lock->l_last_activity,
+                          cfs_time_sub(cfs_time_current_sec(),
+                                       lock->l_last_activity));
+               if (cfs_time_after(cfs_time_current(), next_dump)) {
+                       last_dump = next_dump;
+                       next_dump = cfs_time_shift(300);
+                       ldlm_namespace_dump(D_DLMTRACE,
+                                           ldlm_lock_to_ns(lock));
+                       if (last_dump == 0)
+                               libcfs_debug_dumplog();
+               }
+               RETURN(0);
+       }
+
+       obd = lock->l_conn_export->exp_obd;
+       imp = obd->u.cli.cl_import;
+       ptlrpc_fail_import(imp, lwd->lwd_conn_cnt);
+       LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+                 CFS_DURATION_T"s ago), entering recovery for %s@%s",
+                 lock->l_last_activity,
+                 cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity),
+                 obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_expired_completion_wait);
+
+/* We use the same basis for both server side and client side functions
+   from a single node. */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+       int timeout = at_get(ldlm_lock_to_ns_at(lock));
+       if (AT_OFF)
+               return obd_timeout / 2;
+       /* Since these are non-updating timeouts, we should be conservative.
+          It would be nice to have some kind of "early reply" mechanism for
+          lock callbacks too... */
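+       /* Illustrative: an AT estimate of 20s yields min(at_max, 30s), and
+        * the result is never below ldlm_enqueue_min. */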
+       timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */
+       return max(timeout, ldlm_enqueue_min);
+}
+EXPORT_SYMBOL(ldlm_get_enq_timeout);
+
+/**
+ * Helper function for ldlm_completion_ast(), updating timings when lock is
+ * actually granted.
+ */
+static int ldlm_completion_tail(struct ldlm_lock *lock)
+{
+       long delay;
+       int  result;
+
+       if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED) {
+               LDLM_DEBUG(lock, "client-side enqueue: destroyed");
+               result = -EIO;
+       } else {
+               delay = cfs_time_sub(cfs_time_current_sec(),
+                                    lock->l_last_activity);
+               LDLM_DEBUG(lock, "client-side enqueue: granted after "
+                          CFS_DURATION_T"s", delay);
+
+               /* Update our time estimate */
+               at_measured(ldlm_lock_to_ns_at(lock),
+                           delay);
+               result = 0;
+       }
+       return result;
+}
+
+/**
+ * Implementation of ->l_completion_ast() for a client that doesn't wait
+ * until the lock is granted. Suitable for locks enqueued through ptlrpcd or
+ * other threads that cannot block for long.
+ */
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+       ENTRY;
+
+       if (flags == LDLM_FL_WAIT_NOREPROC) {
+               LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+               RETURN(0);
+       }
+
+       if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+                      LDLM_FL_BLOCK_CONV))) {
+               wake_up(&lock->l_waitq);
+               RETURN(ldlm_completion_tail(lock));
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+                  "going forward");
+       ldlm_reprocess_all(lock->l_resource);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_completion_ast_async);
+
+/**
+ * Generic LDLM "completion" AST. This is called in several cases:
+ *
+ *     - when a reply to an ENQUEUE RPC is received from the server
+ *       (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at
+ *       this point (determined by flags);
+ *
+ *     - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has
+ *       been granted;
+ *
+ *     - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock
+ *       gets correct lvb;
+ *
+ *     - to force all locks when resource is destroyed (cleanup_resource());
+ *
+ *     - during lock conversion (not used currently).
+ *
+ * If the lock is not granted in the first case, this function waits until the
+ * second or penultimate case happens in some other thread.
+ *
+ */
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+       /* XXX ALLOCATE - 160 bytes */
+       struct lock_wait_data lwd;
+       struct obd_device *obd;
+       struct obd_import *imp = NULL;
+       struct l_wait_info lwi;
+       __u32 timeout;
+       int rc = 0;
+       ENTRY;
+
+       if (flags == LDLM_FL_WAIT_NOREPROC) {
+               LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+               goto noreproc;
+       }
+
+       if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+                      LDLM_FL_BLOCK_CONV))) {
+               wake_up(&lock->l_waitq);
+               RETURN(0);
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+                  "sleeping");
+
+noreproc:
+
+       obd = class_exp2obd(lock->l_conn_export);
+
+       /* if this is a local lock, then there is no import */
+       if (obd != NULL) {
+               imp = obd->u.cli.cl_import;
+       }
+
+       /* Wait a long time for enqueue - server may have to callback a
+          lock from another client.  Server will evict the other client if it
+          doesn't respond reasonably, and then give us the lock. */
+       timeout = ldlm_get_enq_timeout(lock) * 2;
+
+       lwd.lwd_lock = lock;
+
+       if (lock->l_flags & LDLM_FL_NO_TIMEOUT) {
+               LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
+               lwi = LWI_INTR(interrupted_completion_wait, &lwd);
+       } else {
+               lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+                                      ldlm_expired_completion_wait,
+                                      interrupted_completion_wait, &lwd);
+       }
+
+       if (imp != NULL) {
+               spin_lock(&imp->imp_lock);
+               lwd.lwd_conn_cnt = imp->imp_conn_cnt;
+               spin_unlock(&imp->imp_lock);
+       }
+
+       if (ns_is_client(ldlm_lock_to_ns(lock)) &&
+           OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
+                                OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
+               lock->l_flags |= LDLM_FL_FAIL_LOC;
+               rc = -EINTR;
+       } else {
+               /* Go to sleep until the lock is granted or cancelled. */
+               rc = l_wait_event(lock->l_waitq,
+                                 is_granted_or_cancelled(lock), &lwi);
+       }
+
+       if (rc) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+                          rc);
+               RETURN(rc);
+       }
+
+       RETURN(ldlm_completion_tail(lock));
+}
+EXPORT_SYMBOL(ldlm_completion_ast);
+
+/**
+ * A helper to build a blocking AST function
+ *
+ * Perform a common operation for blocking ASTs:
+ * deferred lock cancellation.
+ *
+ * \param lock the lock blocking or canceling AST was called on
+ * \retval 0
+ * \see mdt_blocking_ast
+ * \see ldlm_blocking_ast
+ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock)
+{
+       int do_ast;
+       ENTRY;
+
+       lock->l_flags |= LDLM_FL_CBPENDING;
+       do_ast = (!lock->l_readers && !lock->l_writers);
+       unlock_res_and_lock(lock);
+
+       if (do_ast) {
+               struct lustre_handle lockh;
+               int rc;
+
+               LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
+               ldlm_lock2handle(lock, &lockh);
+               rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+               if (rc < 0)
+                       CERROR("ldlm_cli_cancel: %d\n", rc);
+       } else {
+               LDLM_DEBUG(lock, "Lock still has references, will be "
+                          "cancelled later");
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_blocking_ast_nocheck);
+
+/**
+ * Server blocking AST
+ *
+ * ->l_blocking_ast() callback for LDLM locks acquired by server-side
+ * OBDs.
+ *
+ * \param lock the lock which blocks a request or is being canceled
+ * \param desc unused
+ * \param data unused
+ * \param flag indicates whether this is a cancelling or blocking callback
+ * \retval 0
+ * \see ldlm_blocking_ast_nocheck
+ */
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                     void *data, int flag)
+{
+       ENTRY;
+
+       if (flag == LDLM_CB_CANCELING) {
+               /* Don't need to do anything here. */
+               RETURN(0);
+       }
+
+       lock_res_and_lock(lock);
+       /* Get this: if ldlm_blocking_ast is racing with intent_policy, such
+        * that ldlm_blocking_ast is called just before intent_policy method
+        * takes the lr_lock, then by the time we get the lock, we might not
+        * be the correct blocking function anymore.  So check, and return
+        * early, if so. */
+       if (lock->l_blocking_ast != ldlm_blocking_ast) {
+               unlock_res_and_lock(lock);
+               RETURN(0);
+       }
+       RETURN(ldlm_blocking_ast_nocheck(lock));
+}
+EXPORT_SYMBOL(ldlm_blocking_ast);
+
+/**
+ * ->l_glimpse_ast() for DLM extent locks acquired on the server-side. See
+ * comment in filter_intent_policy() on why you may need this.
+ */
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp)
+{
+       /*
+        * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for
+        * that is rather subtle: with OST-side locking, it may so happen that
+        * _all_ extent locks are held by the OST. If client wants to obtain
+        * current file size it calls ll{,u}_glimpse_size(), and (as locks are
+        * on the server), dummy glimpse callback fires and does
+        * nothing. Client still receives correct file size due to the
+        * following fragment in filter_intent_policy():
+        *
+        * rc = l->l_glimpse_ast(l, NULL); // this will update the LVB
+        * if (rc != 0 && res->lr_namespace->ns_lvbo &&
+        *     res->lr_namespace->ns_lvbo->lvbo_update) {
+        *       res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1);
+        * }
+        *
+        * that is, after glimpse_ast() fails, filter_lvbo_update() runs, and
+        * returns correct file size to the client.
+        */
+       return -ELDLM_NO_LOCK_DATA;
+}
+EXPORT_SYMBOL(ldlm_glimpse_ast);
+
+/**
+ * Enqueue a local lock (typically on a server).
+ */
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+                          const struct ldlm_res_id *res_id,
+                          ldlm_type_t type, ldlm_policy_data_t *policy,
+                          ldlm_mode_t mode, __u64 *flags,
+                          ldlm_blocking_callback blocking,
+                          ldlm_completion_callback completion,
+                          ldlm_glimpse_callback glimpse,
+                          void *data, __u32 lvb_len, enum lvb_type lvb_type,
+                          const __u64 *client_cookie,
+                          struct lustre_handle *lockh)
+{
+       struct ldlm_lock *lock;
+       int err;
+       const struct ldlm_callback_suite cbs = { .lcs_completion = completion,
+                                                .lcs_blocking   = blocking,
+                                                .lcs_glimpse    = glimpse,
+       };
+       ENTRY;
+
+       LASSERT(!(*flags & LDLM_FL_REPLAY));
+       if (unlikely(ns_is_client(ns))) {
+               CERROR("Trying to enqueue local lock in a shadow namespace\n");
+               LBUG();
+       }
+
+       lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len,
+                               lvb_type);
+       if (unlikely(!lock))
+               GOTO(out_nolock, err = -ENOMEM);
+
+       ldlm_lock2handle(lock, lockh);
+
+       /* NB: we do not hold the resource lock here (no lock_res_and_lock())
+        * because it's a new lock */
+       ldlm_lock_addref_internal_nolock(lock, mode);
+       lock->l_flags |= LDLM_FL_LOCAL;
+       if (*flags & LDLM_FL_ATOMIC_CB)
+               lock->l_flags |= LDLM_FL_ATOMIC_CB;
+
+       if (policy != NULL)
+               lock->l_policy_data = *policy;
+       if (client_cookie != NULL)
+               lock->l_client_cookie = *client_cookie;
+       if (type == LDLM_EXTENT)
+               lock->l_req_extent = policy->l_extent;
+
+       err = ldlm_lock_enqueue(ns, &lock, policy, flags);
+       if (unlikely(err != ELDLM_OK))
+               GOTO(out, err);
+
+       if (policy != NULL)
+               *policy = lock->l_policy_data;
+
+       if (lock->l_completion_ast)
+               lock->l_completion_ast(lock, *flags, NULL);
+
+       LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
+       EXIT;
+ out:
+       LDLM_LOCK_RELEASE(lock);
+ out_nolock:
+       return err;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_local);
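+
+/*
+ * Illustrative sketch only, not a call path added by this patch: a
+ * server-side user of ldlm_cli_enqueue_local() would typically look
+ * roughly like the following, where "res_id", "policy", "blocking_cb"
+ * and "completion_cb" are hypothetical caller-supplied names and
+ * LVB_T_NONE is assumed to mean "no LVB":
+ *
+ *     struct lustre_handle lockh;
+ *     __u64 flags = 0;
+ *     int rc;
+ *
+ *     rc = ldlm_cli_enqueue_local(ns, &res_id, LDLM_IBITS, &policy, LCK_EX,
+ *                                 &flags, blocking_cb, completion_cb, NULL,
+ *                                 NULL, 0, LVB_T_NONE, NULL, &lockh);
+ *     if (rc != ELDLM_OK)
+ *             return rc;
+ */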
+
+static void failed_lock_cleanup(struct ldlm_namespace *ns,
+                               struct ldlm_lock *lock, int mode)
+{
+       int need_cancel = 0;
+
+       /* Set a flag to prevent us from sending a CANCEL (bug 407) */
+       lock_res_and_lock(lock);
+       /* Check that lock is not granted or failed, we might race. */
+       if ((lock->l_req_mode != lock->l_granted_mode) &&
+           !(lock->l_flags & LDLM_FL_FAILED)) {
+               /* Make sure that this lock will not be found by a racing
+                * bl_ast and that an -EINVAL reply is sent to the server
+                * anyway (bug 17645). */
+               lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED |
+                                LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING;
+               need_cancel = 1;
+       }
+       unlock_res_and_lock(lock);
+
+       if (need_cancel)
+               LDLM_DEBUG(lock,
+                          "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | "
+                          "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING");
+       else
+               LDLM_DEBUG(lock, "lock was granted or failed in race");
+
+       ldlm_lock_decref_internal(lock, mode);
+
+       /* XXX - HACK because we shouldn't call ldlm_lock_destroy()
+        *       from llite/file.c/ll_file_flock(). */
+       /* This code compensates for the fact that we do not have a blocking
+        * handler on the client for flock locks. As such, this is the place
+        * where we must completely kill failed locks (those that were
+        * interrupted, and those that were waiting to be granted when the
+        * server evicted us). */
+       if (lock->l_resource->lr_type == LDLM_FLOCK) {
+               lock_res_and_lock(lock);
+               ldlm_resource_unlink_lock(lock);
+               ldlm_lock_destroy_nolock(lock);
+               unlock_res_and_lock(lock);
+       }
+}
+
+/**
+ * Finishing portion of client lock enqueue code.
+ *
+ * Called after receiving reply from server.
+ */
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+                         ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+                         __u64 *flags, void *lvb, __u32 lvb_len,
+                         struct lustre_handle *lockh, int rc)
+{
+       struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+       int is_replay = *flags & LDLM_FL_REPLAY;
+       struct ldlm_lock *lock;
+       struct ldlm_reply *reply;
+       int cleanup_phase = 1;
+       int size = 0;
+       ENTRY;
+
+       lock = ldlm_handle2lock(lockh);
+       /* ldlm_cli_enqueue is holding a reference on this lock. */
+       if (!lock) {
+               LASSERT(type == LDLM_FLOCK);
+               RETURN(-ENOLCK);
+       }
+
+       LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len),
+                "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len);
+
+       if (rc != ELDLM_OK) {
+               LASSERT(!is_replay);
+               LDLM_DEBUG(lock, "client-side enqueue END (%s)",
+                          rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED");
+
+               if (rc != ELDLM_LOCK_ABORTED)
+                       GOTO(cleanup, rc);
+       }
+
+       /* Before we return, swab the reply */
+       reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+       if (reply == NULL)
+               GOTO(cleanup, rc = -EPROTO);
+
+       if (lvb_len != 0) {
+               LASSERT(lvb != NULL);
+
+               size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB,
+                                           RCL_SERVER);
+               if (size < 0) {
+                       LDLM_ERROR(lock, "Failed to get lvb_len, rc = %d", size);
+                       GOTO(cleanup, rc = size);
+               } else if (unlikely(size > lvb_len)) {
+                       LDLM_ERROR(lock, "Replied LVB is larger than "
+                                  "expected: expected = %d, replied = %d",
+                                  lvb_len, size);
+                       GOTO(cleanup, rc = -EINVAL);
+               }
+       }
+
+       if (rc == ELDLM_LOCK_ABORTED) {
+               if (lvb_len != 0)
+                       rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+                                          lvb, size);
+               GOTO(cleanup, rc = (rc != 0 ? rc : ELDLM_LOCK_ABORTED));
+       }
+
+       /* lock enqueued on the server */
+       cleanup_phase = 0;
+
+       lock_res_and_lock(lock);
+       /* Key change rehash lock in per-export hash with new key */
+       if (exp->exp_lock_hash) {
+               /* In the function below, .hs_keycmp resolves to
+                * ldlm_export_lock_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               cfs_hash_rehash_key(exp->exp_lock_hash,
+                                   &lock->l_remote_handle,
+                                   &reply->lock_handle,
+                                   &lock->l_exp_hash);
+       } else {
+               lock->l_remote_handle = reply->lock_handle;
+       }
+
+       *flags = ldlm_flags_from_wire(reply->lock_flags);
+       lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+                                             LDLM_INHERIT_FLAGS);
+       /* move NO_TIMEOUT flag to the lock to force ldlm_lock_match()
+        * to wait with no timeout as well */
+       lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+                                             LDLM_FL_NO_TIMEOUT);
+       unlock_res_and_lock(lock);
+
+       CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: 0x%llx\n",
+              lock, reply->lock_handle.cookie, *flags);
+
+       /* If enqueue returned a blocked lock but the completion handler has
+        * already run, then it fixed up the resource and we don't need to do it
+        * again. */
+       if ((*flags) & LDLM_FL_LOCK_CHANGED) {
+               int newmode = reply->lock_desc.l_req_mode;
+               LASSERT(!is_replay);
+               if (newmode && newmode != lock->l_req_mode) {
+                       LDLM_DEBUG(lock, "server returned different mode %s",
+                                  ldlm_lockname[newmode]);
+                       lock->l_req_mode = newmode;
+               }
+
+               if (memcmp(reply->lock_desc.l_resource.lr_name.name,
+                         lock->l_resource->lr_name.name,
+                         sizeof(struct ldlm_res_id))) {
+                       CDEBUG(D_INFO, "remote intent success, locking "
+                                       "(%ld,%ld,%ld) instead of "
+                                       "(%ld,%ld,%ld)\n",
+                             (long)reply->lock_desc.l_resource.lr_name.name[0],
+                             (long)reply->lock_desc.l_resource.lr_name.name[1],
+                             (long)reply->lock_desc.l_resource.lr_name.name[2],
+                             (long)lock->l_resource->lr_name.name[0],
+                             (long)lock->l_resource->lr_name.name[1],
+                             (long)lock->l_resource->lr_name.name[2]);
+
+                       rc = ldlm_lock_change_resource(ns, lock,
+                                       &reply->lock_desc.l_resource.lr_name);
+                       if (rc || lock->l_resource == NULL)
+                               GOTO(cleanup, rc = -ENOMEM);
+                       LDLM_DEBUG(lock, "client-side enqueue, new resource");
+               }
+               if (with_policy)
+                       if (!(type == LDLM_IBITS &&
+                             !(exp_connect_flags(exp) & OBD_CONNECT_IBITS)))
+                               /* We assume lock type cannot change on server*/
+                               ldlm_convert_policy_to_local(exp,
+                                               lock->l_resource->lr_type,
+                                               &reply->lock_desc.l_policy_data,
+                                               &lock->l_policy_data);
+               if (type != LDLM_PLAIN)
+                       LDLM_DEBUG(lock,"client-side enqueue, new policy data");
+       }
+
+       if ((*flags) & LDLM_FL_AST_SENT ||
+           /* Cancel extent locks as soon as possible on a liblustre client,
+            * because it cannot handle asynchronous ASTs robustly (see
+            * bug 7311). */
+           (LIBLUSTRE_CLIENT && type == LDLM_EXTENT)) {
+               lock_res_and_lock(lock);
+               lock->l_flags |= LDLM_FL_CBPENDING |  LDLM_FL_BL_AST;
+               unlock_res_and_lock(lock);
+               LDLM_DEBUG(lock, "enqueue reply includes blocking AST");
+       }
+
+       /* If the lock has already been granted by a completion AST, don't
+        * clobber the LVB with an older one. */
+       if (lvb_len != 0) {
+               /* We must lock or a racing completion might update lvb without
+                * letting us know and we'll clobber the correct value.
+                * Cannot unlock after the check either, as that still leaves
+                * a tiny window for completion to get in */
+               lock_res_and_lock(lock);
+               if (lock->l_req_mode != lock->l_granted_mode)
+                       rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+                                          lock->l_lvb_data, size);
+               unlock_res_and_lock(lock);
+               if (rc < 0) {
+                       cleanup_phase = 1;
+                       GOTO(cleanup, rc);
+               }
+       }
+
+       if (!is_replay) {
+               rc = ldlm_lock_enqueue(ns, &lock, NULL, flags);
+               if (lock->l_completion_ast != NULL) {
+                       int err = lock->l_completion_ast(lock, *flags, NULL);
+                       if (!rc)
+                               rc = err;
+                       if (rc)
+                               cleanup_phase = 1;
+               }
+       }
+
+       if (lvb_len && lvb != NULL) {
+               /* Copy the LVB here, and not earlier, because the completion
+                * AST (if any) can override what we got in the reply */
+               memcpy(lvb, lock->l_lvb_data, lvb_len);
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue END");
+       EXIT;
+cleanup:
+       if (cleanup_phase == 1 && rc)
+               failed_lock_cleanup(ns, lock, mode);
+       /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */
+       LDLM_LOCK_PUT(lock);
+       LDLM_LOCK_RELEASE(lock);
+       return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_fini);
+
+/**
+ * Estimate the number of lock handles that would fit into a request of the
+ * given size.  PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into
+ * a single page on the send/receive side. XXX: 512 should be changed to a
+ * more adequate value.
+ */
+static inline int ldlm_req_handles_avail(int req_size, int off)
+{
+       int avail;
+
+       avail = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) - req_size;
+       if (likely(avail >= 0))
+               avail /= (int)sizeof(struct lustre_handle);
+       else
+               avail = 0;
+       avail += LDLM_LOCKREQ_HANDLES - off;
+
+       return avail;
+}
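+
+/*
+ * Worked example (numbers are purely illustrative assumptions): if the
+ * min() above evaluates to 4096 - 512 = 3584 bytes, req_size is 1024 and
+ * sizeof(struct lustre_handle) is 8, then (3584 - 1024) / 8 = 320 handles
+ * fit in the remaining buffer space; with off == 1 the LDLM_LOCKREQ_HANDLES
+ * slots already present in struct ldlm_request add the rest, giving
+ * 320 + LDLM_LOCKREQ_HANDLES - 1 handles in total.
+ */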
+
+static inline int ldlm_capsule_handles_avail(struct req_capsule *pill,
+                                            enum req_location loc,
+                                            int off)
+{
+       int size = req_capsule_msg_size(pill, loc);
+       return ldlm_req_handles_avail(size, off);
+}
+
+static inline int ldlm_format_handles_avail(struct obd_import *imp,
+                                           const struct req_format *fmt,
+                                           enum req_location loc, int off)
+{
+       int size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc);
+       return ldlm_req_handles_avail(size, off);
+}
+
+/**
+ * Cancel LRU locks and pack them into the enqueue request. Also pack the
+ * given \a count locks from \a cancels into that request.
+ *
+ * This is to be called by functions preparing their own requests that
+ * might contain lists of locks to cancel in addition to the actual operation
+ * that needs to be performed.
+ */
+int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
+                     int version, int opc, int canceloff,
+                     struct list_head *cancels, int count)
+{
+       struct ldlm_namespace   *ns = exp->exp_obd->obd_namespace;
+       struct req_capsule      *pill = &req->rq_pill;
+       struct ldlm_request     *dlm = NULL;
+       int flags, avail, to_free, pack = 0;
+       LIST_HEAD(head);
+       int rc;
+       ENTRY;
+
+       if (cancels == NULL)
+               cancels = &head;
+       if (ns_connect_cancelset(ns)) {
+               /* Estimate the amount of available space in the request. */
+               req_capsule_filled_sizes(pill, RCL_CLIENT);
+               avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff);
+
+               flags = ns_connect_lru_resize(ns) ?
+                       LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
+               to_free = !ns_connect_lru_resize(ns) &&
+                         opc == LDLM_ENQUEUE ? 1 : 0;
+
+               /* Cancel LRU locks here _only_ if the server supports
+                * EARLY_CANCEL. Otherwise we would have to send an extra
+                * CANCEL RPC, which would make us slower. */
+               if (avail > count)
+                       count += ldlm_cancel_lru_local(ns, cancels, to_free,
+                                                      avail - count, 0, flags);
+               if (avail > count)
+                       pack = count;
+               else
+                       pack = avail;
+               req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT,
+                                    ldlm_request_bufsize(pack, opc));
+       }
+
+       rc = ptlrpc_request_pack(req, version, opc);
+       if (rc) {
+               ldlm_lock_list_put(cancels, l_bl_ast, count);
+               RETURN(rc);
+       }
+
+       if (ns_connect_cancelset(ns)) {
+               if (canceloff) {
+                       dlm = req_capsule_client_get(pill, &RMF_DLM_REQ);
+                       LASSERT(dlm);
+                       /* Skip the first lock handle in ldlm_request_pack();
+                        * that function will increment @lock_count according
+                        * to the number of lock handles actually written to
+                        * the buffer. */
+                       dlm->lock_count = canceloff;
+               }
+               /* Pack into the request @pack lock handles. */
+               ldlm_cli_cancel_list(cancels, pack, req, 0);
+               /* Prepare and send separate cancel RPC for others. */
+               ldlm_cli_cancel_list(cancels, count - pack, NULL, 0);
+       } else {
+               ldlm_lock_list_put(cancels, l_bl_ast, count);
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_prep_elc_req);
+
+int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req,
+                         struct list_head *cancels, int count)
+{
+       return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+                                LDLM_ENQUEUE_CANCEL_OFF, cancels, count);
+}
+EXPORT_SYMBOL(ldlm_prep_enqueue_req);
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+       struct ptlrpc_request *req;
+       int rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+EXPORT_SYMBOL(ldlm_enqueue_pack);
+
+/**
+ * Client-side lock enqueue.
+ *
+ * If a request has some specific initialisation it is passed in \a reqp,
+ * otherwise it is created in ldlm_cli_enqueue.
+ *
+ * Supports sync and async requests, pass \a async flag accordingly. If a
+ * request was created in ldlm_cli_enqueue and it is the async request,
+ * pass it to the caller in \a reqp.
+ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+                    struct ldlm_enqueue_info *einfo,
+                    const struct ldlm_res_id *res_id,
+                    ldlm_policy_data_t const *policy, __u64 *flags,
+                    void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+                    struct lustre_handle *lockh, int async)
+{
+       struct ldlm_namespace *ns;
+       struct ldlm_lock      *lock;
+       struct ldlm_request   *body;
+       int                 is_replay = *flags & LDLM_FL_REPLAY;
+       int                 req_passed_in = 1;
+       int                 rc, err;
+       struct ptlrpc_request *req;
+       ENTRY;
+
+       LASSERT(exp != NULL);
+
+       ns = exp->exp_obd->obd_namespace;
+
+       /* If we're replaying this lock, just check some invariants.
+        * If we're creating a new lock, get everything all setup nice. */
+       if (is_replay) {
+               lock = ldlm_handle2lock_long(lockh, 0);
+               LASSERT(lock != NULL);
+               LDLM_DEBUG(lock, "client-side enqueue START");
+               LASSERT(exp == lock->l_conn_export);
+       } else {
+               const struct ldlm_callback_suite cbs = {
+                       .lcs_completion = einfo->ei_cb_cp,
+                       .lcs_blocking   = einfo->ei_cb_bl,
+                       .lcs_glimpse    = einfo->ei_cb_gl,
+                       .lcs_weigh      = einfo->ei_cb_wg
+               };
+               lock = ldlm_lock_create(ns, res_id, einfo->ei_type,
+                                       einfo->ei_mode, &cbs, einfo->ei_cbdata,
+                                       lvb_len, lvb_type);
+               if (lock == NULL)
+                       RETURN(-ENOMEM);
+               /* for the local lock, add the reference */
+               ldlm_lock_addref_internal(lock, einfo->ei_mode);
+               ldlm_lock2handle(lock, lockh);
+               if (policy != NULL) {
+                       /* INODEBITS_INTEROP: If the server does not support
+                        * inodebits, we will request a plain lock in the
+                        * descriptor (ldlm_lock2desc() below) but use an
+                        * inodebits lock internally with both bits set.
+                        */
+                       if (einfo->ei_type == LDLM_IBITS &&
+                           !(exp_connect_flags(exp) &
+                             OBD_CONNECT_IBITS))
+                               lock->l_policy_data.l_inodebits.bits =
+                                       MDS_INODELOCK_LOOKUP |
+                                       MDS_INODELOCK_UPDATE;
+                       else
+                               lock->l_policy_data = *policy;
+               }
+
+               if (einfo->ei_type == LDLM_EXTENT)
+                       lock->l_req_extent = policy->l_extent;
+               LDLM_DEBUG(lock, "client-side enqueue START, flags %llx\n",
+                          *flags);
+       }
+
+       lock->l_conn_export = exp;
+       lock->l_export = NULL;
+       lock->l_blocking_ast = einfo->ei_cb_bl;
+       lock->l_flags |= (*flags & LDLM_FL_NO_LRU);
+
+       /* lock not sent to server yet */
+
+       if (reqp == NULL || *reqp == NULL) {
+               req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                               &RQF_LDLM_ENQUEUE,
+                                               LUSTRE_DLM_VERSION,
+                                               LDLM_ENQUEUE);
+               if (req == NULL) {
+                       failed_lock_cleanup(ns, lock, einfo->ei_mode);
+                       LDLM_LOCK_RELEASE(lock);
+                       RETURN(-ENOMEM);
+               }
+               req_passed_in = 0;
+               if (reqp)
+                       *reqp = req;
+       } else {
+               int len;
+
+               req = *reqp;
+               len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ,
+                                          RCL_CLIENT);
+               LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n",
+                        DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
+       }
+
+       /* Dump lock data into the request buffer */
+       body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       ldlm_lock2desc(lock, &body->lock_desc);
+       body->lock_flags = ldlm_flags_to_wire(*flags);
+       body->lock_handle[0] = *lockh;
+
+       /* Continue as normal. */
+       if (!req_passed_in) {
+               if (lvb_len > 0)
+                       req_capsule_extend(&req->rq_pill,
+                                          &RQF_LDLM_ENQUEUE_LVB);
+               req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                                    lvb_len);
+               ptlrpc_request_set_replen(req);
+       }
+
+       /*
+        * Liblustre client doesn't get extent locks, except for O_APPEND case
+        * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where
+        * [i_size, OBD_OBJECT_EOF] lock is taken.
+        */
+       LASSERT(ergo(LIBLUSTRE_CLIENT, einfo->ei_type != LDLM_EXTENT ||
+                    policy->l_extent.end == OBD_OBJECT_EOF));
+
+       if (async) {
+               LASSERT(reqp != NULL);
+               RETURN(0);
+       }
+
+       LDLM_DEBUG(lock, "sending request");
+
+       rc = ptlrpc_queue_wait(req);
+
+       err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0,
+                                   einfo->ei_mode, flags, lvb, lvb_len,
+                                   lockh, rc);
+
+       /* If ldlm_cli_enqueue_fini did not find the lock, we need to free
+        * one reference that we took */
+       if (err == -ENOLCK)
+               LDLM_LOCK_RELEASE(lock);
+       else
+               rc = err;
+
+       if (!req_passed_in && req != NULL) {
+               ptlrpc_req_finished(req);
+               if (reqp)
+                       *reqp = NULL;
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue);
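+
+/*
+ * Illustrative sketch only: a synchronous caller of ldlm_cli_enqueue()
+ * might fill in struct ldlm_enqueue_info and invoke it roughly as below.
+ * The blocking callback name and the resource id are hypothetical
+ * placeholders, and LVB_T_NONE is assumed to mean "no LVB":
+ *
+ *     struct ldlm_enqueue_info einfo = {
+ *             .ei_type   = LDLM_IBITS,
+ *             .ei_mode   = LCK_PR,
+ *             .ei_cb_bl  = my_blocking_ast,
+ *             .ei_cb_cp  = ldlm_completion_ast,
+ *             .ei_cb_gl  = NULL,
+ *             .ei_cbdata = NULL,
+ *     };
+ *     struct lustre_handle lockh;
+ *     __u64 flags = 0;
+ *
+ *     rc = ldlm_cli_enqueue(exp, NULL, &einfo, &res_id, &policy, &flags,
+ *                           NULL, 0, LVB_T_NONE, &lockh, 0);
+ */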
+
+static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
+                                 __u32 *flags)
+{
+       struct ldlm_resource *res;
+       int rc;
+       ENTRY;
+       if (ns_is_client(ldlm_lock_to_ns(lock))) {
+               CERROR("Trying to convert local lock\n");
+               LBUG();
+       }
+       LDLM_DEBUG(lock, "client-side local convert");
+
+       res = ldlm_lock_convert(lock, new_mode, flags);
+       if (res) {
+               ldlm_reprocess_all(res);
+               rc = 0;
+       } else {
+               rc = EDEADLOCK;
+       }
+       LDLM_DEBUG(lock, "client-side local convert handler END");
+       LDLM_LOCK_PUT(lock);
+       RETURN(rc);
+}
+
+/* FIXME: one of ldlm_cli_convert or the server side should reject attempted
+ * conversion of locks which are on the waiting or converting queue */
+/* The caller of this code is supposed to take care of lock readers/writers
+   accounting */
+int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags)
+{
+       struct ldlm_request   *body;
+       struct ldlm_reply     *reply;
+       struct ldlm_lock      *lock;
+       struct ldlm_resource  *res;
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       lock = ldlm_handle2lock(lockh);
+       if (!lock) {
+               LBUG();
+               RETURN(-EINVAL);
+       }
+       *flags = 0;
+
+       if (lock->l_conn_export == NULL)
+               RETURN(ldlm_cli_convert_local(lock, new_mode, flags));
+
+       LDLM_DEBUG(lock, "client-side convert");
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export),
+                                       &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION,
+                                       LDLM_CONVERT);
+       if (req == NULL) {
+               LDLM_LOCK_PUT(lock);
+               RETURN(-ENOMEM);
+       }
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       body->lock_handle[0] = lock->l_remote_handle;
+
+       body->lock_desc.l_req_mode = new_mode;
+       body->lock_flags = ldlm_flags_to_wire(*flags);
+
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc != ELDLM_OK)
+               GOTO(out, rc);
+
+       reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+       if (reply == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       if (req->rq_status)
+               GOTO(out, rc = req->rq_status);
+
+       res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags);
+       if (res != NULL) {
+               ldlm_reprocess_all(res);
+               /* Go to sleep until the lock is granted. */
+               /* FIXME: or cancelled. */
+               if (lock->l_completion_ast) {
+                       rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC,
+                                                   NULL);
+                       if (rc)
+                               GOTO(out, rc);
+               }
+       } else {
+               rc = EDEADLOCK;
+       }
+       EXIT;
+ out:
+       LDLM_LOCK_PUT(lock);
+       ptlrpc_req_finished(req);
+       return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_convert);
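+
+/*
+ * Illustrative sketch only: converting an already granted lock, identified
+ * by its handle, to a different mode (the mode chosen here is a made-up
+ * example, not a call site added by this patch):
+ *
+ *     __u32 flags = 0;
+ *
+ *     rc = ldlm_cli_convert(&lockh, LCK_PR, &flags);
+ */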
+
+/**
+ * Cancel a lock locally.
+ *
+ * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server
+ * \retval LDLM_FL_CANCELING otherwise
+ * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC
+ */
+static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
+{
+       __u64 rc = LDLM_FL_LOCAL_ONLY;
+       ENTRY;
+
+       if (lock->l_conn_export) {
+               bool local_only;
+
+               LDLM_DEBUG(lock, "client-side cancel");
+               /* Set this flag to prevent others from getting new references */
+               lock_res_and_lock(lock);
+               lock->l_flags |= LDLM_FL_CBPENDING;
+               local_only = !!(lock->l_flags &
+                               (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK));
+               ldlm_cancel_callback(lock);
+               rc = (lock->l_flags & LDLM_FL_BL_AST) ?
+                       LDLM_FL_BL_AST : LDLM_FL_CANCELING;
+               unlock_res_and_lock(lock);
+
+               if (local_only) {
+                       CDEBUG(D_DLMTRACE, "not sending request (at caller's "
+                              "instruction)\n");
+                       rc = LDLM_FL_LOCAL_ONLY;
+               }
+               ldlm_lock_cancel(lock);
+       } else {
+               if (ns_is_client(ldlm_lock_to_ns(lock))) {
+                       LDLM_ERROR(lock, "Trying to cancel local lock");
+                       LBUG();
+               }
+               LDLM_DEBUG(lock, "server-side local cancel");
+               ldlm_lock_cancel(lock);
+               ldlm_reprocess_all(lock->l_resource);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Pack \a count locks in \a head into ldlm_request buffer of request \a req.
+ */
+static void ldlm_cancel_pack(struct ptlrpc_request *req,
+                            struct list_head *head, int count)
+{
+       struct ldlm_request *dlm;
+       struct ldlm_lock *lock;
+       int max, packed = 0;
+       ENTRY;
+
+       dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       LASSERT(dlm != NULL);
+
+       /* Check the room in the request buffer. */
+       max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) -
+               sizeof(struct ldlm_request);
+       max /= sizeof(struct lustre_handle);
+       max += LDLM_LOCKREQ_HANDLES;
+       LASSERT(max >= dlm->lock_count + count);
+
+       /* XXX: it would be better to pack lock handles grouped by resource,
+        * so that the server cancel would call filter_lvbo_update() less
+        * frequently. */
+       list_for_each_entry(lock, head, l_bl_ast) {
+               if (!count--)
+                       break;
+               LASSERT(lock->l_conn_export);
+               /* Pack the lock handle to the given request buffer. */
+               LDLM_DEBUG(lock, "packing");
+               dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle;
+               packed++;
+       }
+       CDEBUG(D_DLMTRACE, "%d locks packed\n", packed);
+       EXIT;
+}
+
+/**
+ * Prepare and send a batched cancel RPC. It will include \a count lock
+ * handles of locks given in \a cancels list. */
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
+                       int count, ldlm_cancel_flags_t flags)
+{
+       struct ptlrpc_request *req = NULL;
+       struct obd_import *imp;
+       int free, sent = 0;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(exp != NULL);
+       LASSERT(count > 0);
+
+       CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
+
+       if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
+               RETURN(count);
+
+       free = ldlm_format_handles_avail(class_exp2cliimp(exp),
+                                        &RQF_LDLM_CANCEL, RCL_CLIENT, 0);
+       if (count > free)
+               count = free;
+
+       while (1) {
+               imp = class_exp2cliimp(exp);
+               if (imp == NULL || imp->imp_invalid) {
+                       CDEBUG(D_DLMTRACE,
+                              "skipping cancel on invalid import %p\n", imp);
+                       RETURN(count);
+               }
+
+               req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL);
+               if (req == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
+               req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
+                                    ldlm_request_bufsize(count, LDLM_CANCEL));
+
+               rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       GOTO(out, rc);
+               }
+
+               req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
+               req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+               ptlrpc_at_set_req_timeout(req);
+
+               ldlm_cancel_pack(req, cancels, count);
+
+               ptlrpc_request_set_replen(req);
+               if (flags & LCF_ASYNC) {
+                       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+                       sent = count;
+                       GOTO(out, 0);
+               } else {
+                       rc = ptlrpc_queue_wait(req);
+               }
+               if (rc == -ESTALE) {
+                       CDEBUG(D_DLMTRACE, "client/server (nid %s) "
+                              "out of sync -- not fatal\n",
+                              libcfs_nid2str(req->rq_import->
+                                             imp_connection->c_peer.nid));
+                       rc = 0;
+               } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/
+                          req->rq_import_generation == imp->imp_generation) {
+                       ptlrpc_req_finished(req);
+                       continue;
+               } else if (rc != ELDLM_OK) {
+                       /* -ESHUTDOWN is common on umount */
+                       CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+                                    "Got rc %d from cancel RPC: "
+                                    "canceling anyway\n", rc);
+                       break;
+               }
+               sent = count;
+               break;
+       }
+
+       ptlrpc_req_finished(req);
+       EXIT;
+out:
+       return sent ? sent : rc;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_req);
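+
+/*
+ * Illustrative sketch only: once a list of locks has been prepared (for
+ * example by ldlm_cli_cancel_list_local() below), their handles can be
+ * pushed to the server in one batched cancel RPC:
+ *
+ *     sent = ldlm_cli_cancel_req(exp, &cancels, count, 0);
+ */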
+
+static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp)
+{
+       LASSERT(imp != NULL);
+       return &imp->imp_obd->obd_namespace->ns_pool;
+}
+
+/**
+ * Update client's OBD pool related fields with new SLV and Limit from \a req.
+ */
+int ldlm_cli_update_pool(struct ptlrpc_request *req)
+{
+       struct obd_device *obd;
+       __u64 new_slv;
+       __u32 new_limit;
+       ENTRY;
+       if (unlikely(!req->rq_import || !req->rq_import->imp_obd ||
+                    !imp_connect_lru_resize(req->rq_import)))
+       {
+               /*
+                * Do nothing for corner cases.
+                */
+               RETURN(0);
+       }
+
+       /* In some cases the RPC may contain SLV and limit zeroed out. This
+        * is the case when the server does not support the LRU resize feature.
+        * This is also possible in some recovery cases when server-side
+        * reqs have no reference to the OBD export and thus access to
+        * server-side namespace is not possible. */
+       if (lustre_msg_get_slv(req->rq_repmsg) == 0 ||
+           lustre_msg_get_limit(req->rq_repmsg) == 0) {
+               DEBUG_REQ(D_HA, req, "Zero SLV or Limit found "
+                         "(SLV: "LPU64", Limit: %u)",
+                         lustre_msg_get_slv(req->rq_repmsg),
+                         lustre_msg_get_limit(req->rq_repmsg));
+               RETURN(0);
+       }
+
+       new_limit = lustre_msg_get_limit(req->rq_repmsg);
+       new_slv = lustre_msg_get_slv(req->rq_repmsg);
+       obd = req->rq_import->imp_obd;
+
+       /* Set new SLV and limit in OBD fields to make them accessible
+        * to the pool thread. We do not access obd_namespace and pool
+        * directly here as there is no reliable way to make sure that
+        * they are still alive at cleanup time. Evil races are possible
+        * which may cause Oops at that time. */
+       write_lock(&obd->obd_pool_lock);
+       obd->obd_pool_slv = new_slv;
+       obd->obd_pool_limit = new_limit;
+       write_unlock(&obd->obd_pool_lock);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_update_pool);
+
+/**
+ * Client side lock cancel.
+ *
+ * Lock must not have any readers or writers by this time.
+ */
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+                   ldlm_cancel_flags_t cancel_flags)
+{
+       struct obd_export *exp;
+       int avail, flags, count = 1;
+       __u64 rc = 0;
+       struct ldlm_namespace *ns;
+       struct ldlm_lock *lock;
+       LIST_HEAD(cancels);
+       ENTRY;
+
+       /* concurrent cancels on the same handle can happen */
+       lock = ldlm_handle2lock_long(lockh, LDLM_FL_CANCELING);
+       if (lock == NULL) {
+               LDLM_DEBUG_NOLOCK("lock is already being destroyed\n");
+               RETURN(0);
+       }
+
+       rc = ldlm_cli_cancel_local(lock);
+       if (rc == LDLM_FL_LOCAL_ONLY) {
+               LDLM_LOCK_RELEASE(lock);
+               RETURN(0);
+       }
+       /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL
+        * RPC which goes to canceld portal, so we can cancel other LRU locks
+        * here and send them all as one LDLM_CANCEL RPC. */
+       LASSERT(list_empty(&lock->l_bl_ast));
+       list_add(&lock->l_bl_ast, &cancels);
+
+       exp = lock->l_conn_export;
+       if (exp_connect_cancelset(exp)) {
+               avail = ldlm_format_handles_avail(class_exp2cliimp(exp),
+                                                 &RQF_LDLM_CANCEL,
+                                                 RCL_CLIENT, 0);
+               LASSERT(avail > 0);
+
+               ns = ldlm_lock_to_ns(lock);
+               flags = ns_connect_lru_resize(ns) ?
+                       LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
+               count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
+                                              LCF_BL_AST, flags);
+       }
+       ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel);
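+
+/*
+ * Illustrative sketch only: given a struct ldlm_lock, the usual way to drop
+ * it from the client is to convert it to a handle and cancel through
+ * ldlm_cli_cancel(), as ldlm_blocking_ast_nocheck() above already does:
+ *
+ *     struct lustre_handle lockh;
+ *
+ *     ldlm_lock2handle(lock, &lockh);
+ *     rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+ */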
+
+/**
+ * Locally cancel up to \a count locks in list \a cancels.
+ * Return the number of cancelled locks.
+ */
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+                              ldlm_cancel_flags_t flags)
+{
+       LIST_HEAD(head);
+       struct ldlm_lock *lock, *next;
+       int left = 0, bl_ast = 0;
+       __u64 rc;
+
+       left = count;
+       list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
+               if (left-- == 0)
+                       break;
+
+               if (flags & LCF_LOCAL) {
+                       rc = LDLM_FL_LOCAL_ONLY;
+                       ldlm_lock_cancel(lock);
+               } else {
+                       rc = ldlm_cli_cancel_local(lock);
+               }
+               /* Until we have compound requests and can send LDLM_CANCEL
+                * requests batched with generic RPCs, we need to send cancels
+                * with the LDLM_FL_BL_AST flag in a separate RPC from
+                * the one being generated now. */
+               if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) {
+                       LDLM_DEBUG(lock, "Cancel lock separately");
+                       list_del_init(&lock->l_bl_ast);
+                       list_add(&lock->l_bl_ast, &head);
+                       bl_ast++;
+                       continue;
+               }
+               if (rc == LDLM_FL_LOCAL_ONLY) {
+                       /* CANCEL RPC should not be sent to server. */
+                       list_del_init(&lock->l_bl_ast);
+                       LDLM_LOCK_RELEASE(lock);
+                       count--;
+               }
+       }
+       if (bl_ast > 0) {
+               count -= bl_ast;
+               ldlm_cli_cancel_list(&head, bl_ast, NULL, 0);
+       }
+
+       RETURN(count);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list_local);
+
+/**
+ * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back
+ * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g.
+ * readahead requests, ...)
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+                                                   struct ldlm_lock *lock,
+                                                   int unused, int added,
+                                                   int count)
+{
+       ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+       ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+       lock_res_and_lock(lock);
+
+       /* don't check added & count since we want to process all locks
+        * from unused list */
+       switch (lock->l_resource->lr_type) {
+               case LDLM_EXTENT:
+               case LDLM_IBITS:
+                       if (cb && cb(lock))
+                               break;
+               default:
+                       result = LDLM_POLICY_SKIP_LOCK;
+                       lock->l_flags |= LDLM_FL_SKIPPED;
+                       break;
+       }
+
+       unlock_res_and_lock(lock);
+       RETURN(result);
+}
+
+/**
+ * Callback function for LRU-resize policy. Decides whether to keep
+ * \a lock in LRU for current \a LRU size \a unused, added in current
+ * scan \a added and number of locks to be preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
+                                                struct ldlm_lock *lock,
+                                                int unused, int added,
+                                                int count)
+{
+       cfs_time_t cur = cfs_time_current();
+       struct ldlm_pool *pl = &ns->ns_pool;
+       __u64 slv, lvf, lv;
+       cfs_time_t la;
+
+       /* Stop LRU processing when we reach past @count or have checked all
+        * locks in LRU. */
+       if (count && added >= count)
+               return LDLM_POLICY_KEEP_LOCK;
+
+       slv = ldlm_pool_get_slv(pl);
+       lvf = ldlm_pool_get_lvf(pl);
+       la = cfs_duration_sec(cfs_time_sub(cur,
+                             lock->l_last_used));
+       lv = lvf * la * unused;
+
+       /* Inform pool about current CLV to see it via proc. */
+       ldlm_pool_set_clv(pl, lv);
+
+       /* Stop (keep the lock) when the SLV has not yet been received from
+        * the server, or when lv is smaller than the SLV. */
+       return (slv == 0 || lv < slv) ?
+               LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
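+
+/*
+ * Worked example with made-up numbers: with lvf == 100, a lock last used
+ * la == 10 seconds ago and unused == 1000 locks in the LRU, the lock value
+ * is lv = 100 * 10 * 1000 = 1,000,000.  If the server-provided SLV is
+ * 2,000,000 the lock is kept (lv < slv); a lock idle for 30 seconds gives
+ * lv = 3,000,000 >= slv and is therefore canceled.
+ */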
+
+/**
+ * Callback function for the passed policy. Makes a decision whether to keep
+ * \a lock in LRU for current \a LRU size \a unused, added in current scan \a
+ * added and number of locks to be preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns,
+                                                  struct ldlm_lock *lock,
+                                                  int unused, int added,
+                                                  int count)
+{
+       /* Stop LRU processing when we reach past @count or have checked all
+        * locks in LRU. */
+       return (added >= count) ?
+               LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for aged policy. Makes decision whether to keep \a lock in
+ * LRU for current LRU size \a unused, added in current scan \a added and
+ * number of locks to be preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns,
+                                                struct ldlm_lock *lock,
+                                                int unused, int added,
+                                                int count)
+{
+       /* Stop LRU processing if a young lock is found and we have reached
+        * past @count. */
+       return ((added >= count) &&
+               cfs_time_before(cfs_time_current(),
+                               cfs_time_add(lock->l_last_used,
+                                            ns->ns_max_age))) ?
+               LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for default policy. Makes decision whether to keep \a lock
+ * in LRU for current LRU size \a unused, added in current scan \a added and
+ * number of locks to be preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns,
+                                                   struct ldlm_lock *lock,
+                                                   int unused, int added,
+                                                   int count)
+{
+       /* Stop LRU processing when we reach past count or have checked all
+        * locks in LRU. */
+       return (added >= count) ?
+               LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
+                                                     struct ldlm_lock *, int,
+                                                     int, int);
+
+static ldlm_cancel_lru_policy_t
+ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
+{
+       if (flags & LDLM_CANCEL_NO_WAIT)
+               return ldlm_cancel_no_wait_policy;
+
+       if (ns_connect_lru_resize(ns)) {
+               if (flags & LDLM_CANCEL_SHRINK)
+                       /* We kill passed number of old locks. */
+                       return ldlm_cancel_passed_policy;
+               else if (flags & LDLM_CANCEL_LRUR)
+                       return ldlm_cancel_lrur_policy;
+               else if (flags & LDLM_CANCEL_PASSED)
+                       return ldlm_cancel_passed_policy;
+       } else {
+               if (flags & LDLM_CANCEL_AGED)
+                       return ldlm_cancel_aged_policy;
+       }
+
+       return ldlm_cancel_default_policy;
+}
+
+/**
+ * - Free space in LRU for \a count new locks,
+ *   redundant unused locks are canceled locally;
+ * - also cancel locally unused aged locks;
+ * - do not cancel more than \a max locks;
+ * - GET the found locks and add them into the \a cancels list.
+ *
+ * A client lock can be added to the l_bl_ast list only when it is
+ * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing
+ * CANCEL.  There are the following use cases:
+ * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and
+ * ldlm_cli_cancel(), which check and set this flag properly. As any
+ * attempt to cancel a lock relies on this flag, the l_bl_ast list is
+ * accessed later without any special locking.
+ *
+ * Calling policies for enabled LRU resize:
+ * ----------------------------------------
+ * flags & LDLM_CANCEL_LRUR - use LRU resize policy (SLV from server) to
+ *                         cancel not more than \a count locks;
+ *
+ * flags & LDLM_CANCEL_PASSED - cancel \a count number of old locks (located at
+ *                           the beginning of LRU list);
+ *
+ * flags & LDLM_CANCEL_SHRINK - cancel not more than \a count locks according to
+ *                           memory pressure policy function;
+ *
+ * flags & LDLM_CANCEL_AGED - cancel \a count locks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *                            (typically before replaying locks) w/o
+ *                            sending any RPCs or waiting for any
+ *                            outstanding RPC to complete.
+ */
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns,
+                                struct list_head *cancels,
+                                int count, int max, int flags)
+{
+       ldlm_cancel_lru_policy_t pf;
+       struct ldlm_lock *lock, *next;
+       int added = 0, unused, remained;
+       ENTRY;
+
+       spin_lock(&ns->ns_lock);
+       unused = ns->ns_nr_unused;
+       remained = unused;
+
+       if (!ns_connect_lru_resize(ns))
+               count += unused - ns->ns_max_unused;
+
+       pf = ldlm_cancel_lru_policy(ns, flags);
+       LASSERT(pf != NULL);
+
+       while (!list_empty(&ns->ns_unused_list)) {
+               ldlm_policy_res_t result;
+
+               /* all unused locks */
+               if (remained-- <= 0)
+                       break;
+
+               /* For any flags, stop scanning if @max is reached. */
+               if (max && added >= max)
+                       break;
+
+               list_for_each_entry_safe(lock, next, &ns->ns_unused_list,
+                                            l_lru) {
+                       /* No locks which got blocking requests. */
+                       LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
+
+                       if (flags & LDLM_CANCEL_NO_WAIT &&
+                           lock->l_flags & LDLM_FL_SKIPPED)
+                               /* already processed */
+                               continue;
+
+                       /* Somebody is already doing CANCEL. No need for this
+                        * lock in LRU, do not traverse it again. */
+                       if (!(lock->l_flags & LDLM_FL_CANCELING))
+                               break;
+
+                       ldlm_lock_remove_from_lru_nolock(lock);
+               }
+               if (&lock->l_lru == &ns->ns_unused_list)
+                       break;
+
+               LDLM_LOCK_GET(lock);
+               spin_unlock(&ns->ns_lock);
+               lu_ref_add(&lock->l_reference, __func__, current);
+
+               /* Pass the lock through the policy filter and see if it
+                * should stay in LRU.
+                *
+                * Even for shrinker policy we stop scanning if
+                * we find a lock that should stay in the cache.
+                * We should take into account lock age anyway
+                * as a new lock is a valuable resource even if
+                * it has a low weight.
+                *
+                * That is, for shrinker policy we drop only
+                * old locks, but additionally choose them by
+                * their weight. Big extent locks will stay in
+                * the cache. */
+               result = pf(ns, lock, unused, added, count);
+               if (result == LDLM_POLICY_KEEP_LOCK) {
+                       lu_ref_del(&lock->l_reference,
+                                  __func__, current);
+                       LDLM_LOCK_RELEASE(lock);
+                       spin_lock(&ns->ns_lock);
+                       break;
+               }
+               if (result == LDLM_POLICY_SKIP_LOCK) {
+                       lu_ref_del(&lock->l_reference,
+                                  __func__, current);
+                       LDLM_LOCK_RELEASE(lock);
+                       spin_lock(&ns->ns_lock);
+                       continue;
+               }
+
+               lock_res_and_lock(lock);
+               /* Check flags again under the lock. */
+               if ((lock->l_flags & LDLM_FL_CANCELING) ||
+                   (ldlm_lock_remove_from_lru(lock) == 0)) {
+                       /* Another thread is removing lock from LRU, or
+                        * somebody is already doing CANCEL, or there
+                        * is a blocking request which will send cancel
+                        * by itself, or the lock is no longer unused. */
+                       unlock_res_and_lock(lock);
+                       lu_ref_del(&lock->l_reference,
+                                  __func__, current);
+                       LDLM_LOCK_RELEASE(lock);
+                       spin_lock(&ns->ns_lock);
+                       continue;
+               }
+               LASSERT(!lock->l_readers && !lock->l_writers);
+
+               /* If we have chosen to cancel this lock voluntarily, we
+                * better send cancel notification to server, so that it
+                * frees appropriate state. This might lead to a race
+                * where while we are doing cancel here, server is also
+                * silently cancelling this lock. */
+               lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK;
+
+               /* Setting the CBPENDING flag is a little misleading,
+                * but prevents an important race; namely, once
+                * CBPENDING is set, the lock can accumulate no more
+                * readers/writers. Since readers and writers are
+                * already zero here, ldlm_lock_decref() won't see
+                * this flag and call l_blocking_ast */
+               lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
+
+               /* We can't re-add to l_lru as it confuses the
+                * refcounting in ldlm_lock_remove_from_lru() if an AST
+                * arrives after we drop lr_lock below. We use l_bl_ast
+                * and can't use l_pending_chain as it is used both on
+                * server and client nevertheless bug 5666 says it is
+                * used only on server */
+               LASSERT(list_empty(&lock->l_bl_ast));
+               list_add(&lock->l_bl_ast, cancels);
+               unlock_res_and_lock(lock);
+               lu_ref_del(&lock->l_reference, __func__, current);
+               spin_lock(&ns->ns_lock);
+               added++;
+               unused--;
+       }
+       spin_unlock(&ns->ns_lock);
+       RETURN(added);
+}
+
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
+                         int count, int max, ldlm_cancel_flags_t cancel_flags,
+                         int flags)
+{
+       int added;
+       added = ldlm_prepare_lru_list(ns, cancels, count, max, flags);
+       if (added <= 0)
+               return added;
+       return ldlm_cli_cancel_list_local(cancels, added, cancel_flags);
+}
+
+/**
+ * Cancel at least \a nr locks from given namespace LRU.
+ *
+ * When called with LCF_ASYNC, the blocking callback will be handled
+ * in a separate thread and this function will return after the thread
+ * has been asked to call the callback.  When called without LCF_ASYNC,
+ * the blocking callback will be performed in this function.
+ */
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+                   ldlm_cancel_flags_t cancel_flags,
+                   int flags)
+{
+       LIST_HEAD(cancels);
+       int count, rc;
+       ENTRY;
+
+       /* Just prepare the list of locks, do not actually cancel them yet.
+        * Locks are cancelled later in a separate thread. */
+       count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags);
+       rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags);
+       if (rc == 0)
+               RETURN(count);
+
+       RETURN(0);
+}
+
+/**
+ * Find and cancel locally unused locks found on resource, matched to the
+ * given policy, mode. GET the found locks and add them into the \a cancels
+ * list.
+ */
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+                              struct list_head *cancels,
+                              ldlm_policy_data_t *policy,
+                              ldlm_mode_t mode, int lock_flags,
+                              ldlm_cancel_flags_t cancel_flags, void *opaque)
+{
+       struct ldlm_lock *lock;
+       int count = 0;
+       ENTRY;
+
+       lock_res(res);
+       list_for_each_entry(lock, &res->lr_granted, l_res_link) {
+               if (opaque != NULL && lock->l_ast_data != opaque) {
+                       LDLM_ERROR(lock, "data %p doesn't match opaque %p",
+                                  lock->l_ast_data, opaque);
+                       /* LBUG(); */
+                       continue;
+               }
+
+               if (lock->l_readers || lock->l_writers)
+                       continue;
+
+               /* If somebody is already doing CANCEL, or blocking AST came,
+                * skip this lock. */
+               if (lock->l_flags & LDLM_FL_BL_AST ||
+                   lock->l_flags & LDLM_FL_CANCELING)
+                       continue;
+
+               if (lockmode_compat(lock->l_granted_mode, mode))
+                       continue;
+
+               /* If policy is given and this is IBITS lock, add to list only
+                * those locks that match by policy. */
+               if (policy && (lock->l_resource->lr_type == LDLM_IBITS) &&
+                   !(lock->l_policy_data.l_inodebits.bits &
+                     policy->l_inodebits.bits))
+                       continue;
+
+               /* See CBPENDING comment in ldlm_cancel_lru */
+               lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING |
+                                lock_flags;
+
+               LASSERT(list_empty(&lock->l_bl_ast));
+               list_add(&lock->l_bl_ast, cancels);
+               LDLM_LOCK_GET(lock);
+               count++;
+       }
+       unlock_res(res);
+
+       RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags));
+}
+EXPORT_SYMBOL(ldlm_cancel_resource_local);
+
+/**
+ * Cancel client-side locks from a list and send/prepare cancel RPCs to the
+ * server.
+ * If \a req is NULL, send CANCEL request to server with handles of locks
+ * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests
+ * separately per lock.
+ * If \a req is not NULL, put handles of locks in \a cancels into the request
+ * buffer at the offset \a off.
+ * Destroy \a cancels at the end.
+ */
+int ldlm_cli_cancel_list(struct list_head *cancels, int count,
+                        struct ptlrpc_request *req, ldlm_cancel_flags_t flags)
+{
+       struct ldlm_lock *lock;
+       int res = 0;
+       ENTRY;
+
+       if (list_empty(cancels) || count == 0)
+               RETURN(0);
+
+       /* XXX: requests (both batched and not) could be sent in parallel.
+        * Usually it is enough to have just 1 RPC, but it is possible that
+        * there are too many locks to be cancelled in LRU or on a resource.
+        * It would also speed up the case when the server does not support
+        * the feature. */
+       while (count > 0) {
+               LASSERT(!list_empty(cancels));
+               lock = list_entry(cancels->next, struct ldlm_lock,
+                                     l_bl_ast);
+               LASSERT(lock->l_conn_export);
+
+               if (exp_connect_cancelset(lock->l_conn_export)) {
+                       res = count;
+                       if (req)
+                               ldlm_cancel_pack(req, cancels, count);
+                       else
+                               res = ldlm_cli_cancel_req(lock->l_conn_export,
+                                                         cancels, count,
+                                                         flags);
+               } else {
+                       res = ldlm_cli_cancel_req(lock->l_conn_export,
+                                                 cancels, 1, flags);
+               }
+
+               if (res < 0) {
+                       CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+                                    "ldlm_cli_cancel_list: %d\n", res);
+                       res = count;
+               }
+
+               count -= res;
+               ldlm_lock_list_put(cancels, l_bl_ast, res);
+       }
+       LASSERT(count == 0);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list);
+
+/**
+ * Cancel all locks on a resource that have 0 readers/writers.
+ *
+ * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying
+ * to notify the server. */
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+                                   const struct ldlm_res_id *res_id,
+                                   ldlm_policy_data_t *policy,
+                                   ldlm_mode_t mode,
+                                   ldlm_cancel_flags_t flags,
+                                   void *opaque)
+{
+       struct ldlm_resource *res;
+       LIST_HEAD(cancels);
+       int count;
+       int rc;
+       ENTRY;
+
+       res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+       if (res == NULL) {
+               /* This is not a problem. */
+               CDEBUG(D_INFO, "No resource "LPU64"\n", res_id->name[0]);
+               RETURN(0);
+       }
+
+       LDLM_RESOURCE_ADDREF(res);
+       count = ldlm_cancel_resource_local(res, &cancels, policy, mode,
+                                          0, flags | LCF_BL_AST, opaque);
+       rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags);
+       if (rc != ELDLM_OK)
+               CERROR("ldlm_cli_cancel_unused_resource: %d\n", rc);
+
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource);
+
+struct ldlm_cli_cancel_arg {
+       int     lc_flags;
+       void   *lc_opaque;
+};
+
+static int ldlm_cli_hash_cancel_unused(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                      struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource       *res = cfs_hash_object(hs, hnode);
+       struct ldlm_cli_cancel_arg     *lc = arg;
+       int                          rc;
+
+       rc = ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name,
+                                            NULL, LCK_MINMODE,
+                                            lc->lc_flags, lc->lc_opaque);
+       if (rc != 0) {
+               CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
+                      res->lr_name.name[0], rc);
+       }
+       /* must return 0 for hash iteration */
+       return 0;
+}
+
+/**
+ * Cancel all locks on a namespace (or a specific resource, if given)
+ * that have 0 readers/writers.
+ *
+ * If flags & LCF_LOCAL, throw the locks away without trying
+ * to notify the server. */
+int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
+                          const struct ldlm_res_id *res_id,
+                          ldlm_cancel_flags_t flags, void *opaque)
+{
+       struct ldlm_cli_cancel_arg arg = {
+               .lc_flags       = flags,
+               .lc_opaque      = opaque,
+       };
+
+       ENTRY;
+
+       if (ns == NULL)
+               RETURN(ELDLM_OK);
+
+       if (res_id != NULL) {
+               RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
+                                                      LCK_MINMODE, flags,
+                                                      opaque));
+       } else {
+               cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                        ldlm_cli_hash_cancel_unused, &arg);
+               RETURN(ELDLM_OK);
+       }
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused);
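+
+/* Illustrative usage (sketch, not taken from the original callers): at
+ * unmount or after an eviction a caller can drop every unused lock in a
+ * namespace without notifying the server:
+ *
+ *     ldlm_cli_cancel_unused(ns, NULL, LCF_LOCAL, NULL);
+ *
+ * Passing a non-NULL \a res_id restricts the cancel to a single resource. */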
+
+/* Lock iterators. */
+
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+                         void *closure)
+{
+       struct list_head *tmp, *next;
+       struct ldlm_lock *lock;
+       int rc = LDLM_ITER_CONTINUE;
+
+       ENTRY;
+
+       if (!res)
+               RETURN(LDLM_ITER_CONTINUE);
+
+       lock_res(res);
+       list_for_each_safe(tmp, next, &res->lr_granted) {
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (iter(lock, closure) == LDLM_ITER_STOP)
+                       GOTO(out, rc = LDLM_ITER_STOP);
+       }
+
+       list_for_each_safe(tmp, next, &res->lr_converting) {
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (iter(lock, closure) == LDLM_ITER_STOP)
+                       GOTO(out, rc = LDLM_ITER_STOP);
+       }
+
+       list_for_each_safe(tmp, next, &res->lr_waiting) {
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (iter(lock, closure) == LDLM_ITER_STOP)
+                       GOTO(out, rc = LDLM_ITER_STOP);
+       }
+ out:
+       unlock_res(res);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_resource_foreach);
+
+struct iter_helper_data {
+       ldlm_iterator_t iter;
+       void *closure;
+};
+
+static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
+{
+       struct iter_helper_data *helper = closure;
+       return helper->iter(lock, helper->closure);
+}
+
+static int ldlm_res_iter_helper(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                               struct hlist_node *hnode, void *arg)
+
+{
+       struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+       return ldlm_resource_foreach(res, ldlm_iter_helper, arg) ==
+              LDLM_ITER_STOP;
+}
+
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+                           ldlm_iterator_t iter, void *closure)
+
+{
+       struct iter_helper_data helper = { .iter = iter, .closure = closure };
+
+       cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                ldlm_res_iter_helper, &helper);
+
+}
+EXPORT_SYMBOL(ldlm_namespace_foreach);
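+
+/* Example caller: ldlm_replay_locks() below walks every lock in the
+ * namespace with ldlm_chain_lock_for_replay() as the iterator to collect
+ * the locks that need to be replayed. */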
+
+/* Non-blocking function to manipulate a lock whose cb_data is being put away.
+ * Returns:  0:  no resource found
+ *         > 0:  LDLM_ITER_STOP/LDLM_ITER_CONTINUE
+ *         < 0:  error
+ */
+int ldlm_resource_iterate(struct ldlm_namespace *ns,
+                         const struct ldlm_res_id *res_id,
+                         ldlm_iterator_t iter, void *data)
+{
+       struct ldlm_resource *res;
+       int rc;
+       ENTRY;
+
+       if (ns == NULL) {
+               CERROR("must pass in namespace\n");
+               LBUG();
+       }
+
+       res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+       if (res == NULL)
+               RETURN(0);
+
+       LDLM_RESOURCE_ADDREF(res);
+       rc = ldlm_resource_foreach(res, iter, data);
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_resource_iterate);
+
+/* Lock replay */
+
+static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
+{
+       struct list_head *list = closure;
+
+       /* we use l_pending_chain here, because it's unused on clients. */
+       LASSERTF(list_empty(&lock->l_pending_chain),
+                "lock %p next %p prev %p\n",
+                lock, &lock->l_pending_chain.next, &lock->l_pending_chain.prev);
+       /* bug 9573: don't replay locks left after eviction, or
+        * bug 17614: locks being actively cancelled. Get a reference
+        * on a lock so that it does not disappear under us (e.g. due to cancel)
+        */
+       if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) {
+               list_add(&lock->l_pending_chain, list);
+               LDLM_LOCK_GET(lock);
+       }
+
+       return LDLM_ITER_CONTINUE;
+}
+
+static int replay_lock_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req,
+                                struct ldlm_async_args *aa, int rc)
+{
+       struct ldlm_lock     *lock;
+       struct ldlm_reply    *reply;
+       struct obd_export    *exp;
+
+       ENTRY;
+       atomic_dec(&req->rq_import->imp_replay_inflight);
+       if (rc != ELDLM_OK)
+               GOTO(out, rc);
+
+
+       reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+       if (reply == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       lock = ldlm_handle2lock(&aa->lock_handle);
+       if (!lock) {
+               CERROR("received replay ack for unknown local cookie "LPX64
+                      " remote cookie "LPX64 " from server %s id %s\n",
+                      aa->lock_handle.cookie, reply->lock_handle.cookie,
+                      req->rq_export->exp_client_uuid.uuid,
+                      libcfs_id2str(req->rq_peer));
+               GOTO(out, rc = -ESTALE);
+       }
+
+       /* Key change: rehash the lock in the per-export hash with the new key */
+       exp = req->rq_export;
+       if (exp && exp->exp_lock_hash) {
+               /* In the function below, .hs_keycmp resolves to
+                * ldlm_export_lock_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               cfs_hash_rehash_key(exp->exp_lock_hash,
+                                   &lock->l_remote_handle,
+                                   &reply->lock_handle,
+                                   &lock->l_exp_hash);
+       } else {
+               lock->l_remote_handle = reply->lock_handle;
+       }
+
+       LDLM_DEBUG(lock, "replayed lock:");
+       ptlrpc_import_recovery_state_machine(req->rq_import);
+       LDLM_LOCK_PUT(lock);
+out:
+       if (rc != ELDLM_OK)
+               ptlrpc_connect_import(req->rq_import);
+
+       RETURN(rc);
+}
+
+static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
+{
+       struct ptlrpc_request *req;
+       struct ldlm_async_args *aa;
+       struct ldlm_request   *body;
+       int flags;
+       ENTRY;
+
+
+       /* Bug 11974: Do not replay a lock which is actively being canceled */
+       if (lock->l_flags & LDLM_FL_CANCELING) {
+               LDLM_DEBUG(lock, "Not replaying canceled lock:");
+               RETURN(0);
+       }
+
+       /* If this is a reply-less callback lock, we cannot replay it, since
+        * the server might have long since dropped it, but the notification of
+        * that event was lost by the network (and the server may have already
+        * granted a conflicting lock). */
+       if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) {
+               LDLM_DEBUG(lock, "Not replaying reply-less lock:");
+               ldlm_lock_cancel(lock);
+               RETURN(0);
+       }
+
+       /*
+        * If granted mode matches the requested mode, this lock is granted.
+        *
+        * If they differ, but we have a granted mode, then we were granted
+        * one mode and now want another: ergo, converting.
+        *
+        * If we haven't been granted anything and are on a resource list,
+        * then we're blocked/waiting.
+        *
+        * If we haven't been granted anything and we're NOT on a resource list,
+        * then we haven't got a reply yet and don't have a known disposition.
+        * This happens whenever a lock enqueue is the request that triggers
+        * recovery.
+        */
+       if (lock->l_granted_mode == lock->l_req_mode)
+               flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
+       else if (lock->l_granted_mode)
+               flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV;
+       else if (!list_empty(&lock->l_res_link))
+               flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
+       else
+               flags = LDLM_FL_REPLAY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE,
+                                       LUSTRE_DLM_VERSION, LDLM_ENQUEUE);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       /* We're part of recovery, so don't wait for it. */
+       req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       ldlm_lock2desc(lock, &body->lock_desc);
+       body->lock_flags = ldlm_flags_to_wire(flags);
+
+       ldlm_lock2handle(lock, &body->lock_handle[0]);
+       if (lock->l_lvb_len > 0)
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB);
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                            lock->l_lvb_len);
+       ptlrpc_request_set_replen(req);
+       /* Notify the server we've replayed all requests.
+        * Also, mark the request to be put on a dedicated
+        * queue to be processed after all request replays.
+        * Bug 6063 */
+       lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
+
+       LDLM_DEBUG(lock, "replaying lock:");
+
+       atomic_inc(&req->rq_import->imp_replay_inflight);
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       aa->lock_handle = body->lock_handle[0];
+       req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
+       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+       RETURN(0);
+}
+
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we cannot wait for any outstanding RPCs before sending
+ * RPCs to the server.
+ *
+ * Called only in recovery before replaying locks. There is no need to
+ * replay locks that are unused; since clients may hold thousands of
+ * cached unused locks, dropping them can greatly reduce the load on
+ * the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+       int canceled;
+       LIST_HEAD(cancels);
+
+       CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before "
+                          "replay for namespace %s (%d)\n",
+                          ldlm_ns_name(ns), ns->ns_nr_unused);
+
+       /* We don't need to care whether or not LRU resize is enabled
+        * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+        * count parameter */
+       canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+                                        LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+       CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+                          canceled, ldlm_ns_name(ns));
+}
+
+int ldlm_replay_locks(struct obd_import *imp)
+{
+       struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+       LIST_HEAD(list);
+       struct ldlm_lock *lock, *next;
+       int rc = 0;
+
+       ENTRY;
+
+       LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+
+       /* don't replay locks if import failed recovery */
+       if (imp->imp_vbr_failed)
+               RETURN(0);
+
+       /* ensure this doesn't fall to 0 before all have been queued */
+       atomic_inc(&imp->imp_replay_inflight);
+
+       if (ldlm_cancel_unused_locks_before_replay)
+               ldlm_cancel_unused_locks_for_replay(ns);
+
+       ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
+
+       list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
+               list_del_init(&lock->l_pending_chain);
+               if (rc) {
+                       LDLM_LOCK_RELEASE(lock);
+                       continue; /* or try to do the rest? */
+               }
+               rc = replay_one_lock(imp, lock);
+               LDLM_LOCK_RELEASE(lock);
+       }
+
+       atomic_dec(&imp->imp_replay_inflight);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_replay_locks);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
new file mode 100644 (file)
index 0000000..6bdfb42
--- /dev/null
@@ -0,0 +1,1444 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_resource.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+# include <lustre_dlm.h>
+
+#include <lustre_fid.h>
+#include <obd_class.h>
+#include "ldlm_internal.h"
+
+struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab;
+
+atomic_t ldlm_srv_namespace_nr = ATOMIC_INIT(0);
+atomic_t ldlm_cli_namespace_nr = ATOMIC_INIT(0);
+
+struct mutex ldlm_srv_namespace_lock;
+LIST_HEAD(ldlm_srv_namespace_list);
+
+struct mutex ldlm_cli_namespace_lock;
+LIST_HEAD(ldlm_cli_namespace_list);
+
+proc_dir_entry_t *ldlm_type_proc_dir = NULL;
+proc_dir_entry_t *ldlm_ns_proc_dir = NULL;
+proc_dir_entry_t *ldlm_svc_proc_dir = NULL;
+
+extern unsigned int ldlm_cancel_unused_locks_before_replay;
+
+/* During a debug dump, limit the number of granted locks shown for one
+ * resource to avoid a DoS. */
+unsigned int ldlm_dump_granted_max = 256;
+
+#ifdef LPROCFS
+static int ldlm_proc_dump_ns(struct file *file, const char *buffer,
+                            unsigned long count, void *data)
+{
+       ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+       ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+       RETURN(count);
+}
+
+int ldlm_proc_setup(void)
+{
+       int rc;
+       struct lprocfs_vars list[] = {
+               { "dump_namespaces", NULL, ldlm_proc_dump_ns, NULL },
+               { "dump_granted_max",
+                 lprocfs_rd_uint, lprocfs_wr_uint,
+                 &ldlm_dump_granted_max, NULL },
+               { "cancel_unused_locks_before_replay",
+                 lprocfs_rd_uint, lprocfs_wr_uint,
+                 &ldlm_cancel_unused_locks_before_replay, NULL },
+               { NULL }};
+       ENTRY;
+       LASSERT(ldlm_ns_proc_dir == NULL);
+
+       ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME,
+                                             proc_lustre_root,
+                                             NULL, NULL);
+       if (IS_ERR(ldlm_type_proc_dir)) {
+               CERROR("LProcFS failed in ldlm-init\n");
+               rc = PTR_ERR(ldlm_type_proc_dir);
+               GOTO(err, rc);
+       }
+
+       ldlm_ns_proc_dir = lprocfs_register("namespaces",
+                                           ldlm_type_proc_dir,
+                                           NULL, NULL);
+       if (IS_ERR(ldlm_ns_proc_dir)) {
+               CERROR("LProcFS failed in ldlm-init\n");
+               rc = PTR_ERR(ldlm_ns_proc_dir);
+               GOTO(err_type, rc);
+       }
+
+       ldlm_svc_proc_dir = lprocfs_register("services",
+                                           ldlm_type_proc_dir,
+                                           NULL, NULL);
+       if (IS_ERR(ldlm_svc_proc_dir)) {
+               CERROR("LProcFS failed in ldlm-init\n");
+               rc = PTR_ERR(ldlm_svc_proc_dir);
+               GOTO(err_ns, rc);
+       }
+
+       rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL);
+
+       RETURN(0);
+
+err_ns:
+       lprocfs_remove(&ldlm_ns_proc_dir);
+err_type:
+       lprocfs_remove(&ldlm_type_proc_dir);
+err:
+       ldlm_svc_proc_dir = NULL;
+       RETURN(rc);
+}
+
+void ldlm_proc_cleanup(void)
+{
+       if (ldlm_svc_proc_dir)
+               lprocfs_remove(&ldlm_svc_proc_dir);
+
+       if (ldlm_ns_proc_dir)
+               lprocfs_remove(&ldlm_ns_proc_dir);
+
+       if (ldlm_type_proc_dir)
+               lprocfs_remove(&ldlm_type_proc_dir);
+}
+
+static int lprocfs_rd_ns_resources(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data)
+{
+       struct ldlm_namespace *ns  = data;
+       __u64             res = 0;
+       cfs_hash_bd_t     bd;
+       int                 i;
+
+       /* result is not strictly consistent */
+       cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i)
+               res += cfs_hash_bd_count_get(&bd);
+       return lprocfs_rd_u64(page, start, off, count, eof, &res);
+}
+
+static int lprocfs_rd_ns_locks(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+       struct ldlm_namespace *ns = data;
+       __u64             locks;
+
+       locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS,
+                                       LPROCFS_FIELDS_FLAGS_SUM);
+       return lprocfs_rd_u64(page, start, off, count, eof, &locks);
+}
+
+static int lprocfs_rd_lru_size(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+       struct ldlm_namespace *ns = data;
+       __u32 *nr = &ns->ns_max_unused;
+
+       if (ns_connect_lru_resize(ns))
+               nr = &ns->ns_nr_unused;
+       return lprocfs_rd_uint(page, start, off, count, eof, nr);
+}
+
+static int lprocfs_wr_lru_size(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct ldlm_namespace *ns = data;
+       char dummy[MAX_STRING_SIZE + 1], *end;
+       unsigned long tmp;
+       int lru_resize;
+
+       dummy[MAX_STRING_SIZE] = '\0';
+       if (copy_from_user(dummy, buffer, MAX_STRING_SIZE))
+               return -EFAULT;
+
+       if (strncmp(dummy, "clear", 5) == 0) {
+               CDEBUG(D_DLMTRACE,
+                      "dropping all unused locks from namespace %s\n",
+                      ldlm_ns_name(ns));
+               if (ns_connect_lru_resize(ns)) {
+                       int canceled, unused  = ns->ns_nr_unused;
+
+                       /* Try to cancel all @ns_nr_unused locks. */
+                       canceled = ldlm_cancel_lru(ns, unused, 0,
+                                                  LDLM_CANCEL_PASSED);
+                       if (canceled < unused) {
+                               CDEBUG(D_DLMTRACE,
+                                      "not all requested locks are canceled, "
+                                      "requested: %d, canceled: %d\n", unused,
+                                      canceled);
+                               return -EINVAL;
+                       }
+               } else {
+                       tmp = ns->ns_max_unused;
+                       ns->ns_max_unused = 0;
+                       ldlm_cancel_lru(ns, 0, 0, LDLM_CANCEL_PASSED);
+                       ns->ns_max_unused = tmp;
+               }
+               return count;
+       }
+
+       tmp = simple_strtoul(dummy, &end, 0);
+       if (dummy == end) {
+               CERROR("invalid value written\n");
+               return -EINVAL;
+       }
+       lru_resize = (tmp == 0);
+
+       if (ns_connect_lru_resize(ns)) {
+               if (!lru_resize)
+                       ns->ns_max_unused = (unsigned int)tmp;
+
+               if (tmp > ns->ns_nr_unused)
+                       tmp = ns->ns_nr_unused;
+               tmp = ns->ns_nr_unused - tmp;
+
+               CDEBUG(D_DLMTRACE,
+                      "changing namespace %s unused locks from %u to %u\n",
+                      ldlm_ns_name(ns), ns->ns_nr_unused,
+                      (unsigned int)tmp);
+               ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_CANCEL_PASSED);
+
+               if (!lru_resize) {
+                       CDEBUG(D_DLMTRACE,
+                              "disable lru_resize for namespace %s\n",
+                              ldlm_ns_name(ns));
+                       ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE;
+               }
+       } else {
+               CDEBUG(D_DLMTRACE,
+                      "changing namespace %s max_unused from %u to %u\n",
+                      ldlm_ns_name(ns), ns->ns_max_unused,
+                      (unsigned int)tmp);
+               ns->ns_max_unused = (unsigned int)tmp;
+               ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_PASSED);
+
+               /* Make sure that LRU resize was originally supported before
+                * turning it on here. */
+               if (lru_resize &&
+                   (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) {
+                       CDEBUG(D_DLMTRACE,
+                              "enable lru_resize for namespace %s\n",
+                              ldlm_ns_name(ns));
+                       ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+               }
+       }
+
+       return count;
+}
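+
+/* Admin-facing sketch (assuming the usual lprocfs layout registered by
+ * ldlm_namespace_proc_register() below): writing to a namespace's lru_size
+ * file exercises the handler above, e.g.
+ *
+ *     echo clear > /proc/fs/lustre/ldlm/namespaces/<namespace>/lru_size
+ *     echo 400   > /proc/fs/lustre/ldlm/namespaces/<namespace>/lru_size
+ *
+ * "clear" cancels all unused locks; a number adjusts ns_max_unused and
+ * shrinks the LRU accordingly (writing 0 re-enables LRU resize if it was
+ * originally supported by the server). */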
+
+static int lprocfs_rd_elc(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+       struct ldlm_namespace *ns = data;
+       unsigned int supp = ns_connect_cancelset(ns);
+
+       return lprocfs_rd_uint(page, start, off, count, eof, &supp);
+}
+
+static int lprocfs_wr_elc(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct ldlm_namespace *ns = data;
+       unsigned int supp = -1;
+       int rc;
+
+       rc = lprocfs_wr_uint(file, buffer, count, &supp);
+       if (rc < 0)
+               return rc;
+
+       if (supp == 0)
+               ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET;
+       else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET)
+               ns->ns_connect_flags |= OBD_CONNECT_CANCELSET;
+       return count;
+}
+
+void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns)
+{
+       struct proc_dir_entry *dir;
+
+       dir = lprocfs_srch(ldlm_ns_proc_dir, ldlm_ns_name(ns));
+       if (dir == NULL) {
+               CERROR("dlm namespace %s has no procfs dir?\n",
+                      ldlm_ns_name(ns));
+       } else {
+               lprocfs_remove(&dir);
+       }
+
+       if (ns->ns_stats != NULL)
+               lprocfs_free_stats(&ns->ns_stats);
+}
+
+int ldlm_namespace_proc_register(struct ldlm_namespace *ns)
+{
+       struct lprocfs_vars lock_vars[2];
+       char lock_name[MAX_STRING_SIZE + 1];
+
+       LASSERT(ns != NULL);
+       LASSERT(ns->ns_rs_hash != NULL);
+
+       ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0);
+       if (ns->ns_stats == NULL)
+               return -ENOMEM;
+
+       lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS,
+                            LPROCFS_CNTR_AVGMINMAX, "locks", "locks");
+
+       lock_name[MAX_STRING_SIZE] = '\0';
+
+       memset(lock_vars, 0, sizeof(lock_vars));
+       lock_vars[0].name = lock_name;
+
+       snprintf(lock_name, MAX_STRING_SIZE, "%s/resource_count",
+                ldlm_ns_name(ns));
+       lock_vars[0].data = ns;
+       lock_vars[0].read_fptr = lprocfs_rd_ns_resources;
+       lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+       snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_count",
+                ldlm_ns_name(ns));
+       lock_vars[0].data = ns;
+       lock_vars[0].read_fptr = lprocfs_rd_ns_locks;
+       lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+       if (ns_is_client(ns)) {
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_unused_count",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = &ns->ns_nr_unused;
+               lock_vars[0].read_fptr = lprocfs_rd_uint;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_size",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = ns;
+               lock_vars[0].read_fptr = lprocfs_rd_lru_size;
+               lock_vars[0].write_fptr = lprocfs_wr_lru_size;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_max_age",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = &ns->ns_max_age;
+               lock_vars[0].read_fptr = lprocfs_rd_uint;
+               lock_vars[0].write_fptr = lprocfs_wr_uint;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/early_lock_cancel",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = ns;
+               lock_vars[0].read_fptr = lprocfs_rd_elc;
+               lock_vars[0].write_fptr = lprocfs_wr_elc;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+       } else {
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/ctime_age_limit",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = &ns->ns_ctime_age_limit;
+               lock_vars[0].read_fptr = lprocfs_rd_uint;
+               lock_vars[0].write_fptr = lprocfs_wr_uint;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_timeouts",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = &ns->ns_timeouts;
+               lock_vars[0].read_fptr = lprocfs_rd_uint;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/max_nolock_bytes",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = &ns->ns_max_nolock_size;
+               lock_vars[0].read_fptr = lprocfs_rd_uint;
+               lock_vars[0].write_fptr = lprocfs_wr_uint;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/contention_seconds",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = &ns->ns_contention_time;
+               lock_vars[0].read_fptr = lprocfs_rd_uint;
+               lock_vars[0].write_fptr = lprocfs_wr_uint;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/contended_locks",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = &ns->ns_contended_locks;
+               lock_vars[0].read_fptr = lprocfs_rd_uint;
+               lock_vars[0].write_fptr = lprocfs_wr_uint;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+
+               snprintf(lock_name, MAX_STRING_SIZE, "%s/max_parallel_ast",
+                        ldlm_ns_name(ns));
+               lock_vars[0].data = &ns->ns_max_parallel_ast;
+               lock_vars[0].read_fptr = lprocfs_rd_uint;
+               lock_vars[0].write_fptr = lprocfs_wr_uint;
+               lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
+       }
+       return 0;
+}
+#undef MAX_STRING_SIZE
+#else /* LPROCFS */
+
+#define ldlm_namespace_proc_unregister(ns)      ({;})
+#define ldlm_namespace_proc_register(ns)       ({0;})
+
+#endif /* LPROCFS */
+
+static unsigned ldlm_res_hop_hash(cfs_hash_t *hs,
+                                 const void *key, unsigned mask)
+{
+       const struct ldlm_res_id     *id  = key;
+       unsigned                val = 0;
+       unsigned                i;
+
+       for (i = 0; i < RES_NAME_SIZE; i++)
+               val += id->name[i];
+       return val & mask;
+}
+
+static unsigned ldlm_res_hop_fid_hash(cfs_hash_t *hs,
+                                     const void *key, unsigned mask)
+{
+       const struct ldlm_res_id *id = key;
+       struct lu_fid       fid;
+       __u32          hash;
+       __u32          val;
+
+       fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF];
+       fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF];
+       fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+       hash = fid_flatten32(&fid);
+       hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+       if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) {
+               val = id->name[LUSTRE_RES_ID_HSH_OFF];
+               hash += (val >> 5) + (val << 11);
+       } else {
+               val = fid_oid(&fid);
+       }
+       hash = cfs_hash_long(hash, hs->hs_bkt_bits);
+       /* give me another random factor */
+       hash -= cfs_hash_long((unsigned long)hs, val % 11 + 3);
+
+       hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+       hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1);
+
+       return hash & mask;
+}
+
+static void *ldlm_res_hop_key(struct hlist_node *hnode)
+{
+       struct ldlm_resource   *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       return &res->lr_name;
+}
+
+static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct ldlm_resource   *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       return ldlm_res_eq((const struct ldlm_res_id *)key,
+                          (const struct ldlm_res_id *)&res->lr_name);
+}
+
+static void *ldlm_res_hop_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct ldlm_resource, lr_hash);
+}
+
+static void ldlm_res_hop_get_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_resource *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       ldlm_resource_getref(res);
+}
+
+static void ldlm_res_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_resource *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       /* cfs_hash_for_each_nolock is the only path that calls this */
+       ldlm_resource_putref_locked(res);
+}
+
+static void ldlm_res_hop_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_resource *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       ldlm_resource_putref(res);
+}
+
+cfs_hash_ops_t ldlm_ns_hash_ops = {
+       .hs_hash        = ldlm_res_hop_hash,
+       .hs_key  = ldlm_res_hop_key,
+       .hs_keycmp      = ldlm_res_hop_keycmp,
+       .hs_keycpy      = NULL,
+       .hs_object      = ldlm_res_hop_object,
+       .hs_get  = ldlm_res_hop_get_locked,
+       .hs_put_locked  = ldlm_res_hop_put_locked,
+       .hs_put  = ldlm_res_hop_put
+};
+
+cfs_hash_ops_t ldlm_ns_fid_hash_ops = {
+       .hs_hash        = ldlm_res_hop_fid_hash,
+       .hs_key  = ldlm_res_hop_key,
+       .hs_keycmp      = ldlm_res_hop_keycmp,
+       .hs_keycpy      = NULL,
+       .hs_object      = ldlm_res_hop_object,
+       .hs_get  = ldlm_res_hop_get_locked,
+       .hs_put_locked  = ldlm_res_hop_put_locked,
+       .hs_put  = ldlm_res_hop_put
+};
+
+typedef struct {
+       ldlm_ns_type_t  nsd_type;
+       /** hash bucket bits */
+       unsigned        nsd_bkt_bits;
+       /** hash bits */
+       unsigned        nsd_all_bits;
+       /** hash operations */
+       cfs_hash_ops_t *nsd_hops;
+} ldlm_ns_hash_def_t;
+
+ldlm_ns_hash_def_t ldlm_ns_hash_defs[] =
+{
+       {
+               .nsd_type       = LDLM_NS_TYPE_MDC,
+               .nsd_bkt_bits   = 11,
+               .nsd_all_bits   = 16,
+               .nsd_hops       = &ldlm_ns_fid_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_MDT,
+               .nsd_bkt_bits   = 14,
+               .nsd_all_bits   = 21,
+               .nsd_hops       = &ldlm_ns_fid_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_OSC,
+               .nsd_bkt_bits   = 8,
+               .nsd_all_bits   = 12,
+               .nsd_hops       = &ldlm_ns_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_OST,
+               .nsd_bkt_bits   = 11,
+               .nsd_all_bits   = 17,
+               .nsd_hops       = &ldlm_ns_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_MGC,
+               .nsd_bkt_bits   = 4,
+               .nsd_all_bits   = 4,
+               .nsd_hops       = &ldlm_ns_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_MGT,
+               .nsd_bkt_bits   = 4,
+               .nsd_all_bits   = 4,
+               .nsd_hops       = &ldlm_ns_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_UNKNOWN,
+       },
+};
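+
+/* The table above selects the resource hash geometry per namespace type:
+ * metadata namespaces (MDC/MDT) hash resource names by FID through
+ * ldlm_ns_fid_hash_ops, while data and management namespaces
+ * (OSC/OST/MGC/MGT) use the simple name-sum hash; nsd_all_bits sizes the
+ * whole hash table and nsd_bkt_bits the per-bucket portion. */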
+
+/**
+ * Create and initialize new empty namespace.
+ */
+struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name,
+                                         ldlm_side_t client,
+                                         ldlm_appetite_t apt,
+                                         ldlm_ns_type_t ns_type)
+{
+       struct ldlm_namespace *ns = NULL;
+       struct ldlm_ns_bucket *nsb;
+       ldlm_ns_hash_def_t    *nsd;
+       cfs_hash_bd_t     bd;
+       int                 idx;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(obd != NULL);
+
+       rc = ldlm_get_ref();
+       if (rc) {
+               CERROR("ldlm_get_ref failed: %d\n", rc);
+               RETURN(NULL);
+       }
+
+       for (idx = 0;;idx++) {
+               nsd = &ldlm_ns_hash_defs[idx];
+               if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) {
+                       CERROR("Unknown type %d for ns %s\n", ns_type, name);
+                       GOTO(out_ref, NULL);
+               }
+
+               if (nsd->nsd_type == ns_type)
+                       break;
+       }
+
+       OBD_ALLOC_PTR(ns);
+       if (!ns)
+               GOTO(out_ref, NULL);
+
+       ns->ns_rs_hash = cfs_hash_create(name,
+                                        nsd->nsd_all_bits, nsd->nsd_all_bits,
+                                        nsd->nsd_bkt_bits, sizeof(*nsb),
+                                        CFS_HASH_MIN_THETA,
+                                        CFS_HASH_MAX_THETA,
+                                        nsd->nsd_hops,
+                                        CFS_HASH_DEPTH |
+                                        CFS_HASH_BIGNAME |
+                                        CFS_HASH_SPIN_BKTLOCK |
+                                        CFS_HASH_NO_ITEMREF);
+       if (ns->ns_rs_hash == NULL)
+               GOTO(out_ns, NULL);
+
+       cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) {
+               nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+               at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0);
+               nsb->nsb_namespace = ns;
+       }
+
+       ns->ns_obd      = obd;
+       ns->ns_appetite = apt;
+       ns->ns_client   = client;
+
+       INIT_LIST_HEAD(&ns->ns_list_chain);
+       INIT_LIST_HEAD(&ns->ns_unused_list);
+       spin_lock_init(&ns->ns_lock);
+       atomic_set(&ns->ns_bref, 0);
+       init_waitqueue_head(&ns->ns_waitq);
+
+       ns->ns_max_nolock_size    = NS_DEFAULT_MAX_NOLOCK_BYTES;
+       ns->ns_contention_time    = NS_DEFAULT_CONTENTION_SECONDS;
+       ns->ns_contended_locks    = NS_DEFAULT_CONTENDED_LOCKS;
+
+       ns->ns_max_parallel_ast   = LDLM_DEFAULT_PARALLEL_AST_LIMIT;
+       ns->ns_nr_unused          = 0;
+       ns->ns_max_unused        = LDLM_DEFAULT_LRU_SIZE;
+       ns->ns_max_age      = LDLM_DEFAULT_MAX_ALIVE;
+       ns->ns_ctime_age_limit    = LDLM_CTIME_AGE_LIMIT;
+       ns->ns_timeouts    = 0;
+       ns->ns_orig_connect_flags = 0;
+       ns->ns_connect_flags      = 0;
+       ns->ns_stopping    = 0;
+       rc = ldlm_namespace_proc_register(ns);
+       if (rc != 0) {
+               CERROR("Can't initialize ns proc, rc %d\n", rc);
+               GOTO(out_hash, rc);
+       }
+
+       idx = atomic_read(ldlm_namespace_nr(client));
+       rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client);
+       if (rc) {
+               CERROR("Can't initialize lock pool, rc %d\n", rc);
+               GOTO(out_proc, rc);
+       }
+
+       ldlm_namespace_register(ns, client);
+       RETURN(ns);
+out_proc:
+       ldlm_namespace_proc_unregister(ns);
+       ldlm_namespace_cleanup(ns, 0);
+out_hash:
+       cfs_hash_putref(ns->ns_rs_hash);
+out_ns:
+       OBD_FREE_PTR(ns);
+out_ref:
+       ldlm_put_ref();
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(ldlm_namespace_new);
+
+extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+
+/**
+ * Cancel and destroy all locks on a resource.
+ *
+ * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just
+ * clean up.  This is currently only used for recovery, and we make
+ * certain assumptions as a result--notably, that we shouldn't cancel
+ * locks with refs.
+ */
+static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
+                            __u64 flags)
+{
+       struct list_head *tmp;
+       int rc = 0, client = ns_is_client(ldlm_res_to_ns(res));
+       bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY);
+
+       do {
+               struct ldlm_lock *lock = NULL;
+
+               /* First, look for a lock that has not been cleaned yet;
+                * all cleaned locks are marked by the CLEANED flag. */
+               lock_res(res);
+               list_for_each(tmp, q) {
+                       lock = list_entry(tmp, struct ldlm_lock,
+                                             l_res_link);
+                       if (lock->l_flags & LDLM_FL_CLEANED) {
+                               lock = NULL;
+                               continue;
+                       }
+                       LDLM_LOCK_GET(lock);
+                       lock->l_flags |= LDLM_FL_CLEANED;
+                       break;
+               }
+
+               if (lock == NULL) {
+                       unlock_res(res);
+                       break;
+               }
+
+               /* Set CBPENDING so nothing in the cancellation path
+                * can match this lock. */
+               lock->l_flags |= LDLM_FL_CBPENDING;
+               lock->l_flags |= LDLM_FL_FAILED;
+               lock->l_flags |= flags;
+
+               /* ... without sending a CANCEL message for local_only. */
+               if (local_only)
+                       lock->l_flags |= LDLM_FL_LOCAL_ONLY;
+
+               if (local_only && (lock->l_readers || lock->l_writers)) {
+                       /* This is a little bit gross, but much better than the
+                        * alternative: pretend that we got a blocking AST from
+                        * the server, so that when the lock is decref'd, it
+                        * will go away ... */
+                       unlock_res(res);
+                       LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY");
+                       if (lock->l_completion_ast)
+                               lock->l_completion_ast(lock, 0, NULL);
+                       LDLM_LOCK_RELEASE(lock);
+                       continue;
+               }
+
+               if (client) {
+                       struct lustre_handle lockh;
+
+                       unlock_res(res);
+                       ldlm_lock2handle(lock, &lockh);
+                       rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+                       if (rc)
+                               CERROR("ldlm_cli_cancel: %d\n", rc);
+               } else {
+                       ldlm_resource_unlink_lock(lock);
+                       unlock_res(res);
+                       LDLM_DEBUG(lock, "Freeing a lock still held by a "
+                                  "client node");
+                       ldlm_lock_destroy(lock);
+               }
+               LDLM_LOCK_RELEASE(lock);
+       } while (1);
+}
+
+static int ldlm_resource_clean(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                              struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+       __u64 flags = *(__u64 *)arg;
+
+       cleanup_resource(res, &res->lr_granted, flags);
+       cleanup_resource(res, &res->lr_converting, flags);
+       cleanup_resource(res, &res->lr_waiting, flags);
+
+       return 0;
+}
+
+static int ldlm_resource_complain(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                 struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource  *res = cfs_hash_object(hs, hnode);
+
+       lock_res(res);
+       CERROR("Namespace %s resource refcount nonzero "
+              "(%d) after lock cleanup; forcing "
+              "cleanup.\n",
+              ldlm_ns_name(ldlm_res_to_ns(res)),
+              atomic_read(&res->lr_refcount) - 1);
+
+       CERROR("Resource: %p ("LPU64"/"LPU64"/"LPU64"/"
+              LPU64") (rc: %d)\n", res,
+              res->lr_name.name[0], res->lr_name.name[1],
+              res->lr_name.name[2], res->lr_name.name[3],
+              atomic_read(&res->lr_refcount) - 1);
+
+       ldlm_resource_dump(D_ERROR, res);
+       unlock_res(res);
+       return 0;
+}
+
+/**
+ * Cancel and destroy all locks in the namespace.
+ *
+ * Typically used during evictions when server notified client that it was
+ * evicted and all of its state needs to be destroyed.
+ * Also used during shutdown.
+ */
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags)
+{
+       if (ns == NULL) {
+               CDEBUG(D_INFO, "NULL ns, skipping cleanup\n");
+               return ELDLM_OK;
+       }
+
+       cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, &flags);
+       cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, NULL);
+       return ELDLM_OK;
+}
+EXPORT_SYMBOL(ldlm_namespace_cleanup);
+
+/**
+ * Attempts to free namespace.
+ *
+ * Only used when namespace goes away, like during an unmount.
+ */
+static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force)
+{
+       ENTRY;
+
+       /* At shutdown time, don't call the cancellation callback */
+       ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0);
+
+       if (atomic_read(&ns->ns_bref) > 0) {
+               struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+               int rc;
+               CDEBUG(D_DLMTRACE,
+                      "dlm namespace %s free waiting on refcount %d\n",
+                      ldlm_ns_name(ns), atomic_read(&ns->ns_bref));
+force_wait:
+               if (force)
+                       lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+
+               rc = l_wait_event(ns->ns_waitq,
+                                 atomic_read(&ns->ns_bref) == 0, &lwi);
+
+               /* Forced cleanups should be able to reclaim all references,
+                * so it's safe to wait forever... we can't leak locks... */
+               if (force && rc == -ETIMEDOUT) {
+                       LCONSOLE_ERROR("Forced cleanup waiting for %s "
+                                      "namespace with %d resources in use, "
+                                      "(rc=%d)\n", ldlm_ns_name(ns),
+                                      atomic_read(&ns->ns_bref), rc);
+                       GOTO(force_wait, rc);
+               }
+
+               if (atomic_read(&ns->ns_bref)) {
+                       LCONSOLE_ERROR("Cleanup waiting for %s namespace "
+                                      "with %d resources in use, (rc=%d)\n",
+                                      ldlm_ns_name(ns),
+                                      atomic_read(&ns->ns_bref), rc);
+                       RETURN(ELDLM_NAMESPACE_EXISTS);
+               }
+               CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n",
+                      ldlm_ns_name(ns));
+       }
+
+       RETURN(ELDLM_OK);
+}
+
+/**
+ * Performs various cleanups for passed \a ns to make it drop refc and be
+ * ready for freeing. Waits for refc == 0.
+ *
+ * The following is done:
+ * (0) Unregister \a ns from its list to make it inaccessible to potential
+ * users such as the pools thread;
+ * (1) Clear all locks in \a ns.
+ */
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+                              struct obd_import *imp,
+                              int force)
+{
+       int rc;
+       ENTRY;
+       if (!ns) {
+               EXIT;
+               return;
+       }
+
+       spin_lock(&ns->ns_lock);
+       ns->ns_stopping = 1;
+       spin_unlock(&ns->ns_lock);
+
+       /*
+        * Can fail with -EINTR when force == 0 in which case try harder.
+        */
+       rc = __ldlm_namespace_free(ns, force);
+       if (rc != ELDLM_OK) {
+               if (imp) {
+                       ptlrpc_disconnect_import(imp, 0);
+                       ptlrpc_invalidate_import(imp);
+               }
+
+               /*
+                * With all requests dropped and the import inactive
+                * we are guaranteed all references will be dropped.
+                */
+               rc = __ldlm_namespace_free(ns, 1);
+               LASSERT(rc == 0);
+       }
+       EXIT;
+}
+
+/**
+ * Frees the memory structures related to \a ns. This is only done when
+ * ldlm_namespace_free_prior() has successfully removed all resources
+ * referencing \a ns and its refc == 0.
+ */
+void ldlm_namespace_free_post(struct ldlm_namespace *ns)
+{
+       ENTRY;
+       if (!ns) {
+               EXIT;
+               return;
+       }
+
+       /* Make sure that nobody can find this ns in its list. */
+       ldlm_namespace_unregister(ns, ns->ns_client);
+       /* Fini pool _before_ the parent proc dir is removed. This is important
+        * as ldlm_pool_fini() removes its own proc dir, which is a child of
+        * the parent dir. Removing it after the parent may cause an oops. */
+       ldlm_pool_fini(&ns->ns_pool);
+
+       ldlm_namespace_proc_unregister(ns);
+       cfs_hash_putref(ns->ns_rs_hash);
+       /* Namespace \a ns should not be on the list at this time, otherwise
+        * this will cause issues related to using a freed \a ns in the pools
+        * thread. */
+       LASSERT(list_empty(&ns->ns_list_chain));
+       OBD_FREE_PTR(ns);
+       ldlm_put_ref();
+       EXIT;
+}
+
+/**
+ * Cleanup the resource, and free namespace.
+ * bug 12864:
+ * Deadlock issue:
+ * proc1: destroy import
+ *     class_disconnect_export(grab cl_sem) ->
+ *           -> ldlm_namespace_free ->
+ *           -> lprocfs_remove(grab _lprocfs_lock).
+ * proc2: read proc info
+ *     lprocfs_fops_read(grab _lprocfs_lock) ->
+ *           -> osc_rd_active, etc(grab cl_sem).
+ *
+ * To avoid this, ldlm_namespace_free is split into two parts: the first part,
+ * ldlm_namespace_free_prior, cleans up the resources that are still in use;
+ * the second part, ldlm_namespace_free_post, unregisters the lprocfs entries
+ * and then frees the memory. The latter is called without cli->cl_sem held.
+ */
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+                        struct obd_import *imp,
+                        int force)
+{
+       ldlm_namespace_free_prior(ns, imp, force);
+       ldlm_namespace_free_post(ns);
+}
+EXPORT_SYMBOL(ldlm_namespace_free);
+
+void ldlm_namespace_get(struct ldlm_namespace *ns)
+{
+       atomic_inc(&ns->ns_bref);
+}
+EXPORT_SYMBOL(ldlm_namespace_get);
+
+void ldlm_namespace_put(struct ldlm_namespace *ns)
+{
+       if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) {
+               wake_up(&ns->ns_waitq);
+               spin_unlock(&ns->ns_lock);
+       }
+}
+EXPORT_SYMBOL(ldlm_namespace_put);
+
+/** Register \a ns in the list of namespaces */
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+       mutex_lock(ldlm_namespace_lock(client));
+       LASSERT(list_empty(&ns->ns_list_chain));
+       list_add(&ns->ns_list_chain, ldlm_namespace_list(client));
+       atomic_inc(ldlm_namespace_nr(client));
+       mutex_unlock(ldlm_namespace_lock(client));
+}
+
+/** Unregister \a ns from the list of namespaces. */
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+       mutex_lock(ldlm_namespace_lock(client));
+       LASSERT(!list_empty(&ns->ns_list_chain));
+       /* Some asserts and possibly other parts of the code are still
+        * using list_empty(&ns->ns_list_chain). This is why it is
+        * important to use list_del_init() here. */
+       list_del_init(&ns->ns_list_chain);
+       atomic_dec(ldlm_namespace_nr(client));
+       mutex_unlock(ldlm_namespace_lock(client));
+}
+
+/** Should be called with ldlm_namespace_lock(client) taken. */
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+       LASSERT(!list_empty(&ns->ns_list_chain));
+       LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+       list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client));
+}
+
+/** Should be called with ldlm_namespace_lock(client) taken. */
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client)
+{
+       LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+       LASSERT(!list_empty(ldlm_namespace_list(client)));
+       return container_of(ldlm_namespace_list(client)->next,
+               struct ldlm_namespace, ns_list_chain);
+}
+
+/** Create and initialize new resource. */
+static struct ldlm_resource *ldlm_resource_new(void)
+{
+       struct ldlm_resource *res;
+       int idx;
+
+       OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, __GFP_IO);
+       if (res == NULL)
+               return NULL;
+
+       INIT_LIST_HEAD(&res->lr_granted);
+       INIT_LIST_HEAD(&res->lr_converting);
+       INIT_LIST_HEAD(&res->lr_waiting);
+
+       /* Initialize interval trees for each lock mode. */
+       for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+               res->lr_itree[idx].lit_size = 0;
+               res->lr_itree[idx].lit_mode = 1 << idx;
+               res->lr_itree[idx].lit_root = NULL;
+       }
+
+       atomic_set(&res->lr_refcount, 1);
+       spin_lock_init(&res->lr_lock);
+       lu_ref_init(&res->lr_reference);
+
+       /* The creator of the resource must unlock the mutex after LVB
+        * initialization. */
+       mutex_init(&res->lr_lvb_mutex);
+       mutex_lock(&res->lr_lvb_mutex);
+
+       return res;
+}
+
+/**
+ * Return a reference to resource with given name, creating it if necessary.
+ * Args: namespace with ns_lock unlocked
+ * Locks: takes and releases NS hash-lock and res->lr_lock
+ * Returns: referenced, unlocked ldlm_resource or NULL
+ */
+struct ldlm_resource *
+ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent,
+                 const struct ldlm_res_id *name, ldlm_type_t type, int create)
+{
+       struct hlist_node     *hnode;
+       struct ldlm_resource *res;
+       cfs_hash_bd_t    bd;
+       __u64            version;
+
+       LASSERT(ns != NULL);
+       LASSERT(parent == NULL);
+       LASSERT(ns->ns_rs_hash != NULL);
+       LASSERT(name->name[0] != 0);
+
+       cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0);
+       hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+       if (hnode != NULL) {
+               cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+               res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+               /* Synchronize with regard to resource creation. */
+               if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+                       mutex_lock(&res->lr_lvb_mutex);
+                       mutex_unlock(&res->lr_lvb_mutex);
+               }
+
+               if (unlikely(res->lr_lvb_len < 0)) {
+                       ldlm_resource_putref(res);
+                       res = NULL;
+               }
+               return res;
+       }
+
+       version = cfs_hash_bd_version_get(&bd);
+       cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+
+       if (create == 0)
+               return NULL;
+
+       LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE,
+                "type: %d\n", type);
+       res = ldlm_resource_new();
+       if (!res)
+               return NULL;
+
+       res->lr_ns_bucket  = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+       res->lr_name       = *name;
+       res->lr_type       = type;
+       res->lr_most_restr = LCK_NL;
+
+       cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+       hnode = (version == cfs_hash_bd_version_get(&bd)) ?  NULL :
+               cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+
+       if (hnode != NULL) {
+               /* Someone won the race and already added the resource. */
+               cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+               /* Clean lu_ref for failed resource. */
+               lu_ref_fini(&res->lr_reference);
+               /* We have taken lr_lvb_mutex. Drop it. */
+               mutex_unlock(&res->lr_lvb_mutex);
+               OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+
+               res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+               /* Synchronize with regard to resource creation. */
+               if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+                       mutex_lock(&res->lr_lvb_mutex);
+                       mutex_unlock(&res->lr_lvb_mutex);
+               }
+
+               if (unlikely(res->lr_lvb_len < 0)) {
+                       ldlm_resource_putref(res);
+                       res = NULL;
+               }
+               return res;
+       }
+       /* We won! Let's add the resource. */
+       cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash);
+       if (cfs_hash_bd_count_get(&bd) == 1)
+               ldlm_namespace_get(ns);
+
+       cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+       if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+               int rc;
+
+               OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2);
+               rc = ns->ns_lvbo->lvbo_init(res);
+               if (rc < 0) {
+                       CERROR("lvbo_init failed for resource "
+                              LPU64": rc %d\n", name->name[0], rc);
+                       if (res->lr_lvb_data) {
+                               OBD_FREE(res->lr_lvb_data, res->lr_lvb_len);
+                               res->lr_lvb_data = NULL;
+                       }
+                       res->lr_lvb_len = rc;
+                       mutex_unlock(&res->lr_lvb_mutex);
+                       ldlm_resource_putref(res);
+                       return NULL;
+               }
+       }
+
+       /* We create resource with locked lr_lvb_mutex. */
+       mutex_unlock(&res->lr_lvb_mutex);
+
+       return res;
+}
+EXPORT_SYMBOL(ldlm_resource_get);
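+
+/*
+ * Usage sketch (illustrative only, not a caller from this patch): the
+ * typical lookup-or-create pattern against a client namespace.  "ns" and
+ * "res_id" are assumed to be a valid namespace and a filled-in
+ * struct ldlm_res_id; error handling is reduced to the minimum.
+ *
+ *     struct ldlm_resource *res;
+ *
+ *     res = ldlm_resource_get(ns, NULL, &res_id, LDLM_PLAIN, 1);
+ *     if (res == NULL)
+ *             return -ENOMEM;
+ *     lock_res(res);
+ *     ... inspect or modify the granted/waiting lists ...
+ *     unlock_res(res);
+ *     ldlm_resource_putref(res);
+ */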
+
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res)
+{
+       LASSERT(res != NULL);
+       LASSERT(res != LP_POISON);
+       atomic_inc(&res->lr_refcount);
+       CDEBUG(D_INFO, "getref res: %p count: %d\n", res,
+              atomic_read(&res->lr_refcount));
+       return res;
+}
+
+static void __ldlm_resource_putref_final(cfs_hash_bd_t *bd,
+                                        struct ldlm_resource *res)
+{
+       struct ldlm_ns_bucket *nsb = res->lr_ns_bucket;
+
+       if (!list_empty(&res->lr_granted)) {
+               ldlm_resource_dump(D_ERROR, res);
+               LBUG();
+       }
+
+       if (!list_empty(&res->lr_converting)) {
+               ldlm_resource_dump(D_ERROR, res);
+               LBUG();
+       }
+
+       if (!list_empty(&res->lr_waiting)) {
+               ldlm_resource_dump(D_ERROR, res);
+               LBUG();
+       }
+
+       cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash,
+                              bd, &res->lr_hash);
+       lu_ref_fini(&res->lr_reference);
+       if (cfs_hash_bd_count_get(bd) == 0)
+               ldlm_namespace_put(nsb->nsb_namespace);
+}
+
+/* Returns 1 if the resource was freed, 0 if it remains. */
+int ldlm_resource_putref(struct ldlm_resource *res)
+{
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+       cfs_hash_bd_t   bd;
+
+       LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+       CDEBUG(D_INFO, "putref res: %p count: %d\n",
+              res, atomic_read(&res->lr_refcount) - 1);
+
+       cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd);
+       if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) {
+               __ldlm_resource_putref_final(&bd, res);
+               cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+               if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+                       ns->ns_lvbo->lvbo_free(res);
+               OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ldlm_resource_putref);
+
+/* Returns 1 if the resource was freed, 0 if it remains. */
+int ldlm_resource_putref_locked(struct ldlm_resource *res)
+{
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+       LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+       CDEBUG(D_INFO, "putref res: %p count: %d\n",
+              res, atomic_read(&res->lr_refcount) - 1);
+
+       if (atomic_dec_and_test(&res->lr_refcount)) {
+               cfs_hash_bd_t bd;
+
+               cfs_hash_bd_get(ldlm_res_to_ns(res)->ns_rs_hash,
+                               &res->lr_name, &bd);
+               __ldlm_resource_putref_final(&bd, res);
+               cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+               /* NB: ns_rs_hash is created with CFS_HASH_NO_ITEMREF,
+                * so we should never get here from cfs_hash_del();
+                * cfs_hash_for_each_nolock() is the only path that can
+                * reach this point, and there it is safe to release
+                * cfs_hash_bd_lock.
+                */
+               if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+                       ns->ns_lvbo->lvbo_free(res);
+               OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+
+               cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+               return 1;
+       }
+       return 0;
+}
+
+/**
+ * Add a lock to the specified lock list of a given resource.
+ */
+void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head,
+                           struct ldlm_lock *lock)
+{
+       check_res_locked(res);
+
+       LDLM_DEBUG(lock, "About to add this lock:\n");
+
+       if (lock->l_destroyed) {
+               CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+               return;
+       }
+
+       LASSERT(list_empty(&lock->l_res_link));
+
+       list_add_tail(&lock->l_res_link, head);
+}
+
+/**
+ * Insert a lock into resource after specified lock.
+ *
+ * Obtain resource description from the lock we are inserting after.
+ */
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+                                    struct ldlm_lock *new)
+{
+       struct ldlm_resource *res = original->l_resource;
+
+       check_res_locked(res);
+
+       ldlm_resource_dump(D_INFO, res);
+       LDLM_DEBUG(new, "About to insert this lock after %p:\n", original);
+
+       if (new->l_destroyed) {
+               CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+               goto out;
+       }
+
+       LASSERT(list_empty(&new->l_res_link));
+
+       list_add(&new->l_res_link, &original->l_res_link);
+ out:;
+}
+
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock)
+{
+       int type = lock->l_resource->lr_type;
+
+       check_res_locked(lock->l_resource);
+       if (type == LDLM_IBITS || type == LDLM_PLAIN)
+               ldlm_unlink_lock_skiplist(lock);
+       else if (type == LDLM_EXTENT)
+               ldlm_extent_unlink_lock(lock);
+       list_del_init(&lock->l_res_link);
+}
+EXPORT_SYMBOL(ldlm_resource_unlink_lock);
+
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc)
+{
+       desc->lr_type = res->lr_type;
+       desc->lr_name = res->lr_name;
+}
+
+/**
+ * Print information about all locks in all namespaces on this node to debug
+ * log.
+ */
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level)
+{
+       struct list_head *tmp;
+
+       if (!((libcfs_debug | D_ERROR) & level))
+               return;
+
+       mutex_lock(ldlm_namespace_lock(client));
+
+       list_for_each(tmp, ldlm_namespace_list(client)) {
+               struct ldlm_namespace *ns;
+               ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain);
+               ldlm_namespace_dump(level, ns);
+       }
+
+       mutex_unlock(ldlm_namespace_lock(client));
+}
+EXPORT_SYMBOL(ldlm_dump_all_namespaces);
+
+static int ldlm_res_hash_dump(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                             struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+       int    level = (int)(unsigned long)arg;
+
+       lock_res(res);
+       ldlm_resource_dump(level, res);
+       unlock_res(res);
+
+       return 0;
+}
+
+/**
+ * Print information about all locks in this namespace on this node to debug
+ * log.
+ */
+void ldlm_namespace_dump(int level, struct ldlm_namespace *ns)
+{
+       if (!((libcfs_debug | D_ERROR) & level))
+               return;
+
+       CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n",
+              ldlm_ns_name(ns), atomic_read(&ns->ns_bref),
+              ns_is_client(ns) ? "client" : "server");
+
+       if (cfs_time_before(cfs_time_current(), ns->ns_next_dump))
+               return;
+
+       cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                ldlm_res_hash_dump,
+                                (void *)(unsigned long)level);
+       spin_lock(&ns->ns_lock);
+       ns->ns_next_dump = cfs_time_shift(10);
+       spin_unlock(&ns->ns_lock);
+}
+EXPORT_SYMBOL(ldlm_namespace_dump);
+
+/**
+ * Print information about all locks in this resource to debug log.
+ */
+void ldlm_resource_dump(int level, struct ldlm_resource *res)
+{
+       struct ldlm_lock *lock;
+       unsigned int granted = 0;
+
+       CLASSERT(RES_NAME_SIZE == 4);
+
+       if (!((libcfs_debug | D_ERROR) & level))
+               return;
+
+       CDEBUG(level, "--- Resource: %p ("LPU64"/"LPU64"/"LPU64"/"LPU64
+              ") (rc: %d)\n", res, res->lr_name.name[0], res->lr_name.name[1],
+              res->lr_name.name[2], res->lr_name.name[3],
+              atomic_read(&res->lr_refcount));
+
+       if (!list_empty(&res->lr_granted)) {
+               CDEBUG(level, "Granted locks (in reverse order):\n");
+               list_for_each_entry_reverse(lock, &res->lr_granted,
+                                               l_res_link) {
+                       LDLM_DEBUG_LIMIT(level, lock, "###");
+                       if (!(level & D_CANTMASK) &&
+                           ++granted > ldlm_dump_granted_max) {
+                               CDEBUG(level, "only dump %d granted locks to "
+                                      "avoid DDOS.\n", granted);
+                               break;
+                       }
+               }
+       }
+       if (!list_empty(&res->lr_converting)) {
+               CDEBUG(level, "Converting locks:\n");
+               list_for_each_entry(lock, &res->lr_converting, l_res_link)
+                       LDLM_DEBUG_LIMIT(level, lock, "###");
+       }
+       if (!list_empty(&res->lr_waiting)) {
+               CDEBUG(level, "Waiting locks:\n");
+               list_for_each_entry(lock, &res->lr_waiting, l_res_link)
+                       LDLM_DEBUG_LIMIT(level, lock, "###");
+       }
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/Makefile b/drivers/staging/lustre/lustre/libcfs/Makefile
new file mode 100644 (file)
index 0000000..d64a3d0
--- /dev/null
@@ -0,0 +1,22 @@
+obj-$(CONFIG_LUSTRE_FS) += libcfs.o
+
+libcfs-linux-objs := linux-tracefile.o linux-debug.o
+libcfs-linux-objs += linux-prim.o linux-mem.o linux-cpu.o
+libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o
+libcfs-linux-objs += linux-proc.o linux-curproc.o
+libcfs-linux-objs += linux-utils.o linux-module.o
+libcfs-linux-objs += linux-crypto.o linux-crypto-crc32.o
+libcfs-linux-objs += linux-crypto-adler.o
+libcfs-linux-objs += linux-crypto-crc32pclmul.o
+
+libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs))
+
+libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \
+                  watchdog.o libcfs_string.o hash.o kernel_user_comm.o \
+                  prng.o workitem.o upcall_cache.o libcfs_cpu.o \
+                  libcfs_mem.o libcfs_lock.o crc32-pclmul_asm.o
+
+libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs)
+
+ccflags-y := -I$(src)/../include
+ccflags-y += -I$(src)/
diff --git a/drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S b/drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S
new file mode 100644 (file)
index 0000000..cfaf13f
--- /dev/null
@@ -0,0 +1,360 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors:     Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *           Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+/* gcc 4.1.2 does not support pclmulqdq instruction
+ * Use macro definitions from Linux kernel 2.6.38. */
+
+#define REG_NUM_INVALID        100
+       .macro R32_NUM opd r32
+       \opd = REG_NUM_INVALID
+       .ifc \r32,%eax
+       \opd = 0
+       .endif
+       .ifc \r32,%ecx
+       \opd = 1
+       .endif
+       .ifc \r32,%edx
+       \opd = 2
+       .endif
+       .ifc \r32,%ebx
+       \opd = 3
+       .endif
+       .ifc \r32,%esp
+       \opd = 4
+       .endif
+       .ifc \r32,%ebp
+       \opd = 5
+       .endif
+       .ifc \r32,%esi
+       \opd = 6
+       .endif
+       .ifc \r32,%edi
+       \opd = 7
+       .endif
+       .endm
+
+       .macro XMM_NUM opd xmm
+       \opd = REG_NUM_INVALID
+       .ifc \xmm,%xmm0
+       \opd = 0
+       .endif
+       .ifc \xmm,%xmm1
+       \opd = 1
+       .endif
+       .ifc \xmm,%xmm2
+       \opd = 2
+       .endif
+       .ifc \xmm,%xmm3
+       \opd = 3
+       .endif
+       .ifc \xmm,%xmm4
+       \opd = 4
+       .endif
+       .ifc \xmm,%xmm5
+       \opd = 5
+       .endif
+       .ifc \xmm,%xmm6
+       \opd = 6
+       .endif
+       .ifc \xmm,%xmm7
+       \opd = 7
+       .endif
+       .ifc \xmm,%xmm8
+       \opd = 8
+       .endif
+       .ifc \xmm,%xmm9
+       \opd = 9
+       .endif
+       .ifc \xmm,%xmm10
+       \opd = 10
+       .endif
+       .ifc \xmm,%xmm11
+       \opd = 11
+       .endif
+       .ifc \xmm,%xmm12
+       \opd = 12
+       .endif
+       .ifc \xmm,%xmm13
+       \opd = 13
+       .endif
+       .ifc \xmm,%xmm14
+       \opd = 14
+       .endif
+       .ifc \xmm,%xmm15
+       \opd = 15
+       .endif
+       .endm
+
+       .macro PFX_OPD_SIZE
+       .byte 0x66
+       .endm
+
+       .macro PFX_REX opd1 opd2 W=0
+       .if ((\opd1 | \opd2) & 8) || \W
+       .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
+       .endif
+       .endm
+
+       .macro MODRM mod opd1 opd2
+       .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+       .endm
+
+       .macro PCLMULQDQ imm8 xmm1 xmm2
+       XMM_NUM clmul_opd1 \xmm1
+       XMM_NUM clmul_opd2 \xmm2
+       PFX_OPD_SIZE
+       PFX_REX clmul_opd1 clmul_opd2
+       .byte 0x0f, 0x3a, 0x44
+       MODRM 0xc0 clmul_opd1 clmul_opd2
+       .byte \imm8
+       .endm
+
+       .macro PEXTRD imm8 xmm1 reg1
+       XMM_NUM extrd_opd2 \xmm1
+       R32_NUM extrd_opd1 \reg1
+       PFX_OPD_SIZE
+       PFX_REX extrd_opd1 extrd_opd2
+       .byte 0x0f, 0x3a, 0x16
+       MODRM 0xc0 extrd_opd1 extrd_opd2
+       .byte \imm8
+       .endm
+
+.align 16
+/*
+ * [(x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
+ * #define CONSTANT_R1  0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
+ * #define CONSTANT_R2  0x1c6e41596LL
+ */
+.Lconstant_R2R1:
+       .octa 0x00000001c6e415960000000154442bd4
+/*
+ * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
+ * #define CONSTANT_R3  0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
+ * #define CONSTANT_R4  0x0ccaa009eLL
+ */
+.Lconstant_R4R3:
+       .octa 0x00000000ccaa009e00000001751997d0
+/*
+ * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
+ * #define CONSTANT_R5  0x163cd6124LL
+ */
+.Lconstant_R5:
+       .octa 0x00000000000000000000000163cd6124
+.Lconstant_mask32:
+       .octa 0x000000000000000000000000FFFFFFFF
+/*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
+ * #define CONSTANT_RU  0x1F7011641LL
+ */
+.Lconstant_RUpoly:
+       .octa 0x00000001F701164100000001DB710641
+
+#define CONSTANT %xmm0
+
+#ifdef __x86_64__
+#define BUF     %rdi
+#define LEN     %rsi
+#define CRC     %edx
+#else
+#define BUF     %eax
+#define LEN     %edx
+#define CRC     %ecx
+#endif
+
+
+
+.text
+/**
+ *      Calculate crc32
+ *      BUF - buffer (16 bytes aligned)
+ *      LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
+ *      CRC - initial crc32
+ *      return %eax crc32
+ *      uint crc32_pclmul_le_16(unsigned char const *buffer,
+ *                          size_t len, uint crc32)
+ */
+.globl crc32_pclmul_le_16
+.align 4, 0x90
+crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
+       movdqa  (BUF), %xmm1
+       movdqa  0x10(BUF), %xmm2
+       movdqa  0x20(BUF), %xmm3
+       movdqa  0x30(BUF), %xmm4
+       movd    CRC, CONSTANT
+       pxor    CONSTANT, %xmm1
+       sub     $0x40, LEN
+       add     $0x40, BUF
+#ifndef __x86_64__
+       /* This is for position-independent code (-fPIC) support on 32-bit */
+       call    delta
+delta:
+       pop     %ecx
+#endif
+       cmp     $0x40, LEN
+       jb      less_64
+
+#ifdef __x86_64__
+       movdqa .Lconstant_R2R1(%rip), CONSTANT
+#else
+       movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
+#endif
+
+loop_64:/*  64 bytes Full cache line folding */
+       prefetchnta    0x40(BUF)
+       movdqa  %xmm1, %xmm5
+       movdqa  %xmm2, %xmm6
+       movdqa  %xmm3, %xmm7
+#ifdef __x86_64__
+       movdqa  %xmm4, %xmm8
+#endif
+       PCLMULQDQ 00, CONSTANT, %xmm1
+       PCLMULQDQ 00, CONSTANT, %xmm2
+       PCLMULQDQ 00, CONSTANT, %xmm3
+#ifdef __x86_64__
+       PCLMULQDQ 00, CONSTANT, %xmm4
+#endif
+       PCLMULQDQ 0x11, CONSTANT, %xmm5
+       PCLMULQDQ 0x11, CONSTANT, %xmm6
+       PCLMULQDQ 0x11, CONSTANT, %xmm7
+#ifdef __x86_64__
+       PCLMULQDQ 0x11, CONSTANT, %xmm8
+#endif
+       pxor    %xmm5, %xmm1
+       pxor    %xmm6, %xmm2
+       pxor    %xmm7, %xmm3
+#ifdef __x86_64__
+       pxor    %xmm8, %xmm4
+#else
+       /* xmm8 unsupported for x32 */
+       movdqa  %xmm4, %xmm5
+       PCLMULQDQ 00, CONSTANT, %xmm4
+       PCLMULQDQ 0x11, CONSTANT, %xmm5
+       pxor    %xmm5, %xmm4
+#endif
+
+       pxor    (BUF), %xmm1
+       pxor    0x10(BUF), %xmm2
+       pxor    0x20(BUF), %xmm3
+       pxor    0x30(BUF), %xmm4
+
+       sub     $0x40, LEN
+       add     $0x40, BUF
+       cmp     $0x40, LEN
+       jge     loop_64
+less_64:/*  Folding cache line into 128bit */
+#ifdef __x86_64__
+       movdqa  .Lconstant_R4R3(%rip), CONSTANT
+#else
+       movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
+#endif
+       prefetchnta     (BUF)
+
+       movdqa  %xmm1, %xmm5
+       PCLMULQDQ 0x00, CONSTANT, %xmm1
+       PCLMULQDQ 0x11, CONSTANT, %xmm5
+       pxor    %xmm5, %xmm1
+       pxor    %xmm2, %xmm1
+
+       movdqa  %xmm1, %xmm5
+       PCLMULQDQ 0x00, CONSTANT, %xmm1
+       PCLMULQDQ 0x11, CONSTANT, %xmm5
+       pxor    %xmm5, %xmm1
+       pxor    %xmm3, %xmm1
+
+       movdqa  %xmm1, %xmm5
+       PCLMULQDQ 0x00, CONSTANT, %xmm1
+       PCLMULQDQ 0x11, CONSTANT, %xmm5
+       pxor    %xmm5, %xmm1
+       pxor    %xmm4, %xmm1
+
+       cmp     $0x10, LEN
+       jb      fold_64
+loop_16:/* Folding rest buffer into 128bit */
+       movdqa  %xmm1, %xmm5
+       PCLMULQDQ 0x00, CONSTANT, %xmm1
+       PCLMULQDQ 0x11, CONSTANT, %xmm5
+       pxor    %xmm5, %xmm1
+       pxor    (BUF), %xmm1
+       sub     $0x10, LEN
+       add     $0x10, BUF
+       cmp     $0x10, LEN
+       jge     loop_16
+
+fold_64:
+       /* perform the last 64 bit fold, also adds 32 zeroes
+        * to the input stream */
+       PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
+       psrldq  $0x08, %xmm1
+       pxor    CONSTANT, %xmm1
+
+       /* final 32-bit fold */
+       movdqa  %xmm1, %xmm2
+#ifdef __x86_64__
+       movdqa  .Lconstant_R5(%rip), CONSTANT
+       movdqa  .Lconstant_mask32(%rip), %xmm3
+#else
+       movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
+       movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
+#endif
+       psrldq  $0x04, %xmm2
+       pand    %xmm3, %xmm1
+       PCLMULQDQ 0x00, CONSTANT, %xmm1
+       pxor    %xmm2, %xmm1
+
+       /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+#ifdef __x86_64__
+       movdqa  .Lconstant_RUpoly(%rip), CONSTANT
+#else
+       movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
+#endif
+       movdqa  %xmm1, %xmm2
+       pand    %xmm3, %xmm1
+       PCLMULQDQ 0x10, CONSTANT, %xmm1
+       pand    %xmm3, %xmm1
+       PCLMULQDQ 0x00, CONSTANT, %xmm1
+       pxor    %xmm2, %xmm1
+       PEXTRD  0x01, %xmm1, %eax
+
+       ret
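+
+/*
+ * Illustrative C caller (a sketch, not part of this file): the PCLMUL
+ * kernel above only handles 16-byte-aligned buffers whose length is a
+ * multiple of 16 and greater than 63, so a wrapper typically folds the
+ * bulk here and finishes the remainder with the generic crc32_le() from
+ * <linux/crc32.h>, with the SSE register state protected by
+ * kernel_fpu_begin()/kernel_fpu_end().  Assuming buf is 16-byte aligned
+ * and len >= 64:
+ *
+ *     aligned = len & ~15U;     largest multiple of 16 not above len
+ *     kernel_fpu_begin();
+ *     crc = crc32_pclmul_le_16(buf, aligned, crc);
+ *     kernel_fpu_end();
+ *     if (len > aligned)
+ *             crc = crc32_le(crc, buf + aligned, len - aligned);
+ */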
diff --git a/drivers/staging/lustre/lustre/libcfs/debug.c b/drivers/staging/lustre/lustre/libcfs/debug.c
new file mode 100644 (file)
index 0000000..5a87b08
--- /dev/null
@@ -0,0 +1,476 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+static char debug_file_name[1024];
+
+unsigned int libcfs_subsystem_debug = ~0;
+CFS_MODULE_PARM(libcfs_subsystem_debug, "i", int, 0644,
+               "Lustre kernel debug subsystem mask");
+EXPORT_SYMBOL(libcfs_subsystem_debug);
+
+unsigned int libcfs_debug = (D_CANTMASK |
+                            D_NETERROR | D_HA | D_CONFIG | D_IOCTL);
+CFS_MODULE_PARM(libcfs_debug, "i", int, 0644,
+               "Lustre kernel debug mask");
+EXPORT_SYMBOL(libcfs_debug);
+
+unsigned int libcfs_debug_mb = 0;
+CFS_MODULE_PARM(libcfs_debug_mb, "i", uint, 0644,
+               "Total debug buffer size.");
+EXPORT_SYMBOL(libcfs_debug_mb);
+
+unsigned int libcfs_printk = D_CANTMASK;
+CFS_MODULE_PARM(libcfs_printk, "i", uint, 0644,
+               "Lustre kernel debug console mask");
+EXPORT_SYMBOL(libcfs_printk);
+
+unsigned int libcfs_console_ratelimit = 1;
+CFS_MODULE_PARM(libcfs_console_ratelimit, "i", uint, 0644,
+               "Lustre kernel debug console ratelimit (0 to disable)");
+EXPORT_SYMBOL(libcfs_console_ratelimit);
+
+unsigned int libcfs_console_max_delay;
+CFS_MODULE_PARM(libcfs_console_max_delay, "l", uint, 0644,
+               "Lustre kernel debug console max delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_max_delay);
+
+unsigned int libcfs_console_min_delay;
+CFS_MODULE_PARM(libcfs_console_min_delay, "l", uint, 0644,
+               "Lustre kernel debug console min delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_min_delay);
+
+unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF;
+CFS_MODULE_PARM(libcfs_console_backoff, "i", uint, 0644,
+               "Lustre kernel debug console backoff factor");
+EXPORT_SYMBOL(libcfs_console_backoff);
+
+unsigned int libcfs_debug_binary = 1;
+EXPORT_SYMBOL(libcfs_debug_binary);
+
+unsigned int libcfs_stack = 3 * THREAD_SIZE / 4;
+EXPORT_SYMBOL(libcfs_stack);
+
+unsigned int portal_enter_debugger;
+EXPORT_SYMBOL(portal_enter_debugger);
+
+unsigned int libcfs_catastrophe;
+EXPORT_SYMBOL(libcfs_catastrophe);
+
+unsigned int libcfs_watchdog_ratelimit = 300;
+EXPORT_SYMBOL(libcfs_watchdog_ratelimit);
+
+unsigned int libcfs_panic_on_lbug = 1;
+CFS_MODULE_PARM(libcfs_panic_on_lbug, "i", uint, 0644,
+               "Lustre kernel panic on LBUG");
+EXPORT_SYMBOL(libcfs_panic_on_lbug);
+
+atomic_t libcfs_kmemory = ATOMIC_INIT(0);
+EXPORT_SYMBOL(libcfs_kmemory);
+
+static wait_queue_head_t debug_ctlwq;
+
+char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT;
+
+/* We need to pass a pointer here, but elsewhere this must be a const */
+char *libcfs_debug_file_path;
+CFS_MODULE_PARM(libcfs_debug_file_path, "s", charp, 0644,
+               "Path for dumping debug logs, "
+               "set 'NONE' to prevent log dumping");
+
+int libcfs_panic_in_progress;
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+const char *
+libcfs_debug_subsys2str(int subsys)
+{
+       switch (1 << subsys) {
+       default:
+               return NULL;
+       case S_UNDEFINED:
+               return "undefined";
+       case S_MDC:
+               return "mdc";
+       case S_MDS:
+               return "mds";
+       case S_OSC:
+               return "osc";
+       case S_OST:
+               return "ost";
+       case S_CLASS:
+               return "class";
+       case S_LOG:
+               return "log";
+       case S_LLITE:
+               return "llite";
+       case S_RPC:
+               return "rpc";
+       case S_LNET:
+               return "lnet";
+       case S_LND:
+               return "lnd";
+       case S_PINGER:
+               return "pinger";
+       case S_FILTER:
+               return "filter";
+       case S_ECHO:
+               return "echo";
+       case S_LDLM:
+               return "ldlm";
+       case S_LOV:
+               return "lov";
+       case S_LQUOTA:
+               return "lquota";
+       case S_OSD:
+               return "osd";
+       case S_LMV:
+               return "lmv";
+       case S_SEC:
+               return "sec";
+       case S_GSS:
+               return "gss";
+       case S_MGC:
+               return "mgc";
+       case S_MGS:
+               return "mgs";
+       case S_FID:
+               return "fid";
+       case S_FLD:
+               return "fld";
+       }
+}
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+const char *
+libcfs_debug_dbg2str(int debug)
+{
+       switch (1 << debug) {
+       default:
+               return NULL;
+       case D_TRACE:
+               return "trace";
+       case D_INODE:
+               return "inode";
+       case D_SUPER:
+               return "super";
+       case D_EXT2:
+               return "ext2";
+       case D_MALLOC:
+               return "malloc";
+       case D_CACHE:
+               return "cache";
+       case D_INFO:
+               return "info";
+       case D_IOCTL:
+               return "ioctl";
+       case D_NETERROR:
+               return "neterror";
+       case D_NET:
+               return "net";
+       case D_WARNING:
+               return "warning";
+       case D_BUFFS:
+               return "buffs";
+       case D_OTHER:
+               return "other";
+       case D_DENTRY:
+               return "dentry";
+       case D_NETTRACE:
+               return "nettrace";
+       case D_PAGE:
+               return "page";
+       case D_DLMTRACE:
+               return "dlmtrace";
+       case D_ERROR:
+               return "error";
+       case D_EMERG:
+               return "emerg";
+       case D_HA:
+               return "ha";
+       case D_RPCTRACE:
+               return "rpctrace";
+       case D_VFSTRACE:
+               return "vfstrace";
+       case D_READA:
+               return "reada";
+       case D_MMAP:
+               return "mmap";
+       case D_CONFIG:
+               return "config";
+       case D_CONSOLE:
+               return "console";
+       case D_QUOTA:
+               return "quota";
+       case D_SEC:
+               return "sec";
+       case D_LFSCK:
+               return "lfsck";
+       }
+}
+
+int
+libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys)
+{
+       const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+                                                libcfs_debug_dbg2str;
+       int        len = 0;
+       const char   *token;
+       int        i;
+
+       if (mask == 0) {                        /* "0" */
+               if (size > 0)
+                       str[0] = '0';
+               len = 1;
+       } else {                                /* space-separated tokens */
+               for (i = 0; i < 32; i++) {
+                       if ((mask & (1 << i)) == 0)
+                               continue;
+
+                       token = fn(i);
+                       if (token == NULL)            /* unused bit */
+                               continue;
+
+                       if (len > 0) {            /* separator? */
+                               if (len < size)
+                                       str[len] = ' ';
+                               len++;
+                       }
+
+                       while (*token != 0) {
+                               if (len < size)
+                                       str[len] = *token;
+                               token++;
+                               len++;
+                       }
+               }
+       }
+
+       /* terminate 'str' */
+       if (len < size)
+               str[len] = 0;
+       else
+               str[size - 1] = 0;
+
+       return len;
+}
+
+int
+libcfs_debug_str2mask(int *mask, const char *str, int is_subsys)
+{
+       const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+                                                libcfs_debug_dbg2str;
+       int      m = 0;
+       int      matched;
+       int      n;
+       int      t;
+
+       /* Allow a number for backwards compatibility */
+
+       for (n = strlen(str); n > 0; n--)
+               if (!isspace(str[n-1]))
+                       break;
+       matched = n;
+
+       if ((t = sscanf(str, "%i%n", &m, &matched)) >= 1 &&
+           matched == n) {
+               /* don't print warning for lctl set_param debug=0 or -1 */
+               if (m != 0 && m != -1)
+                       CWARN("You are trying to use a numerical value for the "
+                             "mask - this will be deprecated in a future "
+                             "release.\n");
+               *mask = m;
+               return 0;
+       }
+
+       return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK,
+                           0xffffffff);
+}
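+
+/*
+ * Example (a sketch, not from this patch): round-tripping a mask through
+ * the two helpers above.  "buf" and "mask" are illustrative locals.
+ *
+ *     char buf[64];
+ *     int  mask;
+ *
+ *     if (libcfs_debug_str2mask(&mask, "neterror net", 0) == 0)
+ *             libcfs_debug_mask2str(buf, sizeof(buf), mask, 0);
+ *
+ * After this, mask has at least D_NETERROR and D_NET set (the D_CANTMASK
+ * bits passed as the minimum mask typically remain set as well), and buf
+ * holds the mask rendered back as space-separated tokens.
+ */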
+
+/**
+ * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages()
+ */
+void libcfs_debug_dumplog_internal(void *arg)
+{
+       DECL_JOURNAL_DATA;
+
+       PUSH_JOURNAL;
+
+       if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) {
+               snprintf(debug_file_name, sizeof(debug_file_name) - 1,
+                        "%s.%ld." LPLD, libcfs_debug_file_path_arr,
+                        cfs_time_current_sec(), (long_ptr_t)arg);
+               printk(KERN_ALERT "LustreError: dumping log to %s\n",
+                      debug_file_name);
+               cfs_tracefile_dump_all_pages(debug_file_name);
+               libcfs_run_debug_log_upcall(debug_file_name);
+       }
+       POP_JOURNAL;
+}
+
+int libcfs_debug_dumplog_thread(void *arg)
+{
+       libcfs_debug_dumplog_internal(arg);
+       wake_up(&debug_ctlwq);
+       return 0;
+}
+
+void libcfs_debug_dumplog(void)
+{
+       wait_queue_t wait;
+       task_t    *dumper;
+       ENTRY;
+
+       /* we're being careful to ensure that the kernel thread is
+        * able to set our state to running as it exits before we
+        * get to schedule() */
+       init_waitqueue_entry_current(&wait);
+       set_current_state(TASK_INTERRUPTIBLE);
+       add_wait_queue(&debug_ctlwq, &wait);
+
+       dumper = kthread_run(libcfs_debug_dumplog_thread,
+                            (void *)(long)current_pid(),
+                            "libcfs_debug_dumper");
+       if (IS_ERR(dumper))
+               printk(KERN_ERR "LustreError: cannot start log dump thread:"
+                      " %ld\n", PTR_ERR(dumper));
+       else
+               waitq_wait(&wait, TASK_INTERRUPTIBLE);
+
+       /* be sure to tear down if kthread_run() failed */
+       remove_wait_queue(&debug_ctlwq, &wait);
+       set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(libcfs_debug_dumplog);
+
+int libcfs_debug_init(unsigned long bufsize)
+{
+       int    rc = 0;
+       unsigned int max = libcfs_debug_mb;
+
+       init_waitqueue_head(&debug_ctlwq);
+
+       if (libcfs_console_max_delay <= 0 || /* not set by user or */
+           libcfs_console_min_delay <= 0 || /* set to invalid values */
+           libcfs_console_min_delay >= libcfs_console_max_delay) {
+               libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY;
+               libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY;
+       }
+
+       if (libcfs_debug_file_path != NULL) {
+               memset(libcfs_debug_file_path_arr, 0, PATH_MAX);
+               strncpy(libcfs_debug_file_path_arr,
+                       libcfs_debug_file_path, PATH_MAX-1);
+       }
+
+       /* If libcfs_debug_mb is left uninitialized or set to an invalid
+        * value, just make the total buffer num_possible_cpus() *
+        * TCD_MAX_PAGES pages. */
+       if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) {
+               max = TCD_MAX_PAGES;
+       } else {
+               max = (max / num_possible_cpus());
+               max = (max << (20 - PAGE_CACHE_SHIFT));
+       }
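+       /* Worked example (assuming 4 KiB pages, i.e. PAGE_CACHE_SHIFT == 12):
+        * libcfs_debug_mb = 256 on an 8-CPU node gives
+        * max = (256 / 8) << (20 - 12) = 32 << 8 = 8192 pages per CPU,
+        * i.e. 32 MB of trace buffer per CPU. */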
+       rc = cfs_tracefile_init(max);
+
+       if (rc == 0)
+               libcfs_register_panic_notifier();
+
+       return rc;
+}
+
+int libcfs_debug_cleanup(void)
+{
+       libcfs_unregister_panic_notifier();
+       cfs_tracefile_exit();
+       return 0;
+}
+
+int libcfs_debug_clear_buffer(void)
+{
+       cfs_trace_flush_pages();
+       return 0;
+}
+
+/* Debug markers, although printed by S_LNET,
+ * should not be marked as such. */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+int libcfs_debug_mark_buffer(const char *text)
+{
+       CDEBUG(D_TRACE,"***************************************************\n");
+       LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text);
+       CDEBUG(D_TRACE,"***************************************************\n");
+
+       return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_LNET
+
+void libcfs_debug_set_level(unsigned int debug_level)
+{
+       printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n",
+              debug_level);
+       libcfs_debug = debug_level;
+}
+
+EXPORT_SYMBOL(libcfs_debug_set_level);
+
+long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc)
+{
+       libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n",
+                        rc, rc, rc);
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_log_return);
+
+void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata, const char *label,
+                    long_ptr_t rc)
+{
+       libcfs_debug_msg(msgdata, "Process leaving via %s (rc=" LPLU " : " LPLD
+                        " : " LPLX ")\n", label, (ulong_ptr_t)rc, rc, rc);
+}
+EXPORT_SYMBOL(libcfs_log_goto);
diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c
new file mode 100644 (file)
index 0000000..c54448d
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+unsigned long cfs_fail_loc = 0;
+unsigned int cfs_fail_val = 0;
+wait_queue_head_t cfs_race_waitq;
+int cfs_race_state;
+
+EXPORT_SYMBOL(cfs_fail_loc);
+EXPORT_SYMBOL(cfs_fail_val);
+EXPORT_SYMBOL(cfs_race_waitq);
+EXPORT_SYMBOL(cfs_race_state);
+
+int __cfs_fail_check_set(__u32 id, __u32 value, int set)
+{
+       static atomic_t cfs_fail_count = ATOMIC_INIT(0);
+
+       LASSERT(!(id & CFS_FAIL_ONCE));
+
+       if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) ==
+           (CFS_FAILED | CFS_FAIL_ONCE)) {
+               atomic_set(&cfs_fail_count, 0); /* paranoia */
+               return 0;
+       }
+
+       /* Fail 1/cfs_fail_val times */
+       if (cfs_fail_loc & CFS_FAIL_RAND) {
+               if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0)
+                       return 0;
+       }
+
+       /* Skip the first cfs_fail_val checks, then fail */
+       if (cfs_fail_loc & CFS_FAIL_SKIP) {
+               if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val)
+                       return 0;
+       }
+
+       /* check cfs_fail_val... */
+       if (set == CFS_FAIL_LOC_VALUE) {
+               if (cfs_fail_val != -1 && cfs_fail_val != value)
+                       return 0;
+       }
+
+       /* Fail cfs_fail_val times, overridden by FAIL_ONCE */
+       if (cfs_fail_loc & CFS_FAIL_SOME &&
+           (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) {
+               int count = atomic_inc_return(&cfs_fail_count);
+
+               if (count >= cfs_fail_val) {
+                       set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+                       atomic_set(&cfs_fail_count, 0);
+                       /* we lost the race to increment the counter */
+                       if (count > cfs_fail_val)
+                               return 0;
+               }
+       }
+
+       if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) &&
+           (value & CFS_FAIL_ONCE))
+               set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+       /* Lost race to set CFS_FAILED_BIT. */
+       if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) {
+               /* If CFS_FAIL_ONCE is set, only one process can fail,
+                * otherwise multiple processes can fail at the same time. */
+               if (cfs_fail_loc & CFS_FAIL_ONCE)
+                       return 0;
+       }
+
+       switch (set) {
+               case CFS_FAIL_LOC_NOSET:
+               case CFS_FAIL_LOC_VALUE:
+                       break;
+               case CFS_FAIL_LOC_ORSET:
+                       cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE);
+                       break;
+               case CFS_FAIL_LOC_RESET:
+                       cfs_fail_loc = value;
+                       break;
+               default:
+                       LASSERTF(0, "called with bad set %u\n", set);
+                       break;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(__cfs_fail_check_set);
+
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+       int ret = 0;
+
+       ret = __cfs_fail_check_set(id, value, set);
+       if (ret) {
+               CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
+                      id, ms);
+               schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+                                                  cfs_time_seconds(ms) / 1000);
+               set_current_state(TASK_RUNNING);
+               CERROR("cfs_fail_timeout id %x awake\n", id);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(__cfs_fail_timeout_set);
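+
+/*
+ * Usage sketch (illustrative, not from this patch): fault-injection sites
+ * normally go through the CFS_FAIL_CHECK()/CFS_FAIL_TIMEOUT() wrappers in
+ * libcfs_fail.h, which first match cfs_fail_loc against the site id and
+ * then call the helpers above.  For a hypothetical site id 0x1234, a
+ * "fail once" injection looks roughly like this:
+ *
+ *     from user space:   cfs_fail_loc = 0x1234 | CFS_FAIL_ONCE;
+ *
+ *     at the site, once the id has been matched against cfs_fail_loc:
+ *             if (__cfs_fail_check_set(0x1234, 0, CFS_FAIL_LOC_NOSET))
+ *                     return -EIO;
+ */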
diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c
new file mode 100644 (file)
index 0000000..231c678
--- /dev/null
@@ -0,0 +1,2135 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/hash.c
+ *
+ * Implement a hash class for hash processing in the Lustre system.
+ *
+ * Author: YuZhangyong <yzy@clusterfs.com>
+ *
+ * 2008-08-15: Brian Behlendorf <behlendorf1@llnl.gov>
+ * - Simplified API and improved documentation
+ * - Added per-hash feature flags:
+ *   * CFS_HASH_DEBUG additional validation
+ *   * CFS_HASH_REHASH dynamic rehashing
+ * - Added per-hash statistics
+ * - General performance enhancements
+ *
+ * 2009-07-31: Liang Zhen <zhen.liang@sun.com>
+ * - move all stuff to libcfs
+ * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH
+ * - ignore hs_rwlock if without CFS_HASH_REHASH setting
+ * - buckets are allocated one by one (instead of as contiguous memory),
+ *   to avoid unnecessary cacheline conflicts
+ *
+ * 2010-03-01: Liang Zhen <zhen.liang@sun.com>
+ * - "bucket" is a group of hlist_head now, user can speicify bucket size
+ *   by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share
+ *   one lock for reducing memory overhead.
+ *
+ * - support lockless hash, caller will take care of locks:
+ *   avoid lock overhead for hash tables that are already protected
+ *   by locking in the caller for another reason
+ *
+ * - support both spin_lock/rwlock for bucket:
+ *   overhead of spinlock contention is lower than read/write
+ *   contention of rwlock, so using spinlock to serialize operations on
+ *   bucket is more reasonable for those frequently changed hash tables
+ *
+ * - support one-single lock mode:
+ *   one lock to protect all hash operations to avoid overhead of
+ *   multiple locks if hash table is always small
+ *
+ * - removed a lot of unnecessary addref & decref on hash elements:
+ *   addref & decref are atomic operations, which are expensive in
+ *   many use-cases.
+ *
+ * - support non-blocking cfs_hash_add() and cfs_hash_findadd():
+ *   some Lustre use-cases require these functions to be strictly
+ *   non-blocking; in those cases we need to schedule the required
+ *   rehash on a different thread.
+ *
+ * - safer rehash on large hash tables
+ *   In the old implementation, the rehash function would exclusively
+ *   lock the hash table and finish the rehash in one batch, which is
+ *   dangerous on SMP systems because rehashing millions of elements
+ *   can take a long time.  The new rehash implementation can release
+ *   the lock and relax the CPU in the middle of a rehash; it is safe
+ *   for another thread to search or modify the hash table even while
+ *   it is being rehashed.
+ *
+ * - support two different refcount modes
+ *   . hash table has refcount on element
+ *   . hash table doesn't change refcount on adding/removing element
+ *
+ * - support long name hash table (for param-tree)
+ *
+ * - fix a bug in cfs_hash_rehash_key:
+ *   in the old implementation, cfs_hash_rehash_key could corrupt the
+ *   hash table because @key was overwritten without any protection.
+ *   Now the user must define hs_keycpy for rehash-enabled hash tables;
+ *   cfs_hash_rehash_key overwrites the hash key under the lock by
+ *   calling hs_keycpy.
+ *
+ * - better hash iteration:
+ *   Now we support both locked and lockless iteration of the hash
+ *   table.  Also, the user can break the iteration by returning 1
+ *   from the callback.
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static unsigned int warn_on_depth = 8;
+CFS_MODULE_PARM(warn_on_depth, "i", uint, 0644,
+               "warning when hash depth is high.");
+#endif
+
+struct cfs_wi_sched *cfs_sched_rehash;
+
+static inline void
+cfs_hash_nl_lock(cfs_hash_lock_t *lock, int exclusive) {}
+
+static inline void
+cfs_hash_nl_unlock(cfs_hash_lock_t *lock, int exclusive) {}
+
+static inline void
+cfs_hash_spin_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+       spin_lock(&lock->spin);
+}
+
+static inline void
+cfs_hash_spin_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+       spin_unlock(&lock->spin);
+}
+
+static inline void
+cfs_hash_rw_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+       if (!exclusive)
+               read_lock(&lock->rw);
+       else
+               write_lock(&lock->rw);
+}
+
+static inline void
+cfs_hash_rw_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+       if (!exclusive)
+               read_unlock(&lock->rw);
+       else
+               write_unlock(&lock->rw);
+}
+
+/** No lock hash */
+static cfs_hash_lock_ops_t cfs_hash_nl_lops =
+{
+       .hs_lock        = cfs_hash_nl_lock,
+       .hs_unlock      = cfs_hash_nl_unlock,
+       .hs_bkt_lock    = cfs_hash_nl_lock,
+       .hs_bkt_unlock  = cfs_hash_nl_unlock,
+};
+
+/** no bucket lock, one spinlock to protect everything */
+static cfs_hash_lock_ops_t cfs_hash_nbl_lops =
+{
+       .hs_lock        = cfs_hash_spin_lock,
+       .hs_unlock      = cfs_hash_spin_unlock,
+       .hs_bkt_lock    = cfs_hash_nl_lock,
+       .hs_bkt_unlock  = cfs_hash_nl_unlock,
+};
+
+/** spin bucket lock, rehash is enabled */
+static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops =
+{
+       .hs_lock        = cfs_hash_rw_lock,
+       .hs_unlock      = cfs_hash_rw_unlock,
+       .hs_bkt_lock    = cfs_hash_spin_lock,
+       .hs_bkt_unlock  = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is enabled */
+static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops =
+{
+       .hs_lock        = cfs_hash_rw_lock,
+       .hs_unlock      = cfs_hash_rw_unlock,
+       .hs_bkt_lock    = cfs_hash_rw_lock,
+       .hs_bkt_unlock  = cfs_hash_rw_unlock,
+};
+
+/** spin bucket lock, rehash is disabled */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops =
+{
+       .hs_lock        = cfs_hash_nl_lock,
+       .hs_unlock      = cfs_hash_nl_unlock,
+       .hs_bkt_lock    = cfs_hash_spin_lock,
+       .hs_bkt_unlock  = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is disabled */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops =
+{
+       .hs_lock        = cfs_hash_nl_lock,
+       .hs_unlock      = cfs_hash_nl_unlock,
+       .hs_bkt_lock    = cfs_hash_rw_lock,
+       .hs_bkt_unlock  = cfs_hash_rw_unlock,
+};
+
+static void
+cfs_hash_lock_setup(cfs_hash_t *hs)
+{
+       if (cfs_hash_with_no_lock(hs)) {
+               hs->hs_lops = &cfs_hash_nl_lops;
+
+       } else if (cfs_hash_with_no_bktlock(hs)) {
+               hs->hs_lops = &cfs_hash_nbl_lops;
+               spin_lock_init(&hs->hs_lock.spin);
+
+       } else if (cfs_hash_with_rehash(hs)) {
+               rwlock_init(&hs->hs_lock.rw);
+
+               if (cfs_hash_with_rw_bktlock(hs))
+                       hs->hs_lops = &cfs_hash_bkt_rw_lops;
+               else if (cfs_hash_with_spin_bktlock(hs))
+                       hs->hs_lops = &cfs_hash_bkt_spin_lops;
+               else
+                       LBUG();
+       } else {
+               if (cfs_hash_with_rw_bktlock(hs))
+                       hs->hs_lops = &cfs_hash_nr_bkt_rw_lops;
+               else if (cfs_hash_with_spin_bktlock(hs))
+                       hs->hs_lops = &cfs_hash_nr_bkt_spin_lops;
+               else
+                       LBUG();
+       }
+}
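+
+/*
+ * For example, a hash for which cfs_hash_with_rehash() and
+ * cfs_hash_with_spin_bktlock() are both true gets cfs_hash_bkt_spin_lops:
+ * the table-wide rwlock (hs_lock) serializes rehashing against normal
+ * operations, while each bucket takes its own spinlock (hs_bkt_lock) for
+ * add/remove/lookup within that bucket.
+ */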
+
+/**
+ * Simple hash head without depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+       struct hlist_head       hh_head;        /**< entries list */
+} cfs_hash_head_t;
+
+static int
+cfs_hash_hh_hhead_size(cfs_hash_t *hs)
+{
+       return sizeof(cfs_hash_head_t);
+}
+
+static struct hlist_head *
+cfs_hash_hh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       cfs_hash_head_t *head = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0];
+
+       return &head[bd->bd_offset].hh_head;
+}
+
+static int
+cfs_hash_hh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd));
+       return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_hh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       hlist_del_init(hnode);
+       return -1; /* unknown depth */
+}
+
+/**
+ * Simple hash head with depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+       struct hlist_head       hd_head;        /**< entries list */
+       unsigned int        hd_depth;       /**< list length */
+} cfs_hash_head_dep_t;
+
+static int
+cfs_hash_hd_hhead_size(cfs_hash_t *hs)
+{
+       return sizeof(cfs_hash_head_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_hd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       cfs_hash_head_dep_t   *head;
+
+       head = (cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0];
+       return &head[bd->bd_offset].hd_head;
+}
+
+static int
+cfs_hash_hd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+                                              cfs_hash_head_dep_t, hd_head);
+       hlist_add_head(hnode, &hh->hd_head);
+       return ++hh->hd_depth;
+}
+
+static int
+cfs_hash_hd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+                                              cfs_hash_head_dep_t, hd_head);
+       hlist_del_init(hnode);
+       return --hh->hd_depth;
+}
+
+/**
+ * Double-ended hash head (also tracks the hlist tail) without depth
+ * tracking; new elements are always added to the tail of the hlist.
+ */
+typedef struct {
+       struct hlist_head       dh_head;        /**< entries list */
+       struct hlist_node       *dh_tail;       /**< the last entry */
+} cfs_hash_dhead_t;
+
+static int
+cfs_hash_dh_hhead_size(cfs_hash_t *hs)
+{
+       return sizeof(cfs_hash_dhead_t);
+}
+
+static struct hlist_head *
+cfs_hash_dh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       cfs_hash_dhead_t *head;
+
+       head = (cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0];
+       return &head[bd->bd_offset].dh_head;
+}
+
+static int
+cfs_hash_dh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+                                           cfs_hash_dhead_t, dh_head);
+
+       if (dh->dh_tail != NULL) /* not empty */
+               hlist_add_after(dh->dh_tail, hnode);
+       else /* empty list */
+               hlist_add_head(hnode, &dh->dh_head);
+       dh->dh_tail = hnode;
+       return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_dh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnd)
+{
+       cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+                                           cfs_hash_dhead_t, dh_head);
+
+       if (hnd->next == NULL) { /* it's the tail */
+               dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? NULL :
+                             container_of(hnd->pprev, struct hlist_node, next);
+       }
+       hlist_del_init(hnd);
+       return -1; /* unknown depth */
+}
+
+/**
+ * Double-ended hash head (also tracks the hlist tail) with depth
+ * tracking; new elements are always added to the tail of the hlist.
+ */
+typedef struct {
+       struct hlist_head       dd_head;        /**< entries list */
+       struct hlist_node       *dd_tail;       /**< the last entry */
+       unsigned int        dd_depth;       /**< list length */
+} cfs_hash_dhead_dep_t;
+
+static int
+cfs_hash_dd_hhead_size(cfs_hash_t *hs)
+{
+       return sizeof(cfs_hash_dhead_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_dd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       cfs_hash_dhead_dep_t *head;
+
+       head = (cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0];
+       return &head[bd->bd_offset].dd_head;
+}
+
+static int
+cfs_hash_dd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+                                               cfs_hash_dhead_dep_t, dd_head);
+
+       if (dh->dd_tail != NULL) /* not empty */
+               hlist_add_after(dh->dd_tail, hnode);
+       else /* empty list */
+               hlist_add_head(hnode, &dh->dd_head);
+       dh->dd_tail = hnode;
+       return ++dh->dd_depth;
+}
+
+static int
+cfs_hash_dd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnd)
+{
+       cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+                                               cfs_hash_dhead_dep_t, dd_head);
+
+       if (hnd->next == NULL) { /* it's the tail */
+               dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL :
+                             container_of(hnd->pprev, struct hlist_node, next);
+       }
+       hlist_del_init(hnd);
+       return --dh->dd_depth;
+}
+
+static cfs_hash_hlist_ops_t cfs_hash_hh_hops = {
+       .hop_hhead      = cfs_hash_hh_hhead,
+       .hop_hhead_size = cfs_hash_hh_hhead_size,
+       .hop_hnode_add  = cfs_hash_hh_hnode_add,
+       .hop_hnode_del  = cfs_hash_hh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_hd_hops = {
+       .hop_hhead      = cfs_hash_hd_hhead,
+       .hop_hhead_size = cfs_hash_hd_hhead_size,
+       .hop_hnode_add  = cfs_hash_hd_hnode_add,
+       .hop_hnode_del  = cfs_hash_hd_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dh_hops = {
+       .hop_hhead      = cfs_hash_dh_hhead,
+       .hop_hhead_size = cfs_hash_dh_hhead_size,
+       .hop_hnode_add  = cfs_hash_dh_hnode_add,
+       .hop_hnode_del  = cfs_hash_dh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dd_hops = {
+       .hop_hhead      = cfs_hash_dd_hhead,
+       .hop_hhead_size = cfs_hash_dd_hhead_size,
+       .hop_hnode_add  = cfs_hash_dd_hnode_add,
+       .hop_hnode_del  = cfs_hash_dd_hnode_del,
+};
+
+static void
+cfs_hash_hlist_setup(cfs_hash_t *hs)
+{
+       if (cfs_hash_with_add_tail(hs)) {
+               hs->hs_hops = cfs_hash_with_depth(hs) ?
+                             &cfs_hash_dd_hops : &cfs_hash_dh_hops;
+       } else {
+               hs->hs_hops = cfs_hash_with_depth(hs) ?
+                             &cfs_hash_hd_hops : &cfs_hash_hh_hops;
+       }
+}
+
+static void
+cfs_hash_bd_from_key(cfs_hash_t *hs, cfs_hash_bucket_t **bkts,
+                    unsigned int bits, const void *key, cfs_hash_bd_t *bd)
+{
+       unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1);
+
+       LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits);
+
+       bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)];
+       bd->bd_offset = index >> (bits - hs->hs_bkt_bits);
+}
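The split above can be read as follows: the low (bits - hs_bkt_bits) bits of the hashed index select the bucket, and the remaining high bits select the hlist head inside that bucket. The standalone sketch below (hypothetical values, userspace only, not part of the patch) mirrors that arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned int bits = 7;		/* like hs_cur_bits: 128 hlist heads total */
	unsigned int bkt_bits = 3;	/* like hs_bkt_bits: 8 hlist heads per bucket */
	unsigned int index = 0x5a;	/* pretend cfs_hash_id() returned this */

	/* the low (bits - bkt_bits) bits pick the bucket ... */
	unsigned int bucket = index & ((1U << (bits - bkt_bits)) - 1);
	/* ... and the remaining high bits pick the hlist head inside it */
	unsigned int offset = index >> (bits - bkt_bits);

	printf("bucket %u, offset %u\n", bucket, offset);	/* bucket 10, offset 5 */
	return 0;
}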
+
+void
+cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd)
+{
+       /* NB: caller should hold hs->hs_rwlock if REHASH is set */
+       if (likely(hs->hs_rehash_buckets == NULL)) {
+               cfs_hash_bd_from_key(hs, hs->hs_buckets,
+                                    hs->hs_cur_bits, key, bd);
+       } else {
+               LASSERT(hs->hs_rehash_bits != 0);
+               cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+                                    hs->hs_rehash_bits, key, bd);
+       }
+}
+EXPORT_SYMBOL(cfs_hash_bd_get);
+
+static inline void
+cfs_hash_bd_dep_record(cfs_hash_t *hs, cfs_hash_bd_t *bd, int dep_cur)
+{
+       if (likely(dep_cur <= bd->bd_bucket->hsb_depmax))
+               return;
+
+       bd->bd_bucket->hsb_depmax = dep_cur;
+# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+       if (likely(warn_on_depth == 0 ||
+                  max(warn_on_depth, hs->hs_dep_max) >= dep_cur))
+               return;
+
+       spin_lock(&hs->hs_dep_lock);
+       hs->hs_dep_max  = dep_cur;
+       hs->hs_dep_bkt  = bd->bd_bucket->hsb_index;
+       hs->hs_dep_off  = bd->bd_offset;
+       hs->hs_dep_bits = hs->hs_cur_bits;
+       spin_unlock(&hs->hs_dep_lock);
+
+       cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi);
+# endif
+}
+
+void
+cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                      struct hlist_node *hnode)
+{
+       int             rc;
+
+       rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode);
+       cfs_hash_bd_dep_record(hs, bd, rc);
+       bd->bd_bucket->hsb_version++;
+       if (unlikely(bd->bd_bucket->hsb_version == 0))
+               bd->bd_bucket->hsb_version++;
+       bd->bd_bucket->hsb_count++;
+
+       if (cfs_hash_with_counter(hs))
+               atomic_inc(&hs->hs_count);
+       if (!cfs_hash_with_no_itemref(hs))
+               cfs_hash_get(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_add_locked);
+
+void
+cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                      struct hlist_node *hnode)
+{
+       hs->hs_hops->hop_hnode_del(hs, bd, hnode);
+
+       LASSERT(bd->bd_bucket->hsb_count > 0);
+       bd->bd_bucket->hsb_count--;
+       bd->bd_bucket->hsb_version++;
+       if (unlikely(bd->bd_bucket->hsb_version == 0))
+               bd->bd_bucket->hsb_version++;
+
+       if (cfs_hash_with_counter(hs)) {
+               LASSERT(atomic_read(&hs->hs_count) > 0);
+               atomic_dec(&hs->hs_count);
+       }
+       if (!cfs_hash_with_no_itemref(hs))
+               cfs_hash_put_locked(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_del_locked);
+
+void
+cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+                       cfs_hash_bd_t *bd_new, struct hlist_node *hnode)
+{
+       cfs_hash_bucket_t *obkt = bd_old->bd_bucket;
+       cfs_hash_bucket_t *nbkt = bd_new->bd_bucket;
+       int             rc;
+
+       if (cfs_hash_bd_compare(bd_old, bd_new) == 0)
+               return;
+
+       /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops
+        * in cfs_hash_bd_del/add_locked */
+       hs->hs_hops->hop_hnode_del(hs, bd_old, hnode);
+       rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode);
+       cfs_hash_bd_dep_record(hs, bd_new, rc);
+
+       LASSERT(obkt->hsb_count > 0);
+       obkt->hsb_count--;
+       obkt->hsb_version++;
+       if (unlikely(obkt->hsb_version == 0))
+               obkt->hsb_version++;
+       nbkt->hsb_count++;
+       nbkt->hsb_version++;
+       if (unlikely(nbkt->hsb_version == 0))
+               nbkt->hsb_version++;
+}
+EXPORT_SYMBOL(cfs_hash_bd_move_locked);
+
+enum {
+       /** always set, for sanity (avoid ZERO intent) */
+       CFS_HS_LOOKUP_MASK_FIND     = 1 << 0,
+       /** return entry with a ref */
+       CFS_HS_LOOKUP_MASK_REF      = 1 << 1,
+       /** add entry if not existing */
+       CFS_HS_LOOKUP_MASK_ADD      = 1 << 2,
+       /** delete entry, ignore other masks */
+       CFS_HS_LOOKUP_MASK_DEL      = 1 << 3,
+};
+
+typedef enum cfs_hash_lookup_intent {
+       /** return item w/o refcount */
+       CFS_HS_LOOKUP_IT_PEEK       = CFS_HS_LOOKUP_MASK_FIND,
+       /** return item with refcount */
+       CFS_HS_LOOKUP_IT_FIND       = (CFS_HS_LOOKUP_MASK_FIND |
+                                      CFS_HS_LOOKUP_MASK_REF),
+       /** return item w/o refcount if it exists, otherwise add */
+       CFS_HS_LOOKUP_IT_ADD    = (CFS_HS_LOOKUP_MASK_FIND |
+                                      CFS_HS_LOOKUP_MASK_ADD),
+       /** return item with refcount if it exists, otherwise add */
+       CFS_HS_LOOKUP_IT_FINDADD    = (CFS_HS_LOOKUP_IT_FIND |
+                                      CFS_HS_LOOKUP_MASK_ADD),
+       /** delete if it exists */
+       CFS_HS_LOOKUP_IT_FINDDEL    = (CFS_HS_LOOKUP_MASK_FIND |
+                                      CFS_HS_LOOKUP_MASK_DEL)
+} cfs_hash_lookup_intent_t;
+
+static struct hlist_node *
+cfs_hash_bd_lookup_intent(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                         const void *key, struct hlist_node *hnode,
+                         cfs_hash_lookup_intent_t intent)
+{
+       struct hlist_head  *hhead = cfs_hash_bd_hhead(hs, bd);
+       struct hlist_node  *ehnode;
+       struct hlist_node  *match;
+       int  intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0;
+
+       /* with this function, we can avoid a lot of useless refcount ops,
+        * which are expensive atomic operations most of the time. */
+       match = intent_add ? NULL : hnode;
+       hlist_for_each(ehnode, hhead) {
+               if (!cfs_hash_keycmp(hs, key, ehnode))
+                       continue;
+
+               if (match != NULL && match != ehnode) /* can't match */
+                       continue;
+
+               /* match and ... */
+               if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) {
+                       cfs_hash_bd_del_locked(hs, bd, ehnode);
+                       return ehnode;
+               }
+
+               /* caller wants refcount? */
+               if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0)
+                       cfs_hash_get(hs, ehnode);
+               return ehnode;
+       }
+       /* no match item */
+       if (!intent_add)
+               return NULL;
+
+       LASSERT(hnode != NULL);
+       cfs_hash_bd_add_locked(hs, bd, hnode);
+       return hnode;
+}
+
+struct hlist_node *
+cfs_hash_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key)
+{
+       return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+                                        CFS_HS_LOOKUP_IT_FIND);
+}
+EXPORT_SYMBOL(cfs_hash_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_bd_peek_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key)
+{
+       return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+                                        CFS_HS_LOOKUP_IT_PEEK);
+}
+EXPORT_SYMBOL(cfs_hash_bd_peek_locked);
+
+struct hlist_node *
+cfs_hash_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                          const void *key, struct hlist_node *hnode,
+                          int noref)
+{
+       return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+                                        CFS_HS_LOOKUP_IT_ADD |
+                                        (!noref * CFS_HS_LOOKUP_MASK_REF));
+}
+EXPORT_SYMBOL(cfs_hash_bd_findadd_locked);
+
+struct hlist_node *
+cfs_hash_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                          const void *key, struct hlist_node *hnode)
+{
+       /* hnode can be NULL, we find the first item with @key */
+       return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+                                        CFS_HS_LOOKUP_IT_FINDDEL);
+}
+EXPORT_SYMBOL(cfs_hash_bd_finddel_locked);
+
+static void
+cfs_hash_multi_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                      unsigned n, int excl)
+{
+       cfs_hash_bucket_t *prev = NULL;
+       int             i;
+
+       /**
+        * bds must be sorted in ascending order by bd->bd_bucket->hsb_index.
+        * NB: it's possible that several bds point to the same bucket but
+        * have different bd::bd_offset, so take care to avoid deadlock.
+        */
+       cfs_hash_for_each_bd(bds, n, i) {
+               if (prev == bds[i].bd_bucket)
+                       continue;
+
+               LASSERT(prev == NULL ||
+                       prev->hsb_index < bds[i].bd_bucket->hsb_index);
+               cfs_hash_bd_lock(hs, &bds[i], excl);
+               prev = bds[i].bd_bucket;
+       }
+}
+
+static void
+cfs_hash_multi_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                        unsigned n, int excl)
+{
+       cfs_hash_bucket_t *prev = NULL;
+       int             i;
+
+       cfs_hash_for_each_bd(bds, n, i) {
+               if (prev != bds[i].bd_bucket) {
+                       cfs_hash_bd_unlock(hs, &bds[i], excl);
+                       prev = bds[i].bd_bucket;
+               }
+       }
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               unsigned n, const void *key)
+{
+       struct hlist_node  *ehnode;
+       unsigned           i;
+
+       cfs_hash_for_each_bd(bds, n, i) {
+               ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL,
+                                                  CFS_HS_LOOKUP_IT_FIND);
+               if (ehnode != NULL)
+                       return ehnode;
+       }
+       return NULL;
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_findadd_locked(cfs_hash_t *hs,
+                                cfs_hash_bd_t *bds, unsigned n, const void *key,
+                                struct hlist_node *hnode, int noref)
+{
+       struct hlist_node  *ehnode;
+       int             intent;
+       unsigned           i;
+
+       LASSERT(hnode != NULL);
+       intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF);
+
+       cfs_hash_for_each_bd(bds, n, i) {
+               ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key,
+                                                  NULL, intent);
+               if (ehnode != NULL)
+                       return ehnode;
+       }
+
+       if (i == 1) { /* only one bucket */
+               cfs_hash_bd_add_locked(hs, &bds[0], hnode);
+       } else {
+               cfs_hash_bd_t      mybd;
+
+               cfs_hash_bd_get(hs, key, &mybd);
+               cfs_hash_bd_add_locked(hs, &mybd, hnode);
+       }
+
+       return hnode;
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                                unsigned n, const void *key,
+                                struct hlist_node *hnode)
+{
+       struct hlist_node  *ehnode;
+       unsigned           i;
+
+       cfs_hash_for_each_bd(bds, n, i) {
+               ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode,
+                                                  CFS_HS_LOOKUP_IT_FINDDEL);
+               if (ehnode != NULL)
+                       return ehnode;
+       }
+       return NULL;
+}
+
+static void
+cfs_hash_bd_order(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+       int     rc;
+
+       if (bd2->bd_bucket == NULL)
+               return;
+
+       if (bd1->bd_bucket == NULL) {
+               *bd1 = *bd2;
+               bd2->bd_bucket = NULL;
+               return;
+       }
+
+       rc = cfs_hash_bd_compare(bd1, bd2);
+       if (rc == 0) {
+               bd2->bd_bucket = NULL;
+
+       } else if (rc > 0) { /* swap bd1 and bd2 */
+               cfs_hash_bd_t tmp;
+
+               tmp = *bd2;
+               *bd2 = *bd1;
+               *bd1 = tmp;
+       }
+}
+
+void
+cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds)
+{
+       /* NB: caller should hold hs_lock.rw if REHASH is set */
+       cfs_hash_bd_from_key(hs, hs->hs_buckets,
+                            hs->hs_cur_bits, key, &bds[0]);
+       if (likely(hs->hs_rehash_buckets == NULL)) {
+               /* no rehash or not rehashing */
+               bds[1].bd_bucket = NULL;
+               return;
+       }
+
+       LASSERT(hs->hs_rehash_bits != 0);
+       cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+                            hs->hs_rehash_bits, key, &bds[1]);
+
+       cfs_hash_bd_order(&bds[0], &bds[1]);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_get);
+
+void
+cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl)
+{
+       cfs_hash_multi_bd_lock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lock);
+
+void
+cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl)
+{
+       cfs_hash_multi_bd_unlock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_unlock);
+
+struct hlist_node *
+cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                              const void *key)
+{
+       return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               const void *key, struct hlist_node *hnode,
+                               int noref)
+{
+       return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key,
+                                               hnode, noref);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked);
+
+struct hlist_node *
+cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               const void *key, struct hlist_node *hnode)
+{
+       return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked);
+
+static void
+cfs_hash_buckets_free(cfs_hash_bucket_t **buckets,
+                     int bkt_size, int prev_size, int size)
+{
+       int     i;
+
+       for (i = prev_size; i < size; i++) {
+               if (buckets[i] != NULL)
+                       LIBCFS_FREE(buckets[i], bkt_size);
+       }
+
+       LIBCFS_FREE(buckets, sizeof(buckets[0]) * size);
+}
+
+/*
+ * Create or grow bucket memory. Return old_buckets if no allocation was
+ * needed, the newly allocated buckets if allocation was needed and
+ * successful, and NULL on error.
+ */
+static cfs_hash_bucket_t **
+cfs_hash_buckets_realloc(cfs_hash_t *hs, cfs_hash_bucket_t **old_bkts,
+                        unsigned int old_size, unsigned int new_size)
+{
+       cfs_hash_bucket_t **new_bkts;
+       int              i;
+
+       LASSERT(old_size == 0 || old_bkts != NULL);
+
+       if (old_bkts != NULL && old_size == new_size)
+               return old_bkts;
+
+       LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size);
+       if (new_bkts == NULL)
+               return NULL;
+
+       if (old_bkts != NULL) {
+               memcpy(new_bkts, old_bkts,
+                      min(old_size, new_size) * sizeof(*old_bkts));
+       }
+
+       for (i = old_size; i < new_size; i++) {
+               struct hlist_head *hhead;
+               cfs_hash_bd_t     bd;
+
+               LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs));
+               if (new_bkts[i] == NULL) {
+                       cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs),
+                                             old_size, new_size);
+                       return NULL;
+               }
+
+               new_bkts[i]->hsb_index   = i;
+               new_bkts[i]->hsb_version = 1;  /* shouldn't be zero */
+               new_bkts[i]->hsb_depmax  = -1; /* unknown */
+               bd.bd_bucket = new_bkts[i];
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead)
+                       INIT_HLIST_HEAD(hhead);
+
+               if (cfs_hash_with_no_lock(hs) ||
+                   cfs_hash_with_no_bktlock(hs))
+                       continue;
+
+               if (cfs_hash_with_rw_bktlock(hs))
+                       rwlock_init(&new_bkts[i]->hsb_lock.rw);
+               else if (cfs_hash_with_spin_bktlock(hs))
+                       spin_lock_init(&new_bkts[i]->hsb_lock.spin);
+               else
+                       LBUG(); /* invalid use-case */
+       }
+       return new_bkts;
+}
+
+/**
+ * Initialize new libcfs hash, where:
+ * @name     - Descriptive hash name
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
+ * @ops      - Registered hash table operations
+ * @flags    - CFS_HASH_REHASH enable dynamic hash resizing
+ *           - CFS_HASH_SORT enable chained hash sort
+ */
+static int cfs_hash_rehash_worker(cfs_workitem_t *wi);
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static int cfs_hash_dep_print(cfs_workitem_t *wi)
+{
+       cfs_hash_t *hs = container_of(wi, cfs_hash_t, hs_dep_wi);
+       int      dep;
+       int      bkt;
+       int      off;
+       int      bits;
+
+       spin_lock(&hs->hs_dep_lock);
+       dep  = hs->hs_dep_max;
+       bkt  = hs->hs_dep_bkt;
+       off  = hs->hs_dep_off;
+       bits = hs->hs_dep_bits;
+       spin_unlock(&hs->hs_dep_lock);
+
+       LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n",
+                     hs->hs_name, bits, dep, bkt, off);
+       spin_lock(&hs->hs_dep_lock);
+       hs->hs_dep_bits = 0; /* mark as workitem done */
+       spin_unlock(&hs->hs_dep_lock);
+       return 0;
+}
+
+static void cfs_hash_depth_wi_init(cfs_hash_t *hs)
+{
+       spin_lock_init(&hs->hs_dep_lock);
+       cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print);
+}
+
+static void cfs_hash_depth_wi_cancel(cfs_hash_t *hs)
+{
+       if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi))
+               return;
+
+       spin_lock(&hs->hs_dep_lock);
+       while (hs->hs_dep_bits != 0) {
+               spin_unlock(&hs->hs_dep_lock);
+               cond_resched();
+               spin_lock(&hs->hs_dep_lock);
+       }
+       spin_unlock(&hs->hs_dep_lock);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */
+
+static inline void cfs_hash_depth_wi_init(cfs_hash_t *hs) {}
+static inline void cfs_hash_depth_wi_cancel(cfs_hash_t *hs) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */
+
+cfs_hash_t *
+cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+               unsigned bkt_bits, unsigned extra_bytes,
+               unsigned min_theta, unsigned max_theta,
+               cfs_hash_ops_t *ops, unsigned flags)
+{
+       cfs_hash_t *hs;
+       int      len;
+
+       ENTRY;
+
+       CLASSERT(CFS_HASH_THETA_BITS < 15);
+
+       LASSERT(name != NULL);
+       LASSERT(ops != NULL);
+       LASSERT(ops->hs_key);
+       LASSERT(ops->hs_hash);
+       LASSERT(ops->hs_object);
+       LASSERT(ops->hs_keycmp);
+       LASSERT(ops->hs_get != NULL);
+       LASSERT(ops->hs_put_locked != NULL);
+
+       if ((flags & CFS_HASH_REHASH) != 0)
+               flags |= CFS_HASH_COUNTER; /* must have counter */
+
+       LASSERT(cur_bits > 0);
+       LASSERT(cur_bits >= bkt_bits);
+       LASSERT(max_bits >= cur_bits && max_bits < 31);
+       LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits));
+       LASSERT(ergo((flags & CFS_HASH_REHASH) != 0,
+                    (flags & CFS_HASH_NO_LOCK) == 0));
+       LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0,
+                     ops->hs_keycpy != NULL));
+
+       len = (flags & CFS_HASH_BIGNAME) == 0 ?
+             CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN;
+       LIBCFS_ALLOC(hs, offsetof(cfs_hash_t, hs_name[len]));
+       if (hs == NULL)
+               RETURN(NULL);
+
+       strncpy(hs->hs_name, name, len);
+       hs->hs_name[len - 1] = '\0';
+       hs->hs_flags = flags;
+
+       atomic_set(&hs->hs_refcount, 1);
+       atomic_set(&hs->hs_count, 0);
+
+       cfs_hash_lock_setup(hs);
+       cfs_hash_hlist_setup(hs);
+
+       hs->hs_cur_bits = (__u8)cur_bits;
+       hs->hs_min_bits = (__u8)cur_bits;
+       hs->hs_max_bits = (__u8)max_bits;
+       hs->hs_bkt_bits = (__u8)bkt_bits;
+
+       hs->hs_ops       = ops;
+       hs->hs_extra_bytes = extra_bytes;
+       hs->hs_rehash_bits = 0;
+       cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker);
+       cfs_hash_depth_wi_init(hs);
+
+       if (cfs_hash_with_rehash(hs))
+               __cfs_hash_set_theta(hs, min_theta, max_theta);
+
+       hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0,
+                                                 CFS_HASH_NBKT(hs));
+       if (hs->hs_buckets != NULL)
+               return hs;
+
+       LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[len]));
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(cfs_hash_create);
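As a hedged usage sketch (editorial illustration, not code from the Lustre tree): a typical caller embeds a struct hlist_node in its object, supplies an ops table implementing the callbacks cfs_hash_create() asserts on (hs_key, hs_hash, hs_object, hs_keycmp, hs_get, hs_put_locked), and creates the table. All names and tuning values below are hypothetical, and CFS_HASH_MIN_THETA/CFS_HASH_MAX_THETA are assumed constants from the libcfs hash header, not shown in this hunk.

/* hypothetical object hashed by a 64-bit id */
struct obj {
	__u64			o_id;		/* hash key */
	struct hlist_node	o_hnode;	/* linkage used by the hash */
};

/* assumed to be filled in with hs_key, hs_hash, hs_object, hs_keycmp,
 * hs_get and hs_put_locked implementations for struct obj */
static cfs_hash_ops_t obj_hash_ops;

static cfs_hash_t *obj_hash;

static int obj_hash_init(void)
{
	obj_hash = cfs_hash_create("obj_hash",
				   7,	/* cur_bits: start with 128 hlist heads */
				   13,	/* max_bits: allow growth up to 8192 */
				   3,	/* bkt_bits: 8 hlist heads per bucket */
				   0,	/* extra_bytes */
				   CFS_HASH_MIN_THETA,	/* assumed constant */
				   CFS_HASH_MAX_THETA,	/* assumed constant */
				   &obj_hash_ops,
				   CFS_HASH_REHASH | CFS_HASH_COUNTER);
	return obj_hash != NULL ? 0 : -ENOMEM;
}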
+
+/**
+ * Cleanup libcfs hash @hs.
+ */
+static void
+cfs_hash_destroy(cfs_hash_t *hs)
+{
+       struct hlist_node     *hnode;
+       struct hlist_node     *pos;
+       cfs_hash_bd_t    bd;
+       int                i;
+       ENTRY;
+
+       LASSERT(hs != NULL);
+       LASSERT(!cfs_hash_is_exiting(hs) &&
+               !cfs_hash_is_iterating(hs));
+
+       /**
+        * prohibit further rehashes; no lock is needed because I'm
+        * the only (last) one who can change it.
+        */
+       hs->hs_exiting = 1;
+       if (cfs_hash_with_rehash(hs))
+               cfs_hash_rehash_cancel(hs);
+
+       cfs_hash_depth_wi_cancel(hs);
+       /* rehash should be done/canceled */
+       LASSERT(hs->hs_buckets != NULL &&
+               hs->hs_rehash_buckets == NULL);
+
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               struct hlist_head *hhead;
+
+               LASSERT(bd.bd_bucket != NULL);
+               /* no need to take this lock, it's just for code consistency */
+               cfs_hash_bd_lock(hs, &bd, 1);
+
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+                       hlist_for_each_safe(hnode, pos, hhead) {
+                               LASSERTF(!cfs_hash_with_assert_empty(hs),
+                                        "hash %s bucket %u(%u) is not"
+                                        " empty: %u items left\n",
+                                        hs->hs_name, bd.bd_bucket->hsb_index,
+                                        bd.bd_offset, bd.bd_bucket->hsb_count);
+                               /* can't assert key validity, because
+                                * rehash may have been interrupted */
+                               cfs_hash_bd_del_locked(hs, &bd, hnode);
+                               cfs_hash_exit(hs, hnode);
+                       }
+               }
+               LASSERT(bd.bd_bucket->hsb_count == 0);
+               cfs_hash_bd_unlock(hs, &bd, 1);
+               cond_resched();
+       }
+
+       LASSERT(atomic_read(&hs->hs_count) == 0);
+
+       cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs),
+                             0, CFS_HASH_NBKT(hs));
+       i = cfs_hash_with_bigname(hs) ?
+           CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN;
+       LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[i]));
+
+       EXIT;
+}
+
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs)
+{
+       if (atomic_inc_not_zero(&hs->hs_refcount))
+               return hs;
+       return NULL;
+}
+EXPORT_SYMBOL(cfs_hash_getref);
+
+void cfs_hash_putref(cfs_hash_t *hs)
+{
+       if (atomic_dec_and_test(&hs->hs_refcount))
+               cfs_hash_destroy(hs);
+}
+EXPORT_SYMBOL(cfs_hash_putref);
+
+static inline int
+cfs_hash_rehash_bits(cfs_hash_t *hs)
+{
+       if (cfs_hash_with_no_lock(hs) ||
+           !cfs_hash_with_rehash(hs))
+               return -EOPNOTSUPP;
+
+       if (unlikely(cfs_hash_is_exiting(hs)))
+               return -ESRCH;
+
+       if (unlikely(cfs_hash_is_rehashing(hs)))
+               return -EALREADY;
+
+       if (unlikely(cfs_hash_is_iterating(hs)))
+               return -EAGAIN;
+
+       /* XXX: need to handle case with max_theta != 2.0
+        *      and the case with min_theta != 0.5 */
+       if ((hs->hs_cur_bits < hs->hs_max_bits) &&
+           (__cfs_hash_theta(hs) > hs->hs_max_theta))
+               return hs->hs_cur_bits + 1;
+
+       if (!cfs_hash_with_shrink(hs))
+               return 0;
+
+       if ((hs->hs_cur_bits > hs->hs_min_bits) &&
+           (__cfs_hash_theta(hs) < hs->hs_min_theta))
+               return hs->hs_cur_bits - 1;
+
+       return 0;
+}
+
+/**
+ * don't allow inline rehash if:
+ * - the user wants non-blocking changes (add/del) on the hash table
+ * - there are too many elements
+ */
+static inline int
+cfs_hash_rehash_inline(cfs_hash_t *hs)
+{
+       return !cfs_hash_with_nblk_change(hs) &&
+              atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  The registered
+ * ops->hs_get function will be called when the item is added.
+ */
+void
+cfs_hash_add(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       cfs_hash_bd_t   bd;
+       int          bits;
+
+       LASSERT(hlist_unhashed(hnode));
+
+       cfs_hash_lock(hs, 0);
+       cfs_hash_bd_get_and_lock(hs, key, &bd, 1);
+
+       cfs_hash_key_validate(hs, key, hnode);
+       cfs_hash_bd_add_locked(hs, &bd, hnode);
+
+       cfs_hash_bd_unlock(hs, &bd, 1);
+
+       bits = cfs_hash_rehash_bits(hs);
+       cfs_hash_unlock(hs, 0);
+       if (bits > 0)
+               cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+}
+EXPORT_SYMBOL(cfs_hash_add);
+
+static struct hlist_node *
+cfs_hash_find_or_add(cfs_hash_t *hs, const void *key,
+                    struct hlist_node *hnode, int noref)
+{
+       struct hlist_node *ehnode;
+       cfs_hash_bd_t     bds[2];
+       int            bits = 0;
+
+       LASSERT(hlist_unhashed(hnode));
+
+       cfs_hash_lock(hs, 0);
+       cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+       cfs_hash_key_validate(hs, key, hnode);
+       ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key,
+                                                hnode, noref);
+       cfs_hash_dual_bd_unlock(hs, bds, 1);
+
+       if (ehnode == hnode) /* new item added */
+               bits = cfs_hash_rehash_bits(hs);
+       cfs_hash_unlock(hs, 0);
+       if (bits > 0)
+               cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+       return ehnode;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  The registered
+ * ops->hs_get function will be called if the item was added.
+ * Returns 0 on success or -EALREADY on key collisions.
+ */
+int
+cfs_hash_add_unique(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ?
+              -EALREADY : 0;
+}
+EXPORT_SYMBOL(cfs_hash_add_unique);
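A hypothetical insertion helper continuing the struct obj sketch above; the hash takes its own reference via ops->hs_get when the item is added, while -EALREADY signals a key collision (plain cfs_hash_add() would allow duplicate keys):

static int obj_insert(cfs_hash_t *hs, struct obj *obj)
{
	/* returns -EALREADY if an item with the same key is already hashed */
	return cfs_hash_add_unique(hs, &obj->o_id, &obj->o_hnode);
}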
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  If this @key
+ * already exists in the hash then ops->hs_get will be called on the
+ * conflicting entry and that entry will be returned to the caller.
+ * Otherwise ops->hs_get is called on the item which was added.
+ */
+void *
+cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+                       struct hlist_node *hnode)
+{
+       hnode = cfs_hash_find_or_add(hs, key, hnode, 0);
+
+       return cfs_hash_object(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_findadd_unique);
+
+/**
+ * Delete item @hnode from the libcfs hash @hs using @key.  The @key
+ * is required to ensure the correct hash bucket is locked since there
+ * is no direct linkage from the item to the bucket.  The object
+ * removed from the hash will be returned and ops->hs_put is called
+ * on the removed object.
+ */
+void *
+cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       void       *obj  = NULL;
+       int          bits = 0;
+       cfs_hash_bd_t   bds[2];
+
+       cfs_hash_lock(hs, 0);
+       cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+       /* NB: do nothing if @hnode is not in hash table */
+       if (hnode == NULL || !hlist_unhashed(hnode)) {
+               if (bds[1].bd_bucket == NULL && hnode != NULL) {
+                       cfs_hash_bd_del_locked(hs, &bds[0], hnode);
+               } else {
+                       hnode = cfs_hash_dual_bd_finddel_locked(hs, bds,
+                                                               key, hnode);
+               }
+       }
+
+       if (hnode != NULL) {
+               obj  = cfs_hash_object(hs, hnode);
+               bits = cfs_hash_rehash_bits(hs);
+       }
+
+       cfs_hash_dual_bd_unlock(hs, bds, 1);
+       cfs_hash_unlock(hs, 0);
+       if (bits > 0)
+               cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+       return obj;
+}
+EXPORT_SYMBOL(cfs_hash_del);
+
+/**
+ * Delete the item matching @key in libcfs hash @hs.  The first @key found
+ * in the hash will be removed; if the key exists multiple times in the
+ * hash @hs, this function must be called once per key.  The removed object
+ * will be returned and ops->hs_put is called on the removed object.
+ */
+void *
+cfs_hash_del_key(cfs_hash_t *hs, const void *key)
+{
+       return cfs_hash_del(hs, key, NULL);
+}
+EXPORT_SYMBOL(cfs_hash_del_key);
+
+/**
+ * Look up an item using @key in the libcfs hash @hs and return it.
+ * If the @key is found in the hash, hs->hs_get() is called and the
+ * matching object is returned.  It is the caller's responsibility
+ * to call the counterpart ops->hs_put using the cfs_hash_put() macro
+ * when finished with the object.  If the @key was not found in the
+ * hash @hs, NULL is returned.
+ */
+void *
+cfs_hash_lookup(cfs_hash_t *hs, const void *key)
+{
+       void             *obj = NULL;
+       struct hlist_node     *hnode;
+       cfs_hash_bd_t    bds[2];
+
+       cfs_hash_lock(hs, 0);
+       cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+       hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+       if (hnode != NULL)
+               obj = cfs_hash_object(hs, hnode);
+
+       cfs_hash_dual_bd_unlock(hs, bds, 0);
+       cfs_hash_unlock(hs, 0);
+
+       return obj;
+}
+EXPORT_SYMBOL(cfs_hash_lookup);
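Continuing the same hypothetical sketch, a lookup hands the object back with a reference held (ops->hs_get was called), so the caller drops it with cfs_hash_put() when done:

static void obj_touch(cfs_hash_t *hs, __u64 id)
{
	struct obj *obj;

	obj = cfs_hash_lookup(hs, &id);
	if (obj == NULL)
		return;

	/* ... use the object while holding the reference ... */

	cfs_hash_put(hs, &obj->o_hnode);	/* counterpart of ops->hs_get */
}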
+
+static void
+cfs_hash_for_each_enter(cfs_hash_t *hs)
+{
+       LASSERT(!cfs_hash_is_exiting(hs));
+
+       if (!cfs_hash_with_rehash(hs))
+               return;
+       /*
+        * NB: there is a race on cfs_hash_t::hs_iterating, but it doesn't
+        * matter because it's just an unreliable signal to the rehash
+        * thread, which will try to finish the rehash ASAP when seeing it.
+        */
+       hs->hs_iterating = 1;
+
+       cfs_hash_lock(hs, 1);
+       hs->hs_iterators++;
+
+       /* NB: iteration is mostly called by service threads, so instead
+        * of blocking a service thread we prefer to cancel the pending
+        * rehash request and relaunch it after the iteration */
+       if (cfs_hash_is_rehashing(hs))
+               cfs_hash_rehash_cancel_locked(hs);
+       cfs_hash_unlock(hs, 1);
+}
+
+static void
+cfs_hash_for_each_exit(cfs_hash_t *hs)
+{
+       int remained;
+       int bits;
+
+       if (!cfs_hash_with_rehash(hs))
+               return;
+       cfs_hash_lock(hs, 1);
+       remained = --hs->hs_iterators;
+       bits = cfs_hash_rehash_bits(hs);
+       cfs_hash_unlock(hs, 1);
+       /* NB: there is a race on cfs_hash_t::hs_iterating, see above */
+       if (remained == 0)
+               hs->hs_iterating = 0;
+       if (bits > 0) {
+               cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
+                                   CFS_HASH_LOOP_HOG);
+       }
+}
+
+/**
+ * For each item in the libcfs hash @hs call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ *
+ * a) the function may sleep!
+ * b) during the callback:
+ *    . the bucket lock is held so the callback must never sleep.
+ *    . if @remove_safe is true, the user can remove the current item
+ *      with cfs_hash_bd_del_locked
+ */
+static __u64
+cfs_hash_for_each_tight(cfs_hash_t *hs, cfs_hash_for_each_cb_t func,
+                       void *data, int remove_safe)
+{
+       struct hlist_node     *hnode;
+       struct hlist_node     *pos;
+       cfs_hash_bd_t    bd;
+       __u64            count = 0;
+       int                excl  = !!remove_safe;
+       int                loop  = 0;
+       int                i;
+       ENTRY;
+
+       cfs_hash_for_each_enter(hs);
+
+       cfs_hash_lock(hs, 0);
+       LASSERT(!cfs_hash_is_rehashing(hs));
+
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               struct hlist_head *hhead;
+
+               cfs_hash_bd_lock(hs, &bd, excl);
+               if (func == NULL) { /* only glimpse size */
+                       count += bd.bd_bucket->hsb_count;
+                       cfs_hash_bd_unlock(hs, &bd, excl);
+                       continue;
+               }
+
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+                       hlist_for_each_safe(hnode, pos, hhead) {
+                               cfs_hash_bucket_validate(hs, &bd, hnode);
+                               count++;
+                               loop++;
+                               if (func(hs, &bd, hnode, data)) {
+                                       cfs_hash_bd_unlock(hs, &bd, excl);
+                                       goto out;
+                               }
+                       }
+               }
+               cfs_hash_bd_unlock(hs, &bd, excl);
+               if (loop < CFS_HASH_LOOP_HOG)
+                       continue;
+               loop = 0;
+               cfs_hash_unlock(hs, 0);
+               cond_resched();
+               cfs_hash_lock(hs, 0);
+       }
+ out:
+       cfs_hash_unlock(hs, 0);
+
+       cfs_hash_for_each_exit(hs);
+       RETURN(count);
+}
+
+typedef struct {
+       cfs_hash_cond_opt_cb_t  func;
+       void               *arg;
+} cfs_hash_cond_arg_t;
+
+static int
+cfs_hash_cond_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                        struct hlist_node *hnode, void *data)
+{
+       cfs_hash_cond_arg_t *cond = data;
+
+       if (cond->func(cfs_hash_object(hs, hnode), cond->arg))
+               cfs_hash_bd_del_locked(hs, bd, hnode);
+       return 0;
+}
+
+/**
+ * Delete items from the libcfs hash @hs for which @func returns true.
+ * The write lock is held while walking each bucket, so no object can
+ * be referenced concurrently.
+ */
+void
+cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t func, void *data)
+{
+       cfs_hash_cond_arg_t arg = {
+               .func   = func,
+               .arg    = data,
+       };
+
+       cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1);
+}
+EXPORT_SYMBOL(cfs_hash_cond_del);
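A hypothetical predicate for cfs_hash_cond_del(), continuing the struct obj sketch; judging by how the callback is invoked above, it takes (object, arg) and runs under the bucket write lock, so it must not sleep:

static int obj_is_stale(void *obj, void *arg)
{
	return ((struct obj *)obj)->o_id < *(__u64 *)arg;
}

static void obj_prune(cfs_hash_t *hs, __u64 min_id)
{
	/* removes every matching item from the hash */
	cfs_hash_cond_del(hs, obj_is_stale, &min_id);
}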
+
+void
+cfs_hash_for_each(cfs_hash_t *hs,
+                 cfs_hash_for_each_cb_t func, void *data)
+{
+       cfs_hash_for_each_tight(hs, func, data, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each);
+
+void
+cfs_hash_for_each_safe(cfs_hash_t *hs,
+                      cfs_hash_for_each_cb_t func, void *data)
+{
+       cfs_hash_for_each_tight(hs, func, data, 1);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_safe);
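A minimal hypothetical iteration callback for the same sketch; the cfs_hash_for_each_cb_t signature mirrors cfs_hash_peek() below (hash, bucket descriptor, node, private data), the bucket lock is held while it runs so it must not sleep, and a non-zero return stops the walk:

static int obj_count_even(cfs_hash_t *hs, cfs_hash_bd_t *bd,
			  struct hlist_node *hnode, void *data)
{
	struct obj *obj = cfs_hash_object(hs, hnode);

	if ((obj->o_id & 1) == 0)
		(*(__u64 *)data)++;

	return 0;	/* keep iterating; non-zero would stop the walk */
}

static __u64 obj_count_even_ids(cfs_hash_t *hs)
{
	__u64 count = 0;

	cfs_hash_for_each(hs, obj_count_even, &count);
	return count;
}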
+
+static int
+cfs_hash_peek(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+             struct hlist_node *hnode, void *data)
+{
+       *(int *)data = 0;
+       return 1; /* return 1 to break the loop */
+}
+
+int
+cfs_hash_is_empty(cfs_hash_t *hs)
+{
+       int empty = 1;
+
+       cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0);
+       return empty;
+}
+EXPORT_SYMBOL(cfs_hash_is_empty);
+
+__u64
+cfs_hash_size_get(cfs_hash_t *hs)
+{
+       return cfs_hash_with_counter(hs) ?
+              atomic_read(&hs->hs_count) :
+              cfs_hash_for_each_tight(hs, NULL, NULL, 0);
+}
+EXPORT_SYMBOL(cfs_hash_size_get);
+
+/*
+ * cfs_hash_for_each_relax:
+ * Iterate the hash table and call @func on each item without holding
+ * any lock.  This function cannot guarantee to finish the iteration
+ * if these features are enabled:
+ *
+ *  a. if rehash_key is enabled, an item can be moved from one bucket
+ *     to another
+ *  b. the user can remove a non-zero-ref item from the hash table, so
+ *     the item can disappear from the table; even worse, the user may
+ *     change its key and insert it into another hash bucket.
+ * There is no way to finish the iteration correctly in the above two
+ * cases, so iteration has to stop whenever a change is detected.
+ */
+static int
+cfs_hash_for_each_relax(cfs_hash_t *hs, cfs_hash_for_each_cb_t func, void *data)
+{
+       struct hlist_node *hnode;
+       struct hlist_node *tmp;
+       cfs_hash_bd_t     bd;
+       __u32        version;
+       int            count = 0;
+       int            stop_on_change;
+       int            rc;
+       int            i;
+       ENTRY;
+
+       stop_on_change = cfs_hash_with_rehash_key(hs) ||
+                        !cfs_hash_with_no_itemref(hs) ||
+                        CFS_HOP(hs, put_locked) == NULL;
+       cfs_hash_lock(hs, 0);
+       LASSERT(!cfs_hash_is_rehashing(hs));
+
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               struct hlist_head *hhead;
+
+               cfs_hash_bd_lock(hs, &bd, 0);
+               version = cfs_hash_bd_version_get(&bd);
+
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+                       for (hnode = hhead->first; hnode != NULL;) {
+                               cfs_hash_bucket_validate(hs, &bd, hnode);
+                               cfs_hash_get(hs, hnode);
+                               cfs_hash_bd_unlock(hs, &bd, 0);
+                               cfs_hash_unlock(hs, 0);
+
+                               rc = func(hs, &bd, hnode, data);
+                               if (stop_on_change)
+                                       cfs_hash_put(hs, hnode);
+                               cond_resched();
+                               count++;
+
+                               cfs_hash_lock(hs, 0);
+                               cfs_hash_bd_lock(hs, &bd, 0);
+                               if (!stop_on_change) {
+                                       tmp = hnode->next;
+                                       cfs_hash_put_locked(hs, hnode);
+                                       hnode = tmp;
+                               } else { /* bucket changed? */
+                                       if (version !=
+                                           cfs_hash_bd_version_get(&bd))
+                                               break;
+                                       /* safe to continue because no change */
+                                       hnode = hnode->next;
+                               }
+                               if (rc) /* callback wants to break iteration */
+                                       break;
+                       }
+               }
+               cfs_hash_bd_unlock(hs, &bd, 0);
+       }
+       cfs_hash_unlock(hs, 0);
+
+       return count;
+}
+
+int
+cfs_hash_for_each_nolock(cfs_hash_t *hs,
+                        cfs_hash_for_each_cb_t func, void *data)
+{
+       ENTRY;
+
+       if (cfs_hash_with_no_lock(hs) ||
+           cfs_hash_with_rehash_key(hs) ||
+           !cfs_hash_with_no_itemref(hs))
+               RETURN(-EOPNOTSUPP);
+
+       if (CFS_HOP(hs, get) == NULL ||
+           (CFS_HOP(hs, put) == NULL &&
+            CFS_HOP(hs, put_locked) == NULL))
+               RETURN(-EOPNOTSUPP);
+
+       cfs_hash_for_each_enter(hs);
+       cfs_hash_for_each_relax(hs, func, data);
+       cfs_hash_for_each_exit(hs);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_nolock);
+
+/**
+ * For each hash bucket in the libcfs hash @hs call the passed callback
+ * @func until all the hash buckets are empty.  The passed callback @func
+ * or the previously registered callback hs->hs_put must remove the item
+ * from the hash.  You may either use the cfs_hash_del() or hlist_del()
+ * functions.  No rwlocks will be held during the callback @func, so it
+ * is safe to sleep if needed.  This function will not terminate until
+ * the hash is empty.  Note it is still possible to concurrently add new
+ * items into the hash.  It is the caller's responsibility to ensure
+ * the required locking is in place to prevent concurrent insertions.
+ */
+int
+cfs_hash_for_each_empty(cfs_hash_t *hs,
+                       cfs_hash_for_each_cb_t func, void *data)
+{
+       unsigned  i = 0;
+       ENTRY;
+
+       if (cfs_hash_with_no_lock(hs))
+               return -EOPNOTSUPP;
+
+       if (CFS_HOP(hs, get) == NULL ||
+           (CFS_HOP(hs, put) == NULL &&
+            CFS_HOP(hs, put_locked) == NULL))
+               return -EOPNOTSUPP;
+
+       cfs_hash_for_each_enter(hs);
+       while (cfs_hash_for_each_relax(hs, func, data)) {
+               CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n",
+                      hs->hs_name, i++);
+       }
+       cfs_hash_for_each_exit(hs);
+       RETURN(0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_empty);
+
+void
+cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+                       cfs_hash_for_each_cb_t func, void *data)
+{
+       struct hlist_head   *hhead;
+       struct hlist_node   *hnode;
+       cfs_hash_bd_t       bd;
+
+       cfs_hash_for_each_enter(hs);
+       cfs_hash_lock(hs, 0);
+       if (hindex >= CFS_HASH_NHLIST(hs))
+               goto out;
+
+       cfs_hash_bd_index_set(hs, hindex, &bd);
+
+       cfs_hash_bd_lock(hs, &bd, 0);
+       hhead = cfs_hash_bd_hhead(hs, &bd);
+       hlist_for_each(hnode, hhead) {
+               if (func(hs, &bd, hnode, data))
+                       break;
+       }
+       cfs_hash_bd_unlock(hs, &bd, 0);
+ out:
+       cfs_hash_unlock(hs, 0);
+       cfs_hash_for_each_exit(hs);
+}
+EXPORT_SYMBOL(cfs_hash_hlist_for_each);
+
+/*
+ * For each item in the libcfs hash @hs which matches the @key call
+ * the passed callback @func and pass to it as an argument each hash
+ * item and the private @data. During the callback the bucket lock
+ * is held so the callback must never sleep.
+ */
+void
+cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+                     cfs_hash_for_each_cb_t func, void *data)
+{
+       struct hlist_node   *hnode;
+       cfs_hash_bd_t       bds[2];
+       unsigned            i;
+
+       cfs_hash_lock(hs, 0);
+
+       cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+       cfs_hash_for_each_bd(bds, 2, i) {
+               struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]);
+
+               hlist_for_each(hnode, hlist) {
+                       cfs_hash_bucket_validate(hs, &bds[i], hnode);
+
+                       if (cfs_hash_keycmp(hs, key, hnode)) {
+                               if (func(hs, &bds[i], hnode, data))
+                                       break;
+                       }
+               }
+       }
+
+       cfs_hash_dual_bd_unlock(hs, bds, 0);
+       cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_key);
+
+/**
+ * Rehash the libcfs hash @hs to the given @bits.  This can be used
+ * to grow the hash size when excessive chaining is detected, or to
+ * shrink the hash when it is larger than needed.  When the CFS_HASH_REHASH
+ * flag is set in @hs the libcfs hash may be dynamically rehashed
+ * during addition or removal if the hash's theta value falls outside
+ * the range [hs->hs_min_theta, hs->hs_max_theta].  By default
+ * these values are tuned to keep the chained hash depth small, and
+ * this approach assumes a reasonably uniform hashing function.  The
+ * theta thresholds for @hs are tunable via cfs_hash_set_theta().
+ */
+void
+cfs_hash_rehash_cancel_locked(cfs_hash_t *hs)
+{
+       int     i;
+
+       /* need hold cfs_hash_lock(hs, 1) */
+       LASSERT(cfs_hash_with_rehash(hs) &&
+               !cfs_hash_with_no_lock(hs));
+
+       if (!cfs_hash_is_rehashing(hs))
+               return;
+
+       if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) {
+               hs->hs_rehash_bits = 0;
+               return;
+       }
+
+       for (i = 2; cfs_hash_is_rehashing(hs); i++) {
+               cfs_hash_unlock(hs, 1);
+               /* raise console warning while waiting too long */
+               CDEBUG(IS_PO2(i >> 3) ? D_WARNING : D_INFO,
+                      "hash %s is still rehashing, rescheduled %d\n",
+                      hs->hs_name, i - 1);
+               cond_resched();
+               cfs_hash_lock(hs, 1);
+       }
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked);
+
+void
+cfs_hash_rehash_cancel(cfs_hash_t *hs)
+{
+       cfs_hash_lock(hs, 1);
+       cfs_hash_rehash_cancel_locked(hs);
+       cfs_hash_unlock(hs, 1);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel);
+
+int
+cfs_hash_rehash(cfs_hash_t *hs, int do_rehash)
+{
+       int     rc;
+
+       LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs));
+
+       cfs_hash_lock(hs, 1);
+
+       rc = cfs_hash_rehash_bits(hs);
+       if (rc <= 0) {
+               cfs_hash_unlock(hs, 1);
+               return rc;
+       }
+
+       hs->hs_rehash_bits = rc;
+       if (!do_rehash) {
+               /* launch and return */
+               cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi);
+               cfs_hash_unlock(hs, 1);
+               return 0;
+       }
+
+       /* rehash right now */
+       cfs_hash_unlock(hs, 1);
+
+       return cfs_hash_rehash_worker(&hs->hs_rehash_wi);
+}
+EXPORT_SYMBOL(cfs_hash_rehash);
+
+static int
+cfs_hash_rehash_bd(cfs_hash_t *hs, cfs_hash_bd_t *old)
+{
+       cfs_hash_bd_t      new;
+       struct hlist_head  *hhead;
+       struct hlist_node  *hnode;
+       struct hlist_node  *pos;
+       void          *key;
+       int             c = 0;
+
+       /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */
+       cfs_hash_bd_for_each_hlist(hs, old, hhead) {
+               hlist_for_each_safe(hnode, pos, hhead) {
+                       key = cfs_hash_key(hs, hnode);
+                       LASSERT(key != NULL);
+                       /* Validate hnode is in the correct bucket. */
+                       cfs_hash_bucket_validate(hs, old, hnode);
+                       /*
+                        * Delete from old hash bucket; move to new bucket.
+                        * ops->hs_key must be defined.
+                        */
+                       cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+                                            hs->hs_rehash_bits, key, &new);
+                       cfs_hash_bd_move_locked(hs, old, &new, hnode);
+                       c++;
+               }
+       }
+
+       return c;
+}
+
+static int
+cfs_hash_rehash_worker(cfs_workitem_t *wi)
+{
+       cfs_hash_t       *hs = container_of(wi, cfs_hash_t, hs_rehash_wi);
+       cfs_hash_bucket_t **bkts;
+       cfs_hash_bd_t       bd;
+       unsigned int    old_size;
+       unsigned int    new_size;
+       int              bsize;
+       int              count = 0;
+       int              rc = 0;
+       int              i;
+
+       LASSERT(hs != NULL && cfs_hash_with_rehash(hs));
+
+       cfs_hash_lock(hs, 0);
+       LASSERT(cfs_hash_is_rehashing(hs));
+
+       old_size = CFS_HASH_NBKT(hs);
+       new_size = CFS_HASH_RH_NBKT(hs);
+
+       cfs_hash_unlock(hs, 0);
+
+       /*
+        * don't need hs::hs_rwlock for hs::hs_buckets,
+        * because nobody can change bkt-table except me.
+        */
+       bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets,
+                                       old_size, new_size);
+       cfs_hash_lock(hs, 1);
+       if (bkts == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       if (bkts == hs->hs_buckets) {
+               bkts = NULL; /* do nothing */
+               goto out;
+       }
+
+       rc = __cfs_hash_theta(hs);
+       if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) {
+               /* free the newly allocated bkt-table */
+               old_size = new_size;
+               new_size = CFS_HASH_NBKT(hs);
+               rc = -EALREADY;
+               goto out;
+       }
+
+       LASSERT(hs->hs_rehash_buckets == NULL);
+       hs->hs_rehash_buckets = bkts;
+
+       rc = 0;
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               if (cfs_hash_is_exiting(hs)) {
+                       rc = -ESRCH;
+                       /* someone wants to destroy the hash, abort now */
+                       if (old_size < new_size) /* OK to free old bkt-table */
+                               break;
+                       /* it's shrinking, need to free the new bkt-table */
+                       hs->hs_rehash_buckets = NULL;
+                       old_size = new_size;
+                       new_size = CFS_HASH_NBKT(hs);
+                       goto out;
+               }
+
+               count += cfs_hash_rehash_bd(hs, &bd);
+               if (count < CFS_HASH_LOOP_HOG ||
+                   cfs_hash_is_iterating(hs)) { /* need to finish ASAP */
+                       continue;
+               }
+
+               count = 0;
+               cfs_hash_unlock(hs, 1);
+               cond_resched();
+               cfs_hash_lock(hs, 1);
+       }
+
+       hs->hs_rehash_count++;
+
+       bkts = hs->hs_buckets;
+       hs->hs_buckets = hs->hs_rehash_buckets;
+       hs->hs_rehash_buckets = NULL;
+
+       hs->hs_cur_bits = hs->hs_rehash_bits;
+ out:
+       hs->hs_rehash_bits = 0;
+       if (rc == -ESRCH) /* never be scheduled again */
+               cfs_wi_exit(cfs_sched_rehash, wi);
+       bsize = cfs_hash_bkt_size(hs);
+       cfs_hash_unlock(hs, 1);
+       /* can't refer to @hs anymore because it could be destroyed */
+       if (bkts != NULL)
+               cfs_hash_buckets_free(bkts, bsize, new_size, old_size);
+       if (rc != 0)
+               CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc);
+       /* return 1 only if cfs_wi_exit is called */
+       return rc == -ESRCH;
+}
+
+/**
+ * Rehash the object referenced by @hnode in the libcfs hash @hs.  The
+ * @old_key must be provided to locate the objects previous location
+ * in the hash, and the @new_key will be used to reinsert the object.
+ * Use this function instead of a cfs_hash_add() + cfs_hash_del()
+ * combo when it is critical that there is no window in time where the
+ * object is missing from the hash.  When an object is being rehashed
+ * the registered cfs_hash_get() and cfs_hash_put() functions will
+ * not be called.
+ */
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+                        void *new_key, struct hlist_node *hnode)
+{
+       cfs_hash_bd_t   bds[3];
+       cfs_hash_bd_t   old_bds[2];
+       cfs_hash_bd_t   new_bd;
+
+       LASSERT(!hlist_unhashed(hnode));
+
+       cfs_hash_lock(hs, 0);
+
+       cfs_hash_dual_bd_get(hs, old_key, old_bds);
+       cfs_hash_bd_get(hs, new_key, &new_bd);
+
+       bds[0] = old_bds[0];
+       bds[1] = old_bds[1];
+       bds[2] = new_bd;
+
+       /* NB: bds[0] and bds[1] are ordered already */
+       cfs_hash_bd_order(&bds[1], &bds[2]);
+       cfs_hash_bd_order(&bds[0], &bds[1]);
+
+       cfs_hash_multi_bd_lock(hs, bds, 3, 1);
+       if (likely(old_bds[1].bd_bucket == NULL)) {
+               cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode);
+       } else {
+               cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode);
+               cfs_hash_bd_add_locked(hs, &new_bd, hnode);
+       }
+       /* overwrite the key inside the locks, otherwise this may race
+        * with other operations, e.g. rehash */
+       cfs_hash_keycpy(hs, new_key, hnode);
+
+       cfs_hash_multi_bd_unlock(hs, bds, 3, 1);
+       cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_key);
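+
+/*
+ * A minimal usage sketch for cfs_hash_rehash_key(): when an object's key
+ * changes, move it in one step instead of a cfs_hash_del() + cfs_hash_add()
+ * pair, so lookups never see a window where the object is absent.  The
+ * structure "my_obj" and its fields are hypothetical; only the
+ * cfs_hash_rehash_key() call itself is taken from this file.
+ *
+ *	struct my_obj {
+ *		__u64			mo_old_id;
+ *		__u64			mo_new_id;
+ *		struct hlist_node	mo_hnode;
+ *	};
+ *
+ *	static void my_obj_change_id(cfs_hash_t *hs, struct my_obj *obj,
+ *				     __u64 new_id)
+ *	{
+ *		obj->mo_new_id = new_id;
+ *		cfs_hash_rehash_key(hs, &obj->mo_old_id, &obj->mo_new_id,
+ *				    &obj->mo_hnode);
+ *	}
+ */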
+
+int cfs_hash_debug_header(char *str, int size)
+{
+       return snprintf(str, size, "%-*s%6s%6s%6s%6s%6s%6s%6s%7s%8s%8s%8s%s\n",
+                CFS_HASH_BIGNAME_LEN,
+                "name", "cur", "min", "max", "theta", "t-min", "t-max",
+                "flags", "rehash", "count", "maxdep", "maxdepb",
+                " distribution");
+}
+EXPORT_SYMBOL(cfs_hash_debug_header);
+
+static cfs_hash_bucket_t **
+cfs_hash_full_bkts(cfs_hash_t *hs)
+{
+       /* NB: caller should hold hs->hs_rwlock if REHASH is set */
+       if (hs->hs_rehash_buckets == NULL)
+               return hs->hs_buckets;
+
+       LASSERT(hs->hs_rehash_bits != 0);
+       return hs->hs_rehash_bits > hs->hs_cur_bits ?
+              hs->hs_rehash_buckets : hs->hs_buckets;
+}
+
+static unsigned int
+cfs_hash_full_nbkt(cfs_hash_t *hs)
+{
+       /* NB: caller should hold hs->hs_rwlock if REHASH is set */
+       if (hs->hs_rehash_buckets == NULL)
+               return CFS_HASH_NBKT(hs);
+
+       LASSERT(hs->hs_rehash_bits != 0);
+       return hs->hs_rehash_bits > hs->hs_cur_bits ?
+              CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs);
+}
+
+int cfs_hash_debug_str(cfs_hash_t *hs, char *str, int size)
+{
+       int                 dist[8] = { 0, };
+       int                 maxdep  = -1;
+       int                 maxdepb = -1;
+       int                 total   = 0;
+       int                 c       = 0;
+       int                 theta;
+       int                 i;
+
+       if (str == NULL || size == 0)
+               return 0;
+
+       cfs_hash_lock(hs, 0);
+       theta = __cfs_hash_theta(hs);
+
+       c += snprintf(str + c, size - c, "%-*s ",
+                     CFS_HASH_BIGNAME_LEN, hs->hs_name);
+       c += snprintf(str + c, size - c, "%5d ",  1 << hs->hs_cur_bits);
+       c += snprintf(str + c, size - c, "%5d ",  1 << hs->hs_min_bits);
+       c += snprintf(str + c, size - c, "%5d ",  1 << hs->hs_max_bits);
+       c += snprintf(str + c, size - c, "%d.%03d ",
+                     __cfs_hash_theta_int(theta),
+                     __cfs_hash_theta_frac(theta));
+       c += snprintf(str + c, size - c, "%d.%03d ",
+                     __cfs_hash_theta_int(hs->hs_min_theta),
+                     __cfs_hash_theta_frac(hs->hs_min_theta));
+       c += snprintf(str + c, size - c, "%d.%03d ",
+                     __cfs_hash_theta_int(hs->hs_max_theta),
+                     __cfs_hash_theta_frac(hs->hs_max_theta));
+       c += snprintf(str + c, size - c, " 0x%02x ", hs->hs_flags);
+       c += snprintf(str + c, size - c, "%6d ", hs->hs_rehash_count);
+
+       /*
+        * The distribution is a summary of the chained hash depth in
+        * each of the libcfs hash buckets.  Each bucket's hsb_count is
+        * divided by the hash theta value and used to generate a
+        * histogram of the hash distribution.  A uniform hash will
+        * result in all hash buckets being close to the average thus
+        * only the first few entries in the histogram will be non-zero.
+        * If your hash function results in a non-uniform hash, this will
+        * be observable as outlier buckets in the distribution histogram.
+        *
+        * Uniform hash distribution:      128/128/0/0/0/0/0/0
+        * Non-Uniform hash distribution:  128/125/0/0/0/0/2/1
+        */
+       for (i = 0; i < cfs_hash_full_nbkt(hs); i++) {
+               cfs_hash_bd_t  bd;
+
+               bd.bd_bucket = cfs_hash_full_bkts(hs)[i];
+               cfs_hash_bd_lock(hs, &bd, 0);
+               if (maxdep < bd.bd_bucket->hsb_depmax) {
+                       maxdep  = bd.bd_bucket->hsb_depmax;
+                       maxdepb = ffz(~maxdep);
+               }
+               total += bd.bd_bucket->hsb_count;
+               dist[min(__cfs_fls(bd.bd_bucket->hsb_count/max(theta,1)),7)]++;
+               cfs_hash_bd_unlock(hs, &bd, 0);
+       }
+
+       c += snprintf(str + c, size - c, "%7d ", total);
+       c += snprintf(str + c, size - c, "%7d ", maxdep);
+       c += snprintf(str + c, size - c, "%7d ", maxdepb);
+       for (i = 0; i < 8; i++)
+               c += snprintf(str + c, size - c, "%d%c",  dist[i],
+                             (i == 7) ? '\n' : '/');
+
+       cfs_hash_unlock(hs, 0);
+
+       return c;
+}
+EXPORT_SYMBOL(cfs_hash_debug_str);
diff --git a/drivers/staging/lustre/lustre/libcfs/heap.c b/drivers/staging/lustre/lustre/libcfs/heap.c
new file mode 100644 (file)
index 0000000..147e4fe
--- /dev/null
@@ -0,0 +1,475 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/libcfs/heap.c
+ *
+ * Author: Eric Barton <eeb@whamcloud.com>
+ *        Liang Zhen   <liang@whamcloud.com>
+ */
+/** \addtogroup heap
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define CBH_ALLOC(ptr, h)                                              \
+do {                                                                   \
+       if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW)                      \
+               LIBCFS_CPT_ALLOC_GFP((ptr), h->cbh_cptab, h->cbh_cptid, \
+                                    CBH_NOB, GFP_ATOMIC);      \
+       else                                                            \
+               LIBCFS_CPT_ALLOC((ptr), h->cbh_cptab, h->cbh_cptid,     \
+                                CBH_NOB);                              \
+} while (0)
+
+#define CBH_FREE(ptr)  LIBCFS_FREE(ptr, CBH_NOB)
+
+/**
+ * Grows the capacity of a binary heap so that it can handle a larger number of
+ * \e cfs_binheap_node_t objects.
+ *
+ * \param[in] h The binary heap
+ *
+ * \retval 0      Successfully grew the heap
+ * \retval -ENOMEM OOM error
+ */
+static int
+cfs_binheap_grow(cfs_binheap_t *h)
+{
+       cfs_binheap_node_t ***frag1 = NULL;
+       cfs_binheap_node_t  **frag2;
+       int hwm = h->cbh_hwm;
+
+       /* need a whole new chunk of pointers */
+       LASSERT((h->cbh_hwm & CBH_MASK) == 0);
+
+       if (hwm == 0) {
+               /* first use of single indirect */
+               CBH_ALLOC(h->cbh_elements1, h);
+               if (h->cbh_elements1 == NULL)
+                       return -ENOMEM;
+
+               goto out;
+       }
+
+       hwm -= CBH_SIZE;
+       if (hwm < CBH_SIZE * CBH_SIZE) {
+               /* not filled double indirect */
+               CBH_ALLOC(frag2, h);
+               if (frag2 == NULL)
+                       return -ENOMEM;
+
+               if (hwm == 0) {
+                       /* first use of double indirect */
+                       CBH_ALLOC(h->cbh_elements2, h);
+                       if (h->cbh_elements2 == NULL) {
+                               CBH_FREE(frag2);
+                               return -ENOMEM;
+                       }
+               }
+
+               h->cbh_elements2[hwm >> CBH_SHIFT] = frag2;
+               goto out;
+       }
+
+       hwm -= CBH_SIZE * CBH_SIZE;
+#if (CBH_SHIFT * 3 < 32)
+       if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) {
+               /* filled triple indirect */
+               return -ENOMEM;
+       }
+#endif
+       CBH_ALLOC(frag2, h);
+       if (frag2 == NULL)
+               return -ENOMEM;
+
+       if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) {
+               /* first use of this 2nd level index */
+               CBH_ALLOC(frag1, h);
+               if (frag1 == NULL) {
+                       CBH_FREE(frag2);
+                       return -ENOMEM;
+               }
+       }
+
+       if (hwm == 0) {
+               /* first use of triple indirect */
+               CBH_ALLOC(h->cbh_elements3, h);
+               if (h->cbh_elements3 == NULL) {
+                       CBH_FREE(frag2);
+                       CBH_FREE(frag1);
+                       return -ENOMEM;
+               }
+       }
+
+       if (frag1 != NULL) {
+               LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL);
+               h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1;
+       } else {
+               frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)];
+               LASSERT(frag1 != NULL);
+       }
+
+       frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2;
+
+ out:
+       h->cbh_hwm += CBH_SIZE;
+       return 0;
+}
+
+/**
+ * Creates and initializes a binary heap instance.
+ *
+ * \param[in] ops   The operations to be used
+ * \param[in] flags The heap flags
+ * \param[in] count The initial heap capacity in # of elements
+ * \param[in] arg   An optional private argument
+ * \param[in] cptab The CPT table this heap instance will operate over
+ * \param[in] cptid The CPT id of \a cptab this heap instance will operate over
+ *
+ * \retval valid-pointer A newly-created and initialized binary heap object
+ * \retval NULL                 error
+ */
+cfs_binheap_t *
+cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags,
+                  unsigned count, void *arg, struct cfs_cpt_table *cptab,
+                  int cptid)
+{
+       cfs_binheap_t *h;
+
+       LASSERT(ops != NULL);
+       LASSERT(ops->hop_compare != NULL);
+       LASSERT(cptab != NULL);
+       LASSERT(cptid == CFS_CPT_ANY ||
+              (cptid >= 0 && cptid < cptab->ctb_nparts));
+
+       LIBCFS_CPT_ALLOC(h, cptab, cptid, sizeof(*h));
+       if (h == NULL)
+               return NULL;
+
+       h->cbh_ops        = ops;
+       h->cbh_nelements  = 0;
+       h->cbh_hwm        = 0;
+       h->cbh_private    = arg;
+       h->cbh_flags      = flags & (~CBH_FLAG_ATOMIC_GROW);
+       h->cbh_cptab      = cptab;
+       h->cbh_cptid      = cptid;
+
+       while (h->cbh_hwm < count) { /* preallocate */
+               if (cfs_binheap_grow(h) != 0) {
+                       cfs_binheap_destroy(h);
+                       return NULL;
+               }
+       }
+
+       h->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW;
+
+       return h;
+}
+EXPORT_SYMBOL(cfs_binheap_create);
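+
+/*
+ * A minimal sketch, under stated assumptions, of how a caller could set up
+ * cfs_binheap_ops_t and create a heap over the global CPT table.  The
+ * element type, its field names and the deadline ordering are hypothetical;
+ * hop_compare/hop_enter/hop_exit and cfs_binheap_create() are from this
+ * file.  hop_compare() must return non-zero when its first argument should
+ * sort closer to the root than its second.
+ *
+ *	struct my_elem {
+ *		cfs_binheap_node_t	me_node;
+ *		__u64			me_deadline;
+ *	};
+ *
+ *	static int my_elem_compare(cfs_binheap_node_t *a, cfs_binheap_node_t *b)
+ *	{
+ *		struct my_elem *ea = container_of(a, struct my_elem, me_node);
+ *		struct my_elem *eb = container_of(b, struct my_elem, me_node);
+ *
+ *		(min-heap: the earliest deadline wins)
+ *		return ea->me_deadline < eb->me_deadline;
+ *	}
+ *
+ *	static cfs_binheap_ops_t my_heap_ops = {
+ *		.hop_enter   = NULL,
+ *		.hop_exit    = NULL,
+ *		.hop_compare = my_elem_compare,
+ *	};
+ *
+ *	heap = cfs_binheap_create(&my_heap_ops, 0, 1024, NULL,
+ *				  cfs_cpt_table, CFS_CPT_ANY);
+ */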
+
+/**
+ * Releases all resources associated with a binary heap instance.
+ *
+ * Deallocates memory for all indirection levels and the binary heap object
+ * itself.
+ *
+ * \param[in] h The binary heap object
+ */
+void
+cfs_binheap_destroy(cfs_binheap_t *h)
+{
+       int idx0;
+       int idx1;
+       int n;
+
+       LASSERT(h != NULL);
+
+       n = h->cbh_hwm;
+
+       if (n > 0) {
+               CBH_FREE(h->cbh_elements1);
+               n -= CBH_SIZE;
+       }
+
+       if (n > 0) {
+               for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) {
+                       CBH_FREE(h->cbh_elements2[idx0]);
+                       n -= CBH_SIZE;
+               }
+
+               CBH_FREE(h->cbh_elements2);
+       }
+
+       if (n > 0) {
+               for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) {
+
+                       for (idx1 = 0; idx1 < CBH_SIZE && n > 0; idx1++) {
+                               CBH_FREE(h->cbh_elements3[idx0][idx1]);
+                               n -= CBH_SIZE;
+                       }
+
+                       CBH_FREE(h->cbh_elements3[idx0]);
+               }
+
+               CBH_FREE(h->cbh_elements3);
+       }
+
+       LIBCFS_FREE(h, sizeof(*h));
+}
+EXPORT_SYMBOL(cfs_binheap_destroy);
+
+/**
+ * Obtains a double pointer to a heap element, given its index into the binary
+ * tree.
+ *
+ * \param[in] h          The binary heap instance
+ * \param[in] idx The requested node's index
+ *
+ * \retval valid-pointer A double pointer to a heap pointer entry
+ */
+static cfs_binheap_node_t **
+cfs_binheap_pointer(cfs_binheap_t *h, unsigned int idx)
+{
+       if (idx < CBH_SIZE)
+               return &(h->cbh_elements1[idx]);
+
+       idx -= CBH_SIZE;
+       if (idx < CBH_SIZE * CBH_SIZE)
+               return &(h->cbh_elements2[idx >> CBH_SHIFT][idx & CBH_MASK]);
+
+       idx -= CBH_SIZE * CBH_SIZE;
+       return &(h->cbh_elements3[idx >> (2 * CBH_SHIFT)]
+                                [(idx >> CBH_SHIFT) & CBH_MASK]
+                                [idx & CBH_MASK]);
+}
+
+/**
+ * Obtains a pointer to a heap element, given its index into the binary tree.
+ *
+ * \param[in] h          The binary heap
+ * \param[in] idx The requested node's index
+ *
+ * \retval valid-pointer The requested heap node
+ * \retval NULL                 Supplied index is out of bounds
+ */
+cfs_binheap_node_t *
+cfs_binheap_find(cfs_binheap_t *h, unsigned int idx)
+{
+       if (idx >= h->cbh_nelements)
+               return NULL;
+
+       return *cfs_binheap_pointer(h, idx);
+}
+EXPORT_SYMBOL(cfs_binheap_find);
+
+/**
+ * Moves a node upwards, towards the root of the binary tree.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 1 The position of \a e in the tree was changed at least once
+ * \retval 0 The position of \a e in the tree was not changed
+ */
+static int
+cfs_binheap_bubble(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+       unsigned int         cur_idx = e->chn_index;
+       cfs_binheap_node_t **cur_ptr;
+       unsigned int         parent_idx;
+       cfs_binheap_node_t **parent_ptr;
+       int                  did_sth = 0;
+
+       cur_ptr = cfs_binheap_pointer(h, cur_idx);
+       LASSERT(*cur_ptr == e);
+
+       while (cur_idx > 0) {
+               parent_idx = (cur_idx - 1) >> 1;
+
+               parent_ptr = cfs_binheap_pointer(h, parent_idx);
+               LASSERT((*parent_ptr)->chn_index == parent_idx);
+
+               if (h->cbh_ops->hop_compare(*parent_ptr, e))
+                       break;
+
+               (*parent_ptr)->chn_index = cur_idx;
+               *cur_ptr = *parent_ptr;
+               cur_ptr = parent_ptr;
+               cur_idx = parent_idx;
+               did_sth = 1;
+       }
+
+       e->chn_index = cur_idx;
+       *cur_ptr = e;
+
+       return did_sth;
+}
+
+/**
+ * Moves a node downwards, towards the last level of the binary tree.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 1 The position of \a e in the tree was changed at least once
+ * \retval 0 The position of \a e in the tree was not changed
+ */
+static int
+cfs_binheap_sink(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+       unsigned int         n = h->cbh_nelements;
+       unsigned int         child_idx;
+       cfs_binheap_node_t **child_ptr;
+       cfs_binheap_node_t  *child;
+       unsigned int         child2_idx;
+       cfs_binheap_node_t **child2_ptr;
+       cfs_binheap_node_t  *child2;
+       unsigned int         cur_idx;
+       cfs_binheap_node_t **cur_ptr;
+       int                  did_sth = 0;
+
+       cur_idx = e->chn_index;
+       cur_ptr = cfs_binheap_pointer(h, cur_idx);
+       LASSERT(*cur_ptr == e);
+
+       while (cur_idx < n) {
+               child_idx = (cur_idx << 1) + 1;
+               if (child_idx >= n)
+                       break;
+
+               child_ptr = cfs_binheap_pointer(h, child_idx);
+               child = *child_ptr;
+
+               child2_idx = child_idx + 1;
+               if (child2_idx < n) {
+                       child2_ptr = cfs_binheap_pointer(h, child2_idx);
+                       child2 = *child2_ptr;
+
+                       if (h->cbh_ops->hop_compare(child2, child)) {
+                               child_idx = child2_idx;
+                               child_ptr = child2_ptr;
+                               child = child2;
+                       }
+               }
+
+               LASSERT(child->chn_index == child_idx);
+
+               if (h->cbh_ops->hop_compare(e, child))
+                       break;
+
+               child->chn_index = cur_idx;
+               *cur_ptr = child;
+               cur_ptr = child_ptr;
+               cur_idx = child_idx;
+               did_sth = 1;
+       }
+
+       e->chn_index = cur_idx;
+       *cur_ptr = e;
+
+       return did_sth;
+}
+
+/**
+ * Sort-inserts a node into the binary heap.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 0   Element inserted successfully
+ * \retval != 0 error
+ */
+int
+cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+       cfs_binheap_node_t **new_ptr;
+       unsigned int         new_idx = h->cbh_nelements;
+       int                  rc;
+
+       if (new_idx == h->cbh_hwm) {
+               rc = cfs_binheap_grow(h);
+               if (rc != 0)
+                       return rc;
+       }
+
+       if (h->cbh_ops->hop_enter) {
+               rc = h->cbh_ops->hop_enter(h, e);
+               if (rc != 0)
+                       return rc;
+       }
+
+       e->chn_index = new_idx;
+       new_ptr = cfs_binheap_pointer(h, new_idx);
+       h->cbh_nelements++;
+       *new_ptr = e;
+
+       cfs_binheap_bubble(h, e);
+
+       return 0;
+}
+EXPORT_SYMBOL(cfs_binheap_insert);
+
+/**
+ * Removes a node from the binary heap.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ */
+void
+cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+       unsigned int         n = h->cbh_nelements;
+       unsigned int         cur_idx = e->chn_index;
+       cfs_binheap_node_t **cur_ptr;
+       cfs_binheap_node_t  *last;
+
+       LASSERT(cur_idx != CBH_POISON);
+       LASSERT(cur_idx < n);
+
+       cur_ptr = cfs_binheap_pointer(h, cur_idx);
+       LASSERT(*cur_ptr == e);
+
+       n--;
+       last = *cfs_binheap_pointer(h, n);
+       h->cbh_nelements = n;
+       if (last == e)
+               return;
+
+       last->chn_index = cur_idx;
+       *cur_ptr = last;
+       if (!cfs_binheap_bubble(h, *cur_ptr))
+               cfs_binheap_sink(h, *cur_ptr);
+
+       e->chn_index = CBH_POISON;
+       if (h->cbh_ops->hop_exit)
+               h->cbh_ops->hop_exit(h, e);
+}
+EXPORT_SYMBOL(cfs_binheap_remove);
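+
+/*
+ * Continuing the sketch above cfs_binheap_create(): the root is always at
+ * index 0, so cfs_binheap_find(heap, 0) yields the highest-priority node,
+ * which can then be detached with cfs_binheap_remove().  Everything except
+ * the cfs_binheap_*() calls is hypothetical.
+ *
+ *	rc = cfs_binheap_insert(heap, &elem->me_node);
+ *
+ *	while ((node = cfs_binheap_find(heap, 0)) != NULL) {
+ *		struct my_elem *e = container_of(node, struct my_elem, me_node);
+ *
+ *		cfs_binheap_remove(heap, node);
+ *		my_elem_expire(e);
+ *	}
+ *
+ *	cfs_binheap_destroy(heap);
+ */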
+
+/** @} heap */
diff --git a/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
new file mode 100644 (file)
index 0000000..c152223
--- /dev/null
@@ -0,0 +1,336 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * Kernel <-> userspace communication routines.
+ * Using pipes for all arches.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_KUC D_OTHER
+
+#include <linux/libcfs/libcfs.h>
+
+#ifdef LUSTRE_UTILS
+/* This is the userspace side. */
+
+/** Start the userspace side of a KUC pipe.
+ * @param link Private descriptor for pipe/socket.
+ * @param group KUC broadcast group to listen to
+ *       (can be 0 for unicast to this pid)
+ */
+int libcfs_ukuc_start(lustre_kernelcomm *link, int group)
+{
+       int pfd[2];
+
+       if (pipe(pfd) < 0)
+               return -errno;
+
+       memset(link, 0, sizeof(*link));
+       link->lk_rfd = pfd[0];
+       link->lk_wfd = pfd[1];
+       link->lk_group = group;
+       link->lk_uid = getpid();
+       return 0;
+}
+
+int libcfs_ukuc_stop(lustre_kernelcomm *link)
+{
+       if (link->lk_wfd > 0)
+               close(link->lk_wfd);
+       return close(link->lk_rfd);
+}
+
+#define lhsz sizeof(*kuch)
+
+/** Read a message from the link.
+ * The message is read into the caller-supplied buffer.
+ *
+ * @param link Private descriptor for pipe/socket.
+ * @param buf Buffer to read into, must include size for kuc_hdr
+ * @param maxsize Maximum message size allowed
+ * @param transport Only listen to messages on this transport
+ *      (and the generic transport)
+ */
+int libcfs_ukuc_msg_get(lustre_kernelcomm *link, char *buf, int maxsize,
+                       int transport)
+{
+       struct kuc_hdr *kuch;
+       int rc = 0;
+
+       memset(buf, 0, maxsize);
+
+       CDEBUG(D_KUC, "Waiting for message from kernel on fd %d\n",
+              link->lk_rfd);
+
+       while (1) {
+               /* Read header first to get message size */
+               rc = read(link->lk_rfd, buf, lhsz);
+               if (rc <= 0) {
+                       rc = -errno;
+                       break;
+               }
+               kuch = (struct kuc_hdr *)buf;
+
+               CDEBUG(D_KUC, "Received message mg=%x t=%d m=%d l=%d\n",
+                      kuch->kuc_magic, kuch->kuc_transport, kuch->kuc_msgtype,
+                      kuch->kuc_msglen);
+
+               if (kuch->kuc_magic != KUC_MAGIC) {
+                       CERROR("bad message magic %x != %x\n",
+                              kuch->kuc_magic, KUC_MAGIC);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (kuch->kuc_msglen > maxsize) {
+                       rc = -EMSGSIZE;
+                       break;
+               }
+
+               /* Read payload */
+               rc = read(link->lk_rfd, buf + lhsz, kuch->kuc_msglen - lhsz);
+               if (rc < 0) {
+                       rc = -errno;
+                       break;
+               }
+               if (rc < (kuch->kuc_msglen - lhsz)) {
+                       CERROR("short read: got %d of %d bytes\n",
+                              rc, kuch->kuc_msglen);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (kuch->kuc_transport == transport ||
+                   kuch->kuc_transport == KUC_TRANSPORT_GENERIC) {
+                       return 0;
+               }
+               /* Drop messages for other transports */
+       }
+       return rc;
+}
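+
+/*
+ * A minimal userspace sketch, assuming the caller wants to listen on
+ * broadcast group 1 for generic-transport messages.  The buffer size, the
+ * group number, handle_message() and the way lk_wfd reaches the kernel
+ * (presumably via a registration ioctl) are illustrative assumptions; the
+ * libcfs_ukuc_*() calls and lustre_kernelcomm come from this file.
+ *
+ *	lustre_kernelcomm kuc;
+ *	char buf[8192];
+ *	int rc;
+ *
+ *	rc = libcfs_ukuc_start(&kuc, 1);
+ *	if (rc < 0)
+ *		return rc;
+ *
+ *	(pass kuc.lk_wfd to the kernel side, then block on the read side:)
+ *	rc = libcfs_ukuc_msg_get(&kuc, buf, sizeof(buf),
+ *				 KUC_TRANSPORT_GENERIC);
+ *	if (rc == 0)
+ *		handle_message((struct kuc_hdr *)buf);
+ *
+ *	libcfs_ukuc_stop(&kuc);
+ */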
+
+#else /* LUSTRE_UTILS */
+/* This is the kernel side (liblustre as well). */
+
+/**
+ * libcfs_kkuc_msg_put - send a message from kernel to userspace
+ * @param filp file pointer (pipe) to send the message to
+ * @param payload Payload data.  First field of payload is always
+ *   struct kuc_hdr
+ */
+int libcfs_kkuc_msg_put(struct file *filp, void *payload)
+{
+       struct kuc_hdr *kuch = (struct kuc_hdr *)payload;
+       int rc = -ENOSYS;
+
+       if (filp == NULL || IS_ERR(filp))
+               return -EBADF;
+
+       if (kuch->kuc_magic != KUC_MAGIC) {
+               CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic);
+               return -ENOSYS;
+       }
+
+       {
+               loff_t offset = 0;
+               rc = filp_user_write(filp, payload, kuch->kuc_msglen,
+                                    &offset);
+       }
+
+       if (rc < 0)
+               CWARN("message send failed (%d)\n", rc);
+       else
+               CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp);
+
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_msg_put);
+
+/* Broadcast groups are global across all mounted filesystems;
+ * i.e. registering for a group on 1 fs will get messages for that
+ * group from any fs */
+/** A single group registration has a uid and a file pointer */
+struct kkuc_reg {
+       struct list_head        kr_chain;
+       int             kr_uid;
+       struct file     *kr_fp;
+       __u32           kr_data;
+};
+static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {};
+/* Protect message sending against remove and adds */
+static DECLARE_RWSEM(kg_sem);
+
+/** Add a receiver to a broadcast group
+ * @param filp pipe to write into
+ * @param uid identifier for this receiver
+ * @param group group number
+ */
+int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data)
+{
+       struct kkuc_reg *reg;
+
+       if (group > KUC_GRP_MAX) {
+               CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+               return -EINVAL;
+       }
+
+       /* fput in group_rem */
+       if (filp == NULL)
+               return -EBADF;
+
+       /* freed in group_rem */
+       reg = kmalloc(sizeof(*reg), 0);
+       if (reg == NULL)
+               return -ENOMEM;
+
+       reg->kr_fp = filp;
+       reg->kr_uid = uid;
+       reg->kr_data = data;
+
+       down_write(&kg_sem);
+       if (kkuc_groups[group].next == NULL)
+               INIT_LIST_HEAD(&kkuc_groups[group]);
+       list_add(&reg->kr_chain, &kkuc_groups[group]);
+       up_write(&kg_sem);
+
+       CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group);
+
+       return 0;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_add);
+
+int libcfs_kkuc_group_rem(int uid, int group)
+{
+       struct kkuc_reg *reg, *next;
+       ENTRY;
+
+       if (kkuc_groups[group].next == NULL)
+               RETURN(0);
+
+       if (uid == 0) {
+               /* Broadcast a shutdown message */
+               struct kuc_hdr lh;
+
+               lh.kuc_magic = KUC_MAGIC;
+               lh.kuc_transport = KUC_TRANSPORT_GENERIC;
+               lh.kuc_msgtype = KUC_MSG_SHUTDOWN;
+               lh.kuc_msglen = sizeof(lh);
+               libcfs_kkuc_group_put(group, &lh);
+       }
+
+       down_write(&kg_sem);
+       list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) {
+               if ((uid == 0) || (uid == reg->kr_uid)) {
+                       list_del(&reg->kr_chain);
+                       CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n",
+                              reg->kr_uid, reg->kr_fp, group);
+                       if (reg->kr_fp != NULL)
+                               fput(reg->kr_fp);
+                       kfree(reg);
+               }
+       }
+       up_write(&kg_sem);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_rem);
+
+int libcfs_kkuc_group_put(int group, void *payload)
+{
+       struct kkuc_reg *reg;
+       int              rc = 0;
+       int one_success = 0;
+       ENTRY;
+
+       down_read(&kg_sem);
+       list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+               if (reg->kr_fp != NULL) {
+                       rc = libcfs_kkuc_msg_put(reg->kr_fp, payload);
+                       if (rc == 0)
+                               one_success = 1;
+                       else if (rc == -EPIPE) {
+                               fput(reg->kr_fp);
+                               reg->kr_fp = NULL;
+                       }
+               }
+       }
+       up_read(&kg_sem);
+
+       /* don't return an error if the message has been delivered
+        * to at least one agent */
+       if (one_success)
+               rc = 0;
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_put);
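+
+/*
+ * A minimal kernel-side sketch: register a pipe into a broadcast group,
+ * later push a message to every registered reader, and finally drop the
+ * registration.  The group number, uid and the header-only message are
+ * illustrative; the libcfs_kkuc_group_*() calls and struct kuc_hdr are
+ * from this file (a real payload starts with a kuc_hdr followed by data).
+ *
+ *	rc = libcfs_kkuc_group_add(filp, uid, 1, 0);
+ *
+ *	struct kuc_hdr hdr = {
+ *		.kuc_magic     = KUC_MAGIC,
+ *		.kuc_transport = KUC_TRANSPORT_GENERIC,
+ *		.kuc_msgtype   = KUC_MSG_SHUTDOWN,
+ *		.kuc_msglen    = sizeof(hdr),
+ *	};
+ *	rc = libcfs_kkuc_group_put(1, &hdr);
+ *
+ *	rc = libcfs_kkuc_group_rem(uid, 1);
+ */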
+
+/**
+ * Calls a callback function for each link of the given kuc group.
+ * @param group the group to call the function on.
+ * @param cb_func the function to be called.
+ * @param cb_arg extra argument to be passed to the callback function.
+ */
+int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+                             void *cb_arg)
+{
+       struct kkuc_reg *reg;
+       int rc = 0;
+       ENTRY;
+
+       if (group > KUC_GRP_MAX) {
+               CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+               RETURN(-EINVAL);
+       }
+
+       /* no link for this group */
+       if (kkuc_groups[group].next == NULL)
+               RETURN(0);
+
+       down_read(&kg_sem);
+       list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+               if (reg->kr_fp != NULL) {
+                       rc = cb_func(reg->kr_data, cb_arg);
+               }
+       }
+       up_read(&kg_sem);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_foreach);
+
+#endif /* LUSTRE_UTILS */
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
new file mode 100644 (file)
index 0000000..8e88eb5
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/** Global CPU partition table */
+struct cfs_cpt_table   *cfs_cpt_table __read_mostly = NULL;
+EXPORT_SYMBOL(cfs_cpt_table);
+
+#ifndef HAVE_LIBCFS_CPT
+
+#define CFS_CPU_VERSION_MAGIC     0xbabecafe
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+       struct cfs_cpt_table *cptab;
+
+       if (ncpt != 1) {
+               CERROR("Can't support cpu partition number %d\n", ncpt);
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(cptab, sizeof(*cptab));
+       if (cptab != NULL) {
+               cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+               cptab->ctb_nparts  = ncpt;
+       }
+
+       return cptab;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+       LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC);
+
+       LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+void
+cfs_cpu_fini(void)
+{
+       if (cfs_cpt_table != NULL) {
+               cfs_cpt_table_free(cfs_cpt_table);
+               cfs_cpt_table = NULL;
+       }
+}
+
+int
+cfs_cpu_init(void)
+{
+       cfs_cpt_table = cfs_cpt_table_alloc(1);
+
+       return cfs_cpt_table != NULL ? 0 : -1;
+}
+
+#endif /* HAVE_LIBCFS_CPT */
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
new file mode 100644 (file)
index 0000000..8d6c4ad
--- /dev/null
@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+
+/** destroy cpu-partition lock, see libcfs_private.h for more detail */
+void
+cfs_percpt_lock_free(struct cfs_percpt_lock *pcl)
+{
+       LASSERT(pcl->pcl_locks != NULL);
+       LASSERT(!pcl->pcl_locked);
+
+       cfs_percpt_free(pcl->pcl_locks);
+       LIBCFS_FREE(pcl, sizeof(*pcl));
+}
+EXPORT_SYMBOL(cfs_percpt_lock_free);
+
+/**
+ * create cpu-partition lock, see libcfs_private.h for more detail.
+ *
+ * cpu-partition locks are designed for large-scale SMP systems, so we need
+ * to reduce cacheline conflicts as much as we can; that is why we always
+ * allocate cacheline-aligned memory blocks.
+ */
+struct cfs_percpt_lock *
+cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab)
+{
+       struct cfs_percpt_lock  *pcl;
+       spinlock_t              *lock;
+       int                     i;
+
+       /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */
+       LIBCFS_ALLOC(pcl, sizeof(*pcl));
+       if (pcl == NULL)
+               return NULL;
+
+       pcl->pcl_cptab = cptab;
+       pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock));
+       if (pcl->pcl_locks == NULL) {
+               LIBCFS_FREE(pcl, sizeof(*pcl));
+               return NULL;
+       }
+
+       cfs_percpt_for_each(lock, i, pcl->pcl_locks)
+               spin_lock_init(lock);
+
+       return pcl;
+}
+EXPORT_SYMBOL(cfs_percpt_lock_alloc);
+
+/**
+ * lock a CPU partition
+ *
+ * \a index != CFS_PERCPT_LOCK_EX
+ *     hold private lock indexed by \a index
+ *
+ * \a index == CFS_PERCPT_LOCK_EX
+ *     exclusively lock @pcl and nobody can take private lock
+ */
+void
+cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index)
+{
+       int     ncpt = cfs_cpt_number(pcl->pcl_cptab);
+       int     i;
+
+       LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt);
+
+       if (ncpt == 1) {
+               index = 0;
+       } else { /* serialize with exclusive lock */
+               while (pcl->pcl_locked)
+                       cpu_relax();
+       }
+
+       if (likely(index != CFS_PERCPT_LOCK_EX)) {
+               spin_lock(pcl->pcl_locks[index]);
+               return;
+       }
+
+       /* exclusive lock request */
+       for (i = 0; i < ncpt; i++) {
+               spin_lock(pcl->pcl_locks[i]);
+               if (i == 0) {
+                       LASSERT(!pcl->pcl_locked);
+                       /* nobody should take a private lock after this,
+                        * so we won't starve for too long */
+                       pcl->pcl_locked = 1;
+               }
+       }
+}
+EXPORT_SYMBOL(cfs_percpt_lock);
+
+/** unlock a CPU partition */
+void
+cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index)
+{
+       int     ncpt = cfs_cpt_number(pcl->pcl_cptab);
+       int     i;
+
+       index = ncpt == 1 ? 0 : index;
+
+       if (likely(index != CFS_PERCPT_LOCK_EX)) {
+               spin_unlock(pcl->pcl_locks[index]);
+               return;
+       }
+
+       for (i = ncpt - 1; i >= 0; i--) {
+               if (i == 0) {
+                       LASSERT(pcl->pcl_locked);
+                       pcl->pcl_locked = 0;
+               }
+               spin_unlock(pcl->pcl_locks[i]);
+       }
+}
+EXPORT_SYMBOL(cfs_percpt_unlock);
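+
+/*
+ * A minimal sketch of the intended locking pattern: common paths take only
+ * the private lock of their own CPU partition, while rare global paths take
+ * the exclusive lock.  "my_data" and the surrounding code are hypothetical;
+ * the cfs_percpt_lock()/cfs_percpt_unlock() calls, CFS_PERCPT_LOCK_EX and
+ * cfs_cpt_current()/cfs_cpt_number() are existing interfaces.
+ *
+ *	struct cfs_percpt_lock *pcl = cfs_percpt_lock_alloc(cptab);
+ *	int cpt = cfs_cpt_current(cptab, 0);
+ *
+ *	(fast, per-partition path)
+ *	cfs_percpt_lock(pcl, cpt);
+ *	my_data[cpt]++;
+ *	cfs_percpt_unlock(pcl, cpt);
+ *
+ *	(slow, global path)
+ *	cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
+ *	for (i = 0; i < cfs_cpt_number(cptab); i++)
+ *		total += my_data[i];
+ *	cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
+ */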
+
+
+/** free cpu-partition refcount */
+void
+cfs_percpt_atomic_free(atomic_t **refs)
+{
+       cfs_percpt_free(refs);
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_free);
+
+/** allocate cpu-partition refcount with initial value @init_val */
+atomic_t **
+cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val)
+{
+       atomic_t        **refs;
+       atomic_t        *ref;
+       int             i;
+
+       refs = cfs_percpt_alloc(cptab, sizeof(*ref));
+       if (refs == NULL)
+               return NULL;
+
+       cfs_percpt_for_each(ref, i, refs)
+               atomic_set(ref, init_val);
+       return refs;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_alloc);
+
+/** return sum of cpu-partition refs */
+int
+cfs_percpt_atomic_summary(atomic_t **refs)
+{
+       atomic_t        *ref;
+       int             i;
+       int             val = 0;
+
+       cfs_percpt_for_each(ref, i, refs)
+               val += atomic_read(ref);
+
+       return val;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_summary);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
new file mode 100644 (file)
index 0000000..8791373
--- /dev/null
@@ -0,0 +1,205 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+struct cfs_var_array {
+       unsigned int            va_count;       /* # of buffers */
+       unsigned int            va_size;        /* size of each var */
+       struct cfs_cpt_table    *va_cptab;      /* cpu partition table */
+       void                    *va_ptrs[0];    /* buffer addresses */
+};
+
+/*
+ * free per-cpu data, see cfs_percpt_alloc for more detail
+ */
+void
+cfs_percpt_free(void *vars)
+{
+       struct  cfs_var_array *arr;
+       int     i;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+       for (i = 0; i < arr->va_count; i++) {
+               if (arr->va_ptrs[i] != NULL)
+                       LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+       }
+
+       LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+                                 va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_percpt_free);
+
+/*
+ * allocate per cpu-partition variables; the returned value is an array of
+ * pointers that can be indexed by CPU partition ID, i.e.:
+ *
+ *     arr = cfs_percpt_alloc(cfs_cpu_pt, size);
+ *     then caller can access memory block for CPU 0 by arr[0],
+ *     memory block for CPU 1 by arr[1]...
+ *     memory block for CPU N by arr[N]...
+ *
+ * cacheline aligned.
+ */
+void *
+cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size)
+{
+       struct cfs_var_array    *arr;
+       int                     count;
+       int                     i;
+
+       count = cfs_cpt_number(cptab);
+
+       LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+       if (arr == NULL)
+               return NULL;
+
+       arr->va_size    = size = L1_CACHE_ALIGN(size);
+       arr->va_count   = count;
+       arr->va_cptab   = cptab;
+
+       for (i = 0; i < count; i++) {
+               LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size);
+               if (arr->va_ptrs[i] == NULL) {
+                       cfs_percpt_free((void *)&arr->va_ptrs[0]);
+                       return NULL;
+               }
+       }
+
+       return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_percpt_alloc);
+
+/*
+ * return number of CPUs (or number of elements in per-cpu data)
+ * according to cptab of @vars
+ */
+int
+cfs_percpt_number(void *vars)
+{
+       struct cfs_var_array *arr;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+       return arr->va_count;
+}
+EXPORT_SYMBOL(cfs_percpt_number);
+
+/*
+ * return memory block shadowed from current CPU
+ */
+void *
+cfs_percpt_current(void *vars)
+{
+       struct cfs_var_array *arr;
+       int    cpt;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+       cpt = cfs_cpt_current(arr->va_cptab, 0);
+       if (cpt < 0)
+               return NULL;
+
+       return arr->va_ptrs[cpt];
+}
+EXPORT_SYMBOL(cfs_percpt_current);
+
+void *
+cfs_percpt_index(void *vars, int idx)
+{
+       struct cfs_var_array *arr;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+       LASSERT(idx >= 0 && idx < arr->va_count);
+       return arr->va_ptrs[idx];
+}
+EXPORT_SYMBOL(cfs_percpt_index);
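+
+/*
+ * A minimal sketch combining the per-CPT helpers above: allocate one
+ * counter per CPU partition, bump the current partition's copy without
+ * cross-CPU contention, then sum all copies.  "struct my_counter" is
+ * hypothetical; the cfs_percpt_*() calls are from this file.
+ *
+ *	struct my_counter { __u64 mc_count; };
+ *	struct my_counter **counters;
+ *	struct my_counter *c;
+ *	__u64 total = 0;
+ *	int i;
+ *
+ *	counters = cfs_percpt_alloc(cptab, sizeof(*c));
+ *	if (counters == NULL)
+ *		return -ENOMEM;
+ *
+ *	c = cfs_percpt_current(counters);
+ *	c->mc_count++;
+ *
+ *	cfs_percpt_for_each(c, i, counters)
+ *		total += c->mc_count;
+ *
+ *	cfs_percpt_free(counters);
+ */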
+
+/*
+ * free variable array, see more detail in cfs_array_alloc
+ */
+void
+cfs_array_free(void *vars)
+{
+       struct cfs_var_array    *arr;
+       int                     i;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+       for (i = 0; i < arr->va_count; i++) {
+               if (arr->va_ptrs[i] == NULL)
+                       continue;
+
+               LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+       }
+       LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+                                 va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_array_free);
+
+/*
+ * allocate a variable array; the returned value is an array of pointers.
+ * The caller specifies the length of the array via @count; @size is the
+ * size of each memory block in the array.
+ */
+void *
+cfs_array_alloc(int count, unsigned int size)
+{
+       struct cfs_var_array    *arr;
+       int                     i;
+
+       LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+       if (arr == NULL)
+               return NULL;
+
+       arr->va_count   = count;
+       arr->va_size    = size;
+
+       for (i = 0; i < count; i++) {
+               LIBCFS_ALLOC(arr->va_ptrs[i], size);
+
+               if (arr->va_ptrs[i] == NULL) {
+                       cfs_array_free((void *)&arr->va_ptrs[0]);
+                       return NULL;
+               }
+       }
+
+       return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_array_alloc);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
new file mode 100644 (file)
index 0000000..9edccc9
--- /dev/null
@@ -0,0 +1,647 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * String manipulation functions.
+ *
+ * libcfs/libcfs/libcfs_string.c
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+/* non-0 = don't match */
+int cfs_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+       if (s1 == NULL || s2 == NULL)
+               return 1;
+
+       if (n == 0)
+               return 0;
+
+       while (n-- != 0 && tolower(*s1) == tolower(*s2)) {
+               if (n == 0 || *s1 == '\0' || *s2 == '\0')
+                       break;
+               s1++;
+               s2++;
+       }
+
+       return tolower(*(unsigned char *)s1) - tolower(*(unsigned char *)s2);
+}
+EXPORT_SYMBOL(cfs_strncasecmp);
+
+/* Convert a text string to a bitmask */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+                int *oldmask, int minmask, int allmask)
+{
+       const char *debugstr;
+       char op = 0;
+       int newmask = minmask, i, len, found = 0;
+       ENTRY;
+
+       /* <str> must be a list of tokens separated by whitespace
+        * and optionally an operator ('+' or '-').  If an operator
+        * appears first in <str>, '*oldmask' is used as the starting point
+        * (relative), otherwise minmask is used (absolute).  An operator
+        * applies to all following tokens up to the next operator. */
+       while (*str != 0) {
+               while (isspace(*str))
+                       str++;
+               if (*str == 0)
+                       break;
+               if (*str == '+' || *str == '-') {
+                       op = *str++;
+                       if (!found)
+                               /* only if first token is relative */
+                               newmask = *oldmask;
+                       while (isspace(*str))
+                               str++;
+                       if (*str == 0)    /* trailing op */
+                               return -EINVAL;
+               }
+
+               /* find token length */
+               for (len = 0; str[len] != 0 && !isspace(str[len]) &&
+                     str[len] != '+' && str[len] != '-'; len++);
+
+               /* match token */
+               found = 0;
+               for (i = 0; i < 32; i++) {
+                       debugstr = bit2str(i);
+                       if (debugstr != NULL &&
+                           strlen(debugstr) == len &&
+                           cfs_strncasecmp(str, debugstr, len) == 0) {
+                               if (op == '-')
+                                       newmask &= ~(1 << i);
+                               else
+                                       newmask |= (1 << i);
+                               found = 1;
+                               break;
+                       }
+               }
+               if (!found && len == 3 &&
+                   (cfs_strncasecmp(str, "ALL", len) == 0)) {
+                       if (op == '-')
+                               newmask = minmask;
+                       else
+                               newmask = allmask;
+                       found = 1;
+               }
+               if (!found) {
+                       CWARN("unknown mask '%.*s'.\n"
+                             "mask usage: [+|-]<all|type> ...\n", len, str);
+                       return -EINVAL;
+               }
+               str += len;
+       }
+
+       *oldmask = newmask;
+       return 0;
+}
+EXPORT_SYMBOL(cfs_str2mask);
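+
+/*
+ * A minimal sketch of driving cfs_str2mask().  The bit2str callback maps a
+ * bit number to its token name; the name table here is hypothetical, only
+ * the cfs_str2mask() call itself is from this file.
+ *
+ *	static const char *my_bit2str(int bit)
+ *	{
+ *		static const char *names[] = { "trace", "info", "warning" };
+ *
+ *		return bit < 3 ? names[bit] : NULL;
+ *	}
+ *
+ *	int mask = 0;
+ *
+ *	(absolute: mask becomes trace | warning)
+ *	rc = cfs_str2mask("trace warning", my_bit2str, &mask, 0, ~0);
+ *
+ *	(relative: clear warning, keep the rest of the previous mask)
+ *	rc = cfs_str2mask("-warning", my_bit2str, &mask, 0, ~0);
+ */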
+
+/* Duplicate a string in a platform-independent way */
+char *cfs_strdup(const char *str, u_int32_t flags)
+{
+       size_t lenz; /* length of str + zero byte */
+       char *dup_str;
+
+       lenz = strlen(str) + 1;
+
+       dup_str = kmalloc(lenz, flags);
+       if (dup_str == NULL)
+               return NULL;
+
+       memcpy(dup_str, str, lenz);
+
+       return dup_str;
+}
+EXPORT_SYMBOL(cfs_strdup);
+
+/**
+ * cfs_{v}snprintf() return the size actually printed rather than the size
+ * that would have been printed, which is what the standard functions return.
+ */
+/* safe vsnprintf */
+int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
+{
+       int i;
+
+       LASSERT(size > 0);
+       i = vsnprintf(buf, size, fmt, args);
+
+       return  (i >= size ? size - 1 : i);
+}
+EXPORT_SYMBOL(cfs_vsnprintf);
+
+/* safe snprintf */
+int cfs_snprintf(char *buf, size_t size, const char *fmt, ...)
+{
+       va_list args;
+       int i;
+
+       va_start(args, fmt);
+       i = cfs_vsnprintf(buf, size, fmt, args);
+       va_end(args);
+
+       return  i;
+}
+EXPORT_SYMBOL(cfs_snprintf);
+
+/* get the first string out of @str */
+char *cfs_firststr(char *str, size_t size)
+{
+       size_t i = 0;
+       char  *end;
+
+       /* trim leading spaces */
+       while (i < size && *str && isspace(*str)) {
+               ++i;
+               ++str;
+       }
+
+       /* string with all spaces */
+       if (*str == '\0')
+               goto out;
+
+       end = str;
+       while (i < size && *end != '\0' && !isspace(*end)) {
+               ++i;
+               ++end;
+       }
+
+       *end = '\0';
+out:
+       return str;
+}
+EXPORT_SYMBOL(cfs_firststr);
+
+char *
+cfs_trimwhite(char *str)
+{
+       char *end;
+
+       while (cfs_iswhite(*str))
+               str++;
+
+       end = str + strlen(str);
+       while (end > str) {
+               if (!cfs_iswhite(end[-1]))
+                       break;
+               end--;
+       }
+
+       *end = 0;
+       return str;
+}
+EXPORT_SYMBOL(cfs_trimwhite);
+
+/**
+ * Extracts tokens from strings.
+ *
+ * Looks for \a delim in string \a next, sets \a res to point to
+ * substring before the delimiter, sets \a next right after the found
+ * delimiter.
+ *
+ * \retval 1 if \a res points to a string of non-whitespace characters
+ * \retval 0 otherwise
+ */
+int
+cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res)
+{
+       char *end;
+
+       if (next->ls_str == NULL)
+               return 0;
+
+       /* skip leading white spaces */
+       while (next->ls_len) {
+               if (!cfs_iswhite(*next->ls_str))
+                       break;
+               next->ls_str++;
+               next->ls_len--;
+       }
+
+       if (next->ls_len == 0) /* whitespaces only */
+               return 0;
+
+       if (*next->ls_str == delim) {
+               /* first non-whitespace character is the delimiter */
+               return 0;
+       }
+
+       res->ls_str = next->ls_str;
+       end = memchr(next->ls_str, delim, next->ls_len);
+       if (end == NULL) {
+               /* there is no delimiter in the string */
+               end = next->ls_str + next->ls_len;
+               next->ls_str = NULL;
+       } else {
+               next->ls_str = end + 1;
+               next->ls_len -= (end - res->ls_str + 1);
+       }
+
+       /* skip ending whitespaces */
+       while (--end != res->ls_str) {
+               if (!cfs_iswhite(*end))
+                       break;
+       }
+
+       res->ls_len = end - res->ls_str + 1;
+       return 1;
+}
+EXPORT_SYMBOL(cfs_gettok);
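+
+/*
+ * A minimal sketch of tokenising a comma-separated list with cfs_gettok().
+ * The input string is illustrative; the struct cfs_lstr usage follows this
+ * file.
+ *
+ *	char input[] = "lo, tcp , o2ib";
+ *	struct cfs_lstr next = { .ls_str = input, .ls_len = strlen(input) };
+ *	struct cfs_lstr tok;
+ *
+ *	while (cfs_gettok(&next, ',', &tok)) {
+ *		(tok.ls_str/tok.ls_len delimit "lo", "tcp" and "o2ib" in
+ *		 turn, with surrounding whitespace trimmed)
+ *	}
+ */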
+
+/**
+ * Converts string to integer.
+ *
+ * Accepts decimal and hexadecimal number recordings.
+ *
+ * \retval 1 if first \a nob chars of \a str convert to decimal or
+ * hexadecimal integer in the range [\a min, \a max]
+ * \retval 0 otherwise
+ */
+int
+cfs_str2num_check(char *str, int nob, unsigned *num,
+                 unsigned min, unsigned max)
+{
+       char    *endp;
+
+       str = cfs_trimwhite(str);
+       *num = strtoul(str, &endp, 0);
+       if (endp == str)
+               return 0;
+
+       for (; endp < str + nob; endp++) {
+               if (!cfs_iswhite(*endp))
+                       return 0;
+       }
+
+       return (*num >= min && *num <= max);
+}
+EXPORT_SYMBOL(cfs_str2num_check);
+
+/**
+ * Parses \<range_expr\> token of the syntax. If \a bracketed is false,
+ * \a src should only have a single token which can be \<number\> or  \*
+ *
+ * On success, \a expr is set to an allocated range_expr with
+ * range_expr::re_lo, range_expr::re_hi and range_expr::re_stride
+ * initialized, provided \a src parses to
+ * \<number\> |
+ * \<number\> '-' \<number\> |
+ * \<number\> '-' \<number\> '/' \<number\>
+ *
+ * \retval 0 if \a src can be parsed, otherwise -EINVAL or -ENOMEM.
+ */
+int
+cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+                    int bracketed, struct cfs_range_expr **expr)
+{
+       struct cfs_range_expr   *re;
+       struct cfs_lstr         tok;
+
+       LIBCFS_ALLOC(re, sizeof(*re));
+       if (re == NULL)
+               return -ENOMEM;
+
+       if (src->ls_len == 1 && src->ls_str[0] == '*') {
+               re->re_lo = min;
+               re->re_hi = max;
+               re->re_stride = 1;
+               goto out;
+       }
+
+       if (cfs_str2num_check(src->ls_str, src->ls_len,
+                             &re->re_lo, min, max)) {
+               /* <number> is parsed */
+               re->re_hi = re->re_lo;
+               re->re_stride = 1;
+               goto out;
+       }
+
+       if (!bracketed || !cfs_gettok(src, '-', &tok))
+               goto failed;
+
+       if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+                              &re->re_lo, min, max))
+               goto failed;
+
+       /* <number> - */
+       if (cfs_str2num_check(src->ls_str, src->ls_len,
+                             &re->re_hi, min, max)) {
+               /* <number> - <number> is parsed */
+               re->re_stride = 1;
+               goto out;
+       }
+
+       /* go to check <number> '-' <number> '/' <number> */
+       if (cfs_gettok(src, '/', &tok)) {
+               if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+                                      &re->re_hi, min, max))
+                       goto failed;
+
+               /* <number> - <number> / ... */
+               if (cfs_str2num_check(src->ls_str, src->ls_len,
+                                     &re->re_stride, min, max)) {
+                       /* <number> - <number> / <number> is parsed */
+                       goto out;
+               }
+       }
+
+       /* trailing input after <number> '-' <number> that is not a valid
+        * '/' <number> stride must not be silently accepted */
+       goto failed;
+
+ out:
+       *expr = re;
+       return 0;
+
+ failed:
+       LIBCFS_FREE(re, sizeof(*re));
+       return -EINVAL;
+}
+EXPORT_SYMBOL(cfs_range_expr_parse);
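+
+/*
+ * Example: within brackets, the token "0-15/4" parses to re_lo = 0,
+ * re_hi = 15, re_stride = 4 (i.e. the values 0, 4, 8, 12); a plain "7"
+ * parses to re_lo = re_hi = 7 with re_stride = 1, and "*" covers the
+ * whole [min, max] range.
+ */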
+
+/**
+ * Matches value (\a value) against the range expression list \a expr_list.
+ *
+ * \retval 1 if \a value matches
+ * \retval 0 otherwise
+ */
+int
+cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list)
+{
+       struct cfs_range_expr   *expr;
+
+       list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+               if (value >= expr->re_lo && value <= expr->re_hi &&
+                   ((value - expr->re_lo) % expr->re_stride) == 0)
+                       return 1;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(cfs_expr_list_match);
+
+/**
+ * Converts expression list (\a expr_list) to an array of all matched values
+ *
+ * \retval N  total number of matched values
+ * \retval 0 if expression list is empty
+ * \retval < 0 for failure
+ */
+int
+cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
+{
+       struct cfs_range_expr   *expr;
+       __u32                   *val;
+       int                     count = 0;
+       int                     i;
+
+       list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+               for (i = expr->re_lo; i <= expr->re_hi; i++) {
+                       if (((i - expr->re_lo) % expr->re_stride) == 0)
+                               count++;
+               }
+       }
+
+       if (count == 0) /* empty expression list */
+               return 0;
+
+       if (count > max) {
+               CERROR("Number of values %d exceeds max allowed %d\n",
+                      count, max);
+               return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(val, sizeof(val[0]) * count);
+       if (val == NULL)
+               return -ENOMEM;
+
+       count = 0;
+       list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+               for (i = expr->re_lo; i <= expr->re_hi; i++) {
+                       if (((i - expr->re_lo) % expr->re_stride) == 0)
+                               val[count++] = i;
+               }
+       }
+
+       *valpp = val;
+       return count;
+}
+EXPORT_SYMBOL(cfs_expr_list_values);
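+
+/*
+ * Example: for an expression list parsed from "[0-6/3,9]" this returns 4
+ * and *valpp = { 0, 3, 6, 9 }, provided \a max is at least 4.
+ */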
+
+/**
+ * Frees cfs_range_expr structures of \a expr_list.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free(struct cfs_expr_list *expr_list)
+{
+       while (!list_empty(&expr_list->el_exprs)) {
+               struct cfs_range_expr *expr;
+
+               expr = list_entry(expr_list->el_exprs.next,
+                                 struct cfs_range_expr, re_link);
+               list_del(&expr->re_link);
+               LIBCFS_FREE(expr, sizeof(*expr));
+       }
+
+       LIBCFS_FREE(expr_list, sizeof(*expr_list));
+}
+EXPORT_SYMBOL(cfs_expr_list_free);
+
+void
+cfs_expr_list_print(struct cfs_expr_list *expr_list)
+{
+       struct cfs_range_expr *expr;
+
+       list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+               CDEBUG(D_WARNING, "%d-%d/%d\n",
+                      expr->re_lo, expr->re_hi, expr->re_stride);
+       }
+}
+EXPORT_SYMBOL(cfs_expr_list_print);
+
+/**
+ * Parses \<cfs_expr_list\> token of the syntax.
+ *
+ * \retval 0 if \a str parses to \<number\> | \<expr_list\> and \a *elpp is set
+ * \retval -EINVAL or -ENOMEM otherwise
+ */
+int
+cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+                   struct cfs_expr_list **elpp)
+{
+       struct cfs_expr_list    *expr_list;
+       struct cfs_range_expr   *expr;
+       struct cfs_lstr         src;
+       int                     rc;
+
+       LIBCFS_ALLOC(expr_list, sizeof(*expr_list));
+       if (expr_list == NULL)
+               return -ENOMEM;
+
+       src.ls_str = str;
+       src.ls_len = len;
+
+       INIT_LIST_HEAD(&expr_list->el_exprs);
+
+       if (src.ls_str[0] == '[' &&
+           src.ls_str[src.ls_len - 1] == ']') {
+               src.ls_str++;
+               src.ls_len -= 2;
+
+               rc = -EINVAL;
+               while (src.ls_str != NULL) {
+                       struct cfs_lstr tok;
+
+                       if (!cfs_gettok(&src, ',', &tok)) {
+                               rc = -EINVAL;
+                               break;
+                       }
+
+                       rc = cfs_range_expr_parse(&tok, min, max, 1, &expr);
+                       if (rc != 0)
+                               break;
+
+                       list_add_tail(&expr->re_link,
+                                         &expr_list->el_exprs);
+               }
+       } else {
+               rc = cfs_range_expr_parse(&src, min, max, 0, &expr);
+               if (rc == 0) {
+                       list_add_tail(&expr->re_link,
+                                         &expr_list->el_exprs);
+               }
+       }
+
+       if (rc != 0)
+               cfs_expr_list_free(expr_list);
+       else
+               *elpp = expr_list;
+
+       return rc;
+}
+EXPORT_SYMBOL(cfs_expr_list_parse);
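+
+/*
+ * Example: cfs_expr_list_parse("[0-3,7-9/2]", 11, 0, 15, &el) returns 0
+ * with two range expressions attached to el, so cfs_expr_list_match()
+ * then accepts the values 0, 1, 2, 3, 7 and 9.
+ */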
+
+/**
+ * Frees cfs_expr_list structures of \a list.
+ *
+ * For each struct cfs_expr_list structure found on \a list it frees
+ * range_expr list attached to it and frees the cfs_expr_list itself.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free_list(struct list_head *list)
+{
+       struct cfs_expr_list *el;
+
+       while (!list_empty(list)) {
+               el = list_entry(list->next,
+                                   struct cfs_expr_list, el_link);
+               list_del(&el->el_link);
+               cfs_expr_list_free(el);
+       }
+}
+EXPORT_SYMBOL(cfs_expr_list_free_list);
+
+int
+cfs_ip_addr_parse(char *str, int len, struct list_head *list)
+{
+       struct cfs_expr_list    *el;
+       struct cfs_lstr         src;
+       int                     rc;
+       int                     i;
+
+       src.ls_str = str;
+       src.ls_len = len;
+       i = 0;
+
+       while (src.ls_str != NULL) {
+               struct cfs_lstr res;
+
+               if (!cfs_gettok(&src, '.', &res)) {
+                       rc = -EINVAL;
+                       goto out;
+               }
+
+               rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el);
+               if (rc != 0)
+                       goto out;
+
+               list_add_tail(&el->el_link, list);
+               i++;
+       }
+
+       if (i == 4)
+               return 0;
+
+       rc = -EINVAL;
+ out:
+       cfs_expr_list_free_list(list);
+
+       return rc;
+}
+EXPORT_SYMBOL(cfs_ip_addr_parse);
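+
+/*
+ * Example: parsing "10.67.[1-3].*" builds four expression lists (one per
+ * octet), and cfs_ip_addr_match() will then accept any address of the form
+ * 10.67.1.x, 10.67.2.x or 10.67.3.x.
+ */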
+
+/**
+ * Matches address (\a addr) against address set encoded in \a list.
+ *
+ * \retval 1 if \a addr matches
+ * \retval 0 otherwise
+ */
+int
+cfs_ip_addr_match(__u32 addr, struct list_head *list)
+{
+       struct cfs_expr_list *el;
+       int i = 0;
+
+       list_for_each_entry_reverse(el, list, el_link) {
+               if (!cfs_expr_list_match(addr & 0xff, el))
+                       return 0;
+               addr >>= 8;
+               i++;
+       }
+
+       return i == 4;
+}
+EXPORT_SYMBOL(cfs_ip_addr_match);
+
+void
+cfs_ip_addr_free(struct list_head *list)
+{
+       cfs_expr_list_free_list(list);
+}
+EXPORT_SYMBOL(cfs_ip_addr_free);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
new file mode 100644 (file)
index 0000000..6e255ff
--- /dev/null
@@ -0,0 +1,1085 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/libcfs/libcfs.h>
+
+#ifdef CONFIG_SMP
+
+/**
+ * modparam for setting number of partitions
+ *
+ *  0 : estimate best value based on cores or NUMA nodes
+ *  1 : disable multiple partitions
+ * >1 : specify number of partitions
+ */
+static int     cpu_npartitions;
+CFS_MODULE_PARM(cpu_npartitions, "i", int, 0444, "# of CPU partitions");
+
+/**
+ * modparam for setting CPU partitions patterns:
+ *
+ * e.g.: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
+ *      number in bracket is processor ID (core or HT)
+ *
+ * e.g.: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in brackets
+ *       are NUMA node IDs, number before bracket is CPU partition ID.
+ *
+ * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
+ */
+static char    *cpu_pattern = "";
+CFS_MODULE_PARM(cpu_pattern, "s", charp, 0444, "CPU partitions pattern");
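+
+/*
+ * Usage sketch, assuming libcfs is loaded as a module named "libcfs":
+ *
+ *   modprobe libcfs cpu_npartitions=4
+ *   modprobe libcfs cpu_pattern="0[0-3] 1[4-7]"
+ *
+ * The pattern form takes precedence over cpu_npartitions (see NB above).
+ */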
+
+struct cfs_cpt_data {
+       /* serialize hotplug etc */
+       spinlock_t              cpt_lock;
+       /* reserved for hotplug */
+       unsigned long           cpt_version;
+       /* mutex to protect cpt_cpumask */
+       struct semaphore        cpt_mutex;
+       /* scratch buffer for set/unset_node */
+       cpumask_t               *cpt_cpumask;
+};
+
+static struct cfs_cpt_data     cpt_data;
+
+void
+cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
+{
+       /* return cpumask of cores in the same socket */
+       cpumask_copy(mask, topology_core_cpumask(cpu));
+}
+EXPORT_SYMBOL(cfs_cpu_core_siblings);
+
+/* return number of cores in the same socket of \a cpu */
+int
+cfs_cpu_core_nsiblings(int cpu)
+{
+       int     num;
+
+       down(&cpt_data.cpt_mutex);
+
+       cfs_cpu_core_siblings(cpu, cpt_data.cpt_cpumask);
+       num = cpus_weight(*cpt_data.cpt_cpumask);
+
+       up(&cpt_data.cpt_mutex);
+
+       return num;
+}
+EXPORT_SYMBOL(cfs_cpu_core_nsiblings);
+
+/* return cpumask of HTs in the same core */
+void
+cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
+{
+       cpumask_copy(mask, topology_thread_cpumask(cpu));
+}
+EXPORT_SYMBOL(cfs_cpu_ht_siblings);
+
+/* return number of HTs in the same core of \a cpu */
+int
+cfs_cpu_ht_nsiblings(int cpu)
+{
+       int     num;
+
+       down(&cpt_data.cpt_mutex);
+
+       cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
+       num = cpus_weight(*cpt_data.cpt_cpumask);
+
+       up(&cpt_data.cpt_mutex);
+
+       return num;
+}
+EXPORT_SYMBOL(cfs_cpu_ht_nsiblings);
+
+void
+cfs_node_to_cpumask(int node, cpumask_t *mask)
+{
+       cpumask_copy(mask, cpumask_of_node(node));
+}
+EXPORT_SYMBOL(cfs_node_to_cpumask);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+       int     i;
+
+       if (cptab->ctb_cpu2cpt != NULL) {
+               LIBCFS_FREE(cptab->ctb_cpu2cpt,
+                           num_possible_cpus() *
+                           sizeof(cptab->ctb_cpu2cpt[0]));
+       }
+
+       for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
+               struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+               if (part->cpt_nodemask != NULL) {
+                       LIBCFS_FREE(part->cpt_nodemask,
+                                   sizeof(*part->cpt_nodemask));
+               }
+
+               if (part->cpt_cpumask != NULL)
+                       LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+       }
+
+       if (cptab->ctb_parts != NULL) {
+               LIBCFS_FREE(cptab->ctb_parts,
+                           cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
+       }
+
+       if (cptab->ctb_nodemask != NULL)
+               LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+       if (cptab->ctb_cpumask != NULL)
+               LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
+
+       LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+       struct cfs_cpt_table *cptab;
+       int     i;
+
+       LIBCFS_ALLOC(cptab, sizeof(*cptab));
+       if (cptab == NULL)
+               return NULL;
+
+       cptab->ctb_nparts = ncpt;
+
+       LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
+       LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+
+       if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
+               goto failed;
+
+       LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
+                    num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+       if (cptab->ctb_cpu2cpt == NULL)
+               goto failed;
+
+       memset(cptab->ctb_cpu2cpt, -1,
+              num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+
+       LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
+       if (cptab->ctb_parts == NULL)
+               goto failed;
+
+       for (i = 0; i < ncpt; i++) {
+               struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+               LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+               LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
+               if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+                       goto failed;
+       }
+
+       spin_lock(&cpt_data.cpt_lock);
+       /* Reserved for hotplug */
+       cptab->ctb_version = cpt_data.cpt_version;
+       spin_unlock(&cpt_data.cpt_lock);
+
+       return cptab;
+
+ failed:
+       cfs_cpt_table_free(cptab);
+       return NULL;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+int
+cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+       char    *tmp = buf;
+       int     rc = 0;
+       int     i;
+       int     j;
+
+       for (i = 0; i < cptab->ctb_nparts; i++) {
+               if (len > 0) {
+                       rc = snprintf(tmp, len, "%d\t: ", i);
+                       len -= rc;
+               }
+
+               if (len <= 0) {
+                       rc = -EFBIG;
+                       goto out;
+               }
+
+               tmp += rc;
+               for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) {
+                       rc = snprintf(tmp, len, "%d ", j);
+                       len -= rc;
+                       if (len <= 0) {
+                               rc = -EFBIG;
+                               goto out;
+                       }
+                       tmp += rc;
+               }
+
+               *tmp = '\n';
+               tmp++;
+               len--;
+       }
+
+ out:
+       if (rc < 0)
+               return rc;
+
+       return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_table_print);
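+
+/*
+ * Example output for a two-partition table with CPUs 0-3 and 4-7, one line
+ * per partition ("<cpt>\t: " followed by its CPU ids):
+ *
+ *   0	: 0 1 2 3
+ *   1	: 4 5 6 7
+ */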
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+       return cptab->ctb_nparts;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       return cpt == CFS_CPT_ANY ?
+              cpus_weight(*cptab->ctb_cpumask) :
+              cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask);
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       return cpt == CFS_CPT_ANY ?
+              any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS :
+              any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       return cpt == CFS_CPT_ANY ?
+              cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
+nodemask_t *
+cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
+{
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       return cpt == CFS_CPT_ANY ?
+              cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
+}
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+       int     node;
+
+       LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
+
+       if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) {
+               CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
+               return 0;
+       }
+
+       if (cptab->ctb_cpu2cpt[cpu] != -1) {
+               CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
+                      cpu, cptab->ctb_cpu2cpt[cpu]);
+               return 0;
+       }
+
+       cptab->ctb_cpu2cpt[cpu] = cpt;
+
+       LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask));
+       LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
+
+       cpu_set(cpu, *cptab->ctb_cpumask);
+       cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
+
+       node = cpu_to_node(cpu);
+
+       /* first CPU of @node in this CPT table */
+       if (!node_isset(node, *cptab->ctb_nodemask))
+               node_set(node, *cptab->ctb_nodemask);
+
+       /* first CPU of @node in this partition */
+       if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
+               node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+       int     node;
+       int     i;
+
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       if (cpu < 0 || cpu >= NR_CPUS) {
+               CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
+               return;
+       }
+
+       if (cpt == CFS_CPT_ANY) {
+               /* caller doesn't know the partition ID */
+               cpt = cptab->ctb_cpu2cpt[cpu];
+               if (cpt < 0) { /* not set in this CPT-table */
+                       CDEBUG(D_INFO, "Try to unset cpu %d which is "
+                                      "not in CPT-table %p\n", cpu, cptab);
+                       return;
+               }
+
+       } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
+               CDEBUG(D_INFO,
+                      "CPU %d is not in cpu-partition %d\n", cpu, cpt);
+               return;
+       }
+
+       LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
+       LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask));
+
+       cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
+       cpu_clear(cpu, *cptab->ctb_cpumask);
+       cptab->ctb_cpu2cpt[cpu] = -1;
+
+       node = cpu_to_node(cpu);
+
+       LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
+       LASSERT(node_isset(node, *cptab->ctb_nodemask));
+
+       for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) {
+               /* this CPT has other CPU belonging to this node? */
+               if (cpu_to_node(i) == node)
+                       break;
+       }
+
+       if (i == NR_CPUS)
+               node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+       for_each_cpu_mask(i, *cptab->ctb_cpumask) {
+               /* this CPT-table has other CPU belonging to this node? */
+               if (cpu_to_node(i) == node)
+                       break;
+       }
+
+       if (i == NR_CPUS)
+               node_clear(node, *cptab->ctb_nodemask);
+
+       return;
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+       int     i;
+
+       if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) {
+               CDEBUG(D_INFO, "No online CPU is found in the CPU mask "
+                              "for CPU partition %d\n", cpt);
+               return 0;
+       }
+
+       for_each_cpu_mask(i, *mask) {
+               if (!cfs_cpt_set_cpu(cptab, cpt, i))
+                       return 0;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+       int     i;
+
+       for_each_cpu_mask(i, *mask)
+               cfs_cpt_unset_cpu(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+       cpumask_t       *mask;
+       int             rc;
+
+       if (node < 0 || node >= MAX_NUMNODES) {
+               CDEBUG(D_INFO,
+                      "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+               return 0;
+       }
+
+       down(&cpt_data.cpt_mutex);
+
+       mask = cpt_data.cpt_cpumask;
+       cfs_node_to_cpumask(node, mask);
+
+       rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
+
+       up(&cpt_data.cpt_mutex);
+
+       return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+       cpumask_t *mask;
+
+       if (node < 0 || node >= MAX_NUMNODES) {
+               CDEBUG(D_INFO,
+                      "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+               return;
+       }
+
+       down(&cpt_data.cpt_mutex);
+
+       mask = cpt_data.cpt_cpumask;
+       cfs_node_to_cpumask(node, mask);
+
+       cfs_cpt_unset_cpumask(cptab, cpt, mask);
+
+       up(&cpt_data.cpt_mutex);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+       int     i;
+
+       for_each_node_mask(i, *mask) {
+               if (!cfs_cpt_set_node(cptab, cpt, i))
+                       return 0;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+       int     i;
+
+       for_each_node_mask(i, *mask)
+               cfs_cpt_unset_node(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+       int     last;
+       int     i;
+
+       if (cpt == CFS_CPT_ANY) {
+               last = cptab->ctb_nparts - 1;
+               cpt = 0;
+       } else {
+               last = cpt;
+       }
+
+       for (; cpt <= last; cpt++) {
+               for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask)
+                       cfs_cpt_unset_cpu(cptab, cpt, i);
+       }
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+       nodemask_t      *mask;
+       int             weight;
+       int             rotor;
+       int             node;
+
+       /* convert CPU partition ID to HW node id */
+
+       if (cpt < 0 || cpt >= cptab->ctb_nparts) {
+               mask = cptab->ctb_nodemask;
+               rotor = cptab->ctb_spread_rotor++;
+       } else {
+               mask = cptab->ctb_parts[cpt].cpt_nodemask;
+               rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
+       }
+
+       weight = nodes_weight(*mask);
+       LASSERT(weight > 0);
+
+       rotor %= weight;
+
+       for_each_node_mask(node, *mask) {
+               if (rotor-- == 0)
+                       return node;
+       }
+
+       LBUG();
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
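+
+/*
+ * Example: for a partition whose nodemask contains nodes { 0, 2 }, the
+ * rotor makes successive calls return 0, 2, 0, 2, ... so allocations are
+ * spread round-robin across the partition's NUMA nodes.
+ */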
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+       int     cpu = smp_processor_id();
+       int     cpt = cptab->ctb_cpu2cpt[cpu];
+
+       if (cpt < 0) {
+               if (!remap)
+                       return cpt;
+
+               /* don't return a negative value for the safety of upper layer,
+                * instead map the unknown CPU to a valid partition ID */
+               cpt = cpu % cptab->ctb_nparts;
+       }
+
+       return cpt;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+       LASSERT(cpu >= 0 && cpu < NR_CPUS);
+
+       return cptab->ctb_cpu2cpt[cpu];
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+       cpumask_t       *cpumask;
+       nodemask_t      *nodemask;
+       int             rc;
+       int             i;
+
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       if (cpt == CFS_CPT_ANY) {
+               cpumask = cptab->ctb_cpumask;
+               nodemask = cptab->ctb_nodemask;
+       } else {
+               cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
+               nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
+       }
+
+       if (any_online_cpu(*cpumask) == NR_CPUS) {
+               CERROR("No online CPU found in CPU partition %d, did someone "
+                      "do CPU hotplug on system? You might need to reload "
+                      "Lustre modules to keep system working well.\n", cpt);
+               return -EINVAL;
+       }
+
+       for_each_online_cpu(i) {
+               if (cpu_isset(i, *cpumask))
+                       continue;
+
+               rc = set_cpus_allowed(current, *cpumask);
+               set_mems_allowed(*nodemask);
+               if (rc == 0)
+                       schedule(); /* switch to allowed CPU */
+
+               return rc;
+       }
+
+       /* don't need to set affinity because all online CPUs are covered */
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+/**
+ * Choose at most \a number CPUs from \a node and set them in \a cpt.
+ * We always prefer to choose CPUs in the same core/socket.
+ */
+static int
+cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
+                    cpumask_t *node, int number)
+{
+       cpumask_t       *socket = NULL;
+       cpumask_t       *core = NULL;
+       int             rc = 0;
+       int             cpu;
+
+       LASSERT(number > 0);
+
+       if (number >= cpus_weight(*node)) {
+               while (!cpus_empty(*node)) {
+                       cpu = first_cpu(*node);
+
+                       rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
+                       if (!rc)
+                               return -EINVAL;
+                       cpu_clear(cpu, *node);
+               }
+               return 0;
+       }
+
+       /* allocate scratch buffer */
+       LIBCFS_ALLOC(socket, cpumask_size());
+       LIBCFS_ALLOC(core, cpumask_size());
+       if (socket == NULL || core == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       while (!cpus_empty(*node)) {
+               cpu = first_cpu(*node);
+
+               /* get cpumask for cores in the same socket */
+               cfs_cpu_core_siblings(cpu, socket);
+               cpus_and(*socket, *socket, *node);
+
+               LASSERT(!cpus_empty(*socket));
+
+               while (!cpus_empty(*socket)) {
+                       int     i;
+
+                       /* get cpumask for hts in the same core */
+                       cfs_cpu_ht_siblings(cpu, core);
+                       cpus_and(*core, *core, *node);
+
+                       LASSERT(!cpus_empty(*core));
+
+                       for_each_cpu_mask(i, *core) {
+                               cpu_clear(i, *socket);
+                               cpu_clear(i, *node);
+
+                               rc = cfs_cpt_set_cpu(cptab, cpt, i);
+                               if (!rc) {
+                                       rc = -EINVAL;
+                                       goto out;
+                               }
+
+                               if (--number == 0)
+                                       goto out;
+                       }
+                       cpu = first_cpu(*socket);
+               }
+       }
+
+ out:
+       if (socket != NULL)
+               LIBCFS_FREE(socket, cpumask_size());
+       if (core != NULL)
+               LIBCFS_FREE(core, cpumask_size());
+       return rc;
+}
+
+#define CPT_WEIGHT_MIN  4u
+
+static unsigned int
+cfs_cpt_num_estimate(void)
+{
+       unsigned nnode = num_online_nodes();
+       unsigned ncpu  = num_online_cpus();
+       unsigned ncpt;
+
+       if (ncpu <= CPT_WEIGHT_MIN) {
+               ncpt = 1;
+               goto out;
+       }
+
+       /* generate a reasonable number of CPU partitions based on the total
+        * number of CPUs. The preferred N should be a power of 2 and match
+        * this condition: 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
+       for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}
+
+       if (ncpt <= nnode) { /* fat numa system */
+               while (nnode > ncpt)
+                       nnode >>= 1;
+
+       } else { /* ncpt > nnode */
+               while ((nnode << 1) <= ncpt)
+                       nnode <<= 1;
+       }
+
+       ncpt = nnode;
+
+ out:
+#if (BITS_PER_LONG == 32)
+       /* configuring many CPU partitions on a 32-bit system could consume
+        * too much memory */
+       ncpt = min(2U, ncpt);
+#endif
+       while (ncpu % ncpt != 0)
+               ncpt--; /* worst case is 1 */
+
+       return ncpt;
+}
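+
+/*
+ * Example: with 16 online CPUs the loop above stops at ncpt = 4; on a box
+ * with 2 online NUMA nodes, nnode is then doubled to 4, giving 4 partitions
+ * of 4 CPUs each (16 % 4 == 0).  On a 32-bit kernel the result would be
+ * capped at 2 partitions.
+ */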
+
+static struct cfs_cpt_table *
+cfs_cpt_table_create(int ncpt)
+{
+       struct cfs_cpt_table *cptab = NULL;
+       cpumask_t       *mask = NULL;
+       int             cpt = 0;
+       int             num;
+       int             rc;
+       int             i;
+
+       rc = cfs_cpt_num_estimate();
+       if (ncpt <= 0)
+               ncpt = rc;
+
+       if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
+               CWARN("CPU partition number %d is larger than suggested "
+                     "value (%d), your system may have performance "
+                     "issues or run out of memory while under pressure\n",
+                     ncpt, rc);
+       }
+
+       if (num_online_cpus() % ncpt != 0) {
+               CERROR("CPU number %d is not a multiple of cpu_npartitions %d, "
+                      "please try a different cpu_npartitions value or "
+                      "set a pattern string via cpu_pattern=STRING\n",
+                      (int)num_online_cpus(), ncpt);
+               goto failed;
+       }
+
+       cptab = cfs_cpt_table_alloc(ncpt);
+       if (cptab == NULL) {
+               CERROR("Failed to allocate CPU map(%d)\n", ncpt);
+               goto failed;
+       }
+
+       num = num_online_cpus() / ncpt;
+       if (num == 0) {
+               CERROR("CPU changed while setting CPU partition\n");
+               goto failed;
+       }
+
+       LIBCFS_ALLOC(mask, cpumask_size());
+       if (mask == NULL) {
+               CERROR("Failed to allocate scratch cpumask\n");
+               goto failed;
+       }
+
+       for_each_online_node(i) {
+               cfs_node_to_cpumask(i, mask);
+
+               while (!cpus_empty(*mask)) {
+                       struct cfs_cpu_partition *part;
+                       int    n;
+
+                       if (cpt >= ncpt)
+                               goto failed;
+
+                       part = &cptab->ctb_parts[cpt];
+
+                       n = num - cpus_weight(*part->cpt_cpumask);
+                       LASSERT(n > 0);
+
+                       rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
+                       if (rc < 0)
+                               goto failed;
+
+                       LASSERT(num >= cpus_weight(*part->cpt_cpumask));
+                       if (num == cpus_weight(*part->cpt_cpumask))
+                               cpt++;
+               }
+       }
+
+       if (cpt != ncpt ||
+           num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
+               CERROR("Expect %d(%d) CPU partitions but got %d(%d), "
+                      "CPU hotplug/unplug while setting?\n",
+                      cptab->ctb_nparts, num, cpt,
+                      cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask));
+               goto failed;
+       }
+
+       LIBCFS_FREE(mask, cpumask_size());
+
+       return cptab;
+
+ failed:
+       CERROR("Failed to setup CPU-partition-table with %d "
+              "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
+              ncpt, num_online_nodes(), num_online_cpus());
+
+       if (mask != NULL)
+               LIBCFS_FREE(mask, cpumask_size());
+
+       if (cptab != NULL)
+               cfs_cpt_table_free(cptab);
+
+       return NULL;
+}
+
+static struct cfs_cpt_table *
+cfs_cpt_table_create_pattern(char *pattern)
+{
+       struct cfs_cpt_table    *cptab;
+       char                    *str    = pattern;
+       int                     node    = 0;
+       int                     high;
+       int                     ncpt;
+       int                     c;
+
+       for (ncpt = 0;; ncpt++) { /* quick scan bracket */
+               str = strchr(str, '[');
+               if (str == NULL)
+                       break;
+               str++;
+       }
+
+       str = cfs_trimwhite(pattern);
+       if (*str == 'n' || *str == 'N') {
+               pattern = str + 1;
+               node = 1;
+       }
+
+       if (ncpt == 0 ||
+           (node && ncpt > num_online_nodes()) ||
+           (!node && ncpt > num_online_cpus())) {
+               CERROR("Invalid pattern %s, or too many partitions %d\n",
+                      pattern, ncpt);
+               return NULL;
+       }
+
+       high = node ? MAX_NUMNODES - 1 : NR_CPUS - 1;
+
+       cptab = cfs_cpt_table_alloc(ncpt);
+       if (cptab == NULL) {
+               CERROR("Failed to allocate cpu partition table\n");
+               return NULL;
+       }
+
+       for (str = cfs_trimwhite(pattern), c = 0;; c++) {
+               struct cfs_range_expr   *range;
+               struct cfs_expr_list    *el;
+               char                    *bracket = strchr(str, '[');
+               int                     cpt;
+               int                     rc;
+               int                     i;
+               int                     n;
+
+               if (bracket == NULL) {
+                       if (*str != 0) {
+                               CERROR("Invalid pattern %s\n", str);
+                               goto failed;
+                       } else if (c != ncpt) {
+                               CERROR("expect %d partitions but found %d\n",
+                                      ncpt, c);
+                               goto failed;
+                       }
+                       break;
+               }
+
+               if (sscanf(str, "%u%n", &cpt, &n) < 1) {
+                       CERROR("Invalid cpu pattern %s\n", str);
+                       goto failed;
+               }
+
+               if (cpt < 0 || cpt >= ncpt) {
+                       CERROR("Invalid partition id %d, total partitions %d\n",
+                              cpt, ncpt);
+                       goto failed;
+               }
+
+               if (cfs_cpt_weight(cptab, cpt) != 0) {
+                       CERROR("Partition %d has already been set.\n", cpt);
+                       goto failed;
+               }
+
+               str = cfs_trimwhite(str + n);
+               if (str != bracket) {
+                       CERROR("Invalid pattern %s\n", str);
+                       goto failed;
+               }
+
+               bracket = strchr(str, ']');
+               if (bracket == NULL) {
+                       CERROR("missing right bracket for cpt %d, %s\n",
+                              cpt, str);
+                       goto failed;
+               }
+
+               if (cfs_expr_list_parse(str, (bracket - str) + 1,
+                                       0, high, &el) != 0) {
+                       CERROR("Can't parse number range: %s\n", str);
+                       goto failed;
+               }
+
+               list_for_each_entry(range, &el->el_exprs, re_link) {
+                       for (i = range->re_lo; i <= range->re_hi; i++) {
+                               if ((i - range->re_lo) % range->re_stride != 0)
+                                       continue;
+
+                               rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
+                                           cfs_cpt_set_cpu(cptab, cpt, i);
+                               if (!rc) {
+                                       cfs_expr_list_free(el);
+                                       goto failed;
+                               }
+                       }
+               }
+
+               cfs_expr_list_free(el);
+
+               if (!cfs_cpt_online(cptab, cpt)) {
+                       CERROR("No online CPU is found on partition %d\n", cpt);
+                       goto failed;
+               }
+
+               str = cfs_trimwhite(bracket + 1);
+       }
+
+       return cptab;
+
+ failed:
+       cfs_cpt_table_free(cptab);
+       return NULL;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int
+cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int  cpu = (unsigned long)hcpu;
+
+       switch (action) {
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               spin_lock(&cpt_data.cpt_lock);
+               cpt_data.cpt_version++;
+               spin_unlock(&cpt_data.cpt_lock);
+       default:
+               CWARN("Lustre: can't support CPU hotplug well now, "
+                     "performance and stability could be impacted "
+                     "[CPU %u notify: %lx]\n", cpu, action);
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block cfs_cpu_notifier = {
+       .notifier_call  = cfs_cpu_notify,
+       .priority       = 0
+};
+
+#endif
+
+void
+cfs_cpu_fini(void)
+{
+       if (cfs_cpt_table != NULL)
+               cfs_cpt_table_free(cfs_cpt_table);
+
+#ifdef CONFIG_HOTPLUG_CPU
+       unregister_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+       if (cpt_data.cpt_cpumask != NULL)
+               LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
+}
+
+int
+cfs_cpu_init(void)
+{
+       LASSERT(cfs_cpt_table == NULL);
+
+       memset(&cpt_data, 0, sizeof(cpt_data));
+
+       LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
+       if (cpt_data.cpt_cpumask == NULL) {
+               CERROR("Failed to allocate scratch buffer\n");
+               return -1;
+       }
+
+       spin_lock_init(&cpt_data.cpt_lock);
+       sema_init(&cpt_data.cpt_mutex, 1);
+
+#ifdef CONFIG_HOTPLUG_CPU
+       register_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+
+       if (*cpu_pattern != 0) {
+               cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
+               if (cfs_cpt_table == NULL) {
+                       CERROR("Failed to create cptab from pattern %s\n",
+                              cpu_pattern);
+                       goto failed;
+               }
+
+       } else {
+               cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
+               if (cfs_cpt_table == NULL) {
+                       CERROR("Failed to create cptab with npartitions %d\n",
+                              cpu_npartitions);
+                       goto failed;
+               }
+       }
+
+       spin_lock(&cpt_data.cpt_lock);
+       if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
+               spin_unlock(&cpt_data.cpt_lock);
+               CERROR("CPU hotplug/unplug during setup\n");
+               goto failed;
+       }
+       spin_unlock(&cpt_data.cpt_lock);
+
+       LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
+                num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
+       return 0;
+
+ failed:
+       cfs_cpu_fini();
+       return -1;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
new file mode 100644 (file)
index 0000000..20b2d61
--- /dev/null
@@ -0,0 +1,144 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * This is a crypto API shash wrapper for zlib_adler32.
+ */
+
+#include <linux/module.h>
+#include <linux/zutil.h>
+#include <crypto/internal/hash.h>
+
+
+#define CHKSUM_BLOCK_SIZE      1
+#define CHKSUM_DIGEST_SIZE     4
+
+
+static u32 __adler32(u32 cksum, unsigned char const *p, size_t len)
+{
+       return zlib_adler32(cksum, p, len);
+}
+
+static int adler32_cra_init(struct crypto_tfm *tfm)
+{
+       u32 *key = crypto_tfm_ctx(tfm);
+
+       *key = 1;
+
+       return 0;
+}
+
+static int adler32_setkey(struct crypto_shash *hash, const u8 *key,
+                         unsigned int keylen)
+{
+       u32 *mctx = crypto_shash_ctx(hash);
+
+       if (keylen != sizeof(u32)) {
+               crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+       *mctx = *(u32 *)key;
+       return 0;
+}
+
+static int adler32_init(struct shash_desc *desc)
+{
+       u32 *mctx = crypto_shash_ctx(desc->tfm);
+       u32 *cksump = shash_desc_ctx(desc);
+
+       *cksump = *mctx;
+
+       return 0;
+}
+
+static int adler32_update(struct shash_desc *desc, const u8 *data,
+                         unsigned int len)
+{
+       u32 *cksump = shash_desc_ctx(desc);
+
+       *cksump = __adler32(*cksump, data, len);
+       return 0;
+}
+static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len,
+                          u8 *out)
+{
+       *(u32 *)out = __adler32(*cksump, data, len);
+       return 0;
+}
+
+static int adler32_finup(struct shash_desc *desc, const u8 *data,
+                        unsigned int len, u8 *out)
+{
+       return __adler32_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int adler32_final(struct shash_desc *desc, u8 *out)
+{
+       u32 *cksump = shash_desc_ctx(desc);
+
+       *(u32 *)out = *cksump;
+       return 0;
+}
+
+static int adler32_digest(struct shash_desc *desc, const u8 *data,
+                         unsigned int len, u8 *out)
+{
+       return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len,
+                                   out);
+}
+static struct shash_alg alg = {
+       .setkey         = adler32_setkey,
+       .init           = adler32_init,
+       .update         = adler32_update,
+       .final          = adler32_final,
+       .finup          = adler32_finup,
+       .digest         = adler32_digest,
+       .descsize       = sizeof(u32),
+       .digestsize     = CHKSUM_DIGEST_SIZE,
+       .base           = {
+               .cra_name               = "adler32",
+               .cra_driver_name        = "adler32-zlib",
+               .cra_priority           = 100,
+               .cra_blocksize          = CHKSUM_BLOCK_SIZE,
+               .cra_ctxsize            = sizeof(u32),
+               .cra_module             = THIS_MODULE,
+               .cra_init               = adler32_cra_init,
+       }
+};
+
+
+int cfs_crypto_adler32_register(void)
+{
+       return crypto_register_shash(&alg);
+}
+EXPORT_SYMBOL(cfs_crypto_adler32_register);
+
+void cfs_crypto_adler32_unregister(void)
+{
+       crypto_unregister_shash(&alg);
+}
+EXPORT_SYMBOL(cfs_crypto_adler32_unregister);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c
new file mode 100644 (file)
index 0000000..83af630
--- /dev/null
@@ -0,0 +1,149 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * This is a crypto API shash wrapper for crc32_le.
+ */
+
+#include <linux/module.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+
+#define CHKSUM_BLOCK_SIZE      1
+#define CHKSUM_DIGEST_SIZE     4
+
+static u32 __crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+       return crc32_le(crc, p, len);
+}
+
+/** No default init with ~0 */
+static int crc32_cra_init(struct crypto_tfm *tfm)
+{
+       u32 *key = crypto_tfm_ctx(tfm);
+
+       *key = 0;
+
+       return 0;
+}
+
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32_setkey(struct crypto_shash *hash, const u8 *key,
+                       unsigned int keylen)
+{
+       u32 *mctx = crypto_shash_ctx(hash);
+
+       if (keylen != sizeof(u32)) {
+               crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+       *mctx = le32_to_cpup((__le32 *)key);
+       return 0;
+}
+
+static int crc32_init(struct shash_desc *desc)
+{
+       u32 *mctx = crypto_shash_ctx(desc->tfm);
+       u32 *crcp = shash_desc_ctx(desc);
+
+       *crcp = *mctx;
+
+       return 0;
+}
+
+static int crc32_update(struct shash_desc *desc, const u8 *data,
+                       unsigned int len)
+{
+       u32 *crcp = shash_desc_ctx(desc);
+
+       *crcp = __crc32_le(*crcp, data, len);
+       return 0;
+}
+/* No final XOR 0xFFFFFFFF, like crc32_le */
+static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len,
+                        u8 *out)
+{
+       *(__le32 *)out = cpu_to_le32(__crc32_le(*crcp, data, len));
+       return 0;
+}
+
+static int crc32_finup(struct shash_desc *desc, const u8 *data,
+                      unsigned int len, u8 *out)
+{
+       return __crc32_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32_final(struct shash_desc *desc, u8 *out)
+{
+       u32 *crcp = shash_desc_ctx(desc);
+
+       *(__le32 *)out = cpu_to_le32p(crcp);
+       return 0;
+}
+
+static int crc32_digest(struct shash_desc *desc, const u8 *data,
+                       unsigned int len, u8 *out)
+{
+       return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len,
+                            out);
+}
+static struct shash_alg alg = {
+       .setkey         = crc32_setkey,
+       .init           = crc32_init,
+       .update         = crc32_update,
+       .final          = crc32_final,
+       .finup          = crc32_finup,
+       .digest         = crc32_digest,
+       .descsize       = sizeof(u32),
+       .digestsize     = CHKSUM_DIGEST_SIZE,
+       .base           = {
+               .cra_name               = "crc32",
+               .cra_driver_name        = "crc32-table",
+               .cra_priority           = 100,
+               .cra_blocksize          = CHKSUM_BLOCK_SIZE,
+               .cra_ctxsize            = sizeof(u32),
+               .cra_module             = THIS_MODULE,
+               .cra_init               = crc32_cra_init,
+       }
+};
+
+int cfs_crypto_crc32_register(void)
+{
+       return crypto_register_shash(&alg);
+}
+EXPORT_SYMBOL(cfs_crypto_crc32_register);
+
+void cfs_crypto_crc32_unregister(void)
+{
+       crypto_unregister_shash(&alg);
+}
+EXPORT_SYMBOL(cfs_crypto_crc32_unregister);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c
new file mode 100644 (file)
index 0000000..dd29aa5
--- /dev/null
@@ -0,0 +1,193 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Wrappers for the kernel crypto shash API around the PCLMULQDQ crc32
+ * implementation.
+ *
+ * Author:     Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <asm/cpufeature.h>
+#include <asm/i387.h>
+#include <linux/libcfs/libcfs.h>
+
+#define CHKSUM_BLOCK_SIZE      1
+#define CHKSUM_DIGEST_SIZE     4
+
+#define PCLMUL_MIN_LEN         64L     /* minimum size of buffer
+                                        * for crc32_pclmul_le_16 */
+#define SCALE_F                        16L     /* size of xmm register */
+#define SCALE_F_MASK           (SCALE_F - 1)
+
+u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
+
+static u32 __attribute__((pure))
+       crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len)
+{
+       unsigned int iquotient;
+       unsigned int iremainder;
+       unsigned int prealign;
+
+       if (len < PCLMUL_MIN_LEN + SCALE_F_MASK)
+               return crc32_le(crc, p, len);
+
+       if ((long)p & SCALE_F_MASK) {
+               /* align p to 16 byte */
+               prealign = SCALE_F - ((long)p & SCALE_F_MASK);
+
+               crc = crc32_le(crc, p, prealign);
+               len -= prealign;
+               p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) &
+                                    ~SCALE_F_MASK);
+       }
+       iquotient = len & (~SCALE_F_MASK);
+       iremainder = len & SCALE_F_MASK;
+
+       kernel_fpu_begin();
+       crc = crc32_pclmul_le_16(p, iquotient, crc);
+       kernel_fpu_end();
+
+       if (iremainder)
+               crc = crc32_le(crc, p + iquotient, iremainder);
+
+       return crc;
+}
+
+static int crc32_pclmul_cra_init(struct crypto_tfm *tfm)
+{
+       u32 *key = crypto_tfm_ctx(tfm);
+
+       *key = 0;
+
+       return 0;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key,
+                       unsigned int keylen)
+{
+       u32 *mctx = crypto_shash_ctx(hash);
+
+       if (keylen != sizeof(u32)) {
+               crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+       *mctx = le32_to_cpup((__le32 *)key);
+       return 0;
+}
+
+static int crc32_pclmul_init(struct shash_desc *desc)
+{
+       u32 *mctx = crypto_shash_ctx(desc->tfm);
+       u32 *crcp = shash_desc_ctx(desc);
+
+       *crcp = *mctx;
+
+       return 0;
+}
+
+static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
+                              unsigned int len)
+{
+       u32 *crcp = shash_desc_ctx(desc);
+
+       *crcp = crc32_pclmul_le(*crcp, data, len);
+       return 0;
+}
+
+/* No final XOR 0xFFFFFFFF, like crc32_le */
+static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
+                               u8 *out)
+{
+       *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len));
+       return 0;
+}
+
+static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data,
+                             unsigned int len, u8 *out)
+{
+       return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32_pclmul_final(struct shash_desc *desc, u8 *out)
+{
+       u32 *crcp = shash_desc_ctx(desc);
+
+       *(__le32 *)out = cpu_to_le32p(crcp);
+       return 0;
+}
+
+static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data,
+                              unsigned int len, u8 *out)
+{
+       return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len,
+                                   out);
+}
+
+static struct shash_alg alg = {
+       .setkey         = crc32_pclmul_setkey,
+       .init           = crc32_pclmul_init,
+       .update         = crc32_pclmul_update,
+       .final          = crc32_pclmul_final,
+       .finup          = crc32_pclmul_finup,
+       .digest         = crc32_pclmul_digest,
+       .descsize       = sizeof(u32),
+       .digestsize     = CHKSUM_DIGEST_SIZE,
+       .base           = {
+                       .cra_name               = "crc32",
+                       .cra_driver_name        = "crc32-pclmul",
+                       .cra_priority           = 200,
+                       .cra_blocksize          = CHKSUM_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(u32),
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = crc32_pclmul_cra_init,
+       }
+};
+
+#ifndef X86_FEATURE_PCLMULQDQ
+#define X86_FEATURE_PCLMULQDQ  (4 * 32 + 1)    /* PCLMULQDQ instruction */
+#endif
+
+int cfs_crypto_crc32_pclmul_register(void)
+{
+
+       if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+               CDEBUG(D_INFO, "PCLMULQDQ-NI instructions are not "
+                      "detected.\n");
+               return -ENODEV;
+       }
+       return crypto_register_shash(&alg);
+}
+
+void cfs_crypto_crc32_pclmul_unregister(void)
+{
+       crypto_unregister_shash(&alg);
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
new file mode 100644 (file)
index 0000000..f3899bd
--- /dev/null
@@ -0,0 +1,305 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/linux-crypto.h>
+/**
+ * Array of per-algorithm hash speeds, in MB/s
+ */
+static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX];
+
+
+
+static int cfs_crypto_hash_alloc(unsigned char alg_id,
+                                const struct cfs_crypto_hash_type **type,
+                                struct hash_desc *desc, unsigned char *key,
+                                unsigned int key_len)
+{
+       int     err = 0;
+
+       *type = cfs_crypto_hash_type(alg_id);
+
+       if (*type == NULL) {
+               CWARN("Unsupported hash algorithm id = %d, max id is %d\n",
+                     alg_id, CFS_HASH_ALG_MAX);
+               return -EINVAL;
+       }
+       desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0);
+
+       if (desc->tfm == NULL)
+               return -EINVAL;
+
+       if (IS_ERR(desc->tfm)) {
+               CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n",
+                      (*type)->cht_name);
+               return PTR_ERR(desc->tfm);
+       }
+
+       desc->flags = 0;
+
+       /* A shash has different initialization logic than a digest:
+        * shash:  crypto_hash_setkey, crypto_hash_init
+        * digest: crypto_digest_init, crypto_digest_setkey
+        * The digest path is skipped here because cfs_crypto_hash_alloc()
+        * always follows the shash logic.
+        */
+       if (key != NULL) {
+               err = crypto_hash_setkey(desc->tfm, key, key_len);
+       } else if ((*type)->cht_key != 0) {
+               err = crypto_hash_setkey(desc->tfm,
+                                        (unsigned char *)&((*type)->cht_key),
+                                        (*type)->cht_size);
+       }
+
+       if (err != 0) {
+               crypto_free_hash(desc->tfm);
+               return err;
+       }
+
+       CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n",
+              (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name,
+              (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name,
+              cfs_crypto_hash_speeds[alg_id]);
+
+       return crypto_hash_init(desc);
+}
+
+int cfs_crypto_hash_digest(unsigned char alg_id,
+                          const void *buf, unsigned int buf_len,
+                          unsigned char *key, unsigned int key_len,
+                          unsigned char *hash, unsigned int *hash_len)
+{
+       struct scatterlist      sl;
+       struct hash_desc        hdesc;
+       int                     err;
+       const struct cfs_crypto_hash_type       *type;
+
+       if (buf == NULL || buf_len == 0 || hash_len == NULL)
+               return -EINVAL;
+
+       err = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len);
+       if (err != 0)
+               return err;
+
+       if (hash == NULL || *hash_len < type->cht_size) {
+               *hash_len = type->cht_size;
+               crypto_free_hash(hdesc.tfm);
+               return -ENOSPC;
+       }
+       sg_init_one(&sl, (void *)buf, buf_len);
+
+       hdesc.flags = 0;
+       err = crypto_hash_digest(&hdesc, &sl, sl.length, hash);
+       crypto_free_hash(hdesc.tfm);
+
+       return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_digest);
+
+struct cfs_crypto_hash_desc *
+       cfs_crypto_hash_init(unsigned char alg_id,
+                            unsigned char *key, unsigned int key_len)
+{
+
+       struct  hash_desc       *hdesc;
+       int                  err;
+       const struct cfs_crypto_hash_type       *type;
+
+       hdesc = kmalloc(sizeof(*hdesc), 0);
+       if (hdesc == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       err = cfs_crypto_hash_alloc(alg_id, &type, hdesc, key, key_len);
+
+       if (err) {
+               kfree(hdesc);
+               return ERR_PTR(err);
+       }
+       return (struct cfs_crypto_hash_desc *)hdesc;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_init);
+
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc,
+                               struct page *page, unsigned int offset,
+                               unsigned int len)
+{
+       struct scatterlist sl;
+
+       sg_init_table(&sl, 1);
+       sg_set_page(&sl, page, len, offset & ~CFS_PAGE_MASK);
+
+       return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update_page);
+
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc,
+                          const void *buf, unsigned int buf_len)
+{
+       struct scatterlist sl;
+
+       sg_init_one(&sl, (void *)buf, buf_len);
+
+       return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update);
+
+/* If the hash_len pointer is NULL, just destroy the descriptor. */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc,
+                         unsigned char *hash, unsigned int *hash_len)
+{
+       int     err;
+       int     size = crypto_hash_digestsize(((struct hash_desc *)hdesc)->tfm);
+
+       if (hash_len == NULL) {
+               crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+               kfree(hdesc);
+               return 0;
+       }
+       if (hash == NULL || *hash_len < size) {
+               *hash_len = size;
+               return -ENOSPC;
+       }
+       err = crypto_hash_final((struct hash_desc *) hdesc, hash);
+
+       if (err < 0) {
+               /* the caller may be able to handle the error, keep the descriptor */
+               return err;
+       }
+       crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+       kfree(hdesc);
+       return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_final);
+
+static void cfs_crypto_performance_test(unsigned char alg_id,
+                                       const unsigned char *buf,
+                                       unsigned int buf_len)
+{
+       unsigned long              start, end;
+       int                          bcount, err = 0;
+       int                          sec = 1; /* do test only 1 sec */
+       unsigned char              hash[64];
+       unsigned int                hash_len = 64;
+
+       for (start = jiffies, end = start + sec * HZ, bcount = 0;
+            time_before(jiffies, end); bcount++) {
+               err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0,
+                                            hash, &hash_len);
+               if (err)
+                       break;
+
+       }
+       end = jiffies;
+
+       if (err) {
+               cfs_crypto_hash_speeds[alg_id] =  -1;
+               CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n",
+                      cfs_crypto_hash_name(alg_id), err);
+       } else {
+               unsigned long   tmp;
+               tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) *
+                      1000) / (1024 * 1024);
+               cfs_crypto_hash_speeds[alg_id] = (int)tmp;
+       }
+       CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n",
+              cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]);
+}
+
+int cfs_crypto_hash_speed(unsigned char hash_alg)
+{
+       if (hash_alg < CFS_HASH_ALG_MAX)
+               return cfs_crypto_hash_speeds[hash_alg];
+       else
+               return -1;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_speed);
+
+/**
+ * Do performance test for all hash algorithms.
+ */
+static int cfs_crypto_test_hashes(void)
+{
+       unsigned char      i;
+       unsigned char      *data;
+       unsigned int        j;
+       /* Data block size for testing hashes; the maximum kmalloc
+        * size on a 2.6.18 kernel is 128K */
+       unsigned int        data_len = 1 * 128 * 1024;
+
+       data = kmalloc(data_len, 0);
+       if (data == NULL) {
+               CERROR("Failed to allocate mem\n");
+               return -ENOMEM;
+       }
+
+       for (j = 0; j < data_len; j++)
+               data[j] = j & 0xff;
+
+       for (i = 0; i < CFS_HASH_ALG_MAX; i++)
+               cfs_crypto_performance_test(i, data, data_len);
+
+       kfree(data);
+       return 0;
+}
+
+static int crc32, adler32;
+
+#ifdef CONFIG_X86
+static int crc32pclmul;
+#endif
+
+int cfs_crypto_register(void)
+{
+       crc32 = cfs_crypto_crc32_register();
+       adler32 = cfs_crypto_adler32_register();
+
+#ifdef CONFIG_X86
+       crc32pclmul = cfs_crypto_crc32_pclmul_register();
+#endif
+
+       /* check all algorithms and do performance test */
+       cfs_crypto_test_hashes();
+       return 0;
+}
+void cfs_crypto_unregister(void)
+{
+       if (crc32 == 0)
+               cfs_crypto_crc32_unregister();
+       if (adler32 == 0)
+               cfs_crypto_adler32_unregister();
+
+#ifdef CONFIG_X86
+       if (crc32pclmul == 0)
+               cfs_crypto_crc32_pclmul_unregister();
+#endif
+
+       return;
+}
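
The file above exposes two ways to checksum data: the one-shot cfs_crypto_hash_digest() and the streaming cfs_crypto_hash_init()/update()/final() sequence, both of which report the required digest size through the -ENOSPC convention. The sketch below is illustrative only and not part of the patch; CFS_HASH_ALG_CRC32 is assumed to be one of the algorithm ids defined in libcfs_crypto.h.

static int example_checksum(const void *buf, unsigned int len)
{
        unsigned char hash[16];
        unsigned int hash_len = 0;
        int rc;

        /* with *hash_len == 0 the call fails with -ENOSPC and reports
         * the digest size the selected algorithm actually needs */
        rc = cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, buf, len,
                                    NULL, 0, hash, &hash_len);
        if (rc == -ENOSPC && hash_len <= sizeof(hash))
                rc = cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, buf, len,
                                            NULL, 0, hash, &hash_len);
        return rc;
}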
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
new file mode 100644 (file)
index 0000000..f236510
--- /dev/null
@@ -0,0 +1,339 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-curproc.c
+ *
+ * Lustre curproc API implementation for Linux kernel
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+
+#include <linux/compat.h>
+#include <linux/thread_info.h>
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * for Linux kernel.
+ */
+
+int    cfs_curproc_groups_nr(void)
+{
+       int nr;
+
+       task_lock(current);
+       nr = current_cred()->group_info->ngroups;
+       task_unlock(current);
+       return nr;
+}
+
+void   cfs_curproc_groups_dump(gid_t *array, int size)
+{
+       task_lock(current);
+       size = min_t(int, size, current_cred()->group_info->ngroups);
+       memcpy(array, current_cred()->group_info->blocks[0], size * sizeof(__u32));
+       task_unlock(current);
+}
+
+
+int    current_is_in_group(gid_t gid)
+{
+       return in_group_p(gid);
+}
+
+/* Currently all the CFS_CAP_* defines match CAP_* ones. */
+#define cfs_cap_pack(cap) (cap)
+#define cfs_cap_unpack(cap) (cap)
+
+void cfs_cap_raise(cfs_cap_t cap)
+{
+       struct cred *cred;
+       if ((cred = prepare_creds())) {
+               cap_raise(cred->cap_effective, cfs_cap_unpack(cap));
+               commit_creds(cred);
+       }
+}
+
+void cfs_cap_lower(cfs_cap_t cap)
+{
+       struct cred *cred;
+       if ((cred = prepare_creds())) {
+               cap_lower(cred->cap_effective, cfs_cap_unpack(cap));
+               commit_creds(cred);
+       }
+}
+
+int cfs_cap_raised(cfs_cap_t cap)
+{
+       return cap_raised(current_cap(), cfs_cap_unpack(cap));
+}
+
+void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap)
+{
+#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330
+       *cap = cfs_cap_pack(kcap);
+#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026
+       *cap = cfs_cap_pack(kcap[0]);
+#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522
+       /* XXX lost high byte */
+       *cap = cfs_cap_pack(kcap.cap[0]);
+#else
+       #error "need correct _KERNEL_CAPABILITY_VERSION "
+#endif
+}
+
+void cfs_kernel_cap_unpack(kernel_cap_t *kcap, cfs_cap_t cap)
+{
+#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330
+       *kcap = cfs_cap_unpack(cap);
+#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026
+       (*kcap)[0] = cfs_cap_unpack(cap);
+#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522
+       kcap->cap[0] = cfs_cap_unpack(cap);
+#else
+       #error "need correct _KERNEL_CAPABILITY_VERSION "
+#endif
+}
+
+cfs_cap_t cfs_curproc_cap_pack(void)
+{
+       cfs_cap_t cap;
+       cfs_kernel_cap_pack(current_cap(), &cap);
+       return cap;
+}
+
+void cfs_curproc_cap_unpack(cfs_cap_t cap)
+{
+       struct cred *cred;
+       if ((cred = prepare_creds())) {
+               cfs_kernel_cap_unpack(&cred->cap_effective, cap);
+               commit_creds(cred);
+       }
+}
+
+int cfs_capable(cfs_cap_t cap)
+{
+       return capable(cfs_cap_unpack(cap));
+}
+
+/* Check if task is running in 32-bit API mode, for the purpose of
+ * userspace binary interfaces.  On 32-bit Linux this is (unfortunately)
+ * always true, even if the application is using LARGEFILE64 and 64-bit
+ * APIs, because Linux provides no way for the filesystem to know if it
+ * is called via 32-bit or 64-bit APIs.  Other clients may vary.  On
+ * 64-bit systems, this will only be true if the binary is calling a
+ * 32-bit system call. */
+int current_is_32bit(void)
+{
+       return is_compat_task();
+}
+
+static int cfs_access_process_vm(struct task_struct *tsk, unsigned long addr,
+                                void *buf, int len, int write)
+{
+       /* Copied from the kernel, for kernels that do not export
+        * access_process_vm() */
+       struct mm_struct *mm;
+       struct vm_area_struct *vma;
+       struct page *page;
+       void *old_buf = buf;
+
+       mm = get_task_mm(tsk);
+       if (!mm)
+               return 0;
+
+       down_read(&mm->mmap_sem);
+       /* ignore errors, just check how much was successfully transferred */
+       while (len) {
+               int bytes, rc, offset;
+               void *maddr;
+
+               rc = get_user_pages(tsk, mm, addr, 1,
+                                    write, 1, &page, &vma);
+               if (rc <= 0)
+                       break;
+
+               bytes = len;
+               offset = addr & (PAGE_SIZE-1);
+               if (bytes > PAGE_SIZE-offset)
+                       bytes = PAGE_SIZE-offset;
+
+               maddr = kmap(page);
+               if (write) {
+                       copy_to_user_page(vma, page, addr,
+                                         maddr + offset, buf, bytes);
+                       set_page_dirty_lock(page);
+               } else {
+                       copy_from_user_page(vma, page, addr,
+                                           buf, maddr + offset, bytes);
+               }
+               kunmap(page);
+               page_cache_release(page);
+               len -= bytes;
+               buf += bytes;
+               addr += bytes;
+       }
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+
+       return buf - old_buf;
+}
+
+/* Read the environment variable of current process specified by @key. */
+int cfs_get_environ(const char *key, char *value, int *val_len)
+{
+       struct mm_struct *mm;
+       char *buffer, *tmp_buf = NULL;
+       int buf_len = PAGE_CACHE_SIZE;
+       int key_len = strlen(key);
+       unsigned long addr;
+       int rc;
+       ENTRY;
+
+       buffer = kmalloc(buf_len, GFP_USER);
+       if (!buffer)
+               RETURN(-ENOMEM);
+
+       mm = get_task_mm(current);
+       if (!mm) {
+               kfree(buffer);
+               RETURN(-EINVAL);
+       }
+
+       /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
+        * which is already holding mmap_sem for writes.  If some other
+        * thread gets the write lock in the meantime, this thread will
+        * block, but at least it won't deadlock on itself.  LU-1735 */
+       if (down_read_trylock(&mm->mmap_sem) == 0)
+               return -EDEADLK;
+       up_read(&mm->mmap_sem);
+
+       addr = mm->env_start;
+       while (addr < mm->env_end) {
+               int this_len, retval, scan_len;
+               char *env_start, *env_end;
+
+               memset(buffer, 0, buf_len);
+
+               this_len = min_t(int, mm->env_end - addr, buf_len);
+               retval = cfs_access_process_vm(current, addr, buffer,
+                                              this_len, 0);
+               if (retval != this_len)
+                       break;
+
+               addr += retval;
+
+               /* Parse the buffer to find out the specified key/value pair.
+                * The "key=value" entries are separated by '\0'. */
+               env_start = buffer;
+               scan_len = this_len;
+               while (scan_len) {
+                       char *entry;
+                       int entry_len;
+
+                       env_end = memscan(env_start, '\0', scan_len);
+                       LASSERT(env_end >= env_start &&
+                               env_end <= env_start + scan_len);
+
+                       /* The last entry of this buffer crosses the buffer
+                        * boundary; re-read it in the next cycle. */
+                       if (unlikely(env_end - env_start == scan_len)) {
+                               /* This entry is too large to fit in buffer */
+                               if (unlikely(scan_len == this_len)) {
+                                       CERROR("Too long env variable.\n");
+                                       GOTO(out, rc = -EINVAL);
+                               }
+                               addr -= scan_len;
+                               break;
+                       }
+
+                       entry = env_start;
+                       entry_len = env_end - env_start;
+
+                       /* Key length + length of '=' */
+                       if (entry_len > key_len + 1 &&
+                           !memcmp(entry, key, key_len)) {
+                               entry += key_len + 1;
+                               entry_len -= key_len + 1;
+                               /* The 'value' buffer passed in is too small.*/
+                               if (entry_len >= *val_len)
+                                       GOTO(out, rc = -EOVERFLOW);
+
+                               memcpy(value, entry, entry_len);
+                               *val_len = entry_len;
+                               GOTO(out, rc = 0);
+                       }
+
+                       scan_len -= (env_end - env_start + 1);
+                       env_start = env_end + 1;
+               }
+       }
+       GOTO(out, rc = -ENOENT);
+
+out:
+       mmput(mm);
+       kfree((void *)buffer);
+       if (tmp_buf)
+               kfree((void *)tmp_buf);
+       return rc;
+}
+EXPORT_SYMBOL(cfs_get_environ);
+
+EXPORT_SYMBOL(cfs_curproc_groups_nr);
+EXPORT_SYMBOL(cfs_curproc_groups_dump);
+EXPORT_SYMBOL(current_is_in_group);
+EXPORT_SYMBOL(cfs_cap_raise);
+EXPORT_SYMBOL(cfs_cap_lower);
+EXPORT_SYMBOL(cfs_cap_raised);
+EXPORT_SYMBOL(cfs_curproc_cap_pack);
+EXPORT_SYMBOL(cfs_curproc_cap_unpack);
+EXPORT_SYMBOL(cfs_capable);
+EXPORT_SYMBOL(current_is_32bit);
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
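
cfs_get_environ() above scans the environment pages of the current process through cfs_access_process_vm(), taking care not to deadlock on mmap_sem and re-reading entries that straddle the scan buffer. A minimal caller, sketched below for illustration only (the variable name and buffer size are hypothetical), passes the buffer size in *val_len and gets -EOVERFLOW back when the value does not fit.

static void example_read_env(void)
{
        char value[64];
        int len = sizeof(value);
        int rc;

        rc = cfs_get_environ("LUSTRE_JOBID", value, &len);
        if (rc == 0)
                CDEBUG(D_INFO, "value: %.*s\n", len, value); /* not NUL-terminated */
        else if (rc == -EOVERFLOW)
                CDEBUG(D_INFO, "value does not fit in %d bytes\n", len);
}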
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
new file mode 100644 (file)
index 0000000..e2c195b
--- /dev/null
@@ -0,0 +1,264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/version.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include "tracefile.h"
+
+#include <linux/kallsyms.h>
+
+char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall";
+char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall";
+
+/**
+ * Upcall function once a Lustre log has been dumped.
+ *
+ * \param file  path of the dumped log
+ */
+void libcfs_run_debug_log_upcall(char *file)
+{
+       char *argv[3];
+       int   rc;
+       char *envp[] = {
+               "HOME=/",
+               "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+               NULL};
+       ENTRY;
+
+       argv[0] = lnet_debug_log_upcall;
+
+       LASSERTF(file != NULL, "called on a null filename\n");
+       argv[1] = file; /* only the path of the file needs to be passed */
+
+       argv[2] = NULL;
+
+       rc = USERMODEHELPER(argv[0], argv, envp);
+       if (rc < 0 && rc != -ENOENT) {
+               CERROR("Error %d invoking LNET debug log upcall %s %s; "
+                      "check /proc/sys/lnet/debug_log_upcall\n",
+                      rc, argv[0], argv[1]);
+       } else {
+               CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n",
+                      argv[0], argv[1]);
+       }
+
+       EXIT;
+}
+
+void libcfs_run_upcall(char **argv)
+{
+       int   rc;
+       int   argc;
+       char *envp[] = {
+               "HOME=/",
+               "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+               NULL};
+       ENTRY;
+
+       argv[0] = lnet_upcall;
+       argc = 1;
+       while (argv[argc] != NULL)
+               argc++;
+
+       LASSERT(argc >= 2);
+
+       rc = USERMODEHELPER(argv[0], argv, envp);
+       if (rc < 0 && rc != -ENOENT) {
+               CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; "
+                      "check /proc/sys/lnet/upcall\n",
+                      rc, argv[0], argv[1],
+                      argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+                      argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+                      argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+                      argc < 6 ? "" : ",...");
+       } else {
+               CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n",
+                      argv[0], argv[1],
+                      argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+                      argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+                      argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+                      argc < 6 ? "" : ",...");
+       }
+}
+
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata)
+{
+       char *argv[6];
+       char buf[32];
+
+       ENTRY;
+       snprintf (buf, sizeof buf, "%d", msgdata->msg_line);
+
+       argv[1] = "LBUG";
+       argv[2] = (char *)msgdata->msg_file;
+       argv[3] = (char *)msgdata->msg_fn;
+       argv[4] = buf;
+       argv[5] = NULL;
+
+       libcfs_run_upcall (argv);
+}
+
+/* coverity[+kill] */
+void lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
+{
+       libcfs_catastrophe = 1;
+       libcfs_debug_msg(msgdata, "LBUG\n");
+
+       if (in_interrupt()) {
+               panic("LBUG in interrupt.\n");
+               /* not reached */
+       }
+
+       libcfs_debug_dumpstack(NULL);
+       if (!libcfs_panic_on_lbug)
+               libcfs_debug_dumplog();
+       libcfs_run_lbug_upcall(msgdata);
+       if (libcfs_panic_on_lbug)
+               panic("LBUG");
+       set_task_state(current, TASK_UNINTERRUPTIBLE);
+       while (1)
+               schedule();
+}
+
+
+#include <linux/nmi.h>
+#include <asm/stacktrace.h>
+
+
+static int print_trace_stack(void *data, char *name)
+{
+       printk(" <%s> ", name);
+       return 0;
+}
+
+# define RELIABLE reliable
+# define DUMP_TRACE_CONST const
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+       char fmt[32];
+       touch_nmi_watchdog();
+       sprintf(fmt, " [<%016lx>] %s%%s\n", addr, RELIABLE ? "": "? ");
+       __print_symbol(fmt, addr);
+}
+
+static DUMP_TRACE_CONST struct stacktrace_ops print_trace_ops = {
+       .stack = print_trace_stack,
+       .address = print_trace_address,
+       .walk_stack = print_context_stack,
+};
+
+void libcfs_debug_dumpstack(struct task_struct *tsk)
+{
+       /* dump_stack() */
+       /* show_trace() */
+       if (tsk == NULL)
+               tsk = current;
+       printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm);
+       /* show_trace_log_lvl() */
+       printk("\nCall Trace:\n");
+       dump_trace(tsk, NULL, NULL,
+                  0,
+                  &print_trace_ops, NULL);
+       printk("\n");
+}
+
+task_t *libcfs_current(void)
+{
+       CWARN("current task struct is %p\n", current);
+       return current;
+}
+
+static int panic_notifier(struct notifier_block *self, unsigned long unused1,
+                        void *unused2)
+{
+       if (libcfs_panic_in_progress)
+               return 0;
+
+       libcfs_panic_in_progress = 1;
+       mb();
+
+       return 0;
+}
+
+static struct notifier_block libcfs_panic_notifier = {
+       notifier_call :     panic_notifier,
+       next :        NULL,
+       priority :        10000
+};
+
+void libcfs_register_panic_notifier(void)
+{
+       atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+void libcfs_unregister_panic_notifier(void)
+{
+       atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+EXPORT_SYMBOL(libcfs_debug_dumpstack);
+EXPORT_SYMBOL(libcfs_current);
+
+
+EXPORT_SYMBOL(libcfs_run_upcall);
+EXPORT_SYMBOL(libcfs_run_lbug_upcall);
+EXPORT_SYMBOL(lbug_with_loc);
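
libcfs_run_upcall() above expects a NULL-terminated argv with at least one argument after the program slot; it overwrites argv[0] with the path configured in /proc/sys/lnet/upcall, exactly as libcfs_run_lbug_upcall() does for LBUG events. The sketch below is illustrative only and not part of the patch; the event string is hypothetical.

static void example_notify_event(const char *event)
{
        char *argv[3];

        argv[0] = NULL;                 /* replaced with lnet_upcall */
        argv[1] = (char *)event;
        argv[2] = NULL;

        libcfs_run_upcall(argv);
}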
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c
new file mode 100644 (file)
index 0000000..cb96969
--- /dev/null
@@ -0,0 +1,113 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/ctype.h>
+#include <asm/uaccess.h>
+
+#include <linux/libcfs/libcfs.h>
+
+/* write a userspace buffer to disk.
+ * NOTE: this returns 0 on success, not the number of bytes written. */
+ssize_t
+filp_user_write(struct file *filp, const void *buf, size_t count,
+               loff_t *offset)
+{
+       mm_segment_t fs;
+       ssize_t size = 0;
+
+       fs = get_fs();
+       set_fs(KERNEL_DS);
+       while ((ssize_t)count > 0) {
+               size = vfs_write(filp, (const void __user *)buf, count, offset);
+               if (size < 0)
+                       break;
+               count -= size;
+               buf += size;
+               size = 0;
+       }
+       set_fs(fs);
+
+       return size;
+}
+EXPORT_SYMBOL(filp_user_write);
+
+#if !(CFS_O_CREAT == O_CREAT && CFS_O_EXCL == O_EXCL &&        \
+     CFS_O_NOACCESS == O_NOACCESS &&\
+     CFS_O_TRUNC == O_TRUNC && CFS_O_APPEND == O_APPEND &&\
+     CFS_O_NONBLOCK == O_NONBLOCK && CFS_O_NDELAY == O_NDELAY &&\
+     CFS_O_SYNC == O_SYNC && CFS_O_ASYNC == FASYNC &&\
+     CFS_O_DIRECT == O_DIRECT && CFS_O_LARGEFILE == O_LARGEFILE &&\
+     CFS_O_DIRECTORY == O_DIRECTORY && CFS_O_NOFOLLOW == O_NOFOLLOW)
+
+int cfs_oflags2univ(int flags)
+{
+       int f;
+
+       f = flags & O_NOACCESS;
+       f |= (flags & O_CREAT) ? CFS_O_CREAT: 0;
+       f |= (flags & O_EXCL) ? CFS_O_EXCL: 0;
+       f |= (flags & O_NOCTTY) ? CFS_O_NOCTTY: 0;
+       f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0;
+       f |= (flags & O_APPEND) ? CFS_O_APPEND: 0;
+       f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0;
+       f |= (flags & O_SYNC)? CFS_O_SYNC: 0;
+       f |= (flags & FASYNC)? CFS_O_ASYNC: 0;
+       f |= (flags & O_DIRECTORY)? CFS_O_DIRECTORY: 0;
+       f |= (flags & O_DIRECT)? CFS_O_DIRECT: 0;
+       f |= (flags & O_LARGEFILE)? CFS_O_LARGEFILE: 0;
+       f |= (flags & O_NOFOLLOW)? CFS_O_NOFOLLOW: 0;
+       f |= (flags & O_NOATIME)? CFS_O_NOATIME: 0;
+       return f;
+}
+#else
+
+int cfs_oflags2univ(int flags)
+{
+       return (flags);
+}
+#endif
+EXPORT_SYMBOL(cfs_oflags2univ);
+
+/*
+ * XXX Liang: we don't need cfs_univ2oflags() now.
+ */
+int cfs_univ2oflags(int flags)
+{
+       return (flags);
+}
+EXPORT_SYMBOL(cfs_univ2oflags);
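
filp_user_write() above loops over vfs_write() under KERNEL_DS until the whole buffer is written and, as its comment notes, returns 0 on success rather than a byte count. A hedged usage sketch follows; it is not part of the patch and the file path is hypothetical.

static int example_dump_buffer(const void *buf, size_t len)
{
        struct file *filp;
        loff_t off = 0;
        ssize_t rc;

        filp = filp_open("/tmp/lustre-dump", O_WRONLY | O_CREAT, 0600);
        if (IS_ERR(filp))
                return PTR_ERR(filp);

        rc = filp_user_write(filp, buf, len, &off);     /* 0 on success */
        filp_close(filp, NULL);

        return (int)rc;
}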
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c
new file mode 100644 (file)
index 0000000..6f7162e
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <arch-linux/cfs_lock.h>
+#include <linux/libcfs/libcfs.h>
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c
new file mode 100644 (file)
index 0000000..3be3ede
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * NB: some of the above functions will be renamed in a later patch:
+ * - rename kmalloc to cfs_malloc
+ * - rename kmalloc/free_page to cfs_page_alloc/free
+ * - rename kmalloc/free_large to cfs_vmalloc/vfree
+ */
+
+void *
+cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt,
+              size_t nr_bytes, unsigned int flags)
+{
+       void    *ptr;
+
+       ptr = kmalloc_node(nr_bytes, flags,
+                          cfs_cpt_spread_node(cptab, cpt));
+       if (ptr != NULL && (flags & __GFP_ZERO) != 0)
+               memset(ptr, 0, nr_bytes);
+
+       return ptr;
+}
+EXPORT_SYMBOL(cfs_cpt_malloc);
+
+void *
+cfs_cpt_vmalloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes)
+{
+       return vmalloc_node(nr_bytes, cfs_cpt_spread_node(cptab, cpt));
+}
+EXPORT_SYMBOL(cfs_cpt_vmalloc);
+
+struct page *
+cfs_page_cpt_alloc(struct cfs_cpt_table *cptab, int cpt, unsigned int flags)
+{
+       return alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), flags, 0);
+}
+EXPORT_SYMBOL(cfs_page_cpt_alloc);
+
+void *
+cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab,
+                       int cpt, unsigned int flags)
+{
+       return kmem_cache_alloc_node(cachep, flags,
+                                    cfs_cpt_spread_node(cptab, cpt));
+}
+EXPORT_SYMBOL(cfs_mem_cache_cpt_alloc);
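
The helpers above steer allocations to the NUMA node that backs a given CPU partition via cfs_cpt_spread_node(); cfs_cpt_malloc() also honours __GFP_ZERO with an explicit memset(). The sketch below is illustrative only and not part of the patch; the structure is hypothetical and cfs_cpt_table is the global partition table used elsewhere in libcfs.

struct example_pcpu_state {
        spinlock_t      eps_lock;
        unsigned long   eps_count;
};

static struct example_pcpu_state *example_alloc_state(int cpt)
{
        struct example_pcpu_state *eps;

        /* zeroed allocation on the node serving this CPU partition */
        eps = cfs_cpt_malloc(cfs_cpt_table, cpt, sizeof(*eps),
                             GFP_KERNEL | __GFP_ZERO);
        if (eps != NULL)
                spin_lock_init(&eps->eps_lock);

        return eps;
}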
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
new file mode 100644 (file)
index 0000000..2c7d4a3
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define LNET_MINOR 240
+
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
+{
+       struct libcfs_ioctl_hdr   *hdr;
+       struct libcfs_ioctl_data  *data;
+       int err;
+       ENTRY;
+
+       hdr = (struct libcfs_ioctl_hdr *)buf;
+       data = (struct libcfs_ioctl_data *)buf;
+
+       err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
+       if (err)
+               RETURN(err);
+
+       if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+               CERROR("PORTALS: version mismatch kernel vs application\n");
+               RETURN(-EINVAL);
+       }
+
+       if (hdr->ioc_len + buf >= end) {
+               CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+               RETURN(-EINVAL);
+       }
+
+
+       if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
+               CERROR("PORTALS: user buffer too small for ioctl\n");
+               RETURN(-EINVAL);
+       }
+
+       err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
+       if (err)
+               RETURN(err);
+
+       if (libcfs_ioctl_is_invalid(data)) {
+               CERROR("PORTALS: ioctl not correctly formatted\n");
+               RETURN(-EINVAL);
+       }
+
+       if (data->ioc_inllen1)
+               data->ioc_inlbuf1 = &data->ioc_bulk[0];
+
+       if (data->ioc_inllen2)
+               data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                       cfs_size_round(data->ioc_inllen1);
+
+       RETURN(0);
+}
+
+int libcfs_ioctl_popdata(void *arg, void *data, int size)
+{
+       if (copy_to_user((char *)arg, data, size))
+               return -EFAULT;
+       return 0;
+}
+
+extern struct cfs_psdev_ops      libcfs_psdev_ops;
+
+static int
+libcfs_psdev_open(struct inode * inode, struct file * file)
+{
+       struct libcfs_device_userstate **pdu = NULL;
+       int    rc = 0;
+
+       if (!inode)
+               return (-EINVAL);
+       pdu = (struct libcfs_device_userstate **)&file->private_data;
+       if (libcfs_psdev_ops.p_open != NULL)
+               rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
+       else
+               return (-EPERM);
+       return rc;
+}
+
+/* called when closing /dev/device */
+static int
+libcfs_psdev_release(struct inode * inode, struct file * file)
+{
+       struct libcfs_device_userstate *pdu;
+       int    rc = 0;
+
+       if (!inode)
+               return (-EINVAL);
+       pdu = file->private_data;
+       if (libcfs_psdev_ops.p_close != NULL)
+               rc = libcfs_psdev_ops.p_close(0, (void *)pdu);
+       else
+               rc = -EPERM;
+       return rc;
+}
+
+static long libcfs_ioctl(struct file *file,
+                        unsigned int cmd, unsigned long arg)
+{
+       struct cfs_psdev_file    pfile;
+       int    rc = 0;
+
+       if (current_fsuid() != 0)
+               return -EACCES;
+
+       if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
+            _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  ||
+            _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) {
+               CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+                      _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+               return (-EINVAL);
+       }
+
+       /* Handle platform-dependent IOC requests */
+       switch (cmd) {
+       case IOC_LIBCFS_PANIC:
+               if (!cfs_capable(CFS_CAP_SYS_BOOT))
+                       return (-EPERM);
+               panic("debugctl-invoked panic");
+               return (0);
+       case IOC_LIBCFS_MEMHOG:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       return -EPERM;
+               /* fall through */
+       }
+
+       pfile.off = 0;
+       pfile.private_data = file->private_data;
+       if (libcfs_psdev_ops.p_ioctl != NULL)
+               rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
+       else
+               rc = -EPERM;
+       return (rc);
+}
+
+static struct file_operations libcfs_fops = {
+       unlocked_ioctl: libcfs_ioctl,
+       open :    libcfs_psdev_open,
+       release :       libcfs_psdev_release
+};
+
+psdev_t libcfs_dev = {
+       LNET_MINOR,
+       "lnet",
+       &libcfs_fops
+};
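
libcfs_ioctl_getdata() above copies the header first to learn ioc_len, re-copies the full buffer after validating it, and then points ioc_inlbuf1 and ioc_inlbuf2 into ioc_bulk, the second at a cfs_size_round()-aligned offset. A handler can therefore use the inline buffers directly, as in the illustrative sketch below (not part of the patch; the handler name is hypothetical).

static int example_handle_ioctl(struct libcfs_ioctl_data *data)
{
        /* ioc_inlbuf1/2 were fixed up by libcfs_ioctl_getdata() */
        if (data->ioc_inllen1 != 0)
                CDEBUG(D_IOCTL, "inlbuf1: %.*s\n",
                       (int)data->ioc_inllen1, data->ioc_inlbuf1);
        if (data->ioc_inllen2 != 0)
                CDEBUG(D_IOCTL, "inlbuf2: %.*s\n",
                       (int)data->ioc_inllen2, data->ioc_inlbuf2);
        return 0;
}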
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
new file mode 100644 (file)
index 0000000..b652a79
--- /dev/null
@@ -0,0 +1,259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs_struct.h>
+#include <linux/sched.h>
+
+#include <linux/libcfs/libcfs.h>
+
+#if defined(CONFIG_KGDB)
+#include <asm/kgdb.h>
+#endif
+
+#define LINUX_WAITQ(w) ((wait_queue_t *) w)
+#define LINUX_WAITQ_HEAD(w) ((wait_queue_head_t *) w)
+
+void
+init_waitqueue_entry_current(wait_queue_t *link)
+{
+       init_waitqueue_entry(LINUX_WAITQ(link), current);
+}
+EXPORT_SYMBOL(init_waitqueue_entry_current);
+
+/**
+ * In Linux (version < 2.6.34), wait_queue_t keeps exclusively waiting
+ * threads on a FIFO list.  This is not always desirable: all threads are
+ * woken again and again even when only a few of them need to be active
+ * most of the time, which hurts performance because CPU caches are
+ * polluted by threads that immediately go back to sleep.
+ *
+ * A LIFO list avoids this problem because, by default, the most recently
+ * active thread is woken first.
+ *
+ * NB: do not mix non-exclusive and exclusive waits on the same waitq
+ * if add_wait_queue_exclusive_head() is used.
+ */
+void
+add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&LINUX_WAITQ_HEAD(waitq)->lock, flags);
+       __add_wait_queue_exclusive(LINUX_WAITQ_HEAD(waitq), LINUX_WAITQ(link));
+       spin_unlock_irqrestore(&LINUX_WAITQ_HEAD(waitq)->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_head);
+
+void
+waitq_wait(wait_queue_t *link, cfs_task_state_t state)
+{
+       schedule();
+}
+EXPORT_SYMBOL(waitq_wait);
+
+int64_t
+waitq_timedwait(wait_queue_t *link, cfs_task_state_t state,
+                   int64_t timeout)
+{
+       return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(waitq_timedwait);
+
+void
+schedule_timeout_and_set_state(cfs_task_state_t state, int64_t timeout)
+{
+       set_current_state(state);
+       schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_and_set_state);
+
+/* deschedule for a bit... */
+void
+cfs_pause(cfs_duration_t ticks)
+{
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       schedule_timeout(ticks);
+}
+EXPORT_SYMBOL(cfs_pause);
+
+void cfs_init_timer(timer_list_t *t)
+{
+       init_timer(t);
+}
+EXPORT_SYMBOL(cfs_init_timer);
+
+void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg)
+{
+       init_timer(t);
+       t->function = func;
+       t->data = (unsigned long)arg;
+}
+EXPORT_SYMBOL(cfs_timer_init);
+
+void cfs_timer_done(timer_list_t *t)
+{
+       return;
+}
+EXPORT_SYMBOL(cfs_timer_done);
+
+void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline)
+{
+       mod_timer(t, deadline);
+}
+EXPORT_SYMBOL(cfs_timer_arm);
+
+void cfs_timer_disarm(timer_list_t *t)
+{
+       del_timer(t);
+}
+EXPORT_SYMBOL(cfs_timer_disarm);
+
+int  cfs_timer_is_armed(timer_list_t *t)
+{
+       return timer_pending(t);
+}
+EXPORT_SYMBOL(cfs_timer_is_armed);
+
+cfs_time_t cfs_timer_deadline(timer_list_t *t)
+{
+       return t->expires;
+}
+EXPORT_SYMBOL(cfs_timer_deadline);
+
+void cfs_enter_debugger(void)
+{
+#if defined(CONFIG_KGDB)
+//     BREAKPOINT();
+#else
+       /* nothing */
+#endif
+}
+
+
+sigset_t
+cfs_block_allsigs(void)
+{
+       unsigned long     flags;
+       sigset_t        old;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       old = current->blocked;
+       sigfillset(&current->blocked);
+       recalc_sigpending();
+       SIGNAL_MASK_UNLOCK(current, flags);
+
+       return old;
+}
+
+sigset_t cfs_block_sigs(unsigned long sigs)
+{
+       unsigned long  flags;
+       sigset_t        old;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       old = current->blocked;
+       sigaddsetmask(&current->blocked, sigs);
+       recalc_sigpending();
+       SIGNAL_MASK_UNLOCK(current, flags);
+       return old;
+}
+
+/* Block all signals except for the @sigs */
+sigset_t cfs_block_sigsinv(unsigned long sigs)
+{
+       unsigned long flags;
+       sigset_t old;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       old = current->blocked;
+       sigaddsetmask(&current->blocked, ~sigs);
+       recalc_sigpending();
+       SIGNAL_MASK_UNLOCK(current, flags);
+
+       return old;
+}
+
+void
+cfs_restore_sigs (sigset_t old)
+{
+       unsigned long  flags;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       current->blocked = old;
+       recalc_sigpending();
+       SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+int
+cfs_signal_pending(void)
+{
+       return signal_pending(current);
+}
+
+void
+cfs_clear_sigpending(void)
+{
+       unsigned long flags;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       clear_tsk_thread_flag(current, TIF_SIGPENDING);
+       SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+int
+libcfs_arch_init(void)
+{
+       return 0;
+}
+
+void
+libcfs_arch_cleanup(void)
+{
+       return;
+}
+
+EXPORT_SYMBOL(libcfs_arch_init);
+EXPORT_SYMBOL(libcfs_arch_cleanup);
+EXPORT_SYMBOL(cfs_enter_debugger);
+EXPORT_SYMBOL(cfs_block_allsigs);
+EXPORT_SYMBOL(cfs_block_sigs);
+EXPORT_SYMBOL(cfs_block_sigsinv);
+EXPORT_SYMBOL(cfs_restore_sigs);
+EXPORT_SYMBOL(cfs_signal_pending);
+EXPORT_SYMBOL(cfs_clear_sigpending);
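
The signal helpers above block or restore signal delivery under SIGNAL_MASK_LOCK for the current task; the usual pattern pairs cfs_block_allsigs() with cfs_restore_sigs() around a step that must not be interrupted, as sketched below for illustration only (not part of the patch).

static void example_quiet_pause(void)
{
        sigset_t old;

        old = cfs_block_allsigs();      /* no signal delivery while sleeping */
        cfs_pause(cfs_time_seconds(1));
        cfs_restore_sigs(old);
}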
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c
new file mode 100644 (file)
index 0000000..522b28e
--- /dev/null
@@ -0,0 +1,580 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-proc.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <asm/div64.h>
+#include "tracefile.h"
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_header_t *lnet_table_header = NULL;
+#endif
+extern char lnet_upcall[1024];
+/**
+ * The path of debug log dump upcall script.
+ */
+extern char lnet_debug_log_upcall[1024];
+
+#define CTL_LNET       (0x100)
+enum {
+       PSDEV_DEBUG = 1,          /* control debugging */
+       PSDEV_SUBSYSTEM_DEBUG,    /* control debugging */
+       PSDEV_PRINTK,        /* force all messages to console */
+       PSDEV_CONSOLE_RATELIMIT,  /* ratelimit console messages */
+       PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */
+       PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */
+       PSDEV_CONSOLE_BACKOFF,    /* delay increase factor */
+       PSDEV_DEBUG_PATH,        /* crashdump log location */
+       PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location */
+       PSDEV_CPT_TABLE,          /* information about cpu partitions */
+       PSDEV_LNET_UPCALL,      /* User mode upcall script  */
+       PSDEV_LNET_MEMUSED,       /* bytes currently PORTAL_ALLOCated */
+       PSDEV_LNET_CATASTROPHE,   /* if we have LBUGged or panic'd */
+       PSDEV_LNET_PANIC_ON_LBUG, /* flag to panic on LBUG */
+       PSDEV_LNET_DUMP_KERNEL,   /* snapshot kernel debug buffer to file */
+       PSDEV_LNET_DAEMON_FILE,   /* spool kernel debug buffer to file */
+       PSDEV_LNET_DEBUG_MB,      /* size of debug buffer */
+       PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */
+       PSDEV_LNET_WATCHDOG_RATELIMIT,  /* ratelimit watchdog messages  */
+       PSDEV_LNET_FORCE_LBUG,    /* hook to force an LBUG */
+       PSDEV_LNET_FAIL_LOC,      /* control test failures instrumentation */
+       PSDEV_LNET_FAIL_VAL,      /* userdata for fail loc */
+};
+
+int
+proc_call_handler(void *data, int write,
+                 loff_t *ppos, void *buffer, size_t *lenp,
+                 int (*handler)(void *data, int write,
+                                loff_t pos, void *buffer, int len))
+{
+       int rc = handler(data, write, *ppos, buffer, *lenp);
+
+       if (rc < 0)
+               return rc;
+
+       if (write) {
+               *ppos += *lenp;
+       } else {
+               *lenp = rc;
+               *ppos += rc;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(proc_call_handler);
+
+static int __proc_dobitmasks(void *data, int write,
+                            loff_t pos, void *buffer, int nob)
+{
+       const int     tmpstrlen = 512;
+       char     *tmpstr;
+       int        rc;
+       unsigned int *mask = data;
+       int        is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0;
+       int        is_printk = (mask == &libcfs_printk) ? 1 : 0;
+
+       rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen);
+       if (rc < 0)
+               return rc;
+
+       if (!write) {
+               libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys);
+               rc = strlen(tmpstr);
+
+               if (pos >= rc) {
+                       rc = 0;
+               } else {
+                       rc = cfs_trace_copyout_string(buffer, nob,
+                                                     tmpstr + pos, "\n");
+               }
+       } else {
+               rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob);
+               if (rc < 0) {
+                       cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+                       return rc;
+               }
+
+               rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys);
+               /* Always print LBUG/LASSERT to console, so keep this mask */
+               if (is_printk)
+                       *mask |= D_EMERG;
+       }
+
+       cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+       return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_dobitmasks)
+
+static int min_watchdog_ratelimit = 0;   /* disable ratelimiting */
+static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */
+
+static int __proc_dump_kernel(void *data, int write,
+                             loff_t pos, void *buffer, int nob)
+{
+       if (!write)
+               return 0;
+
+       return cfs_trace_dump_debug_buffer_usrstr(buffer, nob);
+}
+
+DECLARE_PROC_HANDLER(proc_dump_kernel)
+
+static int __proc_daemon_file(void *data, int write,
+                             loff_t pos, void *buffer, int nob)
+{
+       if (!write) {
+               int len = strlen(cfs_tracefile);
+
+               if (pos >= len)
+                       return 0;
+
+               return cfs_trace_copyout_string(buffer, nob,
+                                               cfs_tracefile + pos, "\n");
+       }
+
+       return cfs_trace_daemon_command_usrstr(buffer, nob);
+}
+
+DECLARE_PROC_HANDLER(proc_daemon_file)
+
+static int __proc_debug_mb(void *data, int write,
+                          loff_t pos, void *buffer, int nob)
+{
+       if (!write) {
+               char tmpstr[32];
+               int  len = snprintf(tmpstr, sizeof(tmpstr), "%d",
+                                   cfs_trace_get_debug_mb());
+
+               if (pos >= len)
+                       return 0;
+
+               return cfs_trace_copyout_string(buffer, nob, tmpstr + pos,
+                      "\n");
+       }
+
+       return cfs_trace_set_debug_mb_usrstr(buffer, nob);
+}
+
+DECLARE_PROC_HANDLER(proc_debug_mb)
+
+int LL_PROC_PROTO(proc_console_max_delay_cs)
+{
+       int rc, max_delay_cs;
+       ctl_table_t dummy = *table;
+       cfs_duration_t d;
+
+       dummy.data = &max_delay_cs;
+       dummy.proc_handler = &proc_dointvec;
+
+       if (!write) { /* read */
+               max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100);
+               rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+               return rc;
+       }
+
+       /* write */
+       max_delay_cs = 0;
+       rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+       if (rc < 0)
+               return rc;
+       if (max_delay_cs <= 0)
+               return -EINVAL;
+
+       d = cfs_time_seconds(max_delay_cs) / 100;
+       if (d == 0 || d < libcfs_console_min_delay)
+               return -EINVAL;
+       libcfs_console_max_delay = d;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_console_min_delay_cs)
+{
+       int rc, min_delay_cs;
+       ctl_table_t dummy = *table;
+       cfs_duration_t d;
+
+       dummy.data = &min_delay_cs;
+       dummy.proc_handler = &proc_dointvec;
+
+       if (!write) { /* read */
+               min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100);
+               rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+               return rc;
+       }
+
+       /* write */
+       min_delay_cs = 0;
+       rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+       if (rc < 0)
+               return rc;
+       if (min_delay_cs <= 0)
+               return -EINVAL;
+
+       d = cfs_time_seconds(min_delay_cs) / 100;
+       if (d == 0 || d > libcfs_console_max_delay)
+               return -EINVAL;
+       libcfs_console_min_delay = d;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_console_backoff)
+{
+       int rc, backoff;
+       ctl_table_t dummy = *table;
+
+       dummy.data = &backoff;
+       dummy.proc_handler = &proc_dointvec;
+
+       if (!write) { /* read */
+               backoff = libcfs_console_backoff;
+               rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+               return rc;
+       }
+
+       /* write */
+       backoff = 0;
+       rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+       if (rc < 0)
+               return rc;
+       if (backoff <= 0)
+               return -EINVAL;
+
+       libcfs_console_backoff = backoff;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(libcfs_force_lbug)
+{
+       if (write)
+               LBUG();
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_fail_loc)
+{
+       int rc;
+       long old_fail_loc = cfs_fail_loc;
+
+       rc = ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos);
+       if (old_fail_loc != cfs_fail_loc)
+               wake_up(&cfs_race_waitq);
+       return rc;
+}
+
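+/*
+ * Read-only handler for "cpu_partition_table": formats the current CPU
+ * partition table, doubling the temporary buffer on -EFBIG until the
+ * output fits.  Writes are rejected with -EPERM.
+ */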
+static int __proc_cpt_table(void *data, int write,
+                           loff_t pos, void *buffer, int nob)
+{
+       char *buf = NULL;
+       int   len = 4096;
+       int   rc  = 0;
+
+       if (write)
+               return -EPERM;
+
+       LASSERT(cfs_cpt_table != NULL);
+
+       while (1) {
+               LIBCFS_ALLOC(buf, len);
+               if (buf == NULL)
+                       return -ENOMEM;
+
+               rc = cfs_cpt_table_print(cfs_cpt_table, buf, len);
+               if (rc >= 0)
+                       break;
+
+               LIBCFS_FREE(buf, len);
+               if (rc == -EFBIG) {
+                       len <<= 1;
+                       continue;
+               }
+               goto out;
+       }
+
+       if (pos >= rc) {
+               rc = 0;
+               goto out;
+       }
+
+       rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+       if (buf != NULL)
+               LIBCFS_FREE(buf, len);
+       return rc;
+}
+DECLARE_PROC_HANDLER(proc_cpt_table)
+
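+/* Entries exported under /proc/sys/lnet (e.g. /proc/sys/lnet/debug) */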
+static ctl_table_t lnet_table[] = {
+       /*
+        * NB No .strategy entries have been provided since sysctl(8) prefers
+        * to go via /proc for portability.
+        */
+       {
+               INIT_CTL_NAME(PSDEV_DEBUG)
+               .procname = "debug",
+               .data     = &libcfs_debug,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dobitmasks,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_SUBSYSTEM_DEBUG)
+               .procname = "subsystem_debug",
+               .data     = &libcfs_subsystem_debug,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dobitmasks,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_PRINTK)
+               .procname = "printk",
+               .data     = &libcfs_printk,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dobitmasks,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_CONSOLE_RATELIMIT)
+               .procname = "console_ratelimit",
+               .data     = &libcfs_console_ratelimit,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(PSDEV_CONSOLE_MAX_DELAY_CS)
+               .procname = "console_max_delay_centisecs",
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_console_max_delay_cs
+       },
+       {
+               INIT_CTL_NAME(PSDEV_CONSOLE_MIN_DELAY_CS)
+               .procname = "console_min_delay_centisecs",
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_console_min_delay_cs
+       },
+       {
+               INIT_CTL_NAME(PSDEV_CONSOLE_BACKOFF)
+               .procname = "console_backoff",
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_console_backoff
+       },
+
+       {
+               INIT_CTL_NAME(PSDEV_DEBUG_PATH)
+               .procname = "debug_path",
+               .data     = libcfs_debug_file_path_arr,
+               .maxlen   = sizeof(libcfs_debug_file_path_arr),
+               .mode     = 0644,
+               .proc_handler = &proc_dostring,
+       },
+
+       {
+               INIT_CTL_NAME(PSDEV_CPT_TABLE)
+               .procname = "cpu_partition_table",
+               .maxlen   = 128,
+               .mode     = 0444,
+               .proc_handler = &proc_cpt_table,
+       },
+
+       {
+               INIT_CTL_NAME(PSDEV_LNET_UPCALL)
+               .procname = "upcall",
+               .data     = lnet_upcall,
+               .maxlen   = sizeof(lnet_upcall),
+               .mode     = 0644,
+               .proc_handler = &proc_dostring,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_DEBUG_LOG_UPCALL)
+               .procname = "debug_log_upcall",
+               .data     = lnet_debug_log_upcall,
+               .maxlen   = sizeof(lnet_debug_log_upcall),
+               .mode     = 0644,
+               .proc_handler = &proc_dostring,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_MEMUSED)
+               .procname = "lnet_memused",
+               .data     = (int *)&libcfs_kmemory.counter,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               INIT_STRATEGY(&sysctl_intvec)
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_CATASTROPHE)
+               .procname = "catastrophe",
+               .data     = &libcfs_catastrophe,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               INIT_STRATEGY(&sysctl_intvec)
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_PANIC_ON_LBUG)
+               .procname = "panic_on_lbug",
+               .data     = &libcfs_panic_on_lbug,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               INIT_STRATEGY(&sysctl_intvec)
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_DUMP_KERNEL)
+               .procname = "dump_kernel",
+               .maxlen   = 256,
+               .mode     = 0200,
+               .proc_handler = &proc_dump_kernel,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_DAEMON_FILE)
+               .procname = "daemon_file",
+               .mode     = 0644,
+               .maxlen   = 256,
+               .proc_handler = &proc_daemon_file,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_DEBUG_MB)
+               .procname = "debug_mb",
+               .mode     = 0644,
+               .proc_handler = &proc_debug_mb,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_WATCHDOG_RATELIMIT)
+               .procname = "watchdog_ratelimit",
+               .data     = &libcfs_watchdog_ratelimit,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec_minmax,
+               .extra1   = &min_watchdog_ratelimit,
+               .extra2   = &max_watchdog_ratelimit,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_FORCE_LBUG)
+               .procname = "force_lbug",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0200,
+               .proc_handler = &libcfs_force_lbug
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_FAIL_LOC)
+               .procname = "fail_loc",
+               .data     = &cfs_fail_loc,
+               .maxlen   = sizeof(cfs_fail_loc),
+               .mode     = 0644,
+               .proc_handler = &proc_fail_loc
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_FAIL_VAL)
+               .procname = "fail_val",
+               .data     = &cfs_fail_val,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(0)
+       }
+};
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_t top_table[] = {
+       {
+               INIT_CTL_NAME(CTL_LNET)
+               .procname = "lnet",
+               .mode     = 0555,
+               .data     = NULL,
+               .maxlen   = 0,
+               .child    = lnet_table,
+       },
+       {
+               INIT_CTL_NAME(0)
+       }
+};
+#endif
+
+int insert_proc(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (lnet_table_header == NULL)
+               lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+       return 0;
+}
+
+void remove_proc(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (lnet_table_header != NULL)
+               unregister_sysctl_table(lnet_table_header);
+
+       lnet_table_header = NULL;
+#endif
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c
new file mode 100644
index 0000000..a304347
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
new file mode 100644
index 0000000..4a01816
--- /dev/null
@@ -0,0 +1,664 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/file.h>
+/* For sys_open & sys_close */
+#include <linux/syscalls.h>
+
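+/*
+ * Issue a socket ioctl by creating a temporary TCP socket and invoking
+ * its unlocked_ioctl method under KERNEL_DS, so that kernel-space
+ * arguments can be passed directly.
+ */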
+int
+libcfs_sock_ioctl(int cmd, unsigned long arg)
+{
+       mm_segment_t    oldmm = get_fs();
+       struct socket  *sock;
+       int          fd = -1;
+       int          rc;
+       struct file    *sock_filp;
+
+       rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+       if (rc != 0) {
+               CERROR ("Can't create socket: %d\n", rc);
+               return rc;
+       }
+
+       sock_filp = sock_alloc_file(sock, 0, NULL);
+       if (!sock_filp) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       set_fs(KERNEL_DS);
+       if (sock_filp->f_op->unlocked_ioctl)
+               rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg);
+       set_fs(oldmm);
+
+       fput(sock_filp);
+
+ out:
+       if (fd >= 0)
+               sys_close(fd);
+       else
+               sock_release(sock);
+       return rc;
+}
+
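+/* Query the up/down state, IPv4 address and netmask of interface 'name' */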
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+       struct ifreq   ifr;
+       int         nob;
+       int         rc;
+       __u32     val;
+
+       nob = strnlen(name, IFNAMSIZ);
+       if (nob == IFNAMSIZ) {
+               CERROR("Interface name %s too long\n", name);
+               return -EINVAL;
+       }
+
+       CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+
+       strcpy(ifr.ifr_name, name);
+       rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr);
+
+       if (rc != 0) {
+               CERROR("Can't get flags for interface %s\n", name);
+               return rc;
+       }
+
+       if ((ifr.ifr_flags & IFF_UP) == 0) {
+               CDEBUG(D_NET, "Interface %s down\n", name);
+               *up = 0;
+               *ip = *mask = 0;
+               return 0;
+       }
+
+       *up = 1;
+
+       strcpy(ifr.ifr_name, name);
+       ifr.ifr_addr.sa_family = AF_INET;
+       rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr);
+
+       if (rc != 0) {
+               CERROR("Can't get IP address for interface %s\n", name);
+               return rc;
+       }
+
+       val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+       *ip = ntohl(val);
+
+       strcpy(ifr.ifr_name, name);
+       ifr.ifr_addr.sa_family = AF_INET;
+       rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr);
+
+       if (rc != 0) {
+               CERROR("Can't get netmask for interface %s\n", name);
+               return rc;
+       }
+
+       val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
+       *mask = ntohl(val);
+
+       return 0;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_query);
+
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+       /* Allocate and fill in 'names', returning # interfaces/error */
+       char       **names;
+       int          toobig;
+       int          nalloc;
+       int          nfound;
+       struct ifreq   *ifr;
+       struct ifconf   ifc;
+       int          rc;
+       int          nob;
+       int          i;
+
+       nalloc = 16;    /* first guess at max interfaces */
+       toobig = 0;
+       for (;;) {
+               if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) {
+                       toobig = 1;
+                       nalloc = PAGE_CACHE_SIZE/sizeof(*ifr);
+                       CWARN("Too many interfaces: only enumerating first %d\n",
+                             nalloc);
+               }
+
+               LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+               if (ifr == NULL) {
+                       CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+                       rc = -ENOMEM;
+                       goto out0;
+               }
+
+               ifc.ifc_buf = (char *)ifr;
+               ifc.ifc_len = nalloc * sizeof(*ifr);
+
+               rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc);
+
+               if (rc < 0) {
+                       CERROR ("Error %d enumerating interfaces\n", rc);
+                       goto out1;
+               }
+
+               LASSERT (rc == 0);
+
+               nfound = ifc.ifc_len/sizeof(*ifr);
+               LASSERT (nfound <= nalloc);
+
+               if (nfound < nalloc || toobig)
+                       break;
+
+               LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+               nalloc *= 2;
+       }
+
+       if (nfound == 0)
+               goto out1;
+
+       LIBCFS_ALLOC(names, nfound * sizeof(*names));
+       if (names == NULL) {
+               rc = -ENOMEM;
+               goto out1;
+       }
+       /* NULL out all names[i] */
+       memset (names, 0, nfound * sizeof(*names));
+
+       for (i = 0; i < nfound; i++) {
+
+               nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+               if (nob == IFNAMSIZ) {
+                       /* no space for terminating NULL */
+                       CERROR("interface name %.*s too long (%d max)\n",
+                              nob, ifr[i].ifr_name, IFNAMSIZ);
+                       rc = -ENAMETOOLONG;
+                       goto out2;
+               }
+
+               LIBCFS_ALLOC(names[i], IFNAMSIZ);
+               if (names[i] == NULL) {
+                       rc = -ENOMEM;
+                       goto out2;
+               }
+
+               memcpy(names[i], ifr[i].ifr_name, nob);
+               names[i][nob] = 0;
+       }
+
+       *namesp = names;
+       rc = nfound;
+
+ out2:
+       if (rc < 0)
+               libcfs_ipif_free_enumeration(names, nfound);
+ out1:
+       LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+ out0:
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_enumerate);
+
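+/* Free the interface name array returned by libcfs_ipif_enumerate() */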
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+       int      i;
+
+       LASSERT (n > 0);
+
+       for (i = 0; i < n && names[i] != NULL; i++)
+               LIBCFS_FREE(names[i], IFNAMSIZ);
+
+       LIBCFS_FREE(names, n * sizeof(*names));
+}
+
+EXPORT_SYMBOL(libcfs_ipif_free_enumeration);
+
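+/*
+ * Write 'nob' bytes from 'buffer' to 'sock', retrying partial sends
+ * until everything is written or 'timeout' seconds have elapsed.  A
+ * zero timeout makes the send non-blocking (MSG_DONTWAIT).
+ */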
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+       int         rc;
+       mm_segment_t   oldmm = get_fs();
+       long       ticks = timeout * HZ;
+       unsigned long  then;
+       struct timeval tv;
+
+       LASSERT (nob > 0);
+       /* Caller may pass a zero timeout if she thinks the socket buffer is
+        * empty enough to take the whole message immediately */
+
+       for (;;) {
+               struct iovec  iov = {
+                       .iov_base = buffer,
+                       .iov_len  = nob
+               };
+               struct msghdr msg = {
+                       .msg_name       = NULL,
+                       .msg_namelen    = 0,
+                       .msg_iov        = &iov,
+                       .msg_iovlen     = 1,
+                       .msg_control    = NULL,
+                       .msg_controllen = 0,
+                       .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
+               };
+
+               if (timeout != 0) {
+                       /* Set send timeout to remaining time */
+                       tv = (struct timeval) {
+                               .tv_sec = ticks / HZ,
+                               .tv_usec = ((ticks % HZ) * 1000000) / HZ
+                       };
+                       set_fs(KERNEL_DS);
+                       rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+                                            (char *)&tv, sizeof(tv));
+                       set_fs(oldmm);
+                       if (rc != 0) {
+                               CERROR("Can't set socket send timeout "
+                                      "%ld.%06d: %d\n",
+                                      (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                               return rc;
+                       }
+               }
+
+               set_fs (KERNEL_DS);
+               then = jiffies;
+               rc = sock_sendmsg (sock, &msg, iov.iov_len);
+               ticks -= jiffies - then;
+               set_fs (oldmm);
+
+               if (rc == nob)
+                       return 0;
+
+               if (rc < 0)
+                       return rc;
+
+               if (rc == 0) {
+                       CERROR ("Unexpected zero rc\n");
+                       return (-ECONNABORTED);
+               }
+
+               if (ticks <= 0)
+                       return -EAGAIN;
+
+               buffer = ((char *)buffer) + rc;
+               nob -= rc;
+       }
+
+       return (0);
+}
+EXPORT_SYMBOL(libcfs_sock_write);
+
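+/*
+ * Read exactly 'nob' bytes from 'sock' into 'buffer', retrying partial
+ * receives until complete or 'timeout' seconds (which must be positive)
+ * have elapsed.
+ */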
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+       int         rc;
+       mm_segment_t   oldmm = get_fs();
+       long       ticks = timeout * HZ;
+       unsigned long  then;
+       struct timeval tv;
+
+       LASSERT (nob > 0);
+       LASSERT (ticks > 0);
+
+       for (;;) {
+               struct iovec  iov = {
+                       .iov_base = buffer,
+                       .iov_len  = nob
+               };
+               struct msghdr msg = {
+                       .msg_name       = NULL,
+                       .msg_namelen    = 0,
+                       .msg_iov        = &iov,
+                       .msg_iovlen     = 1,
+                       .msg_control    = NULL,
+                       .msg_controllen = 0,
+                       .msg_flags      = 0
+               };
+
+               /* Set receive timeout to remaining time */
+               tv = (struct timeval) {
+                       .tv_sec = ticks / HZ,
+                       .tv_usec = ((ticks % HZ) * 1000000) / HZ
+               };
+               set_fs(KERNEL_DS);
+               rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+                                    (char *)&tv, sizeof(tv));
+               set_fs(oldmm);
+               if (rc != 0) {
+                       CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
+                              (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                       return rc;
+               }
+
+               set_fs(KERNEL_DS);
+               then = jiffies;
+               rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
+               ticks -= jiffies - then;
+               set_fs(oldmm);
+
+               if (rc < 0)
+                       return rc;
+
+               if (rc == 0)
+                       return -ECONNRESET;
+
+               buffer = ((char *)buffer) + rc;
+               nob -= rc;
+
+               if (nob == 0)
+                       return 0;
+
+               if (ticks <= 0)
+                       return -ETIMEDOUT;
+       }
+}
+
+EXPORT_SYMBOL(libcfs_sock_read);
+
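+/*
+ * Create a TCP socket, set SO_REUSEADDR and optionally bind it to the
+ * given local IP/port.  *fatal is cleared when the only failure is that
+ * the port is already in use, so the caller may retry with another port.
+ */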
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal,
+                   __u32 local_ip, int local_port)
+{
+       struct sockaddr_in  locaddr;
+       struct socket      *sock;
+       int              rc;
+       int              option;
+       mm_segment_t    oldmm = get_fs();
+
+       /* All errors are fatal except bind failure if the port is in use */
+       *fatal = 1;
+
+       rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+       *sockp = sock;
+       if (rc != 0) {
+               CERROR ("Can't create socket: %d\n", rc);
+               return (rc);
+       }
+
+       set_fs (KERNEL_DS);
+       option = 1;
+       rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+                            (char *)&option, sizeof (option));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+               goto failed;
+       }
+
+       if (local_ip != 0 || local_port != 0) {
+               memset(&locaddr, 0, sizeof(locaddr));
+               locaddr.sin_family = AF_INET;
+               locaddr.sin_port = htons(local_port);
+               locaddr.sin_addr.s_addr = (local_ip == 0) ?
+                                         INADDR_ANY : htonl(local_ip);
+
+               rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr,
+                                    sizeof(locaddr));
+               if (rc == -EADDRINUSE) {
+                       CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                       *fatal = 0;
+                       goto failed;
+               }
+               if (rc != 0) {
+                       CERROR("Error trying to bind to port %d: %d\n",
+                              local_port, rc);
+                       goto failed;
+               }
+       }
+
+       return 0;
+
+ failed:
+       sock_release(sock);
+       return rc;
+}
+
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+       mm_segment_t    oldmm = get_fs();
+       int              option;
+       int              rc;
+
+       if (txbufsize != 0) {
+               option = txbufsize;
+               set_fs (KERNEL_DS);
+               rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+                                    (char *)&option, sizeof (option));
+               set_fs (oldmm);
+               if (rc != 0) {
+                       CERROR ("Can't set send buffer %d: %d\n",
+                               option, rc);
+                       return (rc);
+               }
+       }
+
+       if (rxbufsize != 0) {
+               option = rxbufsize;
+               set_fs (KERNEL_DS);
+               rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
+                                     (char *)&option, sizeof (option));
+               set_fs (oldmm);
+               if (rc != 0) {
+                       CERROR ("Can't set receive buffer %d: %d\n",
+                               option, rc);
+                       return (rc);
+               }
+       }
+
+       return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_setbuf);
+
+int
+libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+       struct sockaddr_in sin;
+       int             len = sizeof (sin);
+       int             rc;
+
+       rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len,
+                                remote ? 2 : 0);
+       if (rc != 0) {
+               CERROR ("Error %d getting sock %s IP/port\n",
+                       rc, remote ? "peer" : "local");
+               return rc;
+       }
+
+       if (ip != NULL)
+               *ip = ntohl (sin.sin_addr.s_addr);
+
+       if (port != NULL)
+               *port = ntohs (sin.sin_port);
+
+       return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getaddr);
+
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+       if (txbufsize != NULL)
+               *txbufsize = sock->sk->sk_sndbuf;
+
+       if (rxbufsize != NULL)
+               *rxbufsize = sock->sk->sk_rcvbuf;
+
+       return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getbuf);
+
+int
+libcfs_sock_listen (struct socket **sockp,
+                   __u32 local_ip, int local_port, int backlog)
+{
+       int      fatal;
+       int      rc;
+
+       rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+       if (rc != 0) {
+               if (!fatal)
+                       CERROR("Can't create socket: port %d already in use\n",
+                              local_port);
+               return rc;
+       }
+
+       rc = (*sockp)->ops->listen(*sockp, backlog);
+       if (rc == 0)
+               return 0;
+
+       CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+       sock_release(*sockp);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_listen);
+
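+/*
+ * Accept a connection on 'sock'.  If none is pending, sleep
+ * interruptibly on the socket's wait queue and retry once woken.
+ */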
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+       wait_queue_t   wait;
+       struct socket *newsock;
+       int         rc;
+
+       init_waitqueue_entry(&wait, current);
+
+       /* XXX this should add a ref to sock->ops->owner, if
+        * TCP could be a module */
+       rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock);
+       if (rc) {
+               CERROR("Can't allocate socket\n");
+               return rc;
+       }
+
+       newsock->ops = sock->ops;
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       add_wait_queue(cfs_sk_sleep(sock->sk), &wait);
+
+       rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+       if (rc == -EAGAIN) {
+               /* Nothing ready, so wait for activity */
+               schedule();
+               rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+       }
+
+       remove_wait_queue(cfs_sk_sleep(sock->sk), &wait);
+       set_current_state(TASK_RUNNING);
+
+       if (rc != 0)
+               goto failed;
+
+       *newsockp = newsock;
+       return 0;
+
+ failed:
+       sock_release(newsock);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_accept);
+
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+       wake_up_all(cfs_sk_sleep(sock->sk));
+}
+
+EXPORT_SYMBOL(libcfs_sock_abort_accept);
+
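+/*
+ * Create a socket bound to local_ip/local_port and connect it to
+ * peer_ip/peer_port.  EADDRNOTAVAIL is reported as non-fatal so that
+ * the caller can retry with a different local port.
+ */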
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal,
+                    __u32 local_ip, int local_port,
+                    __u32 peer_ip, int peer_port)
+{
+       struct sockaddr_in  srvaddr;
+       int              rc;
+
+       rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+       if (rc != 0)
+               return rc;
+
+       memset (&srvaddr, 0, sizeof (srvaddr));
+       srvaddr.sin_family = AF_INET;
+       srvaddr.sin_port = htons(peer_port);
+       srvaddr.sin_addr.s_addr = htonl(peer_ip);
+
+       rc = (*sockp)->ops->connect(*sockp,
+                                   (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+                                   0);
+       if (rc == 0)
+               return 0;
+
+       /* EADDRNOTAVAIL probably means we're already connected to the same
+        * peer/port on the same local port on a differently typed
+        * connection.  Let our caller retry with a different local
+        * port... */
+       *fatal = !(rc == -EADDRNOTAVAIL);
+
+       CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET,
+              "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+              HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+
+       sock_release(*sockp);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_connect);
+
+void
+libcfs_sock_release (struct socket *sock)
+{
+       sock_release(sock);
+}
+
+EXPORT_SYMBOL(libcfs_sock_release);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
new file mode 100644
index 0000000..6f56343
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+/* percentage of the total debug memory assigned to each trace-data type */
+static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = {
+       80,  /* 80% pages for CFS_TCD_TYPE_PROC */
+       10,  /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */
+       10   /* 10% pages for CFS_TCD_TYPE_IRQ */
+};
+
+char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+
+struct rw_semaphore cfs_tracefile_sem;
+
+int cfs_tracefile_init_arch(void)
+{
+       int    i;
+       int    j;
+       struct cfs_trace_cpu_data *tcd;
+
+       init_rwsem(&cfs_tracefile_sem);
+
+       /* initialize trace_data */
+       memset(cfs_trace_data, 0, sizeof(cfs_trace_data));
+       for (i = 0; i < CFS_TCD_TYPE_MAX; i++) {
+               cfs_trace_data[i] =
+                       kmalloc(sizeof(union cfs_trace_data_union) *
+                               num_possible_cpus(), GFP_KERNEL);
+               if (cfs_trace_data[i] == NULL)
+                       goto out;
+
+       }
+
+       /* arch related info initialized */
+       cfs_tcd_for_each(tcd, i, j) {
+               spin_lock_init(&tcd->tcd_lock);
+               tcd->tcd_pages_factor = pages_factor[i];
+               tcd->tcd_type = i;
+               tcd->tcd_cpu = j;
+       }
+
+       for (i = 0; i < num_possible_cpus(); i++)
+               for (j = 0; j < CFS_TCD_TYPE_MAX; j++) {
+                       cfs_trace_console_buffers[i][j] =
+                               kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE,
+                                       GFP_KERNEL);
+
+                       if (cfs_trace_console_buffers[i][j] == NULL)
+                               goto out;
+               }
+
+       return 0;
+
+out:
+       cfs_tracefile_fini_arch();
+       printk(KERN_ERR "lnet: Not enough memory\n");
+       return -ENOMEM;
+}
+
+void cfs_tracefile_fini_arch(void)
+{
+       int    i;
+       int    j;
+
+       for (i = 0; i < num_possible_cpus(); i++)
+               for (j = 0; j < CFS_TCD_TYPE_MAX; j++)
+                       if (cfs_trace_console_buffers[i][j] != NULL) {
+                               kfree(cfs_trace_console_buffers[i][j]);
+                               cfs_trace_console_buffers[i][j] = NULL;
+                       }
+
+       for (i = 0; cfs_trace_data[i] != NULL; i++) {
+               kfree(cfs_trace_data[i]);
+               cfs_trace_data[i] = NULL;
+       }
+
+       fini_rwsem(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_read_lock(void)
+{
+       down_read(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_read_unlock(void)
+{
+       up_read(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_write_lock(void)
+{
+       down_write(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_write_unlock(void)
+{
+       up_write(&cfs_tracefile_sem);
+}
+
+cfs_trace_buf_type_t cfs_trace_buf_idx_get(void)
+{
+       if (in_irq())
+               return CFS_TCD_TYPE_IRQ;
+       else if (in_softirq())
+               return CFS_TCD_TYPE_SOFTIRQ;
+       else
+               return CFS_TCD_TYPE_PROC;
+}
+
+/*
+ * The walking argument indicates that the lock is being taken by the
+ * iterator that walks all tcd types, so we must also disable local irqs
+ * to avoid deadlocks with other interrupt-context locks.  See LU-1311
+ * for details.
+ */
+int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+{
+       __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+       if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+               spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags);
+       else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+               spin_lock_bh(&tcd->tcd_lock);
+       else if (unlikely(walking))
+               spin_lock_irq(&tcd->tcd_lock);
+       else
+               spin_lock(&tcd->tcd_lock);
+       return 1;
+}
+
+void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+{
+       __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+       if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+               spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags);
+       else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+               spin_unlock_bh(&tcd->tcd_lock);
+       else if (unlikely(walking))
+               spin_unlock_irq(&tcd->tcd_lock);
+       else
+               spin_unlock(&tcd->tcd_lock);
+}
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+                     struct cfs_trace_page *tage)
+{
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       return tcd->tcd_cpu == tage->cpu;
+}
+
+void
+cfs_set_ptldebug_header(struct ptldebug_header *header,
+                       struct libcfs_debug_msg_data *msgdata,
+                       unsigned long stack)
+{
+       struct timeval tv;
+
+       do_gettimeofday(&tv);
+
+       header->ph_subsys = msgdata->msg_subsys;
+       header->ph_mask = msgdata->msg_mask;
+       header->ph_cpu_id = smp_processor_id();
+       header->ph_type = cfs_trace_buf_idx_get();
+       header->ph_sec = (__u32)tv.tv_sec;
+       header->ph_usec = tv.tv_usec;
+       header->ph_stack = stack;
+       header->ph_pid = current->pid;
+       header->ph_line_num = msgdata->msg_line;
+       header->ph_extern_pid = 0;
+       return;
+}
+
+static char *
+dbghdr_to_err_string(struct ptldebug_header *hdr)
+{
+       switch (hdr->ph_subsys) {
+       case S_LND:
+       case S_LNET:
+               return "LNetError";
+       default:
+               return "LustreError";
+       }
+}
+
+static char *
+dbghdr_to_info_string(struct ptldebug_header *hdr)
+{
+       switch (hdr->ph_subsys) {
+       case S_LND:
+       case S_LNET:
+               return "LNet";
+       default:
+               return "Lustre";
+       }
+}
+
+void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+                         const char *buf, int len, const char *file,
+                         const char *fn)
+{
+       char *prefix = "Lustre", *ptype = NULL;
+
+       if ((mask & D_EMERG) != 0) {
+               prefix = dbghdr_to_err_string(hdr);
+               ptype = KERN_EMERG;
+       } else if ((mask & D_ERROR) != 0) {
+               prefix = dbghdr_to_err_string(hdr);
+               ptype = KERN_ERR;
+       } else if ((mask & D_WARNING) != 0) {
+               prefix = dbghdr_to_info_string(hdr);
+               ptype = KERN_WARNING;
+       } else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) {
+               prefix = dbghdr_to_info_string(hdr);
+               ptype = KERN_INFO;
+       }
+
+       if ((mask & D_CONSOLE) != 0) {
+               printk("%s%s: %.*s", ptype, prefix, len, buf);
+       } else {
+               printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix,
+                      hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num,
+                      fn, len, buf);
+       }
+       return;
+}
+
+int cfs_trace_max_debug_mb(void)
+{
+       int  total_mb = (num_physpages >> (20 - PAGE_SHIFT));
+
+       return MAX(512, (total_mb * 80)/100);
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
new file mode 100644
index 0000000..ba84e4f
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_TRACEFILE_H__
+#define __LIBCFS_LINUX_TRACEFILE_H__
+
+/**
+ * three types of trace_data in linux
+ */
+typedef enum {
+       CFS_TCD_TYPE_PROC = 0,
+       CFS_TCD_TYPE_SOFTIRQ,
+       CFS_TCD_TYPE_IRQ,
+       CFS_TCD_TYPE_MAX
+} cfs_trace_buf_type_t;
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c
new file mode 100644
index 0000000..e73903c
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-utils.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/*
+ * miscellaneous libcfs stuff
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+
+/*
+ * Convert a server error code to the client format.  Error codes are
+ * taken from Linux errno.h, so for a Linux client this is the identity
+ * mapping.
+ */
+int convert_server_error(__u64 ecode)
+{
+       return ecode;
+}
+EXPORT_SYMBOL(convert_server_error);
+
+/*
+ * Convert <fcntl.h> open flags from the client to the server format.
+ */
+int convert_client_oflag(int cflag, int *result)
+{
+       *result = cflag;
+       return 0;
+}
+EXPORT_SYMBOL(convert_client_oflag);
+
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{}
+
+EXPORT_SYMBOL(cfs_stack_trace_fill);
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+       return NULL;
+}
+EXPORT_SYMBOL(cfs_stack_trace_frame);
diff --git a/drivers/staging/lustre/lustre/libcfs/lwt.c b/drivers/staging/lustre/lustre/libcfs/lwt.c
new file mode 100644
index 0000000..b631f7d
--- /dev/null
@@ -0,0 +1,266 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/lwt.c
+ *
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#if LWT_SUPPORT
+
+#if !KLWT_SUPPORT
+int     lwt_enabled;
+lwt_cpu_t   lwt_cpus[NR_CPUS];
+#endif
+
+int     lwt_pages_per_cpu;
+
+/* NB only root is allowed to retrieve LWT info; it's an open door into the
+ * kernel... */
+
+int
+lwt_lookup_string (int *size, char *knl_ptr,
+                  char *user_ptr, int user_size)
+{
+       int   maxsize = 128;
+
+       /* knl_ptr was retrieved from an LWT snapshot and the caller wants to
+        * turn it into a string.  NB we can crash with an access violation
+        * trying to determine the string length, so we're trusting our
+        * caller... */
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               return (-EPERM);
+
+       if (user_size > 0 &&
+           maxsize > user_size)
+               maxsize = user_size;
+
+       *size = strnlen (knl_ptr, maxsize - 1) + 1;
+
+       if (user_ptr != NULL) {
+               if (user_size < 4)
+                       return (-EINVAL);
+
+               if (copy_to_user (user_ptr, knl_ptr, *size))
+                       return (-EFAULT);
+
+               /* Did I truncate the string?  */
+               if (knl_ptr[*size - 1] != 0)
+                       copy_to_user (user_ptr + *size - 4, "...", 4);
+       }
+
+       return (0);
+}
+
+int
+lwt_control (int enable, int clear)
+{
+       lwt_page_t  *p;
+       int       i;
+       int       j;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               return (-EPERM);
+
+       if (!enable) {
+               LWT_EVENT(0,0,0,0);
+               lwt_enabled = 0;
+               mb();
+               /* give people some time to stop adding traces */
+               schedule_timeout(10);
+       }
+
+       for (i = 0; i < num_online_cpus(); i++) {
+               p = lwt_cpus[i].lwtc_current_page;
+
+               if (p == NULL)
+                       return (-ENODATA);
+
+               if (!clear)
+                       continue;
+
+               for (j = 0; j < lwt_pages_per_cpu; j++) {
+                       memset (p->lwtp_events, 0, PAGE_CACHE_SIZE);
+
+                       p = list_entry (p->lwtp_list.next,
+                                           lwt_page_t, lwtp_list);
+               }
+       }
+
+       if (enable) {
+               lwt_enabled = 1;
+               mb();
+               LWT_EVENT(0,0,0,0);
+       }
+
+       return (0);
+}
+
+int
+lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size,
+             void *user_ptr, int user_size)
+{
+       const int    events_per_page = PAGE_CACHE_SIZE / sizeof(lwt_event_t);
+       const int    bytes_per_page = events_per_page * sizeof(lwt_event_t);
+       lwt_page_t  *p;
+       int       i;
+       int       j;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               return (-EPERM);
+
+       *ncpu = num_online_cpus();
+       *total_size = num_online_cpus() * lwt_pages_per_cpu *
+               bytes_per_page;
+       *now = get_cycles();
+
+       if (user_ptr == NULL)
+               return (0);
+
+       for (i = 0; i < num_online_cpus(); i++) {
+               p = lwt_cpus[i].lwtc_current_page;
+
+               if (p == NULL)
+                       return (-ENODATA);
+
+               for (j = 0; j < lwt_pages_per_cpu; j++) {
+                       if (copy_to_user(user_ptr, p->lwtp_events,
+                                            bytes_per_page))
+                               return (-EFAULT);
+
+                       user_ptr = ((char *)user_ptr) + bytes_per_page;
+                       p = list_entry(p->lwtp_list.next,
+                                          lwt_page_t, lwtp_list);
+               }
+       }
+
+       return (0);
+}
+
+int
+lwt_init(void)
+{
+       int     i;
+       int     j;
+
+       for (i = 0; i < num_online_cpus(); i++)
+               if (lwt_cpus[i].lwtc_current_page != NULL)
+                       return (-EALREADY);
+
+       LASSERT (!lwt_enabled);
+
+       /* NULL pointers, zero scalars */
+       memset (lwt_cpus, 0, sizeof (lwt_cpus));
+       lwt_pages_per_cpu =
+               LWT_MEMORY / (num_online_cpus() * PAGE_CACHE_SIZE);
+
+       for (i = 0; i < num_online_cpus(); i++)
+               for (j = 0; j < lwt_pages_per_cpu; j++) {
+                       struct page *page = alloc_page (GFP_KERNEL);
+                       lwt_page_t  *lwtp;
+
+                       if (page == NULL) {
+                               CERROR ("Can't allocate page\n");
+                               lwt_fini ();
+                               return (-ENOMEM);
+                       }
+
+                       LIBCFS_ALLOC(lwtp, sizeof (*lwtp));
+                       if (lwtp == NULL) {
+                               CERROR ("Can't allocate lwtp\n");
+                               __free_page(page);
+                               lwt_fini ();
+                               return (-ENOMEM);
+                       }
+
+                       lwtp->lwtp_page = page;
+                       lwtp->lwtp_events = page_address(page);
+                       memset (lwtp->lwtp_events, 0, PAGE_CACHE_SIZE);
+
+                       if (j == 0) {
+                               INIT_LIST_HEAD (&lwtp->lwtp_list);
+                               lwt_cpus[i].lwtc_current_page = lwtp;
+                       } else {
+                               list_add (&lwtp->lwtp_list,
+                                   &lwt_cpus[i].lwtc_current_page->lwtp_list);
+                       }
+               }
+
+       lwt_enabled = 1;
+       mb();
+
+       LWT_EVENT(0,0,0,0);
+
+       return (0);
+}
+
+void
+lwt_fini(void)
+{
+       int    i;
+
+       lwt_control(0, 0);
+
+       for (i = 0; i < num_online_cpus(); i++)
+               while (lwt_cpus[i].lwtc_current_page != NULL) {
+                       lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page;
+
+                       if (list_empty (&lwtp->lwtp_list)) {
+                               lwt_cpus[i].lwtc_current_page = NULL;
+                       } else {
+                               lwt_cpus[i].lwtc_current_page =
+                                       list_entry (lwtp->lwtp_list.next,
+                                                       lwt_page_t, lwtp_list);
+
+                               list_del (&lwtp->lwtp_list);
+                       }
+
+                       __free_page (lwtp->lwtp_page);
+                       LIBCFS_FREE (lwtp, sizeof (*lwtp));
+               }
+}
+
+EXPORT_SYMBOL(lwt_enabled);
+EXPORT_SYMBOL(lwt_cpus);
+
+EXPORT_SYMBOL(lwt_init);
+EXPORT_SYMBOL(lwt_fini);
+EXPORT_SYMBOL(lwt_lookup_string);
+EXPORT_SYMBOL(lwt_control);
+EXPORT_SYMBOL(lwt_snapshot);
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/module.c b/drivers/staging/lustre/lustre/libcfs/module.c
new file mode 100644
index 0000000..3372537
--- /dev/null
@@ -0,0 +1,498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/libcfs_crypto.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet.h>
+#include "tracefile.h"
+
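+/*
+ * Free the three-level tree of pages built by kportal_memhog_alloc():
+ * a root page holds pointers to level-1 pages, which in turn hold
+ * pointers to the leaf pages.
+ */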
+void
+kportal_memhog_free (struct libcfs_device_userstate *ldu)
+{
+       struct page **level0p = &ldu->ldu_memhog_root_page;
+       struct page **level1p;
+       struct page **level2p;
+       int        count1;
+       int        count2;
+
+       if (*level0p != NULL) {
+
+               level1p = (struct page **)page_address(*level0p);
+               count1 = 0;
+
+               while (count1 < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+                      *level1p != NULL) {
+
+                       level2p = (struct page **)page_address(*level1p);
+                       count2 = 0;
+
+                       while (count2 < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+                              *level2p != NULL) {
+
+                               __free_page(*level2p);
+                               ldu->ldu_memhog_pages--;
+                               level2p++;
+                               count2++;
+                       }
+
+                       __free_page(*level1p);
+                       ldu->ldu_memhog_pages--;
+                       level1p++;
+                       count1++;
+               }
+
+               __free_page(*level0p);
+               ldu->ldu_memhog_pages--;
+
+               *level0p = NULL;
+       }
+
+       LASSERT (ldu->ldu_memhog_pages == 0);
+}
+
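+/*
+ * Allocate 'npages' pages for memory-pressure testing, linking them
+ * into the three-level page tree described above so that they can be
+ * released later by kportal_memhog_free().
+ */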
+int
+kportal_memhog_alloc (struct libcfs_device_userstate *ldu, int npages, int flags)
+{
+       struct page **level0p;
+       struct page **level1p;
+       struct page **level2p;
+       int        count1;
+       int        count2;
+
+       LASSERT (ldu->ldu_memhog_pages == 0);
+       LASSERT (ldu->ldu_memhog_root_page == NULL);
+
+       if (npages < 0)
+               return -EINVAL;
+
+       if (npages == 0)
+               return 0;
+
+       level0p = &ldu->ldu_memhog_root_page;
+       *level0p = alloc_page(flags);
+       if (*level0p == NULL)
+               return -ENOMEM;
+       ldu->ldu_memhog_pages++;
+
+       level1p = (struct page **)page_address(*level0p);
+       count1 = 0;
+       memset(level1p, 0, PAGE_CACHE_SIZE);
+
+       while (ldu->ldu_memhog_pages < npages &&
+              count1 < PAGE_CACHE_SIZE/sizeof(struct page *)) {
+
+               if (cfs_signal_pending())
+                       return (-EINTR);
+
+               *level1p = alloc_page(flags);
+               if (*level1p == NULL)
+                       return -ENOMEM;
+               ldu->ldu_memhog_pages++;
+
+               level2p = (struct page **)page_address(*level1p);
+               count2 = 0;
+               memset(level2p, 0, PAGE_CACHE_SIZE);
+
+               while (ldu->ldu_memhog_pages < npages &&
+                      count2 < PAGE_CACHE_SIZE/sizeof(struct page *)) {
+
+                       if (cfs_signal_pending())
+                               return (-EINTR);
+
+                       *level2p = alloc_page(flags);
+                       if (*level2p == NULL)
+                               return (-ENOMEM);
+                       ldu->ldu_memhog_pages++;
+
+                       level2p++;
+                       count2++;
+               }
+
+               level1p++;
+               count1++;
+       }
+
+       return 0;
+}
+
+/* called when opening /dev/device */
+static int libcfs_psdev_open(unsigned long flags, void *args)
+{
+       struct libcfs_device_userstate *ldu;
+       ENTRY;
+
+       try_module_get(THIS_MODULE);
+
+       LIBCFS_ALLOC(ldu, sizeof(*ldu));
+       if (ldu != NULL) {
+               ldu->ldu_memhog_pages = 0;
+               ldu->ldu_memhog_root_page = NULL;
+       }
+       *(struct libcfs_device_userstate **)args = ldu;
+
+       RETURN(0);
+}
+
+/* called when closing /dev/device */
+static int libcfs_psdev_release(unsigned long flags, void *args)
+{
+       struct libcfs_device_userstate *ldu;
+       ENTRY;
+
+       ldu = (struct libcfs_device_userstate *)args;
+       if (ldu != NULL) {
+               kportal_memhog_free(ldu);
+               LIBCFS_FREE(ldu, sizeof(*ldu));
+       }
+
+       module_put(THIS_MODULE);
+       RETURN(0);
+}
+
+static struct rw_semaphore ioctl_list_sem;
+static struct list_head ioctl_list;
+
+int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand)
+{
+       int rc = 0;
+
+       down_write(&ioctl_list_sem);
+       if (!list_empty(&hand->item))
+               rc = -EBUSY;
+       else
+               list_add_tail(&hand->item, &ioctl_list);
+       up_write(&ioctl_list_sem);
+
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_register_ioctl);
+
+int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand)
+{
+       int rc = 0;
+
+       down_write(&ioctl_list_sem);
+       if (list_empty(&hand->item))
+               rc = -ENOENT;
+       else
+               list_del_init(&hand->item);
+       up_write(&ioctl_list_sem);
+
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_deregister_ioctl);
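+
+/*
+ * Illustrative sketch (an assumption, not part of this file): a subsystem
+ * hooks into the dispatch loop below roughly like this, relying only on the
+ * two fields this file touches (item, handle_ioctl).  Returning -EINVAL from
+ * the handler tells libcfs_ioctl_int() to keep searching the list.
+ *
+ *     static int my_handle_ioctl(unsigned int cmd,
+ *                                struct libcfs_ioctl_data *data)
+ *     {
+ *             if (cmd != MY_IOC_CMD)
+ *                     return -EINVAL;
+ *             return 0;
+ *     }
+ *
+ *     static struct libcfs_ioctl_handler my_hand = {
+ *             .item         = LIST_HEAD_INIT(my_hand.item),
+ *             .handle_ioctl = my_handle_ioctl,
+ *     };
+ *
+ *     rc = libcfs_register_ioctl(&my_hand);
+ *     ...
+ *     rc = libcfs_deregister_ioctl(&my_hand);
+ *
+ * MY_IOC_CMD, my_handle_ioctl and my_hand are hypothetical names; the real
+ * handle_ioctl prototype lives in libcfs_ioctl.h.
+ */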
+
+static int libcfs_ioctl_int(struct cfs_psdev_file *pfile,unsigned long cmd,
+                           void *arg, struct libcfs_ioctl_data *data)
+{
+       int err = -EINVAL;
+       ENTRY;
+
+       switch (cmd) {
+       case IOC_LIBCFS_CLEAR_DEBUG:
+               libcfs_debug_clear_buffer();
+               RETURN(0);
+       /*
+        * case IOC_LIBCFS_PANIC:
+        * Handled in arch/cfs_module.c
+        */
+       case IOC_LIBCFS_MARK_DEBUG:
+               if (data->ioc_inlbuf1 == NULL ||
+                   data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+                       RETURN(-EINVAL);
+               libcfs_debug_mark_buffer(data->ioc_inlbuf1);
+               RETURN(0);
+#if LWT_SUPPORT
+       case IOC_LIBCFS_LWT_CONTROL:
+               err = lwt_control ((data->ioc_flags & 1) != 0,
+                                  (data->ioc_flags & 2) != 0);
+               break;
+
+       case IOC_LIBCFS_LWT_SNAPSHOT: {
+               cfs_cycles_t   now;
+               int         ncpu;
+               int         total_size;
+
+               err = lwt_snapshot (&now, &ncpu, &total_size,
+                                   data->ioc_pbuf1, data->ioc_plen1);
+               data->ioc_u64[0] = now;
+               data->ioc_u32[0] = ncpu;
+               data->ioc_u32[1] = total_size;
+
+               /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
+               data->ioc_u32[2] = sizeof(lwt_event_t);
+               data->ioc_u32[3] = offsetof(lwt_event_t, lwte_where);
+
+               if (err == 0 &&
+                   libcfs_ioctl_popdata(arg, data, sizeof (*data)))
+                       err = -EFAULT;
+               break;
+       }
+
+       case IOC_LIBCFS_LWT_LOOKUP_STRING:
+               err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
+                                        data->ioc_pbuf2, data->ioc_plen2);
+               if (err == 0 &&
+                   libcfs_ioctl_popdata(arg, data, sizeof (*data)))
+                       err = -EFAULT;
+               break;
+#endif
+       case IOC_LIBCFS_MEMHOG:
+               if (pfile->private_data == NULL) {
+                       err = -EINVAL;
+               } else {
+                       kportal_memhog_free(pfile->private_data);
+                       /* XXX ioc_flags is not a GFP mask any more; this needs to be fixed */
+                       err = kportal_memhog_alloc(pfile->private_data,
+                                                  data->ioc_count,
+                                                  data->ioc_flags);
+                       if (err != 0)
+                               kportal_memhog_free(pfile->private_data);
+               }
+               break;
+
+       case IOC_LIBCFS_PING_TEST: {
+               extern void (kping_client)(struct libcfs_ioctl_data *);
+               void (*ping)(struct libcfs_ioctl_data *);
+
+               CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n",
+                      data->ioc_count, libcfs_nid2str(data->ioc_nid),
+                      libcfs_nid2str(data->ioc_nid));
+               ping = symbol_get(kping_client);
+               if (!ping)
+                       CERROR("symbol_get failed\n");
+               else {
+                       ping(data);
+                       symbol_put(kping_client);
+               }
+               RETURN(0);
+       }
+
+       default: {
+               struct libcfs_ioctl_handler *hand;
+               err = -EINVAL;
+               down_read(&ioctl_list_sem);
+               list_for_each_entry(hand, &ioctl_list, item) {
+                       err = hand->handle_ioctl(cmd, data);
+                       if (err != -EINVAL) {
+                               if (err == 0)
+                                       err = libcfs_ioctl_popdata(arg,
+                                                       data, sizeof (*data));
+                               break;
+                       }
+               }
+               up_read(&ioctl_list_sem);
+               break;
+       }
+       }
+
+       RETURN(err);
+}
+
+static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *arg)
+{
+       char    *buf;
+       struct libcfs_ioctl_data *data;
+       int err = 0;
+       ENTRY;
+
+       LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS);
+       if (buf == NULL)
+               RETURN(-ENOMEM);
+
+       /* 'cmd' and permissions get checked in our arch-specific caller */
+       if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+               CERROR("PORTALS ioctl: data error\n");
+               GOTO(out, err = -EINVAL);
+       }
+       data = (struct libcfs_ioctl_data *)buf;
+
+       err = libcfs_ioctl_int(pfile, cmd, arg, data);
+
+out:
+       LIBCFS_FREE(buf, 1024);
+       RETURN(err);
+}
+
+
+struct cfs_psdev_ops libcfs_psdev_ops = {
+       libcfs_psdev_open,
+       libcfs_psdev_release,
+       NULL,
+       NULL,
+       libcfs_ioctl
+};
+
+extern int insert_proc(void);
+extern void remove_proc(void);
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+extern psdev_t libcfs_dev;
+extern struct rw_semaphore cfs_tracefile_sem;
+extern struct mutex cfs_trace_thread_mutex;
+extern struct cfs_wi_sched *cfs_sched_rehash;
+
+extern void libcfs_init_nidstrings(void);
+extern int libcfs_arch_init(void);
+extern void libcfs_arch_cleanup(void);
+
+static int init_libcfs_module(void)
+{
+       int rc;
+
+       libcfs_arch_init();
+       libcfs_init_nidstrings();
+       init_rwsem(&cfs_tracefile_sem);
+       mutex_init(&cfs_trace_thread_mutex);
+       init_rwsem(&ioctl_list_sem);
+       INIT_LIST_HEAD(&ioctl_list);
+       init_waitqueue_head(&cfs_race_waitq);
+
+       rc = libcfs_debug_init(5 * 1024 * 1024);
+       if (rc < 0) {
+               printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc);
+               return (rc);
+       }
+
+       rc = cfs_cpu_init();
+       if (rc != 0)
+               goto cleanup_debug;
+
+#if LWT_SUPPORT
+       rc = lwt_init();
+       if (rc != 0) {
+               CERROR("lwt_init: error %d\n", rc);
+               goto cleanup_debug;
+       }
+#endif
+       rc = misc_register(&libcfs_dev);
+       if (rc) {
+               CERROR("misc_register: error %d\n", rc);
+               goto cleanup_lwt;
+       }
+
+       rc = cfs_wi_startup();
+       if (rc) {
+               CERROR("initialize workitem: error %d\n", rc);
+               goto cleanup_deregister;
+       }
+
+       /* max to 4 threads, should be enough for rehash */
+       rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4);
+       rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY,
+                                rc, &cfs_sched_rehash);
+       if (rc != 0) {
+               CERROR("Startup workitem scheduler: error: %d\n", rc);
+               goto cleanup_deregister;
+       }
+
+       rc = cfs_crypto_register();
+       if (rc) {
+               CERROR("cfs_crypto_register: error %d\n", rc);
+               goto cleanup_wi;
+       }
+
+
+       rc = insert_proc();
+       if (rc) {
+               CERROR("insert_proc: error %d\n", rc);
+               goto cleanup_crypto;
+       }
+
+       CDEBUG (D_OTHER, "portals setup OK\n");
+       return 0;
+ cleanup_crypto:
+       cfs_crypto_unregister();
+ cleanup_wi:
+       cfs_wi_shutdown();
+ cleanup_deregister:
+       misc_deregister(&libcfs_dev);
+ cleanup_lwt:
+#if LWT_SUPPORT
+       lwt_fini();
+#endif
+ cleanup_debug:
+       libcfs_debug_cleanup();
+       return rc;
+}
+
+static void exit_libcfs_module(void)
+{
+       int rc;
+
+       remove_proc();
+
+       CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       if (cfs_sched_rehash != NULL) {
+               cfs_wi_sched_destroy(cfs_sched_rehash);
+               cfs_sched_rehash = NULL;
+       }
+
+       cfs_crypto_unregister();
+       cfs_wi_shutdown();
+
+       rc = misc_deregister(&libcfs_dev);
+       if (rc)
+               CERROR("misc_deregister error %d\n", rc);
+
+#if LWT_SUPPORT
+       lwt_fini();
+#endif
+       cfs_cpu_fini();
+
+       if (atomic_read(&libcfs_kmemory) != 0)
+               CERROR("Portals memory leaked: %d bytes\n",
+                      atomic_read(&libcfs_kmemory));
+
+       rc = libcfs_debug_cleanup();
+       if (rc)
+               printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n",
+                      rc);
+
+       fini_rwsem(&ioctl_list_sem);
+       fini_rwsem(&cfs_tracefile_sem);
+
+       libcfs_arch_cleanup();
+}
+
+cfs_module(libcfs, "1.0.0", init_libcfs_module, exit_libcfs_module);
diff --git a/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/drivers/staging/lustre/lustre/libcfs/nidstrings.c
new file mode 100644 (file)
index 0000000..9a2d70c
--- /dev/null
@@ -0,0 +1,867 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/nidstrings.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+
+/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids
+ * consistent in all conversion functions.  Some code fragments are copied
+ * around for the sake of clarity...
+ */
+
+/* CAVEAT EMPTOR! Racey temporary buffer allocation!
+ * Choose the number of nidstrings to support the MAXIMUM expected number of
+ * concurrent users.  If there are more, the returned string will be volatile.
+ * NB this number must allow for a process to be descheduled for a timeslice
+ * between getting its string and using it.
+ */
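+
+/*
+ * In practice (a usage note, not from the original source), callers treat
+ * the returned pointer as valid only for the current statement, e.g.
+ *
+ *     CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(nid));
+ *
+ * and copy it out (e.g. strlcpy() into a private buffer) if the string must
+ * outlive the call, since the ring of LNET_NIDSTR_COUNT static buffers
+ * wraps around.
+ */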
+
+static char      libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
+static int       libcfs_nidstring_idx = 0;
+
+static spinlock_t libcfs_nidstring_lock;
+
+void libcfs_init_nidstrings (void)
+{
+       spin_lock_init(&libcfs_nidstring_lock);
+}
+
+# define NIDSTR_LOCK(f)   spin_lock_irqsave(&libcfs_nidstring_lock, f)
+# define NIDSTR_UNLOCK(f) spin_unlock_irqrestore(&libcfs_nidstring_lock, f)
+
+static char *
+libcfs_next_nidstring (void)
+{
+       char      *str;
+       unsigned long  flags;
+
+       NIDSTR_LOCK(flags);
+
+       str = libcfs_nidstrings[libcfs_nidstring_idx++];
+       if (libcfs_nidstring_idx ==
+           sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0]))
+               libcfs_nidstring_idx = 0;
+
+       NIDSTR_UNLOCK(flags);
+       return str;
+}
+
+static int  libcfs_lo_str2addr(const char *str, int nob, __u32 *addr);
+static void libcfs_ip_addr2str(__u32 addr, char *str);
+static int  libcfs_ip_str2addr(const char *str, int nob, __u32 *addr);
+static void libcfs_decnum_addr2str(__u32 addr, char *str);
+static void libcfs_hexnum_addr2str(__u32 addr, char *str);
+static int  libcfs_num_str2addr(const char *str, int nob, __u32 *addr);
+static int  libcfs_num_parse(char *str, int len, struct list_head *list);
+static int  libcfs_num_match(__u32 addr, struct list_head *list);
+
+struct netstrfns {
+       int       nf_type;
+       char    *nf_name;
+       char    *nf_modname;
+       void       (*nf_addr2str)(__u32 addr, char *str);
+       int     (*nf_str2addr)(const char *str, int nob, __u32 *addr);
+       int     (*nf_parse_addrlist)(char *str, int len,
+                                       struct list_head *list);
+       int     (*nf_match_addr)(__u32 addr, struct list_head *list);
+};
+
+static struct netstrfns  libcfs_netstrfns[] = {
+       {/* .nf_type      */  LOLND,
+        /* .nf_name      */  "lo",
+        /* .nf_modname   */  "klolnd",
+        /* .nf_addr2str  */  libcfs_decnum_addr2str,
+        /* .nf_str2addr  */  libcfs_lo_str2addr,
+        /* .nf_parse_addr*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       {/* .nf_type      */  SOCKLND,
+        /* .nf_name      */  "tcp",
+        /* .nf_modname   */  "ksocklnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  O2IBLND,
+        /* .nf_name      */  "o2ib",
+        /* .nf_modname   */  "ko2iblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  CIBLND,
+        /* .nf_name      */  "cib",
+        /* .nf_modname   */  "kciblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  OPENIBLND,
+        /* .nf_name      */  "openib",
+        /* .nf_modname   */  "kopeniblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  IIBLND,
+        /* .nf_name      */  "iib",
+        /* .nf_modname   */  "kiiblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  VIBLND,
+        /* .nf_name      */  "vib",
+        /* .nf_modname   */  "kviblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  RALND,
+        /* .nf_name      */  "ra",
+        /* .nf_modname   */  "kralnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  QSWLND,
+        /* .nf_name      */  "elan",
+        /* .nf_modname   */  "kqswlnd",
+        /* .nf_addr2str  */  libcfs_decnum_addr2str,
+        /* .nf_str2addr  */  libcfs_num_str2addr,
+        /* .nf_parse_addrlist*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       {/* .nf_type      */  GMLND,
+        /* .nf_name      */  "gm",
+        /* .nf_modname   */  "kgmlnd",
+        /* .nf_addr2str  */  libcfs_hexnum_addr2str,
+        /* .nf_str2addr  */  libcfs_num_str2addr,
+        /* .nf_parse_addrlist*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       {/* .nf_type      */  MXLND,
+        /* .nf_name      */  "mx",
+        /* .nf_modname   */  "kmxlnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  PTLLND,
+        /* .nf_name      */  "ptl",
+        /* .nf_modname   */  "kptllnd",
+        /* .nf_addr2str  */  libcfs_decnum_addr2str,
+        /* .nf_str2addr  */  libcfs_num_str2addr,
+        /* .nf_parse_addrlist*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       {/* .nf_type      */  GNILND,
+        /* .nf_name      */  "gni",
+        /* .nf_modname   */  "kgnilnd",
+        /* .nf_addr2str  */  libcfs_decnum_addr2str,
+        /* .nf_str2addr  */  libcfs_num_str2addr,
+        /* .nf_parse_addrlist*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       /* placeholder for net0 alias.  It MUST BE THE LAST ENTRY */
+       {/* .nf_type      */  -1},
+};
+
+const int libcfs_nnetstrfns = sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]);
+
+int
+libcfs_lo_str2addr(const char *str, int nob, __u32 *addr)
+{
+       *addr = 0;
+       return 1;
+}
+
+void
+libcfs_ip_addr2str(__u32 addr, char *str)
+{
+#if 0   /* never lookup */
+#endif
+       snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u",
+                (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+                (addr >> 8) & 0xff, addr & 0xff);
+}
+
+/* CAVEAT EMPTOR XscanfX
+ * I use "%n" at the end of a sscanf format to detect trailing junk.  However
+ * sscanf may return immediately if it sees the terminating '\0' in a string,
+ * so I initialise the %n variable to the expected length.  If sscanf sets it,
+ * fine; if it doesn't, then the scan ended at the end of the string, which is
+ * fine too :) */
+
+int
+libcfs_ip_str2addr(const char *str, int nob, __u32 *addr)
+{
+       int   a;
+       int   b;
+       int   c;
+       int   d;
+       int   n = nob;                    /* XscanfX */
+
+       /* numeric IP? */
+       if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
+           n == nob &&
+           (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+           (c & ~0xff) == 0 && (d & ~0xff) == 0) {
+               *addr = ((a<<24)|(b<<16)|(c<<8)|d);
+               return 1;
+       }
+
+       return 0;
+}
+
+void
+libcfs_decnum_addr2str(__u32 addr, char *str)
+{
+       snprintf(str, LNET_NIDSTR_SIZE, "%u", addr);
+}
+
+void
+libcfs_hexnum_addr2str(__u32 addr, char *str)
+{
+       snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr);
+}
+
+int
+libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
+{
+       int     n;
+
+       n = nob;
+       if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob)
+               return 1;
+
+       n = nob;
+       if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob)
+               return 1;
+
+       n = nob;
+       if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob)
+               return 1;
+
+       return 0;
+}
+
+struct netstrfns *
+libcfs_lnd2netstrfns(int lnd)
+{
+       int    i;
+
+       if (lnd >= 0)
+               for (i = 0; i < libcfs_nnetstrfns; i++)
+                       if (lnd == libcfs_netstrfns[i].nf_type)
+                               return &libcfs_netstrfns[i];
+
+       return NULL;
+}
+
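+/*
+ * Matches names that may carry a trailing network number ("tcp0", "elan4")
+ * by comparing only the leading nf_name prefix; libcfs_name2netstrfns()
+ * below requires an exact match instead.
+ */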
+struct netstrfns *
+libcfs_namenum2netstrfns(const char *name)
+{
+       struct netstrfns *nf;
+       int            i;
+
+       for (i = 0; i < libcfs_nnetstrfns; i++) {
+               nf = &libcfs_netstrfns[i];
+               if (nf->nf_type >= 0 &&
+                   !strncmp(name, nf->nf_name, strlen(nf->nf_name)))
+                       return nf;
+       }
+       return NULL;
+}
+
+struct netstrfns *
+libcfs_name2netstrfns(const char *name)
+{
+       int    i;
+
+       for (i = 0; i < libcfs_nnetstrfns; i++)
+               if (libcfs_netstrfns[i].nf_type >= 0 &&
+                   !strcmp(libcfs_netstrfns[i].nf_name, name))
+                       return &libcfs_netstrfns[i];
+
+       return NULL;
+}
+
+int
+libcfs_isknown_lnd(int type)
+{
+       return libcfs_lnd2netstrfns(type) != NULL;
+}
+
+char *
+libcfs_lnd2modname(int lnd)
+{
+       struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+       return (nf == NULL) ? NULL : nf->nf_modname;
+}
+
+char *
+libcfs_lnd2str(int lnd)
+{
+       char       *str;
+       struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+       if (nf != NULL)
+               return nf->nf_name;
+
+       str = libcfs_next_nidstring();
+       snprintf(str, LNET_NIDSTR_SIZE, "?%u?", lnd);
+       return str;
+}
+
+int
+libcfs_str2lnd(const char *str)
+{
+       struct netstrfns *nf = libcfs_name2netstrfns(str);
+
+       if (nf != NULL)
+               return nf->nf_type;
+
+       return -1;
+}
+
+char *
+libcfs_net2str(__u32 net)
+{
+       int            lnd = LNET_NETTYP(net);
+       int            num = LNET_NETNUM(net);
+       struct netstrfns *nf  = libcfs_lnd2netstrfns(lnd);
+       char         *str = libcfs_next_nidstring();
+
+       if (nf == NULL)
+               snprintf(str, LNET_NIDSTR_SIZE, "<%u:%u>", lnd, num);
+       else if (num == 0)
+               snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name);
+       else
+               snprintf(str, LNET_NIDSTR_SIZE, "%s%u", nf->nf_name, num);
+
+       return str;
+}
+
+char *
+libcfs_nid2str(lnet_nid_t nid)
+{
+       __u32        addr = LNET_NIDADDR(nid);
+       __u32        net = LNET_NIDNET(nid);
+       int            lnd = LNET_NETTYP(net);
+       int            nnum = LNET_NETNUM(net);
+       struct netstrfns *nf;
+       char         *str;
+       int            nob;
+
+       if (nid == LNET_NID_ANY)
+               return "<?>";
+
+       nf = libcfs_lnd2netstrfns(lnd);
+       str = libcfs_next_nidstring();
+
+       if (nf == NULL)
+               snprintf(str, LNET_NIDSTR_SIZE, "%x@<%u:%u>", addr, lnd, nnum);
+       else {
+               nf->nf_addr2str(addr, str);
+               nob = strlen(str);
+               if (nnum == 0)
+                       snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s",
+                                nf->nf_name);
+               else
+                       snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%u",
+                                nf->nf_name, nnum);
+       }
+
+       return str;
+}
+
+static struct netstrfns *
+libcfs_str2net_internal(const char *str, __u32 *net)
+{
+       struct netstrfns *nf;
+       int            nob;
+       int            netnum;
+       int            i;
+
+       for (i = 0; i < libcfs_nnetstrfns; i++) {
+               nf = &libcfs_netstrfns[i];
+               if (nf->nf_type >= 0 &&
+                   !strncmp(str, nf->nf_name, strlen(nf->nf_name)))
+                       break;
+       }
+
+       if (i == libcfs_nnetstrfns)
+               return NULL;
+
+       nob = strlen(nf->nf_name);
+
+       if (strlen(str) == (unsigned int)nob) {
+               netnum = 0;
+       } else {
+               if (nf->nf_type == LOLND) /* net number not allowed */
+                       return NULL;
+
+               str += nob;
+               i = strlen(str);
+               if (sscanf(str, "%u%n", &netnum, &i) < 1 ||
+                   i != (int)strlen(str))
+                       return NULL;
+       }
+
+       *net = LNET_MKNET(nf->nf_type, netnum);
+       return nf;
+}
+
+__u32
+libcfs_str2net(const char *str)
+{
+       __u32  net;
+
+       if (libcfs_str2net_internal(str, &net) != NULL)
+               return net;
+
+       return LNET_NIDNET(LNET_NID_ANY);
+}
+
+lnet_nid_t
+libcfs_str2nid(const char *str)
+{
+       const char       *sep = strchr(str, '@');
+       struct netstrfns *nf;
+       __u32        net;
+       __u32        addr;
+
+       if (sep != NULL) {
+               nf = libcfs_str2net_internal(sep + 1, &net);
+               if (nf == NULL)
+                       return LNET_NID_ANY;
+       } else {
+               sep = str + strlen(str);
+               net = LNET_MKNET(SOCKLND, 0);
+               nf = libcfs_lnd2netstrfns(SOCKLND);
+               LASSERT (nf != NULL);
+       }
+
+       if (!nf->nf_str2addr(str, (int)(sep - str), &addr))
+               return LNET_NID_ANY;
+
+       return LNET_MKNID(net, addr);
+}
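+
+/*
+ * Round-trip sketch (illustrative, not part of the original file): a NID
+ * string without an explicit "@net" defaults to tcp0, so
+ *
+ *     lnet_nid_t nid = libcfs_str2nid("192.168.0.1");
+ *
+ * yields the same NID as "192.168.0.1@tcp", and libcfs_nid2str(nid) prints
+ * it back as "192.168.0.1@tcp".  Strings that do not parse return
+ * LNET_NID_ANY.
+ */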
+
+char *
+libcfs_id2str(lnet_process_id_t id)
+{
+       char *str = libcfs_next_nidstring();
+
+       if (id.pid == LNET_PID_ANY) {
+               snprintf(str, LNET_NIDSTR_SIZE,
+                        "LNET_PID_ANY-%s", libcfs_nid2str(id.nid));
+               return str;
+       }
+
+       snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s",
+                ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "",
+                (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid));
+       return str;
+}
+
+int
+libcfs_str2anynid(lnet_nid_t *nidp, const char *str)
+{
+       if (!strcmp(str, "*")) {
+               *nidp = LNET_NID_ANY;
+               return 1;
+       }
+
+       *nidp = libcfs_str2nid(str);
+       return *nidp != LNET_NID_ANY;
+}
+
+/**
+ * Nid range list syntax.
+ * \verbatim
+ *
+ * <nidlist>         :== <nidrange> [ ' ' <nidrange> ]
+ * <nidrange>        :== <addrrange> '@' <net>
+ * <addrrange>       :== '*' |
+ *                       <ipaddr_range> |
+ *                       <cfs_expr_list>
+ * <ipaddr_range>    :== <cfs_expr_list>.<cfs_expr_list>.<cfs_expr_list>.
+ *                       <cfs_expr_list>
+ * <cfs_expr_list>   :== <number> |
+ *                       <expr_list>
+ * <expr_list>       :== '[' <range_expr> [ ',' <range_expr>] ']'
+ * <range_expr>      :== <number> |
+ *                       <number> '-' <number> |
+ *                       <number> '-' <number> '/' <number>
+ * <net>             :== <netname> | <netname><number>
+ * <netname>         :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" |
+ *                       "vib" | "ra" | "elan" | "mx" | "ptl"
+ * \endverbatim
+ */
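+
+/*
+ * Usage sketch for the parser below (illustrative only; names and buffer
+ * handling are the caller's choice): compile a range list once, match NIDs
+ * against it, then free it.
+ *
+ *     LIST_HEAD(nidlist);
+ *     char buf[] = "192.168.0.[2-10/2]@tcp 10.10.[1-5].3@o2ib1";
+ *
+ *     if (cfs_parse_nidlist(buf, strlen(buf), &nidlist)) {
+ *             if (cfs_match_nid(nid, &nidlist))
+ *                     matched = 1;
+ *             cfs_free_nidlist(&nidlist);
+ *     }
+ */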
+
+/**
+ * Structure to represent \<nidrange\> token of the syntax.
+ *
+ * One of this is created for each \<net\> parsed.
+ */
+struct nidrange {
+       /**
+        * Link to list of this structures which is built on nid range
+        * list parsing.
+        */
+       struct list_head nr_link;
+       /**
+        * List head for addrrange::ar_link.
+        */
+       struct list_head nr_addrranges;
+       /**
+        * Flag indicating that *@<net> is found.
+        */
+       int nr_all;
+       /**
+        * Pointer to corresponding element of libcfs_netstrfns.
+        */
+       struct netstrfns *nr_netstrfns;
+       /**
+        * Number of network. E.g. 5 if \<net\> is "elan5".
+        */
+       int nr_netnum;
+};
+
+/**
+ * Structure to represent \<addrrange\> token of the syntax.
+ */
+struct addrrange {
+       /**
+        * Link to nidrange::nr_addrranges.
+        */
+       struct list_head ar_link;
+       /**
+        * List head for cfs_expr_list::el_list.
+        */
+       struct list_head ar_numaddr_ranges;
+};
+
+/**
+ * Nf_parse_addrlist method for networks using numeric addresses.
+ *
+ * Examples of such networks are gm and elan.
+ *
+ * \retval 0 if \a str parsed to numeric address
+ * \retval errno otherwise
+ */
+static int
+libcfs_num_parse(char *str, int len, struct list_head *list)
+{
+       struct cfs_expr_list *el;
+       int     rc;
+
+       rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el);
+       if (rc == 0)
+               list_add_tail(&el->el_link, list);
+
+       return rc;
+}
+
+/**
+ * Parses \<addrrange\> token of the syntax.
+ *
+ * Allocates struct addrrange and links to \a nidrange via
+ * (nidrange::nr_addrranges)
+ *
+ * \retval 1 if \a src parses to '*' | \<ipaddr_range\> | \<cfs_expr_list\>
+ * \retval 0 otherwise
+ */
+static int
+parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange)
+{
+       struct addrrange *addrrange;
+
+       if (src->ls_len == 1 && src->ls_str[0] == '*') {
+               nidrange->nr_all = 1;
+               return 1;
+       }
+
+       LIBCFS_ALLOC(addrrange, sizeof(struct addrrange));
+       if (addrrange == NULL)
+               return 0;
+       list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges);
+       INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges);
+
+       return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str,
+                                               src->ls_len,
+                                               &addrrange->ar_numaddr_ranges);
+}
+
+/**
+ * Finds or creates struct nidrange.
+ *
+ * Checks if \a src is a valid network name, looks for corresponding
+ * nidrange on the list of nidranges (\a nidlist), creates a new struct
+ * nidrange if it is not found.
+ *
+ * \retval pointer to struct nidrange matching network specified via \a src
+ * \retval NULL if \a src does not match any network
+ */
+static struct nidrange *
+add_nidrange(const struct cfs_lstr *src,
+            struct list_head *nidlist)
+{
+       struct netstrfns *nf;
+       struct nidrange *nr;
+       int endlen;
+       unsigned netnum;
+
+       if (src->ls_len >= LNET_NIDSTR_SIZE)
+               return NULL;
+
+       nf = libcfs_namenum2netstrfns(src->ls_str);
+       if (nf == NULL)
+               return NULL;
+       endlen = src->ls_len - strlen(nf->nf_name);
+       if (endlen == 0)
+               /* network name only, e.g. "elan" or "tcp" */
+               netnum = 0;
+       else {
+               /* e.g. "elan25" or "tcp23", refuse to parse if
+                * network name is not appended with decimal or
+                * hexadecimal number */
+               if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name),
+                                      endlen, &netnum, 0, MAX_NUMERIC_VALUE))
+                       return NULL;
+       }
+
+       list_for_each_entry(nr, nidlist, nr_link) {
+               if (nr->nr_netstrfns != nf)
+                       continue;
+               if (nr->nr_netnum != netnum)
+                       continue;
+               return nr;
+       }
+
+       LIBCFS_ALLOC(nr, sizeof(struct nidrange));
+       if (nr == NULL)
+               return NULL;
+       list_add_tail(&nr->nr_link, nidlist);
+       INIT_LIST_HEAD(&nr->nr_addrranges);
+       nr->nr_netstrfns = nf;
+       nr->nr_all = 0;
+       nr->nr_netnum = netnum;
+
+       return nr;
+}
+
+/**
+ * Parses \<nidrange\> token of the syntax.
+ *
+ * \retval 1 if \a src parses to \<addrrange\> '@' \<net\>
+ * \retval 0 otherwise
+ */
+static int
+parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
+{
+       struct cfs_lstr addrrange;
+       struct cfs_lstr net;
+       struct cfs_lstr tmp;
+       struct nidrange *nr;
+
+       tmp = *src;
+       if (cfs_gettok(src, '@', &addrrange) == 0)
+               goto failed;
+
+       if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL)
+               goto failed;
+
+       nr = add_nidrange(&net, nidlist);
+       if (nr == NULL)
+               goto failed;
+
+       if (parse_addrange(&addrrange, nr) != 0)
+               goto failed;
+
+       return 1;
+ failed:
+       CWARN("can't parse nidrange: \"%.*s\"\n", tmp.ls_len, tmp.ls_str);
+       return 0;
+}
+
+/**
+ * Frees addrrange structures of \a list.
+ *
+ * For each struct addrrange structure found on \a list it frees
+ * cfs_expr_list list attached to it and frees the addrrange itself.
+ *
+ * \retval none
+ */
+static void
+free_addrranges(struct list_head *list)
+{
+       while (!list_empty(list)) {
+               struct addrrange *ar;
+
+               ar = list_entry(list->next, struct addrrange, ar_link);
+
+               cfs_expr_list_free_list(&ar->ar_numaddr_ranges);
+               list_del(&ar->ar_link);
+               LIBCFS_FREE(ar, sizeof(struct addrrange));
+       }
+}
+
+/**
+ * Frees nidrange structures of \a list.
+ *
+ * For each struct nidrange structure found on \a list it frees
+ * addrrange list attached to it and frees the nidrange itself.
+ *
+ * \retval none
+ */
+void
+cfs_free_nidlist(struct list_head *list)
+{
+       struct list_head *pos, *next;
+       struct nidrange *nr;
+
+       list_for_each_safe(pos, next, list) {
+               nr = list_entry(pos, struct nidrange, nr_link);
+               free_addrranges(&nr->nr_addrranges);
+               list_del(pos);
+               LIBCFS_FREE(nr, sizeof(struct nidrange));
+       }
+}
+
+/**
+ * Parses nid range list.
+ *
+ * Parses with rigorous syntax and overflow checking \a str into
+ * \<nidrange\> [ ' ' \<nidrange\> ], compiles \a str into a set of
+ * structures and links those structures to \a nidlist. The resulting
+ * list can be used to match a NID against the set of NIDs defined by
+ * \a str.
+ * \see cfs_match_nid
+ *
+ * \retval 1 on success
+ * \retval 0 otherwise
+ */
+int
+cfs_parse_nidlist(char *str, int len, struct list_head *nidlist)
+{
+       struct cfs_lstr src;
+       struct cfs_lstr res;
+       int rc;
+       ENTRY;
+
+       src.ls_str = str;
+       src.ls_len = len;
+       INIT_LIST_HEAD(nidlist);
+       while (src.ls_str) {
+               rc = cfs_gettok(&src, ' ', &res);
+               if (rc == 0) {
+                       cfs_free_nidlist(nidlist);
+                       RETURN(0);
+               }
+               rc = parse_nidrange(&res, nidlist);
+               if (rc == 0) {
+                       cfs_free_nidlist(nidlist);
+                       RETURN(0);
+               }
+       }
+       RETURN(1);
+}
+
+/*
+ * Nf_match_addr method for networks using numeric addresses
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+static int
+libcfs_num_match(__u32 addr, struct list_head *numaddr)
+{
+       struct cfs_expr_list *el;
+
+       LASSERT(!list_empty(numaddr));
+       el = list_entry(numaddr->next, struct cfs_expr_list, el_link);
+
+       return cfs_expr_list_match(addr, el);
+}
+
+/**
+ * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist).
+ *
+ * \see cfs_parse_nidlist()
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
+{
+       struct nidrange *nr;
+       struct addrrange *ar;
+       ENTRY;
+
+       list_for_each_entry(nr, nidlist, nr_link) {
+               if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid)))
+                       continue;
+               if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid)))
+                       continue;
+               if (nr->nr_all)
+                       RETURN(1);
+               list_for_each_entry(ar, &nr->nr_addrranges, ar_link)
+                       if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid),
+                                                      &ar->ar_numaddr_ranges))
+                               RETURN(1);
+       }
+       RETURN(0);
+}
+
+
+EXPORT_SYMBOL(libcfs_isknown_lnd);
+EXPORT_SYMBOL(libcfs_lnd2modname);
+EXPORT_SYMBOL(libcfs_lnd2str);
+EXPORT_SYMBOL(libcfs_str2lnd);
+EXPORT_SYMBOL(libcfs_net2str);
+EXPORT_SYMBOL(libcfs_nid2str);
+EXPORT_SYMBOL(libcfs_str2net);
+EXPORT_SYMBOL(libcfs_str2nid);
+EXPORT_SYMBOL(libcfs_id2str);
+EXPORT_SYMBOL(libcfs_str2anynid);
+EXPORT_SYMBOL(cfs_free_nidlist);
+EXPORT_SYMBOL(cfs_parse_nidlist);
+EXPORT_SYMBOL(cfs_match_nid);
diff --git a/drivers/staging/lustre/lustre/libcfs/prng.c b/drivers/staging/lustre/lustre/libcfs/prng.c
new file mode 100644 (file)
index 0000000..69224d8
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/prng.c
+ *
+ * Concatenation of the following two 16-bit multiply-with-carry generators,
+ * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16, with the
+ * number and carry packed within the same 32-bit integer.
+ * Algorithm recommended by Marsaglia.
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+From: George Marsaglia <geo@stat.fsu.edu>
+Newsgroups: sci.math
+Subject: Re: A RANDOM NUMBER GENERATOR FOR C
+Date: Tue, 30 Sep 1997 05:29:35 -0700
+
+ * You may replace the two constants 36969 and 18000 by any
+ * pair of distinct constants from this list:
+ * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584
+ * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243
+ * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974
+ * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114
+ * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088
+ * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834
+ * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013
+ * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083
+ * (or any other 16-bit constants k for which both k*2^16-1
+ * and k*2^15-1 are prime) */
+
+#define RANDOM_CONST_A 18030
+#define RANDOM_CONST_B 29013
+
+static unsigned int seed_x = 521288629;
+static unsigned int seed_y = 362436069;
+
+/**
+ * cfs_rand - generate a pseudo-random number
+ *
+ * First it advances both seeds from their previous values, then it combines
+ * them into a new pseudo-random number for use.
+ *
+ * Returns a pseudo-random 32-bit integer
+ */
+unsigned int cfs_rand(void)
+{
+       seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16);
+       seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16);
+
+       return ((seed_x << 16) + (seed_y & 65535));
+}
+EXPORT_SYMBOL(cfs_rand);
+
+/**
+ * cfs_srand - sets the initial seed
+ * @seed1 : (seed_x) should have the most entropy in the low bits of the word
+ * @seed2 : (seed_y) should have the most entropy in the high bits of the word
+ *
+ * Replaces the original seeds with new values. Used to seed a new sequence
+ * of pseudo-random numbers.
+ */
+void cfs_srand(unsigned int seed1, unsigned int seed2)
+{
+       if (seed1)
+               seed_x = seed1; /* use default seeds if parameter is 0 */
+       if (seed2)
+               seed_y = seed2;
+}
+EXPORT_SYMBOL(cfs_srand);
+
+/**
+ * cfs_get_random_bytes - generate a bunch of random numbers
+ * @buf : buffer to fill with random numbers
+ * @size: size of passed in buffer
+ *
+ * Fills a buffer with random bytes
+ */
+void cfs_get_random_bytes(void *buf, int size)
+{
+       int *p = buf;
+       int rem, tmp;
+
+       LASSERT(size >= 0);
+
+       rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size);
+       if (rem) {
+               get_random_bytes(&tmp, sizeof(tmp));
+               tmp ^= cfs_rand();
+               memcpy(buf, &tmp, rem);
+               p = buf + rem;
+               size -= rem;
+       }
+
+       while (size >= sizeof(int)) {
+               get_random_bytes(&tmp, sizeof(tmp));
+               *p = cfs_rand() ^ tmp;
+               size -= sizeof(int);
+               p++;
+       }
+       buf = p;
+       if (size) {
+               get_random_bytes(&tmp, sizeof(tmp));
+               tmp ^= cfs_rand();
+               memcpy(buf, &tmp, size);
+       }
+}
+EXPORT_SYMBOL(cfs_get_random_bytes);
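+
+/*
+ * Usage sketch (illustrative; seed_lo, seed_hi and cookie are hypothetical):
+ * seed once with whatever entropy is at hand, then draw values.  For
+ * buffers, cfs_get_random_bytes() is the stronger choice since it also
+ * folds in the kernel's get_random_bytes().
+ *
+ *     cfs_srand(seed_lo, seed_hi);
+ *     id = cfs_rand() % max_id;
+ *     cfs_get_random_bytes(&cookie, sizeof(cookie));
+ */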
diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.c b/drivers/staging/lustre/lustre/libcfs/tracefile.c
new file mode 100644 (file)
index 0000000..439e71d
--- /dev/null
@@ -0,0 +1,1195 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/tracefile.c
+ *
+ * Author: Zach Brown <zab@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+#include "tracefile.h"
+
+#include <linux/libcfs/libcfs.h>
+
+/* XXX move things up to the top, comment */
+union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned;
+
+char cfs_tracefile[TRACEFILE_NAME_SIZE];
+long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+static struct tracefiled_ctl trace_tctl;
+struct mutex cfs_trace_thread_mutex;
+static int thread_running = 0;
+
+atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
+
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+                                        struct cfs_trace_cpu_data *tcd);
+
+static inline struct cfs_trace_page *
+cfs_tage_from_list(struct list_head *list)
+{
+       return list_entry(list, struct cfs_trace_page, linkage);
+}
+
+static struct cfs_trace_page *cfs_tage_alloc(int gfp)
+{
+       struct page         *page;
+       struct cfs_trace_page *tage;
+
+       /* My caller is trying to free memory */
+       if (!in_interrupt() && memory_pressure_get())
+               return NULL;
+
+       /*
+        * Don't spam console with allocation failures: they will be reported
+        * by upper layer anyway.
+        */
+       gfp |= __GFP_NOWARN;
+       page = alloc_page(gfp);
+       if (page == NULL)
+               return NULL;
+
+       tage = kmalloc(sizeof(*tage), gfp);
+       if (tage == NULL) {
+               __free_page(page);
+               return NULL;
+       }
+
+       tage->page = page;
+       atomic_inc(&cfs_tage_allocated);
+       return tage;
+}
+
+static void cfs_tage_free(struct cfs_trace_page *tage)
+{
+       __LASSERT(tage != NULL);
+       __LASSERT(tage->page != NULL);
+
+       __free_page(tage->page);
+       kfree(tage);
+       atomic_dec(&cfs_tage_allocated);
+}
+
+static void cfs_tage_to_tail(struct cfs_trace_page *tage,
+                            struct list_head *queue)
+{
+       __LASSERT(tage != NULL);
+       __LASSERT(queue != NULL);
+
+       list_move_tail(&tage->linkage, queue);
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp,
+                          struct list_head *stock)
+{
+       int i;
+
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
+       for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) {
+               struct cfs_trace_page *tage;
+
+               tage = cfs_tage_alloc(gfp);
+               if (tage == NULL)
+                       break;
+               list_add_tail(&tage->linkage, stock);
+       }
+       return i;
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct cfs_trace_page *
+cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
+{
+       struct cfs_trace_page *tage;
+
+       if (tcd->tcd_cur_pages > 0) {
+               __LASSERT(!list_empty(&tcd->tcd_pages));
+               tage = cfs_tage_from_list(tcd->tcd_pages.prev);
+               if (tage->used + len <= PAGE_CACHE_SIZE)
+                       return tage;
+       }
+
+       if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
+               if (tcd->tcd_cur_stock_pages > 0) {
+                       tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
+                       --tcd->tcd_cur_stock_pages;
+                       list_del_init(&tage->linkage);
+               } else {
+                       tage = cfs_tage_alloc(GFP_ATOMIC);
+                       if (unlikely(tage == NULL)) {
+                               if ((!memory_pressure_get() ||
+                                    in_interrupt()) && printk_ratelimit())
+                                       printk(KERN_WARNING
+                                              "cannot allocate a tage (%ld)\n",
+                                              tcd->tcd_cur_pages);
+                               return NULL;
+                       }
+               }
+
+               tage->used = 0;
+               tage->cpu = smp_processor_id();
+               tage->type = tcd->tcd_type;
+               list_add_tail(&tage->linkage, &tcd->tcd_pages);
+               tcd->tcd_cur_pages++;
+
+               if (tcd->tcd_cur_pages > 8 && thread_running) {
+                       struct tracefiled_ctl *tctl = &trace_tctl;
+                       /*
+                        * wake up tracefiled to process some pages.
+                        */
+                       wake_up(&tctl->tctl_waitq);
+               }
+               return tage;
+       }
+       return NULL;
+}
+
+static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
+{
+       int pgcount = tcd->tcd_cur_pages / 10;
+       struct page_collection pc;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
+       if (printk_ratelimit())
+               printk(KERN_WARNING "debug daemon buffer overflowed; "
+                      "discarding 10%% of pages (%d of %ld)\n",
+                      pgcount + 1, tcd->tcd_cur_pages);
+
+       INIT_LIST_HEAD(&pc.pc_pages);
+       spin_lock_init(&pc.pc_lock);
+
+       list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
+               if (pgcount-- == 0)
+                       break;
+
+               list_move_tail(&tage->linkage, &pc.pc_pages);
+               tcd->tcd_cur_pages--;
+       }
+       put_pages_on_tcd_daemon_list(&pc, tcd);
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
+                                                unsigned long len)
+{
+       struct cfs_trace_page *tage;
+
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
+       if (len > PAGE_CACHE_SIZE) {
+               printk(KERN_ERR
+                      "cowardly refusing to write %lu bytes in a page\n", len);
+               return NULL;
+       }
+
+       tage = cfs_trace_get_tage_try(tcd, len);
+       if (tage != NULL)
+               return tage;
+       if (thread_running)
+               cfs_tcd_shrink(tcd);
+       if (tcd->tcd_cur_pages > 0) {
+               tage = cfs_tage_from_list(tcd->tcd_pages.next);
+               tage->used = 0;
+               cfs_tage_to_tail(tage, &tcd->tcd_pages);
+       }
+       return tage;
+}
+
+int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+                    const char *format, ...)
+{
+       va_list args;
+       int     rc;
+
+       va_start(args, format);
+       rc = libcfs_debug_vmsg2(msgdata, format, args, NULL);
+       va_end(args);
+
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_debug_msg);
+
+int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+                      const char *format1, va_list args,
+                      const char *format2, ...)
+{
+       struct cfs_trace_cpu_data *tcd = NULL;
+       struct ptldebug_header     header = {0};
+       struct cfs_trace_page     *tage;
+       /* string_buf is used only if tcd != NULL, and is always set then */
+       char                  *string_buf = NULL;
+       char                  *debug_buf;
+       int                     known_size;
+       int                     needed = 85; /* average message length */
+       int                     max_nob;
+       va_list             ap;
+       int                     depth;
+       int                     i;
+       int                     remain;
+       int                     mask = msgdata->msg_mask;
+       char                  *file = (char *)msgdata->msg_file;
+       cfs_debug_limit_state_t   *cdls = msgdata->msg_cdls;
+
+       if (strchr(file, '/'))
+               file = strrchr(file, '/') + 1;
+
+       tcd = cfs_trace_get_tcd();
+
+       /* cfs_trace_get_tcd() grabs a lock, which disables preemption and
+        * pins us to a particular CPU.  This avoids an smp_processor_id()
+        * warning on Linux when debugging is enabled. */
+       cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
+
+       if (tcd == NULL)                /* arch may not log in IRQ context */
+               goto console;
+
+       if (tcd->tcd_cur_pages == 0)
+               header.ph_flags |= PH_FLAG_FIRST_RECORD;
+
+       if (tcd->tcd_shutting_down) {
+               cfs_trace_put_tcd(tcd);
+               tcd = NULL;
+               goto console;
+       }
+
+       depth = __current_nesting_level();
+       known_size = strlen(file) + 1 + depth;
+       if (msgdata->msg_fn)
+               known_size += strlen(msgdata->msg_fn) + 1;
+
+       if (libcfs_debug_binary)
+               known_size += sizeof(header);
+
+       /*
+        * '2' is used because vsnprintf() returns the real size required for
+        * the output _without_ the terminating NUL, so we retry once if
+        * 'needed' turned out to be too small for this format.
+        */
+       for (i = 0; i < 2; i++) {
+               tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
+               if (tage == NULL) {
+                       if (needed + known_size > PAGE_CACHE_SIZE)
+                               mask |= D_ERROR;
+
+                       cfs_trace_put_tcd(tcd);
+                       tcd = NULL;
+                       goto console;
+               }
+
+               string_buf = (char *)page_address(tage->page) +
+                                       tage->used + known_size;
+
+               max_nob = PAGE_CACHE_SIZE - tage->used - known_size;
+               if (max_nob <= 0) {
+                       printk(KERN_EMERG "negative max_nob: %d\n",
+                              max_nob);
+                       mask |= D_ERROR;
+                       cfs_trace_put_tcd(tcd);
+                       tcd = NULL;
+                       goto console;
+               }
+
+               needed = 0;
+               if (format1) {
+                       va_copy(ap, args);
+                       needed = vsnprintf(string_buf, max_nob, format1, ap);
+                       va_end(ap);
+               }
+
+               if (format2) {
+                       remain = max_nob - needed;
+                       if (remain < 0)
+                               remain = 0;
+
+                       va_start(ap, format2);
+                       needed += vsnprintf(string_buf + needed, remain,
+                                           format2, ap);
+                       va_end(ap);
+               }
+
+               if (needed < max_nob) /* well. printing ok.. */
+                       break;
+       }
+
+       if (*(string_buf+needed-1) != '\n')
+               printk(KERN_INFO "format at %s:%d:%s doesn't end in "
+                      "newline\n", file, msgdata->msg_line, msgdata->msg_fn);
+
+       header.ph_len = known_size + needed;
+       debug_buf = (char *)page_address(tage->page) + tage->used;
+
+       if (libcfs_debug_binary) {
+               memcpy(debug_buf, &header, sizeof(header));
+               tage->used += sizeof(header);
+               debug_buf += sizeof(header);
+       }
+
+       /* indent message according to the nesting level */
+       while (depth-- > 0) {
+               *(debug_buf++) = '.';
+               ++ tage->used;
+       }
+
+       strcpy(debug_buf, file);
+       tage->used += strlen(file) + 1;
+       debug_buf += strlen(file) + 1;
+
+       if (msgdata->msg_fn) {
+               strcpy(debug_buf, msgdata->msg_fn);
+               tage->used += strlen(msgdata->msg_fn) + 1;
+               debug_buf += strlen(msgdata->msg_fn) + 1;
+       }
+
+       __LASSERT(debug_buf == string_buf);
+
+       tage->used += needed;
+       __LASSERT (tage->used <= PAGE_CACHE_SIZE);
+
+console:
+       if ((mask & libcfs_printk) == 0) {
+               /* no console output requested */
+               if (tcd != NULL)
+                       cfs_trace_put_tcd(tcd);
+               return 1;
+       }
+
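+       /*
+        * Console rate limiting: cdls_delay grows geometrically while similar
+        * messages keep arriving, shrinks again after a long quiet period,
+        * and the count of skipped messages is reported once printing
+        * resumes further below.
+        */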
+       if (cdls != NULL) {
+               if (libcfs_console_ratelimit &&
+                   cdls->cdls_next != 0 &&     /* not first time ever */
+                   !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
+                       /* skipping a console message */
+                       cdls->cdls_count++;
+                       if (tcd != NULL)
+                               cfs_trace_put_tcd(tcd);
+                       return 1;
+               }
+
+               if (cfs_time_after(cfs_time_current(), cdls->cdls_next +
+                                                      libcfs_console_max_delay
+                                                      + cfs_time_seconds(10))) {
+                       /* last timeout was a long time ago */
+                       cdls->cdls_delay /= libcfs_console_backoff * 4;
+               } else {
+                       cdls->cdls_delay *= libcfs_console_backoff;
+
+                       if (cdls->cdls_delay < libcfs_console_min_delay)
+                               cdls->cdls_delay = libcfs_console_min_delay;
+                       else if (cdls->cdls_delay > libcfs_console_max_delay)
+                               cdls->cdls_delay = libcfs_console_max_delay;
+               }
+
+               /* ensure cdls_next is never zero after it's been seen */
+               cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
+       }
+
+       if (tcd != NULL) {
+               cfs_print_to_console(&header, mask, string_buf, needed, file,
+                                    msgdata->msg_fn);
+               cfs_trace_put_tcd(tcd);
+       } else {
+               string_buf = cfs_trace_get_console_buffer();
+
+               needed = 0;
+               if (format1 != NULL) {
+                       va_copy(ap, args);
+                       needed = vsnprintf(string_buf,
+                                          CFS_TRACE_CONSOLE_BUFFER_SIZE,
+                                          format1, ap);
+                       va_end(ap);
+               }
+               if (format2 != NULL) {
+                       remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed;
+                       if (remain > 0) {
+                               va_start(ap, format2);
+                               needed += vsnprintf(string_buf+needed, remain,
+                                                   format2, ap);
+                               va_end(ap);
+                       }
+               }
+               cfs_print_to_console(&header, mask,
+                                    string_buf, needed, file, msgdata->msg_fn);
+
+               cfs_trace_put_console_buffer(string_buf);
+       }
+
+       if (cdls != NULL && cdls->cdls_count != 0) {
+               string_buf = cfs_trace_get_console_buffer();
+
+               needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE,
+                                 "Skipped %d previous similar message%s\n",
+                                 cdls->cdls_count,
+                                 (cdls->cdls_count > 1) ? "s" : "");
+
+               cfs_print_to_console(&header, mask,
+                                    string_buf, needed, file, msgdata->msg_fn);
+
+               cfs_trace_put_console_buffer(string_buf);
+               cdls->cdls_count = 0;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(libcfs_debug_vmsg2);
+
+void
+cfs_trace_assertion_failed(const char *str,
+                          struct libcfs_debug_msg_data *msgdata)
+{
+       struct ptldebug_header hdr;
+
+       libcfs_panic_in_progress = 1;
+       libcfs_catastrophe = 1;
+       mb();
+
+       cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK());
+
+       cfs_print_to_console(&hdr, D_EMERG, str, strlen(str),
+                            msgdata->msg_file, msgdata->msg_fn);
+
+       panic("Lustre debug assertion failure\n");
+
+       /* not reached */
+}
+
+static void
+panic_collect_pages(struct page_collection *pc)
+{
+       /* Do the collect_pages job on a single CPU: assumes that all other
+        * CPUs have been stopped during a panic.  If this isn't true for some
+        * arch, this will have to be implemented separately in each arch.  */
+       int                     i;
+       int                     j;
+       struct cfs_trace_cpu_data *tcd;
+
+       INIT_LIST_HEAD(&pc->pc_pages);
+
+       cfs_tcd_for_each(tcd, i, j) {
+               list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+               tcd->tcd_cur_pages = 0;
+
+               if (pc->pc_want_daemon_pages) {
+                       list_splice_init(&tcd->tcd_daemon_pages,
+                                            &pc->pc_pages);
+                       tcd->tcd_cur_daemon_pages = 0;
+               }
+       }
+}
+
+static void collect_pages_on_all_cpus(struct page_collection *pc)
+{
+       struct cfs_trace_cpu_data *tcd;
+       int i, cpu;
+
+       spin_lock(&pc->pc_lock);
+       cfs_for_each_possible_cpu(cpu) {
+               cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+                       list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+                       tcd->tcd_cur_pages = 0;
+                       if (pc->pc_want_daemon_pages) {
+                               list_splice_init(&tcd->tcd_daemon_pages,
+                                                    &pc->pc_pages);
+                               tcd->tcd_cur_daemon_pages = 0;
+                       }
+               }
+       }
+       spin_unlock(&pc->pc_lock);
+}
+
+static void collect_pages(struct page_collection *pc)
+{
+       INIT_LIST_HEAD(&pc->pc_pages);
+
+       if (libcfs_panic_in_progress)
+               panic_collect_pages(pc);
+       else
+               collect_pages_on_all_cpus(pc);
+}
+
+static void put_pages_back_on_all_cpus(struct page_collection *pc)
+{
+       struct cfs_trace_cpu_data *tcd;
+       struct list_head *cur_head;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+       int i, cpu;
+
+       spin_lock(&pc->pc_lock);
+       cfs_for_each_possible_cpu(cpu) {
+               cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+                       cur_head = tcd->tcd_pages.next;
+
+                       list_for_each_entry_safe(tage, tmp, &pc->pc_pages,
+                                                linkage) {
+
+                               __LASSERT_TAGE_INVARIANT(tage);
+
+                               if (tage->cpu != cpu || tage->type != i)
+                                       continue;
+
+                               cfs_tage_to_tail(tage, cur_head);
+                               tcd->tcd_cur_pages++;
+                       }
+               }
+       }
+       spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_back(struct page_collection *pc)
+{
+       if (!libcfs_panic_in_progress)
+               put_pages_back_on_all_cpus(pc);
+}
+
+/* Add pages to a per-cpu debug daemon ringbuffer.  This buffer makes sure that
+ * we have a good amount of data at all times for dumping during an LBUG, even
+ * if we have been steadily writing (and otherwise discarding) pages via the
+ * debug daemon. */
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+                                        struct cfs_trace_cpu_data *tcd)
+{
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+
+       spin_lock(&pc->pc_lock);
+       list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
+
+               __LASSERT_TAGE_INVARIANT(tage);
+
+               if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type)
+                       continue;
+
+               cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages);
+               tcd->tcd_cur_daemon_pages++;
+
+               if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) {
+                       struct cfs_trace_page *victim;
+
+                       __LASSERT(!list_empty(&tcd->tcd_daemon_pages));
+                       victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next);
+
+                       __LASSERT_TAGE_INVARIANT(victim);
+
+                       list_del(&victim->linkage);
+                       cfs_tage_free(victim);
+                       tcd->tcd_cur_daemon_pages--;
+               }
+       }
+       spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_on_daemon_list(struct page_collection *pc)
+{
+       struct cfs_trace_cpu_data *tcd;
+       int i, cpu;
+
+       cfs_for_each_possible_cpu(cpu) {
+               cfs_tcd_for_each_type_lock(tcd, i, cpu)
+                       put_pages_on_tcd_daemon_list(pc, tcd);
+       }
+}
+
+void cfs_trace_debug_print(void)
+{
+       struct page_collection pc;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+
+       spin_lock_init(&pc.pc_lock);
+
+       pc.pc_want_daemon_pages = 1;
+       collect_pages(&pc);
+       list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+               char *p, *file, *fn;
+               struct page *page;
+
+               __LASSERT_TAGE_INVARIANT(tage);
+
+               page = tage->page;
+               p = page_address(page);
+               while (p < ((char *)page_address(page) + tage->used)) {
+                       struct ptldebug_header *hdr;
+                       int len;
+                       hdr = (void *)p;
+                       p += sizeof(*hdr);
+                       file = p;
+                       p += strlen(file) + 1;
+                       fn = p;
+                       p += strlen(fn) + 1;
+                       len = hdr->ph_len - (int)(p - (char *)hdr);
+
+                       cfs_print_to_console(hdr, D_EMERG, p, len, file, fn);
+
+                       p += len;
+               }
+
+               list_del(&tage->linkage);
+               cfs_tage_free(tage);
+       }
+}
+
+int cfs_tracefile_dump_all_pages(char *filename)
+{
+       struct page_collection  pc;
+       struct file             *filp;
+       struct cfs_trace_page   *tage;
+       struct cfs_trace_page   *tmp;
+       int rc;
+
+       DECL_MMSPACE;
+
+       cfs_tracefile_write_lock();
+
+       filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
+       if (IS_ERR(filp)) {
+               rc = PTR_ERR(filp);
+               filp = NULL;
+               printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
+                     filename, rc);
+               goto out;
+       }
+
+       spin_lock_init(&pc.pc_lock);
+       pc.pc_want_daemon_pages = 1;
+       collect_pages(&pc);
+       if (list_empty(&pc.pc_pages)) {
+               rc = 0;
+               goto close;
+       }
+
+       /* ok, for now, just write the pages.  in the future we'll be building
+        * iobufs with the pages and calling generic_direct_IO */
+       MMSPACE_OPEN;
+       list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+               __LASSERT_TAGE_INVARIANT(tage);
+
+               rc = filp_write(filp, page_address(tage->page),
+                               tage->used, filp_poff(filp));
+               if (rc != (int)tage->used) {
+                       printk(KERN_WARNING "wanted to write %u but wrote "
+                              "%d\n", tage->used, rc);
+                       put_pages_back(&pc);
+                       __LASSERT(list_empty(&pc.pc_pages));
+                       break;
+               }
+               list_del(&tage->linkage);
+               cfs_tage_free(tage);
+       }
+       MMSPACE_CLOSE;
+       rc = filp_fsync(filp);
+       if (rc)
+               printk(KERN_ERR "sync returns %d\n", rc);
+close:
+       filp_close(filp, NULL);
+out:
+       cfs_tracefile_write_unlock();
+       return rc;
+}
+
+void cfs_trace_flush_pages(void)
+{
+       struct page_collection pc;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+
+       spin_lock_init(&pc.pc_lock);
+
+       pc.pc_want_daemon_pages = 1;
+       collect_pages(&pc);
+       list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+               __LASSERT_TAGE_INVARIANT(tage);
+
+               list_del(&tage->linkage);
+               cfs_tage_free(tage);
+       }
+}
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+                           const char *usr_buffer, int usr_buffer_nob)
+{
+       int    nob;
+
+       if (usr_buffer_nob > knl_buffer_nob)
+               return -EOVERFLOW;
+
+       if (copy_from_user((void *)knl_buffer,
+                          (void *)usr_buffer, usr_buffer_nob))
+               return -EFAULT;
+
+       nob = strnlen(knl_buffer, usr_buffer_nob);
+       while (nob-- >= 0)                    /* strip trailing whitespace */
+               if (!isspace(knl_buffer[nob]))
+                       break;
+
+       if (nob < 0)                        /* empty string */
+               return -EINVAL;
+
+       if (nob == knl_buffer_nob)            /* no space to terminate */
+               return -EOVERFLOW;
+
+       knl_buffer[nob + 1] = 0;                /* terminate */
+       return 0;
+}
+EXPORT_SYMBOL(cfs_trace_copyin_string);
+
+int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+                            const char *knl_buffer, char *append)
+{
+       /* NB if 'append' != NULL, it's a single character to append to the
+        * copied out string - usually "\n", for /proc entries and "" (i.e. a
+        * terminating zero byte) for sysctl entries */
+       int   nob = strlen(knl_buffer);
+
+       if (nob > usr_buffer_nob)
+               nob = usr_buffer_nob;
+
+       if (copy_to_user(usr_buffer, knl_buffer, nob))
+               return -EFAULT;
+
+       if (append != NULL && nob < usr_buffer_nob) {
+               if (copy_to_user(usr_buffer + nob, append, 1))
+                       return -EFAULT;
+
+               nob++;
+       }
+
+       return nob;
+}
+EXPORT_SYMBOL(cfs_trace_copyout_string);
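As a point of reference, here is a hedged sketch of how a read-side handler might use cfs_trace_copyout_string() to hand a kernel-side string back to user space with the trailing newline that /proc readers expect. The handler name and the setting value are purely illustrative; only the helper itself comes from this patch.

/* Illustrative only: a hypothetical read handler for a /proc entry. */
static int example_proc_read(char *usr_buffer, int usr_buffer_nob)
{
	char setting[64] = "/tmp/lustre-log";	/* hypothetical kernel-side value */

	/* Returns the number of bytes copied to user space (including the
	 * appended '\n' when it fits) or a negative errno such as -EFAULT. */
	return cfs_trace_copyout_string(usr_buffer, usr_buffer_nob,
					setting, "\n");
}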
+
+int cfs_trace_allocate_string_buffer(char **str, int nob)
+{
+       if (nob > 2 * PAGE_CACHE_SIZE)      /* string must be "sensible" */
+               return -EINVAL;
+
+       *str = kmalloc(nob, GFP_IOFS | __GFP_ZERO);
+       if (*str == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void cfs_trace_free_string_buffer(char *str, int nob)
+{
+       kfree(str);
+}
+
+int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob)
+{
+       char     *str;
+       int        rc;
+
+       rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
+       if (rc != 0)
+               return rc;
+
+       rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
+                                    usr_str, usr_str_nob);
+       if (rc != 0)
+               goto out;
+
+       if (str[0] != '/') {
+               rc = -EINVAL;
+               goto out;
+       }
+       rc = cfs_tracefile_dump_all_pages(str);
+out:
+       cfs_trace_free_string_buffer(str, usr_str_nob + 1);
+       return rc;
+}
+
+int cfs_trace_daemon_command(char *str)
+{
+       int       rc = 0;
+
+       cfs_tracefile_write_lock();
+
+       if (strcmp(str, "stop") == 0) {
+               cfs_tracefile_write_unlock();
+               cfs_trace_stop_thread();
+               cfs_tracefile_write_lock();
+               memset(cfs_tracefile, 0, sizeof(cfs_tracefile));
+
+       } else if (strncmp(str, "size=", 5) == 0) {
+               cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0);
+               if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480)
+                       cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+               else
+                       cfs_tracefile_size <<= 20;
+
+       } else if (strlen(str) >= sizeof(cfs_tracefile)) {
+               rc = -ENAMETOOLONG;
+       } else if (str[0] != '/') {
+               rc = -EINVAL;
+       } else {
+               strcpy(cfs_tracefile, str);
+
+               printk(KERN_INFO
+                      "Lustre: debug daemon will attempt to start writing "
+                      "to %s (%lukB max)\n", cfs_tracefile,
+                      (long)(cfs_tracefile_size >> 10));
+
+               cfs_trace_start_thread();
+       }
+
+       cfs_tracefile_write_unlock();
+       return rc;
+}
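For quick reference, the parser above accepts exactly three command forms; the calls below are an illustrative sketch of each branch, not part of the patch.

/* Illustrative calls, mirroring the three branches of the parser above. */
cfs_trace_daemon_command("/var/log/lustre-debug.log");	/* absolute path: record it
							 * and start the daemon */
cfs_trace_daemon_command("size=64");	/* cap the trace file at 64 MB; values outside
					 * 10..20480 fall back to CFS_TRACEFILE_SIZE */
cfs_trace_daemon_command("stop");	/* stop the daemon and clear cfs_tracefile */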
+
+int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob)
+{
+       char *str;
+       int   rc;
+
+       rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
+       if (rc != 0)
+               return rc;
+
+       rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
+                                usr_str, usr_str_nob);
+       if (rc == 0)
+               rc = cfs_trace_daemon_command(str);
+
+       cfs_trace_free_string_buffer(str, usr_str_nob + 1);
+       return rc;
+}
+
+int cfs_trace_set_debug_mb(int mb)
+{
+       int i;
+       int j;
+       int pages;
+       int limit = cfs_trace_max_debug_mb();
+       struct cfs_trace_cpu_data *tcd;
+
+       if (mb < num_possible_cpus()) {
+               printk(KERN_WARNING
+                      "Lustre: %d MB is too small for debug buffer size, "
+                      "setting it to %d MB.\n", mb, num_possible_cpus());
+               mb = num_possible_cpus();
+       }
+
+       if (mb > limit) {
+               printk(KERN_WARNING
+                      "Lustre: %d MB is too large for debug buffer size, "
+                      "setting it to %d MB.\n", mb, limit);
+               mb = limit;
+       }
+
+       mb /= num_possible_cpus();
+       pages = mb << (20 - PAGE_CACHE_SHIFT);
+
+       cfs_tracefile_write_lock();
+
+       cfs_tcd_for_each(tcd, i, j)
+               tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100;
+
+       cfs_tracefile_write_unlock();
+
+       return 0;
+}
+
+int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob)
+{
+       char     str[32];
+       int      rc;
+
+       rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob);
+       if (rc < 0)
+               return rc;
+
+       return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0));
+}
+
+int cfs_trace_get_debug_mb(void)
+{
+       int i;
+       int j;
+       struct cfs_trace_cpu_data *tcd;
+       int total_pages = 0;
+
+       cfs_tracefile_read_lock();
+
+       cfs_tcd_for_each(tcd, i, j)
+               total_pages += tcd->tcd_max_pages;
+
+       cfs_tracefile_read_unlock();
+
+       return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1;
+}
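A worked example of the sizing arithmetic above, assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12) and four possible CPUs; the numbers are illustrative only:

/*
 * cfs_trace_set_debug_mb(64) with 4 possible CPUs:
 *   per-CPU share   = 64 / 4           = 16 MB
 *   pages per CPU   = 16 << (20 - 12)  = 4096 pages
 *   tcd_max_pages   = 4096 * tcd_pages_factor / 100
 *
 * cfs_trace_get_debug_mb() then sums tcd_max_pages over all tcds and
 * converts back: (total_pages >> (20 - 12)) + 1 MB.
 */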
+
+static int tracefiled(void *arg)
+{
+       struct page_collection pc;
+       struct tracefiled_ctl *tctl = arg;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+       struct file *filp;
+       int last_loop = 0;
+       int rc;
+
+       DECL_MMSPACE;
+
+       /* we're started late enough that we pick up init's fs context */
+       /* this is so broken in uml?  what on earth is going on? */
+
+       spin_lock_init(&pc.pc_lock);
+       complete(&tctl->tctl_start);
+
+       while (1) {
+               wait_queue_t __wait;
+
+               pc.pc_want_daemon_pages = 0;
+               collect_pages(&pc);
+               if (list_empty(&pc.pc_pages))
+                       goto end_loop;
+
+               filp = NULL;
+               cfs_tracefile_read_lock();
+               if (cfs_tracefile[0] != 0) {
+                       filp = filp_open(cfs_tracefile,
+                                        O_CREAT | O_RDWR | O_LARGEFILE,
+                                        0600);
+                       if (IS_ERR(filp)) {
+                               rc = PTR_ERR(filp);
+                               filp = NULL;
+                               printk(KERN_WARNING "couldn't open %s: "
+                                      "%d\n", cfs_tracefile, rc);
+                       }
+               }
+               cfs_tracefile_read_unlock();
+               if (filp == NULL) {
+                       put_pages_on_daemon_list(&pc);
+                       __LASSERT(list_empty(&pc.pc_pages));
+                       goto end_loop;
+               }
+
+               MMSPACE_OPEN;
+
+               list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+                                                  linkage) {
+                       static loff_t f_pos;
+
+                       __LASSERT_TAGE_INVARIANT(tage);
+
+                       if (f_pos >= (off_t)cfs_tracefile_size)
+                               f_pos = 0;
+                       else if (f_pos > (off_t)filp_size(filp))
+                               f_pos = filp_size(filp);
+
+                       rc = filp_write(filp, page_address(tage->page),
+                                       tage->used, &f_pos);
+                       if (rc != (int)tage->used) {
+                               printk(KERN_WARNING "wanted to write %u "
+                                      "but wrote %d\n", tage->used, rc);
+                               put_pages_back(&pc);
+                               __LASSERT(list_empty(&pc.pc_pages));
+                       }
+               }
+               MMSPACE_CLOSE;
+
+               filp_close(filp, NULL);
+               put_pages_on_daemon_list(&pc);
+               if (!list_empty(&pc.pc_pages)) {
+                       int i;
+
+                       printk(KERN_ALERT "Lustre: trace pages aren't "
+                              "empty\n");
+                       printk(KERN_ERR "total cpus(%d): ",
+                              num_possible_cpus());
+                       for (i = 0; i < num_possible_cpus(); i++)
+                               if (cpu_online(i))
+                                       printk(KERN_ERR "%d(on) ", i);
+                               else
+                                       printk(KERN_ERR "%d(off) ", i);
+                       printk(KERN_ERR "\n");
+
+                       i = 0;
+                       list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+                                                    linkage)
+                               printk(KERN_ERR "page %d belongs to cpu "
+                                      "%d\n", ++i, tage->cpu);
+                       printk(KERN_ERR "There are %d pages unwritten\n",
+                              i);
+               }
+               __LASSERT(list_empty(&pc.pc_pages));
+end_loop:
+               if (atomic_read(&tctl->tctl_shutdown)) {
+                       if (last_loop == 0) {
+                               last_loop = 1;
+                               continue;
+                       } else {
+                               break;
+                       }
+               }
+               init_waitqueue_entry_current(&__wait);
+               add_wait_queue(&tctl->tctl_waitq, &__wait);
+               set_current_state(TASK_INTERRUPTIBLE);
+               waitq_timedwait(&__wait, TASK_INTERRUPTIBLE,
+                                   cfs_time_seconds(1));
+               remove_wait_queue(&tctl->tctl_waitq, &__wait);
+       }
+       complete(&tctl->tctl_stop);
+       return 0;
+}
+
+int cfs_trace_start_thread(void)
+{
+       struct tracefiled_ctl *tctl = &trace_tctl;
+       int rc = 0;
+
+       mutex_lock(&cfs_trace_thread_mutex);
+       if (thread_running)
+               goto out;
+
+       init_completion(&tctl->tctl_start);
+       init_completion(&tctl->tctl_stop);
+       init_waitqueue_head(&tctl->tctl_waitq);
+       atomic_set(&tctl->tctl_shutdown, 0);
+
+       if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) {
+               rc = -ECHILD;
+               goto out;
+       }
+
+       wait_for_completion(&tctl->tctl_start);
+       thread_running = 1;
+out:
+       mutex_unlock(&cfs_trace_thread_mutex);
+       return rc;
+}
+
+void cfs_trace_stop_thread(void)
+{
+       struct tracefiled_ctl *tctl = &trace_tctl;
+
+       mutex_lock(&cfs_trace_thread_mutex);
+       if (thread_running) {
+               printk(KERN_INFO
+                      "Lustre: shutting down debug daemon thread...\n");
+               atomic_set(&tctl->tctl_shutdown, 1);
+               wait_for_completion(&tctl->tctl_stop);
+               thread_running = 0;
+       }
+       mutex_unlock(&cfs_trace_thread_mutex);
+}
+
+int cfs_tracefile_init(int max_pages)
+{
+       struct cfs_trace_cpu_data *tcd;
+       int                 i;
+       int                 j;
+       int                 rc;
+       int                 factor;
+
+       rc = cfs_tracefile_init_arch();
+       if (rc != 0)
+               return rc;
+
+       cfs_tcd_for_each(tcd, i, j) {
+               /* tcd_pages_factor is initialized in cfs_tracefile_init_arch(). */
+               factor = tcd->tcd_pages_factor;
+               INIT_LIST_HEAD(&tcd->tcd_pages);
+               INIT_LIST_HEAD(&tcd->tcd_stock_pages);
+               INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
+               tcd->tcd_cur_pages = 0;
+               tcd->tcd_cur_stock_pages = 0;
+               tcd->tcd_cur_daemon_pages = 0;
+               tcd->tcd_max_pages = (max_pages * factor) / 100;
+               LASSERT(tcd->tcd_max_pages > 0);
+               tcd->tcd_shutting_down = 0;
+       }
+
+       return 0;
+}
+
+static void trace_cleanup_on_all_cpus(void)
+{
+       struct cfs_trace_cpu_data *tcd;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+       int i, cpu;
+
+       cfs_for_each_possible_cpu(cpu) {
+               cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+                       tcd->tcd_shutting_down = 1;
+
+                       list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages,
+                                                          linkage) {
+                               __LASSERT_TAGE_INVARIANT(tage);
+
+                               list_del(&tage->linkage);
+                               cfs_tage_free(tage);
+                       }
+
+                       tcd->tcd_cur_pages = 0;
+               }
+       }
+}
+
+static void cfs_trace_cleanup(void)
+{
+       struct page_collection pc;
+
+       INIT_LIST_HEAD(&pc.pc_pages);
+       spin_lock_init(&pc.pc_lock);
+
+       trace_cleanup_on_all_cpus();
+
+       cfs_tracefile_fini_arch();
+}
+
+void cfs_tracefile_exit(void)
+{
+       cfs_trace_stop_thread();
+       cfs_trace_cleanup();
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.h b/drivers/staging/lustre/lustre/libcfs/tracefile.h
new file mode 100644 (file)
index 0000000..7e8d17c
--- /dev/null
@@ -0,0 +1,340 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_TRACEFILE_H__
+#define __LIBCFS_TRACEFILE_H__
+
+#include <linux/libcfs/libcfs.h>
+
+#include "linux/linux-tracefile.h"
+
+/* trace file lock routines */
+
+#define TRACEFILE_NAME_SIZE 1024
+extern char      cfs_tracefile[TRACEFILE_NAME_SIZE];
+extern long long cfs_tracefile_size;
+
+extern void libcfs_run_debug_log_upcall(char *file);
+
+int  cfs_tracefile_init_arch(void);
+void cfs_tracefile_fini_arch(void);
+
+void cfs_tracefile_read_lock(void);
+void cfs_tracefile_read_unlock(void);
+void cfs_tracefile_write_lock(void);
+void cfs_tracefile_write_unlock(void);
+
+int cfs_tracefile_dump_all_pages(char *filename);
+void cfs_trace_debug_print(void);
+void cfs_trace_flush_pages(void);
+int cfs_trace_start_thread(void);
+void cfs_trace_stop_thread(void);
+int cfs_tracefile_init(int max_pages);
+void cfs_tracefile_exit(void);
+
+
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+                           const char *usr_buffer, int usr_buffer_nob);
+int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+                            const char *knl_str, char *append);
+int cfs_trace_allocate_string_buffer(char **str, int nob);
+void cfs_trace_free_string_buffer(char *str, int nob);
+int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_daemon_command(char *str);
+int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_set_debug_mb(int mb);
+int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_get_debug_mb(void);
+
+extern void libcfs_debug_dumplog_internal(void *arg);
+extern void libcfs_register_panic_notifier(void);
+extern void libcfs_unregister_panic_notifier(void);
+extern int  libcfs_panic_in_progress;
+extern int  cfs_trace_max_debug_mb(void);
+
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+#define CFS_TRACEFILE_SIZE (500 << 20)
+
+#ifdef LUSTRE_TRACEFILE_PRIVATE
+
+/*
+ * Private declare for tracefile
+ */
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+
+#define CFS_TRACEFILE_SIZE (500 << 20)
+
+/* Size of the buffer used to sprintf console messages when we cannot get a
+ * page from the system */
+#define CFS_TRACE_CONSOLE_BUFFER_SIZE   1024
+
+union cfs_trace_data_union {
+       struct cfs_trace_cpu_data {
+               /*
+                * Even though this structure is meant to be per-CPU, locking
+                * is needed because in some places the data may be accessed
+                * from other CPUs. This lock is directly used in trace_get_tcd
+                * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and
+                * tcd_for_each_type_lock
+                */
+               spinlock_t              tcd_lock;
+               unsigned long      tcd_lock_flags;
+
+               /*
+                * pages with trace records not yet processed by tracefiled.
+                */
+               struct list_head              tcd_pages;
+               /* number of pages on ->tcd_pages */
+               unsigned long      tcd_cur_pages;
+
+               /*
+                * pages with trace records already processed by
+                * tracefiled. These pages are kept in memory, so that some
+                * portion of log can be written in the event of LBUG. This
+                * list is maintained in LRU order.
+                *
+                * Pages are moved to ->tcd_daemon_pages by tracefiled()
+                * (put_pages_on_daemon_list()). LRU pages from this list are
+                * discarded when list grows too large.
+                */
+               struct list_head              tcd_daemon_pages;
+               /* number of pages on ->tcd_daemon_pages */
+               unsigned long      tcd_cur_daemon_pages;
+
+               /*
+                * Maximal number of pages allowed on ->tcd_pages and
+                * ->tcd_daemon_pages each.
+                * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current
+                * implementation.
+                */
+               unsigned long      tcd_max_pages;
+
+               /*
+                * preallocated pages to write trace records into. Pages from
+                * ->tcd_stock_pages are moved to ->tcd_pages by
+                * portals_debug_msg().
+                *
+                * This list is necessary, because on some platforms it's
+                * impossible to perform efficient atomic page allocation in a
+                * non-blockable context.
+                *
+                * Such platforms fill ->tcd_stock_pages "on occasion", when
+                * tracing code is entered in blockable context.
+                *
+                * trace_get_tage_try() tries to get a page from
+                * ->tcd_stock_pages first and resorts to atomic page
+                * allocation only if this queue is empty. ->tcd_stock_pages
+                * is replenished when tracing code is entered in blocking
+                * context (darwin-tracefile.c:trace_get_tcd()). We try to
+                * maintain TCD_STOCK_PAGES (40 by default) pages in this
+                * queue. Atomic allocation is only required if more than
+                * TCD_STOCK_PAGES pagesful are consumed by trace records all
+                * emitted in non-blocking contexts. Which is quite unlikely.
+                */
+               struct list_head              tcd_stock_pages;
+               /* number of pages on ->tcd_stock_pages */
+               unsigned long      tcd_cur_stock_pages;
+
+               unsigned short    tcd_shutting_down;
+               unsigned short    tcd_cpu;
+               unsigned short    tcd_type;
+               /* The factors to share debug memory. */
+               unsigned short    tcd_pages_factor;
+       } tcd;
+       char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))];
+};
+
+#define TCD_MAX_TYPES      8
+extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS];
+
+#define cfs_tcd_for_each(tcd, i, j)                                   \
+    for (i = 0; cfs_trace_data[i] != NULL; i++)                           \
+       for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd);            \
+            j < num_possible_cpus();                            \
+            j++, (tcd) = &(*cfs_trace_data[i])[j].tcd)
+
+#define cfs_tcd_for_each_type_lock(tcd, i, cpu)                           \
+    for (i = 0; cfs_trace_data[i] &&                                 \
+        (tcd = &(*cfs_trace_data[i])[cpu].tcd) &&                      \
+        cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++)
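The lock-taking iterator deserves a note: it acquires the tcd lock as part of the loop condition and releases it in the increment expression, so the loop body always runs with tcd_lock held. A minimal usage sketch (variables assumed declared by the caller), mirroring how tracefile.c walks all tcds:

/* Sketch: visit every tcd of every type on every possible CPU. */
cfs_for_each_possible_cpu(cpu) {
	cfs_tcd_for_each_type_lock(tcd, i, cpu) {
		/* tcd_lock is held here */
		total_pages += tcd->tcd_cur_pages;
	}
}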
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct page_collection {
+       struct list_head        pc_pages;
+       /*
+        * spin-lock protecting ->pc_pages. It is taken by smp_call_function()
+        * call-back functions. XXX nikita: Which is horrible: all processors
+        * receive NMI at the same time only to be serialized by this
+        * lock. Probably ->pc_pages should be replaced with an array of
+        * NR_CPUS elements accessed locklessly.
+        */
+       spinlock_t      pc_lock;
+       /*
+        * if this flag is set, collect_pages() will spill both
+        * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
+        * only ->tcd_pages are spilled.
+        */
+       int             pc_want_daemon_pages;
+};
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct tracefiled_ctl {
+       struct completion       tctl_start;
+       struct completion       tctl_stop;
+       wait_queue_head_t               tctl_waitq;
+       pid_t                   tctl_pid;
+       atomic_t                tctl_shutdown;
+};
+
+/*
+ * small data-structure for each page owned by tracefiled.
+ */
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct cfs_trace_page {
+       /*
+        * page itself
+        */
+       struct page       *page;
+       /*
+        * linkage into one of the lists in trace_data_union or
+        * page_collection
+        */
+       struct list_head           linkage;
+       /*
+        * number of bytes used within this page
+        */
+       unsigned int     used;
+       /*
+        * cpu that owns this page
+        */
+       unsigned short       cpu;
+       /*
+        * type(context) of this page
+        */
+       unsigned short       type;
+};
+
+extern void cfs_set_ptldebug_header(struct ptldebug_header *header,
+                                   struct libcfs_debug_msg_data *m,
+                                   unsigned long stack);
+extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+                                const char *buf, int len, const char *file,
+                                const char *fn);
+
+extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+
+/**
+ * trace_buf_type_t, trace_buf_idx_get() and trace_console_buffers[][]
+ * are not public libcfs API; they should be defined in
+ * platform-specific tracefile include files
+ * (see, for example, linux-tracefile.h).
+ */
+
+extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void);
+
+static inline char *
+cfs_trace_get_console_buffer(void)
+{
+       unsigned int i = get_cpu();
+       unsigned int j = cfs_trace_buf_idx_get();
+
+       return cfs_trace_console_buffers[i][j];
+}
+
+static inline void
+cfs_trace_put_console_buffer(char *buffer)
+{
+       put_cpu();
+}
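The getter calls get_cpu() and the putter calls put_cpu(), so preemption is disabled for the whole time a console buffer is held and the caller must not block in between. A hedged sketch of the expected pairing (fmt, args, header, mask, file and fn stand in for the caller's state), matching how libcfs_debug_vmsg2() uses these helpers:

/* Sketch only: nothing between get and put may sleep. */
char *buf = cfs_trace_get_console_buffer();
int len;

len = vsnprintf(buf, CFS_TRACE_CONSOLE_BUFFER_SIZE, fmt, args);
cfs_print_to_console(&header, mask, buf, len, file, fn);

cfs_trace_put_console_buffer(buf);	/* re-enables preemption via put_cpu() */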
+
+static inline struct cfs_trace_cpu_data *
+cfs_trace_get_tcd(void)
+{
+       struct cfs_trace_cpu_data *tcd =
+               &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd;
+
+       cfs_trace_lock_tcd(tcd, 0);
+
+       return tcd;
+}
+
+static inline void
+cfs_trace_put_tcd (struct cfs_trace_cpu_data *tcd)
+{
+       cfs_trace_unlock_tcd(tcd, 0);
+
+       put_cpu();
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp,
+                          struct list_head *stock);
+
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+                     struct cfs_trace_page *tage);
+
+extern void cfs_trace_assertion_failed(const char *str,
+                                      struct libcfs_debug_msg_data *m);
+
+/* ASSERTION that is safe to use within the debug system */
+#define __LASSERT(cond)                                                 \
+do {                                                               \
+       if (unlikely(!(cond))) {                                        \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL);     \
+               cfs_trace_assertion_failed("ASSERTION("#cond") failed", \
+                                          &msgdata);              \
+       }                                                              \
+} while (0)
+
+#define __LASSERT_TAGE_INVARIANT(tage)                           \
+do {                                                               \
+       __LASSERT(tage != NULL);                                        \
+       __LASSERT(tage->page != NULL);                            \
+       __LASSERT(tage->used <= PAGE_CACHE_SIZE);                        \
+       __LASSERT(page_count(tage->page) > 0);                \
+} while (0)
+
+#endif /* LUSTRE_TRACEFILE_PRIVATE */
+
+#endif /* __LIBCFS_TRACEFILE_H__ */
diff --git a/drivers/staging/lustre/lustre/libcfs/upcall_cache.c b/drivers/staging/lustre/lustre/libcfs/upcall_cache.c
new file mode 100644 (file)
index 0000000..18c68c3
--- /dev/null
@@ -0,0 +1,462 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/upcall_cache.c
+ *
+ * Supplementary groups cache.
+ */
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/lucache.h>
+
+static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache,
+                                             __u64 key, void *args)
+{
+       struct upcall_cache_entry *entry;
+
+       LIBCFS_ALLOC(entry, sizeof(*entry));
+       if (!entry)
+               return NULL;
+
+       UC_CACHE_SET_NEW(entry);
+       INIT_LIST_HEAD(&entry->ue_hash);
+       entry->ue_key = key;
+       atomic_set(&entry->ue_refcount, 0);
+       init_waitqueue_head(&entry->ue_waitq);
+       if (cache->uc_ops->init_entry)
+               cache->uc_ops->init_entry(entry, args);
+       return entry;
+}
+
+/* protected by cache lock */
+static void free_entry(struct upcall_cache *cache,
+                      struct upcall_cache_entry *entry)
+{
+       if (cache->uc_ops->free_entry)
+               cache->uc_ops->free_entry(cache, entry);
+
+       list_del(&entry->ue_hash);
+       CDEBUG(D_OTHER, "destroy cache entry %p for key "LPU64"\n",
+              entry, entry->ue_key);
+       LIBCFS_FREE(entry, sizeof(*entry));
+}
+
+static inline int upcall_compare(struct upcall_cache *cache,
+                                struct upcall_cache_entry *entry,
+                                __u64 key, void *args)
+{
+       if (entry->ue_key != key)
+               return -1;
+
+       if (cache->uc_ops->upcall_compare)
+               return cache->uc_ops->upcall_compare(cache, entry, key, args);
+
+       return 0;
+}
+
+static inline int downcall_compare(struct upcall_cache *cache,
+                                  struct upcall_cache_entry *entry,
+                                  __u64 key, void *args)
+{
+       if (entry->ue_key != key)
+               return -1;
+
+       if (cache->uc_ops->downcall_compare)
+               return cache->uc_ops->downcall_compare(cache, entry, key, args);
+
+       return 0;
+}
+
+static inline void get_entry(struct upcall_cache_entry *entry)
+{
+       atomic_inc(&entry->ue_refcount);
+}
+
+static inline void put_entry(struct upcall_cache *cache,
+                            struct upcall_cache_entry *entry)
+{
+       if (atomic_dec_and_test(&entry->ue_refcount) &&
+           (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) {
+               free_entry(cache, entry);
+       }
+}
+
+static int check_unlink_entry(struct upcall_cache *cache,
+                             struct upcall_cache_entry *entry)
+{
+       if (UC_CACHE_IS_VALID(entry) &&
+           cfs_time_before(cfs_time_current(), entry->ue_expire))
+               return 0;
+
+       if (UC_CACHE_IS_ACQUIRING(entry)) {
+               if (entry->ue_acquire_expire == 0 ||
+                   cfs_time_before(cfs_time_current(),
+                                   entry->ue_acquire_expire))
+                       return 0;
+
+               UC_CACHE_SET_EXPIRED(entry);
+               wake_up_all(&entry->ue_waitq);
+       } else if (!UC_CACHE_IS_INVALID(entry)) {
+               UC_CACHE_SET_EXPIRED(entry);
+       }
+
+       list_del_init(&entry->ue_hash);
+       if (!atomic_read(&entry->ue_refcount))
+               free_entry(cache, entry);
+       return 1;
+}
+
+static inline int refresh_entry(struct upcall_cache *cache,
+                        struct upcall_cache_entry *entry)
+{
+       LASSERT(cache->uc_ops->do_upcall);
+       return cache->uc_ops->do_upcall(cache, entry);
+}
+
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache,
+                                                 __u64 key, void *args)
+{
+       struct upcall_cache_entry *entry = NULL, *new = NULL, *next;
+       struct list_head *head;
+       wait_queue_t wait;
+       int rc, found;
+       ENTRY;
+
+       LASSERT(cache);
+
+       head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+find_again:
+       found = 0;
+       spin_lock(&cache->uc_lock);
+       list_for_each_entry_safe(entry, next, head, ue_hash) {
+               /* check invalid & expired items */
+               if (check_unlink_entry(cache, entry))
+                       continue;
+               if (upcall_compare(cache, entry, key, args) == 0) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (!found) {
+               if (!new) {
+                       spin_unlock(&cache->uc_lock);
+                       new = alloc_entry(cache, key, args);
+                       if (!new) {
+                               CERROR("fail to alloc entry\n");
+                               RETURN(ERR_PTR(-ENOMEM));
+                       }
+                       goto find_again;
+               } else {
+                       list_add(&new->ue_hash, head);
+                       entry = new;
+               }
+       } else {
+               if (new) {
+                       free_entry(cache, new);
+                       new = NULL;
+               }
+               list_move(&entry->ue_hash, head);
+       }
+       get_entry(entry);
+
+       /* acquire for new one */
+       if (UC_CACHE_IS_NEW(entry)) {
+               UC_CACHE_SET_ACQUIRING(entry);
+               UC_CACHE_CLEAR_NEW(entry);
+               spin_unlock(&cache->uc_lock);
+               rc = refresh_entry(cache, entry);
+               spin_lock(&cache->uc_lock);
+               entry->ue_acquire_expire =
+                       cfs_time_shift(cache->uc_acquire_expire);
+               if (rc < 0) {
+                       UC_CACHE_CLEAR_ACQUIRING(entry);
+                       UC_CACHE_SET_INVALID(entry);
+                       wake_up_all(&entry->ue_waitq);
+                       if (unlikely(rc == -EREMCHG)) {
+                               put_entry(cache, entry);
+                               GOTO(out, entry = ERR_PTR(rc));
+                       }
+               }
+       }
+       /* someone (and only one) is doing an upcall for this item;
+        * wait for it to complete */
+       if (UC_CACHE_IS_ACQUIRING(entry)) {
+               long expiry = (entry == new) ?
+                             cfs_time_seconds(cache->uc_acquire_expire) :
+                             MAX_SCHEDULE_TIMEOUT;
+               long left;
+
+               init_waitqueue_entry_current(&wait);
+               add_wait_queue(&entry->ue_waitq, &wait);
+               set_current_state(TASK_INTERRUPTIBLE);
+               spin_unlock(&cache->uc_lock);
+
+               left = waitq_timedwait(&wait, TASK_INTERRUPTIBLE,
+                                          expiry);
+
+               spin_lock(&cache->uc_lock);
+               remove_wait_queue(&entry->ue_waitq, &wait);
+               if (UC_CACHE_IS_ACQUIRING(entry)) {
+                       /* we're interrupted or upcall failed in the middle */
+                       rc = left > 0 ? -EINTR : -ETIMEDOUT;
+                       CERROR("acquire for key "LPU64": error %d\n",
+                              entry->ue_key, rc);
+                       put_entry(cache, entry);
+                       GOTO(out, entry = ERR_PTR(rc));
+               }
+       }
+
+       /* invalid means error, don't need to try again */
+       if (UC_CACHE_IS_INVALID(entry)) {
+               put_entry(cache, entry);
+               GOTO(out, entry = ERR_PTR(-EIDRM));
+       }
+
+       /* check expired
+        * We can't refresh the existing one because some
+        * memory might be shared by multiple processes.
+        */
+       if (check_unlink_entry(cache, entry)) {
+               /* If expired, try again.  But if this entry was
+                * created by us and expired too quickly without
+                * any error, give it at least one chance to be
+                * used before retrying.
+                */
+               if (entry != new) {
+                       put_entry(cache, entry);
+                       spin_unlock(&cache->uc_lock);
+                       new = NULL;
+                       goto find_again;
+               }
+       }
+
+       /* Now we know it's good */
+out:
+       spin_unlock(&cache->uc_lock);
+       RETURN(entry);
+}
+EXPORT_SYMBOL(upcall_cache_get_entry);
+
+void upcall_cache_put_entry(struct upcall_cache *cache,
+                           struct upcall_cache_entry *entry)
+{
+       ENTRY;
+
+       if (!entry) {
+               EXIT;
+               return;
+       }
+
+       LASSERT(atomic_read(&entry->ue_refcount) > 0);
+       spin_lock(&cache->uc_lock);
+       put_entry(cache, entry);
+       spin_unlock(&cache->uc_lock);
+       EXIT;
+}
+EXPORT_SYMBOL(upcall_cache_put_entry);
+
+int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key,
+                         void *args)
+{
+       struct upcall_cache_entry *entry = NULL;
+       struct list_head *head;
+       int found = 0, rc = 0;
+       ENTRY;
+
+       LASSERT(cache);
+
+       head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+
+       spin_lock(&cache->uc_lock);
+       list_for_each_entry(entry, head, ue_hash) {
+               if (downcall_compare(cache, entry, key, args) == 0) {
+                       found = 1;
+                       get_entry(entry);
+                       break;
+               }
+       }
+
+       if (!found) {
+               CDEBUG(D_OTHER, "%s: upcall for key "LPU64" not expected\n",
+                      cache->uc_name, key);
+               /* entry not found; this can legitimately happen */
+               spin_unlock(&cache->uc_lock);
+               RETURN(-EINVAL);
+       }
+
+       if (err) {
+               CDEBUG(D_OTHER, "%s: upcall for key "LPU64" returned %d\n",
+                      cache->uc_name, entry->ue_key, err);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (!UC_CACHE_IS_ACQUIRING(entry)) {
+               CDEBUG(D_RPCTRACE,"%s: found uptodate entry %p (key "LPU64")\n",
+                      cache->uc_name, entry, entry->ue_key);
+               GOTO(out, rc = 0);
+       }
+
+       if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) {
+               CERROR("%s: found a stale entry %p (key "LPU64") in ioctl\n",
+                      cache->uc_name, entry, entry->ue_key);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       spin_unlock(&cache->uc_lock);
+       if (cache->uc_ops->parse_downcall)
+               rc = cache->uc_ops->parse_downcall(cache, entry, args);
+       spin_lock(&cache->uc_lock);
+       if (rc)
+               GOTO(out, rc);
+
+       entry->ue_expire = cfs_time_shift(cache->uc_entry_expire);
+       UC_CACHE_SET_VALID(entry);
+       CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key "LPU64"\n",
+              cache->uc_name, entry, entry->ue_key);
+out:
+       if (rc) {
+               UC_CACHE_SET_INVALID(entry);
+               list_del_init(&entry->ue_hash);
+       }
+       UC_CACHE_CLEAR_ACQUIRING(entry);
+       spin_unlock(&cache->uc_lock);
+       wake_up_all(&entry->ue_waitq);
+       put_entry(cache, entry);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(upcall_cache_downcall);
+
+static void cache_flush(struct upcall_cache *cache, int force)
+{
+       struct upcall_cache_entry *entry, *next;
+       int i;
+       ENTRY;
+
+       spin_lock(&cache->uc_lock);
+       for (i = 0; i < UC_CACHE_HASH_SIZE; i++) {
+               list_for_each_entry_safe(entry, next,
+                                        &cache->uc_hashtable[i], ue_hash) {
+                       if (!force && atomic_read(&entry->ue_refcount)) {
+                               UC_CACHE_SET_EXPIRED(entry);
+                               continue;
+                       }
+                       LASSERT(!atomic_read(&entry->ue_refcount));
+                       free_entry(cache, entry);
+               }
+       }
+       spin_unlock(&cache->uc_lock);
+       EXIT;
+}
+
+void upcall_cache_flush_idle(struct upcall_cache *cache)
+{
+       cache_flush(cache, 0);
+}
+EXPORT_SYMBOL(upcall_cache_flush_idle);
+
+void upcall_cache_flush_all(struct upcall_cache *cache)
+{
+       cache_flush(cache, 1);
+}
+EXPORT_SYMBOL(upcall_cache_flush_all);
+
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args)
+{
+       struct list_head *head;
+       struct upcall_cache_entry *entry;
+       int found = 0;
+       ENTRY;
+
+       head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+
+       spin_lock(&cache->uc_lock);
+       list_for_each_entry(entry, head, ue_hash) {
+               if (upcall_compare(cache, entry, key, args) == 0) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (found) {
+               CWARN("%s: flush entry %p: key "LPU64", ref %d, fl %x, "
+                     "cur %lu, ex %ld/%ld\n",
+                     cache->uc_name, entry, entry->ue_key,
+                     atomic_read(&entry->ue_refcount), entry->ue_flags,
+                     cfs_time_current_sec(), entry->ue_acquire_expire,
+                     entry->ue_expire);
+               UC_CACHE_SET_EXPIRED(entry);
+               if (!atomic_read(&entry->ue_refcount))
+                       free_entry(cache, entry);
+       }
+       spin_unlock(&cache->uc_lock);
+}
+EXPORT_SYMBOL(upcall_cache_flush_one);
+
+struct upcall_cache *upcall_cache_init(const char *name, const char *upcall,
+                                      struct upcall_cache_ops *ops)
+{
+       struct upcall_cache *cache;
+       int i;
+       ENTRY;
+
+       LIBCFS_ALLOC(cache, sizeof(*cache));
+       if (!cache)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       spin_lock_init(&cache->uc_lock);
+       rwlock_init(&cache->uc_upcall_rwlock);
+       for (i = 0; i < UC_CACHE_HASH_SIZE; i++)
+               INIT_LIST_HEAD(&cache->uc_hashtable[i]);
+       strncpy(cache->uc_name, name, sizeof(cache->uc_name) - 1);
+       /* upcall pathname proc tunable */
+       strncpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall) - 1);
+       cache->uc_entry_expire = 20 * 60;
+       cache->uc_acquire_expire = 30;
+       cache->uc_ops = ops;
+
+       RETURN(cache);
+}
+EXPORT_SYMBOL(upcall_cache_init);
+
+void upcall_cache_cleanup(struct upcall_cache *cache)
+{
+       if (!cache)
+               return;
+       upcall_cache_flush_all(cache);
+       LIBCFS_FREE(cache, sizeof(*cache));
+}
+EXPORT_SYMBOL(upcall_cache_cleanup);
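To tie upcall_cache.c together, here is a hedged outline of the intended life cycle. The callback names are taken from the uc_ops-> calls above; the ops structure itself is defined in lucache.h (not part of this hunk), and the identity-upcall example is only illustrative.

/*
 * Illustrative flow (a typical Lustre user of this cache is the identity upcall):
 *
 * 1. cache = upcall_cache_init("identity", "/usr/sbin/l_getidentity", &my_ops);
 * 2. entry = upcall_cache_get_entry(cache, key, args);
 *      - on a miss the entry is marked ACQUIRING, my_ops.do_upcall() launches
 *        the user-space helper, and the caller sleeps on entry->ue_waitq
 *        (bounded by uc_acquire_expire, 30 seconds by default).
 * 3. the helper reports its result back, which ends up in
 *    upcall_cache_downcall(cache, err, key, args);
 *      - my_ops.parse_downcall() fills in the entry, it is marked VALID for
 *        uc_entry_expire (20 minutes by default), and all waiters are woken.
 * 4. upcall_cache_put_entry(cache, entry) drops the reference;
 *    upcall_cache_cleanup(cache) flushes every entry and frees the cache.
 */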
diff --git a/drivers/staging/lustre/lustre/libcfs/watchdog.c b/drivers/staging/lustre/lustre/libcfs/watchdog.c
new file mode 100644 (file)
index 0000000..7c385ad
--- /dev/null
@@ -0,0 +1,516 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/watchdog.c
+ *
+ * Author: Jacob Berkman <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+struct lc_watchdog {
+       spinlock_t  lcw_lock;     /* check or change lcw_list */
+       int          lcw_refcount; /* must hold lcw_pending_timers_lock */
+       timer_list_t     lcw_timer;    /* kernel timer */
+       struct list_head      lcw_list;     /* chain on pending list */
+       cfs_time_t      lcw_last_touched; /* last touched stamp */
+       task_t     *lcw_task;     /* owner task */
+       void      (*lcw_callback)(pid_t, void *);
+       void       *lcw_data;
+
+       pid_t      lcw_pid;
+
+       enum {
+               LC_WATCHDOG_DISABLED,
+               LC_WATCHDOG_ENABLED,
+               LC_WATCHDOG_EXPIRED
+       } lcw_state;
+};
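+
+/*
+ * State transitions, as implemented below: lc_watchdog_add() creates the
+ * watchdog DISABLED; lc_watchdog_touch() (re)arms the timer and sets
+ * ENABLED; if the timer fires first, lcw_cb() marks it EXPIRED and queues
+ * it for the dispatcher; lc_watchdog_disable() puts it back to DISABLED
+ * without freeing it, and lc_watchdog_delete() drops the owner reference
+ * (stopping the dispatcher when the last watchdog goes away).
+ */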
+
+#ifdef WITH_WATCHDOG
+/*
+ * The dispatcher will complete lcw_start_completion when it starts,
+ * and lcw_stop_completion when it exits.
+ * Wake lcw_event_waitq to signal timer callback dispatches.
+ */
+static struct completion lcw_start_completion;
+static struct completion  lcw_stop_completion;
+static wait_queue_head_t lcw_event_waitq;
+
+/*
+ * Set this and wake lcw_event_waitq to stop the dispatcher.
+ */
+enum {
+       LCW_FLAG_STOP = 0
+};
+static unsigned long lcw_flags = 0;
+
+/*
+ * Number of outstanding watchdogs.
+ * When it hits 1, we start the dispatcher.
+ * When it hits 0, we stop the dispatcher.
+ */
+static __u32    lcw_refcount = 0;
+static DEFINE_MUTEX(lcw_refcount_mutex);
+
+/*
+ * List of timers that have fired that need their callbacks run by the
+ * dispatcher.
+ */
+/* BH lock! */
+static DEFINE_SPINLOCK(lcw_pending_timers_lock);
+static struct list_head lcw_pending_timers = LIST_HEAD_INIT(lcw_pending_timers);
+
+/* Last time a watchdog expired */
+static cfs_time_t lcw_last_watchdog_time;
+static int lcw_recent_watchdog_count;
+
+static void
+lcw_dump(struct lc_watchdog *lcw)
+{
+       ENTRY;
+       rcu_read_lock();
+       if (lcw->lcw_task == NULL) {
+               LCONSOLE_WARN("Process " LPPID " was not found in the task "
+                             "list; watchdog callback may be incomplete\n",
+                             (int)lcw->lcw_pid);
+       } else {
+               libcfs_debug_dumpstack(lcw->lcw_task);
+       }
+
+       rcu_read_unlock();
+       EXIT;
+}
+
+static void lcw_cb(ulong_ptr_t data)
+{
+       struct lc_watchdog *lcw = (struct lc_watchdog *)data;
+       ENTRY;
+
+       if (lcw->lcw_state != LC_WATCHDOG_ENABLED) {
+               EXIT;
+               return;
+       }
+
+       lcw->lcw_state = LC_WATCHDOG_EXPIRED;
+
+       spin_lock_bh(&lcw->lcw_lock);
+       LASSERT(list_empty(&lcw->lcw_list));
+
+       spin_lock_bh(&lcw_pending_timers_lock);
+       lcw->lcw_refcount++; /* +1 for pending list */
+       list_add(&lcw->lcw_list, &lcw_pending_timers);
+       wake_up(&lcw_event_waitq);
+
+       spin_unlock_bh(&lcw_pending_timers_lock);
+       spin_unlock_bh(&lcw->lcw_lock);
+       EXIT;
+}
+
+static int is_watchdog_fired(void)
+{
+       int rc;
+
+       if (test_bit(LCW_FLAG_STOP, &lcw_flags))
+               return 1;
+
+       spin_lock_bh(&lcw_pending_timers_lock);
+       rc = !list_empty(&lcw_pending_timers);
+       spin_unlock_bh(&lcw_pending_timers_lock);
+       return rc;
+}
+
+static void lcw_dump_stack(struct lc_watchdog *lcw)
+{
+       cfs_time_t      current_time;
+       cfs_duration_t  delta_time;
+       struct timeval  timediff;
+
+       current_time = cfs_time_current();
+       delta_time = cfs_time_sub(current_time, lcw->lcw_last_touched);
+       cfs_duration_usec(delta_time, &timediff);
+
+       /*
+        * Check to see if we should throttle the watchdog timer to avoid
+        * too many dumps going to the console thus triggering an NMI.
+        */
+       delta_time = cfs_duration_sec(cfs_time_sub(current_time,
+                                                  lcw_last_watchdog_time));
+
+       if (delta_time < libcfs_watchdog_ratelimit &&
+           lcw_recent_watchdog_count > 3) {
+               LCONSOLE_WARN("Service thread pid %u was inactive for "
+                             "%lu.%.02lus. Watchdog stack traces are limited "
+                             "to 3 per %d seconds, skipping this one.\n",
+                             (int)lcw->lcw_pid,
+                             timediff.tv_sec,
+                             timediff.tv_usec / 10000,
+                             libcfs_watchdog_ratelimit);
+       } else {
+               if (delta_time < libcfs_watchdog_ratelimit) {
+                       lcw_recent_watchdog_count++;
+               } else {
+                       memcpy(&lcw_last_watchdog_time, &current_time,
+                              sizeof(current_time));
+                       lcw_recent_watchdog_count = 0;
+               }
+
+               LCONSOLE_WARN("Service thread pid %u was inactive for "
+                             "%lu.%.02lus. The thread might be hung, or it "
+                             "might only be slow and will resume later. "
+                             "Dumping the stack trace for debugging purposes:"
+                             "\n",
+                             (int)lcw->lcw_pid,
+                             timediff.tv_sec,
+                             timediff.tv_usec / 10000);
+               lcw_dump(lcw);
+       }
+}
+
+static int lcw_dispatch_main(void *data)
+{
+       int              rc = 0;
+       struct lc_watchdog *lcw;
+       LIST_HEAD      (zombies);
+
+       ENTRY;
+
+       complete(&lcw_start_completion);
+
+       while (1) {
+               int dumplog = 1;
+
+               cfs_wait_event_interruptible(lcw_event_waitq,
+                                            is_watchdog_fired(), rc);
+               CDEBUG(D_INFO, "Watchdog got woken up...\n");
+               if (test_bit(LCW_FLAG_STOP, &lcw_flags)) {
+                       CDEBUG(D_INFO, "LCW_FLAG_STOP set, shutting down...\n");
+
+                       spin_lock_bh(&lcw_pending_timers_lock);
+                       rc = !list_empty(&lcw_pending_timers);
+                       spin_unlock_bh(&lcw_pending_timers_lock);
+                       if (rc) {
+                               CERROR("pending timers list was not empty at "
+                                      "time of watchdog dispatch shutdown\n");
+                       }
+                       break;
+               }
+
+               spin_lock_bh(&lcw_pending_timers_lock);
+               while (!list_empty(&lcw_pending_timers)) {
+                       int is_dumplog;
+
+                       lcw = list_entry(lcw_pending_timers.next,
+                                            struct lc_watchdog, lcw_list);
+                       /* +1 ref for the callback to make sure lcw isn't
+                        * deleted after releasing lcw_pending_timers_lock */
+                       lcw->lcw_refcount++;
+                       spin_unlock_bh(&lcw_pending_timers_lock);
+
+                       /* lock ordering: lcw_lock before
+                        * lcw_pending_timers_lock */
+                       spin_lock_bh(&lcw->lcw_lock);
+                       spin_lock_bh(&lcw_pending_timers_lock);
+
+                       if (list_empty(&lcw->lcw_list)) {
+                               /* already removed from pending list */
+                               lcw->lcw_refcount--; /* -1 ref for callback */
+                               if (lcw->lcw_refcount == 0)
+                                       list_add(&lcw->lcw_list, &zombies);
+                               spin_unlock_bh(&lcw->lcw_lock);
+                               /* still hold lcw_pending_timers_lock */
+                               continue;
+                       }
+
+                       list_del_init(&lcw->lcw_list);
+                       lcw->lcw_refcount--; /* -1 ref for pending list */
+
+                       spin_unlock_bh(&lcw_pending_timers_lock);
+                       spin_unlock_bh(&lcw->lcw_lock);
+
+                       CDEBUG(D_INFO, "found lcw for pid " LPPID "\n",
+                              lcw->lcw_pid);
+                       lcw_dump_stack(lcw);
+
+                       is_dumplog = lcw->lcw_callback == lc_watchdog_dumplog;
+                       if (lcw->lcw_state != LC_WATCHDOG_DISABLED &&
+                           (dumplog || !is_dumplog)) {
+                               lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data);
+                               if (dumplog && is_dumplog)
+                                       dumplog = 0;
+                       }
+
+                       spin_lock_bh(&lcw_pending_timers_lock);
+                       lcw->lcw_refcount--; /* -1 ref for callback */
+                       if (lcw->lcw_refcount == 0)
+                               list_add(&lcw->lcw_list, &zombies);
+               }
+               spin_unlock_bh(&lcw_pending_timers_lock);
+
+               while (!list_empty(&zombies)) {
+                       lcw = list_entry(zombies.next,
+                                        struct lc_watchdog, lcw_list);
+                       list_del(&lcw->lcw_list);
+                       LIBCFS_FREE(lcw, sizeof(*lcw));
+               }
+       }
+
+       complete(&lcw_stop_completion);
+
+       RETURN(rc);
+}
+
+static void lcw_dispatch_start(void)
+{
+       task_t *task;
+
+       ENTRY;
+       LASSERT(lcw_refcount == 1);
+
+       init_completion(&lcw_stop_completion);
+       init_completion(&lcw_start_completion);
+       init_waitqueue_head(&lcw_event_waitq);
+
+       CDEBUG(D_INFO, "starting dispatch thread\n");
+       task = kthread_run(lcw_dispatch_main, NULL, "lc_watchdogd");
+       if (IS_ERR(task)) {
+               CERROR("error spawning watchdog dispatch thread: %ld\n",
+                       PTR_ERR(task));
+               EXIT;
+               return;
+       }
+       wait_for_completion(&lcw_start_completion);
+       CDEBUG(D_INFO, "watchdog dispatcher initialization complete.\n");
+
+       EXIT;
+}
+
+static void lcw_dispatch_stop(void)
+{
+       ENTRY;
+       LASSERT(lcw_refcount == 0);
+
+       CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n");
+
+       set_bit(LCW_FLAG_STOP, &lcw_flags);
+       wake_up(&lcw_event_waitq);
+
+       wait_for_completion(&lcw_stop_completion);
+
+       CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n");
+
+       EXIT;
+}
+
+struct lc_watchdog *lc_watchdog_add(int timeout,
+                                   void (*callback)(pid_t, void *),
+                                   void *data)
+{
+       struct lc_watchdog *lcw = NULL;
+       ENTRY;
+
+       LIBCFS_ALLOC(lcw, sizeof(*lcw));
+       if (lcw == NULL) {
+               CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n");
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       spin_lock_init(&lcw->lcw_lock);
+       lcw->lcw_refcount = 1; /* refcount for owner */
+       lcw->lcw_task     = current;
+       lcw->lcw_pid      = current_pid();
+       lcw->lcw_callback = (callback != NULL) ? callback : lc_watchdog_dumplog;
+       lcw->lcw_data     = data;
+       lcw->lcw_state    = LC_WATCHDOG_DISABLED;
+
+       INIT_LIST_HEAD(&lcw->lcw_list);
+       cfs_timer_init(&lcw->lcw_timer, lcw_cb, lcw);
+
+       mutex_lock(&lcw_refcount_mutex);
+       if (++lcw_refcount == 1)
+               lcw_dispatch_start();
+       mutex_unlock(&lcw_refcount_mutex);
+
+       /* Keep this working in case we enable them by default */
+       if (lcw->lcw_state == LC_WATCHDOG_ENABLED) {
+               lcw->lcw_last_touched = cfs_time_current();
+               cfs_timer_arm(&lcw->lcw_timer, cfs_time_seconds(timeout) +
+                             cfs_time_current());
+       }
+
+       RETURN(lcw);
+}
+EXPORT_SYMBOL(lc_watchdog_add);
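+
+/*
+ * Typical lifecycle of a watchdog in a service thread, as a rough sketch
+ * (handle_one_request() and the "stopping" flag are stand-ins for real
+ * work, the 30-second timeout is arbitrary, and error checking is
+ * omitted):
+ *
+ *     struct lc_watchdog *lcw;
+ *
+ *     lcw = lc_watchdog_add(30, NULL, NULL);  -- NULL callback dumps the log
+ *     while (!stopping) {
+ *             lc_watchdog_touch(lcw, 30);     -- re-arm before each request
+ *             handle_one_request();
+ *             lc_watchdog_disable(lcw);       -- stay quiet while idle
+ *     }
+ *     lc_watchdog_delete(lcw);                -- drop the owner reference
+ */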
+
+static void lcw_update_time(struct lc_watchdog *lcw, const char *message)
+{
+       cfs_time_t newtime = cfs_time_current();
+
+       if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) {
+               struct timeval timediff;
+               cfs_time_t delta_time = cfs_time_sub(newtime,
+                                                    lcw->lcw_last_touched);
+               cfs_duration_usec(delta_time, &timediff);
+
+               LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. "
+                             "This indicates the system was overloaded (too "
+                             "many service threads, or there were not enough "
+                             "hardware resources).\n",
+                             lcw->lcw_pid,
+                             message,
+                             timediff.tv_sec,
+                             timediff.tv_usec / 10000);
+       }
+       lcw->lcw_last_touched = newtime;
+}
+
+static void lc_watchdog_del_pending(struct lc_watchdog *lcw)
+{
+       spin_lock_bh(&lcw->lcw_lock);
+       if (unlikely(!list_empty(&lcw->lcw_list))) {
+               spin_lock_bh(&lcw_pending_timers_lock);
+               list_del_init(&lcw->lcw_list);
+               lcw->lcw_refcount--; /* -1 ref for pending list */
+               spin_unlock_bh(&lcw_pending_timers_lock);
+       }
+
+       spin_unlock_bh(&lcw->lcw_lock);
+}
+
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
+{
+       ENTRY;
+       LASSERT(lcw != NULL);
+
+       lc_watchdog_del_pending(lcw);
+
+       lcw_update_time(lcw, "resumed");
+       lcw->lcw_state = LC_WATCHDOG_ENABLED;
+
+       cfs_timer_arm(&lcw->lcw_timer, cfs_time_current() +
+                     cfs_time_seconds(timeout));
+
+       EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+       ENTRY;
+       LASSERT(lcw != NULL);
+
+       lc_watchdog_del_pending(lcw);
+
+       lcw_update_time(lcw, "completed");
+       lcw->lcw_state = LC_WATCHDOG_DISABLED;
+
+       EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+       int dead;
+
+       ENTRY;
+       LASSERT(lcw != NULL);
+
+       cfs_timer_disarm(&lcw->lcw_timer);
+
+       lcw_update_time(lcw, "stopped");
+
+       spin_lock_bh(&lcw->lcw_lock);
+       spin_lock_bh(&lcw_pending_timers_lock);
+       if (unlikely(!list_empty(&lcw->lcw_list))) {
+               list_del_init(&lcw->lcw_list);
+               lcw->lcw_refcount--; /* -1 ref for pending list */
+       }
+
+       lcw->lcw_refcount--; /* -1 ref for owner */
+       dead = lcw->lcw_refcount == 0;
+       spin_unlock_bh(&lcw_pending_timers_lock);
+       spin_unlock_bh(&lcw->lcw_lock);
+
+       if (dead)
+               LIBCFS_FREE(lcw, sizeof(*lcw));
+
+       mutex_lock(&lcw_refcount_mutex);
+       if (--lcw_refcount == 0)
+               lcw_dispatch_stop();
+       mutex_unlock(&lcw_refcount_mutex);
+
+       EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+/*
+ * Provided watchdog handlers
+ */
+
+void lc_watchdog_dumplog(pid_t pid, void *data)
+{
+       libcfs_debug_dumplog_internal((void *)((long_ptr_t)pid));
+}
+EXPORT_SYMBOL(lc_watchdog_dumplog);
+
+#else   /* !defined(WITH_WATCHDOG) */
+
+struct lc_watchdog *lc_watchdog_add(int timeout,
+                                   void (*callback)(pid_t pid, void *),
+                                   void *data)
+{
+       static struct lc_watchdog      watchdog;
+       return &watchdog;
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/workitem.c b/drivers/staging/lustre/lustre/libcfs/workitem.c
new file mode 100644 (file)
index 0000000..b533666
--- /dev/null
@@ -0,0 +1,475 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/workitem.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *      Liang Zhen  <zhen.liang@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define CFS_WS_NAME_LEN         16
+
+typedef struct cfs_wi_sched {
+       struct list_head                ws_list;        /* chain on global list */
+       /** serialised workitems */
+       spinlock_t              ws_lock;
+       /** where schedulers sleep */
+       wait_queue_head_t               ws_waitq;
+       /** concurrent workitems */
+       struct list_head                ws_runq;
+       /** rescheduled running workitems: a workitem can be rescheduled
+        * while running in wi_action(), but we don't want to execute it
+        * again until it returns from wi_action(), so we put it on
+        * ws_rerunq while rescheduling, and move it to ws_runq after it
+        * returns from wi_action() */
+       struct list_head                ws_rerunq;
+       /** CPT-table for this scheduler */
+       struct cfs_cpt_table    *ws_cptab;
+       /** CPT id for affinity */
+       int                     ws_cpt;
+       /** number of scheduled workitems */
+       int                     ws_nscheduled;
+       /** started scheduler thread, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_nthreads:30;
+       /** shutting down, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_stopping:1;
+       /** serialize starting thread, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_starting:1;
+       /** scheduler name */
+       char                    ws_name[CFS_WS_NAME_LEN];
+} cfs_wi_sched_t;
+
+struct cfs_workitem_data {
+       /** serialize */
+       spinlock_t              wi_glock;
+       /** list of all schedulers */
+       struct list_head                wi_scheds;
+       /** WI module is initialized */
+       int                     wi_init;
+       /** shutting down the whole WI module */
+       int                     wi_stopping;
+} cfs_wi_data;
+
+static inline void
+cfs_wi_sched_lock(cfs_wi_sched_t *sched)
+{
+       spin_lock(&sched->ws_lock);
+}
+
+static inline void
+cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
+{
+       spin_unlock(&sched->ws_lock);
+}
+
+static inline int
+cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
+{
+       cfs_wi_sched_lock(sched);
+       if (sched->ws_stopping) {
+               cfs_wi_sched_unlock(sched);
+               return 0;
+       }
+
+       if (!list_empty(&sched->ws_runq)) {
+               cfs_wi_sched_unlock(sched);
+               return 0;
+       }
+       cfs_wi_sched_unlock(sched);
+       return 1;
+}
+
+
+/* XXX:
+ * 0. it only works when called from wi->wi_action.
+ * 1. when it returns no one shall try to schedule the workitem.
+ */
+void
+cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
+
+       cfs_wi_sched_lock(sched);
+
+       LASSERT(wi->wi_running);
+       if (wi->wi_scheduled) { /* cancel pending schedules */
+               LASSERT(!list_empty(&wi->wi_list));
+               list_del_init(&wi->wi_list);
+
+               LASSERT(sched->ws_nscheduled > 0);
+               sched->ws_nscheduled--;
+       }
+
+       LASSERT(list_empty(&wi->wi_list));
+
+       wi->wi_scheduled = 1; /* so any future schedule attempt will LBUG */
+       cfs_wi_sched_unlock(sched);
+
+       return;
+}
+EXPORT_SYMBOL(cfs_wi_exit);
+
+/**
+ * cancel schedule request of workitem \a wi
+ */
+int
+cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+       int     rc;
+
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
+
+       /*
+        * return 0 if it's running already, otherwise return 1, which
+        * means the workitem will not be scheduled and will not have
+        * any race with wi_action.
+        */
+       cfs_wi_sched_lock(sched);
+
+       rc = !(wi->wi_running);
+
+       if (wi->wi_scheduled) { /* cancel pending schedules */
+               LASSERT(!list_empty(&wi->wi_list));
+               list_del_init(&wi->wi_list);
+
+               LASSERT(sched->ws_nscheduled > 0);
+               sched->ws_nscheduled--;
+
+               wi->wi_scheduled = 0;
+       }
+
+       LASSERT (list_empty(&wi->wi_list));
+
+       cfs_wi_sched_unlock(sched);
+       return rc;
+}
+EXPORT_SYMBOL(cfs_wi_deschedule);
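+
+/*
+ * A teardown sketch based on the return value above: an owner that is not
+ * inside wi_action() calls cfs_wi_deschedule() and, only if it returns 1,
+ * may free the workitem immediately; if it returns 0 the action is still
+ * running and the owner must wait for it to finish (how to wait is left
+ * to the caller).
+ */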
+
+/*
+ * Workitem scheduled with (serial == 1) is strictly serialised not only with
+ * itself, but also with others scheduled this way.
+ *
+ * Now there's only one static serialised queue, but in the future more might
+ * be added, and even dynamic creation of serialised queues might be supported.
+ */
+void
+cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
+
+       cfs_wi_sched_lock(sched);
+
+       if (!wi->wi_scheduled) {
+               LASSERT (list_empty(&wi->wi_list));
+
+               wi->wi_scheduled = 1;
+               sched->ws_nscheduled++;
+               if (!wi->wi_running) {
+                       list_add_tail(&wi->wi_list, &sched->ws_runq);
+                       wake_up(&sched->ws_waitq);
+               } else {
+                       list_add(&wi->wi_list, &sched->ws_rerunq);
+               }
+       }
+
+       LASSERT (!list_empty(&wi->wi_list));
+       cfs_wi_sched_unlock(sched);
+       return;
+}
+EXPORT_SYMBOL(cfs_wi_schedule);
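+
+/*
+ * A minimal workitem sketch (my_action and do_some_work() are made-up
+ * names, "sched" is assumed to come from cfs_wi_sched_create(), and
+ * cfs_wi_init() is the initializer from libcfs_workitem.h):
+ *
+ *     static int my_action(cfs_workitem_t *wi)
+ *     {
+ *             do_some_work(wi->wi_data);
+ *             return 0;       -- 0: wi stays valid and may be rescheduled
+ *     }
+ *
+ *     cfs_workitem_t my_item;
+ *
+ *     cfs_wi_init(&my_item, NULL, my_action);
+ *     cfs_wi_schedule(sched, &my_item);
+ */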
+
+
+static int
+cfs_wi_scheduler (void *arg)
+{
+       struct cfs_wi_sched     *sched = (cfs_wi_sched_t *)arg;
+
+       cfs_block_allsigs();
+
+       /* CPT affinity scheduler? */
+       if (sched->ws_cptab != NULL)
+               cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);
+
+       spin_lock(&cfs_wi_data.wi_glock);
+
+       LASSERT(sched->ws_starting == 1);
+       sched->ws_starting--;
+       sched->ws_nthreads++;
+
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       cfs_wi_sched_lock(sched);
+
+       while (!sched->ws_stopping) {
+               int          nloops = 0;
+               int          rc;
+               cfs_workitem_t *wi;
+
+               while (!list_empty(&sched->ws_runq) &&
+                      nloops < CFS_WI_RESCHED) {
+                       wi = list_entry(sched->ws_runq.next,
+                                           cfs_workitem_t, wi_list);
+                       LASSERT(wi->wi_scheduled && !wi->wi_running);
+
+                       list_del_init(&wi->wi_list);
+
+                       LASSERT(sched->ws_nscheduled > 0);
+                       sched->ws_nscheduled--;
+
+                       wi->wi_running   = 1;
+                       wi->wi_scheduled = 0;
+
+
+                       cfs_wi_sched_unlock(sched);
+                       nloops++;
+
+                       rc = (*wi->wi_action) (wi);
+
+                       cfs_wi_sched_lock(sched);
+                       if (rc != 0) /* WI should be dead, even be freed! */
+                               continue;
+
+                       wi->wi_running = 0;
+                       if (list_empty(&wi->wi_list))
+                               continue;
+
+                       LASSERT(wi->wi_scheduled);
+                       /* wi was rescheduled and should be on the rerunq
+                        * now; move it to the runq so its action can run
+                        * again */
+                       list_move_tail(&wi->wi_list, &sched->ws_runq);
+               }
+
+               if (!list_empty(&sched->ws_runq)) {
+                       cfs_wi_sched_unlock(sched);
+                       /* don't sleep because some workitems still
+                        * expect me to come back soon */
+                       cond_resched();
+                       cfs_wi_sched_lock(sched);
+                       continue;
+               }
+
+               cfs_wi_sched_unlock(sched);
+               cfs_wait_event_interruptible_exclusive(sched->ws_waitq,
+                               !cfs_wi_sched_cansleep(sched), rc);
+               cfs_wi_sched_lock(sched);
+       }
+
+       cfs_wi_sched_unlock(sched);
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       sched->ws_nthreads--;
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       return 0;
+}
+
+
+void
+cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
+{
+       int     i;
+
+       LASSERT(cfs_wi_data.wi_init);
+       LASSERT(!cfs_wi_data.wi_stopping);
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       if (sched->ws_stopping) {
+               CDEBUG(D_INFO, "%s is in the process of stopping\n",
+                      sched->ws_name);
+               spin_unlock(&cfs_wi_data.wi_glock);
+               return;
+       }
+
+       LASSERT(!list_empty(&sched->ws_list));
+       sched->ws_stopping = 1;
+
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       i = 2;
+       wake_up_all(&sched->ws_waitq);
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       while (sched->ws_nthreads > 0) {
+               CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
+                      "waiting for %d threads of WI sched[%s] to terminate\n",
+                      sched->ws_nthreads, sched->ws_name);
+
+               spin_unlock(&cfs_wi_data.wi_glock);
+               cfs_pause(cfs_time_seconds(1) / 20);
+               spin_lock(&cfs_wi_data.wi_glock);
+       }
+
+       list_del(&sched->ws_list);
+
+       spin_unlock(&cfs_wi_data.wi_glock);
+       LASSERT(sched->ws_nscheduled == 0);
+
+       LIBCFS_FREE(sched, sizeof(*sched));
+}
+EXPORT_SYMBOL(cfs_wi_sched_destroy);
+
+int
+cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
+                   int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
+{
+       struct cfs_wi_sched     *sched;
+       int                     rc;
+
+       LASSERT(cfs_wi_data.wi_init);
+       LASSERT(!cfs_wi_data.wi_stopping);
+       LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
+               (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
+
+       LIBCFS_ALLOC(sched, sizeof(*sched));
+       if (sched == NULL)
+               return -ENOMEM;
+
+       strncpy(sched->ws_name, name, CFS_WS_NAME_LEN - 1);
+       sched->ws_cptab = cptab;
+       sched->ws_cpt = cpt;
+
+       spin_lock_init(&sched->ws_lock);
+       init_waitqueue_head(&sched->ws_waitq);
+       INIT_LIST_HEAD(&sched->ws_runq);
+       INIT_LIST_HEAD(&sched->ws_rerunq);
+       INIT_LIST_HEAD(&sched->ws_list);
+
+       rc = 0;
+       while (nthrs > 0)  {
+               char    name[16];
+               task_t  *task;
+               spin_lock(&cfs_wi_data.wi_glock);
+               while (sched->ws_starting > 0) {
+                       spin_unlock(&cfs_wi_data.wi_glock);
+                       schedule();
+                       spin_lock(&cfs_wi_data.wi_glock);
+               }
+
+               sched->ws_starting++;
+               spin_unlock(&cfs_wi_data.wi_glock);
+
+               if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
+                       snprintf(name, sizeof(name), "%s_%02d_%02d",
+                                sched->ws_name, sched->ws_cpt,
+                                sched->ws_nthreads);
+               } else {
+                       snprintf(name, sizeof(name), "%s_%02d",
+                                sched->ws_name, sched->ws_nthreads);
+               }
+
+               task = kthread_run(cfs_wi_scheduler, sched, name);
+               if (!IS_ERR(task)) {
+                       nthrs--;
+                       continue;
+               }
+               rc = PTR_ERR(task);
+
+               CERROR("Failed to create thread for WI scheduler %s: %d\n",
+                      name, rc);
+
+               spin_lock(&cfs_wi_data.wi_glock);
+
+               /* put it on the global list so cfs_wi_sched_destroy()
+                * can find it and free it */
+               list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+               sched->ws_starting--;
+
+               spin_unlock(&cfs_wi_data.wi_glock);
+
+               cfs_wi_sched_destroy(sched);
+               return rc;
+       }
+       spin_lock(&cfs_wi_data.wi_glock);
+       list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       *sched_pp = sched;
+       return 0;
+}
+EXPORT_SYMBOL(cfs_wi_sched_create);
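+
+/*
+ * Creation sketch ("sample" is a made-up scheduler name, cfs_cpt_table is
+ * the global libcfs CPT table, and error handling is trimmed):
+ *
+ *     struct cfs_wi_sched *sched;
+ *     int rc;
+ *
+ *     rc = cfs_wi_sched_create("sample", cfs_cpt_table, CFS_CPT_ANY,
+ *                              1, &sched);
+ *     if (rc != 0)
+ *             return rc;
+ *     ...
+ *     cfs_wi_sched_destroy(sched);
+ */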
+
+int
+cfs_wi_startup(void)
+{
+       memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
+
+       spin_lock_init(&cfs_wi_data.wi_glock);
+       INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
+       cfs_wi_data.wi_init = 1;
+
+       return 0;
+}
+
+void
+cfs_wi_shutdown (void)
+{
+       struct cfs_wi_sched     *sched;
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       cfs_wi_data.wi_stopping = 1;
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       /* nobody should contend on this list */
+       list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+               sched->ws_stopping = 1;
+               wake_up_all(&sched->ws_waitq);
+       }
+
+       list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+               spin_lock(&cfs_wi_data.wi_glock);
+
+               while (sched->ws_nthreads != 0) {
+                       spin_unlock(&cfs_wi_data.wi_glock);
+                       cfs_pause(cfs_time_seconds(1) / 20);
+                       spin_lock(&cfs_wi_data.wi_glock);
+               }
+               spin_unlock(&cfs_wi_data.wi_glock);
+       }
+       while (!list_empty(&cfs_wi_data.wi_scheds)) {
+               sched = list_entry(cfs_wi_data.wi_scheds.next,
+                                      struct cfs_wi_sched, ws_list);
+               list_del(&sched->ws_list);
+               LIBCFS_FREE(sched, sizeof(*sched));
+       }
+
+       cfs_wi_data.wi_stopping = 0;
+       cfs_wi_data.wi_init = 0;
+}
diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile
new file mode 100644 (file)
index 0000000..dff0c04
--- /dev/null
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LUSTRE_FS) += lustre.o
+obj-$(CONFIG_LUSTRE_FS) += llite_lloop.o
+lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \
+           rw.o lproc_llite.o namei.o symlink.o llite_mmap.o \
+           xattr.o remote_perm.o llite_rmtacl.o llite_capa.o \
+           rw26.o super25.o statahead.o \
+           ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o \
+           vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o
+
+llite_lloop-y := lloop.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c
new file mode 100644 (file)
index 0000000..e048538
--- /dev/null
@@ -0,0 +1,675 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/quotaops.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+#include "llite_internal.h"
+
+static void free_dentry_data(struct rcu_head *head)
+{
+       struct ll_dentry_data *lld;
+
+       lld = container_of(head, struct ll_dentry_data, lld_rcu_head);
+       OBD_FREE_PTR(lld);
+}
+
+/* should NOT be called with the dcache lock, see fs/dcache.c */
+static void ll_release(struct dentry *de)
+{
+       struct ll_dentry_data *lld;
+       ENTRY;
+       LASSERT(de != NULL);
+       lld = ll_d2d(de);
+       if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */
+               RETURN_EXIT;
+
+       if (lld->lld_it) {
+               ll_intent_release(lld->lld_it);
+               OBD_FREE(lld->lld_it, sizeof(*lld->lld_it));
+       }
+       LASSERT(lld->lld_cwd_count == 0);
+       LASSERT(lld->lld_mnt_count == 0);
+       de->d_fsdata = NULL;
+       call_rcu(&lld->lld_rcu_head, free_dentry_data);
+
+       EXIT;
+}
+
+/* Compare if two dentries are the same.  Don't match if the existing dentry
+ * is marked invalid.  Returns 1 if different, 0 if the same.
+ *
+ * This avoids a race where ll_lookup_it() instantiates a dentry, but we get
+ * an AST before calling d_revalidate_it().  The dentry still exists (marked
+ * INVALID) so d_lookup() matches it, but we have no lock on it (so
+ * lock_match() fails) and we spin around real_lookup(). */
+int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
+               const struct dentry *dentry, const struct inode *inode,
+               unsigned int len, const char *str, const struct qstr *name)
+{
+       ENTRY;
+
+       if (len != name->len)
+               RETURN(1);
+
+       if (memcmp(str, name->name, len))
+               RETURN(1);
+
+       CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n",
+              name->len, name->name, dentry, dentry->d_flags,
+              d_refcount(dentry));
+
+       /* mountpoint is always valid */
+       if (d_mountpoint((struct dentry *)dentry))
+               RETURN(0);
+
+       if (d_lustre_invalid(dentry))
+               RETURN(1);
+
+       RETURN(0);
+}
+
+static inline int return_if_equal(struct ldlm_lock *lock, void *data)
+{
+       if ((lock->l_flags &
+            (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) ==
+           (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA))
+               return LDLM_ITER_CONTINUE;
+       return LDLM_ITER_STOP;
+}
+
+/* Find any ldlm lock of the inode in mdc and lov.
+ * Returns:  0 if none was found
+ *           1 if one was found
+ *         < 0 on error */
+static int find_cbdata(struct inode *inode)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct lov_stripe_md *lsm;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(inode);
+       rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode),
+                           return_if_equal, NULL);
+       if (rc != 0)
+                RETURN(rc);
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm == NULL)
+               RETURN(rc);
+
+       rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL);
+       ccc_inode_lsm_put(inode, lsm);
+
+       RETURN(rc);
+}
+
+/**
+ * Called when last reference to a dentry is dropped and dcache wants to know
+ * whether or not it should cache it:
+ * - return 1 to delete the dentry immediately
+ * - return 0 to cache the dentry
+ * Should NOT be called with the dcache lock, see fs/dcache.c
+ */
+static int ll_ddelete(const struct dentry *de)
+{
+       ENTRY;
+       LASSERT(de);
+
+       CDEBUG(D_DENTRY, "%s dentry %.*s (%p, parent %p, inode %p) %s%s\n",
+              d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping",
+              de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+              d_unhashed((struct dentry *)de) ? "" : "hashed,",
+              list_empty(&de->d_subdirs) ? "" : "subdirs");
+
+       /* kernel >= 2.6.38 last refcount is decreased after this function. */
+       LASSERT(d_refcount(de) == 1);
+
+       /* Disable this piece of code temporarily because it is called
+        * inside dcache_lock, so it's not appropriate to do lots of work
+        * here. ATTENTION: before enabling this code, LU-2487 must be
+        * resolved. */
+#if 0
+       /* if there is no ldlm lock for this inode, set i_nlink to 0 so
+        * that this inode can be recycled later (b=20433) */
+       if (de->d_inode && !find_cbdata(de->d_inode))
+               clear_nlink(de->d_inode);
+#endif
+
+       if (d_lustre_invalid((struct dentry *)de))
+               RETURN(1);
+       RETURN(0);
+}
+
+static int ll_set_dd(struct dentry *de)
+{
+       ENTRY;
+       LASSERT(de != NULL);
+
+       CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n",
+               de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+               d_refcount(de));
+
+       if (de->d_fsdata == NULL) {
+               struct ll_dentry_data *lld;
+
+               OBD_ALLOC_PTR(lld);
+               if (likely(lld != NULL)) {
+                       spin_lock(&de->d_lock);
+                       if (likely(de->d_fsdata == NULL))
+                               de->d_fsdata = lld;
+                       else
+                               OBD_FREE_PTR(lld);
+                       spin_unlock(&de->d_lock);
+               } else {
+                       RETURN(-ENOMEM);
+               }
+       }
+
+       RETURN(0);
+}
+
+int ll_dops_init(struct dentry *de, int block, int init_sa)
+{
+       struct ll_dentry_data *lld = ll_d2d(de);
+       int rc = 0;
+
+       if (lld == NULL && block != 0) {
+               rc = ll_set_dd(de);
+               if (rc)
+                       return rc;
+
+               lld = ll_d2d(de);
+       }
+
+       if (lld != NULL && init_sa != 0)
+               lld->lld_sa_generation = 0;
+
+       /* kernel >= 2.6.38 d_op is set in d_alloc() */
+       LASSERT(de->d_op == &ll_d_ops);
+       return rc;
+}
+
+void ll_intent_drop_lock(struct lookup_intent *it)
+{
+       if (it->it_op && it->d.lustre.it_lock_mode) {
+               struct lustre_handle handle;
+
+               handle.cookie = it->d.lustre.it_lock_handle;
+
+               CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
+                      " from it %p\n", handle.cookie, it);
+               ldlm_lock_decref(&handle, it->d.lustre.it_lock_mode);
+
+               /* bug 494: intent_release may be called multiple times from
+                * this thread, and we don't want to double-decref this lock */
+               it->d.lustre.it_lock_mode = 0;
+               if (it->d.lustre.it_remote_lock_mode != 0) {
+                       handle.cookie = it->d.lustre.it_remote_lock_handle;
+
+                       CDEBUG(D_DLMTRACE, "releasing remote lock with cookie"
+                              LPX64" from it %p\n", handle.cookie, it);
+                       ldlm_lock_decref(&handle,
+                                        it->d.lustre.it_remote_lock_mode);
+                       it->d.lustre.it_remote_lock_mode = 0;
+               }
+       }
+}
+
+void ll_intent_release(struct lookup_intent *it)
+{
+       ENTRY;
+
+       CDEBUG(D_INFO, "intent %p released\n", it);
+       ll_intent_drop_lock(it);
+       /* We are still holding extra reference on a request, need to free it */
+       if (it_disposition(it, DISP_ENQ_OPEN_REF))
+                ptlrpc_req_finished(it->d.lustre.it_data); /* ll_file_open */
+       if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */
+               ptlrpc_req_finished(it->d.lustre.it_data);
+       if (it_disposition(it, DISP_ENQ_COMPLETE)) /* saved req from revalidate
+                                                   * to lookup */
+               ptlrpc_req_finished(it->d.lustre.it_data);
+
+       it->d.lustre.it_disposition = 0;
+       it->d.lustre.it_data = NULL;
+       EXIT;
+}
+
+void ll_invalidate_aliases(struct inode *inode)
+{
+       struct dentry *dentry;
+       struct ll_d_hlist_node *p;
+       ENTRY;
+
+       LASSERT(inode != NULL);
+
+       CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
+              inode->i_ino, inode->i_generation, inode);
+
+       ll_lock_dcache(inode);
+       ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+               CDEBUG(D_DENTRY, "dentry in drop %.*s (%p) parent %p "
+                      "inode %p flags %d\n", dentry->d_name.len,
+                      dentry->d_name.name, dentry, dentry->d_parent,
+                      dentry->d_inode, dentry->d_flags);
+
+               if (dentry->d_name.len == 1 && dentry->d_name.name[0] == '/') {
+                       CERROR("called on root (?) dentry=%p, inode=%p "
+                              "ino=%lu\n", dentry, inode, inode->i_ino);
+                       lustre_dump_dentry(dentry, 1);
+                       libcfs_debug_dumpstack(NULL);
+               }
+
+               d_lustre_invalidate(dentry);
+       }
+       ll_unlock_dcache(inode);
+
+       EXIT;
+}
+
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+                           struct lookup_intent *it,
+                           struct dentry *de)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (!request)
+               RETURN(0);
+
+       if (it_disposition(it, DISP_LOOKUP_NEG))
+               RETURN(-ENOENT);
+
+       rc = ll_prep_inode(&de->d_inode, request, NULL, it);
+
+       RETURN(rc);
+}
+
+void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry)
+{
+       LASSERT(it != NULL);
+       LASSERT(dentry != NULL);
+
+       if (it->d.lustre.it_lock_mode && dentry->d_inode != NULL) {
+               struct inode *inode = dentry->d_inode;
+               struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+
+               CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+                      inode, inode->i_ino, inode->i_generation);
+               ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+       }
+
+       /* drop lookup or getattr locks immediately */
+       if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) {
+               /* on 2.6 there are situations when several lookups and
+                * revalidations may be requested during a single operation.
+                * Therefore, we don't release the intent here -bzzz */
+               ll_intent_drop_lock(it);
+       }
+}
+
+void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
+{
+       struct lookup_intent *it = *itp;
+
+       if (!it || it->it_op == IT_GETXATTR)
+               it = *itp = deft;
+
+}
+
+int ll_revalidate_it(struct dentry *de, int lookup_flags,
+                    struct lookup_intent *it)
+{
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req = NULL;
+       struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+       struct obd_export *exp;
+       struct inode *parent = de->d_parent->d_inode;
+       int rc;
+
+       ENTRY;
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name,
+              LL_IT2STR(it));
+
+       if (de->d_inode == NULL) {
+               __u64 ibits;
+
+               /* We can only use negative dentries if this is a stat or
+                  lookup; for opens and the like we do need to query the
+                  server. */
+               /* If IT_CREAT is set in the intent op, then we must throw
+                  away this negative dentry and actually issue the request
+                  to create whatever needs to be created (if possible). */
+               if (it && (it->it_op & IT_CREAT))
+                       RETURN(0);
+
+               if (d_lustre_invalid(de))
+                       RETURN(0);
+
+               ibits = MDS_INODELOCK_UPDATE;
+               rc = ll_have_md_lock(parent, &ibits, LCK_MINMODE);
+               GOTO(out_sa, rc);
+       }
+
+       /* Never execute intents for mount points.
+        * Attributes will be fixed up in ll_inode_revalidate_it */
+       if (d_mountpoint(de))
+               GOTO(out_sa, rc = 1);
+
+       /* need to get attributes in case root got changed from other client */
+       if (de == de->d_sb->s_root) {
+               rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP);
+               if (rc == 0)
+                       rc = 1;
+               GOTO(out_sa, rc);
+       }
+
+       exp = ll_i2mdexp(de->d_inode);
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5);
+       ll_frob_intent(&it, &lookup_it);
+       LASSERT(it);
+
+       if (it->it_op == IT_LOOKUP && !d_lustre_invalid(de))
+               RETURN(1);
+
+       if (it->it_op == IT_OPEN) {
+               struct inode *inode = de->d_inode;
+               struct ll_inode_info *lli = ll_i2info(inode);
+               struct obd_client_handle **och_p;
+               __u64 *och_usecount;
+               __u64 ibits;
+
+               /*
+                * We used to check for MDS_INODELOCK_OPEN here, but in fact
+                * just having the LOOKUP lock is enough to verify the inode
+                * is the same. And if the inode is the same and we have a
+                * suitable openhandle, then there is no point in doing
+                * another OPEN RPC just to throw away the newly received
+                * openhandle.  There are no security implications either:
+                * if the file owner or access mode is changed, the LOOKUP
+                * lock is revoked.
+                */
+
+
+               if (it->it_flags & FMODE_WRITE) {
+                       och_p = &lli->lli_mds_write_och;
+                       och_usecount = &lli->lli_open_fd_write_count;
+               } else if (it->it_flags & FMODE_EXEC) {
+                       och_p = &lli->lli_mds_exec_och;
+                       och_usecount = &lli->lli_open_fd_exec_count;
+               } else {
+                       och_p = &lli->lli_mds_read_och;
+                       och_usecount = &lli->lli_open_fd_read_count;
+               }
+               /* Check for the proper lock. */
+               ibits = MDS_INODELOCK_LOOKUP;
+               if (!ll_have_md_lock(inode, &ibits, LCK_MINMODE))
+                       goto do_lock;
+               mutex_lock(&lli->lli_och_mutex);
+               if (*och_p) { /* Everything is open already, do nothing */
+                       /*(*och_usecount)++;  Do not let them steal our open
+                         handle from under us */
+                       SET_BUT_UNUSED(och_usecount);
+                       /* XXX The code above was my original idea, but if we
+                          have the handle yet cannot use it due to later
+                          checks (e.g. O_CREAT|O_EXCL flags set), nobody
+                          would decrement the counter increased here. So we
+                          just hope the lock won't be invalidated in between.
+                          If it is, we'll redo the OPEN request to the MDS
+                          later during the file open path */
+                       mutex_unlock(&lli->lli_och_mutex);
+                       RETURN(1);
+               } else {
+                       mutex_unlock(&lli->lli_och_mutex);
+               }
+       }
+
+       if (it->it_op == IT_GETATTR) {
+               rc = ll_statahead_enter(parent, &de, 0);
+               if (rc == 1)
+                       goto mark;
+               else if (rc != -EAGAIN && rc != 0)
+                       GOTO(out, rc = 0);
+       }
+
+do_lock:
+       op_data = ll_prep_md_op_data(NULL, parent, de->d_inode,
+                                    de->d_name.name, de->d_name.len,
+                                    0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       if (!IS_POSIXACL(parent) || !exp_connect_umask(exp))
+               it->it_create_mode &= ~current_umask();
+       it->it_create_mode |= M_CHECK_STALE;
+       rc = md_intent_lock(exp, op_data, NULL, 0, it,
+                           lookup_flags,
+                           &req, ll_md_blocking_ast, 0);
+       it->it_create_mode &= ~M_CHECK_STALE;
+       ll_finish_md_op_data(op_data);
+
+       /* If req is NULL, then md_intent_lock only tried to do a lock match;
+        * if all went well, it returned 1 if it found locks and 0 otherwise. */
+       if (req == NULL && rc >= 0) {
+               if (!rc)
+                       goto do_lookup;
+               GOTO(out, rc);
+       }
+
+       if (rc < 0) {
+               if (rc != -ESTALE) {
+                       CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
+                              "%d\n", rc, it->d.lustre.it_status);
+               }
+               GOTO(out, rc = 0);
+       }
+
+revalidate_finish:
+       rc = ll_revalidate_it_finish(req, it, de);
+       if (rc != 0) {
+               if (rc != -ESTALE && rc != -ENOENT)
+                       ll_intent_release(it);
+               GOTO(out, rc = 0);
+       }
+
+       if ((it->it_op & IT_OPEN) && de->d_inode &&
+           !S_ISREG(de->d_inode->i_mode) &&
+           !S_ISDIR(de->d_inode->i_mode)) {
+               ll_release_openhandle(de, it);
+       }
+       rc = 1;
+
+out:
+       /* We do not free the request as it may be reused during a following
+        * lookup (see the comment in mdc/mdc_locks.c::mdc_intent_lock());
+        * the request will be freed in ll_lookup_it or in ll_intent_release.
+        * But if the request was not completed, we need to free it.
+        * (bug 5154, 9903) */
+       if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE))
+               ptlrpc_req_finished(req);
+       if (rc == 0) {
+               /* the MDT may grant a layout lock for the newly created file,
+                * so release the lock to avoid leaking it */
+               ll_intent_drop_lock(it);
+               ll_invalidate_aliases(de->d_inode);
+       } else {
+               __u64 bits = 0;
+               __u64 matched_bits = 0;
+
+               CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
+                      "inode %p refc %d\n", de->d_name.len,
+                      de->d_name.name, de, de->d_parent, de->d_inode,
+                      d_refcount(de));
+
+               ll_set_lock_data(exp, de->d_inode, it, &bits);
+
+               /* Note: we have to match both the LOOKUP and PERM locks
+                * here to make sure the dentry is valid and no one is
+                * changing the permission.
+                * But a client connected to a server older than 2.4 is
+                * only granted the LOOKUP lock, so we can only match the
+                * LOOKUP lock for old servers */
+               if (exp_connect_flags(ll_i2mdexp(de->d_inode)) &
+                   OBD_CONNECT_LVB_TYPE)
+                       matched_bits =
+                               MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM;
+               else
+                       matched_bits = MDS_INODELOCK_LOOKUP;
+
+               if (((bits & matched_bits) == matched_bits) &&
+                   d_lustre_invalid(de))
+                       d_lustre_revalidate(de);
+               ll_lookup_finish_locks(it, de);
+       }
+
+mark:
+       if (it != NULL && it->it_op == IT_GETATTR && rc > 0)
+               ll_statahead_mark(parent, de);
+       RETURN(rc);
+
+       /*
+        * This part is here to combat evil-evil race in real_lookup on 2.6
+        * kernels.  The race details are: We enter do_lookup() looking for some
+        * name, there is nothing in dcache for this name yet and d_lookup()
+        * returns NULL.  We proceed to real_lookup(), and while we do this,
+        * another process opens the same file we are looking up (the simplest
+        * reproducer), open succeeds and the dentry is added. Now back to
+        * us. In real_lookup() we do d_lookup() again and suddenly find the
+        * dentry, so we call d_revalidate on it, but there is no lock, so
+        * without this code we would return 0, but unpatched real_lookup just
+        * returns -ENOENT in such a case instead of retrying the lookup. Once
+        * this is dealt with in real_lookup(), all of this ugly mess can go and
+        * we can just check locks in ->d_revalidate without doing any RPCs
+        * ever.
+        */
+do_lookup:
+       if (it != &lookup_it) {
+               /* MDS_INODELOCK_UPDATE needed for IT_GETATTR case. */
+               if (it->it_op == IT_GETATTR)
+                       lookup_it.it_op = IT_GETATTR;
+               ll_lookup_finish_locks(it, de);
+               it = &lookup_it;
+       }
+
+       /* Do real lookup here. */
+       op_data = ll_prep_md_op_data(NULL, parent, NULL, de->d_name.name,
+                                    de->d_name.len, 0, (it->it_op & IT_CREAT ?
+                                                        LUSTRE_OPC_CREATE :
+                                                        LUSTRE_OPC_ANY), NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       rc = md_intent_lock(exp, op_data, NULL, 0,  it, 0, &req,
+                           ll_md_blocking_ast, 0);
+       if (rc >= 0) {
+               struct mdt_body *mdt_body;
+               struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0};
+               mdt_body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+               if (de->d_inode)
+                       fid = *ll_inode2fid(de->d_inode);
+
+               /* see if we got the same inode; if not, the dentry is invalid */
+               if (lu_fid_eq(&fid, &mdt_body->fid1)) {
+                       ll_finish_md_op_data(op_data);
+                       op_data = NULL;
+                       goto revalidate_finish;
+               }
+               ll_intent_release(it);
+       }
+       ll_finish_md_op_data(op_data);
+       GOTO(out, rc = 0);
+
+out_sa:
+       /*
+        * For the rc == 1 case, do not return directly, to avoid losing the
+        * statahead window; for rc == 0, the "lookup" will be done later.
+        */
+       if (it != NULL && it->it_op == IT_GETATTR && rc == 1)
+               ll_statahead_enter(parent, &de, 1);
+       goto mark;
+}
+
+/*
+ * Always trust cached dentries. Update statahead window if necessary.
+ */
+int ll_revalidate_nd(struct dentry *dentry, unsigned int flags)
+{
+       struct inode *parent = dentry->d_parent->d_inode;
+       int unplug = 0;
+
+       ENTRY;
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%s,flags=%u\n",
+              dentry->d_name.name, flags);
+
+       if (!(flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) &&
+           ll_need_statahead(parent, dentry) > 0) {
+               if (flags & LOOKUP_RCU)
+                       RETURN(-ECHILD);
+
+               if (dentry->d_inode == NULL)
+                       unplug = 1;
+               do_statahead_enter(parent, &dentry, unplug);
+               ll_statahead_mark(parent, dentry);
+       }
+
+       RETURN(1);
+}
+
+
+void ll_d_iput(struct dentry *de, struct inode *inode)
+{
+       LASSERT(inode);
+       if (!find_cbdata(inode))
+               clear_nlink(inode);
+       iput(inode);
+}
+
+struct dentry_operations ll_d_ops = {
+       .d_revalidate = ll_revalidate_nd,
+       .d_release = ll_release,
+       .d_delete  = ll_ddelete,
+       .d_iput    = ll_d_iput,
+       .d_compare = ll_dcompare,
+};
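+
+/*
+ * These operations only take effect once a dentry's d_op points at
+ * ll_d_ops; that is expected to happen via the superblock (sb->s_d_op)
+ * during mount, so every dentry allocated under a Lustre mount picks them
+ * up in d_alloc() -- compare the "kernel >= 2.6.38 d_op is set in
+ * d_alloc()" assertion in ll_dops_init() above.
+ */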
diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c
new file mode 100644 (file)
index 0000000..23c61fe
--- /dev/null
@@ -0,0 +1,1978 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/dir.c
+ *
+ * Directory code for lustre client.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/buffer_head.h>   /* for wait_on_buffer */
+#include <linux/pagevec.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre/lustre_idl.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_fid.h>
+#include "llite_internal.h"
+
+/*
+ * (new) readdir implementation overview.
+ *
+ * The original lustre readdir implementation cached an exact copy of the raw
+ * directory pages on the client. These pages were indexed in the client page
+ * cache by their logical offset in the directory file. This design, while
+ * very simple and intuitive, had some inherent problems:
+ *
+ *     . it implies that byte offset to the directory entry serves as a
+ *     telldir(3)/seekdir(3) cookie, but that offset is not stable: in
+ *     ext3/htree directory entries may move due to splits, and more
+ *     importantly,
+ *
+ *     . it is incompatible with the design of split directories for cmd3,
+ *     which assumes that names are distributed across nodes based on their
+ *     hash, and so readdir should be done in hash order.
+ *
+ * The new readdir implementation does readdir in hash order, and uses the
+ * hash of a file name as a telldir/seekdir cookie. This leads to a number of
+ * complications:
+ *
+ *     . hash is not unique, so it cannot be used to index cached directory
+ *     pages on the client (note that it requires a whole pageful of
+ *     hash-collided entries to cause two pages to have identical hashes);
+ *
+ *     . hash is not unique, so it cannot, strictly speaking, be used as an
+ *     entry cookie. ext3/htree has the same problem and the lustre
+ *     implementation mimics its solution: seekdir(hash) positions the
+ *     directory at the first entry with the given hash.
+ *
+ * Client side.
+ *
+ * 0. caching
+ *
+ * The client caches directory pages using the hash of the first entry as an
+ * index. As noted above, the hash is not unique, so this solution doesn't
+ * work as is: special processing is needed for "page hash chains" (i.e.,
+ * sequences of pages filled with entries all having the same hash value).
+ *
+ * First, such chains have to be detected. To this end, the server returns to
+ * the client the hash of the first entry on the page following the one
+ * returned. When the client detects that this hash is the same as the hash of
+ * the first entry on the returned page, a page hash collision has to be
+ * handled. Pages in the hash chain, except the first one, are termed
+ * "overflow pages".
+ *
+ * The solution to the index uniqueness problem is to not cache overflow
+ * pages. Instead, when a page hash collision is detected, all overflow pages
+ * from the emerging chain are immediately requested from the server and
+ * placed in a special data structure (struct ll_dir_chain). This data
+ * structure is used by ll_readdir() to process entries from overflow pages.
+ * When the readdir invocation finishes, the overflow pages are discarded. If
+ * the page hash collision chain wasn't completely processed, the next call to
+ * readdir will again detect the collision, again read the overflow pages in,
+ * process the next portion of entries and again discard the pages. This is
+ * not as wasteful as it looks, because, given a reasonable hash, page hash
+ * collisions are extremely rare.
+ *
+ * 1. directory positioning
+ *
+ * When seekdir(hash) is called, original
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ * Server.
+ *
+ * identification of and access to overflow pages
+ *
+ * page format
+ *
+ * A page in the MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page
+ * contains an lu_dirpage header which describes the start/end hash, and
+ * whether this page is empty (contains no dir entry) or whether its hash
+ * collides with the next page. After the client receives the reply, several
+ * such pages will be integrated into one dir page of PAGE_CACHE_SIZE (if
+ * PAGE_CACHE_SIZE is greater than LU_PAGE_SIZE), and the lu_dirpage for this
+ * integrated page will be adjusted. See lmv_adjust_dirpages().
+ *
+ */
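+
+/*
+ * A minimal illustrative sketch of the hash-to-index mapping described above
+ * (the in-tree helper actually used below is hash_x_index(); its exact
+ * expression may differ from this sketch).  Indexing cached pages by the
+ * complement of their starting hash lets a radix-tree gang lookup starting at
+ * the target index find the cached page whose starting hash is the largest
+ * one not greater than the hash being sought.  On 32-bit hosts with 64-bit
+ * hashes only the high 32 bits are kept, mirroring ll_dir_page_locate():
+ *
+ *     static unsigned long example_hash_to_index(__u64 hash, int hash64)
+ *     {
+ *             if (BITS_PER_LONG == 32 && hash64)
+ *                     hash >>= 32;
+ *             return ~0UL - (unsigned long)hash;
+ *     }
+ */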
+
+/* returns the page unlocked, but with a reference */
+static int ll_dir_filler(void *_hash, struct page *page0)
+{
+       struct inode *inode = page0->mapping->host;
+       int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH;
+       struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp;
+       struct ptlrpc_request *request;
+       struct mdt_body *body;
+       struct md_op_data *op_data;
+       __u64 hash = *((__u64 *)_hash);
+       struct page **page_pool;
+       struct page *page;
+       struct lu_dirpage *dp;
+       int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT;
+       int nrdpgs = 0; /* number of pages actually read */
+       int npages;
+       int i;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash "LPU64"\n",
+              inode->i_ino, inode->i_generation, inode, hash);
+
+       LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
+
+       OBD_ALLOC(page_pool, sizeof(page) * max_pages);
+       if (page_pool != NULL) {
+               page_pool[0] = page0;
+       } else {
+               page_pool = &page0;
+               max_pages = 1;
+       }
+       for (npages = 1; npages < max_pages; npages++) {
+               page = page_cache_alloc_cold(inode->i_mapping);
+               if (!page)
+                       break;
+               page_pool[npages] = page;
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       op_data->op_npages = npages;
+       op_data->op_offset = hash;
+       rc = md_readpage(exp, op_data, page_pool, &request);
+       ll_finish_md_op_data(op_data);
+       if (rc == 0) {
+               body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+               /* Checked by mdc_readpage() */
+               LASSERT(body != NULL);
+
+               if (body->valid & OBD_MD_FLSIZE)
+                       cl_isize_write(inode, body->size);
+
+               nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1)
+                        >> PAGE_CACHE_SHIFT;
+               SetPageUptodate(page0);
+       }
+       unlock_page(page0);
+       ptlrpc_req_finished(request);
+
+       CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages);
+
+       ll_pagevec_init(&lru_pvec, 0);
+       for (i = 1; i < npages; i++) {
+               unsigned long offset;
+               int ret;
+
+               page = page_pool[i];
+
+               if (rc < 0 || i >= nrdpgs) {
+                       page_cache_release(page);
+                       continue;
+               }
+
+               SetPageUptodate(page);
+
+               dp = kmap(page);
+               hash = le64_to_cpu(dp->ldp_hash_start);
+               kunmap(page);
+
+               offset = hash_x_index(hash, hash64);
+
+               prefetchw(&page->flags);
+               ret = add_to_page_cache_lru(page, inode->i_mapping, offset,
+                                           GFP_KERNEL);
+               if (ret == 0) {
+                       unlock_page(page);
+                       if (ll_pagevec_add(&lru_pvec, page) == 0)
+                               ll_pagevec_lru_add_file(&lru_pvec);
+               } else {
+                       CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:"
+                              " %d\n", offset, ret);
+               }
+               page_cache_release(page);
+       }
+       ll_pagevec_lru_add_file(&lru_pvec);
+
+       if (page_pool != &page0)
+               OBD_FREE(page_pool, sizeof(struct page *) * max_pages);
+       EXIT;
+       return rc;
+}
+
+static void ll_check_page(struct inode *dir, struct page *page)
+{
+       /* XXX: check page format later */
+       SetPageChecked(page);
+}
+
+void ll_release_page(struct page *page, int remove)
+{
+       kunmap(page);
+       if (remove) {
+               lock_page(page);
+               if (likely(page->mapping != NULL))
+                       truncate_complete_page(page->mapping, page);
+               unlock_page(page);
+       }
+       page_cache_release(page);
+}
+
+/*
+ * Find, kmap and return page that contains given hash.
+ */
+static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
+                                      __u64 *start, __u64 *end)
+{
+       int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+       struct address_space *mapping = dir->i_mapping;
+       /*
+        * The complement of the hash is used as an index so that
+        * radix_tree_gang_lookup() can be used to find the page with a
+        * starting hash _smaller_ than the one we are looking for.
+        */
+       unsigned long offset = hash_x_index(*hash, hash64);
+       struct page *page;
+       int found;
+
+       TREE_READ_LOCK_IRQ(mapping);
+       found = radix_tree_gang_lookup(&mapping->page_tree,
+                                      (void **)&page, offset, 1);
+       if (found > 0) {
+               struct lu_dirpage *dp;
+
+               page_cache_get(page);
+               TREE_READ_UNLOCK_IRQ(mapping);
+               /*
+                * In contrast to find_lock_page() we are sure that the
+                * directory page cannot be truncated (while the DLM lock is
+                * held) and, hence, can avoid a restart.
+                *
+                * In fact, the page cannot be locked here at all, because
+                * ll_dir_filler() does synchronous I/O.
+                */
+               wait_on_page_locked(page);
+               if (PageUptodate(page)) {
+                       dp = kmap(page);
+                       if (BITS_PER_LONG == 32 && hash64) {
+                               *start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+                               *end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+                               *hash  = *hash >> 32;
+                       } else {
+                               *start = le64_to_cpu(dp->ldp_hash_start);
+                               *end   = le64_to_cpu(dp->ldp_hash_end);
+                       }
+                       LASSERTF(*start <= *hash, "start = "LPX64",end = "
+                                LPX64",hash = "LPX64"\n", *start, *end, *hash);
+                       CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash "LPU64"\n",
+                              offset, *start, *end, *hash);
+                       if (*hash > *end) {
+                               ll_release_page(page, 0);
+                               page = NULL;
+                       } else if (*end != *start && *hash == *end) {
+                               /*
+                                * upon hash collision, remove this page,
+                                * otherwise put page reference, and
+                                * ll_get_dir_page() will issue RPC to fetch
+                                * the page we want.
+                                */
+                               ll_release_page(page,
+                                   le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+                               page = NULL;
+                       }
+               } else {
+                       page_cache_release(page);
+                       page = ERR_PTR(-EIO);
+               }
+
+       } else {
+               TREE_READ_UNLOCK_IRQ(mapping);
+               page = NULL;
+       }
+       return page;
+}
+
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+                            struct ll_dir_chain *chain)
+{
+       ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
+       struct address_space *mapping = dir->i_mapping;
+       struct lustre_handle lockh;
+       struct lu_dirpage *dp;
+       struct page *page;
+       ldlm_mode_t mode;
+       int rc;
+       __u64 start = 0;
+       __u64 end = 0;
+       __u64 lhash = hash;
+       struct ll_inode_info *lli = ll_i2info(dir);
+       int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+
+       mode = LCK_PR;
+       rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
+                          ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh);
+       if (!rc) {
+               struct ldlm_enqueue_info einfo = {.ei_type = LDLM_IBITS,
+                                                 .ei_mode = mode,
+                                                 .ei_cb_bl =
+                                                 ll_md_blocking_ast,
+                                                 .ei_cb_cp =
+                                                 ldlm_completion_ast,
+                                                 .ei_cb_gl = NULL,
+                                                 .ei_cb_wg = NULL,
+                                                 .ei_cbdata = NULL};
+               struct lookup_intent it = { .it_op = IT_READDIR };
+               struct ptlrpc_request *request;
+               struct md_op_data *op_data;
+
+               op_data = ll_prep_md_op_data(NULL, dir, NULL, NULL, 0, 0,
+               LUSTRE_OPC_ANY, NULL);
+               if (IS_ERR(op_data))
+                       return (void *)op_data;
+
+               rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it,
+                               op_data, &lockh, NULL, 0, NULL, 0);
+
+               ll_finish_md_op_data(op_data);
+
+               request = (struct ptlrpc_request *)it.d.lustre.it_data;
+               if (request)
+                       ptlrpc_req_finished(request);
+               if (rc < 0) {
+                       CERROR("lock enqueue: "DFID" at "LPU64": rc %d\n",
+                               PFID(ll_inode2fid(dir)), hash, rc);
+                       return ERR_PTR(rc);
+               }
+
+               CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n",
+                      dir, dir->i_ino, dir->i_generation);
+               md_set_lock_data(ll_i2sbi(dir)->ll_md_exp,
+                                &it.d.lustre.it_lock_handle, dir, NULL);
+       } else {
+               /* for cross-ref object, l_ast_data of the lock may not be set,
+                * we reset it here */
+               md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie,
+                                dir, NULL);
+       }
+       ldlm_lock_dump_handle(D_OTHER, &lockh);
+
+       mutex_lock(&lli->lli_readdir_mutex);
+       page = ll_dir_page_locate(dir, &lhash, &start, &end);
+       if (IS_ERR(page)) {
+               CERROR("dir page locate: "DFID" at "LPU64": rc %ld\n",
+                      PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
+               GOTO(out_unlock, page);
+       } else if (page != NULL) {
+               /*
+                * XXX nikita: not entirely correct handling of a corner case:
+                * suppose hash chain of entries with hash value HASH crosses
+                * border between pages P0 and P1. First both P0 and P1 are
+                * cached, seekdir() is called for some entry from the P0 part
+                * of the chain. Later P0 goes out of cache. telldir(HASH)
+                * happens and finds P1, as it starts with matching hash
+                * value. Remaining entries from P0 part of the chain are
+                * skipped. (Is that really a bug?)
+                *
+                * Possible solutions: 0. don't cache P1 in such a case, handle
+                * it as an "overflow" page. 1. invalidate all pages at
+                * once. 2. use HASH|1 as an index for P1.
+                */
+               GOTO(hash_collision, page);
+       }
+
+       page = read_cache_page(mapping, hash_x_index(hash, hash64),
+                              ll_dir_filler, &lhash);
+       if (IS_ERR(page)) {
+               CERROR("read cache page: "DFID" at "LPU64": rc %ld\n",
+                      PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
+               GOTO(out_unlock, page);
+       }
+
+       wait_on_page_locked(page);
+       (void)kmap(page);
+       if (!PageUptodate(page)) {
+               CERROR("page not updated: "DFID" at "LPU64": rc %d\n",
+                      PFID(ll_inode2fid(dir)), hash, -5);
+               goto fail;
+       }
+       if (!PageChecked(page))
+               ll_check_page(dir, page);
+       if (PageError(page)) {
+               CERROR("page error: "DFID" at "LPU64": rc %d\n",
+                      PFID(ll_inode2fid(dir)), hash, -5);
+               goto fail;
+       }
+hash_collision:
+       dp = page_address(page);
+       if (BITS_PER_LONG == 32 && hash64) {
+               start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+               end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+               lhash = hash >> 32;
+       } else {
+               start = le64_to_cpu(dp->ldp_hash_start);
+               end   = le64_to_cpu(dp->ldp_hash_end);
+               lhash = hash;
+       }
+       if (end == start) {
+               LASSERT(start == lhash);
+               CWARN("Page-wide hash collision: "LPU64"\n", end);
+               if (BITS_PER_LONG == 32 && hash64)
+                       CWARN("Real page-wide hash collision at ["LPU64" "LPU64
+                             "] with hash "LPU64"\n",
+                             le64_to_cpu(dp->ldp_hash_start),
+                             le64_to_cpu(dp->ldp_hash_end), hash);
+               /*
+                * Fetch whole overflow chain...
+                *
+                * XXX not yet.
+                */
+               goto fail;
+       }
+out_unlock:
+       mutex_unlock(&lli->lli_readdir_mutex);
+       ldlm_lock_decref(&lockh, mode);
+       return page;
+
+fail:
+       ll_release_page(page, 1);
+       page = ERR_PTR(-EIO);
+       goto out_unlock;
+}
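+
+/*
+ * Usage note (illustrative): ll_get_dir_page() returns the page kmapped and
+ * referenced, so every successful call must be paired with ll_release_page(),
+ * as ll_dir_read() below does:
+ *
+ *     page = ll_get_dir_page(dir, hash, &chain);
+ *     if (!IS_ERR(page)) {
+ *             dp = page_address(page);
+ *             ... walk the lu_dirent entries of dp ...
+ *             ll_release_page(page, 0);
+ *     }
+ */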
+
+int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie,
+               filldir_t filldir)
+{
+       struct ll_inode_info *info       = ll_i2info(inode);
+       struct ll_sb_info    *sbi       = ll_i2sbi(inode);
+       __u64            pos    = *_pos;
+       int                api32      = ll_need_32bit_api(sbi);
+       int                hash64     = sbi->ll_flags & LL_SBI_64BIT_HASH;
+       struct page       *page;
+       struct ll_dir_chain   chain;
+       int                done = 0;
+       int                rc = 0;
+       ENTRY;
+
+       ll_dir_chain_init(&chain);
+
+       page = ll_get_dir_page(inode, pos, &chain);
+
+       while (rc == 0 && !done) {
+               struct lu_dirpage *dp;
+               struct lu_dirent  *ent;
+
+               if (!IS_ERR(page)) {
+                       /*
+                        * If page is empty (end of directory is reached),
+                        * use this value.
+                        */
+                       __u64 hash = MDS_DIR_END_OFF;
+                       __u64 next;
+
+                       dp = page_address(page);
+                       for (ent = lu_dirent_start(dp); ent != NULL && !done;
+                            ent = lu_dirent_next(ent)) {
+                               __u16     type;
+                               int         namelen;
+                               struct lu_fid  fid;
+                               __u64     lhash;
+                               __u64     ino;
+
+                               /*
+                                * XXX: implement correct swabbing here.
+                                */
+
+                               hash = le64_to_cpu(ent->lde_hash);
+                               if (hash < pos)
+                                       /*
+                                        * Skip until we find target hash
+                                        * value.
+                                        */
+                                       continue;
+
+                               namelen = le16_to_cpu(ent->lde_namelen);
+                               if (namelen == 0)
+                                       /*
+                                        * Skip dummy record.
+                                        */
+                                       continue;
+
+                               if (api32 && hash64)
+                                       lhash = hash >> 32;
+                               else
+                                       lhash = hash;
+                               fid_le_to_cpu(&fid, &ent->lde_fid);
+                               ino = cl_fid_build_ino(&fid, api32);
+                               type = ll_dirent_type_get(ent);
+                               /* ll_nfs_get_name_filldir() accesses the
+                                * entry through its 'lde_name', so the 'name'
+                                * parameter passed to filldir() must point
+                                * into the 'ent' itself. */
+                               done = filldir(cookie, ent->lde_name, namelen,
+                                              lhash, ino, type);
+                       }
+                       next = le64_to_cpu(dp->ldp_hash_end);
+                       if (!done) {
+                               pos = next;
+                               if (pos == MDS_DIR_END_OFF) {
+                                       /*
+                                        * End of directory reached.
+                                        */
+                                       done = 1;
+                                       ll_release_page(page, 0);
+                               } else if (1 /* chain is exhausted*/) {
+                                       /*
+                                        * Normal case: continue to the next
+                                        * page.
+                                        */
+                                       ll_release_page(page,
+                                           le32_to_cpu(dp->ldp_flags) &
+                                                       LDF_COLLIDE);
+                                       next = pos;
+                                       page = ll_get_dir_page(inode, pos,
+                                                              &chain);
+                               } else {
+                                       /*
+                                        * go into overflow page.
+                                        */
+                                       LASSERT(le32_to_cpu(dp->ldp_flags) &
+                                               LDF_COLLIDE);
+                                       ll_release_page(page, 1);
+                               }
+                       } else {
+                               pos = hash;
+                               ll_release_page(page, 0);
+                       }
+               } else {
+                       rc = PTR_ERR(page);
+                       CERROR("error reading dir "DFID" at %lu: rc %d\n",
+                              PFID(&info->lli_fid), (unsigned long)pos, rc);
+               }
+       }
+
+       *_pos = pos;
+       ll_dir_chain_fini(&chain);
+       RETURN(rc);
+}
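+
+/*
+ * A minimal illustrative callback in the shape ll_dir_read() expects (the
+ * names here are made up): the directory hash is passed as the offset cookie,
+ * and a non-zero return value stops the iteration.
+ *
+ *     struct example_ctx {
+ *             int count;
+ *     };
+ *
+ *     static int example_filldir(void *cookie, const char *name, int namelen,
+ *                                loff_t hash, u64 ino, unsigned int d_type)
+ *     {
+ *             struct example_ctx *ctx = cookie;
+ *
+ *             ctx->count++;
+ *             return 0;
+ *     }
+ */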
+
+static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
+{
+       struct inode            *inode  = filp->f_dentry->d_inode;
+       struct ll_file_data     *lfd    = LUSTRE_FPRIVATE(filp);
+       struct ll_sb_info       *sbi    = ll_i2sbi(inode);
+       __u64                   pos     = lfd->lfd_pos;
+       int                     hash64  = sbi->ll_flags & LL_SBI_64BIT_HASH;
+       int                     api32   = ll_need_32bit_api(sbi);
+       int                     rc;
+       struct path             path;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu "
+              " 32bit_api %d\n", inode->i_ino, inode->i_generation,
+              inode, (unsigned long)pos, i_size_read(inode), api32);
+
+       if (pos == MDS_DIR_END_OFF)
+               /*
+                * end-of-file.
+                */
+               GOTO(out, rc = 0);
+
+       rc = ll_dir_read(inode, &pos, cookie, filldir);
+       lfd->lfd_pos = pos;
+       if (pos == MDS_DIR_END_OFF) {
+               if (api32)
+                       filp->f_pos = LL_DIR_END_OFF_32BIT;
+               else
+                       filp->f_pos = LL_DIR_END_OFF;
+       } else {
+               if (api32 && hash64)
+                       filp->f_pos = pos >> 32;
+               else
+                       filp->f_pos = pos;
+       }
+       filp->f_version = inode->i_version;
+       path.mnt = filp->f_path.mnt;
+       path.dentry = filp->f_dentry;
+       touch_atime(&path);
+
+out:
+       if (!rc)
+               ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1);
+
+       RETURN(rc);
+}
+
+int ll_send_mgc_param(struct obd_export *mgc, char *string)
+{
+       struct mgs_send_param *msp;
+       int rc = 0;
+
+       OBD_ALLOC_PTR(msp);
+       if (!msp)
+               return -ENOMEM;
+
+       strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
+       rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
+                               sizeof(struct mgs_send_param), msp, NULL);
+       if (rc)
+               CERROR("Failed to set parameter: %d\n", rc);
+       OBD_FREE_PTR(msp);
+
+       return rc;
+}
+
+int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump,
+                       char *filename)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       int mode;
+       int err;
+
+       ENTRY;
+
+       mode = (0755 & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR;
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, filename,
+                                    strlen(filename), mode, LUSTRE_OPC_MKDIR,
+                                    lump);
+       if (IS_ERR(op_data))
+               GOTO(err_exit, err = PTR_ERR(op_data));
+
+       op_data->op_cli_flags |= CLI_SET_MEA;
+       err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode,
+                       current_fsuid(), current_fsgid(),
+                       cfs_curproc_cap_pack(), 0, &request);
+       ll_finish_md_op_data(op_data);
+       if (err)
+               GOTO(err_exit, err);
+err_exit:
+       ptlrpc_req_finished(request);
+       return err;
+}
+
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+                    int set_default)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req = NULL;
+       int rc = 0;
+       struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
+       struct obd_device *mgc = lsi->lsi_mgc;
+       int lum_size;
+       ENTRY;
+
+       if (lump != NULL) {
+               /*
+                * This is coming from userspace, so should be in
+                * local endian.  But the MDS would like it in little
+                * endian, so we swab it before we send it.
+                */
+               switch (lump->lmm_magic) {
+               case LOV_USER_MAGIC_V1: {
+                       if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+                               lustre_swab_lov_user_md_v1(lump);
+                       lum_size = sizeof(struct lov_user_md_v1);
+                       break;
+               }
+               case LOV_USER_MAGIC_V3: {
+                       if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+                               lustre_swab_lov_user_md_v3(
+                                       (struct lov_user_md_v3 *)lump);
+                       lum_size = sizeof(struct lov_user_md_v3);
+                       break;
+               }
+               default: {
+                       CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+                                       " %#08x != %#08x nor %#08x\n",
+                                       lump->lmm_magic, LOV_USER_MAGIC_V1,
+                                       LOV_USER_MAGIC_V3);
+                       RETURN(-EINVAL);
+               }
+               }
+       } else {
+               lum_size = sizeof(struct lov_user_md_v1);
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC))
+               op_data->op_cli_flags |= CLI_SET_MEA;
+
+       /* swabbing is done in lov_setstripe() on server side */
+       rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size,
+                       NULL, 0, &req, NULL);
+       ll_finish_md_op_data(op_data);
+       ptlrpc_req_finished(req);
+       if (rc) {
+               if (rc != -EPERM && rc != -EACCES)
+                       CERROR("mdc_setattr fails: rc = %d\n", rc);
+       }
+
+       /* In the following we use the fact that LOV_USER_MAGIC_V1 and
+        * LOV_USER_MAGIC_V3 have the same initial fields, so we do not
+        * need to make the distinction between the two versions. */
+       if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
+               char *param = NULL;
+               char *buf;
+
+               OBD_ALLOC(param, MGS_PARAM_MAXLEN);
+               if (param == NULL)
+                       GOTO(end, rc = -ENOMEM);
+
+               buf = param;
+               /* Get fsname and assume devname to be -MDT0000. */
+               ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN);
+               strcat(buf, "-MDT0000.lov");
+               buf += strlen(buf);
+
+               /* Set root stripesize */
+               sprintf(buf, ".stripesize=%u",
+                       lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
+               rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+               if (rc)
+                       GOTO(end, rc);
+
+               /* Set root stripecount */
+               sprintf(buf, ".stripecount=%hd",
+                       lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
+               rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+               if (rc)
+                       GOTO(end, rc);
+
+               /* Set root stripeoffset */
+               sprintf(buf, ".stripeoffset=%hd",
+                       lump ? le16_to_cpu(lump->lmm_stripe_offset) :
+                       (typeof(lump->lmm_stripe_offset))(-1));
+               rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+
+end:
+               if (param != NULL)
+                       OBD_FREE(param, MGS_PARAM_MAXLEN);
+       }
+       RETURN(rc);
+}
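+
+/*
+ * For illustration, assuming a filesystem named "lustre" and a default layout
+ * of one 1 MiB stripe at offset -1, the parameters built above and pushed
+ * through ll_send_mgc_param() would be:
+ *
+ *     lustre-MDT0000.lov.stripesize=1048576
+ *     lustre-MDT0000.lov.stripecount=1
+ *     lustre-MDT0000.lov.stripeoffset=-1
+ */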
+
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+                    int *lmm_size, struct ptlrpc_request **request)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct mdt_body   *body;
+       struct lov_mds_md *lmm = NULL;
+       struct ptlrpc_request *req = NULL;
+       int rc, lmmsize;
+       struct md_op_data *op_data;
+
+       rc = ll_get_max_mdsize(sbi, &lmmsize);
+       if (rc)
+               RETURN(rc);
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+                                    0, lmmsize, LUSTRE_OPC_ANY,
+                                    NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+       rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+       ll_finish_md_op_data(op_data);
+       if (rc < 0) {
+               CDEBUG(D_INFO, "md_getattr failed on inode "
+                      "%lu/%u: rc %d\n", inode->i_ino,
+                      inode->i_generation, rc);
+               GOTO(out, rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+
+       lmmsize = body->eadatasize;
+
+       if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
+           lmmsize == 0) {
+               GOTO(out, rc = -ENODATA);
+       }
+
+       lmm = req_capsule_server_sized_get(&req->rq_pill,
+                                          &RMF_MDT_MD, lmmsize);
+       LASSERT(lmm != NULL);
+
+       /*
+        * This is coming from the MDS, so is probably in
+        * little endian.  We convert it to host endian before
+        * passing it to userspace.
+        */
+       /* We don't swab objects for directories */
+       switch (le32_to_cpu(lmm->lmm_magic)) {
+       case LOV_MAGIC_V1:
+               if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+                       lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+               break;
+       case LOV_MAGIC_V3:
+               if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+                       lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+               break;
+       default:
+               CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
+               rc = -EPROTO;
+       }
+out:
+       *lmmp = lmm;
+       *lmm_size = lmmsize;
+       *request = req;
+       return rc;
+}
+
+/*
+ *  Get MDT index for the inode.
+ */
+int ll_get_mdt_idx(struct inode *inode)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct md_op_data *op_data;
+       int rc, mdtidx;
+       ENTRY;
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0,
+                                    0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       op_data->op_flags |= MF_GET_MDT_IDX;
+       rc = md_getattr(sbi->ll_md_exp, op_data, NULL);
+       mdtidx = op_data->op_mds;
+       ll_finish_md_op_data(op_data);
+       if (rc < 0) {
+               CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+               RETURN(rc);
+       }
+       return mdtidx;
+}
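+
+/*
+ * Illustrative userspace counterpart (error handling omitted): the
+ * LL_IOC_GET_MDTIDX case in ll_dir_ioctl() below copies the index returned
+ * here back through the ioctl argument, so a caller holding an open directory
+ * file descriptor can do roughly:
+ *
+ *     int idx = -1;
+ *
+ *     if (ioctl(dirfd, LL_IOC_GET_MDTIDX, &idx) == 0)
+ *             printf("directory resides on MDT%04x\n", idx);
+ */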
+
+/**
+ * Generic handler to do any pre-copy work.
+ *
+ * It sends a first hsm_progress (with extent length == 0) to the coordinator
+ * as a first indication that the real work has started.
+ *
+ * Moreover, for an ARCHIVE request, it will sample the file data version and
+ * store it in \a copy.
+ *
+ * \return 0 on success.
+ */
+static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy)
+{
+       struct ll_sb_info               *sbi = ll_s2sbi(sb);
+       struct hsm_progress_kernel       hpk;
+       int                              rc;
+       ENTRY;
+
+       /* Forge a hsm_progress based on data from copy. */
+       hpk.hpk_fid = copy->hc_hai.hai_fid;
+       hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+       hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset;
+       hpk.hpk_extent.length = 0;
+       hpk.hpk_flags = 0;
+       hpk.hpk_errval = 0;
+       hpk.hpk_data_version = 0;
+
+
+       /* For archive request, we need to read the current file version. */
+       if (copy->hc_hai.hai_action == HSMA_ARCHIVE) {
+               struct inode    *inode;
+               __u64            data_version = 0;
+
+               /* Get inode for this fid */
+               inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+               if (IS_ERR(inode)) {
+                       hpk.hpk_flags |= HP_FLAG_RETRY;
+                       /* hpk_errval is >= 0 */
+                       hpk.hpk_errval = -PTR_ERR(inode);
+                       GOTO(progress, rc = PTR_ERR(inode));
+               }
+
+               /* Read current file data version */
+               rc = ll_data_version(inode, &data_version, 1);
+               iput(inode);
+               if (rc != 0) {
+                       CDEBUG(D_HSM, "Could not read file data version of "
+                                     DFID" (rc = %d). Archive request ("
+                                     LPX64") could not be done.\n",
+                                     PFID(&copy->hc_hai.hai_fid), rc,
+                                     copy->hc_hai.hai_cookie);
+                       hpk.hpk_flags |= HP_FLAG_RETRY;
+                       /* hpk_errval must be >= 0 */
+                       hpk.hpk_errval = -rc;
+                       GOTO(progress, rc);
+               }
+
+               /* Store it in the hsm_copy for later copytool use.
+                * Always modified even if there is no lsm. */
+               copy->hc_data_version = data_version;
+       }
+
+progress:
+       rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+                          &hpk, NULL);
+
+       RETURN(rc);
+}
+
+/**
+ * Generic handler to do any post-copy work.
+ *
+ * It will send the last hsm_progress update to the coordinator to inform it
+ * that the copy is finished and whether it was successful or not.
+ *
+ * Moreover,
+ * - for an ARCHIVE request, it will sample the file data version and compare
+ *   it with the version saved in ll_ioc_copy_start(). If they do not match,
+ *   the copy is considered to have failed.
+ * - for a RESTORE request, it will sample the file data version and send it
+ *   to the coordinator, which is useful if the file was imported as
+ *   'released'.
+ *
+ * \return 0 on success.
+ */
+static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy)
+{
+       struct ll_sb_info               *sbi = ll_s2sbi(sb);
+       struct hsm_progress_kernel       hpk;
+       int                              rc;
+       ENTRY;
+
+       /* If you modify the logic here, also check llapi_hsm_copy_end(). */
+       /* Take care: copy->hc_hai.hai_action, len, gid and data are not
+        * initialized if copy_end was called with copy == NULL.
+        */
+
+       /* Forge a hsm_progress based on data from copy. */
+       hpk.hpk_fid = copy->hc_hai.hai_fid;
+       hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+       hpk.hpk_extent = copy->hc_hai.hai_extent;
+       hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED;
+       hpk.hpk_errval = copy->hc_errval;
+       hpk.hpk_data_version = 0;
+
+       /* For an archive request, we need to check that the file data was not
+        * changed.
+        *
+        * For a restore request, we need to send the file data version; this
+        * is useful when the file was created using hsm_import.
+        */
+       if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) ||
+            (copy->hc_hai.hai_action == HSMA_RESTORE)) &&
+           (copy->hc_errval == 0)) {
+               struct inode    *inode;
+               __u64            data_version = 0;
+
+               /* Get lsm for this fid */
+               inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+               if (IS_ERR(inode)) {
+                       hpk.hpk_flags |= HP_FLAG_RETRY;
+                       /* hpk_errval must be >= 0 */
+                       hpk.hpk_errval = -PTR_ERR(inode);
+                       GOTO(progress, rc = PTR_ERR(inode));
+               }
+
+               rc = ll_data_version(inode, &data_version,
+                                    copy->hc_hai.hai_action == HSMA_ARCHIVE);
+               iput(inode);
+               if (rc) {
+                       CDEBUG(D_HSM, "Could not read file data version. "
+                                     "Request could not be confirmed.\n");
+                       if (hpk.hpk_errval == 0)
+                               hpk.hpk_errval = -rc;
+                       GOTO(progress, rc);
+               }
+
+               /* Store the current data version in the progress record for
+                * the coordinator. Always modified even if there is no lsm. */
+               hpk.hpk_data_version = data_version;
+
+               /* File could have been stripped during archiving, so we need
+                * to check anyway. */
+               if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) &&
+                   (copy->hc_data_version != data_version)) {
+                       CDEBUG(D_HSM, "File data version mismatched. "
+                             "File content was changed during archiving. "
+                              DFID", start:"LPX64" current:"LPX64"\n",
+                              PFID(&copy->hc_hai.hai_fid),
+                              copy->hc_data_version, data_version);
+                       /* The file was changed, send an error to the cdt. Do
+                        * not ask for a retry, because if a file is modified
+                        * frequently the cdt will loop on retried archive
+                        * requests. The policy engine will ask for a new
+                        * archive later, when the file has not been modified
+                        * for some tunable time. */
+                       /* We do not notify the caller. */
+                       hpk.hpk_flags &= ~HP_FLAG_RETRY;
+                       /* hpk_errval must be >= 0 */
+                       hpk.hpk_errval = EBUSY;
+               }
+
+       }
+
+progress:
+       rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+                          &hpk, NULL);
+
+       RETURN(rc);
+}
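+
+/*
+ * Illustrative copytool flow around the two handlers above (the ioctl names
+ * are assumed from lustre_user.h and error handling is omitted):
+ *
+ *     struct hsm_copy copy = { .hc_hai = hai };          hai from the CDT
+ *
+ *     ioctl(mnt_fd, LL_IOC_HSM_COPY_START, &copy);       ll_ioc_copy_start()
+ *     ... transfer the file data to or from the archive ...
+ *     copy.hc_errval = err;                              0 or a positive errno
+ *     ioctl(mnt_fd, LL_IOC_HSM_COPY_END, &copy);         ll_ioc_copy_end()
+ */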
+
+
+static int copy_and_ioctl(int cmd, struct obd_export *exp, void *data, int len)
+{
+       void *ptr;
+       int rc;
+
+       OBD_ALLOC(ptr, len);
+       if (ptr == NULL)
+               return -ENOMEM;
+       if (copy_from_user(ptr, data, len)) {
+               OBD_FREE(ptr, len);
+               return -EFAULT;
+       }
+       rc = obd_iocontrol(cmd, exp, len, ptr, NULL);
+       OBD_FREE(ptr, len);
+       return rc;
+}
+
+static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl)
+{
+       int cmd = qctl->qc_cmd;
+       int type = qctl->qc_type;
+       int id = qctl->qc_id;
+       int valid = qctl->qc_valid;
+       int rc = 0;
+       ENTRY;
+
+       switch (cmd) {
+       case LUSTRE_Q_INVALIDATE:
+       case LUSTRE_Q_FINVALIDATE:
+       case Q_QUOTAON:
+       case Q_QUOTAOFF:
+       case Q_SETQUOTA:
+       case Q_SETINFO:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+                   sbi->ll_flags & LL_SBI_RMT_CLIENT)
+                       RETURN(-EPERM);
+               break;
+       case Q_GETQUOTA:
+               if (((type == USRQUOTA && current_euid() != id) ||
+                    (type == GRPQUOTA && !in_egroup_p(id))) &&
+                   (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+                    sbi->ll_flags & LL_SBI_RMT_CLIENT))
+                       RETURN(-EPERM);
+               break;
+       case Q_GETINFO:
+               break;
+       default:
+               CERROR("unsupported quotactl op: %#x\n", cmd);
+               RETURN(-ENOTTY);
+       }
+
+       if (valid != QC_GENERAL) {
+               if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+                       RETURN(-EOPNOTSUPP);
+
+               if (cmd == Q_GETINFO)
+                       qctl->qc_cmd = Q_GETOINFO;
+               else if (cmd == Q_GETQUOTA)
+                       qctl->qc_cmd = Q_GETOQUOTA;
+               else
+                       RETURN(-EINVAL);
+
+               switch (valid) {
+               case QC_MDTIDX:
+                       rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+                                          sizeof(*qctl), qctl, NULL);
+                       break;
+               case QC_OSTIDX:
+                       rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
+                                          sizeof(*qctl), qctl, NULL);
+                       break;
+               case QC_UUID:
+                       rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+                                          sizeof(*qctl), qctl, NULL);
+                       if (rc == -EAGAIN)
+                               rc = obd_iocontrol(OBD_IOC_QUOTACTL,
+                                                  sbi->ll_dt_exp,
+                                                  sizeof(*qctl), qctl, NULL);
+                       break;
+               default:
+                       rc = -EINVAL;
+                       break;
+               }
+
+               if (rc)
+                       RETURN(rc);
+
+               qctl->qc_cmd = cmd;
+       } else {
+               struct obd_quotactl *oqctl;
+
+               OBD_ALLOC_PTR(oqctl);
+               if (oqctl == NULL)
+                       RETURN(-ENOMEM);
+
+               QCTL_COPY(oqctl, qctl);
+               rc = obd_quotactl(sbi->ll_md_exp, oqctl);
+               if (rc) {
+                       if (rc != -EALREADY && cmd == Q_QUOTAON) {
+                               oqctl->qc_cmd = Q_QUOTAOFF;
+                               obd_quotactl(sbi->ll_md_exp, oqctl);
+                       }
+                       OBD_FREE_PTR(oqctl);
+                       RETURN(rc);
+               }
+               /* If QIF_SPACE is not set, client should collect the
+                * space usage from OSSs by itself */
+               if (cmd == Q_GETQUOTA &&
+                   !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) &&
+                   !oqctl->qc_dqblk.dqb_curspace) {
+                       struct obd_quotactl *oqctl_tmp;
+
+                       OBD_ALLOC_PTR(oqctl_tmp);
+                       if (oqctl_tmp == NULL)
+                               GOTO(out, rc = -ENOMEM);
+
+                       oqctl_tmp->qc_cmd = Q_GETOQUOTA;
+                       oqctl_tmp->qc_id = oqctl->qc_id;
+                       oqctl_tmp->qc_type = oqctl->qc_type;
+
+                       /* collect space usage from OSTs */
+                       oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+                       rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp);
+                       if (!rc || rc == -EREMOTEIO) {
+                               oqctl->qc_dqblk.dqb_curspace =
+                                       oqctl_tmp->qc_dqblk.dqb_curspace;
+                               oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
+                       }
+
+                       /* collect space & inode usage from MDTs */
+                       oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+                       oqctl_tmp->qc_dqblk.dqb_curinodes = 0;
+                       rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp);
+                       if (!rc || rc == -EREMOTEIO) {
+                               oqctl->qc_dqblk.dqb_curspace +=
+                                       oqctl_tmp->qc_dqblk.dqb_curspace;
+                               oqctl->qc_dqblk.dqb_curinodes =
+                                       oqctl_tmp->qc_dqblk.dqb_curinodes;
+                               oqctl->qc_dqblk.dqb_valid |= QIF_INODES;
+                       } else {
+                               oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE;
+                       }
+
+                       OBD_FREE_PTR(oqctl_tmp);
+               }
+out:
+               QCTL_COPY(qctl, oqctl);
+               OBD_FREE_PTR(oqctl);
+       }
+
+       RETURN(rc);
+}
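+
+/*
+ * Illustrative request as quotactl_ioctl() expects it (the ioctl number that
+ * reaches it is assumed to be LL_IOC_QUOTACTL; the field names are those used
+ * above).  With QC_GENERAL the usage is aggregated from the MDTs and OSTs as
+ * shown above; QC_MDTIDX, QC_OSTIDX and QC_UUID query a single target.
+ *
+ *     struct if_quotactl qctl = {
+ *             .qc_cmd   = Q_GETQUOTA,
+ *             .qc_type  = USRQUOTA,
+ *             .qc_id    = 1000,
+ *             .qc_valid = QC_GENERAL,
+ *     };
+ */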
+
+static char *
+ll_getname(const char __user *filename)
+{
+       int ret = 0, len;
+       char *tmp = __getname();
+
+       if (!tmp)
+               return ERR_PTR(-ENOMEM);
+
+       len = strncpy_from_user(tmp, filename, PATH_MAX);
+       if (len < 0)
+               ret = len;
+       else if (len == 0)
+               ret = -ENOENT;
+       else if (len >= PATH_MAX)
+               ret = -ENAMETOOLONG;
+
+       if (ret) {
+               __putname(tmp);
+               tmp =  ERR_PTR(ret);
+       }
+       return tmp;
+}
+
+#define ll_putname(filename) __putname(filename)
+
+static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct obd_ioctl_data *data;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
+              inode->i_ino, inode->i_generation, inode, cmd);
+
+       /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
+       if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+               return -ENOTTY;
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+       switch (cmd) {
+       case FSFILT_IOC_GETFLAGS:
+       case FSFILT_IOC_SETFLAGS:
+               RETURN(ll_iocontrol(inode, file, cmd, arg));
+       case FSFILT_IOC_GETVERSION_OLD:
+       case FSFILT_IOC_GETVERSION:
+               RETURN(put_user(inode->i_generation, (int *)arg));
+       /* We need to special case any other ioctls we want to handle,
+        * to send them to the MDS/OST as appropriate and to properly
+        * network encode the arg field.
+       case FSFILT_IOC_SETVERSION_OLD:
+       case FSFILT_IOC_SETVERSION:
+       */
+       case LL_IOC_GET_MDTIDX: {
+               int mdtidx;
+
+               mdtidx = ll_get_mdt_idx(inode);
+               if (mdtidx < 0)
+                       RETURN(mdtidx);
+
+               if (put_user((int)mdtidx, (int*)arg))
+                       RETURN(-EFAULT);
+
+               return 0;
+       }
+       case IOC_MDC_LOOKUP: {
+               struct ptlrpc_request *request = NULL;
+               int namelen, len = 0;
+               char *buf = NULL;
+               char *filename;
+               struct md_op_data *op_data;
+
+               rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+               if (rc)
+                       RETURN(rc);
+               data = (void *)buf;
+
+               filename = data->ioc_inlbuf1;
+               namelen = strlen(filename);
+
+               if (namelen < 1) {
+                       CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
+                       GOTO(out_free, rc = -EINVAL);
+               }
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen,
+                                            0, LUSTRE_OPC_ANY, NULL);
+               if (IS_ERR(op_data))
+                       GOTO(out_free, rc = PTR_ERR(op_data));
+
+               op_data->op_valid = OBD_MD_FLID;
+               rc = md_getattr_name(sbi->ll_md_exp, op_data, &request);
+               ll_finish_md_op_data(op_data);
+               if (rc < 0) {
+                       CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+                       GOTO(out_free, rc);
+               }
+               ptlrpc_req_finished(request);
+               EXIT;
+out_free:
+               obd_ioctl_freedata(buf, len);
+               return rc;
+       }
+       case LL_IOC_LMV_SETSTRIPE: {
+               struct lmv_user_md  *lum;
+               char            *buf = NULL;
+               char            *filename;
+               int              namelen = 0;
+               int              lumlen = 0;
+               int              len;
+               int              rc;
+
+               rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+               if (rc)
+                       RETURN(rc);
+
+               data = (void *)buf;
+               if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
+                   data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0)
+                       GOTO(lmv_out_free, rc = -EINVAL);
+
+               filename = data->ioc_inlbuf1;
+               namelen = data->ioc_inllen1;
+
+               if (namelen < 1) {
+                       CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
+                       GOTO(lmv_out_free, rc = -EINVAL);
+               }
+               lum = (struct lmv_user_md *)data->ioc_inlbuf2;
+               lumlen = data->ioc_inllen2;
+
+               if (lum->lum_magic != LMV_USER_MAGIC ||
+                   lumlen != sizeof(*lum)) {
+                       CERROR("%s: wrong lum magic %x or size %d: rc = %d\n",
+                              filename, lum->lum_magic, lumlen, -EINVAL);
+                       GOTO(lmv_out_free, rc = -EINVAL);
+               }
+
+               /**
+                * ll_dir_setdirstripe will be used to set dir stripe
+                *  mdc_create--->mdt_reint_create (with dirstripe)
+                */
+               rc = ll_dir_setdirstripe(inode, lum, filename);
+lmv_out_free:
+               obd_ioctl_freedata(buf, len);
+               RETURN(rc);
+
+       }
+       case LL_IOC_LOV_SETSTRIPE: {
+               struct lov_user_md_v3 lumv3;
+               struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+               struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+               struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+
+               int set_default = 0;
+
+               LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
+               LASSERT(sizeof(lumv3.lmm_objects[0]) ==
+                       sizeof(lumv3p->lmm_objects[0]));
+               /* first try with v1 which is smaller than v3 */
+               if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
+                       RETURN(-EFAULT);
+
+               if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+                       if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
+                               RETURN(-EFAULT);
+               }
+
+               if (inode->i_sb->s_root == file->f_dentry)
+                       set_default = 1;
+
+               /* in v1 and v3 cases lumv1 points to data */
+               rc = ll_dir_setstripe(inode, lumv1, set_default);
+
+               RETURN(rc);
+       }
+       case LL_IOC_LMV_GETSTRIPE: {
+               struct lmv_user_md *lump = (struct lmv_user_md *)arg;
+               struct lmv_user_md lum;
+               struct lmv_user_md *tmp;
+               int lum_size;
+               int rc = 0;
+               int mdtindex;
+
+               if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md)))
+                       RETURN(-EFAULT);
+
+               if (lum.lum_magic != LMV_MAGIC_V1)
+                       RETURN(-EINVAL);
+
+               lum_size = lmv_user_md_size(1, LMV_MAGIC_V1);
+               OBD_ALLOC(tmp, lum_size);
+               if (tmp == NULL)
+                       GOTO(free_lmv, rc = -ENOMEM);
+
+               memcpy(tmp, &lum, sizeof(lum));
+               tmp->lum_type = LMV_STRIPE_TYPE;
+               tmp->lum_stripe_count = 1;
+               mdtindex = ll_get_mdt_idx(inode);
+               if (mdtindex < 0)
+                       GOTO(free_lmv, rc = mdtindex);
+
+               tmp->lum_stripe_offset = mdtindex;
+               tmp->lum_objects[0].lum_mds = mdtindex;
+               memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode),
+                      sizeof(struct lu_fid));
+               if (copy_to_user((void *)arg, tmp, lum_size))
+                       GOTO(free_lmv, rc = -EFAULT);
+free_lmv:
+               if (tmp)
+                       OBD_FREE(tmp, lum_size);
+               RETURN(rc);
+       }
+       case LL_IOC_REMOVE_ENTRY: {
+               char            *filename = NULL;
+               int              namelen = 0;
+               int              rc;
+
+               /* Here is a little hack to avoid sending REINT_RMENTRY to an
+                * unsupported server, which might crash it (LU-2730).
+                * Because both LVB_TYPE and REINT_RMENTRY are supported
+                * starting with 2.4, we use OBD_CONNECT_LVB_TYPE to detect
+                * whether the server supports REINT_RMENTRY. XXX */
+               if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE))
+                       return -ENOTSUPP;
+
+               filename = ll_getname((const char *)arg);
+               if (IS_ERR(filename))
+                       RETURN(PTR_ERR(filename));
+
+               namelen = strlen(filename);
+               if (namelen < 1)
+                       GOTO(out_rmdir, rc = -EINVAL);
+
+               rc = ll_rmdir_entry(inode, filename, namelen);
+out_rmdir:
+               if (filename)
+                       ll_putname(filename);
+               RETURN(rc);
+       }
+       case LL_IOC_LOV_SWAP_LAYOUTS:
+               RETURN(-EPERM);
+       case LL_IOC_OBD_STATFS:
+               RETURN(ll_obd_statfs(inode, (void *)arg));
+       case LL_IOC_LOV_GETSTRIPE:
+       case LL_IOC_MDC_GETINFO:
+       case IOC_MDC_GETFILEINFO:
+       case IOC_MDC_GETFILESTRIPE: {
+               struct ptlrpc_request *request = NULL;
+               struct lov_user_md *lump;
+               struct lov_mds_md *lmm = NULL;
+               struct mdt_body *body;
+               char *filename = NULL;
+               int lmmsize;
+
+               if (cmd == IOC_MDC_GETFILEINFO ||
+                   cmd == IOC_MDC_GETFILESTRIPE) {
+                       filename = ll_getname((const char *)arg);
+                       if (IS_ERR(filename))
+                               RETURN(PTR_ERR(filename));
+
+                       rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
+                                                     &lmmsize, &request);
+               } else {
+                       rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+               }
+
+               if (request) {
+                       body = req_capsule_server_get(&request->rq_pill,
+                                                     &RMF_MDT_BODY);
+                       LASSERT(body != NULL);
+               } else {
+                       GOTO(out_req, rc);
+               }
+
+               if (rc < 0) {
+                       if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
+                                              cmd == LL_IOC_MDC_GETINFO))
+                               GOTO(skip_lmm, rc = 0);
+                       else
+                               GOTO(out_req, rc);
+               }
+
+               if (cmd == IOC_MDC_GETFILESTRIPE ||
+                   cmd == LL_IOC_LOV_GETSTRIPE) {
+                       lump = (struct lov_user_md *)arg;
+               } else {
+                       struct lov_user_mds_data *lmdp;
+                       lmdp = (struct lov_user_mds_data *)arg;
+                       lump = &lmdp->lmd_lmm;
+               }
+               if (copy_to_user(lump, lmm, lmmsize)) {
+                       if (copy_to_user(lump, lmm, sizeof(*lump)))
+                               GOTO(out_req, rc = -EFAULT);
+                       rc = -EOVERFLOW;
+               }
+       skip_lmm:
+               if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
+                       struct lov_user_mds_data *lmdp;
+                       lstat_t st = { 0 };
+
+                       st.st_dev     = inode->i_sb->s_dev;
+                       st.st_mode    = body->mode;
+                       st.st_nlink   = body->nlink;
+                       st.st_uid     = body->uid;
+                       st.st_gid     = body->gid;
+                       st.st_rdev    = body->rdev;
+                       st.st_size    = body->size;
+                       st.st_blksize = PAGE_CACHE_SIZE;
+                       st.st_blocks  = body->blocks;
+                       st.st_atime   = body->atime;
+                       st.st_mtime   = body->mtime;
+                       st.st_ctime   = body->ctime;
+                       st.st_ino     = inode->i_ino;
+
+                       lmdp = (struct lov_user_mds_data *)arg;
+                       if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st)))
+                               GOTO(out_req, rc = -EFAULT);
+               }
+
+               EXIT;
+       out_req:
+               ptlrpc_req_finished(request);
+               if (filename)
+                       ll_putname(filename);
+               return rc;
+       }
+       case IOC_LOV_GETINFO: {
+               struct lov_user_mds_data *lumd;
+               struct lov_stripe_md *lsm;
+               struct lov_user_md *lum;
+               struct lov_mds_md *lmm;
+               int lmmsize;
+               lstat_t st;
+
+               lumd = (struct lov_user_mds_data *)arg;
+               lum = &lumd->lmd_lmm;
+
+               rc = ll_get_max_mdsize(sbi, &lmmsize);
+               if (rc)
+                       RETURN(rc);
+
+               OBD_ALLOC_LARGE(lmm, lmmsize);
+               if (lmm == NULL)
+                       RETURN(-ENOMEM);
+               if (copy_from_user(lmm, lum, lmmsize))
+                       GOTO(free_lmm, rc = -EFAULT);
+
+               switch (lmm->lmm_magic) {
+               case LOV_USER_MAGIC_V1:
+                       if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1))
+                               break;
+                       /* swab objects first so that stripes num will be sane */
+                       lustre_swab_lov_user_md_objects(
+                               ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+                               ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+                       lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                       break;
+               case LOV_USER_MAGIC_V3:
+                       if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3))
+                               break;
+                       /* swab objects first so that stripes num will be sane */
+                       lustre_swab_lov_user_md_objects(
+                               ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+                               ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+                       lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                       break;
+               default:
+                       GOTO(free_lmm, rc = -EINVAL);
+               }
+
+               rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
+               if (rc < 0)
+                       GOTO(free_lmm, rc = -ENOMEM);
+
+               /* Perform glimpse_size operation. */
+               memset(&st, 0, sizeof(st));
+
+               rc = ll_glimpse_ioctl(sbi, lsm, &st);
+               if (rc)
+                       GOTO(free_lsm, rc);
+
+               if (copy_to_user(&lumd->lmd_st, &st, sizeof(st)))
+                       GOTO(free_lsm, rc = -EFAULT);
+
+               EXIT;
+       free_lsm:
+               obd_free_memmd(sbi->ll_dt_exp, &lsm);
+       free_lmm:
+               OBD_FREE_LARGE(lmm, lmmsize);
+               return rc;
+       }
+       case OBD_IOC_LLOG_CATINFO: {
+               RETURN(-EOPNOTSUPP);
+       }
+       case OBD_IOC_QUOTACHECK: {
+               struct obd_quotactl *oqctl;
+               int error = 0;
+
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+                   sbi->ll_flags & LL_SBI_RMT_CLIENT)
+                       RETURN(-EPERM);
+
+               OBD_ALLOC_PTR(oqctl);
+               if (!oqctl)
+                       RETURN(-ENOMEM);
+               oqctl->qc_type = arg;
+               rc = obd_quotacheck(sbi->ll_md_exp, oqctl);
+               if (rc < 0) {
+                       CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc);
+                       error = rc;
+               }
+
+               rc = obd_quotacheck(sbi->ll_dt_exp, oqctl);
+               if (rc < 0)
+                       CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc);
+
+               OBD_FREE_PTR(oqctl);
+               return error ?: rc;
+       }
+       case OBD_IOC_POLL_QUOTACHECK: {
+               struct if_quotacheck *check;
+
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+                   sbi->ll_flags & LL_SBI_RMT_CLIENT)
+                       RETURN(-EPERM);
+
+               OBD_ALLOC_PTR(check);
+               if (!check)
+                       RETURN(-ENOMEM);
+
+               rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check,
+                                  NULL);
+               if (rc) {
+                       CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
+                       if (copy_to_user((void *)arg, check,
+                                            sizeof(*check)))
+                               CDEBUG(D_QUOTA, "copy_to_user failed\n");
+                       GOTO(out_poll, rc);
+               }
+
+               rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check,
+                                  NULL);
+               if (rc) {
+                       CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
+                       if (copy_to_user((void *)arg, check,
+                                            sizeof(*check)))
+                               CDEBUG(D_QUOTA, "copy_to_user failed\n");
+                       GOTO(out_poll, rc);
+               }
+       out_poll:
+               OBD_FREE_PTR(check);
+               RETURN(rc);
+       }
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+       case LL_IOC_QUOTACTL_18: {
+               /* copy the old 1.x quota struct for internal use, then copy
+                * back into old format struct.  For 1.8 compatibility. */
+               struct if_quotactl_18 *qctl_18;
+               struct if_quotactl *qctl_20;
+
+               OBD_ALLOC_PTR(qctl_18);
+               if (!qctl_18)
+                       RETURN(-ENOMEM);
+
+               OBD_ALLOC_PTR(qctl_20);
+               if (!qctl_20)
+                       GOTO(out_quotactl_18, rc = -ENOMEM);
+
+               if (copy_from_user(qctl_18, (void *)arg, sizeof(*qctl_18)))
+                       GOTO(out_quotactl_20, rc = -EFAULT);
+
+               QCTL_COPY(qctl_20, qctl_18);
+               qctl_20->qc_idx = 0;
+
+               /* XXX: dqb_valid was borrowed as a flag to mark that
+                *      only mds quota is wanted */
+               if (qctl_18->qc_cmd == Q_GETQUOTA &&
+                   qctl_18->qc_dqblk.dqb_valid) {
+                       qctl_20->qc_valid = QC_MDTIDX;
+                       qctl_20->qc_dqblk.dqb_valid = 0;
+               } else if (qctl_18->obd_uuid.uuid[0] != '\0') {
+                       qctl_20->qc_valid = QC_UUID;
+                       qctl_20->obd_uuid = qctl_18->obd_uuid;
+               } else {
+                       qctl_20->qc_valid = QC_GENERAL;
+               }
+
+               rc = quotactl_ioctl(sbi, qctl_20);
+
+               if (rc == 0) {
+                       QCTL_COPY(qctl_18, qctl_20);
+                       qctl_18->obd_uuid = qctl_20->obd_uuid;
+
+                       if (copy_to_user((void *)arg, qctl_18,
+                                            sizeof(*qctl_18)))
+                               rc = -EFAULT;
+               }
+
+       out_quotactl_20:
+               OBD_FREE_PTR(qctl_20);
+       out_quotactl_18:
+               OBD_FREE_PTR(qctl_18);
+               RETURN(rc);
+       }
+#else
+#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+       case LL_IOC_QUOTACTL: {
+               struct if_quotactl *qctl;
+
+               OBD_ALLOC_PTR(qctl);
+               if (!qctl)
+                       RETURN(-ENOMEM);
+
+               if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
+                       GOTO(out_quotactl, rc = -EFAULT);
+
+               rc = quotactl_ioctl(sbi, qctl);
+
+               if (rc == 0 && copy_to_user((void *)arg, qctl, sizeof(*qctl)))
+                       rc = -EFAULT;
+
+       out_quotactl:
+               OBD_FREE_PTR(qctl);
+               RETURN(rc);
+       }
+       case OBD_IOC_GETDTNAME:
+       case OBD_IOC_GETMDNAME:
+               RETURN(ll_get_obd_name(inode, cmd, arg));
+       case LL_IOC_FLUSHCTX:
+               RETURN(ll_flush_ctx(inode));
+#ifdef CONFIG_FS_POSIX_ACL
+       case LL_IOC_RMTACL: {
+               if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+                   inode == inode->i_sb->s_root->d_inode) {
+                       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+                       LASSERT(fd != NULL);
+                       rc = rct_add(&sbi->ll_rct, current_pid(), arg);
+                       if (!rc)
+                               fd->fd_flags |= LL_FILE_RMTACL;
+                       RETURN(rc);
+               } else {
+                       RETURN(0);
+               }
+       }
+#endif
+       case LL_IOC_GETOBDCOUNT: {
+               int count, vallen;
+               struct obd_export *exp;
+
+               if (copy_from_user(&count, (int *)arg, sizeof(int)))
+                       RETURN(-EFAULT);
+
+               /* get ost count when count is zero, get mdt count otherwise */
+               exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp;
+               vallen = sizeof(count);
+               rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT),
+                                 KEY_TGT_COUNT, &vallen, &count, NULL);
+               if (rc) {
+                       CERROR("get target count failed: %d\n", rc);
+                       RETURN(rc);
+               }
+
+               if (copy_to_user((int *)arg, &count, sizeof(int)))
+                       RETURN(-EFAULT);
+
+               RETURN(0);
+       }
+       case LL_IOC_PATH2FID:
+               if (copy_to_user((void *)arg, ll_inode2fid(inode),
+                                    sizeof(struct lu_fid)))
+                       RETURN(-EFAULT);
+               RETURN(0);
+       case LL_IOC_GET_CONNECT_FLAGS: {
+               RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void*)arg));
+       }
+       case OBD_IOC_CHANGELOG_SEND:
+       case OBD_IOC_CHANGELOG_CLEAR:
+               rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+                                   sizeof(struct ioc_changelog));
+               RETURN(rc);
+       case OBD_IOC_FID2PATH:
+               RETURN(ll_fid2path(inode, (void *)arg));
+       case LL_IOC_HSM_REQUEST: {
+               struct hsm_user_request *hur;
+               int                      totalsize;
+
+               OBD_ALLOC_PTR(hur);
+               if (hur == NULL)
+                       RETURN(-ENOMEM);
+
+               /* We don't know the true size yet; copy the fixed-size part */
+               if (copy_from_user(hur, (void *)arg, sizeof(*hur))) {
+                       OBD_FREE_PTR(hur);
+                       RETURN(-EFAULT);
+               }
+
+               /* Compute the whole struct size */
+               totalsize = hur_len(hur);
+               OBD_FREE_PTR(hur);
+               OBD_ALLOC_LARGE(hur, totalsize);
+               if (hur == NULL)
+                       RETURN(-ENOMEM);
+
+               /* Copy the whole struct */
+               if (copy_from_user(hur, (void *)arg, totalsize)) {
+                       OBD_FREE_LARGE(hur, totalsize);
+                       RETURN(-EFAULT);
+               }
+
+               rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize,
+                                  hur, NULL);
+
+               OBD_FREE_LARGE(hur, totalsize);
+
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_PROGRESS: {
+               struct hsm_progress_kernel      hpk;
+               struct hsm_progress             hp;
+
+               if (copy_from_user(&hp, (void *)arg, sizeof(hp)))
+                       RETURN(-EFAULT);
+
+               hpk.hpk_fid = hp.hp_fid;
+               hpk.hpk_cookie = hp.hp_cookie;
+               hpk.hpk_extent = hp.hp_extent;
+               hpk.hpk_flags = hp.hp_flags;
+               hpk.hpk_errval = hp.hp_errval;
+               hpk.hpk_data_version = 0;
+
+               /* File may not exist in Lustre; all progress
+                * reported to Lustre root */
+               rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk,
+                                  NULL);
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_CT_START:
+               rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+                                   sizeof(struct lustre_kernelcomm));
+               RETURN(rc);
+
+       case LL_IOC_HSM_COPY_START: {
+               struct hsm_copy *copy;
+               int              rc;
+
+               OBD_ALLOC_PTR(copy);
+               if (copy == NULL)
+                       RETURN(-ENOMEM);
+               if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+                       OBD_FREE_PTR(copy);
+                       RETURN(-EFAULT);
+               }
+
+               rc = ll_ioc_copy_start(inode->i_sb, copy);
+               if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+                       rc = -EFAULT;
+
+               OBD_FREE_PTR(copy);
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_COPY_END: {
+               struct hsm_copy *copy;
+               int              rc;
+
+               OBD_ALLOC_PTR(copy);
+               if (copy == NULL)
+                       RETURN(-ENOMEM);
+               if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+                       OBD_FREE_PTR(copy);
+                       RETURN(-EFAULT);
+               }
+
+               rc = ll_ioc_copy_end(inode->i_sb, copy);
+               if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+                       rc = -EFAULT;
+
+               OBD_FREE_PTR(copy);
+               RETURN(rc);
+       }
+       default:
+               RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL,
+                                    (void *)arg));
+       }
+}
+
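+/*
+ * llseek for directories: accept SEEK_SET/SEEK_CUR/SEEK_END only, clamp the
+ * result to the 32-bit or 64-bit end-of-directory marker depending on
+ * whether the caller needs the 32-bit API, and keep fd->lfd_pos (the hash
+ * cookie used by readdir) in sync with file->f_pos.
+ */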
+static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_mapping->host;
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       int api32 = ll_need_32bit_api(sbi);
+       loff_t ret = -EINVAL;
+       ENTRY;
+
+       mutex_lock(&inode->i_mutex);
+       switch (origin) {
+               case SEEK_SET:
+                       break;
+               case SEEK_CUR:
+                       offset += file->f_pos;
+                       break;
+               case SEEK_END:
+                       if (offset > 0)
+                               GOTO(out, ret);
+                       if (api32)
+                               offset += LL_DIR_END_OFF_32BIT;
+                       else
+                               offset += LL_DIR_END_OFF;
+                       break;
+               default:
+                       GOTO(out, ret);
+       }
+
+       if (offset >= 0 &&
+           ((api32 && offset <= LL_DIR_END_OFF_32BIT) ||
+            (!api32 && offset <= LL_DIR_END_OFF))) {
+               if (offset != file->f_pos) {
+                       if ((api32 && offset == LL_DIR_END_OFF_32BIT) ||
+                           (!api32 && offset == LL_DIR_END_OFF))
+                               fd->lfd_pos = MDS_DIR_END_OFF;
+                       else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH)
+                               fd->lfd_pos = offset << 32;
+                       else
+                               fd->lfd_pos = offset;
+                       file->f_pos = offset;
+                       file->f_version = 0;
+               }
+               ret = offset;
+       }
+       GOTO(out, ret);
+
+out:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
+}
+
+int ll_dir_open(struct inode *inode, struct file *file)
+{
+       ENTRY;
+       RETURN(ll_file_open(inode, file));
+}
+
+int ll_dir_release(struct inode *inode, struct file *file)
+{
+       ENTRY;
+       RETURN(ll_file_release(inode, file));
+}
+
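+/* Directory file operations: directory-aware llseek, ll_readdir and the
+ * llite ioctl multiplexer above. */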
+struct file_operations ll_dir_operations = {
+       .llseek   = ll_dir_seek,
+       .open     = ll_dir_open,
+       .release  = ll_dir_release,
+       .read     = generic_read_dir,
+       .readdir  = ll_readdir,
+       .unlocked_ioctl   = ll_dir_ioctl,
+       .fsync    = ll_fsync,
+};
diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
new file mode 100644 (file)
index 0000000..d423de1
--- /dev/null
@@ -0,0 +1,3196 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/file.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <lustre_dlm.h>
+#include <lustre_lite.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include "llite_internal.h"
+#include <lustre/ll_fiemap.h>
+
+#include "cl_object.h"
+
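+/* Allocate the per-open file data from its slab cache; returns NULL if the
+ * allocation fails. */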
+struct ll_file_data *ll_file_data_get(void)
+{
+       struct ll_file_data *fd;
+
+       OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
+       if (fd == NULL)
+               return NULL;
+       fd->fd_write_failed = false;
+       return fd;
+}
+
+static void ll_file_data_put(struct ll_file_data *fd)
+{
+       if (fd != NULL)
+               OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
+}
+
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+                         struct lustre_handle *fh)
+{
+       op_data->op_fid1 = ll_i2info(inode)->lli_fid;
+       op_data->op_attr.ia_mode = inode->i_mode;
+       op_data->op_attr.ia_atime = inode->i_atime;
+       op_data->op_attr.ia_mtime = inode->i_mtime;
+       op_data->op_attr.ia_ctime = inode->i_ctime;
+       op_data->op_attr.ia_size = i_size_read(inode);
+       op_data->op_attr_blocks = inode->i_blocks;
+       ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
+                                       ll_inode_to_ext_flags(inode->i_flags);
+       op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
+       if (fh)
+               op_data->op_handle = *fh;
+       op_data->op_capa1 = ll_mdscapa_get(inode);
+
+       if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
+               op_data->op_bias |= MDS_DATA_MODIFIED;
+}
+
+/**
+ * Closes the IO epoch and packs all the attributes into @op_data for
+ * the CLOSE rpc.
+ */
+static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
+                            struct obd_client_handle *och)
+{
+       ENTRY;
+
+       op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
+                                ATTR_MTIME_SET | ATTR_CTIME_SET;
+
+       if (!(och->och_flags & FMODE_WRITE))
+               goto out;
+
+       if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
+               op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+       else
+               ll_ioepoch_close(inode, op_data, &och, 0);
+
+out:
+       ll_pack_inode2opdata(inode, op_data, &och->och_fh);
+       ll_prep_md_op_data(op_data, inode, NULL, NULL,
+                          0, 0, LUSTRE_OPC_ANY, NULL);
+       EXIT;
+}
+
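+/*
+ * Close an MDS open handle: pack the close attributes, send the close RPC
+ * and, on -EAGAIN, send the Size-on-MDS update via ll_som_update().  @och is
+ * freed here unless it still has to wait for DONE_WRITING.
+ */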
+static int ll_close_inode_openhandle(struct obd_export *md_exp,
+                                    struct inode *inode,
+                                    struct obd_client_handle *och)
+{
+       struct obd_export *exp = ll_i2mdexp(inode);
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req = NULL;
+       struct obd_device *obd = class_exp2obd(exp);
+       int epoch_close = 1;
+       int rc;
+       ENTRY;
+
+       if (obd == NULL) {
+               /*
+                * XXX: in case of LMV, is this correct to access
+                * ->exp_handle?
+                */
+               CERROR("Invalid MDC connection handle "LPX64"\n",
+                      ll_i2mdexp(inode)->exp_handle.h_cookie);
+               GOTO(out, rc = 0);
+       }
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               GOTO(out, rc = -ENOMEM); /* XXX We leak openhandle and request here. */
+
+       ll_prepare_close(inode, op_data, och);
+       epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
+       rc = md_close(md_exp, op_data, och->och_mod, &req);
+       if (rc == -EAGAIN) {
+               /* This close must have the epoch closed. */
+               LASSERT(epoch_close);
+               /* MDS has instructed us to obtain Size-on-MDS attribute from
+                * OSTs and send setattr back to MDS. */
+               rc = ll_som_update(inode, op_data);
+               if (rc) {
+                       CERROR("inode %lu mdc Size-on-MDS update failed: "
+                              "rc = %d\n", inode->i_ino, rc);
+                       rc = 0;
+               }
+       } else if (rc) {
+               CERROR("inode %lu mdc close failed: rc = %d\n",
+                      inode->i_ino, rc);
+       }
+
+       /* DATA_MODIFIED flag was successfully sent on close, cancel data
+        * modification flag. */
+       if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+               struct ll_inode_info *lli = ll_i2info(inode);
+
+               spin_lock(&lli->lli_lock);
+               lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       ll_finish_md_op_data(op_data);
+
+       if (rc == 0) {
+               rc = ll_objects_destroy(req, inode);
+               if (rc)
+                       CERROR("inode %lu ll_objects destroy: rc = %d\n",
+                              inode->i_ino, rc);
+       }
+
+       EXIT;
+out:
+
+       if (exp_connect_som(exp) && !epoch_close &&
+           S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
+               ll_queue_done_writing(inode, LLIF_DONE_WRITING);
+       } else {
+               md_clear_open_replay_data(md_exp, och);
+               /* Free @och if it is not waiting for DONE_WRITING. */
+               och->och_fh.cookie = DEAD_HANDLE_MAGIC;
+               OBD_FREE_PTR(och);
+       }
+       if (req) /* This is close request */
+               ptlrpc_req_finished(req);
+       return rc;
+}
+
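+/*
+ * Drop the cached MDS open handle matching @flags (read, write or exec) once
+ * no file descriptors use it any more, closing it on the MDS if we are the
+ * ones who cleared the pointer.
+ */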
+int ll_md_real_close(struct inode *inode, int flags)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_client_handle **och_p;
+       struct obd_client_handle *och;
+       __u64 *och_usecount;
+       int rc = 0;
+       ENTRY;
+
+       if (flags & FMODE_WRITE) {
+               och_p = &lli->lli_mds_write_och;
+               och_usecount = &lli->lli_open_fd_write_count;
+       } else if (flags & FMODE_EXEC) {
+               och_p = &lli->lli_mds_exec_och;
+               och_usecount = &lli->lli_open_fd_exec_count;
+       } else {
+               LASSERT(flags & FMODE_READ);
+               och_p = &lli->lli_mds_read_och;
+               och_usecount = &lli->lli_open_fd_read_count;
+       }
+
+       mutex_lock(&lli->lli_och_mutex);
+       if (*och_usecount) { /* There are still users of this handle, so
+                               skip freeing it. */
+               mutex_unlock(&lli->lli_och_mutex);
+               RETURN(0);
+       }
+       och = *och_p;
+       *och_p = NULL;
+       mutex_unlock(&lli->lli_och_mutex);
+
+       if (och) { /* There might be a race and somebody might have freed
+                   * this och already. */
+               rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+                                              inode, och);
+       }
+
+       RETURN(rc);
+}
+
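+/*
+ * Per-descriptor close: drop any group lock, decrement the open count for
+ * the matching open mode and, unless an OPEN DLM lock still covers the file,
+ * do the real MDS close via ll_md_real_close().
+ */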
+int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+               struct file *file)
+{
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int rc = 0;
+       ENTRY;
+
+       /* clear group lock, if present */
+       if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
+               ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
+
+       /* Let's see if we have a good enough OPEN lock on the file and
+        * whether we can skip talking to the MDS. */
+       if (file->f_dentry->d_inode) { /* Can this ever be false? */
+               int lockmode;
+               int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
+               struct lustre_handle lockh;
+               struct inode *inode = file->f_dentry->d_inode;
+               ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
+
+               mutex_lock(&lli->lli_och_mutex);
+               if (fd->fd_omode & FMODE_WRITE) {
+                       lockmode = LCK_CW;
+                       LASSERT(lli->lli_open_fd_write_count);
+                       lli->lli_open_fd_write_count--;
+               } else if (fd->fd_omode & FMODE_EXEC) {
+                       lockmode = LCK_PR;
+                       LASSERT(lli->lli_open_fd_exec_count);
+                       lli->lli_open_fd_exec_count--;
+               } else {
+                       lockmode = LCK_CR;
+                       LASSERT(lli->lli_open_fd_read_count);
+                       lli->lli_open_fd_read_count--;
+               }
+               mutex_unlock(&lli->lli_och_mutex);
+
+               if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
+                                  LDLM_IBITS, &policy, lockmode,
+                                  &lockh)) {
+                       rc = ll_md_real_close(file->f_dentry->d_inode,
+                                             fd->fd_omode);
+               }
+       } else {
+               CERROR("Releasing a file %p with negative dentry %p. Name %s\n",
+                      file, file->f_dentry, file->f_dentry->d_name.name);
+       }
+
+       LUSTRE_FPRIVATE(file) = NULL;
+       ll_file_data_put(fd);
+       ll_capa_close(inode);
+
+       RETURN(rc);
+}
+
+/* While this returns an error code, the caller fput() ignores it, so we need
+ * to make every effort to clean up all of our state here.  Also, applications
+ * rarely check close errors, and even if an error is returned they will not
+ * retry the close call.
+ */
+int ll_file_release(struct inode *inode, struct file *file)
+{
+       struct ll_file_data *fd;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+              inode->i_generation, inode);
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+           inode == inode->i_sb->s_root->d_inode) {
+               struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+               LASSERT(fd != NULL);
+               if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
+                       fd->fd_flags &= ~LL_FILE_RMTACL;
+                       rct_del(&sbi->ll_rct, current_pid());
+                       et_search_free(&sbi->ll_et, current_pid());
+               }
+       }
+#endif
+
+       if (inode->i_sb->s_root != file->f_dentry)
+               ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
+       fd = LUSTRE_FPRIVATE(file);
+       LASSERT(fd != NULL);
+
+       /* The last ref on @file may not belong to the pid that started the
+        * statahead.  Different processes can open the same dir;
+        * "ll_opendir_key" marks the opener that should stop the statahead
+        * thread. */
+       if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
+           lli->lli_opendir_pid != 0)
+               ll_stop_statahead(inode, lli->lli_opendir_key);
+
+       if (inode->i_sb->s_root == file->f_dentry) {
+               LUSTRE_FPRIVATE(file) = NULL;
+               ll_file_data_put(fd);
+               RETURN(0);
+       }
+
+       if (!S_ISDIR(inode->i_mode)) {
+               lov_read_and_clear_async_rc(lli->lli_clob);
+               lli->lli_async_rc = 0;
+       }
+
+       rc = ll_md_close(sbi->ll_md_exp, inode, file);
+
+       if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
+               libcfs_debug_dumplog();
+
+       RETURN(rc);
+}
+
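+/*
+ * Send an explicit open intent to the MDS (mainly for the NFS export path),
+ * asking for an OPEN lock unless we are only setting stripe parameters, and
+ * install the returned lock data on the inode.
+ */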
+static int ll_intent_file_open(struct file *file, void *lmm,
+                              int lmmsize, struct lookup_intent *itp)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
+       struct dentry *parent = file->f_dentry->d_parent;
+       const char *name = file->f_dentry->d_name.name;
+       const int len = file->f_dentry->d_name.len;
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req;
+       __u32 opc = LUSTRE_OPC_ANY;
+       int rc;
+       ENTRY;
+
+       if (!parent)
+               RETURN(-ENOENT);
+
+       /* Usually we come here only for NFSD, and we want the open lock.
+        * But we can also get here with pre-2.6.15 patchless kernels, and in
+        * that case that lock is also OK. */
+       /* We can also get here if there was a cached open handle in
+        * revalidate_it but it disappeared while we were getting from there
+        * to ll_file_open.  That means this file was closed and immediately
+        * reopened, which makes it a good candidate for using the OPEN lock. */
+       /* If lmmsize & lmm are not 0, we are just setting stripe info
+        * parameters.  No need for the open lock. */
+       if (lmm == NULL && lmmsize == 0) {
+               itp->it_flags |= MDS_OPEN_LOCK;
+               if (itp->it_flags & FMODE_WRITE)
+                       opc = LUSTRE_OPC_CREATE;
+       }
+
+       op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
+                                     file->f_dentry->d_inode, name, len,
+                                     O_RDWR, opc, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       itp->it_flags |= MDS_OPEN_BY_FID;
+       rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
+                           0 /*unused */, &req, ll_md_blocking_ast, 0);
+       ll_finish_md_op_data(op_data);
+       if (rc == -ESTALE) {
+               /* Keep a separate exit path here so that we don't flood the
+                * log with -ESTALE error messages. */
+               if (!it_disposition(itp, DISP_OPEN_OPEN) ||
+                    it_open_error(DISP_OPEN_OPEN, itp))
+                       GOTO(out, rc);
+               ll_release_openhandle(file->f_dentry, itp);
+               GOTO(out, rc);
+       }
+
+       if (it_disposition(itp, DISP_LOOKUP_NEG))
+               GOTO(out, rc = -ENOENT);
+
+       if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
+               rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
+               CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
+       if (!rc && itp->d.lustre.it_lock_mode)
+               ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
+                                itp, NULL);
+
+out:
+       ptlrpc_req_finished(itp->d.lustre.it_data);
+       it_clear_disposition(itp, DISP_ENQ_COMPLETE);
+       ll_intent_drop_lock(itp);
+
+       RETURN(rc);
+}
+
+/**
+ * Assign an obtained @ioepoch to the client's inode.  No lock is needed: the
+ * MDS does not trust attributes while several ioepoch holders exist, and
+ * attributes for a previous ioepoch are likewise ignored by the MDS once a
+ * new one is opened.
+ */
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
+{
+       if (ioepoch && lli->lli_ioepoch != ioepoch) {
+               lli->lli_ioepoch = ioepoch;
+               CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+                      ioepoch, PFID(&lli->lli_fid));
+       }
+}
+
+static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
+                      struct lookup_intent *it, struct obd_client_handle *och)
+{
+       struct ptlrpc_request *req = it->d.lustre.it_data;
+       struct mdt_body *body;
+
+       LASSERT(och);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);                /* reply already checked out */
+
+       memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
+       och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+       och->och_fid = lli->lli_fid;
+       och->och_flags = it->it_flags;
+       ll_ioepoch_open(lli, body->ioepoch);
+
+       return md_set_open_replay_data(md_exp, och, req);
+}
+
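+/*
+ * Finish an open locally: fill the client open handle from the intent reply
+ * (when one was enqueued), attach the ll_file_data to the file and set up
+ * readahead state.
+ */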
+int ll_local_open(struct file *file, struct lookup_intent *it,
+                 struct ll_file_data *fd, struct obd_client_handle *och)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       ENTRY;
+
+       LASSERT(!LUSTRE_FPRIVATE(file));
+
+       LASSERT(fd != NULL);
+
+       if (och) {
+               struct ptlrpc_request *req = it->d.lustre.it_data;
+               struct mdt_body *body;
+               int rc;
+
+               rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
+               if (rc)
+                       RETURN(rc);
+
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               if ((it->it_flags & FMODE_WRITE) &&
+                   (body->valid & OBD_MD_FLSIZE))
+                       CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+                              lli->lli_ioepoch, PFID(&lli->lli_fid));
+       }
+
+       LUSTRE_FPRIVATE(file) = fd;
+       ll_readahead_init(inode, &fd->fd_ras);
+       fd->fd_omode = it->it_flags;
+       RETURN(0);
+}
+
+/* Open a file, and (for the very first open) create objects on the OSTs at
+ * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
+ * creation or open until ll_lov_setstripe() ioctl is called.
+ *
+ * If we already have the stripe MD locally then we don't request it in
+ * md_open(), by passing a lmm_size = 0.
+ *
+ * It is up to the application to ensure no other processes open this file
+ * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
+ * used.  We might be able to avoid races of that sort by getting lli_open_sem
+ * before returning in the O_LOV_DELAY_CREATE case and dropping it here
+ * or in ll_file_release(), but I'm not sure that is desirable/necessary.
+ */
+int ll_file_open(struct inode *inode, struct file *file)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct lookup_intent *it, oit = { .it_op = IT_OPEN,
+                                         .it_flags = file->f_flags };
+       struct obd_client_handle **och_p = NULL;
+       __u64 *och_usecount = NULL;
+       struct ll_file_data *fd;
+       int rc = 0, opendir_set = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
+              inode->i_generation, inode, file->f_flags);
+
+       it = file->private_data; /* XXX: compat macro */
+       file->private_data = NULL; /* prevent ll_local_open assertion */
+
+       fd = ll_file_data_get();
+       if (fd == NULL)
+               GOTO(out_och_free, rc = -ENOMEM);
+
+       fd->fd_file = file;
+       if (S_ISDIR(inode->i_mode)) {
+               spin_lock(&lli->lli_sa_lock);
+               if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
+                   lli->lli_opendir_pid == 0) {
+                       lli->lli_opendir_key = fd;
+                       lli->lli_opendir_pid = current_pid();
+                       opendir_set = 1;
+               }
+               spin_unlock(&lli->lli_sa_lock);
+       }
+
+       if (inode->i_sb->s_root == file->f_dentry) {
+               LUSTRE_FPRIVATE(file) = fd;
+               RETURN(0);
+       }
+
+       if (!it || !it->d.lustre.it_disposition) {
+               /* Convert f_flags into access mode. We cannot use file->f_mode,
+                * because everything but O_ACCMODE mask was stripped from
+                * there */
+               if ((oit.it_flags + 1) & O_ACCMODE)
+                       oit.it_flags++;
+               if (file->f_flags & O_TRUNC)
+                       oit.it_flags |= FMODE_WRITE;
+
+               /* The kernel only calls f_op->open from dentry_open().
+                * filp_open() calls dentry_open() after open_namei() has
+                * checked permissions.  Only nfsd_open() calls dentry_open()
+                * directly without checking permissions, which is why the
+                * code below is safe. */
+               if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
+                       oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
+
+               /* We do not want O_EXCL here, presumably we opened the file
+                * already? XXX - NFS implications? */
+               oit.it_flags &= ~O_EXCL;
+
+               /* bug 20584: if "it_flags" contains O_CREAT, the file will be
+                * created if necessary, so "IT_CREAT" should be set to stay
+                * consistent with it. */
+               if (oit.it_flags & O_CREAT)
+                       oit.it_op |= IT_CREAT;
+
+               it = &oit;
+       }
+
+restart:
+       /* Let's see if we have file open on MDS already. */
+       if (it->it_flags & FMODE_WRITE) {
+               och_p = &lli->lli_mds_write_och;
+               och_usecount = &lli->lli_open_fd_write_count;
+       } else if (it->it_flags & FMODE_EXEC) {
+               och_p = &lli->lli_mds_exec_och;
+               och_usecount = &lli->lli_open_fd_exec_count;
+       } else {
+               och_p = &lli->lli_mds_read_och;
+               och_usecount = &lli->lli_open_fd_read_count;
+       }
+
+       mutex_lock(&lli->lli_och_mutex);
+       if (*och_p) { /* Open handle is present */
+               if (it_disposition(it, DISP_OPEN_OPEN)) {
+                       /* Well, there's an extra open request that we do not
+                        * need; let's close it somehow.  This will decref the
+                        * request. */
+                       rc = it_open_error(DISP_OPEN_OPEN, it);
+                       if (rc) {
+                               mutex_unlock(&lli->lli_och_mutex);
+                               GOTO(out_openerr, rc);
+                       }
+
+                       ll_release_openhandle(file->f_dentry, it);
+               }
+               (*och_usecount)++;
+
+               rc = ll_local_open(file, it, fd, NULL);
+               if (rc) {
+                       (*och_usecount)--;
+                       mutex_unlock(&lli->lli_och_mutex);
+                       GOTO(out_openerr, rc);
+               }
+       } else {
+               LASSERT(*och_usecount == 0);
+               if (!it->d.lustre.it_disposition) {
+                       /* We cannot just request a lock handle now; the new
+                        * ELC code means that one of the other OPEN locks for
+                        * this file could be cancelled, and since the blocking
+                        * AST handler would attempt to grab och_mutex as well,
+                        * that would result in a deadlock. */
+                       mutex_unlock(&lli->lli_och_mutex);
+                       it->it_create_mode |= M_CHECK_STALE;
+                       rc = ll_intent_file_open(file, NULL, 0, it);
+                       it->it_create_mode &= ~M_CHECK_STALE;
+                       if (rc)
+                               GOTO(out_openerr, rc);
+
+                       goto restart;
+               }
+               OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
+               if (!*och_p)
+                       GOTO(out_och_free, rc = -ENOMEM);
+
+               (*och_usecount)++;
+
+               /* md_intent_lock() didn't get a request ref if there was an
+                * open error, so don't do cleanup on the request here
+                * (bug 3430) */
+               /* XXX (green): Should not we bail out on any error here, not
+                * just open error? */
+               rc = it_open_error(DISP_OPEN_OPEN, it);
+               if (rc)
+                       GOTO(out_och_free, rc);
+
+               LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
+
+               rc = ll_local_open(file, it, fd, *och_p);
+               if (rc)
+                       GOTO(out_och_free, rc);
+       }
+       mutex_unlock(&lli->lli_och_mutex);
+       fd = NULL;
+
+       /* Must do this outside the lli_och_mutex lock to prevent a deadlock
+        * where a different kind of OPEN lock for this same inode gets
+        * cancelled by ldlm_cancel_lru. */
+       if (!S_ISREG(inode->i_mode))
+               GOTO(out_och_free, rc);
+
+       ll_capa_open(inode);
+
+       if (!lli->lli_has_smd) {
+               if (file->f_flags & O_LOV_DELAY_CREATE ||
+                   !(file->f_mode & FMODE_WRITE)) {
+                       CDEBUG(D_INODE, "object creation was delayed\n");
+                       GOTO(out_och_free, rc);
+               }
+       }
+       file->f_flags &= ~O_LOV_DELAY_CREATE;
+       GOTO(out_och_free, rc);
+
+out_och_free:
+       if (rc) {
+               if (och_p && *och_p) {
+                       OBD_FREE(*och_p, sizeof (struct obd_client_handle));
+                       *och_p = NULL; /* OBD_FREE writes some magic there */
+                       (*och_usecount)--;
+               }
+               mutex_unlock(&lli->lli_och_mutex);
+
+out_openerr:
+               if (opendir_set != 0)
+                       ll_stop_statahead(inode, lli->lli_opendir_key);
+               if (fd != NULL)
+                       ll_file_data_put(fd);
+       } else {
+               ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
+       }
+
+       if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
+               ptlrpc_req_finished(it->d.lustre.it_data);
+               it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+       }
+
+       return rc;
+}
+
+/* Fills the obdo with the attributes for the lsm */
+static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
+                         struct obd_capa *capa, struct obdo *obdo,
+                         __u64 ioepoch, int sync)
+{
+       struct ptlrpc_request_set *set;
+       struct obd_info     oinfo = { { { 0 } } };
+       int                     rc;
+
+       ENTRY;
+
+       LASSERT(lsm != NULL);
+
+       oinfo.oi_md = lsm;
+       oinfo.oi_oa = obdo;
+       oinfo.oi_oa->o_oi = lsm->lsm_oi;
+       oinfo.oi_oa->o_mode = S_IFREG;
+       oinfo.oi_oa->o_ioepoch = ioepoch;
+       oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
+                              OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+                              OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
+                              OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+                              OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
+                              OBD_MD_FLDATAVERSION;
+       oinfo.oi_capa = capa;
+       if (sync) {
+               oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
+               oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
+       }
+
+       set = ptlrpc_prep_set();
+       if (set == NULL) {
+               CERROR("can't allocate ptlrpc set\n");
+               rc = -ENOMEM;
+       } else {
+               rc = obd_getattr_async(exp, &oinfo, set);
+               if (rc == 0)
+                       rc = ptlrpc_set_wait(set);
+               ptlrpc_set_destroy(set);
+       }
+       if (rc == 0)
+               oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+                                        OBD_MD_FLATIME | OBD_MD_FLMTIME |
+                                        OBD_MD_FLCTIME | OBD_MD_FLSIZE |
+                                        OBD_MD_FLDATAVERSION);
+       RETURN(rc);
+}
+
+/**
+ * Performs the getattr on the inode and updates its fields.
+ * If @sync != 0, perform the getattr under the server-side lock.
+ */
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+                    __u64 ioepoch, int sync)
+{
+       struct obd_capa      *capa = ll_mdscapa_get(inode);
+       struct lov_stripe_md *lsm;
+       int rc;
+       ENTRY;
+
+       lsm = ccc_inode_lsm_get(inode);
+       rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
+                           capa, obdo, ioepoch, sync);
+       capa_put(capa);
+       if (rc == 0) {
+               struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
+
+               obdo_refresh_inode(inode, obdo, obdo->o_valid);
+               CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
+                      " blksize %lu\n", POSTID(oi), i_size_read(inode),
+                      (unsigned long long)inode->i_blocks,
+                      (unsigned long)ll_inode_blksize(inode));
+       }
+       ccc_inode_lsm_put(inode, lsm);
+       RETURN(rc);
+}
+
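+/*
+ * Merge the size, blocks and timestamps cached from the MDS (lli_lvb) with
+ * the attributes obtained from the OSTs, keeping the newest timestamps, and
+ * write the result into the VFS inode under the inode size lock.
+ */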
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct cl_object *obj = lli->lli_clob;
+       struct cl_attr *attr = ccc_env_thread_attr(env);
+       struct ost_lvb lvb;
+       int rc = 0;
+
+       ENTRY;
+
+       ll_inode_size_lock(inode);
+       /* Merge the timestamps most recently obtained from the MDS with the
+        * timestamps obtained from the OSTs. */
+       LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
+       LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
+       LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
+       inode_init_lvb(inode, &lvb);
+
+       cl_object_attr_lock(obj);
+       rc = cl_object_attr_get(env, obj, attr);
+       cl_object_attr_unlock(obj);
+
+       if (rc == 0) {
+               if (lvb.lvb_atime < attr->cat_atime)
+                       lvb.lvb_atime = attr->cat_atime;
+               if (lvb.lvb_ctime < attr->cat_ctime)
+                       lvb.lvb_ctime = attr->cat_ctime;
+               if (lvb.lvb_mtime < attr->cat_mtime)
+                       lvb.lvb_mtime = attr->cat_mtime;
+
+               CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
+                               PFID(&lli->lli_fid), attr->cat_size);
+               cl_isize_write_nolock(inode, attr->cat_size);
+
+               inode->i_blocks = attr->cat_blocks;
+
+               LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
+               LTIME_S(inode->i_atime) = lvb.lvb_atime;
+               LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
+       }
+       ll_inode_size_unlock(inode);
+
+       RETURN(rc);
+}
+
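+/* Glimpse helper for the GETINFO ioctls: fetch size, blocks and timestamps
+ * for @lsm from the OSTs and fill them into @st. */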
+int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
+                    lstat_t *st)
+{
+       struct obdo obdo = { 0 };
+       int rc;
+
+       rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
+       if (rc == 0) {
+               st->st_size   = obdo.o_size;
+               st->st_blocks = obdo.o_blocks;
+               st->st_mtime  = obdo.o_mtime;
+               st->st_atime  = obdo.o_atime;
+               st->st_ctime  = obdo.o_ctime;
+       }
+       return rc;
+}
+
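+/* Initialize a cl_io from the file flags: propagate O_APPEND/O_SYNC/O_NONBLOCK
+ * and decide whether DLM locking is never, maybe or mandatory. */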
+void ll_io_init(struct cl_io *io, const struct file *file, int write)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+
+       io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
+       if (write) {
+               io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
+               io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
+                                     file->f_flags & O_DIRECT ||
+                                     IS_SYNC(inode);
+       }
+       io->ci_obj     = ll_i2info(inode)->lli_clob;
+       io->ci_lockreq = CILR_MAYBE;
+       if (ll_file_nolock(file)) {
+               io->ci_lockreq = CILR_NEVER;
+               io->ci_no_srvlock = 1;
+       } else if (file->f_flags & O_APPEND) {
+               io->ci_lockreq = CILR_MANDATORY;
+       }
+}
+
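+/*
+ * Common read/write path: build a cl_io for the requested range, take
+ * lli_write_mutex (writes) or lli_trunc_sem (reads) for normal IO, run the
+ * cl_io loop, and restart the whole IO if cl_io requests it before any bytes
+ * have been transferred.
+ */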
+static ssize_t
+ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
+                  struct file *file, enum cl_io_type iot,
+                  loff_t *ppos, size_t count)
+{
+       struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
+       struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
+       struct cl_io     *io;
+       ssize_t        result;
+       ENTRY;
+
+restart:
+       io = ccc_env_thread_io(env);
+       ll_io_init(io, file, iot == CIT_WRITE);
+
+       if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
+               struct vvp_io *vio = vvp_env_io(env);
+               struct ccc_io *cio = ccc_env_io(env);
+               int write_mutex_locked = 0;
+
+               cio->cui_fd  = LUSTRE_FPRIVATE(file);
+               vio->cui_io_subtype = args->via_io_subtype;
+
+               switch (vio->cui_io_subtype) {
+               case IO_NORMAL:
+                       cio->cui_iov = args->u.normal.via_iov;
+                       cio->cui_nrsegs = args->u.normal.via_nrsegs;
+                       cio->cui_tot_nrsegs = cio->cui_nrsegs;
+                       cio->cui_iocb = args->u.normal.via_iocb;
+                       if ((iot == CIT_WRITE) &&
+                           !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+                               if (mutex_lock_interruptible(&lli->
+                                                              lli_write_mutex))
+                                       GOTO(out, result = -ERESTARTSYS);
+                               write_mutex_locked = 1;
+                       } else if (iot == CIT_READ) {
+                               down_read(&lli->lli_trunc_sem);
+                       }
+                       break;
+               case IO_SENDFILE:
+                       vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
+                       vio->u.sendfile.cui_target = args->u.sendfile.via_target;
+                       break;
+               case IO_SPLICE:
+                       vio->u.splice.cui_pipe = args->u.splice.via_pipe;
+                       vio->u.splice.cui_flags = args->u.splice.via_flags;
+                       break;
+               default:
+                       CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
+                       LBUG();
+               }
+               result = cl_io_loop(env, io);
+               if (write_mutex_locked)
+                       mutex_unlock(&lli->lli_write_mutex);
+               else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
+                       up_read(&lli->lli_trunc_sem);
+       } else {
+               /* cl_io_rw_init() handled IO */
+               result = io->ci_result;
+       }
+
+       if (io->ci_nob > 0) {
+               result = io->ci_nob;
+               *ppos = io->u.ci_wr.wr.crw_pos;
+       }
+       GOTO(out, result);
+out:
+       cl_io_fini(env, io);
+       /* If anything has been read or written (result != 0), we just return
+        * the short read/write instead of restarting the IO. */
+       if (result == 0 && io->ci_need_restart) {
+               CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
+                      iot == CIT_READ ? "read" : "write",
+                      file->f_dentry->d_name.name, *ppos, count);
+               LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
+               goto restart;
+       }
+
+       if (iot == CIT_READ) {
+               if (result >= 0)
+                       ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
+                                          LPROC_LL_READ_BYTES, result);
+       } else if (iot == CIT_WRITE) {
+               if (result >= 0) {
+                       ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
+                                          LPROC_LL_WRITE_BYTES, result);
+                       fd->fd_write_failed = false;
+               } else if (result != -ERESTARTSYS) {
+                       fd->fd_write_failed = true;
+               }
+       }
+
+       return result;
+}
+
+
+/*
+ * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
+ */
+static int ll_file_get_iov_count(const struct iovec *iov,
+                                unsigned long *nr_segs, size_t *count)
+{
+       size_t cnt = 0;
+       unsigned long seg;
+
+       for (seg = 0; seg < *nr_segs; seg++) {
+               const struct iovec *iv = &iov[seg];
+
+               /*
+                * If any segment has a negative length, or the cumulative
+                * length ever wraps negative then return -EINVAL.
+                */
+               cnt += iv->iov_len;
+               if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+                       return -EINVAL;
+               if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+                       continue;
+               if (seg == 0)
+                       return -EFAULT;
+               *nr_segs = seg;
+               cnt -= iv->iov_len;   /* This segment is no good */
+               break;
+       }
+       *count = cnt;
+       return 0;
+}
+
+static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+                               unsigned long nr_segs, loff_t pos)
+{
+       struct lu_env      *env;
+       struct vvp_io_args *args;
+       size_t        count;
+       ssize_t      result;
+       int              refcheck;
+       ENTRY;
+
+       result = ll_file_get_iov_count(iov, &nr_segs, &count);
+       if (result)
+               RETURN(result);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       args = vvp_env_args(env, IO_NORMAL);
+       args->u.normal.via_iov = (struct iovec *)iov;
+       args->u.normal.via_nrsegs = nr_segs;
+       args->u.normal.via_iocb = iocb;
+
+       result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
+                                   &iocb->ki_pos, count);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
+                           loff_t *ppos)
+{
+       struct lu_env *env;
+       struct iovec  *local_iov;
+       struct kiocb  *kiocb;
+       ssize_t result;
+       int         refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
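+       /* Wrap the user buffer in a one-segment iovec and a synchronous kiocb
+        * from the per-thread environment, then reuse the AIO read path. */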
+       local_iov = &vvp_env_info(env)->vti_local_iov;
+       kiocb = &vvp_env_info(env)->vti_kiocb;
+       local_iov->iov_base = (void __user *)buf;
+       local_iov->iov_len = count;
+       init_sync_kiocb(kiocb, file);
+       kiocb->ki_pos = *ppos;
+       kiocb->ki_left = count;
+
+       result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
+       *ppos = kiocb->ki_pos;
+
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+/*
+ * Write to a file (through the page cache).
+ */
+static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t pos)
+{
+       struct lu_env      *env;
+       struct vvp_io_args *args;
+       size_t        count;
+       ssize_t      result;
+       int              refcheck;
+       ENTRY;
+
+       result = ll_file_get_iov_count(iov, &nr_segs, &count);
+       if (result)
+               RETURN(result);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       args = vvp_env_args(env, IO_NORMAL);
+       args->u.normal.via_iov = (struct iovec *)iov;
+       args->u.normal.via_nrsegs = nr_segs;
+       args->u.normal.via_iocb = iocb;
+
+       result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
+                                 &iocb->ki_pos, count);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
+                            loff_t *ppos)
+{
+       struct lu_env *env;
+       struct iovec  *local_iov;
+       struct kiocb  *kiocb;
+       ssize_t result;
+       int         refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       local_iov = &vvp_env_info(env)->vti_local_iov;
+       kiocb = &vvp_env_info(env)->vti_kiocb;
+       local_iov->iov_base = (void __user *)buf;
+       local_iov->iov_len = count;
+       init_sync_kiocb(kiocb, file);
+       kiocb->ki_pos = *ppos;
+       kiocb->ki_left = count;
+
+       result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
+       *ppos = kiocb->ki_pos;
+
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+
+
+/*
+ * Send file content (through pagecache) somewhere with helper
+ */
+static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
+                                  struct pipe_inode_info *pipe, size_t count,
+                                  unsigned int flags)
+{
+       struct lu_env      *env;
+       struct vvp_io_args *args;
+       ssize_t      result;
+       int              refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       args = vvp_env_args(env, IO_SPLICE);
+       args->u.splice.via_pipe = pipe;
+       args->u.splice.via_flags = flags;
+
+       result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
+                          obd_count ost_idx)
+{
+       struct obd_export *exp = ll_i2dtexp(inode);
+       struct obd_trans_info oti = { 0 };
+       struct obdo *oa = NULL;
+       int lsm_size;
+       int rc = 0;
+       struct lov_stripe_md *lsm = NULL, *lsm2;
+       ENTRY;
+
+       OBDO_ALLOC(oa);
+       if (oa == NULL)
+               RETURN(-ENOMEM);
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm == NULL)
+               GOTO(out, rc = -ENOENT);
+
+       lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
+                  (lsm->lsm_stripe_count));
+
+       OBD_ALLOC_LARGE(lsm2, lsm_size);
+       if (lsm2 == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       oa->o_oi = *oi;
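+       /* The target OST index is passed down in the otherwise unused
+        * o_nlink field for the OBD_FL_RECREATE_OBJS request. */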
+       oa->o_nlink = ost_idx;
+       oa->o_flags |= OBD_FL_RECREATE_OBJS;
+       oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
+       obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+                                  OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+       obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
+       memcpy(lsm2, lsm, lsm_size);
+       ll_inode_size_lock(inode);
+       rc = obd_create(NULL, exp, oa, &lsm2, &oti);
+       ll_inode_size_unlock(inode);
+
+       OBD_FREE_LARGE(lsm2, lsm_size);
+       GOTO(out, rc);
+out:
+       ccc_inode_lsm_put(inode, lsm);
+       OBDO_FREE(oa);
+       return rc;
+}
+
+static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
+{
+       struct ll_recreate_obj ucreat;
+       struct ost_id           oi;
+       ENTRY;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               RETURN(-EPERM);
+
+       if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
+                          sizeof(ucreat)))
+               RETURN(-EFAULT);
+
+       ostid_set_seq_mdt0(&oi);
+       ostid_set_id(&oi, ucreat.lrc_id);
+       RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
+}
+
+static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
+{
+       struct lu_fid   fid;
+       struct ost_id   oi;
+       obd_count       ost_idx;
+       ENTRY;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               RETURN(-EPERM);
+
+       if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
+               RETURN(-EFAULT);
+
+       fid_to_ostid(&fid, &oi);
+       ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
+       RETURN(ll_lov_recreate(inode, &oi, ost_idx));
+}
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
+                            int flags, struct lov_user_md *lum, int lum_size)
+{
+       struct lov_stripe_md *lsm = NULL;
+       struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
+       int rc = 0;
+       ENTRY;
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm != NULL) {
+               ccc_inode_lsm_put(inode, lsm);
+               CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
+                      inode->i_ino);
+               RETURN(-EEXIST);
+       }
+
+       ll_inode_size_lock(inode);
+       rc = ll_intent_file_open(file, lum, lum_size, &oit);
+       if (rc)
+               GOTO(out, rc);
+       rc = oit.d.lustre.it_status;
+       if (rc < 0)
+               GOTO(out_req_free, rc);
+
+       ll_release_openhandle(file->f_dentry, &oit);
+
+ out:
+       ll_inode_size_unlock(inode);
+       ll_intent_release(&oit);
+       ccc_inode_lsm_put(inode, lsm);
+       RETURN(rc);
+out_req_free:
+       ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
+       goto out;
+}
+
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+                            struct lov_mds_md **lmmp, int *lmm_size,
+                            struct ptlrpc_request **request)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct mdt_body  *body;
+       struct lov_mds_md *lmm = NULL;
+       struct ptlrpc_request *req = NULL;
+       struct md_op_data *op_data;
+       int rc, lmmsize;
+
+       rc = ll_get_max_mdsize(sbi, &lmmsize);
+       if (rc)
+               RETURN(rc);
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
+                                    strlen(filename), lmmsize,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+       rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+       ll_finish_md_op_data(op_data);
+       if (rc < 0) {
+               CDEBUG(D_INFO, "md_getattr_name failed "
+                      "on %s: rc %d\n", filename, rc);
+               GOTO(out, rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL); /* checked by mdc_getattr_name */
+
+       lmmsize = body->eadatasize;
+
+       if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
+                       lmmsize == 0) {
+               GOTO(out, rc = -ENODATA);
+       }
+
+       lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
+       LASSERT(lmm != NULL);
+
+       if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
+           (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
+               GOTO(out, rc = -EPROTO);
+       }
+
+       /*
+        * This is coming from the MDS, so is probably in
+        * little endian.  We convert it to host endian before
+        * passing it to userspace.
+        */
+       if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
+               /* if the function is called for a directory, avoid swabbing
+                * the non-existent lsm objects */
+               if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
+                       lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                       if (S_ISREG(body->mode))
+                               lustre_swab_lov_user_md_objects(
+                                ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+                                ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+               } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
+                       lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                       if (S_ISREG(body->mode))
+                               lustre_swab_lov_user_md_objects(
+                                ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+                                ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+               }
+       }
+
+out:
+       *lmmp = lmm;
+       *lmm_size = lmmsize;
+       *request = req;
+       return rc;
+}
+
+static int ll_lov_setea(struct inode *inode, struct file *file,
+                           unsigned long arg)
+{
+       int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
+       struct lov_user_md      *lump;
+       int                      lum_size = sizeof(struct lov_user_md) +
+                                           sizeof(struct lov_user_ost_data);
+       int                      rc;
+       ENTRY;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               RETURN(-EPERM);
+
+       OBD_ALLOC_LARGE(lump, lum_size);
+       if (lump == NULL)
+               RETURN(-ENOMEM);
+
+       if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
+               OBD_FREE_LARGE(lump, lum_size);
+               RETURN(-EFAULT);
+       }
+
+       rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
+
+       OBD_FREE_LARGE(lump, lum_size);
+       RETURN(rc);
+}
+
+static int ll_lov_setstripe(struct inode *inode, struct file *file,
+                           unsigned long arg)
+{
+       struct lov_user_md_v3    lumv3;
+       struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+       struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
+       struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
+       int                      lum_size, rc;
+       int                      flags = FMODE_WRITE;
+       ENTRY;
+
+       /* first try with v1 which is smaller than v3 */
+       lum_size = sizeof(struct lov_user_md_v1);
+       if (copy_from_user(lumv1, lumv1p, lum_size))
+               RETURN(-EFAULT);
+
+       if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+               lum_size = sizeof(struct lov_user_md_v3);
+               if (copy_from_user(&lumv3, lumv3p, lum_size))
+                       RETURN(-EFAULT);
+       }
+
+       rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
+       if (rc == 0) {
+               struct lov_stripe_md *lsm;
+               __u32 gen;
+
+               put_user(0, &lumv1p->lmm_stripe_count);
+
+               ll_layout_refresh(inode, &gen);
+               lsm = ccc_inode_lsm_get(inode);
+               rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
+                                  0, lsm, (void *)arg);
+               ccc_inode_lsm_put(inode, lsm);
+       }
+       RETURN(rc);
+}
+
+static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
+{
+       struct lov_stripe_md *lsm;
+       int rc = -ENODATA;
+       ENTRY;
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm != NULL)
+               rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
+                                  lsm, (void *)arg);
+       ccc_inode_lsm_put(inode, lsm);
+       RETURN(rc);
+}
+
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+       struct ll_inode_info   *lli = ll_i2info(inode);
+       struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+       struct ccc_grouplock    grouplock;
+       int                  rc;
+       ENTRY;
+
+       if (ll_file_nolock(file))
+               RETURN(-EOPNOTSUPP);
+
+       spin_lock(&lli->lli_lock);
+       if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+               CWARN("group lock already existed with gid %lu\n",
+                     fd->fd_grouplock.cg_gid);
+               spin_unlock(&lli->lli_lock);
+               RETURN(-EINVAL);
+       }
+       LASSERT(fd->fd_grouplock.cg_lock == NULL);
+       spin_unlock(&lli->lli_lock);
+
+       rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
+                             arg, (file->f_flags & O_NONBLOCK), &grouplock);
+       if (rc)
+               RETURN(rc);
+
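+       /* Re-check under lli_lock: another thread may have installed a group
+        * lock on this file while we were enqueuing ours. */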
+       spin_lock(&lli->lli_lock);
+       if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+               spin_unlock(&lli->lli_lock);
+               CERROR("another thread just won the race\n");
+               cl_put_grouplock(&grouplock);
+               RETURN(-EINVAL);
+       }
+
+       fd->fd_flags |= LL_FILE_GROUP_LOCKED;
+       fd->fd_grouplock = grouplock;
+       spin_unlock(&lli->lli_lock);
+
+       CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
+       RETURN(0);
+}
+
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+       struct ll_inode_info   *lli = ll_i2info(inode);
+       struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+       struct ccc_grouplock    grouplock;
+       ENTRY;
+
+       spin_lock(&lli->lli_lock);
+       if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+               spin_unlock(&lli->lli_lock);
+               CWARN("no group lock held\n");
+               RETURN(-EINVAL);
+       }
+       LASSERT(fd->fd_grouplock.cg_lock != NULL);
+
+       if (fd->fd_grouplock.cg_gid != arg) {
+               CWARN("group lock %lu doesn't match current id %lu\n",
+                      arg, fd->fd_grouplock.cg_gid);
+               spin_unlock(&lli->lli_lock);
+               RETURN(-EINVAL);
+       }
+
+       grouplock = fd->fd_grouplock;
+       memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
+       fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
+       spin_unlock(&lli->lli_lock);
+
+       cl_put_grouplock(&grouplock);
+       CDEBUG(D_INFO, "group lock %lu released\n", arg);
+       RETURN(0);
+}
+
+/**
+ * Close inode open handle
+ *
+ * \param dentry [in]     dentry which contains the inode
+ * \param it     [in,out] intent which contains open info and result
+ *
+ * \retval 0     success
+ * \retval <0    failure
+ */
+int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
+{
+       struct inode *inode = dentry->d_inode;
+       struct obd_client_handle *och;
+       int rc;
+       ENTRY;
+
+       LASSERT(inode);
+
+       /* Root? Do nothing. */
+       if (dentry->d_inode->i_sb->s_root == dentry)
+               RETURN(0);
+
+       /* No open handle to close? Move away */
+       if (!it_disposition(it, DISP_OPEN_OPEN))
+               RETURN(0);
+
+       LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
+
+       OBD_ALLOC(och, sizeof(*och));
+       if (!och)
+               GOTO(out, rc = -ENOMEM);
+
+       ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
+                   ll_i2info(inode), it, och);
+
+       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+                                      inode, och);
+ out:
+       /* this one is in place of ll_file_open */
+       if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
+               ptlrpc_req_finished(it->d.lustre.it_data);
+               it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+       }
+       RETURN(rc);
+}
+
+/**
+ * Get the size of the inode for which the FIEMAP mapping is requested.
+ * Make the FIEMAP get_info call and return the result.
+ */
+int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+             int num_bytes)
+{
+       struct obd_export *exp = ll_i2dtexp(inode);
+       struct lov_stripe_md *lsm = NULL;
+       struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
+       int vallen = num_bytes;
+       int rc;
+       ENTRY;
+
+       /* Checks for fiemap flags */
+       if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
+               fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
+               return -EBADR;
+       }
+
+       /* Check for FIEMAP_FLAG_SYNC */
+       if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
+               rc = filemap_fdatawrite(inode->i_mapping);
+               if (rc)
+                       return rc;
+       }
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm == NULL)
+               return -ENOENT;
+
+       /* If the stripe_count > 1 and the application does not understand
+        * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
+        */
+       if (lsm->lsm_stripe_count > 1 &&
+           !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
+               GOTO(out, rc = -EOPNOTSUPP);
+
+       fm_key.oa.o_oi = lsm->lsm_oi;
+       fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+       obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
+       obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
+       /* If filesize is 0, then there would be no objects for mapping */
+       if (fm_key.oa.o_size == 0) {
+               fiemap->fm_mapped_extents = 0;
+               GOTO(out, rc = 0);
+       }
+
+       memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
+
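+       /* Ask the underlying LOV/OSC layers to fill in the extent mapping;
+        * the reply is written back into the caller's fiemap buffer. */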
+       rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
+                         fiemap, lsm);
+       if (rc)
+               CERROR("obd_get_info failed: rc = %d\n", rc);
+
+out:
+       ccc_inode_lsm_put(inode, lsm);
+       RETURN(rc);
+}
+
+int ll_fid2path(struct inode *inode, void *arg)
+{
+       struct obd_export       *exp = ll_i2mdexp(inode);
+       struct getinfo_fid2path *gfout, *gfin;
+       int                      outsize, rc;
+       ENTRY;
+
+       if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
+           !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
+               RETURN(-EPERM);
+
+       /* Need to get the buflen */
+       OBD_ALLOC_PTR(gfin);
+       if (gfin == NULL)
+               RETURN(-ENOMEM);
+       if (copy_from_user(gfin, arg, sizeof(*gfin))) {
+               OBD_FREE_PTR(gfin);
+               RETURN(-EFAULT);
+       }
+
+       outsize = sizeof(*gfout) + gfin->gf_pathlen;
+       OBD_ALLOC(gfout, outsize);
+       if (gfout == NULL) {
+               OBD_FREE_PTR(gfin);
+               RETURN(-ENOMEM);
+       }
+       memcpy(gfout, gfin, sizeof(*gfout));
+       OBD_FREE_PTR(gfin);
+
+       /* Call mdc_iocontrol */
+       rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
+       if (rc)
+               GOTO(gf_free, rc);
+
+       if (copy_to_user(arg, gfout, outsize))
+               rc = -EFAULT;
+
+gf_free:
+       OBD_FREE(gfout, outsize);
+       RETURN(rc);
+}
+
+static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
+{
+       struct ll_user_fiemap *fiemap_s;
+       size_t num_bytes, ret_bytes;
+       unsigned int extent_count;
+       int rc = 0;
+
+       /* Get the extent count so we can calculate the size of the required
+        * fiemap buffer */
+       if (get_user(extent_count,
+           &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
+               RETURN(-EFAULT);
+       num_bytes = sizeof(*fiemap_s) + (extent_count *
+                                        sizeof(struct ll_fiemap_extent));
+
+       OBD_ALLOC_LARGE(fiemap_s, num_bytes);
+       if (fiemap_s == NULL)
+               RETURN(-ENOMEM);
+
+       /* get the fiemap value */
+       if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
+                          sizeof(*fiemap_s)))
+               GOTO(error, rc = -EFAULT);
+
+       /* If fm_extent_count is non-zero, read the first extent since it is
+        * used to calculate end_offset and device from the previous fiemap
+        * call. */
+       if (extent_count) {
+               if (copy_from_user(&fiemap_s->fm_extents[0],
+                   (char __user *)arg + sizeof(*fiemap_s),
+                   sizeof(struct ll_fiemap_extent)))
+                       GOTO(error, rc = -EFAULT);
+       }
+
+       rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
+       if (rc)
+               GOTO(error, rc);
+
+       ret_bytes = sizeof(struct ll_user_fiemap);
+
+       if (extent_count != 0)
+               ret_bytes += (fiemap_s->fm_mapped_extents *
+                                sizeof(struct ll_fiemap_extent));
+
+       if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
+               rc = -EFAULT;
+
+error:
+       OBD_FREE_LARGE(fiemap_s, num_bytes);
+       RETURN(rc);
+}
+
+/*
+ * Read the data_version for the inode.
+ *
+ * This value is computed from the stripe object versions on the OSTs.
+ * The version is computed using server-side locking.
+ *
+ * @param extent_lock  Take an extent lock. Not needed if a process is already
+ *                     holding the OST object group locks.
+ */
+int ll_data_version(struct inode *inode, __u64 *data_version,
+                   int extent_lock)
+{
+       struct lov_stripe_md    *lsm = NULL;
+       struct ll_sb_info       *sbi = ll_i2sbi(inode);
+       struct obdo             *obdo = NULL;
+       int                      rc;
+       ENTRY;
+
+       /* If no stripe, we consider version is 0. */
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm == NULL) {
+               *data_version = 0;
+               CDEBUG(D_INODE, "No object for inode\n");
+               RETURN(0);
+       }
+
+       OBD_ALLOC_PTR(obdo);
+       if (obdo == NULL) {
+               ccc_inode_lsm_put(inode, lsm);
+               RETURN(-ENOMEM);
+       }
+
+       rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
+       if (!rc) {
+               if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
+                       rc = -EOPNOTSUPP;
+               else
+                       *data_version = obdo->o_data_version;
+       }
+
+       OBD_FREE_PTR(obdo);
+       ccc_inode_lsm_put(inode, lsm);
+
+       RETURN(rc);
+}
+
+struct ll_swap_stack {
+       struct iattr             ia1, ia2;
+       __u64                    dv1, dv2;
+       struct inode            *inode1, *inode2;
+       bool                     check_dv1, check_dv2;
+};
+
+static int ll_swap_layouts(struct file *file1, struct file *file2,
+                          struct lustre_swap_layouts *lsl)
+{
+       struct mdc_swap_layouts  msl;
+       struct md_op_data       *op_data;
+       __u32                    gid;
+       __u64                    dv;
+       struct ll_swap_stack    *llss = NULL;
+       int                      rc;
+
+       OBD_ALLOC_PTR(llss);
+       if (llss == NULL)
+               RETURN(-ENOMEM);
+
+       llss->inode1 = file1->f_dentry->d_inode;
+       llss->inode2 = file2->f_dentry->d_inode;
+
+       if (!S_ISREG(llss->inode2->i_mode))
+               GOTO(free, rc = -EINVAL);
+
+       if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
+           ll_permission(llss->inode2, MAY_WRITE, NULL))
+               GOTO(free, rc = -EPERM);
+
+       if (llss->inode2->i_sb != llss->inode1->i_sb)
+               GOTO(free, rc = -EXDEV);
+
+       /* we use two bools because they are easier to swap than two bits */
+       if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
+               llss->check_dv1 = true;
+
+       if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
+               llss->check_dv2 = true;
+
+       /* we cannot use lsl->sl_dvX directly because we may swap them */
+       llss->dv1 = lsl->sl_dv1;
+       llss->dv2 = lsl->sl_dv2;
+
+       rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
+       if (rc == 0) /* same file, done! */
+               GOTO(free, rc = 0);
+
+       if (rc < 0) { /* sequentialize it */
+               swap(llss->inode1, llss->inode2);
+               swap(file1, file2);
+               swap(llss->dv1, llss->dv2);
+               swap(llss->check_dv1, llss->check_dv2);
+       }
+
+       gid = lsl->sl_gid;
+       if (gid != 0) { /* application asks to flush dirty cache */
+               rc = ll_get_grouplock(llss->inode1, file1, gid);
+               if (rc < 0)
+                       GOTO(free, rc);
+
+               rc = ll_get_grouplock(llss->inode2, file2, gid);
+               if (rc < 0) {
+                       ll_put_grouplock(llss->inode1, file1, gid);
+                       GOTO(free, rc);
+               }
+       }
+
+       /* to be able to restore mtime and atime after the swap,
+        * we first need to save them */
+       if (lsl->sl_flags &
+           (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
+               llss->ia1.ia_mtime = llss->inode1->i_mtime;
+               llss->ia1.ia_atime = llss->inode1->i_atime;
+               llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
+               llss->ia2.ia_mtime = llss->inode2->i_mtime;
+               llss->ia2.ia_atime = llss->inode2->i_atime;
+               llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
+       }
+
+       /* final check: before swapping the layouts, verify that the
+        * data version has not changed (if requested) */
+       if (llss->check_dv1) {
+               rc = ll_data_version(llss->inode1, &dv, 0);
+               if (rc)
+                       GOTO(putgl, rc);
+               if (dv != llss->dv1)
+                       GOTO(putgl, rc = -EAGAIN);
+       }
+
+       if (llss->check_dv2) {
+               rc = ll_data_version(llss->inode2, &dv, 0);
+               if (rc)
+                       GOTO(putgl, rc);
+               if (dv != llss->dv2)
+                       GOTO(putgl, rc = -EAGAIN);
+       }
+
+       /* struct md_op_data is used to send the swap arguments to the MDT;
+        * only the flags are missing, so we pass struct mdc_swap_layouts
+        * through md_op_data->op_data */
+       /* flags from user space have to be converted before they are sent to
+        * the server; no flag is sent today, they are only used on the client */
+       msl.msl_flags = 0;
+       rc = -ENOMEM;
+       op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
+                                    0, LUSTRE_OPC_ANY, &msl);
+       if (op_data != NULL) {
+               rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
+                                  ll_i2mdexp(llss->inode1),
+                                  sizeof(*op_data), op_data, NULL);
+               ll_finish_md_op_data(op_data);
+       }
+
+putgl:
+       if (gid != 0) {
+               ll_put_grouplock(llss->inode2, file2, gid);
+               ll_put_grouplock(llss->inode1, file1, gid);
+       }
+
+       /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
+       if (rc != 0)
+               GOTO(free, rc);
+
+       /* clear useless flags */
+       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
+               llss->ia1.ia_valid &= ~ATTR_MTIME;
+               llss->ia2.ia_valid &= ~ATTR_MTIME;
+       }
+
+       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
+               llss->ia1.ia_valid &= ~ATTR_ATIME;
+               llss->ia2.ia_valid &= ~ATTR_ATIME;
+       }
+
+       /* update time if requested */
+       rc = 0;
+       if (llss->ia2.ia_valid != 0) {
+               mutex_lock(&llss->inode1->i_mutex);
+               rc = ll_setattr(file1->f_dentry, &llss->ia2);
+               mutex_unlock(&llss->inode1->i_mutex);
+       }
+
+       if (llss->ia1.ia_valid != 0) {
+               int rc1;
+
+               mutex_lock(&llss->inode2->i_mutex);
+               rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
+               mutex_unlock(&llss->inode2->i_mutex);
+               if (rc == 0)
+                       rc = rc1;
+       }
+
+free:
+       if (llss != NULL)
+               OBD_FREE_PTR(llss);
+
+       RETURN(rc);
+}
+
+long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct inode            *inode = file->f_dentry->d_inode;
+       struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
+       int                      flags, rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
+              inode->i_generation, inode, cmd);
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+
+       /* asm-ppc{,64} declares TCGETS et al. as type 't', not 'T' */
+       if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+               RETURN(-ENOTTY);
+
+       switch(cmd) {
+       case LL_IOC_GETFLAGS:
+               /* Get the current value of the file flags */
+               return put_user(fd->fd_flags, (int *)arg);
+       case LL_IOC_SETFLAGS:
+       case LL_IOC_CLRFLAGS:
+               /* Set or clear specific file flags */
+               /* XXX This probably needs checks to ensure the flags are
+                *     not abused, and to handle any flag side effects.
+                */
+               if (get_user(flags, (int *) arg))
+                       RETURN(-EFAULT);
+
+               if (cmd == LL_IOC_SETFLAGS) {
+                       if ((flags & LL_FILE_IGNORE_LOCK) &&
+                           !(file->f_flags & O_DIRECT)) {
+                               CERROR("%s: unable to disable locking on "
+                                      "non-O_DIRECT file\n", current->comm);
+                               RETURN(-EINVAL);
+                       }
+
+                       fd->fd_flags |= flags;
+               } else {
+                       fd->fd_flags &= ~flags;
+               }
+               RETURN(0);
+       case LL_IOC_LOV_SETSTRIPE:
+               RETURN(ll_lov_setstripe(inode, file, arg));
+       case LL_IOC_LOV_SETEA:
+               RETURN(ll_lov_setea(inode, file, arg));
+       case LL_IOC_LOV_SWAP_LAYOUTS: {
+               struct file *file2;
+               struct lustre_swap_layouts lsl;
+
+               if (copy_from_user(&lsl, (char *)arg,
+                                      sizeof(struct lustre_swap_layouts)))
+                       RETURN(-EFAULT);
+
+               if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+                       RETURN(-EPERM);
+
+               file2 = fget(lsl.sl_fd);
+               if (file2 == NULL)
+                       RETURN(-EBADF);
+
+               rc = -EPERM;
+               if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+                       rc = ll_swap_layouts(file, file2, &lsl);
+               fput(file2);
+               RETURN(rc);
+       }
+       case LL_IOC_LOV_GETSTRIPE:
+               RETURN(ll_lov_getstripe(inode, arg));
+       case LL_IOC_RECREATE_OBJ:
+               RETURN(ll_lov_recreate_obj(inode, arg));
+       case LL_IOC_RECREATE_FID:
+               RETURN(ll_lov_recreate_fid(inode, arg));
+       case FSFILT_IOC_FIEMAP:
+               RETURN(ll_ioctl_fiemap(inode, arg));
+       case FSFILT_IOC_GETFLAGS:
+       case FSFILT_IOC_SETFLAGS:
+               RETURN(ll_iocontrol(inode, file, cmd, arg));
+       case FSFILT_IOC_GETVERSION_OLD:
+       case FSFILT_IOC_GETVERSION:
+               RETURN(put_user(inode->i_generation, (int *)arg));
+       case LL_IOC_GROUP_LOCK:
+               RETURN(ll_get_grouplock(inode, file, arg));
+       case LL_IOC_GROUP_UNLOCK:
+               RETURN(ll_put_grouplock(inode, file, arg));
+       case IOC_OBD_STATFS:
+               RETURN(ll_obd_statfs(inode, (void *)arg));
+
+       /* We need to special case any other ioctls we want to handle,
+        * to send them to the MDS/OST as appropriate and to properly
+        * network encode the arg field.
+       case FSFILT_IOC_SETVERSION_OLD:
+       case FSFILT_IOC_SETVERSION:
+       */
+       case LL_IOC_FLUSHCTX:
+               RETURN(ll_flush_ctx(inode));
+       case LL_IOC_PATH2FID: {
+               if (copy_to_user((void *)arg, ll_inode2fid(inode),
+                                sizeof(struct lu_fid)))
+                       RETURN(-EFAULT);
+
+               RETURN(0);
+       }
+       case OBD_IOC_FID2PATH:
+               RETURN(ll_fid2path(inode, (void *)arg));
+       case LL_IOC_DATA_VERSION: {
+               struct ioc_data_version idv;
+               int                     rc;
+
+               if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
+                       RETURN(-EFAULT);
+
+               rc = ll_data_version(inode, &idv.idv_version,
+                               !(idv.idv_flags & LL_DV_NOFLUSH));
+
+               if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
+                       RETURN(-EFAULT);
+
+               RETURN(rc);
+       }
+
+       case LL_IOC_GET_MDTIDX: {
+               int mdtidx;
+
+               mdtidx = ll_get_mdt_idx(inode);
+               if (mdtidx < 0)
+                       RETURN(mdtidx);
+
+               if (put_user((int)mdtidx, (int*)arg))
+                       RETURN(-EFAULT);
+
+               RETURN(0);
+       }
+       case OBD_IOC_GETDTNAME:
+       case OBD_IOC_GETMDNAME:
+               RETURN(ll_get_obd_name(inode, cmd, arg));
+       case LL_IOC_HSM_STATE_GET: {
+               struct md_op_data       *op_data;
+               struct hsm_user_state   *hus;
+               int                      rc;
+
+               OBD_ALLOC_PTR(hus);
+               if (hus == NULL)
+                       RETURN(-ENOMEM);
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, hus);
+               if (op_data == NULL) {
+                       OBD_FREE_PTR(hus);
+                       RETURN(-ENOMEM);
+               }
+
+               rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+                                  op_data, NULL);
+
+               if (copy_to_user((void *)arg, hus, sizeof(*hus)))
+                       rc = -EFAULT;
+
+               ll_finish_md_op_data(op_data);
+               OBD_FREE_PTR(hus);
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_STATE_SET: {
+               struct md_op_data       *op_data;
+               struct hsm_state_set    *hss;
+               int                      rc;
+
+               OBD_ALLOC_PTR(hss);
+               if (hss == NULL)
+                       RETURN(-ENOMEM);
+               if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
+                       OBD_FREE_PTR(hss);
+                       RETURN(-EFAULT);
+               }
+
+               /* Non-root users are forbidden to set or clear flags which are
+                * NOT defined in HSM_USER_MASK. */
+               if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
+                   && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
+                       OBD_FREE_PTR(hss);
+                       RETURN(-EPERM);
+               }
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, hss);
+               if (op_data == NULL) {
+                       OBD_FREE_PTR(hss);
+                       RETURN(-ENOMEM);
+               }
+
+               rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+                                  op_data, NULL);
+
+               ll_finish_md_op_data(op_data);
+
+               OBD_FREE_PTR(hss);
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_ACTION: {
+               struct md_op_data               *op_data;
+               struct hsm_current_action       *hca;
+               int                              rc;
+
+               OBD_ALLOC_PTR(hca);
+               if (hca == NULL)
+                       RETURN(-ENOMEM);
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, hca);
+               if (op_data == NULL) {
+                       OBD_FREE_PTR(hca);
+                       RETURN(-ENOMEM);
+               }
+
+               rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+                                  op_data, NULL);
+
+               if (copy_to_user((char *)arg, hca, sizeof(*hca)))
+                       rc = -EFAULT;
+
+               ll_finish_md_op_data(op_data);
+               OBD_FREE_PTR(hca);
+               RETURN(rc);
+       }
+       default: {
+               int err;
+
+               if (LLIOC_STOP ==
+                    ll_iocontrol_call(inode, file, cmd, arg, &err))
+                       RETURN(err);
+
+               RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
+                                    (void *)arg));
+       }
+       }
+}
+
+
+loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       loff_t retval, eof = 0;
+
+       ENTRY;
+       retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
+                          (origin == SEEK_CUR) ? file->f_pos : 0);
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
+              inode->i_ino, inode->i_generation, inode, retval, retval,
+              origin);
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
+
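+       /* SEEK_END, SEEK_HOLE and SEEK_DATA need an up-to-date file size, so
+        * glimpse it from the OSTs first. */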
+       if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
+               retval = ll_glimpse_size(inode);
+               if (retval != 0)
+                       RETURN(retval);
+               eof = i_size_read(inode);
+       }
+
+       retval = ll_generic_file_llseek_size(file, offset, origin,
+                                         ll_file_maxbytes(inode), eof);
+       RETURN(retval);
+}
+
+int ll_flush(struct file *file, fl_owner_t id)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       int rc, err;
+
+       LASSERT(!S_ISDIR(inode->i_mode));
+
+       /* catch async errors that were recorded back when async writeback
+        * failed for pages in this mapping. */
+       rc = lli->lli_async_rc;
+       lli->lli_async_rc = 0;
+       err = lov_read_and_clear_async_rc(lli->lli_clob);
+       if (rc == 0)
+               rc = err;
+
+       /* The application has already been told about the write failure.
+        * Do not report it again. */
+       if (fd->fd_write_failed)
+               return 0;
+       return rc ? -EIO : 0;
+}
+
+/**
+ * Called to make sure a portion of the file has been written out.
+ * Depending on @mode, it may also send OST_SYNC RPCs to the OSTs.
+ *
+ * Return how many pages have been written.
+ */
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+                      enum cl_fsync_mode mode)
+{
+       struct cl_env_nest nest;
+       struct lu_env *env;
+       struct cl_io *io;
+       struct obd_capa *capa = NULL;
+       struct cl_fsync_io *fio;
+       int result;
+       ENTRY;
+
+       if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
+           mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
+               RETURN(-EINVAL);
+
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = cl_i2info(inode)->lli_clob;
+       io->ci_ignore_layout = 1;
+
+       /* initialize parameters for sync */
+       fio = &io->u.ci_fsync;
+       fio->fi_capa = capa;
+       fio->fi_start = start;
+       fio->fi_end = end;
+       fio->fi_fid = ll_inode2fid(inode);
+       fio->fi_mode = mode;
+       fio->fi_nr_written = 0;
+
+       if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
+               result = cl_io_loop(env, io);
+       else
+               result = io->ci_result;
+       if (result == 0)
+               result = fio->fi_nr_written;
+       cl_io_fini(env, io);
+       cl_env_nested_put(&nest, env);
+
+       capa_put(capa);
+
+       RETURN(result);
+}
+
+/*
+ * When dentry is provided (the 'else' case), *file->f_dentry may be
+ * null and dentry must be used directly rather than pulled from
+ * *file->f_dentry as is done otherwise.
+ */
+
+int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+       struct dentry *dentry = file->f_dentry;
+       struct inode *inode = dentry->d_inode;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ptlrpc_request *req;
+       struct obd_capa *oc;
+       int rc, err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+              inode->i_generation, inode);
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
+
+       rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       mutex_lock(&inode->i_mutex);
+
+       /* catch async errors that were recorded back when async writeback
+        * failed for pages in this mapping. */
+       if (!S_ISDIR(inode->i_mode)) {
+               err = lli->lli_async_rc;
+               lli->lli_async_rc = 0;
+               if (rc == 0)
+                       rc = err;
+               err = lov_read_and_clear_async_rc(lli->lli_clob);
+               if (rc == 0)
+                       rc = err;
+       }
+
+       oc = ll_mdscapa_get(inode);
+       err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
+                     &req);
+       capa_put(oc);
+       if (!rc)
+               rc = err;
+       if (!err)
+               ptlrpc_req_finished(req);
+
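+       /* For datasync on a regular file, also flush the whole file range to
+        * the OSTs and record the outcome in fd_write_failed. */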
+       if (datasync && S_ISREG(inode->i_mode)) {
+               struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+               err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+                               CL_FSYNC_ALL);
+               if (rc == 0 && err < 0)
+                       rc = err;
+               if (rc < 0)
+                       fd->fd_write_failed = true;
+               else
+                       fd->fd_write_failed = false;
+       }
+
+       mutex_unlock(&inode->i_mutex);
+       RETURN(rc);
+}
+
+int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
+                                          .ei_cb_cp =ldlm_flock_completion_ast,
+                                          .ei_cbdata = file_lock };
+       struct md_op_data *op_data;
+       struct lustre_handle lockh = {0};
+       ldlm_policy_data_t flock = {{0}};
+       int flags = 0;
+       int rc;
+       int rc2 = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
+              inode->i_ino, file_lock);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
+
+       if (file_lock->fl_flags & FL_FLOCK) {
+               LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
+               /* flocks are whole-file locks */
+               flock.l_flock.end = OFFSET_MAX;
+               /* For flocks, the owner is determined by the local file descriptor */
+               flock.l_flock.owner = (unsigned long)file_lock->fl_file;
+       } else if (file_lock->fl_flags & FL_POSIX) {
+               flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
+               flock.l_flock.start = file_lock->fl_start;
+               flock.l_flock.end = file_lock->fl_end;
+       } else {
+               RETURN(-EINVAL);
+       }
+       flock.l_flock.pid = file_lock->fl_pid;
+
+       /* Somewhat ugly workaround for svc lockd.
+        * lockd installs a custom fl_lmops->lm_compare_owner that checks
+        * whether the fl_owner is the same (which, between lockd processes on
+        * the local node, it presumably always is) and then compares the pid.
+        * As such we assign the pid to the owner field to make it all work;
+        * a conflict with normal locks is unlikely since the pid space and
+        * the pointer space for current->files do not intersect */
+       if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
+               flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
+
+       switch (file_lock->fl_type) {
+       case F_RDLCK:
+               einfo.ei_mode = LCK_PR;
+               break;
+       case F_UNLCK:
+               /* An unlock request may or may not have any relation to
+                * existing locks so we may not be able to pass a lock handle
+                * via a normal ldlm_lock_cancel() request. The request may even
+                * unlock a byte range in the middle of an existing lock. In
+                * order to process an unlock request we need all of the same
+                * information that is given with a normal read or write record
+                * lock request. To avoid creating another ldlm unlock (cancel)
+                * message we'll treat a LCK_NL flock request as an unlock. */
+               einfo.ei_mode = LCK_NL;
+               break;
+       case F_WRLCK:
+               einfo.ei_mode = LCK_PW;
+               break;
+       default:
+               CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
+                       file_lock->fl_type);
+               RETURN (-ENOTSUPP);
+       }
+
+       switch (cmd) {
+       case F_SETLKW:
+#ifdef F_SETLKW64
+       case F_SETLKW64:
+#endif
+               flags = 0;
+               break;
+       case F_SETLK:
+#ifdef F_SETLK64
+       case F_SETLK64:
+#endif
+               flags = LDLM_FL_BLOCK_NOWAIT;
+               break;
+       case F_GETLK:
+#ifdef F_GETLK64
+       case F_GETLK64:
+#endif
+               flags = LDLM_FL_TEST_LOCK;
+               /* Save the old mode so that if the mode in the lock changes we
+                * can decrement the appropriate reader or writer refcount. */
+               file_lock->fl_type = einfo.ei_mode;
+               break;
+       default:
+               CERROR("unknown fcntl lock command: %d\n", cmd);
+               RETURN (-EINVAL);
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
+              "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
+              flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
+
+       rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+                       op_data, &lockh, &flock, 0, NULL /* req */, flags);
+
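+       /* Mirror the result in the local lock tables: flock/POSIX state is
+        * only updated when the server granted the lock or this is an unlock
+        * (and, for POSIX, not a test lock). */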
+       if ((file_lock->fl_flags & FL_FLOCK) &&
+           (rc == 0 || file_lock->fl_type == F_UNLCK))
+               rc2  = flock_lock_file_wait(file, file_lock);
+       if ((file_lock->fl_flags & FL_POSIX) &&
+           (rc == 0 || file_lock->fl_type == F_UNLCK) &&
+           !(flags & LDLM_FL_TEST_LOCK))
+               rc2  = posix_lock_file_wait(file, file_lock);
+
+       if (rc2 && file_lock->fl_type != F_UNLCK) {
+               einfo.ei_mode = LCK_NL;
+               md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+                       op_data, &lockh, &flock, 0, NULL /* req */, flags);
+               rc = rc2;
+       }
+
+       ll_finish_md_op_data(op_data);
+
+       RETURN(rc);
+}
+
+int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+       ENTRY;
+
+       RETURN(-ENOSYS);
+}
+
+/**
+ * Test whether some locks matching bits and l_req_mode are acquired.
+ * - the bits can be covered by different locks
+ * - the common lock bits that are found are cleared in *bits
+ * - the bits not found are kept in *bits
+ * \param inode [IN]
+ * \param bits [IN] searched lock bits
+ * \param l_req_mode [IN] searched lock mode
+ * \retval boolean, true iff all bits are found
+ */
+int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
+{
+       struct lustre_handle lockh;
+       ldlm_policy_data_t policy;
+       ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
+                               (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
+       struct lu_fid *fid;
+       __u64 flags;
+       int i;
+       ENTRY;
+
+       if (!inode)
+              RETURN(0);
+
+       fid = &ll_i2info(inode)->lli_fid;
+       CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
+              ldlm_lockname[mode]);
+
+       flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
+       for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
+               policy.l_inodebits.bits = *bits & (1 << i);
+               if (policy.l_inodebits.bits == 0)
+                       continue;
+
+               if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
+                                 &policy, mode, &lockh)) {
+                       struct ldlm_lock *lock;
+
+                       lock = ldlm_handle2lock(&lockh);
+                       if (lock) {
+                               *bits &=
+                                     ~(lock->l_policy_data.l_inodebits.bits);
+                               LDLM_LOCK_PUT(lock);
+                       } else {
+                               *bits &= ~policy.l_inodebits.bits;
+                       }
+               }
+       }
+       RETURN(*bits == 0);
+}
+
+ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+                           struct lustre_handle *lockh, __u64 flags)
+{
+       ldlm_policy_data_t policy = { .l_inodebits = {bits}};
+       struct lu_fid *fid;
+       ldlm_mode_t rc;
+       ENTRY;
+
+       fid = &ll_i2info(inode)->lli_fid;
+       CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
+
+       rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
+                          fid, LDLM_IBITS, &policy,
+                          LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
+       RETURN(rc);
+}
+
+static int ll_inode_revalidate_fini(struct inode *inode, int rc)
+{
+       /* Already unlinked. Just update nlink and return success */
+       if (rc == -ENOENT) {
+               clear_nlink(inode);
+               /* This path cannot be hit for regular files except in
+                * the case of obscure races, so there is no need to
+                * validate the size. */
+               if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+                       return 0;
+       } else if (rc != 0) {
+               CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
+                      ll_get_fsname(inode->i_sb, NULL, 0),
+                      PFID(ll_inode2fid(inode)), rc);
+       }
+
+       return rc;
+}
+
+int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+                            __u64 ibits)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ptlrpc_request *req = NULL;
+       struct obd_export *exp;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(inode != NULL);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
+              inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
+
+       exp = ll_i2mdexp(inode);
+
+       /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPCs.
+        *      But in the CMD case it caused some lock issues; this should be
+        *      fixed with the new CMD ibits lock. See bug 12718 */
+       if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
+               struct lookup_intent oit = { .it_op = IT_GETATTR };
+               struct md_op_data *op_data;
+
+               if (ibits == MDS_INODELOCK_LOOKUP)
+                       oit.it_op = IT_LOOKUP;
+
+               /* Call getattr by fid, so do not provide name at all. */
+               op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
+                                            dentry->d_inode, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, NULL);
+               if (IS_ERR(op_data))
+                       RETURN(PTR_ERR(op_data));
+
+               oit.it_create_mode |= M_CHECK_STALE;
+               rc = md_intent_lock(exp, op_data, NULL, 0,
+                                   /* we are not interested in name
+                                      based lookup */
+                                   &oit, 0, &req,
+                                   ll_md_blocking_ast, 0);
+               ll_finish_md_op_data(op_data);
+               oit.it_create_mode &= ~M_CHECK_STALE;
+               if (rc < 0) {
+                       rc = ll_inode_revalidate_fini(inode, rc);
+                       GOTO (out, rc);
+               }
+
+               rc = ll_revalidate_it_finish(req, &oit, dentry);
+               if (rc != 0) {
+                       ll_intent_release(&oit);
+                       GOTO(out, rc);
+               }
+
+               /* Unlinked? Unhash the dentry so it is not picked up later by
+                  do_lookup() -> ll_revalidate_it(). We cannot use d_drop
+                  here, in order to preserve get_cwd functionality on 2.6.
+                  Bug 10503 */
+               if (!dentry->d_inode->i_nlink)
+                       d_lustre_invalidate(dentry);
+
+               ll_lookup_finish_locks(&oit, dentry);
+       } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
+               struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+               obd_valid valid = OBD_MD_FLGETATTR;
+               struct md_op_data *op_data;
+               int ealen = 0;
+
+               if (S_ISREG(inode->i_mode)) {
+                       rc = ll_get_max_mdsize(sbi, &ealen);
+                       if (rc)
+                               RETURN(rc);
+                       valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
+               }
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+                                            0, ealen, LUSTRE_OPC_ANY,
+                                            NULL);
+               if (IS_ERR(op_data))
+                       RETURN(PTR_ERR(op_data));
+
+               op_data->op_valid = valid;
+               /* When OBD_CONNECT_ATTRFID is not supported, we cannot find a
+                * capa for this inode, because we only keep the capas of
+                * directories fresh. */
+               rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+               ll_finish_md_op_data(op_data);
+               if (rc) {
+                       rc = ll_inode_revalidate_fini(inode, rc);
+                       RETURN(rc);
+               }
+
+               rc = ll_prep_inode(&inode, req, NULL, NULL);
+       }
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+                          __u64 ibits)
+{
+       struct inode *inode = dentry->d_inode;
+       int rc;
+       ENTRY;
+
+       rc = __ll_inode_revalidate_it(dentry, it, ibits);
+       if (rc != 0)
+               RETURN(rc);
+
+       /* if object isn't regular file, don't validate size */
+       if (!S_ISREG(inode->i_mode)) {
+               LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
+               LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
+               LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
+       } else {
+               rc = ll_glimpse_size(inode);
+       }
+       RETURN(rc);
+}
+
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+                 struct lookup_intent *it, struct kstat *stat)
+{
+       struct inode *inode = de->d_inode;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int res = 0;
+
+       res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
+                                            MDS_INODELOCK_LOOKUP);
+       ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
+
+       if (res)
+               return res;
+
+       stat->dev = inode->i_sb->s_dev;
+       if (ll_need_32bit_api(sbi))
+               stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
+       else
+               stat->ino = inode->i_ino;
+       stat->mode = inode->i_mode;
+       stat->nlink = inode->i_nlink;
+       stat->uid = inode->i_uid;
+       stat->gid = inode->i_gid;
+       stat->rdev = inode->i_rdev;
+       stat->atime = inode->i_atime;
+       stat->mtime = inode->i_mtime;
+       stat->ctime = inode->i_ctime;
+       stat->blksize = 1 << inode->i_blkbits;
+
+       stat->size = i_size_read(inode);
+       stat->blocks = inode->i_blocks;
+
+       return 0;
+}
+
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+{
+       struct lookup_intent it = { .it_op = IT_GETATTR };
+
+       return ll_getattr_it(mnt, de, &it, stat);
+}
+
+struct posix_acl *ll_get_acl(struct inode *inode, int type)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct posix_acl *acl = NULL;
+       ENTRY;
+
+       spin_lock(&lli->lli_lock);
+       /* VFS' acl_permission_check->check_acl will release the refcount */
+       acl = posix_acl_dup(lli->lli_posix_acl);
+       spin_unlock(&lli->lli_lock);
+
+       RETURN(acl);
+}
+
+
+int ll_inode_permission(struct inode *inode, int mask)
+{
+       int rc = 0;
+       ENTRY;
+
+#ifdef MAY_NOT_BLOCK
+       if (mask & MAY_NOT_BLOCK)
+               return -ECHILD;
+#endif
+
+       /* As the root inode is NOT validated during lookup operations, we
+        * need to do it here before the permission check. */
+
+       if (inode == inode->i_sb->s_root->d_inode) {
+               struct lookup_intent it = { .it_op = IT_LOOKUP };
+
+               rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
+                                             MDS_INODELOCK_LOOKUP);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
+              inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
+
+       if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
+               return lustre_check_remote_perm(inode, mask);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
+       /* ACLs are checked through the ->get_acl hook (ll_get_acl above) */
+       rc = generic_permission(inode, mask);
+
+       RETURN(rc);
+}
+
+#define READ_METHOD aio_read
+#define READ_FUNCTION ll_file_aio_read
+#define WRITE_METHOD aio_write
+#define WRITE_FUNCTION ll_file_aio_write
+
+/* -o localflock - only provides locally consistent flock locks */
+struct file_operations ll_file_operations = {
+       .read      = ll_file_read,
+       .READ_METHOD    = READ_FUNCTION,
+       .write    = ll_file_write,
+       .WRITE_METHOD   = WRITE_FUNCTION,
+       .unlocked_ioctl = ll_file_ioctl,
+       .open      = ll_file_open,
+       .release        = ll_file_release,
+       .mmap      = ll_file_mmap,
+       .llseek  = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync    = ll_fsync,
+       .flush    = ll_flush
+};
+
+struct file_operations ll_file_operations_flock = {
+       .read      = ll_file_read,
+       .READ_METHOD    = READ_FUNCTION,
+       .write    = ll_file_write,
+       .WRITE_METHOD   = WRITE_FUNCTION,
+       .unlocked_ioctl = ll_file_ioctl,
+       .open      = ll_file_open,
+       .release        = ll_file_release,
+       .mmap      = ll_file_mmap,
+       .llseek  = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync    = ll_fsync,
+       .flush    = ll_flush,
+       .flock    = ll_file_flock,
+       .lock      = ll_file_flock
+};
+
+/* These are for -o noflock - to return ENOSYS on flock calls */
+struct file_operations ll_file_operations_noflock = {
+       .read      = ll_file_read,
+       .READ_METHOD    = READ_FUNCTION,
+       .write    = ll_file_write,
+       .WRITE_METHOD   = WRITE_FUNCTION,
+       .unlocked_ioctl = ll_file_ioctl,
+       .open      = ll_file_open,
+       .release        = ll_file_release,
+       .mmap      = ll_file_mmap,
+       .llseek  = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync    = ll_fsync,
+       .flush    = ll_flush,
+       .flock    = ll_file_noflock,
+       .lock      = ll_file_noflock
+};
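
The three file_operations tables above differ only in their flock/lock handlers and correspond to the mount options named in the comments (-o localflock, -o flock, -o noflock). A minimal sketch of how an open path might choose among them; the LL_SBI_FLOCK and LL_SBI_LOCALFLOCK flag names below are assumptions and are not taken from this patch:

/*
 * Editorial sketch, not part of the patch: selecting a file_operations
 * table from the flock-related mount flags (flag names assumed).
 */
static const struct file_operations *
ll_pick_file_operations(struct ll_sb_info *sbi)
{
	if (sbi->ll_flags & LL_SBI_FLOCK)	/* -o flock: cluster-wide locks */
		return &ll_file_operations_flock;
	if (sbi->ll_flags & LL_SBI_LOCALFLOCK)	/* -o localflock: local-only */
		return &ll_file_operations;
	return &ll_file_operations_noflock;	/* -o noflock: flock() -> ENOSYS */
}
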
+
+struct inode_operations ll_file_inode_operations = {
+       .setattr        = ll_setattr,
+       .getattr        = ll_getattr,
+       .permission     = ll_inode_permission,
+       .setxattr       = ll_setxattr,
+       .getxattr       = ll_getxattr,
+       .listxattr      = ll_listxattr,
+       .removexattr    = ll_removexattr,
+       .get_acl        = ll_get_acl,
+};
+
+/* dynamic ioctl number support routines */
+static struct llioc_ctl_data {
+       struct rw_semaphore     ioc_sem;
+       struct list_head              ioc_head;
+} llioc = {
+       __RWSEM_INITIALIZER(llioc.ioc_sem),
+       LIST_HEAD_INIT(llioc.ioc_head)
+};
+
+
+struct llioc_data {
+       struct list_head              iocd_list;
+       unsigned int        iocd_size;
+       llioc_callback_t        iocd_cb;
+       unsigned int        iocd_count;
+       unsigned int        iocd_cmd[0];
+};
+
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
+{
+       unsigned int size;
+       struct llioc_data *in_data = NULL;
+       ENTRY;
+
+       if (cb == NULL || cmd == NULL ||
+           count > LLIOC_MAX_CMD || count < 0)
+               RETURN(NULL);
+
+       size = sizeof(*in_data) + count * sizeof(unsigned int);
+       OBD_ALLOC(in_data, size);
+       if (in_data == NULL)
+               RETURN(NULL);
+
+       memset(in_data, 0, sizeof(*in_data));
+       in_data->iocd_size = size;
+       in_data->iocd_cb = cb;
+       in_data->iocd_count = count;
+       memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
+
+       down_write(&llioc.ioc_sem);
+       list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
+       up_write(&llioc.ioc_sem);
+
+       RETURN(in_data);
+}
+
+void ll_iocontrol_unregister(void *magic)
+{
+       struct llioc_data *tmp;
+
+       if (magic == NULL)
+               return;
+
+       down_write(&llioc.ioc_sem);
+       list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
+               if (tmp == magic) {
+                       unsigned int size = tmp->iocd_size;
+
+                       list_del(&tmp->iocd_list);
+                       up_write(&llioc.ioc_sem);
+
+                       OBD_FREE(tmp, size);
+                       return;
+               }
+       }
+       up_write(&llioc.ioc_sem);
+
+       CWARN("didn't find iocontrol register block with magic: %p\n", magic);
+}
+
+EXPORT_SYMBOL(ll_iocontrol_register);
+EXPORT_SYMBOL(ll_iocontrol_unregister);
+
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+                       unsigned int cmd, unsigned long arg, int *rcp)
+{
+       enum llioc_iter ret = LLIOC_CONT;
+       struct llioc_data *data;
+       int rc = -EINVAL, i;
+
+       down_read(&llioc.ioc_sem);
+       list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
+               for (i = 0; i < data->iocd_count; i++) {
+                       if (cmd != data->iocd_cmd[i])
+                               continue;
+
+                       ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
+                       break;
+               }
+
+               if (ret == LLIOC_STOP)
+                       break;
+       }
+       up_read(&llioc.ioc_sem);
+
+       if (rcp)
+               *rcp = rc;
+       return ret;
+}
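
ll_iocontrol_register()/ll_iocontrol_unregister() let other Lustre modules plug extra ioctl numbers into the llite ioctl path, and ll_iocontrol_call() dispatches to the first registered block that claims the command. A hedged usage sketch follows; the llioc_callback_t typedef is not visible in this hunk, so the callback signature is inferred from the iocd_cb() invocation above, and the command numbers and names are made up:

/* Hypothetical handler; signature inferred from how iocd_cb is called above. */
static enum llioc_iter example_ioc_cb(struct inode *inode, struct file *file,
				      unsigned int cmd, unsigned long arg,
				      void *data, int *rcp)
{
	*rcp = 0;		/* result reported back via ll_iocontrol_call() */
	return LLIOC_STOP;	/* stop iterating, the command was handled */
}

static unsigned int example_cmds[] = { 0xC0DE0001, 0xC0DE0002 };  /* made up */
static void *example_magic;

static int __init example_init(void)
{
	example_magic = ll_iocontrol_register(example_ioc_cb,
					      ARRAY_SIZE(example_cmds),
					      example_cmds);
	return example_magic ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	ll_iocontrol_unregister(example_magic);
}
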
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct cl_env_nest nest;
+       struct lu_env *env;
+       int result;
+       ENTRY;
+
+       if (lli->lli_clob == NULL)
+               RETURN(0);
+
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       result = cl_conf_set(env, lli->lli_clob, conf);
+       cl_env_nested_put(&nest, env);
+
+       if (conf->coc_opc == OBJECT_CONF_SET) {
+               struct ldlm_lock *lock = conf->coc_lock;
+
+               LASSERT(lock != NULL);
+               LASSERT(ldlm_has_layout(lock));
+               if (result == 0) {
+                       /* The lock can only be allowed to match after the
+                        * layout is applied to the inode, otherwise a wrong
+                        * layout would be seen. Applying the layout should
+                        * happen before dropping the intent lock. */
+                       ldlm_lock_allow_match(lock);
+               }
+       }
+       RETURN(result);
+}
+
+/* Fetch layout from MDT with getxattr request, if it's not ready yet */
+static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct obd_capa *oc;
+       struct ptlrpc_request *req;
+       struct mdt_body *body;
+       void *lvbdata;
+       void *lmm;
+       int lmmsize;
+       int rc;
+       ENTRY;
+
+       if (lock->l_lvb_data != NULL)
+               RETURN(0);
+
+       /* If the layout lock was granted right away, the layout is returned
+        * within the DLM_LVB of the DLM reply; otherwise, if the lock was
+        * blocked and then granted via a completion AST, we have to fetch the
+        * layout here. Note that we cannot use the LVB buffer in the
+        * completion AST because it is not large enough. */
+       oc = ll_mdscapa_get(inode);
+       rc = ll_get_max_mdsize(sbi, &lmmsize);
+       if (rc == 0)
+               rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+                               OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
+                               lmmsize, 0, &req);
+       capa_put(oc);
+       if (rc < 0)
+               RETURN(rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL || body->eadatasize > lmmsize)
+               GOTO(out, rc = -EPROTO);
+
+       lmmsize = body->eadatasize;
+       if (lmmsize == 0) /* empty layout */
+               GOTO(out, rc = 0);
+
+       lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
+       if (lmm == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       OBD_ALLOC_LARGE(lvbdata, lmmsize);
+       if (lvbdata == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       memcpy(lvbdata, lmm, lmmsize);
+       lock_res_and_lock(lock);
+       if (lock->l_lvb_data == NULL) {
+               lock->l_lvb_data = lvbdata;
+               lock->l_lvb_len = lmmsize;
+               lvbdata = NULL;
+       }
+       unlock_res_and_lock(lock);
+
+       if (lvbdata != NULL)
+               OBD_FREE_LARGE(lvbdata, lmmsize);
+       EXIT;
+
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+/**
+ * Apply the layout to the inode. Layout lock is held and will be released
+ * in this function.
+ */
+static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
+                               struct inode *inode, __u32 *gen, bool reconf)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info    *sbi = ll_i2sbi(inode);
+       struct ldlm_lock *lock;
+       struct lustre_md md = { NULL };
+       struct cl_object_conf conf;
+       int rc = 0;
+       bool lvb_ready;
+       bool wait_layout = false;
+       ENTRY;
+
+       LASSERT(lustre_handle_is_used(lockh));
+
+       lock = ldlm_handle2lock(lockh);
+       LASSERT(lock != NULL);
+       LASSERT(ldlm_has_layout(lock));
+
+       LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
+               inode, PFID(&lli->lli_fid), reconf);
+
+       lock_res_and_lock(lock);
+       lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
+       unlock_res_and_lock(lock);
+       /* Checking lvb_ready is racy, but that is okay: the worst case is
+        * that multiple processes configure the file at the same time. */
+       if (lvb_ready || !reconf) {
+               rc = -ENODATA;
+               if (lvb_ready) {
+                       /* layout_gen must be valid if the layout lock is not
+                        * cancelled and the stripe has already been set */
+                       *gen = lli->lli_layout_gen;
+                       rc = 0;
+               }
+               GOTO(out, rc);
+       }
+
+       rc = ll_layout_fetch(inode, lock);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* for layout lock, lmm is returned in lock's lvb.
+        * lvb_data is immutable if the lock is held so it's safe to access it
+        * without res lock. See the description in ldlm_lock_decref_internal()
+        * for the condition to free lvb_data of layout lock */
+       if (lock->l_lvb_data != NULL) {
+               rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
+                                 lock->l_lvb_data, lock->l_lvb_len);
+               if (rc >= 0) {
+                       *gen = LL_LAYOUT_GEN_EMPTY;
+                       if (md.lsm != NULL)
+                               *gen = md.lsm->lsm_layout_gen;
+                       rc = 0;
+               } else {
+                       CERROR("%s: file "DFID" unpackmd error: %d\n",
+                               ll_get_fsname(inode->i_sb, NULL, 0),
+                               PFID(&lli->lli_fid), rc);
+               }
+       }
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* Set the layout on the file. This is unlikely to fail as the old
+        * layout has surely been eliminated. */
+       memset(&conf, 0, sizeof conf);
+       conf.coc_opc = OBJECT_CONF_SET;
+       conf.coc_inode = inode;
+       conf.coc_lock = lock;
+       conf.u.coc_md = &md;
+       rc = ll_layout_conf(inode, &conf);
+
+       if (md.lsm != NULL)
+               obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+
+       /* refresh layout failed, need to wait */
+       wait_layout = rc == -EBUSY;
+       EXIT;
+
+out:
+       LDLM_LOCK_PUT(lock);
+       ldlm_lock_decref(lockh, mode);
+
+       /* wait for IO to complete if it's still being used. */
+       if (wait_layout) {
+               CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
+                       ll_get_fsname(inode->i_sb, NULL, 0),
+                       inode, PFID(&lli->lli_fid));
+
+               memset(&conf, 0, sizeof conf);
+               conf.coc_opc = OBJECT_CONF_WAIT;
+               conf.coc_inode = inode;
+               rc = ll_layout_conf(inode, &conf);
+               if (rc == 0)
+                       rc = -EAGAIN;
+
+               CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
+                       PFID(&lli->lli_fid), rc);
+       }
+       RETURN(rc);
+}
+
+/**
+ * This function checks if there exists a LAYOUT lock on the client side,
+ * or enqueues it if it doesn't have one in cache.
+ *
+ * This function does not hold the layout lock, so the lock may be revoked
+ * any time after this function returns. Any operation that depends on the
+ * layout should be redone in that case.
+ *
+ * This function should be called before lov_io_init() to get an up-to-date
+ * layout version; the caller should save the version number, and once IO
+ * has finished call this function again to verify that the layout has not
+ * changed in the meantime.
+ */
+int ll_layout_refresh(struct inode *inode, __u32 *gen)
+{
+       struct ll_inode_info  *lli = ll_i2info(inode);
+       struct ll_sb_info     *sbi = ll_i2sbi(inode);
+       struct md_op_data     *op_data;
+       struct lookup_intent   it;
+       struct lustre_handle   lockh;
+       ldlm_mode_t            mode;
+       struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
+                                          .ei_mode = LCK_CR,
+                                          .ei_cb_bl = ll_md_blocking_ast,
+                                          .ei_cb_cp = ldlm_completion_ast,
+                                          .ei_cbdata = NULL };
+       int rc;
+       ENTRY;
+
+       *gen = lli->lli_layout_gen;
+       if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+               RETURN(0);
+
+       /* sanity checks */
+       LASSERT(fid_is_sane(ll_inode2fid(inode)));
+       LASSERT(S_ISREG(inode->i_mode));
+
+       /* The layout lock is mostly cached on the local side, so try to
+        * match it before grabbing the layout lock mutex. */
+       mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
+       if (mode != 0) { /* hit cached lock */
+               rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
+               if (rc == 0)
+                       RETURN(0);
+
+               /* Better to hold lli_layout_mutex and try again, otherwise
+                * it may suffer from starvation. */
+       }
+
+       /* take layout lock mutex to enqueue layout lock exclusively. */
+       mutex_lock(&lli->lli_layout_mutex);
+
+again:
+       /* try again. Maybe somebody else has done this. */
+       mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
+       if (mode != 0) { /* hit cached lock */
+               rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+               if (rc == -EAGAIN)
+                       goto again;
+
+               mutex_unlock(&lli->lli_layout_mutex);
+               RETURN(rc);
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
+                       0, 0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data)) {
+               mutex_unlock(&lli->lli_layout_mutex);
+               RETURN(PTR_ERR(op_data));
+       }
+
+       /* have to enqueue one */
+       memset(&it, 0, sizeof(it));
+       it.it_op = IT_LAYOUT;
+       lockh.cookie = 0ULL;
+
+       LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
+                       ll_get_fsname(inode->i_sb, NULL, 0), inode,
+                       PFID(&lli->lli_fid));
+
+       rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
+                       NULL, 0, NULL, 0);
+       if (it.d.lustre.it_data != NULL)
+               ptlrpc_req_finished(it.d.lustre.it_data);
+       it.d.lustre.it_data = NULL;
+
+       ll_finish_md_op_data(op_data);
+
+       md_set_lock_data(sbi->ll_md_exp, &it.d.lustre.it_lock_handle, inode, NULL);
+
+       mode = it.d.lustre.it_lock_mode;
+       it.d.lustre.it_lock_mode = 0;
+       ll_intent_drop_lock(&it);
+
+       if (rc == 0) {
+               /* set lock data in case this is a new lock */
+               ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
+               rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+               if (rc == -EAGAIN)
+                       goto again;
+       }
+       mutex_unlock(&lli->lli_layout_mutex);
+
+       RETURN(rc);
+}
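
The comment block above ll_layout_refresh() describes the intended calling convention: fetch the layout generation before starting IO and verify it afterwards. A minimal sketch of that save-and-recheck pattern, assuming a hypothetical do_one_io() helper:

/* Editorial sketch of the pattern described above; do_one_io() is made up. */
static int do_one_io(struct inode *inode);	/* hypothetical IO helper */

static int example_io_with_layout_check(struct inode *inode)
{
	__u32 gen_before, gen_after;
	int rc;

	rc = ll_layout_refresh(inode, &gen_before);
	if (rc != 0)
		return rc;

	rc = do_one_io(inode);

	if (rc == 0) {
		rc = ll_layout_refresh(inode, &gen_after);
		if (rc == 0 && gen_after != gen_before)
			rc = -EAGAIN;	/* layout changed: redo the IO */
	}
	return rc;
}
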
diff --git a/drivers/staging/lustre/lustre/llite/llite_capa.c b/drivers/staging/lustre/lustre/llite/llite_capa.c
new file mode 100644 (file)
index 0000000..b6fd959
--- /dev/null
@@ -0,0 +1,661 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_capa.c
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/fs.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/file.h>
+#include <linux/kmod.h>
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/* for obd_capa.c_list, client capa might stay in three places:
+ * 1. ll_capa_list.
+ * 2. ll_idle_capas.
+ * 3. stand alone: just allocated.
+ */
+
+/* capas for oss writeback and those failed to renew */
+static LIST_HEAD(ll_idle_capas);
+static struct ptlrpc_thread ll_capa_thread;
+static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT];
+
+/* llite capa renewal timer */
+struct timer_list ll_capa_timer;
+/* for debug: indicate whether capa on llite is enabled or not */
+static atomic_t ll_capa_debug = ATOMIC_INIT(0);
+static unsigned long long ll_capa_renewed = 0;
+static unsigned long long ll_capa_renewal_noent = 0;
+static unsigned long long ll_capa_renewal_failed = 0;
+static unsigned long long ll_capa_renewal_retries = 0;
+
+static inline void update_capa_timer(struct obd_capa *ocapa, cfs_time_t expiry)
+{
+       if (cfs_time_before(expiry, ll_capa_timer.expires) ||
+           !timer_pending(&ll_capa_timer)) {
+               mod_timer(&ll_capa_timer, expiry);
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+                          "ll_capa_timer update: %lu/%lu by", expiry, jiffies);
+       }
+}
+
+static inline cfs_time_t capa_renewal_time(struct obd_capa *ocapa)
+{
+       return cfs_time_sub(ocapa->c_expiry,
+                           cfs_time_seconds(ocapa->c_capa.lc_timeout) / 2);
+}
+
+static inline int capa_is_to_expire(struct obd_capa *ocapa)
+{
+       return cfs_time_beforeq(capa_renewal_time(ocapa), cfs_time_current());
+}
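
capa_renewal_time() schedules renewal half a timeout before expiry, so a delayed renewal RPC still has the second half of the validity window in which to complete. For instance, a capa with an lc_timeout of 1800 seconds that expires at time T is queued for renewal at roughly T minus 900 seconds; a tiny illustration of the same arithmetic:

/* Illustration only: renewal point for a capa with a 1800 s timeout. */
static cfs_time_t example_renewal_point(cfs_time_t expiry)
{
	/* expiry - (1800 s / 2)  ==  expiry - 900 s */
	return cfs_time_sub(expiry, cfs_time_seconds(1800) / 2);
}
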
+
+static inline int have_expired_capa(void)
+{
+       struct obd_capa *ocapa = NULL;
+       int expired = 0;
+
+       /* if ll_capa_list has client capa to expire or ll_idle_capas has
+        * expired capa, return 1.
+        */
+       spin_lock(&capa_lock);
+       if (!list_empty(ll_capa_list)) {
+               ocapa = list_entry(ll_capa_list->next, struct obd_capa,
+                                      c_list);
+               expired = capa_is_to_expire(ocapa);
+               if (!expired)
+                       update_capa_timer(ocapa, capa_renewal_time(ocapa));
+       } else if (!list_empty(&ll_idle_capas)) {
+               ocapa = list_entry(ll_idle_capas.next, struct obd_capa,
+                                      c_list);
+               expired = capa_is_expired(ocapa);
+               if (!expired)
+                       update_capa_timer(ocapa, ocapa->c_expiry);
+       }
+       spin_unlock(&capa_lock);
+
+       if (expired)
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired");
+       return expired;
+}
+
+static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head)
+{
+       struct obd_capa *tmp;
+       struct list_head *before = NULL;
+
+       /* TODO: client capa is sorted by expiry, this could be optimized */
+       list_for_each_entry_reverse(tmp, head, c_list) {
+               if (cfs_time_aftereq(ocapa->c_expiry, tmp->c_expiry)) {
+                       before = &tmp->c_list;
+                       break;
+               }
+       }
+
+       LASSERT(&ocapa->c_list != before);
+       list_add(&ocapa->c_list, before ?: head);
+}
+
+static inline int obd_capa_open_count(struct obd_capa *oc)
+{
+       struct ll_inode_info *lli = ll_i2info(oc->u.cli.inode);
+       return atomic_read(&lli->lli_open_count);
+}
+
+static void ll_delete_capa(struct obd_capa *ocapa)
+{
+       struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode);
+
+       if (capa_for_mds(&ocapa->c_capa)) {
+               LASSERT(lli->lli_mds_capa == ocapa);
+               lli->lli_mds_capa = NULL;
+       } else if (capa_for_oss(&ocapa->c_capa)) {
+               list_del_init(&ocapa->u.cli.lli_list);
+       }
+
+       DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client");
+       list_del_init(&ocapa->c_list);
+       capa_count[CAPA_SITE_CLIENT]--;
+       /* release the ref taken at allocation time */
+       capa_put(ocapa);
+}
+
+/* Three places where a client capa is deleted:
+ * 1. capa_thread_main(), the main place to delete expired capas.
+ * 2. ll_clear_inode_capas() in ll_clear_inode().
+ * 3. ll_truncate_free_capa(), which deletes the truncate capa explicitly in
+ *    ll_setattr_ost().
+ */
+static int capa_thread_main(void *unused)
+{
+       struct obd_capa *ocapa, *tmp, *next;
+       struct inode *inode = NULL;
+       struct l_wait_info lwi = { 0 };
+       int rc;
+       ENTRY;
+
+       thread_set_flags(&ll_capa_thread, SVC_RUNNING);
+       wake_up(&ll_capa_thread.t_ctl_waitq);
+
+       while (1) {
+               l_wait_event(ll_capa_thread.t_ctl_waitq,
+                            !thread_is_running(&ll_capa_thread) ||
+                            have_expired_capa(),
+                            &lwi);
+
+               if (!thread_is_running(&ll_capa_thread))
+                       break;
+
+               next = NULL;
+
+               spin_lock(&capa_lock);
+               list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) {
+                       __u64 ibits;
+
+                       LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC);
+
+                       if (!capa_is_to_expire(ocapa)) {
+                               next = ocapa;
+                               break;
+                       }
+
+                       list_del_init(&ocapa->c_list);
+
+                       /* for MDS capability, only renew those which belong to
+                        * dir, or its inode is opened, or client holds LOOKUP
+                        * lock.
+                        */
+                       /* ibits may be changed by ll_have_md_lock() so we have
+                        * to set it each time */
+                       ibits = MDS_INODELOCK_LOOKUP;
+                       if (capa_for_mds(&ocapa->c_capa) &&
+                           !S_ISDIR(ocapa->u.cli.inode->i_mode) &&
+                           obd_capa_open_count(ocapa) == 0 &&
+                           !ll_have_md_lock(ocapa->u.cli.inode,
+                                            &ibits, LCK_MINMODE)) {
+                               DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+                                          "skip renewal for");
+                               sort_add_capa(ocapa, &ll_idle_capas);
+                               continue;
+                       }
+
+                       /* for OSS capability, only renew those whose inode is
+                        * opened.
+                        */
+                       if (capa_for_oss(&ocapa->c_capa) &&
+                           obd_capa_open_count(ocapa) == 0) {
+                               /* an OSS capa with open count == 0 won't be
+                                * renewed, move it to the idle list */
+                               sort_add_capa(ocapa, &ll_idle_capas);
+                               continue;
+                       }
+
+                       /* NB iput() is in ll_update_capa() */
+                       inode = igrab(ocapa->u.cli.inode);
+                       if (inode == NULL) {
+                               DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+                                          "igrab failed for");
+                               continue;
+                       }
+
+                       capa_get(ocapa);
+                       ll_capa_renewed++;
+                       spin_unlock(&capa_lock);
+                       rc = md_renew_capa(ll_i2mdexp(inode), ocapa,
+                                          ll_update_capa);
+                       spin_lock(&capa_lock);
+                       if (rc) {
+                               DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+                                          "renew failed: %d", rc);
+                               ll_capa_renewal_failed++;
+                       }
+               }
+
+               if (next)
+                       update_capa_timer(next, capa_renewal_time(next));
+
+               list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas,
+                                            c_list) {
+                       if (!capa_is_expired(ocapa)) {
+                               if (!next)
+                                       update_capa_timer(ocapa,
+                                                         ocapa->c_expiry);
+                               break;
+                       }
+
+                       if (atomic_read(&ocapa->c_refc) > 1) {
+                               DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+                                          "expired(c_refc %d), don't release",
+                                          atomic_read(&ocapa->c_refc));
+                               /* don't try to renew any more */
+                               list_del_init(&ocapa->c_list);
+                               continue;
+                       }
+
+                       /* expired capa is released. */
+                       DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired");
+                       ll_delete_capa(ocapa);
+               }
+
+               spin_unlock(&capa_lock);
+       }
+
+       thread_set_flags(&ll_capa_thread, SVC_STOPPED);
+       wake_up(&ll_capa_thread.t_ctl_waitq);
+       RETURN(0);
+}
+
+void ll_capa_timer_callback(unsigned long unused)
+{
+       wake_up(&ll_capa_thread.t_ctl_waitq);
+}
+
+int ll_capa_thread_start(void)
+{
+       task_t *task;
+       ENTRY;
+
+       init_waitqueue_head(&ll_capa_thread.t_ctl_waitq);
+
+       task = kthread_run(capa_thread_main, NULL, "ll_capa");
+       if (IS_ERR(task)) {
+               CERROR("cannot start expired capa thread: rc %ld\n",
+                       PTR_ERR(task));
+               RETURN(PTR_ERR(task));
+       }
+       wait_event(ll_capa_thread.t_ctl_waitq,
+                      thread_is_running(&ll_capa_thread));
+
+       RETURN(0);
+}
+
+void ll_capa_thread_stop(void)
+{
+       thread_set_flags(&ll_capa_thread, SVC_STOPPING);
+       wake_up(&ll_capa_thread.t_ctl_waitq);
+       wait_event(ll_capa_thread.t_ctl_waitq,
+                      thread_is_stopped(&ll_capa_thread));
+}
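
ll_capa_timer_callback() only wakes the renewal thread's wait queue; ll_capa_thread_start() and ll_capa_thread_stop() bracket its lifetime. A hedged sketch of how llite setup and teardown code presumably ties the timer declared above and the thread together (the wrapper names are hypothetical; the real call sites are outside this hunk):

/* Hypothetical wiring of ll_capa_timer and the capa renewal thread. */
static int example_capa_setup(void)
{
	int rc;

	setup_timer(&ll_capa_timer, ll_capa_timer_callback, 0);
	rc = ll_capa_thread_start();
	if (rc)
		del_timer_sync(&ll_capa_timer);
	return rc;
}

static void example_capa_teardown(void)
{
	del_timer_sync(&ll_capa_timer);	/* stop rearming before the thread exits */
	ll_capa_thread_stop();
}
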
+
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *ocapa;
+       int found = 0;
+
+       ENTRY;
+
+       if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0)
+               RETURN(NULL);
+
+       LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW ||
+               opc == CAPA_OPC_OSS_TRUNC);
+
+       spin_lock(&capa_lock);
+       list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
+               if (capa_is_expired(ocapa))
+                       continue;
+               if ((opc & CAPA_OPC_OSS_WRITE) &&
+                   capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) {
+                       found = 1;
+                       break;
+               } else if ((opc & CAPA_OPC_OSS_READ) &&
+                          capa_opc_supported(&ocapa->c_capa,
+                                             CAPA_OPC_OSS_READ)) {
+                       found = 1;
+                       break;
+               } else if ((opc & CAPA_OPC_OSS_TRUNC) &&
+                          capa_opc_supported(&ocapa->c_capa, opc)) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (found) {
+               LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
+                                 ll_inode2fid(inode)));
+               LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
+
+               capa_get(ocapa);
+
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
+       } else {
+               ocapa = NULL;
+
+               if (atomic_read(&ll_capa_debug)) {
+                       CERROR("no capability for "DFID" opc "LPX64"\n",
+                              PFID(&lli->lli_fid), opc);
+                       atomic_set(&ll_capa_debug, 0);
+               }
+       }
+       spin_unlock(&capa_lock);
+
+       RETURN(ocapa);
+}
+EXPORT_SYMBOL(ll_osscapa_get);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *ocapa;
+       ENTRY;
+
+       LASSERT(inode != NULL);
+
+       if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0)
+               RETURN(NULL);
+
+       spin_lock(&capa_lock);
+       ocapa = capa_get(lli->lli_mds_capa);
+       spin_unlock(&capa_lock);
+       if (!ocapa && atomic_read(&ll_capa_debug)) {
+               CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid));
+               atomic_set(&ll_capa_debug, 0);
+       }
+
+       RETURN(ocapa);
+}
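
Both ll_osscapa_get() and ll_mdscapa_get() return a referenced capa, or NULL when capabilities are disabled or none is cached, and capa_put() accepts NULL, so callers can follow the simple get/use/put pattern already seen in ll_layout_fetch() above. A minimal sketch:

/* Minimal get/use/put sketch, mirroring ll_layout_fetch() above. */
static void example_with_mds_capa(struct inode *inode)
{
	struct obd_capa *oc;

	oc = ll_mdscapa_get(inode);	/* may return NULL, that is fine */
	/* ... pass oc to an MD RPC such as md_getxattr() ... */
	capa_put(oc);			/* drop the reference taken above */
}
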
+
+static struct obd_capa *do_add_mds_capa(struct inode *inode,
+                                       struct obd_capa *ocapa)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *old = lli->lli_mds_capa;
+       struct lustre_capa *capa = &ocapa->c_capa;
+
+       if (!old) {
+               ocapa->u.cli.inode = inode;
+               lli->lli_mds_capa = ocapa;
+               capa_count[CAPA_SITE_CLIENT]++;
+
+               DEBUG_CAPA(D_SEC, capa, "add MDS");
+       } else {
+               spin_lock(&old->c_lock);
+               old->c_capa = *capa;
+               spin_unlock(&old->c_lock);
+
+               DEBUG_CAPA(D_SEC, capa, "update MDS");
+
+               capa_put(ocapa);
+               ocapa = old;
+       }
+       return ocapa;
+}
+
+static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *ocapa;
+
+       /* inside capa_lock */
+       list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
+               if ((capa_opc(&ocapa->c_capa) & opc) != opc)
+                       continue;
+
+               LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
+                                 ll_inode2fid(inode)));
+               LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
+
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
+               return ocapa;
+       }
+
+       return NULL;
+}
+
+static inline void inode_add_oss_capa(struct inode *inode,
+                                     struct obd_capa *ocapa)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *tmp;
+       struct list_head *next = NULL;
+
+       /* capa is sorted in lli_oss_capas so lookup can always find the
+        * latest one */
+       list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) {
+               if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) {
+                       next = &tmp->u.cli.lli_list;
+                       break;
+               }
+       }
+       LASSERT(&ocapa->u.cli.lli_list != next);
+       list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas);
+}
+
+static struct obd_capa *do_add_oss_capa(struct inode *inode,
+                                       struct obd_capa *ocapa)
+{
+       struct obd_capa *old;
+       struct lustre_capa *capa = &ocapa->c_capa;
+
+       LASSERTF(S_ISREG(inode->i_mode),
+                "inode has oss capa, but not regular file, mode: %d\n",
+                inode->i_mode);
+
+       /* FIXME: can't replace it so easily with fine-grained opc */
+       old = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY);
+       if (!old) {
+               ocapa->u.cli.inode = inode;
+               INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+               capa_count[CAPA_SITE_CLIENT]++;
+
+               DEBUG_CAPA(D_SEC, capa, "add OSS");
+       } else {
+               spin_lock(&old->c_lock);
+               old->c_capa = *capa;
+               spin_unlock(&old->c_lock);
+
+               DEBUG_CAPA(D_SEC, capa, "update OSS");
+
+               capa_put(ocapa);
+               ocapa = old;
+       }
+
+       inode_add_oss_capa(inode, ocapa);
+       return ocapa;
+}
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa)
+{
+       spin_lock(&capa_lock);
+       ocapa = capa_for_mds(&ocapa->c_capa) ? do_add_mds_capa(inode, ocapa) :
+                                              do_add_oss_capa(inode, ocapa);
+
+       /* the truncate capa won't be renewed */
+       if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) {
+               set_capa_expiry(ocapa);
+               list_del_init(&ocapa->c_list);
+               sort_add_capa(ocapa, ll_capa_list);
+
+               update_capa_timer(ocapa, capa_renewal_time(ocapa));
+       }
+
+       spin_unlock(&capa_lock);
+
+       atomic_set(&ll_capa_debug, 1);
+       return ocapa;
+}
+
+static inline void delay_capa_renew(struct obd_capa *oc, cfs_time_t delay)
+{
+       /* NB: set a fake expiry for this capa to prevent it from being
+        * renewed too soon */
+       oc->c_expiry = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay));
+}
+
+int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa)
+{
+       struct inode *inode = ocapa->u.cli.inode;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(ocapa);
+
+       if (IS_ERR(capa)) {
+               /* set error code */
+               rc = PTR_ERR(capa);
+               spin_lock(&capa_lock);
+               if (rc == -ENOENT) {
+                       DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+                                  "renewal canceled because object removed");
+                       ll_capa_renewal_noent++;
+               } else {
+                       ll_capa_renewal_failed++;
+
+                       /* a failed capa won't be renewed any longer, but on
+                        * -EIO the client might be doing recovery, so retry
+                        * in 2 minutes. */
+                       if (rc == -EIO && !capa_is_expired(ocapa)) {
+                               delay_capa_renew(ocapa, 120);
+                               DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+                                          "renewal failed: -EIO, "
+                                          "retry in 2 mins");
+                               ll_capa_renewal_retries++;
+                               GOTO(retry, rc);
+                       } else {
+                               DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+                                          "renewal failed(rc: %d) for", rc);
+                       }
+               }
+
+               list_del_init(&ocapa->c_list);
+               sort_add_capa(ocapa, &ll_idle_capas);
+               spin_unlock(&capa_lock);
+
+               capa_put(ocapa);
+               iput(inode);
+               RETURN(rc);
+       }
+
+       spin_lock(&ocapa->c_lock);
+       LASSERT(!memcmp(&ocapa->c_capa, capa,
+                       offsetof(struct lustre_capa, lc_opc)));
+       ocapa->c_capa = *capa;
+       set_capa_expiry(ocapa);
+       spin_unlock(&ocapa->c_lock);
+
+       spin_lock(&capa_lock);
+       if (capa_for_oss(capa))
+               inode_add_oss_capa(inode, ocapa);
+       DEBUG_CAPA(D_SEC, capa, "renew");
+       EXIT;
+retry:
+       list_del_init(&ocapa->c_list);
+       sort_add_capa(ocapa, ll_capa_list);
+       update_capa_timer(ocapa, capa_renewal_time(ocapa));
+       spin_unlock(&capa_lock);
+
+       capa_put(ocapa);
+       iput(inode);
+       return rc;
+}
+
+void ll_capa_open(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+
+       if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+           == 0)
+               return;
+
+       if (!S_ISREG(inode->i_mode))
+               return;
+
+       atomic_inc(&lli->lli_open_count);
+}
+
+void ll_capa_close(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+
+       if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+           == 0)
+               return;
+
+       if (!S_ISREG(inode->i_mode))
+               return;
+
+       atomic_dec(&lli->lli_open_count);
+}
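
ll_capa_open() and ll_capa_close() only maintain lli_open_count; the renewal thread reads it through obd_capa_open_count() to decide whether an OSS capa (or a non-directory MDS capa) is still worth renewing. A hedged sketch of the pairing, which presumably lives in the file open and release paths defined elsewhere:

/* Hypothetical call sites; the real open/release paths are in other hunks. */
static int example_file_open(struct inode *inode, struct file *file)
{
	ll_capa_open(inode);	/* count this open so its capas keep renewing */
	return 0;
}

static int example_file_release(struct inode *inode, struct file *file)
{
	ll_capa_close(inode);	/* matching decrement when the file is closed */
	return 0;
}
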
+
+/* delete CAPA_OPC_OSS_TRUNC only */
+void ll_truncate_free_capa(struct obd_capa *ocapa)
+{
+       if (!ocapa)
+               return;
+
+       LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC);
+       DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate");
+
+       /* release the ref taken by the lookup that found this capa */
+       capa_put(ocapa);
+       if (likely(ocapa->c_capa.lc_opc == CAPA_OPC_OSS_TRUNC)) {
+               spin_lock(&capa_lock);
+               ll_delete_capa(ocapa);
+               spin_unlock(&capa_lock);
+       }
+}
+
+void ll_clear_inode_capas(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *ocapa, *tmp;
+
+       spin_lock(&capa_lock);
+       ocapa = lli->lli_mds_capa;
+       if (ocapa)
+               ll_delete_capa(ocapa);
+
+       list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas,
+                                    u.cli.lli_list)
+               ll_delete_capa(ocapa);
+       spin_unlock(&capa_lock);
+}
+
+void ll_print_capa_stat(struct ll_sb_info *sbi)
+{
+       if (sbi->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+               LCONSOLE_INFO("Fid capabilities renewed: %llu\n"
+                             "Fid capabilities renewal ENOENT: %llu\n"
+                             "Fid capabilities failed to renew: %llu\n"
+                             "Fid capabilities renewal retries: %llu\n",
+                             ll_capa_renewed, ll_capa_renewal_noent,
+                             ll_capa_renewal_failed, ll_capa_renewal_retries);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_close.c b/drivers/staging/lustre/lustre/llite/llite_close.c
new file mode 100644 (file)
index 0000000..00b2b38
--- /dev/null
@@ -0,0 +1,412 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_close.c
+ *
+ * Lustre Lite routines to issue a secondary close after writeback
+ */
+
+#include <linux/module.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/** records that a write is in flight */
+void vvp_write_pending(struct ccc_object *club, struct ccc_page *page)
+{
+       struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+
+       ENTRY;
+       spin_lock(&lli->lli_lock);
+       lli->lli_flags |= LLIF_SOM_DIRTY;
+       if (page != NULL && list_empty(&page->cpg_pending_linkage))
+               list_add(&page->cpg_pending_linkage,
+                            &club->cob_pending_list);
+       spin_unlock(&lli->lli_lock);
+       EXIT;
+}
+
+/** records that a write has completed */
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page)
+{
+       struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+       int rc = 0;
+
+       ENTRY;
+       spin_lock(&lli->lli_lock);
+       if (page != NULL && !list_empty(&page->cpg_pending_linkage)) {
+               list_del_init(&page->cpg_pending_linkage);
+               rc = 1;
+       }
+       spin_unlock(&lli->lli_lock);
+       if (rc)
+               ll_queue_done_writing(club->cob_inode, 0);
+       EXIT;
+}
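
Per the docstrings above, vvp_write_pending() records a write in flight (marking the inode SOM-dirty) and vvp_write_complete() records its completion, queueing DONE_WRITING once the last pending page is done. A rough, hypothetical sketch of how a writeback path would pair the two calls:

/* Editorial sketch only: pending/complete pairing around page writeback. */
static void example_page_writeback(struct ccc_object *obj, struct ccc_page *pg)
{
	vvp_write_pending(obj, pg);	/* page dirtied: mark SOM dirty, track it */
	/* ... page is written out to the OST ... */
	vvp_write_complete(obj, pg);	/* last page done -> queue DONE_WRITING */
}
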
+
+/** Queues DONE_WRITING if
+ * - done writing is allowed;
+ * - the inode has no dirty pages. */
+void ll_queue_done_writing(struct inode *inode, unsigned long flags)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+       ENTRY;
+
+       spin_lock(&lli->lli_lock);
+       lli->lli_flags |= flags;
+
+       if ((lli->lli_flags & LLIF_DONE_WRITING) &&
+           list_empty(&club->cob_pending_list)) {
+               struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq;
+
+               if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+                       CWARN("ino %lu/%u(flags %u) som valid it just after "
+                             "recovery\n",
+                             inode->i_ino, inode->i_generation,
+                             lli->lli_flags);
+               /* DONE_WRITING is allowed and inode has no dirty page. */
+               spin_lock(&lcq->lcq_lock);
+
+               LASSERT(list_empty(&lli->lli_close_list));
+               CDEBUG(D_INODE, "adding inode %lu/%u to close list\n",
+                      inode->i_ino, inode->i_generation);
+               list_add_tail(&lli->lli_close_list, &lcq->lcq_head);
+
+               /* Avoid a concurrent insertion into the close thread queue:
+                * while an inode is already in the close thread, open(),
+                * write() and close() may happen and the epoch is closed with
+                * the inode marked LLIF_EPOCH_PENDING. Once the pages have
+                * been written the inode must not be inserted into the queue
+                * again, so clear this flag to avoid it. */
+               lli->lli_flags &= ~LLIF_DONE_WRITING;
+
+               wake_up(&lcq->lcq_waitq);
+               spin_unlock(&lcq->lcq_lock);
+       }
+       spin_unlock(&lli->lli_lock);
+       EXIT;
+}
+
+/** Pack SOM attributes into @op_data for the CLOSE/DONE_WRITING RPC. */
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       ENTRY;
+
+       op_data->op_flags |= MF_SOM_CHANGE;
+       /* Check if Size-on-MDS attributes are valid. */
+       if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+               CERROR("ino %lu/%u(flags %u) som valid it just after "
+                      "recovery\n", inode->i_ino, inode->i_generation,
+                      lli->lli_flags);
+
+       if (!cl_local_size(inode)) {
+               /* Send Size-on-MDS Attributes if valid. */
+               op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET |
+                               ATTR_ATIME_SET | ATTR_SIZE | ATTR_BLOCKS;
+       }
+       EXIT;
+}
+
+/** Closes the ioepoch and, if needed, packs the Size-on-MDS attributes into @op_data. */
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+                     struct obd_client_handle **och, unsigned long flags)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+       ENTRY;
+
+       spin_lock(&lli->lli_lock);
+       if (!(list_empty(&club->cob_pending_list))) {
+               if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) {
+                       LASSERT(*och != NULL);
+                       LASSERT(lli->lli_pending_och == NULL);
+                       /* Inode is dirty and there is no pending write done
+                        * request yet, DONE_WRITE is to be sent later. */
+                       lli->lli_flags |= LLIF_EPOCH_PENDING;
+                       lli->lli_pending_och = *och;
+                       spin_unlock(&lli->lli_lock);
+
+                       inode = igrab(inode);
+                       LASSERT(inode);
+                       GOTO(out, 0);
+               }
+               if (flags & LLIF_DONE_WRITING) {
+                       /* Some pages are still dirty, so it is too early to
+                        * send DONE_WRITE. Wait until all pages have been
+                        * flushed and try DONE_WRITE again later. */
+                       LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+                       lli->lli_flags |= LLIF_DONE_WRITING;
+                       spin_unlock(&lli->lli_lock);
+
+                       inode = igrab(inode);
+                       LASSERT(inode);
+                       GOTO(out, 0);
+               }
+       }
+       CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID"\n",
+              ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid));
+       op_data->op_flags |= MF_EPOCH_CLOSE;
+
+       if (flags & LLIF_DONE_WRITING) {
+               LASSERT(lli->lli_flags & LLIF_SOM_DIRTY);
+               LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+               *och = lli->lli_pending_och;
+               lli->lli_pending_och = NULL;
+               lli->lli_flags &= ~LLIF_EPOCH_PENDING;
+       } else {
+               /* Pack Size-on-MDS inode attributes only if they have changed */
+               if (!(lli->lli_flags & LLIF_SOM_DIRTY)) {
+                       spin_unlock(&lli->lli_lock);
+                       GOTO(out, 0);
+               }
+
+               /* There is a pending DONE_WRITE -- close epoch with no
+                * attribute change. */
+               if (lli->lli_flags & LLIF_EPOCH_PENDING) {
+                       spin_unlock(&lli->lli_lock);
+                       GOTO(out, 0);
+               }
+       }
+
+       LASSERT(list_empty(&club->cob_pending_list));
+       lli->lli_flags &= ~LLIF_SOM_DIRTY;
+       spin_unlock(&lli->lli_lock);
+       ll_done_writing_attr(inode, op_data);
+
+       EXIT;
+out:
+       return;
+}
+
+/**
+ * The client updates SOM attributes on the MDS (including llog cookies):
+ * obd_getattr with no lock, then md_setattr.
+ */
+int ll_som_update(struct inode *inode, struct md_op_data *op_data)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ptlrpc_request *request = NULL;
+       __u32 old_flags;
+       struct obdo *oa;
+       int rc;
+       ENTRY;
+
+       LASSERT(op_data != NULL);
+       if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+               CERROR("ino %lu/%u(flags %u) som valid it just after "
+                      "recovery\n", inode->i_ino, inode->i_generation,
+                      lli->lli_flags);
+
+       OBDO_ALLOC(oa);
+       if (!oa) {
+               CERROR("can't allocate memory for Size-on-MDS update.\n");
+               RETURN(-ENOMEM);
+       }
+
+       old_flags = op_data->op_flags;
+       op_data->op_flags = MF_SOM_CHANGE;
+
+       /* If inode is already in another epoch, skip getattr from OSTs. */
+       if (lli->lli_ioepoch == op_data->op_ioepoch) {
+               rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch,
+                                     old_flags & MF_GETATTR_LOCK);
+               if (rc) {
+                       oa->o_valid = 0;
+                       if (rc != -ENOENT)
+                               CERROR("inode_getattr failed (%d): unable to "
+                                      "send a Size-on-MDS attribute update "
+                                      "for inode %lu/%u\n", rc, inode->i_ino,
+                                      inode->i_generation);
+               } else {
+                       CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n",
+                              PFID(&lli->lli_fid));
+               }
+               /* Install attributes into op_data. */
+               md_from_obdo(op_data, oa, oa->o_valid);
+       }
+
+       rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data,
+                       NULL, 0, NULL, 0, &request, NULL);
+       ptlrpc_req_finished(request);
+
+       OBDO_FREE(oa);
+       RETURN(rc);
+}
+
+/**
+ * Closes the ioepoch and packs all the attributes into @op_data for
+ * DONE_WRITING rpc.
+ */
+static void ll_prepare_done_writing(struct inode *inode,
+                                   struct md_op_data *op_data,
+                                   struct obd_client_handle **och)
+{
+       ll_ioepoch_close(inode, op_data, och, LLIF_DONE_WRITING);
+       /* If there is no @och, we do not do D_W yet. */
+       if (*och == NULL)
+               return;
+
+       ll_pack_inode2opdata(inode, op_data, &(*och)->och_fh);
+       ll_prep_md_op_data(op_data, inode, NULL, NULL,
+                          0, 0, LUSTRE_OPC_ANY, NULL);
+}
+
+/** Send a DONE_WRITING rpc. */
+static void ll_done_writing(struct inode *inode)
+{
+       struct obd_client_handle *och = NULL;
+       struct md_op_data *op_data;
+       int rc;
+       ENTRY;
+
+       LASSERT(exp_connect_som(ll_i2mdexp(inode)));
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL) {
+               CERROR("can't allocate op_data\n");
+               EXIT;
+               return;
+       }
+
+       ll_prepare_done_writing(inode, op_data, &och);
+       /* If there is no @och, we do not do D_W yet. */
+       if (och == NULL)
+               GOTO(out, 0);
+
+       rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL);
+       if (rc == -EAGAIN) {
+               /* The MDS has instructed us to obtain the Size-on-MDS
+                * attributes from the OSTs and send a setattr back to the
+                * MDS. */
+               rc = ll_som_update(inode, op_data);
+       } else if (rc) {
+               CERROR("inode %lu mdc done_writing failed: rc = %d\n",
+                      inode->i_ino, rc);
+       }
+out:
+       ll_finish_md_op_data(op_data);
+       if (och) {
+               md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och);
+               OBD_FREE_PTR(och);
+       }
+       EXIT;
+}
+
+static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq)
+{
+       struct ll_inode_info *lli = NULL;
+
+       spin_lock(&lcq->lcq_lock);
+
+       if (!list_empty(&lcq->lcq_head)) {
+               lli = list_entry(lcq->lcq_head.next, struct ll_inode_info,
+                                    lli_close_list);
+               list_del_init(&lli->lli_close_list);
+       } else if (atomic_read(&lcq->lcq_stop))
+               lli = ERR_PTR(-EALREADY);
+
+       spin_unlock(&lcq->lcq_lock);
+       return lli;
+}
+
+static int ll_close_thread(void *arg)
+{
+       struct ll_close_queue *lcq = arg;
+       ENTRY;
+
+       complete(&lcq->lcq_comp);
+
+       while (1) {
+               struct l_wait_info lwi = { 0 };
+               struct ll_inode_info *lli;
+               struct inode *inode;
+
+               l_wait_event_exclusive(lcq->lcq_waitq,
+                                      (lli = ll_close_next_lli(lcq)) != NULL,
+                                      &lwi);
+               if (IS_ERR(lli))
+                       break;
+
+               inode = ll_info2i(lli);
+               CDEBUG(D_INFO, "done_writing for inode %lu/%u\n",
+                      inode->i_ino, inode->i_generation);
+               ll_done_writing(inode);
+               iput(inode);
+       }
+
+       CDEBUG(D_INFO, "ll_close exiting\n");
+       complete(&lcq->lcq_comp);
+       RETURN(0);
+}
+
+int ll_close_thread_start(struct ll_close_queue **lcq_ret)
+{
+       struct ll_close_queue *lcq;
+       task_t *task;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD))
+               return -EINTR;
+
+       OBD_ALLOC(lcq, sizeof(*lcq));
+       if (lcq == NULL)
+               return -ENOMEM;
+
+       spin_lock_init(&lcq->lcq_lock);
+       INIT_LIST_HEAD(&lcq->lcq_head);
+       init_waitqueue_head(&lcq->lcq_waitq);
+       init_completion(&lcq->lcq_comp);
+
+       task = kthread_run(ll_close_thread, lcq, "ll_close");
+       if (IS_ERR(task)) {
+               OBD_FREE(lcq, sizeof(*lcq));
+               return PTR_ERR(task);
+       }
+
+       wait_for_completion(&lcq->lcq_comp);
+       *lcq_ret = lcq;
+       return 0;
+}
+
+void ll_close_thread_shutdown(struct ll_close_queue *lcq)
+{
+       init_completion(&lcq->lcq_comp);
+       atomic_inc(&lcq->lcq_stop);
+       wake_up(&lcq->lcq_waitq);
+       wait_for_completion(&lcq->lcq_comp);
+       OBD_FREE(lcq, sizeof(*lcq));
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
new file mode 100644 (file)
index 0000000..177b4db
--- /dev/null
@@ -0,0 +1,1578 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LLITE_INTERNAL_H
+#define LLITE_INTERNAL_H
+#include <lustre_debug.h>
+#include <lustre_ver.h>
+#include <lustre_disk.h>  /* for s2sbi */
+#include <lustre_eacl.h>
+
+/* for struct cl_lock_descr and struct cl_io */
+#include <cl_object.h>
+#include <lclient.h>
+#include <lustre_mdc.h>
+#include <linux/lustre_intent.h>
+
+#ifndef FMODE_EXEC
+#define FMODE_EXEC 0
+#endif
+
+#ifndef VM_FAULT_RETRY
+#define VM_FAULT_RETRY 0
+#endif
+
+/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it.
+ * see kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */
+#ifndef LOOKUP_CONTINUE
+#define LOOKUP_CONTINUE LOOKUP_PARENT
+#endif
+
+/** Only used on client-side for indicating the tail of dir hash/offset. */
+#define LL_DIR_END_OFF   0x7fffffffffffffffULL
+#define LL_DIR_END_OFF_32BIT    0x7fffffffUL
+
+#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0")
+#define LUSTRE_FPRIVATE(file) ((file)->private_data)
+
+struct ll_dentry_data {
+       int                             lld_cwd_count;
+       int                             lld_mnt_count;
+       struct obd_client_handle        lld_cwd_och;
+       struct obd_client_handle        lld_mnt_och;
+       struct lookup_intent            *lld_it;
+       unsigned int                    lld_sa_generation;
+       unsigned int                    lld_invalid:1;
+       struct rcu_head                 lld_rcu_head;
+};
+
+#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
+
+extern struct file_operations ll_pgcache_seq_fops;
+
+#define LLI_INODE_MAGIC                 0x111d0de5
+#define LLI_INODE_DEAD           0xdeadd00d
+
+/* remote client permission cache */
+#define REMOTE_PERM_HASHSIZE 16
+
+struct ll_getname_data {
+       char        *lgd_name;      /* points to a buffer with NAME_MAX+1 size */
+       struct lu_fid    lgd_fid;       /* target fid we are looking for */
+       int           lgd_found;     /* inode matched? */
+};
+
+/* llite setxid/access permission for user on remote client */
+struct ll_remote_perm {
+       struct hlist_node       lrp_list;
+       uid_t              lrp_uid;
+       gid_t              lrp_gid;
+       uid_t              lrp_fsuid;
+       gid_t              lrp_fsgid;
+       int                  lrp_access_perm; /* MAY_READ/WRITE/EXEC, this
+                                                   is access permission with
+                                                   lrp_fsuid/lrp_fsgid. */
+};
+
+enum lli_flags {
+       /* MDS has an authority for the Size-on-MDS attributes. */
+       LLIF_MDS_SIZE_LOCK      = (1 << 0),
+       /* Epoch close is postponed. */
+       LLIF_EPOCH_PENDING      = (1 << 1),
+       /* DONE WRITING is allowed. */
+       LLIF_DONE_WRITING       = (1 << 2),
+       /* Size-on-MDS attributes are changed. An attribute update needs to
+        * be sent to MDS. */
+       LLIF_SOM_DIRTY    = (1 << 3),
+       /* File is contended */
+       LLIF_CONTENDED    = (1 << 4),
+       /* Truncate uses server lock for this file */
+       LLIF_SRVLOCK        = (1 << 5),
+       /* File data is modified. */
+       LLIF_DATA_MODIFIED      = (1 << 6),
+};
+
+struct ll_inode_info {
+       __u32                           lli_inode_magic;
+       __u32                           lli_flags;
+       __u64                           lli_ioepoch;
+
+       spinlock_t                      lli_lock;
+       struct posix_acl                *lli_posix_acl;
+
+       struct hlist_head               *lli_remote_perms;
+       struct mutex                            lli_rmtperm_mutex;
+
+       /* identifying fields for both metadata and data stacks. */
+       struct lu_fid              lli_fid;
+       /* Parent fid for accessing default stripe data on parent directory
+        * for allocating OST objects after a mknod() and later open-by-FID. */
+       struct lu_fid              lli_pfid;
+
+       struct list_head                      lli_close_list;
+       struct list_head                      lli_oss_capas;
+       /* open count is currently used by capability only; indicates whether
+        * the capability needs renewal */
+       atomic_t                    lli_open_count;
+       struct obd_capa         *lli_mds_capa;
+       cfs_time_t                    lli_rmtperm_time;
+
+       /* handle is to be sent to MDS later on done_writing and setattr.
+        * Open handle data are needed for the recovery to reconstruct
+        * the inode state on the MDS. XXX: recovery is not ready yet. */
+       struct obd_client_handle       *lli_pending_och;
+
+       /* We need all three because every inode may be opened in different
+        * modes */
+       struct obd_client_handle       *lli_mds_read_och;
+       struct obd_client_handle       *lli_mds_write_och;
+       struct obd_client_handle       *lli_mds_exec_och;
+       __u64                      lli_open_fd_read_count;
+       __u64                      lli_open_fd_write_count;
+       __u64                      lli_open_fd_exec_count;
+       /* Protects access to och pointers and their usage counters */
+       struct mutex                    lli_och_mutex;
+
+       struct inode                    lli_vfs_inode;
+
+       /* the most recent timestamps obtained from mds */
+       struct ost_lvb                  lli_lvb;
+       spinlock_t                      lli_agl_lock;
+
+       /* Try to keep the d::member and f::member fields aligned. Before
+        * using these members, make sure whether the inode is a directory
+        * or not. */
+       union {
+               /* for directory */
+               struct {
+                       /* serialize normal readdir and statahead-readdir. */
+                       struct mutex                    d_readdir_mutex;
+
+                       /* metadata statahead */
+                       /* since parent and child processes can share the same
+                        * @file struct, "opendir_key" is the token used at dir
+                        * close time for the case where the parent exits
+                        * before the child -- whoever holds the key should
+                        * clean up the dir statahead. */
+                       void                       *d_opendir_key;
+                       struct ll_statahead_info       *d_sai;
+                       struct posix_acl               *d_def_acl;
+                       /* protect statahead stuff. */
+                       spinlock_t                      d_sa_lock;
+                       /* "opendir_pid" is the token used at lookup/revalidate
+                        * time to identify the owner of the dir statahead. */
+                       pid_t                      d_opendir_pid;
+               } d;
+
+#define lli_readdir_mutex       u.d.d_readdir_mutex
+#define lli_opendir_key         u.d.d_opendir_key
+#define lli_sai                 u.d.d_sai
+#define lli_def_acl         u.d.d_def_acl
+#define lli_sa_lock         u.d.d_sa_lock
+#define lli_opendir_pid         u.d.d_opendir_pid
+
+               /* for non-directory */
+               struct {
+                       struct semaphore                f_size_sem;
+                       void                            *f_size_sem_owner;
+                       char                            *f_symlink_name;
+                       __u64                           f_maxbytes;
+                       /*
+                        * struct rw_semaphore {
+                        *    signed long       count;     // align d.d_def_acl
+                        *    spinlock_t        wait_lock; // align d.d_sa_lock
+                        *    struct list_head wait_list;
+                        * }
+                        */
+                       struct rw_semaphore             f_trunc_sem;
+                       struct mutex                    f_write_mutex;
+
+                       struct rw_semaphore             f_glimpse_sem;
+                       cfs_time_t                      f_glimpse_time;
+                       struct list_head                        f_agl_list;
+                       __u64                           f_agl_index;
+
+                       /* for writepage() only to communicate to fsync */
+                       int                             f_async_rc;
+
+                       /* the volatile-file check is based on the file name;
+                        * this flag caches the test result so the strcmp is
+                        * done only once
+                        */
+                       bool                            f_volatile;
+                       /*
+                        * whenever a process tries to read/write the file, its
+                        * jobid is saved here, and it will be packed into the
+                        * write RPC when the data is flushed later.
+                        *
+                        * hence the per-jobid read/write statistics will not be
+                        * accurate if the file is shared by different jobs.
+                        */
+                       char                 f_jobid[JOBSTATS_JOBID_SIZE];
+               } f;
+
+#define lli_size_sem       u.f.f_size_sem
+#define lli_size_sem_owner      u.f.f_size_sem_owner
+#define lli_symlink_name       u.f.f_symlink_name
+#define lli_maxbytes       u.f.f_maxbytes
+#define lli_trunc_sem     u.f.f_trunc_sem
+#define lli_write_mutex         u.f.f_write_mutex
+#define lli_glimpse_sem                u.f.f_glimpse_sem
+#define lli_glimpse_time       u.f.f_glimpse_time
+#define lli_agl_list           u.f.f_agl_list
+#define lli_agl_index          u.f.f_agl_index
+#define lli_async_rc           u.f.f_async_rc
+#define lli_jobid              u.f.f_jobid
+#define lli_volatile           u.f.f_volatile
+
+       } u;
+
+       /* XXX: The following members are used frequently. Although some of
+        *      them are only meaningful for non-directory objects, it wastes
+        *      time to check whether the object is a directory before using
+        *      them. On the other hand, since sizeof(f) > sizeof(d) at the
+        *      moment, moving those members into u.f would not reduce the
+        *      size of "ll_inode_info". So keep them outside the union.
+        *
+        *      In the future, if more members are added only for directories,
+        *      some of the following members can be moved into u.f.
+        */
+       bool                        lli_has_smd;
+       struct cl_object               *lli_clob;
+
+       /* mutex to request for layout lock exclusively. */
+       struct mutex                    lli_layout_mutex;
+       /* valid only inside LAYOUT ibits lock, protected by lli_layout_mutex */
+       __u32                           lli_layout_gen;
+};
+
+/*
+ * Locking to guarantee consistency of non-atomic updates to long long i_size,
+ * consistency between file size and KMS.
+ *
+ * Implemented by ->lli_size_sem and ->lsm_lock, nested in that order.
+ */
+
+void ll_inode_size_lock(struct inode *inode);
+void ll_inode_size_unlock(struct inode *inode);
+
+// FIXME: replace the name of this with LL_I to conform to kernel stuff
+// static inline struct ll_inode_info *LL_I(struct inode *inode)
+static inline struct ll_inode_info *ll_i2info(struct inode *inode)
+{
+       return container_of(inode, struct ll_inode_info, lli_vfs_inode);
+}
+
+/* default to about 40meg of readahead on a given system.  That much tied
+ * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */
+#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
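+/* The arithmetic behind the estimate above: 40 MB of outstanding readahead
+ * split into 512 KB requests is 80 requests in flight; if each request takes
+ * about 40 ms to service, the whole 40 MB window turns over every 40 ms,
+ * i.e. roughly 40 MB / 0.04 s = 1 GB/s. */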
+
+/* default to read-ahead full files smaller than 2MB on the second read */
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
+
+enum ra_stat {
+       RA_STAT_HIT = 0,
+       RA_STAT_MISS,
+       RA_STAT_DISTANT_READPAGE,
+       RA_STAT_MISS_IN_WINDOW,
+       RA_STAT_FAILED_GRAB_PAGE,
+       RA_STAT_FAILED_MATCH,
+       RA_STAT_DISCARDED,
+       RA_STAT_ZERO_LEN,
+       RA_STAT_ZERO_WINDOW,
+       RA_STAT_EOF,
+       RA_STAT_MAX_IN_FLIGHT,
+       RA_STAT_WRONG_GRAB_PAGE,
+       _NR_RA_STAT,
+};
+
+struct ll_ra_info {
+       atomic_t              ra_cur_pages;
+       unsigned long        ra_max_pages;
+       unsigned long        ra_max_pages_per_file;
+       unsigned long        ra_max_read_ahead_whole_pages;
+};
+
+/* ra_io_arg is filled at the beginning of ll_readahead with ras_lock held;
+ * ll_read_ahead_pages then reads RA pages according to this arg. All the
+ * items in this structure are counted by page index.
+ */
+struct ra_io_arg {
+       unsigned long ria_start;  /* start offset of read-ahead*/
+       unsigned long ria_end;    /* end offset of read-ahead*/
+       /* If a stride read pattern is detected, ria_stoff is where the
+        * stride read starts. Note: for normal read-ahead the value here
+        * is meaningless and is never accessed. */
+       pgoff_t ria_stoff;
+       /* ria_length and ria_pages are the stride length and the number of
+        * data pages within each stride in stride I/O mode. They are also
+        * used to check whether a read-ahead page belongs to stride I/O. */
+       unsigned long ria_length;
+       unsigned long ria_pages;
+};
+
+/* LL_HIST_MAX=32 causes an overflow */
+#define LL_HIST_MAX 28
+#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */
+#define LL_PROCESS_HIST_MAX 10
+struct per_process_info {
+       pid_t pid;
+       struct obd_histogram pp_r_hist;
+       struct obd_histogram pp_w_hist;
+};
+
+/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */
+struct ll_rw_extents_info {
+       struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1];
+};
+
+#define LL_OFFSET_HIST_MAX 100
+struct ll_rw_process_info {
+       pid_t                rw_pid;
+       int                    rw_op;
+       loff_t              rw_range_start;
+       loff_t              rw_range_end;
+       loff_t              rw_last_file_pos;
+       loff_t              rw_offset;
+       size_t              rw_smallest_extent;
+       size_t              rw_largest_extent;
+       struct ll_file_data      *rw_last_file;
+};
+
+enum stats_track_type {
+       STATS_TRACK_ALL = 0,  /* track all processes */
+       STATS_TRACK_PID,      /* track process with this pid */
+       STATS_TRACK_PPID,     /* track processes with this ppid */
+       STATS_TRACK_GID,      /* track processes with this gid */
+       STATS_TRACK_LAST,
+};
+
+/* flags for sbi->ll_flags */
+#define LL_SBI_NOLCK        0x01 /* DLM locking disabled (directio-only) */
+#define LL_SBI_CHECKSUM          0x02 /* checksum each page as it's written */
+#define LL_SBI_FLOCK        0x04
+#define LL_SBI_USER_XATTR      0x08 /* support user xattr */
+#define LL_SBI_ACL            0x10 /* support ACL */
+#define LL_SBI_RMT_CLIENT      0x40 /* remote client */
+#define LL_SBI_MDS_CAPA          0x80 /* support mds capa */
+#define LL_SBI_OSS_CAPA         0x100 /* support oss capa */
+#define LL_SBI_LOCALFLOCK       0x200 /* Local flocks supported by kernel */
+#define LL_SBI_LRU_RESIZE       0x400 /* lru resize support */
+#define LL_SBI_LAZYSTATFS       0x800 /* lazystatfs mount option */
+#define LL_SBI_SOM_PREVIEW     0x1000 /* SOM preview mount option */
+#define LL_SBI_32BIT_API       0x2000 /* generate 32 bit inodes. */
+#define LL_SBI_64BIT_HASH      0x4000 /* support 64-bits dir hash/offset */
+#define LL_SBI_AGL_ENABLED     0x8000 /* enable agl */
+#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */
+#define LL_SBI_LAYOUT_LOCK    0x20000 /* layout lock support */
+#define LL_SBI_USER_FID2PATH  0x40000 /* allow fid2path by unprivileged users */
+
+#define LL_SBI_FLAGS { \
+       "nolck",        \
+       "checksum",     \
+       "flock",        \
+       "xattr",        \
+       "acl",          \
+       "rmt_client",   \
+       "mds_capa",     \
+       "oss_capa",     \
+       "flock",        \
+       "lru_resize",   \
+       "lazy_statfs",  \
+       "som",          \
+       "32bit_api",    \
+       "64bit_hash",   \
+       "agl",          \
+       "verbose",      \
+       "layout",       \
+       "user_fid2path" }
+
+/* default value for ll_sb_info->contention_time */
+#define SBI_DEFAULT_CONTENTION_SECONDS     60
+/* default value for lockless_truncate_enable */
+#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1
+#define RCE_HASHES      32
+
+struct rmtacl_ctl_entry {
+       struct list_head       rce_list;
+       pid_t       rce_key; /* hash key */
+       int           rce_ops; /* acl operation type */
+};
+
+struct rmtacl_ctl_table {
+       spinlock_t      rct_lock;
+       struct list_head        rct_entries[RCE_HASHES];
+};
+
+#define EE_HASHES       32
+
+struct eacl_entry {
+       struct list_head            ee_list;
+       pid_t            ee_key; /* hash key */
+       struct lu_fid    ee_fid;
+       int                ee_type; /* ACL type for ACCESS or DEFAULT */
+       ext_acl_xattr_header *ee_acl;
+};
+
+struct eacl_table {
+       spinlock_t      et_lock;
+       struct list_head        et_entries[EE_HASHES];
+};
+
+struct ll_sb_info {
+       struct list_head                  ll_list;
+       /* this protects pglist and ra_info.  It isn't safe to
+        * grab from interrupt contexts */
+       spinlock_t                ll_lock;
+       spinlock_t                ll_pp_extent_lock; /* pp_extent entry*/
+       spinlock_t                ll_process_lock; /* ll_rw_process_info */
+       struct obd_uuid    ll_sb_uuid;
+       struct obd_export       *ll_md_exp;
+       struct obd_export       *ll_dt_exp;
+       struct proc_dir_entry*    ll_proc_root;
+       struct lu_fid        ll_root_fid; /* root object fid */
+
+       int                    ll_flags;
+       struct list_head                ll_conn_chain; /* per-conn chain of SBs */
+       struct lustre_client_ocd  ll_lco;
+
+       struct list_head                ll_orphan_dentry_list; /*please don't ask -p*/
+       struct ll_close_queue    *ll_lcq;
+
+       struct lprocfs_stats     *ll_stats; /* lprocfs stats counter */
+
+       /* Used to track "unstable" pages on a client, and maintain a
+        * LRU list of clean pages. An "unstable" page is defined as
+        * any page which is sent to a server as part of a bulk request,
+        * but is uncommitted to stable storage. */
+       struct cl_client_cache    ll_cache;
+
+       struct lprocfs_stats     *ll_ra_stats;
+
+       struct ll_ra_info        ll_ra_info;
+       unsigned int          ll_namelen;
+       struct file_operations   *ll_fop;
+
+       /* =0 - hold lock over whole read/write
+        * >0 - max. chunk to be read/written w/o lock re-acquiring */
+       unsigned long        ll_max_rw_chunk;
+       unsigned int          ll_md_brw_size; /* used by readdir */
+
+       struct lu_site     *ll_site;
+       struct cl_device         *ll_cl;
+       /* Statistics */
+       struct ll_rw_extents_info ll_rw_extents_info;
+       int                    ll_extent_process_count;
+       struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX];
+       unsigned int          ll_offset_process_count;
+       struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX];
+       unsigned int          ll_rw_offset_entry_count;
+       int                    ll_stats_track_id;
+       enum stats_track_type     ll_stats_track_type;
+       int                    ll_rw_stats_on;
+
+       /* metadata stat-ahead */
+       unsigned int          ll_sa_max;     /* max statahead RPCs */
+       atomic_t                  ll_sa_total;   /* statahead thread started
+                                                 * count */
+       atomic_t                  ll_sa_wrong;   /* statahead thread stopped for
+                                                 * low hit ratio */
+       atomic_t                  ll_agl_total;  /* AGL thread started count */
+
+       dev_t                ll_sdev_orig; /* save s_dev before assign for
+                                                * clustered NFS */
+       struct rmtacl_ctl_table   ll_rct;
+       struct eacl_table        ll_et;
+};
+
+#define LL_DEFAULT_MAX_RW_CHUNK      (32 * 1024 * 1024)
+
+struct ll_ra_read {
+       pgoff_t      lrr_start;
+       pgoff_t      lrr_count;
+       struct task_struct *lrr_reader;
+       struct list_head          lrr_linkage;
+};
+
+/*
+ * per file-descriptor read-ahead data.
+ */
+struct ll_readahead_state {
+       spinlock_t  ras_lock;
+       /*
+        * index of the last page that read(2) needed and that wasn't in the
+        * cache. Used by ras_update() to detect seeks.
+        *
+        * XXX nikita: if access seeks into cached region, Lustre doesn't see
+        * this.
+        */
+       unsigned long   ras_last_readpage;
+       /*
+        * number of pages read after last read-ahead window reset. As window
+        * is reset on each seek, this is effectively a number of consecutive
+        * accesses. Maybe ->ras_accessed_in_window is better name.
+        *
+        * XXX nikita: window is also reset (by ras_update()) when Lustre
+        * believes that memory pressure evicts read-ahead pages. In that
+        * case, it probably doesn't make sense to expand window to
+        * PTLRPC_MAX_BRW_PAGES on the third access.
+        */
+       unsigned long   ras_consecutive_pages;
+       /*
+        * number of read requests after the last read-ahead window reset.
+        * As the window is reset on each seek, this is effectively the number
+        * of consecutive read requests and is used to trigger read-ahead.
+        */
+       unsigned long   ras_consecutive_requests;
+       /*
+        * Parameters of current read-ahead window. Handled by
+        * ras_update(). On the initial access to the file or after a seek,
+        * window is reset to 0. After 3 consecutive accesses, window is
+        * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by
+        * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages.
+        */
+       unsigned long   ras_window_start, ras_window_len;
+       /*
+        * Where next read-ahead should start at. This lies within read-ahead
+        * window. Read-ahead window is read in pieces rather than at once
+        * because: 1. lustre limits total number of pages under read-ahead by
+        * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages
+        * not covered by DLM lock.
+        */
+       unsigned long   ras_next_readahead;
+       /*
+        * Total number of ll_file_read requests issued, reads originating
+        * due to mmap are not counted in this total.  This value is used to
+        * trigger full file read-ahead after multiple reads to a small file.
+        */
+       unsigned long   ras_requests;
+       /*
+        * Page index with respect to the current request; these values
+        * will not be accurate when dealing with reads issued via mmap.
+        */
+       unsigned long   ras_request_index;
+       /*
+        * list of struct ll_ra_read's, one per read(2) call currently in
+        * progress against this file descriptor. Used by read-ahead code,
+        * protected by ->ras_lock.
+        */
+       struct list_head      ras_read_beads;
+       /*
+        * The following 3 items are used for detecting the stride I/O
+        * mode.
+        * In stride I/O mode,
+        * ...............|-----data-----|****gap*****|--------|******|....
+        *    offset      |-stride_pages-|-stride_gap-|
+        * ras_stride_offset = offset;
+        * ras_stride_length = stride_pages + stride_gap;
+        * ras_stride_pages = stride_pages;
+        * Note: all these three items are counted by pages.
+        */
+       unsigned long   ras_stride_length;
+       unsigned long   ras_stride_pages;
+       pgoff_t  ras_stride_offset;
+       /*
+        * number of consecutive stride requests; similar to
+        * ras_consecutive_requests, but used for stride I/O mode.
+        * Note: stride read-ahead is enabled only after more than 2
+        * consecutive stride requests have been detected.
+        */
+       unsigned long   ras_consecutive_stride_requests;
+};
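+
+/*
+ * Worked example for the stride fields above: a reader that touches pages
+ * 0-3, skips pages 4-9, touches pages 10-13, skips 14-19, and so on has
+ * stride_pages = 4 and stride_gap = 6, giving ras_stride_offset = 0,
+ * ras_stride_pages = 4 and ras_stride_length = 4 + 6 = 10.
+ */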
+
+extern struct kmem_cache *ll_file_data_slab;
+struct lustre_handle;
+struct ll_file_data {
+       struct ll_readahead_state fd_ras;
+       int fd_omode;
+       struct ccc_grouplock fd_grouplock;
+       __u64 lfd_pos;
+       __u32 fd_flags;
+       struct file *fd_file;
+       /* Indicates whether a failure needs to be reported at close time.
+        * true: the failure is already known, do not report it again.
+        * false: failure is unknown, report it. */
+       bool fd_write_failed;
+};
+
+struct lov_stripe_md;
+
+extern spinlock_t inode_lock;
+
+extern struct proc_dir_entry *proc_lustre_fs_root;
+
+static inline struct inode *ll_info2i(struct ll_inode_info *lli)
+{
+       return &lli->lli_vfs_inode;
+}
+
+struct it_cb_data {
+       struct inode  *icbd_parent;
+       struct dentry **icbd_childp;
+       obd_id  hash;
+};
+
+__u32 ll_i2suppgid(struct inode *i);
+void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2);
+
+static inline int ll_need_32bit_api(struct ll_sb_info *sbi)
+{
+#if BITS_PER_LONG == 32
+       return 1;
+#else
+       return unlikely(current_is_32bit() || (sbi->ll_flags & LL_SBI_32BIT_API));
+#endif
+}
+
+#define LLAP_MAGIC 98764321
+
+extern struct kmem_cache *ll_async_page_slab;
+extern size_t ll_async_page_slab_size;
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
+struct ll_ra_read *ll_ra_read_get(struct file *f);
+
+/* llite/lproc_llite.c */
+#ifdef LPROCFS
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+                               struct super_block *sb, char *osc, char *mdc);
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi);
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count);
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+                       struct super_block *sb, char *osc, char *mdc){return 0;}
+static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {}
+static inline void ll_stats_ops_tally(struct ll_sb_info *sbi, int op,
+                                     int count) {}
+static inline void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+
+/* llite/dir.c */
+void ll_release_page(struct page *page, int remove);
+extern struct file_operations ll_dir_operations;
+extern struct inode_operations ll_dir_inode_operations;
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+                            struct ll_dir_chain *chain);
+int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie,
+               filldir_t filldir);
+
+int ll_get_mdt_idx(struct inode *inode);
+/* llite/namei.c */
+int ll_objects_destroy(struct ptlrpc_request *request,
+                      struct inode *dir);
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+                     struct lustre_md *lic);
+int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+                      void *data, int flag);
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de);
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen);
+
+/* llite/rw.c */
+int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_writepage(struct page *page, struct writeback_control *wbc);
+int ll_writepages(struct address_space *, struct writeback_control *wbc);
+void ll_removepage(struct page *page);
+int ll_readpage(struct file *file, struct page *page);
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
+int ll_file_punch(struct inode *, loff_t, int);
+ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int);
+void ll_clear_file_contended(struct inode*);
+int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t);
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+                struct ll_readahead_state *ras, struct address_space *mapping,
+                struct cl_page_list *queue, int flags);
+
+/* llite/file.c */
+extern struct file_operations ll_file_operations;
+extern struct file_operations ll_file_operations_flock;
+extern struct file_operations ll_file_operations_noflock;
+extern struct inode_operations ll_file_inode_operations;
+extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
+                                 __u64);
+extern int ll_have_md_lock(struct inode *inode, __u64 *bits,
+                          ldlm_mode_t l_req_mode);
+extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+                                  struct lustre_handle *lockh, __u64 flags);
+int __ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
+                            __u64 bits);
+int ll_revalidate_nd(struct dentry *dentry, unsigned int flags);
+int ll_file_open(struct inode *inode, struct file *file);
+int ll_file_release(struct inode *inode, struct file *file);
+int ll_glimpse_ioctl(struct ll_sb_info *sbi,
+                    struct lov_stripe_md *lsm, lstat_t *st);
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch);
+int ll_local_open(struct file *file,
+                 struct lookup_intent *it, struct ll_file_data *fd,
+                 struct obd_client_handle *och);
+int ll_release_openhandle(struct dentry *, struct lookup_intent *);
+int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+               struct file *file);
+int ll_md_real_close(struct inode *inode, int flags);
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+                     struct obd_client_handle **och, unsigned long flags);
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data);
+int ll_som_update(struct inode *inode, struct md_op_data *op_data);
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+                    __u64 ioepoch, int sync);
+int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+                 struct md_open_data **mod);
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+                         struct lustre_handle *fh);
+extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+                             struct ll_file_data *file, loff_t pos,
+                             size_t count, int rw);
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+              struct lookup_intent *it, struct kstat *stat);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
+struct ll_file_data *ll_file_data_get(void);
+struct posix_acl * ll_get_acl(struct inode *inode, int type);
+
+int ll_inode_permission(struct inode *inode, int mask);
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
+                            int flags, struct lov_user_md *lum,
+                            int lum_size);
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+                            struct lov_mds_md **lmm, int *lmm_size,
+                            struct ptlrpc_request **request);
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+                    int set_default);
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+                    int *lmm_size, struct ptlrpc_request **request);
+int ll_fsync(struct file *file, loff_t start, loff_t end, int data);
+int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+             int num_bytes);
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode);
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_fid2path(struct inode *inode, void *arg);
+int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock);
+
+/* llite/dcache.c */
+
+int ll_dops_init(struct dentry *de, int block, int init_sa);
+extern struct dentry_operations ll_d_ops;
+void ll_intent_drop_lock(struct lookup_intent *);
+void ll_intent_release(struct lookup_intent *);
+void ll_invalidate_aliases(struct inode *);
+void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
+void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
+int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
+               const struct dentry *dentry, const struct inode *inode,
+               unsigned int len, const char *str, const struct qstr *d_name);
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+                           struct lookup_intent *it, struct dentry *de);
+
+/* llite/llite_lib.c */
+extern struct super_operations lustre_super_operations;
+
+char *ll_read_opt(const char *opt, char *data);
+void ll_lli_init(struct ll_inode_info *lli);
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt);
+void ll_put_super(struct super_block *sb);
+void ll_kill_super(struct super_block *sb);
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock);
+struct inode *ll_inode_from_lock(struct ldlm_lock *lock);
+void ll_clear_inode(struct inode *inode);
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr);
+int ll_setattr(struct dentry *de, struct iattr *attr);
+int ll_statfs(struct dentry *de, struct kstatfs *sfs);
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+                      __u64 max_age, __u32 flags);
+void ll_update_inode(struct inode *inode, struct lustre_md *md);
+void ll_read_inode2(struct inode *inode, void *opaque);
+void ll_delete_inode(struct inode *inode);
+int ll_iocontrol(struct inode *inode, struct file *file,
+                unsigned int cmd, unsigned long arg);
+int ll_flush_ctx(struct inode *inode);
+void ll_umount_begin(struct super_block *sb);
+int ll_remount_fs(struct super_block *sb, int *flags, char *data);
+int ll_show_options(struct seq_file *seq, struct dentry *dentry);
+void ll_dirty_page_discard_warn(struct page *page, int ioret);
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+                 struct super_block *, struct lookup_intent *);
+void lustre_dump_dentry(struct dentry *, int recur);
+void lustre_dump_inode(struct inode *);
+int ll_obd_statfs(struct inode *inode, void *arg);
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
+int ll_process_config(struct lustre_cfg *lcfg);
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+                                     struct inode *i1, struct inode *i2,
+                                     const char *name, int namelen,
+                                     int mode, __u32 opc, void *data);
+void ll_finish_md_op_data(struct md_op_data *op_data);
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg);
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen);
+
+/* llite/llite_nfs.c */
+extern struct export_operations lustre_export_operations;
+__u32 get_uuid2int(const char *name, int len);
+struct inode *search_inode_for_lustre(struct super_block *sb,
+                                     const struct lu_fid *fid);
+
+/* llite/special.c */
+extern struct inode_operations ll_special_inode_operations;
+extern struct file_operations ll_special_chr_inode_fops;
+extern struct file_operations ll_special_chr_file_fops;
+extern struct file_operations ll_special_blk_inode_fops;
+extern struct file_operations ll_special_fifo_inode_fops;
+extern struct file_operations ll_special_fifo_file_fops;
+extern struct file_operations ll_special_sock_inode_fops;
+
+/* llite/symlink.c */
+extern struct inode_operations ll_fast_symlink_inode_operations;
+
+/* llite/llite_close.c */
+struct ll_close_queue {
+       spinlock_t              lcq_lock;
+       struct list_head                lcq_head;
+       wait_queue_head_t               lcq_waitq;
+       struct completion       lcq_comp;
+       atomic_t                lcq_stop;
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+
+void vvp_write_pending (struct ccc_object *club, struct ccc_page *page);
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page);
+
+/* a specific architecture can implement only part of this list */
+enum vvp_io_subtype {
+       /** normal IO */
+       IO_NORMAL,
+       /** io called from .sendfile */
+       IO_SENDFILE,
+       /** io started from splice_{read|write} */
+       IO_SPLICE
+};
+
+/* IO subtypes */
+struct vvp_io {
+       /** io subtype */
+       enum vvp_io_subtype    cui_io_subtype;
+
+       union {
+               struct {
+                       read_actor_t      cui_actor;
+                       void         *cui_target;
+               } sendfile;
+               struct {
+                       struct pipe_inode_info *cui_pipe;
+                       unsigned int        cui_flags;
+               } splice;
+               struct vvp_fault_io {
+                       /**
+                        * Inode modification time that is checked across DLM
+                        * lock request.
+                        */
+                       time_t           ft_mtime;
+                       struct vm_area_struct *ft_vma;
+                       /**
+                        *  locked page returned from vvp_io
+                        */
+                       struct page         *ft_vmpage;
+                       struct vm_fault_api {
+                               /**
+                                * kernel fault info
+                                */
+                               struct vm_fault *ft_vmf;
+                               /**
+                                * fault API used bitflags for return code.
+                                */
+                               unsigned int    ft_flags;
+                       } fault;
+               } fault;
+       } u;
+       /**
+        * Read-ahead state used by read and page-fault IO contexts.
+        */
+       struct ll_ra_read    cui_bead;
+       /**
+        * Set when cui_bead has been initialized.
+        */
+       int               cui_ra_window_set;
+       /**
+        * Partially truncated page, that vvp_io_trunc_start() keeps locked
+        * across truncate.
+        */
+       struct cl_page      *cui_partpage;
+};
+
+/**
+ * IO arguments for various VFS I/O interfaces.
+ */
+struct vvp_io_args {
+       /** normal/sendfile/splice */
+       enum vvp_io_subtype via_io_subtype;
+
+       union {
+               struct {
+                       struct kiocb      *via_iocb;
+                       struct iovec      *via_iov;
+                       unsigned long      via_nrsegs;
+               } normal;
+               struct {
+                       read_actor_t       via_actor;
+                       void          *via_target;
+               } sendfile;
+               struct {
+                       struct pipe_inode_info  *via_pipe;
+                       unsigned int       via_flags;
+               } splice;
+       } u;
+};
+
+struct ll_cl_context {
+       void       *lcc_cookie;
+       struct cl_io   *lcc_io;
+       struct cl_page *lcc_page;
+       struct lu_env  *lcc_env;
+       int          lcc_refcheck;
+       int          lcc_created;
+};
+
+struct vvp_thread_info {
+       struct ost_lvb       vti_lvb;
+       struct cl_2queue     vti_queue;
+       struct iovec     vti_local_iov;
+       struct vvp_io_args   vti_args;
+       struct ra_io_arg     vti_ria;
+       struct kiocb     vti_kiocb;
+       struct ll_cl_context vti_io_ctx;
+};
+
+static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
+{
+       extern struct lu_context_key vvp_key;
+       struct vvp_thread_info      *info;
+
+       info = lu_context_key_get(&env->le_ctx, &vvp_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env,
+                                              enum vvp_io_subtype type)
+{
+       struct vvp_io_args *ret = &vvp_env_info(env)->vti_args;
+
+       ret->via_io_subtype = type;
+
+       return ret;
+}
+
+struct vvp_session {
+       struct vvp_io    vs_ios;
+};
+
+static inline struct vvp_session *vvp_env_session(const struct lu_env *env)
+{
+       extern struct lu_context_key vvp_session_key;
+       struct vvp_session *ses;
+
+       ses = lu_context_key_get(env->le_ses, &vvp_session_key);
+       LASSERT(ses != NULL);
+       return ses;
+}
+
+static inline struct vvp_io *vvp_env_io(const struct lu_env *env)
+{
+       return &vvp_env_session(env)->vs_ios;
+}
+
+void ll_queue_done_writing(struct inode *inode, unsigned long flags);
+void ll_close_thread_shutdown(struct ll_close_queue *lcq);
+int ll_close_thread_start(struct ll_close_queue **lcq_ret);
+
+/* llite/llite_mmap.c */
+typedef struct rb_root  rb_root_t;
+typedef struct rb_node  rb_node_t;
+
+struct ll_lock_tree_node;
+struct ll_lock_tree {
+       rb_root_t                      lt_root;
+       struct list_head                      lt_locked_list;
+       struct ll_file_data         *lt_fd;
+};
+
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
+int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
+struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
+                                             __u64 end, ldlm_mode_t mode);
+void policy_from_vma(ldlm_policy_data_t *policy,
+               struct vm_area_struct *vma, unsigned long addr, size_t count);
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+                              size_t count);
+
+static inline void ll_invalidate_page(struct page *vmpage)
+{
+       struct address_space *mapping = vmpage->mapping;
+       loff_t offset = vmpage->index << PAGE_CACHE_SHIFT;
+
+       LASSERT(PageLocked(vmpage));
+       if (mapping == NULL)
+               return;
+
+       ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE);
+       truncate_complete_page(mapping, vmpage);
+}
+
+#define    ll_s2sbi(sb)        (s2lsi(sb)->lsi_llsbi)
+
+/* don't need an addref as the sb_info should be holding one */
+static inline struct obd_export *ll_s2dtexp(struct super_block *sb)
+{
+       return ll_s2sbi(sb)->ll_dt_exp;
+}
+
+/* don't need an addref as the sb_info should be holding one */
+static inline struct obd_export *ll_s2mdexp(struct super_block *sb)
+{
+       return ll_s2sbi(sb)->ll_md_exp;
+}
+
+static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi)
+{
+       struct obd_device *obd = sbi->ll_md_exp->exp_obd;
+       if (obd == NULL)
+               LBUG();
+       return &obd->u.cli;
+}
+
+// FIXME: replace the name of this with LL_SB to conform to kernel stuff
+static inline struct ll_sb_info *ll_i2sbi(struct inode *inode)
+{
+       return ll_s2sbi(inode->i_sb);
+}
+
+static inline struct obd_export *ll_i2dtexp(struct inode *inode)
+{
+       return ll_s2dtexp(inode->i_sb);
+}
+
+static inline struct obd_export *ll_i2mdexp(struct inode *inode)
+{
+       return ll_s2mdexp(inode->i_sb);
+}
+
+static inline struct lu_fid *ll_inode2fid(struct inode *inode)
+{
+       struct lu_fid *fid;
+
+       LASSERT(inode != NULL);
+       fid = &ll_i2info(inode)->lli_fid;
+
+       return fid;
+}
+
+static inline int ll_mds_max_easize(struct super_block *sb)
+{
+       return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize;
+}
+
+static inline __u64 ll_file_maxbytes(struct inode *inode)
+{
+       return ll_i2info(inode)->lli_maxbytes;
+}
+
+/* llite/xattr.c */
+int ll_setxattr(struct dentry *dentry, const char *name,
+               const void *value, size_t size, int flags);
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+                   void *buffer, size_t size);
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int ll_removexattr(struct dentry *dentry, const char *name);
+
+/* llite/remote_perm.c */
+extern struct kmem_cache *ll_remote_perm_cachep;
+extern struct kmem_cache *ll_rmtperm_hash_cachep;
+
+struct hlist_head *alloc_rmtperm_hash(void);
+void free_rmtperm_hash(struct hlist_head *hash);
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm);
+int lustre_check_remote_perm(struct inode *inode, int mask);
+
+/* llite/llite_capa.c */
+extern timer_list_t ll_capa_timer;
+
+int ll_capa_thread_start(void);
+void ll_capa_thread_stop(void);
+void ll_capa_timer_callback(unsigned long unused);
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa);
+int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa);
+
+void ll_capa_open(struct inode *inode);
+void ll_capa_close(struct inode *inode);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode);
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc);
+
+void ll_truncate_free_capa(struct obd_capa *ocapa);
+void ll_clear_inode_capas(struct inode *inode);
+void ll_print_capa_stat(struct ll_sb_info *sbi);
+
+/* llite/llite_cl.c */
+extern struct lu_device_type vvp_device_type;
+
+/**
+ * Common IO arguments for various VFS I/O interfaces.
+ */
+int cl_sb_init(struct super_block *sb);
+int cl_sb_fini(struct super_block *sb);
+enum cl_lock_mode  vvp_mode_from_vma(struct vm_area_struct *vma);
+void ll_io_init(struct cl_io *io, const struct file *file, int write);
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+               struct ll_readahead_state *ras, unsigned long index,
+               unsigned hit);
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len);
+int ll_is_file_contended(struct file *file);
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which);
+
+/* llite/llite_rmtacl.c */
+#ifdef CONFIG_FS_POSIX_ACL
+obd_valid rce_ops2valid(int ops);
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key);
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops);
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key);
+void rct_init(struct rmtacl_ctl_table *rct);
+void rct_fini(struct rmtacl_ctl_table *rct);
+
+void ee_free(struct eacl_entry *ee);
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+          ext_acl_xattr_header *header);
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+                                struct lu_fid *fid, int type);
+void et_search_free(struct eacl_table *et, pid_t key);
+void et_init(struct eacl_table *et);
+void et_fini(struct eacl_table *et);
+#endif
+
+/* statahead.c */
+
+#define LL_SA_RPC_MIN     2
+#define LL_SA_RPC_DEF     32
+#define LL_SA_RPC_MAX     8192
+
+#define LL_SA_CACHE_BIT         5
+#define LL_SA_CACHE_SIZE       (1 << LL_SA_CACHE_BIT)
+#define LL_SA_CACHE_MASK       (LL_SA_CACHE_SIZE - 1)
+
+/* per inode struct, for dir only */
+struct ll_statahead_info {
+       struct inode       *sai_inode;
+       atomic_t            sai_refcount;   /* hold a refcount while
+                                                * accessing this struct */
+       unsigned int        sai_generation; /* generation for statahead */
+       unsigned int        sai_max;    /* max ahead of lookup */
+       __u64              sai_sent;       /* stat requests sent count */
+       __u64              sai_replied;    /* stat requests which received
+                                                * reply */
+       __u64              sai_index;      /* index of statahead entry */
+       __u64              sai_index_wait; /* index of the entry that the
+                                                * caller is waiting for */
+       __u64              sai_hit;     /* hit count */
+       __u64              sai_miss;       /* miss count:
+                                                * for "ls -al" case, it includes
+                                                * hidden dentry miss;
+                                                * for "ls -l" case, it does not
+                                                * include hidden dentry miss.
+                                                * "sai_miss_hidden" is used for
+                                                * the latter case.
+                                                */
+       unsigned int        sai_consecutive_miss; /* consecutive miss */
+       unsigned int        sai_miss_hidden;/* "ls -al", but first dentry
+                                                * is not a hidden one */
+       unsigned int        sai_skip_hidden;/* skipped hidden dentry count */
+       unsigned int        sai_ls_all:1,   /* "ls -al", do stat-ahead for
+                                                * hidden entries */
+                               sai_in_readpage:1,/* statahead is in readdir()*/
+                               sai_agl_valid:1;/* AGL is valid for the dir */
+       wait_queue_head_t            sai_waitq;      /* stat-ahead wait queue */
+       struct ptlrpc_thread    sai_thread;     /* stat-ahead thread */
+       struct ptlrpc_thread    sai_agl_thread; /* AGL thread */
+       struct list_head              sai_entries;    /* entry list */
+       struct list_head              sai_entries_received; /* entries returned */
+       struct list_head              sai_entries_stated;   /* entries stated */
+       struct list_head              sai_entries_agl; /* AGL entries to be sent */
+       struct list_head              sai_cache[LL_SA_CACHE_SIZE];
+       spinlock_t              sai_cache_lock[LL_SA_CACHE_SIZE];
+       atomic_t                sai_cache_count; /* entry count in cache */
+};
+
+int do_statahead_enter(struct inode *dir, struct dentry **dentry,
+                      int only_unplug);
+void ll_stop_statahead(struct inode *dir, void *key);
+
+static inline int ll_glimpse_size(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int rc;
+
+       down_read(&lli->lli_glimpse_sem);
+       rc = cl_glimpse_size(inode);
+       lli->lli_glimpse_time = cfs_time_current();
+       up_read(&lli->lli_glimpse_sem);
+       return rc;
+}
+
+static inline void
+ll_statahead_mark(struct inode *dir, struct dentry *dentry)
+{
+       struct ll_inode_info     *lli = ll_i2info(dir);
+       struct ll_statahead_info *sai = lli->lli_sai;
+       struct ll_dentry_data    *ldd = ll_d2d(dentry);
+
+       /* not the same process, don't mark */
+       if (lli->lli_opendir_pid != current_pid())
+               return;
+
+       if (sai != NULL && ldd != NULL)
+               ldd->lld_sa_generation = sai->sai_generation;
+}
+
+static inline int
+ll_need_statahead(struct inode *dir, struct dentry *dentryp)
+{
+       struct ll_inode_info  *lli;
+       struct ll_dentry_data *ldd;
+
+       if (ll_i2sbi(dir)->ll_sa_max == 0)
+               return -EAGAIN;
+
+       lli = ll_i2info(dir);
+       /* not the same process, don't statahead */
+       if (lli->lli_opendir_pid != current_pid())
+               return -EAGAIN;
+
+       /* statahead has been stopped */
+       if (lli->lli_opendir_key == NULL)
+               return -EAGAIN;
+
+       ldd = ll_d2d(dentryp);
+       /*
+        * When a dentry is stat'ed, the system triggers "revalidate" or
+        * "lookup" more than once: for "getattr", for "getxattr", and maybe
+        * for others. Under patchless client mode, the operation intent is
+        * not accurate, which may misguide the statahead thread. For example:
+        * the "revalidate" calls for "getattr" and "getxattr" of a dentry may
+        * carry the same operation intent -- "IT_GETATTR".
+        * In fact, one dentry should have only one chance to interact with
+        * the statahead thread, otherwise the statahead window gets confused.
+        * The solution is as follows:
+        * assign "lld_sa_generation" with "sai_generation" when a dentry does
+        * "IT_GETATTR" for the first time; any subsequent "IT_GETATTR" will
+        * bypass interacting with the statahead thread by checking
+        * "lld_sa_generation == lli_sai->sai_generation".
+        */
+       if (ldd && lli->lli_sai &&
+           ldd->lld_sa_generation == lli->lli_sai->sai_generation)
+               return -EAGAIN;
+
+       return 1;
+}
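+
+/*
+ * Illustration of the generation check above: the first "IT_GETATTR"
+ * revalidate of a dentry sees lld_sa_generation != sai_generation, so
+ * ll_need_statahead() returns 1 and the dentry enters the statahead
+ * machinery; once ll_statahead_mark() has copied sai_generation into
+ * lld_sa_generation, any later "IT_GETATTR" (e.g. for getxattr) finds the
+ * generations equal and gets -EAGAIN, bypassing the statahead thread.
+ */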
+
+static inline int
+ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug)
+{
+       int ret;
+
+       ret = ll_need_statahead(dir, *dentryp);
+       if (ret <= 0)
+               return ret;
+
+       return do_statahead_enter(dir, dentryp, only_unplug);
+}
+
+/* llite ioctl register support routine */
+enum llioc_iter {
+       LLIOC_CONT = 0,
+       LLIOC_STOP
+};
+
+#define LLIOC_MAX_CMD     256
+
+/*
+ * Rules to write a callback function:
+ *
+ * Parameters:
+ *  @magic: The dynamic ioctl call routine will feed this value with the
+ *      pointer returned by ll_iocontrol_register.  Callback functions should
+ *      use this data to check for a potential collision of ioctl cmds. If a
+ *      collision is found, the callback function should return LLIOC_CONT.
+ *  @rcp: The result of the ioctl command.
+ *
+ *  Return values:
+ *      If @magic matches the pointer returned by ll_iocontrol_register, the
+ *      callback should return LLIOC_STOP; return LLIOC_CONT otherwise.
+ */
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
+               struct file *file, unsigned int cmd, unsigned long arg,
+               void *magic, int *rcp);
+
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+               unsigned int cmd, unsigned long arg, int *rcp);
+
+/* exported functions */
+/* Register an ioctl block dynamically for a regular file.
+ *
+ * @cmd: the array of ioctl commands
+ * @count: number of commands in @cmd
+ * @cb: callback function, called when an ioctl command is found to
+ *      belong to the command list @cmd
+ *
+ * Return value:
+ *      A magic pointer is returned on success;
+ *      otherwise, NULL is returned.
+ *      (See the illustrative usage sketch below.)
+ */
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
+void ll_iocontrol_unregister(void *magic);
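+
+/*
+ * Illustrative sketch only (not part of this patch): how a module might use
+ * the dynamic ioctl interface declared above. The names example_cb,
+ * example_cmds and example_magic, and the command numbers, are hypothetical
+ * placeholders; only llioc_callback_t and ll_iocontrol_register/_unregister
+ * come from this header.
+ *
+ *     static void *example_magic;
+ *     static unsigned int example_cmds[] = { 0x1234, 0x1235 }; // placeholders
+ *
+ *     static enum llioc_iter example_cb(struct inode *inode, struct file *file,
+ *                                       unsigned int cmd, unsigned long arg,
+ *                                       void *magic, int *rcp)
+ *     {
+ *             if (magic != example_magic)
+ *                     return LLIOC_CONT;      // not our registration, keep iterating
+ *             *rcp = 0;                       // result of the ioctl command
+ *             return LLIOC_STOP;              // command handled here
+ *     }
+ *
+ *     example_magic = ll_iocontrol_register(example_cb,
+ *                                           ARRAY_SIZE(example_cmds),
+ *                                           example_cmds);
+ *     ...
+ *     ll_iocontrol_unregister(example_magic);
+ */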
+
+
+/* lclient compat stuff */
+#define cl_inode_info ll_inode_info
+#define cl_i2info(info) ll_i2info(info)
+#define cl_inode_mode(inode) ((inode)->i_mode)
+#define cl_i2sbi ll_i2sbi
+
+static inline struct ll_file_data *cl_iattr2fd(struct inode *inode,
+                                              const struct iattr *attr)
+{
+       LASSERT(attr->ia_valid & ATTR_FILE);
+       return LUSTRE_FPRIVATE(attr->ia_file);
+}
+
+static inline void cl_isize_lock(struct inode *inode)
+{
+       ll_inode_size_lock(inode);
+}
+
+static inline void cl_isize_unlock(struct inode *inode)
+{
+       ll_inode_size_unlock(inode);
+}
+
+static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms)
+{
+       LASSERT(down_trylock(&ll_i2info(inode)->lli_size_sem) != 0);
+       i_size_write(inode, kms);
+}
+
+static inline void cl_isize_write(struct inode *inode, loff_t kms)
+{
+       ll_inode_size_lock(inode);
+       i_size_write(inode, kms);
+       ll_inode_size_unlock(inode);
+}
+
+#define cl_isize_read(inode)        i_size_read(inode)
+
+static inline int cl_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+       return ll_merge_lvb(env, inode);
+}
+
+#define cl_inode_atime(inode) LTIME_S((inode)->i_atime)
+#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime)
+#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime)
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt);
+
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+                      enum cl_fsync_mode mode);
+
+/** direct write pages */
+struct ll_dio_pages {
+       /** page array to be written. we don't support
+        * partial pages except the last one. */
+       struct page **ldp_pages;
+       /* offset of each page */
+       loff_t       *ldp_offsets;
+       /** if ldp_offsets is NULL, the pages are to be written sequentially,
+        * and this is the file offset of the first page. */
+       loff_t  ldp_start_offset;
+       /** how many bytes are to be written. */
+       size_t  ldp_size;
+       /** # of pages in the array. */
+       int        ldp_nr;
+};
+
+static inline void cl_stats_tally(struct cl_device *dev, enum cl_req_type crt,
+                                 int rc)
+{
+       int opc = (crt == CRT_READ) ? LPROC_LL_OSC_READ :
+                                     LPROC_LL_OSC_WRITE;
+
+       ll_stats_ops_tally(ll_s2sbi(cl2ccc_dev(dev)->cdv_sb), opc, rc);
+}
+
+extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+                                 int rw, struct inode *inode,
+                                 struct ll_dio_pages *pv);
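+
+/*
+ * Illustrative sketch only (not part of this patch): filling ll_dio_pages for
+ * a sequential direct write of npages full pages starting at file offset pos.
+ * The variables pages, npages, pos, env, io and inode are hypothetical, and
+ * rw is assumed to take the kernel READ/WRITE values; the field names and
+ * ll_direct_rw_pages() come from the declarations above.
+ *
+ *     struct ll_dio_pages pv = {
+ *             .ldp_pages        = pages,      // page array to be written
+ *             .ldp_offsets      = NULL,       // NULL => sequential pages
+ *             .ldp_start_offset = pos,        // file offset of the first page
+ *             .ldp_size         = npages << PAGE_CACHE_SHIFT,
+ *             .ldp_nr           = npages,
+ *     };
+ *     ssize_t bytes = ll_direct_rw_pages(env, io, WRITE, inode, &pv);
+ */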
+
+static inline int ll_file_nolock(const struct file *file)
+{
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct inode *inode = file->f_dentry->d_inode;
+
+       LASSERT(fd != NULL);
+       return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) ||
+               (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK));
+}
+
+static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode,
+                                   struct lookup_intent *it, __u64 *bits)
+{
+       if (!it->d.lustre.it_lock_set) {
+               struct lustre_handle handle;
+
+               /* If this inode is a remote object, it will get two
+                * separate locks in different namespaces: the master MDT,
+                * where the name entry is, will grant the LOOKUP lock,
+                * and the remote MDT, where the object is, will grant the
+                * UPDATE|PERM lock. The inode will be attached to both
+                * LOOKUP and PERM locks, so revoking either lock will
+                * cause the dcache to be cleared */
+               if (it->d.lustre.it_remote_lock_mode) {
+                       handle.cookie = it->d.lustre.it_remote_lock_handle;
+                       CDEBUG(D_DLMTRACE, "setting l_data to inode %p"
+                              "(%lu/%u) for remote lock "LPX64"\n", inode,
+                              inode->i_ino, inode->i_generation,
+                              handle.cookie);
+                       md_set_lock_data(exp, &handle.cookie, inode, NULL);
+               }
+
+               handle.cookie = it->d.lustre.it_lock_handle;
+
+               CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)"
+                      " for lock "LPX64"\n", inode, inode->i_ino,
+                      inode->i_generation, handle.cookie);
+
+               md_set_lock_data(exp, &handle.cookie, inode,
+                                &it->d.lustre.it_lock_bits);
+               it->d.lustre.it_lock_set = 1;
+       }
+
+       if (bits != NULL)
+               *bits = it->d.lustre.it_lock_bits;
+}
+
+static inline void ll_lock_dcache(struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+}
+
+static inline void ll_unlock_dcache(struct inode *inode)
+{
+       spin_unlock(&inode->i_lock);
+}
+
+static inline int d_lustre_invalid(const struct dentry *dentry)
+{
+       struct ll_dentry_data *lld = ll_d2d(dentry);
+
+       return (lld == NULL) || lld->lld_invalid;
+}
+
+static inline void __d_lustre_invalidate(struct dentry *dentry)
+{
+       struct ll_dentry_data *lld = ll_d2d(dentry);
+
+       if (lld != NULL)
+               lld->lld_invalid = 1;
+}
+
+/*
+ * Mark the dentry INVALID; if the dentry refcount is zero (normally the case
+ * for ll_md_blocking_ast), unhash the dentry and let the dcache reclaim it
+ * later; otherwise dput() of the last reference will unhash the dentry and
+ * kill it.
+ */
+static inline void d_lustre_invalidate(struct dentry *dentry)
+{
+       CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p "
+              "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry,
+              dentry->d_parent, dentry->d_inode, d_refcount(dentry));
+
+       spin_lock(&dentry->d_lock);
+       __d_lustre_invalidate(dentry);
+       if (d_refcount(dentry) == 0)
+               __d_drop(dentry);
+       spin_unlock(&dentry->d_lock);
+}
+
+static inline void d_lustre_revalidate(struct dentry *dentry)
+{
+       spin_lock(&dentry->d_lock);
+       LASSERT(ll_d2d(dentry) != NULL);
+       ll_d2d(dentry)->lld_invalid = 0;
+       spin_unlock(&dentry->d_lock);
+}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+/* Compatibility for old (1.8) compiled userspace quota code */
+struct if_quotactl_18 {
+       __u32              qc_cmd;
+       __u32              qc_type;
+       __u32              qc_id;
+       __u32              qc_stat;
+       struct obd_dqinfo       qc_dqinfo;
+       struct obd_dqblk        qc_dqblk;
+       char                obd_type[16];
+       struct obd_uuid  obd_uuid;
+};
+#define LL_IOC_QUOTACTL_18           _IOWR('f', 162, struct if_quotactl_18 *)
+/* End compatibility for old (1.8) compiled userspace quota code */
+#else
+#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+
+enum {
+       LL_LAYOUT_GEN_NONE  = ((__u32)-2),      /* layout lock was cancelled */
+       LL_LAYOUT_GEN_EMPTY = ((__u32)-1)       /* for empty layout */
+};
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
+int ll_layout_refresh(struct inode *inode, __u32 *gen);
+
+#endif /* LLITE_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
new file mode 100644 (file)
index 0000000..278b97d
--- /dev/null
@@ -0,0 +1,2424 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_lib.c
+ *
+ * Lustre Light Super operations
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include <lustre_log.h>
+#include <cl_object.h>
+#include <obd_cksum.h>
+#include "llite_internal.h"
+
+struct kmem_cache *ll_file_data_slab;
+
+LIST_HEAD(ll_super_blocks);
+DEFINE_SPINLOCK(ll_sb_lock);
+
+#ifndef MS_HAS_NEW_AOPS
+extern struct address_space_operations ll_aops;
+#else
+extern struct address_space_operations_ext ll_aops;
+#endif
+
+#ifndef log2
+#define log2(n) ffz(~(n))
+#endif
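+/* the log2() fallback above assumes a power-of-two argument,
+ * e.g. log2(4096) == ffz(~4096) == 12 */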
+
+static struct ll_sb_info *ll_init_sbi(void)
+{
+       struct ll_sb_info *sbi = NULL;
+       unsigned long pages;
+       unsigned long lru_page_max;
+       struct sysinfo si;
+       class_uuid_t uuid;
+       int i;
+       ENTRY;
+
+       OBD_ALLOC(sbi, sizeof(*sbi));
+       if (!sbi)
+               RETURN(NULL);
+
+       spin_lock_init(&sbi->ll_lock);
+       mutex_init(&sbi->ll_lco.lco_lock);
+       spin_lock_init(&sbi->ll_pp_extent_lock);
+       spin_lock_init(&sbi->ll_process_lock);
+       sbi->ll_rw_stats_on = 0;
+
+       si_meminfo(&si);
+       pages = si.totalram - si.totalhigh;
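+       /* cap the client LRU at half of low memory below 512 MiB,
+        * and at three quarters of it otherwise */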
+       if (pages >> (20 - PAGE_CACHE_SHIFT) < 512) {
+               lru_page_max = pages / 2;
+       } else {
+               lru_page_max = (pages / 4) * 3;
+       }
+
+       /* initialize ll_cache data */
+       atomic_set(&sbi->ll_cache.ccc_users, 0);
+       sbi->ll_cache.ccc_lru_max = lru_page_max;
+       atomic_set(&sbi->ll_cache.ccc_lru_left, lru_page_max);
+       spin_lock_init(&sbi->ll_cache.ccc_lru_lock);
+       INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru);
+
+       atomic_set(&sbi->ll_cache.ccc_unstable_nr, 0);
+       init_waitqueue_head(&sbi->ll_cache.ccc_unstable_waitq);
+
+       sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
+                                          SBI_DEFAULT_READAHEAD_MAX);
+       sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
+       sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+                                          SBI_DEFAULT_READAHEAD_WHOLE_MAX;
+       INIT_LIST_HEAD(&sbi->ll_conn_chain);
+       INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
+
+       ll_generate_random_uuid(uuid);
+       class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
+       CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
+
+       spin_lock(&ll_sb_lock);
+       list_add_tail(&sbi->ll_list, &ll_super_blocks);
+       spin_unlock(&ll_sb_lock);
+
+       sbi->ll_flags |= LL_SBI_VERBOSE;
+       sbi->ll_flags |= LL_SBI_CHECKSUM;
+
+       sbi->ll_flags |= LL_SBI_LRU_RESIZE;
+
+       for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
+               spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
+                              pp_r_hist.oh_lock);
+               spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
+                              pp_w_hist.oh_lock);
+       }
+
+       /* metadata statahead is enabled by default */
+       sbi->ll_sa_max = LL_SA_RPC_DEF;
+       atomic_set(&sbi->ll_sa_total, 0);
+       atomic_set(&sbi->ll_sa_wrong, 0);
+       atomic_set(&sbi->ll_agl_total, 0);
+       sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+
+       RETURN(sbi);
+}
+
+void ll_free_sbi(struct super_block *sb)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       ENTRY;
+
+       if (sbi != NULL) {
+               spin_lock(&ll_sb_lock);
+               list_del(&sbi->ll_list);
+               spin_unlock(&ll_sb_lock);
+               OBD_FREE(sbi, sizeof(*sbi));
+       }
+       EXIT;
+}
+
+static struct dentry_operations ll_d_root_ops = {
+       .d_compare = ll_dcompare,
+       .d_revalidate = ll_revalidate_nd,
+};
+
+static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
+                                   struct vfsmount *mnt)
+{
+       struct inode *root = 0;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct obd_device *obd;
+       struct obd_capa *oc = NULL;
+       struct obd_statfs *osfs = NULL;
+       struct ptlrpc_request *request = NULL;
+       struct obd_connect_data *data = NULL;
+       struct obd_uuid *uuid;
+       struct md_op_data *op_data;
+       struct lustre_md lmd;
+       obd_valid valid;
+       int size, err, checksum;
+       ENTRY;
+
+       obd = class_name2obd(md);
+       if (!obd) {
+               CERROR("MD %s: not setup or attached\n", md);
+               RETURN(-EINVAL);
+       }
+
+       OBD_ALLOC_PTR(data);
+       if (data == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC_PTR(osfs);
+       if (osfs == NULL) {
+               OBD_FREE_PTR(data);
+               RETURN(-ENOMEM);
+       }
+
+       if (proc_lustre_fs_root) {
+               err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
+                                                 dt, md);
+               if (err < 0)
+                       CERROR("could not register mount in /proc/fs/lustre\n");
+       }
+
+       /* indicate the features supported by this client */
+       data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
+                                 OBD_CONNECT_ATTRFID  |
+                                 OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
+                                 OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
+                                 OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
+                                 OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
+                                 OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR    |
+                                 OBD_CONNECT_FULL20   | OBD_CONNECT_64BITHASH|
+                                 OBD_CONNECT_EINPROGRESS |
+                                 OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+                                 OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+       if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+               data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+       if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
+               data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+#ifdef CONFIG_FS_POSIX_ACL
+       data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK;
+#endif
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
+               /* flag the mdc connection as lightweight; only used for test
+                * purposes, use with care */
+               data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT;
+
+       data->ocd_ibits_known = MDS_INODELOCK_FULL;
+       data->ocd_version = LUSTRE_VERSION_CODE;
+
+       if (sb->s_flags & MS_RDONLY)
+               data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
+       if (sbi->ll_flags & LL_SBI_USER_XATTR)
+               data->ocd_connect_flags |= OBD_CONNECT_XATTR;
+
+#ifdef HAVE_MS_FLOCK_LOCK
+       /* force vfs to use lustre handler for flock() calls - bug 10743 */
+       sb->s_flags |= MS_FLOCK_LOCK;
+#endif
+#ifdef MS_HAS_NEW_AOPS
+       sb->s_flags |= MS_HAS_NEW_AOPS;
+#endif
+
+       if (sbi->ll_flags & LL_SBI_FLOCK)
+               sbi->ll_fop = &ll_file_operations_flock;
+       else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+               sbi->ll_fop = &ll_file_operations;
+       else
+               sbi->ll_fop = &ll_file_operations_noflock;
+
+       /* real client */
+       data->ocd_connect_flags |= OBD_CONNECT_REAL;
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+               data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+       data->ocd_brw_size = MD_MAX_BRW_SIZE;
+
+       err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL);
+       if (err == -EBUSY) {
+               LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing "
+                                  "recovery, of which this client is not a "
+                                  "part. Please wait for recovery to complete,"
+                                  " abort, or time out.\n", md);
+               GOTO(out, err);
+       } else if (err) {
+               CERROR("cannot connect to %s: rc = %d\n", md, err);
+               GOTO(out, err);
+       }
+
+       sbi->ll_md_exp->exp_connect_data = *data;
+
+       err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp,
+                          LUSTRE_SEQ_METADATA);
+       if (err) {
+               CERROR("%s: Can't init metadata layer FID infrastructure, "
+                      "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err);
+               GOTO(out_md, err);
+       }
+
+       /* For mount, we only need fs info from MDT0; also, in DNE this
+        * makes sure the client can be mounted as long as MDT0 is
+        * available */
+       err = obd_statfs(NULL, sbi->ll_md_exp, osfs,
+                       cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                       OBD_STATFS_FOR_MDT0);
+       if (err)
+               GOTO(out_md_fid, err);
+
+       /* This needs to be after statfs to ensure connect has finished.
+        * Note that "data" does NOT contain the valid connect reply.
+        * If connecting to a 1.8 server there will be no LMV device, so
+        * we can access the MDC export directly and exp_connect_flags will
+        * be non-zero, but if accessing an upgraded 2.1 server it will
+        * have the correct flags filled in.
+        * XXX: fill in the LMV exp_connect_flags from MDC(s). */
+       valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD;
+       if (exp_connect_flags(sbi->ll_md_exp) != 0 &&
+           valid != CLIENT_CONNECT_MDT_REQD) {
+               char *buf;
+
+               OBD_ALLOC_WAIT(buf, PAGE_CACHE_SIZE);
+               obd_connect_flags2str(buf, PAGE_CACHE_SIZE,
+                                     valid ^ CLIENT_CONNECT_MDT_REQD, ",");
+               LCONSOLE_ERROR_MSG(0x170, "Server %s does not support "
+                                  "feature(s) needed for correct operation "
+                                  "of this client (%s). Please upgrade "
+                                  "server or downgrade client.\n",
+                                  sbi->ll_md_exp->exp_obd->obd_name, buf);
+               OBD_FREE(buf, PAGE_CACHE_SIZE);
+               GOTO(out_md_fid, err = -EPROTO);
+       }
+
+       size = sizeof(*data);
+       err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA),
+                          KEY_CONN_DATA,  &size, data, NULL);
+       if (err) {
+               CERROR("%s: Get connect data failed: rc = %d\n",
+                      sbi->ll_md_exp->exp_obd->obd_name, err);
+               GOTO(out_md_fid, err);
+       }
+
+       LASSERT(osfs->os_bsize);
+       sb->s_blocksize = osfs->os_bsize;
+       sb->s_blocksize_bits = log2(osfs->os_bsize);
+       sb->s_magic = LL_SUPER_MAGIC;
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
+       sbi->ll_namelen = osfs->os_namelen;
+       sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK;
+
+       if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
+           !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
+               LCONSOLE_INFO("Disabling user_xattr feature because "
+                             "it is not supported on the server\n");
+               sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
+#ifdef MS_POSIXACL
+               sb->s_flags |= MS_POSIXACL;
+#endif
+               sbi->ll_flags |= LL_SBI_ACL;
+       } else {
+               LCONSOLE_INFO("client wants to enable acl, but mdt not!\n");
+#ifdef MS_POSIXACL
+               sb->s_flags &= ~MS_POSIXACL;
+#endif
+               sbi->ll_flags &= ~LL_SBI_ACL;
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) {
+               if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+                       sbi->ll_flags |= LL_SBI_RMT_CLIENT;
+                       LCONSOLE_INFO("client is set as remote by default.\n");
+               }
+       } else {
+               if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+                       sbi->ll_flags &= ~LL_SBI_RMT_CLIENT;
+                       LCONSOLE_INFO("client claims to be remote, but server "
+                                     "rejected, forced to be local.\n");
+               }
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) {
+               LCONSOLE_INFO("client enabled MDS capability!\n");
+               sbi->ll_flags |= LL_SBI_MDS_CAPA;
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_OSS_CAPA) {
+               LCONSOLE_INFO("client enabled OSS capability!\n");
+               sbi->ll_flags |= LL_SBI_OSS_CAPA;
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH)
+               sbi->ll_flags |= LL_SBI_64BIT_HASH;
+
+       if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+               sbi->ll_md_brw_size = data->ocd_brw_size;
+       else
+               sbi->ll_md_brw_size = PAGE_CACHE_SIZE;
+
+       if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) {
+               LCONSOLE_INFO("Layout lock feature supported.\n");
+               sbi->ll_flags |= LL_SBI_LAYOUT_LOCK;
+       }
+
+       obd = class_name2obd(dt);
+       if (!obd) {
+               CERROR("DT %s: not setup or attached\n", dt);
+               GOTO(out_md_fid, err = -ENODEV);
+       }
+
+       data->ocd_connect_flags = OBD_CONNECT_GRANT     | OBD_CONNECT_VERSION  |
+                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
+                                 OBD_CONNECT_CANCELSET | OBD_CONNECT_FID      |
+                                 OBD_CONNECT_SRVLOCK   | OBD_CONNECT_TRUNCLOCK|
+                                 OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT |
+                                 OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR|
+                                 OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH |
+                                 OBD_CONNECT_MAXBYTES |
+                                 OBD_CONNECT_EINPROGRESS |
+                                 OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+                                 OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+       if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+               data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+       if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
+               /* OBD_CONNECT_CKSUM should always be set, even if checksums are
+                * disabled by default, because it can still be enabled on the
+                * fly via /proc. As a consequence, we still need to come to an
+                * agreement on the supported algorithms at connect time */
+               data->ocd_connect_flags |= OBD_CONNECT_CKSUM;
+
+               if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY))
+                       data->ocd_cksum_types = OBD_CKSUM_ADLER;
+               else
+                       data->ocd_cksum_types = cksum_types_supported_client();
+       }
+
+       data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+               data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+       CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
+              "ocd_grant: %d\n", data->ocd_connect_flags,
+              data->ocd_version, data->ocd_grant);
+
+       obd->obd_upcall.onu_owner = &sbi->ll_lco;
+       obd->obd_upcall.onu_upcall = cl_ocd_update;
+
+       data->ocd_brw_size = DT_MAX_BRW_SIZE;
+
+       err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data,
+                         NULL);
+       if (err == -EBUSY) {
+               LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing "
+                                  "recovery, of which this client is not a "
+                                  "part.  Please wait for recovery to "
+                                  "complete, abort, or time out.\n", dt);
+               GOTO(out_md, err);
+       } else if (err) {
+               CERROR("%s: Cannot connect to %s: rc = %d\n",
+                      sbi->ll_dt_exp->exp_obd->obd_name, dt, err);
+               GOTO(out_md, err);
+       }
+
+       sbi->ll_dt_exp->exp_connect_data = *data;
+
+       err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
+                          LUSTRE_SEQ_METADATA);
+       if (err) {
+               CERROR("%s: Can't init data layer FID infrastructure, "
+                      "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err);
+               GOTO(out_dt, err);
+       }
+
+       mutex_lock(&sbi->ll_lco.lco_lock);
+       sbi->ll_lco.lco_flags = data->ocd_connect_flags;
+       sbi->ll_lco.lco_md_exp = sbi->ll_md_exp;
+       sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp;
+       mutex_unlock(&sbi->ll_lco.lco_lock);
+
+       fid_zero(&sbi->ll_root_fid);
+       err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc);
+       if (err) {
+               CERROR("cannot mds_connect: rc = %d\n", err);
+               GOTO(out_lock_cn_cb, err);
+       }
+       if (!fid_is_sane(&sbi->ll_root_fid)) {
+               CERROR("%s: Invalid root fid "DFID" during mount\n",
+                      sbi->ll_md_exp->exp_obd->obd_name,
+                      PFID(&sbi->ll_root_fid));
+               GOTO(out_lock_cn_cb, err = -EINVAL);
+       }
+       CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid));
+
+       sb->s_op = &lustre_super_operations;
+#if THREAD_SIZE >= 8192 /*b=17630*/
+       sb->s_export_op = &lustre_export_operations;
+#endif
+
+       /* make root inode
+        * XXX: move this to after cbd setup? */
+       valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA;
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+               valid |= OBD_MD_FLRMTPERM;
+       else if (sbi->ll_flags & LL_SBI_ACL)
+               valid |= OBD_MD_FLACL;
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               GOTO(out_lock_cn_cb, err = -ENOMEM);
+
+       op_data->op_fid1 = sbi->ll_root_fid;
+       op_data->op_mode = 0;
+       op_data->op_capa1 = oc;
+       op_data->op_valid = valid;
+
+       err = md_getattr(sbi->ll_md_exp, op_data, &request);
+       if (oc)
+               capa_put(oc);
+       OBD_FREE_PTR(op_data);
+       if (err) {
+               CERROR("%s: md_getattr failed for root: rc = %d\n",
+                      sbi->ll_md_exp->exp_obd->obd_name, err);
+               GOTO(out_lock_cn_cb, err);
+       }
+
+       err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+                              sbi->ll_md_exp, &lmd);
+       if (err) {
+               CERROR("failed to understand root inode md: rc = %d\n", err);
+               ptlrpc_req_finished(request);
+               GOTO(out_lock_cn_cb, err);
+       }
+
+       LASSERT(fid_is_sane(&sbi->ll_root_fid));
+       root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid,
+                                           ll_need_32bit_api(sbi)),
+                      &lmd);
+       md_free_lustre_md(sbi->ll_md_exp, &lmd);
+       ptlrpc_req_finished(request);
+
+       if (root == NULL || IS_ERR(root)) {
+               if (lmd.lsm)
+                       obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm);
+#ifdef CONFIG_FS_POSIX_ACL
+               if (lmd.posix_acl) {
+                       posix_acl_release(lmd.posix_acl);
+                       lmd.posix_acl = NULL;
+               }
+#endif
+               err = IS_ERR(root) ? PTR_ERR(root) : -EBADF;
+               root = NULL;
+               CERROR("lustre_lite: bad iget4 for root\n");
+               GOTO(out_root, err);
+       }
+
+       err = ll_close_thread_start(&sbi->ll_lcq);
+       if (err) {
+               CERROR("cannot start close thread: rc %d\n", err);
+               GOTO(out_root, err);
+       }
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+               rct_init(&sbi->ll_rct);
+               et_init(&sbi->ll_et);
+       }
+#endif
+
+       checksum = sbi->ll_flags & LL_SBI_CHECKSUM;
+       err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+                                KEY_CHECKSUM, sizeof(checksum), &checksum,
+                                NULL);
+       cl_sb_init(sb);
+
+       err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET),
+                                KEY_CACHE_SET, sizeof(sbi->ll_cache),
+                                &sbi->ll_cache, NULL);
+
+       sb->s_root = d_make_root(root);
+       if (sb->s_root == NULL) {
+               CERROR("%s: can't make root dentry\n",
+                       ll_get_fsname(sb, NULL, 0));
+               GOTO(out_root, err = -ENOMEM);
+       }
+
+       /* kernels >= 2.6.38 store dentry operations in sb->s_d_op. */
+       d_set_d_op(sb->s_root, &ll_d_root_ops);
+       sb->s_d_op = &ll_d_ops;
+
+       sbi->ll_sdev_orig = sb->s_dev;
+
+       /* We set sb->s_dev to the same value on all Lustre clients in order
+        * to support NFS export clustering.  NFSD requires that the FSID be
+        * the same on all clients. */
+       /* s_dev is also used in lt_compare() to compare two fs, but that is
+        * only a node-local comparison. */
+       uuid = obd_get_uuid(sbi->ll_md_exp);
+       if (uuid != NULL)
+               sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
+
+       if (data != NULL)
+               OBD_FREE_PTR(data);
+       if (osfs != NULL)
+               OBD_FREE_PTR(osfs);
+
+       RETURN(err);
+out_root:
+       if (root)
+               iput(root);
+out_lock_cn_cb:
+       obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+out_dt:
+       obd_disconnect(sbi->ll_dt_exp);
+       sbi->ll_dt_exp = NULL;
+       /* Make sure all OSCs are gone, since cl_cache is accessing sbi. */
+       obd_zombie_barrier();
+out_md_fid:
+       obd_fid_fini(sbi->ll_md_exp->exp_obd);
+out_md:
+       obd_disconnect(sbi->ll_md_exp);
+       sbi->ll_md_exp = NULL;
+out:
+       if (data != NULL)
+               OBD_FREE_PTR(data);
+       if (osfs != NULL)
+               OBD_FREE_PTR(osfs);
+       lprocfs_unregister_mountpoint(sbi);
+       return err;
+}
+
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
+{
+       int size, rc;
+
+       *lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL);
+       size = sizeof(int);
+       rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE),
+                         KEY_MAX_EASIZE, &size, lmmsize, NULL);
+       if (rc)
+               CERROR("Get max mdsize error rc %d \n", rc);
+
+       RETURN(rc);
+}
+
+void ll_dump_inode(struct inode *inode)
+{
+       struct ll_d_hlist_node *tmp;
+       int dentry_count = 0;
+
+       LASSERT(inode != NULL);
+
+       ll_d_hlist_for_each(tmp, &inode->i_dentry)
+               dentry_count++;
+
+       CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n",
+              inode, ll_i2mdexp(inode)->exp_obd->obd_name, inode->i_ino,
+              inode->i_mode, atomic_read(&inode->i_count), dentry_count);
+}
+
+void lustre_dump_dentry(struct dentry *dentry, int recur)
+{
+       struct list_head *tmp;
+       int subdirs = 0;
+
+       LASSERT(dentry != NULL);
+
+       list_for_each(tmp, &dentry->d_subdirs)
+               subdirs++;
+
+       CERROR("dentry %p dump: name=%.*s parent=%.*s (%p), inode=%p, count=%u,"
+              " flags=0x%x, fsdata=%p, %d subdirs\n", dentry,
+              dentry->d_name.len, dentry->d_name.name,
+              dentry->d_parent->d_name.len, dentry->d_parent->d_name.name,
+              dentry->d_parent, dentry->d_inode, d_refcount(dentry),
+              dentry->d_flags, dentry->d_fsdata, subdirs);
+       if (dentry->d_inode != NULL)
+               ll_dump_inode(dentry->d_inode);
+
+       if (recur == 0)
+               return;
+
+       list_for_each(tmp, &dentry->d_subdirs) {
+               struct dentry *d = list_entry(tmp, struct dentry, d_u.d_child);
+               lustre_dump_dentry(d, recur - 1);
+       }
+}
+
+void client_common_put_super(struct super_block *sb)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       ENTRY;
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+               et_fini(&sbi->ll_et);
+               rct_fini(&sbi->ll_rct);
+       }
+#endif
+
+       ll_close_thread_shutdown(sbi->ll_lcq);
+
+       cl_sb_fini(sb);
+
+       list_del(&sbi->ll_conn_chain);
+
+       obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+       obd_disconnect(sbi->ll_dt_exp);
+       sbi->ll_dt_exp = NULL;
+       /* wait till all OSCs are gone, since cl_cache is accessing sbi.
+        * see LU-2543. */
+       obd_zombie_barrier();
+
+       lprocfs_unregister_mountpoint(sbi);
+
+       obd_fid_fini(sbi->ll_md_exp->exp_obd);
+       obd_disconnect(sbi->ll_md_exp);
+       sbi->ll_md_exp = NULL;
+
+       EXIT;
+}
+
+void ll_kill_super(struct super_block *sb)
+{
+       struct ll_sb_info *sbi;
+
+       ENTRY;
+
+       /* sb not initialized? */
+       if (!(sb->s_flags & MS_ACTIVE))
+               return;
+
+       sbi = ll_s2sbi(sb);
+       /* We need to restore s_dev (changed for clustered NFS) before
+        * put_super, because newer kernels cache s_dev, and changing
+        * sb->s_dev in put_super does not affect removal of the real
+        * device */
+       if (sbi)
+               sb->s_dev = sbi->ll_sdev_orig;
+       EXIT;
+}
+
+char *ll_read_opt(const char *opt, char *data)
+{
+       char *value;
+       char *retval;
+       ENTRY;
+
+       CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data);
+       if (strncmp(opt, data, strlen(opt)))
+               RETURN(NULL);
+       if ((value = strchr(data, '=')) == NULL)
+               RETURN(NULL);
+
+       value++;
+       OBD_ALLOC(retval, strlen(value) + 1);
+       if (!retval) {
+               CERROR("out of memory!\n");
+               RETURN(NULL);
+       }
+
+       memcpy(retval, value, strlen(value)+1);
+       CDEBUG(D_SUPER, "Assigned option: %s, value %s\n", opt, retval);
+       RETURN(retval);
+}
+
+static inline int ll_set_opt(const char *opt, char *data, int fl)
+{
+       if (strncmp(opt, data, strlen(opt)) != 0)
+               return(0);
+       else
+               return(fl);
+}
+
+/* non-client-specific mount options are parsed in lmd_parse */
+static int ll_options(char *options, int *flags)
+{
+       int tmp;
+       char *s1 = options, *s2;
+       ENTRY;
+
+       if (!options)
+               RETURN(0);
+
+       CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
+
+       while (*s1) {
+               CDEBUG(D_SUPER, "next opt=%s\n", s1);
+               tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 5, 50, 0)
+               tmp = ll_set_opt("acl", s1, LL_SBI_ACL);
+               if (tmp) {
+                       /* Ignore deprecated mount option.  The client will
+                        * always try to mount with ACL support; whether it is
+                        * used depends on whether the server supports it. */
+                       LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+                                                 "mount option 'acl'.\n");
+                       goto next;
+               }
+               tmp = ll_set_opt("noacl", s1, LL_SBI_ACL);
+               if (tmp) {
+                       LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+                                                 "mount option 'noacl'.\n");
+                       goto next;
+               }
+#else
+#warning "{no}acl options have been deprecated since 1.8, please remove them"
+#endif
+               tmp = ll_set_opt("remote_client", s1, LL_SBI_RMT_CLIENT);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+
+               tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
+                                  s1);
+               RETURN(-EINVAL);
+
+next:
+               /* Find next opt */
+               s2 = strchr(s1, ',');
+               if (s2 == NULL)
+                       break;
+               s1 = s2 + 1;
+       }
+       RETURN(0);
+}
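+
+/*
+ * Illustrative example (not part of this patch): for the mount option string
+ * "flock,user_xattr,nochecksum,verbose", ll_options() above sets
+ * LL_SBI_FLOCK, LL_SBI_USER_XATTR and LL_SBI_VERBOSE in *flags and clears
+ * LL_SBI_CHECKSUM.
+ */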
+
+void ll_lli_init(struct ll_inode_info *lli)
+{
+       lli->lli_inode_magic = LLI_INODE_MAGIC;
+       lli->lli_flags = 0;
+       lli->lli_ioepoch = 0;
+       lli->lli_maxbytes = MAX_LFS_FILESIZE;
+       spin_lock_init(&lli->lli_lock);
+       lli->lli_posix_acl = NULL;
+       lli->lli_remote_perms = NULL;
+       mutex_init(&lli->lli_rmtperm_mutex);
+       /* Do not set lli_fid, it has been initialized already. */
+       fid_zero(&lli->lli_pfid);
+       INIT_LIST_HEAD(&lli->lli_close_list);
+       INIT_LIST_HEAD(&lli->lli_oss_capas);
+       atomic_set(&lli->lli_open_count, 0);
+       lli->lli_mds_capa = NULL;
+       lli->lli_rmtperm_time = 0;
+       lli->lli_pending_och = NULL;
+       lli->lli_mds_read_och = NULL;
+       lli->lli_mds_write_och = NULL;
+       lli->lli_mds_exec_och = NULL;
+       lli->lli_open_fd_read_count = 0;
+       lli->lli_open_fd_write_count = 0;
+       lli->lli_open_fd_exec_count = 0;
+       mutex_init(&lli->lli_och_mutex);
+       spin_lock_init(&lli->lli_agl_lock);
+       lli->lli_has_smd = false;
+       lli->lli_layout_gen = LL_LAYOUT_GEN_NONE;
+       lli->lli_clob = NULL;
+
+       LASSERT(lli->lli_vfs_inode.i_mode != 0);
+       if (S_ISDIR(lli->lli_vfs_inode.i_mode)) {
+               mutex_init(&lli->lli_readdir_mutex);
+               lli->lli_opendir_key = NULL;
+               lli->lli_sai = NULL;
+               lli->lli_def_acl = NULL;
+               spin_lock_init(&lli->lli_sa_lock);
+               lli->lli_opendir_pid = 0;
+       } else {
+               sema_init(&lli->lli_size_sem, 1);
+               lli->lli_size_sem_owner = NULL;
+               lli->lli_symlink_name = NULL;
+               init_rwsem(&lli->lli_trunc_sem);
+               mutex_init(&lli->lli_write_mutex);
+               init_rwsem(&lli->lli_glimpse_sem);
+               lli->lli_glimpse_time = 0;
+               INIT_LIST_HEAD(&lli->lli_agl_list);
+               lli->lli_agl_index = 0;
+               lli->lli_async_rc = 0;
+               lli->lli_volatile = false;
+       }
+       mutex_init(&lli->lli_layout_mutex);
+}
+
+static inline int ll_bdi_register(struct backing_dev_info *bdi)
+{
+       static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+
+       bdi->name = "lustre";
+       return bdi_register(bdi, NULL, "lustre-%d",
+                           atomic_inc_return(&ll_bdi_num));
+}
+
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
+{
+       struct lustre_profile *lprof = NULL;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct ll_sb_info *sbi;
+       char  *dt = NULL, *md = NULL;
+       char  *profilenm = get_profile_name(sb);
+       struct config_llog_instance *cfg;
+       /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */
+       const int instlen = sizeof(cfg->cfg_instance) * 2 + 2;
+       int    err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+       OBD_ALLOC_PTR(cfg);
+       if (cfg == NULL)
+               RETURN(-ENOMEM);
+
+       try_module_get(THIS_MODULE);
+
+       /* client additional sb info */
+       lsi->lsi_llsbi = sbi = ll_init_sbi();
+       if (!sbi) {
+               module_put(THIS_MODULE);
+               OBD_FREE_PTR(cfg);
+               RETURN(-ENOMEM);
+       }
+
+       err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
+       if (err)
+               GOTO(out_free, err);
+
+       err = bdi_init(&lsi->lsi_bdi);
+       if (err)
+               GOTO(out_free, err);
+       lsi->lsi_flags |= LSI_BDI_INITIALIZED;
+       lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+       err = ll_bdi_register(&lsi->lsi_bdi);
+       if (err)
+               GOTO(out_free, err);
+
+       sb->s_bdi = &lsi->lsi_bdi;
+
+       /* Generate a string unique to this super, in case some joker tries
+          to mount the same fs at two mount points.
+          Use the address of the super itself.*/
+       cfg->cfg_instance = sb;
+       cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
+       cfg->cfg_callback = class_config_llog_handler;
+       /* set up client obds */
+       err = lustre_process_log(sb, profilenm, cfg);
+       if (err < 0) {
+               CERROR("Unable to process log: %d\n", err);
+               GOTO(out_free, err);
+       }
+
+       /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */
+       lprof = class_get_profile(profilenm);
+       if (lprof == NULL) {
+               LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be"
+                                  " read from the MGS.  Does that filesystem "
+                                  "exist?\n", profilenm);
+               GOTO(out_free, err = -EINVAL);
+       }
+       CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
+              lprof->lp_md, lprof->lp_dt);
+
+       OBD_ALLOC(dt, strlen(lprof->lp_dt) + instlen + 2);
+       if (!dt)
+               GOTO(out_free, err = -ENOMEM);
+       sprintf(dt, "%s-%p", lprof->lp_dt, cfg->cfg_instance);
+
+       OBD_ALLOC(md, strlen(lprof->lp_md) + instlen + 2);
+       if (!md)
+               GOTO(out_free, err = -ENOMEM);
+       sprintf(md, "%s-%p", lprof->lp_md, cfg->cfg_instance);
+
+       /* connections, registrations, sb setup */
+       err = client_common_fill_super(sb, md, dt, mnt);
+
+out_free:
+       if (md)
+               OBD_FREE(md, strlen(lprof->lp_md) + instlen + 2);
+       if (dt)
+               OBD_FREE(dt, strlen(lprof->lp_dt) + instlen + 2);
+       if (err)
+               ll_put_super(sb);
+       else if (sbi->ll_flags & LL_SBI_VERBOSE)
+               LCONSOLE_WARN("Mounted %s\n", profilenm);
+
+       OBD_FREE_PTR(cfg);
+       RETURN(err);
+} /* ll_fill_super */
+
+
+void lu_context_keys_dump(void);
+
+void ll_put_super(struct super_block *sb)
+{
+       struct config_llog_instance cfg;
+       struct obd_device *obd;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       char *profilenm = get_profile_name(sb);
+       int ccc_count, next, force = 1, rc = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
+
+       ll_print_capa_stat(sbi);
+
+       cfg.cfg_instance = sb;
+       lustre_end_log(sb, profilenm, &cfg);
+
+       if (sbi->ll_md_exp) {
+               obd = class_exp2obd(sbi->ll_md_exp);
+               if (obd)
+                       force = obd->obd_force;
+       }
+
+       /* Wait for unstable pages to be committed to stable storage */
+       if (force == 0) {
+               struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+               rc = l_wait_event(sbi->ll_cache.ccc_unstable_waitq,
+                       atomic_read(&sbi->ll_cache.ccc_unstable_nr) == 0,
+                       &lwi);
+       }
+
+       ccc_count = atomic_read(&sbi->ll_cache.ccc_unstable_nr);
+       if (force == 0 && rc != -EINTR)
+               LASSERTF(ccc_count == 0, "count: %i\n", ccc_count);
+
+
+       /* We need to set force before the lov_disconnect in
+          lustre_common_put_super, since l_d cleans up OSCs as well. */
+       if (force) {
+               next = 0;
+               while ((obd = class_devices_in_group(&sbi->ll_sb_uuid,
+                                                    &next)) != NULL) {
+                       obd->obd_force = force;
+               }
+       }
+
+       if (sbi->ll_lcq) {
+               /* Only if client_common_fill_super succeeded */
+               client_common_put_super(sb);
+       }
+
+       next = 0;
+       while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
+               class_manual_cleanup(obd);
+       }
+
+       if (sbi->ll_flags & LL_SBI_VERBOSE)
+               LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : "");
+
+       if (profilenm)
+               class_del_profile(profilenm);
+
+       if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
+               bdi_destroy(&lsi->lsi_bdi);
+               lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
+       }
+
+       ll_free_sbi(sb);
+       lsi->lsi_llsbi = NULL;
+
+       lustre_common_put_super(sb);
+
+       module_put(THIS_MODULE);
+
+       EXIT;
+} /* client_put_super */
+
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock)
+{
+       struct inode *inode = NULL;
+
+       /* NOTE: we depend on atomic igrab() -bzzz */
+       lock_res_and_lock(lock);
+       if (lock->l_resource->lr_lvb_inode) {
+               struct ll_inode_info * lli;
+               lli = ll_i2info(lock->l_resource->lr_lvb_inode);
+               if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
+                       inode = igrab(lock->l_resource->lr_lvb_inode);
+               } else {
+                       inode = lock->l_resource->lr_lvb_inode;
+                       LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ?  D_INFO :
+                                        D_WARNING, lock, "lr_lvb_inode %p is "
+                                        "bogus: magic %08x",
+                                        lock->l_resource->lr_lvb_inode,
+                                        lli->lli_inode_magic);
+                       inode = NULL;
+               }
+       }
+       unlock_res_and_lock(lock);
+       return inode;
+}
+
+struct inode *ll_inode_from_lock(struct ldlm_lock *lock)
+{
+       struct inode *inode = NULL;
+       /* NOTE: we depend on atomic igrab() -bzzz */
+       lock_res_and_lock(lock);
+       if (lock->l_ast_data) {
+               struct ll_inode_info *lli = ll_i2info(lock->l_ast_data);
+               if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
+                       inode = igrab(lock->l_ast_data);
+               } else {
+                       inode = lock->l_ast_data;
+                       LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ?  D_INFO :
+                                        D_WARNING, lock, "l_ast_data %p is "
+                                        "bogus: magic %08x", lock->l_ast_data,
+                                        lli->lli_inode_magic);
+                       inode = NULL;
+               }
+       }
+       unlock_res_and_lock(lock);
+       return inode;
+}
+
+void ll_clear_inode(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+              inode->i_generation, inode);
+
+       if (S_ISDIR(inode->i_mode)) {
+               /* these should have been cleared in ll_file_release */
+               LASSERT(lli->lli_opendir_key == NULL);
+               LASSERT(lli->lli_sai == NULL);
+               LASSERT(lli->lli_opendir_pid == 0);
+       }
+
+       ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+       md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode));
+
+       LASSERT(!lli->lli_open_fd_write_count);
+       LASSERT(!lli->lli_open_fd_read_count);
+       LASSERT(!lli->lli_open_fd_exec_count);
+
+       if (lli->lli_mds_write_och)
+               ll_md_real_close(inode, FMODE_WRITE);
+       if (lli->lli_mds_exec_och)
+               ll_md_real_close(inode, FMODE_EXEC);
+       if (lli->lli_mds_read_och)
+               ll_md_real_close(inode, FMODE_READ);
+
+       if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) {
+               OBD_FREE(lli->lli_symlink_name,
+                        strlen(lli->lli_symlink_name) + 1);
+               lli->lli_symlink_name = NULL;
+       }
+
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+               LASSERT(lli->lli_posix_acl == NULL);
+               if (lli->lli_remote_perms) {
+                       free_rmtperm_hash(lli->lli_remote_perms);
+                       lli->lli_remote_perms = NULL;
+               }
+       }
+#ifdef CONFIG_FS_POSIX_ACL
+       else if (lli->lli_posix_acl) {
+               LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
+               LASSERT(lli->lli_remote_perms == NULL);
+               posix_acl_release(lli->lli_posix_acl);
+               lli->lli_posix_acl = NULL;
+       }
+#endif
+       lli->lli_inode_magic = LLI_INODE_DEAD;
+
+       ll_clear_inode_capas(inode);
+       if (!S_ISDIR(inode->i_mode))
+               LASSERT(list_empty(&lli->lli_agl_list));
+
+       /*
+        * XXX This has to be done before lsm is freed below, because
+        * cl_object still uses inode lsm.
+        */
+       cl_inode_fini(inode);
+       lli->lli_has_smd = false;
+
+       EXIT;
+}
+
+int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+                 struct md_open_data **mod)
+{
+       struct lustre_md md;
+       struct inode *inode = dentry->d_inode;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *request = NULL;
+       int rc, ia_valid;
+       ENTRY;
+
+       op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0,
+                       &request, mod);
+       if (rc) {
+               ptlrpc_req_finished(request);
+               if (rc == -ENOENT) {
+                       clear_nlink(inode);
+                       /* Unlinked special device node? Or just a race?
+                        * Pretend we did everything. */
+                       if (!S_ISREG(inode->i_mode) &&
+                           !S_ISDIR(inode->i_mode)) {
+                               ia_valid = op_data->op_attr.ia_valid;
+                               op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS;
+                               rc = simple_setattr(dentry, &op_data->op_attr);
+                               op_data->op_attr.ia_valid = ia_valid;
+                       }
+               } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) {
+                       CERROR("md_setattr fails: rc = %d\n", rc);
+               }
+               RETURN(rc);
+       }
+
+       rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+                             sbi->ll_md_exp, &md);
+       if (rc) {
+               ptlrpc_req_finished(request);
+               RETURN(rc);
+       }
+
+       ia_valid = op_data->op_attr.ia_valid;
+       /* The inode size will be set in ll_setattr_ost; we can't do it now
+        * since the dirty cache is not cleared yet. */
+       op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
+       rc = simple_setattr(dentry, &op_data->op_attr);
+       op_data->op_attr.ia_valid = ia_valid;
+
+       /* Extract epoch data if obtained. */
+       op_data->op_handle = md.body->handle;
+       op_data->op_ioepoch = md.body->ioepoch;
+
+       ll_update_inode(inode, &md);
+       ptlrpc_req_finished(request);
+
+       RETURN(rc);
+}
+
+/* Close IO epoch and send Size-on-MDS attribute update. */
+static int ll_setattr_done_writing(struct inode *inode,
+                                  struct md_op_data *op_data,
+                                  struct md_open_data *mod)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(op_data != NULL);
+       if (!S_ISREG(inode->i_mode))
+               RETURN(0);
+
+       CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID" for truncate\n",
+              op_data->op_ioepoch, PFID(&lli->lli_fid));
+
+       op_data->op_flags = MF_EPOCH_CLOSE;
+       ll_done_writing_attr(inode, op_data);
+       ll_pack_inode2opdata(inode, op_data, NULL);
+
+       rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod);
+       if (rc == -EAGAIN) {
+               /* The MDS has instructed us to obtain the Size-on-MDS
+                * attribute from the OSTs and send a setattr back to the MDS. */
+               rc = ll_som_update(inode, op_data);
+       } else if (rc) {
+               CERROR("inode %lu mdc truncate failed: rc = %d\n",
+                      inode->i_ino, rc);
+       }
+       RETURN(rc);
+}
+
+static int ll_setattr_ost(struct inode *inode, struct iattr *attr)
+{
+       struct obd_capa *capa;
+       int rc;
+
+       if (attr->ia_valid & ATTR_SIZE)
+               capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC);
+       else
+               capa = ll_mdscapa_get(inode);
+
+       rc = cl_setattr_ost(inode, attr, capa);
+
+       if (attr->ia_valid & ATTR_SIZE)
+               ll_truncate_free_capa(capa);
+       else
+               capa_put(capa);
+
+       return rc;
+}
+
+
+/* If this inode has objects allocated to it (lsm != NULL), then the OST
+ * object(s) determine the file size and mtime.  Otherwise, the MDS will
+ * keep these values until such a time that objects are allocated for it.
+ * We do the MDS operations first, as it is checking permissions for us.
+ * We don't do the MDS RPC if there is nothing that we want to store there,
+ * otherwise there is no harm in updating mtime/atime on the MDS if we are
+ * going to do an RPC anyway.
+ *
+ * If we are doing a truncate, we will send the mtime and ctime updates
+ * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
+ * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
+ * at the same time.
+ */
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct md_op_data *op_data = NULL;
+       struct md_open_data *mod = NULL;
+       int rc = 0, rc1 = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "%s: setattr inode %p/fid:"DFID" from %llu to %llu, "
+               "valid %x\n", ll_get_fsname(inode->i_sb, NULL, 0), inode,
+               PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size,
+               attr->ia_valid);
+
+       if (attr->ia_valid & ATTR_SIZE) {
+               /* Check new size against VFS/VM file size limit and rlimit */
+               rc = inode_newsize_ok(inode, attr->ia_size);
+               if (rc)
+                       RETURN(rc);
+
+               /* The maximum Lustre file size is variable, based on the
+                * OST maximum object size and number of stripes.  This
+                * needs another check in addition to the VFS check above. */
+               if (attr->ia_size > ll_file_maxbytes(inode)) {
+                       CDEBUG(D_INODE,"file "DFID" too large %llu > "LPU64"\n",
+                              PFID(&lli->lli_fid), attr->ia_size,
+                              ll_file_maxbytes(inode));
+                       RETURN(-EFBIG);
+               }
+
+               attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
+       }
+
+       /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */
+       if (attr->ia_valid & TIMES_SET_FLAGS) {
+               if (current_fsuid() != inode->i_uid &&
+                   !cfs_capable(CFS_CAP_FOWNER))
+                       RETURN(-EPERM);
+       }
+
+       /* We mark all of the fields "set" so MDS/OST does not re-set them */
+       if (attr->ia_valid & ATTR_CTIME) {
+               attr->ia_ctime = CFS_CURRENT_TIME;
+               attr->ia_valid |= ATTR_CTIME_SET;
+       }
+       if (!(attr->ia_valid & ATTR_ATIME_SET) &&
+           (attr->ia_valid & ATTR_ATIME)) {
+               attr->ia_atime = CFS_CURRENT_TIME;
+               attr->ia_valid |= ATTR_ATIME_SET;
+       }
+       if (!(attr->ia_valid & ATTR_MTIME_SET) &&
+           (attr->ia_valid & ATTR_MTIME)) {
+               attr->ia_mtime = CFS_CURRENT_TIME;
+               attr->ia_valid |= ATTR_MTIME_SET;
+       }
+
+       if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
+               CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
+                      LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
+                      cfs_time_current_sec());
+
+       /* If we are changing the file size, the file content is being
+        * modified; flag it. */
+       if (attr->ia_valid & ATTR_SIZE) {
+               attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+               spin_lock(&lli->lli_lock);
+               lli->lli_flags |= LLIF_DATA_MODIFIED;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       /* We always do an MDS RPC, even if we're only changing the size;
+        * only the MDS knows whether truncate() should fail with -ETXTBSY. */
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               RETURN(-ENOMEM);
+
+       if (!S_ISDIR(inode->i_mode)) {
+               if (attr->ia_valid & ATTR_SIZE)
+                       inode_dio_write_done(inode);
+               mutex_unlock(&inode->i_mutex);
+               down_write(&lli->lli_trunc_sem);
+       }
+
+       memcpy(&op_data->op_attr, attr, sizeof(*attr));
+
+       /* Open epoch for truncate. */
+       if (exp_connect_som(ll_i2mdexp(inode)) &&
+           (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET)))
+               op_data->op_flags = MF_EPOCH_OPEN;
+
+       rc = ll_md_setattr(dentry, op_data, &mod);
+       if (rc)
+               GOTO(out, rc);
+
+       /* The RPC to the MDT has been sent; clear the data modification flag. */
+       if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+               spin_lock(&lli->lli_lock);
+               lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       ll_ioepoch_open(lli, op_data->op_ioepoch);
+       if (!S_ISREG(inode->i_mode))
+               GOTO(out, rc = 0);
+
+       if (attr->ia_valid & (ATTR_SIZE |
+                             ATTR_ATIME | ATTR_ATIME_SET |
+                             ATTR_MTIME | ATTR_MTIME_SET))
+               /* For truncate and utimes that send attributes to the OSTs,
+                * setting mtime/atime to the past is performed under a PW
+                * [0:EOF] extent lock (new_size:EOF for truncate).  It may
+                * seem excessive to send mtime/atime updates to the OSTs when
+                * not setting times to the past, but it is necessary due to
+                * possible time de-synchronization between the MDT inode and
+                * OST objects. */
+               rc = ll_setattr_ost(inode, attr);
+       EXIT;
+out:
+       if (op_data) {
+               if (op_data->op_ioepoch) {
+                       rc1 = ll_setattr_done_writing(inode, op_data, mod);
+                       if (!rc)
+                               rc = rc1;
+               }
+               ll_finish_md_op_data(op_data);
+       }
+       if (!S_ISDIR(inode->i_mode)) {
+               up_write(&lli->lli_trunc_sem);
+               mutex_lock(&inode->i_mutex);
+               if (attr->ia_valid & ATTR_SIZE)
+                       inode_dio_wait(inode);
+       }
+
+       ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ?
+                       LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1);
+
+       return rc;
+}
+
+int ll_setattr(struct dentry *de, struct iattr *attr)
+{
+       int mode = de->d_inode->i_mode;
+
+       if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
+                             (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
+               attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+
+       if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
+                              (ATTR_SIZE|ATTR_MODE)) &&
+           (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) ||
+            (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+             !(attr->ia_mode & S_ISGID))))
+               attr->ia_valid |= ATTR_FORCE;
+
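+       /* If the new mode clears the setuid/setgid bits, also set
+        * ATTR_KILL_SUID/ATTR_KILL_SGID so the bits are dropped explicitly
+        * by the layers below. */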
+       if ((mode & S_ISUID) &&
+           !(attr->ia_mode & S_ISUID) &&
+           !(attr->ia_valid & ATTR_KILL_SUID))
+               attr->ia_valid |= ATTR_KILL_SUID;
+
+       if (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+           !(attr->ia_mode & S_ISGID) &&
+           !(attr->ia_valid & ATTR_KILL_SGID))
+               attr->ia_valid |= ATTR_KILL_SGID;
+
+       return ll_setattr_raw(de, attr);
+}
+
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+                      __u64 max_age, __u32 flags)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct obd_statfs obd_osfs;
+       int rc;
+       ENTRY;
+
+       rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags);
+       if (rc) {
+               CERROR("md_statfs fails: rc = %d\n", rc);
+               RETURN(rc);
+       }
+
+       osfs->os_type = sb->s_magic;
+
+       CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
+              osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files);
+
+       if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+               flags |= OBD_STATFS_NODELAY;
+
+       rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags);
+       if (rc) {
+               CERROR("obd_statfs fails: rc = %d\n", rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_SUPER, "OSC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
+              obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree,
+              obd_osfs.os_files);
+
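+       /* Block and space counts come from the OSTs (data export); the
+        * MDS-provided inode counts are kept and adjusted below if the OSTs
+        * are more constrained. */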
+       osfs->os_bsize = obd_osfs.os_bsize;
+       osfs->os_blocks = obd_osfs.os_blocks;
+       osfs->os_bfree = obd_osfs.os_bfree;
+       osfs->os_bavail = obd_osfs.os_bavail;
+
+       /* If we don't have as many objects free on the OST as inodes
+        * on the MDS, we reduce the total number of inodes to
+        * compensate, so that the "inodes in use" number is correct.
+        */
+       if (obd_osfs.os_ffree < osfs->os_ffree) {
+               osfs->os_files = (osfs->os_files - osfs->os_ffree) +
+                       obd_osfs.os_ffree;
+               osfs->os_ffree = obd_osfs.os_ffree;
+       }
+
+       RETURN(rc);
+}
+int ll_statfs(struct dentry *de, struct kstatfs *sfs)
+{
+       struct super_block *sb = de->d_sb;
+       struct obd_statfs osfs;
+       int rc;
+
+       CDEBUG(D_VFSTRACE, "VFS Op: at "LPU64" jiffies\n", get_jiffies_64());
+       ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1);
+
+       /* Some amount of caching on the client is allowed */
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               0);
+       if (rc)
+               return rc;
+
+       statfs_unpack(sfs, &osfs);
+
+       /* We need to downshift for all 32-bit kernels, because we can't
+        * tell if the kernel is being called via sys_statfs64() or not.
+        * Stop before overflowing f_bsize - in which case it is better
+        * to just risk EOVERFLOW if caller is using old sys_statfs(). */
+       if (sizeof(long) < 8) {
+               while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) {
+                       sfs->f_bsize <<= 1;
+
+                       osfs.os_blocks >>= 1;
+                       osfs.os_bfree >>= 1;
+                       osfs.os_bavail >>= 1;
+               }
+       }
+
+       sfs->f_blocks = osfs.os_blocks;
+       sfs->f_bfree = osfs.os_bfree;
+       sfs->f_bavail = osfs.os_bavail;
+
+       return 0;
+}
+
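+/* Serialize i_size updates on a (non-directory) inode.  The owning task is
+ * recorded so the assertions below catch recursive or mismatched locking. */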
+void ll_inode_size_lock(struct inode *inode)
+{
+       struct ll_inode_info *lli;
+
+       LASSERT(!S_ISDIR(inode->i_mode));
+
+       lli = ll_i2info(inode);
+       LASSERT(lli->lli_size_sem_owner != current);
+       down(&lli->lli_size_sem);
+       LASSERT(lli->lli_size_sem_owner == NULL);
+       lli->lli_size_sem_owner = current;
+}
+
+void ll_inode_size_unlock(struct inode *inode)
+{
+       struct ll_inode_info *lli;
+
+       lli = ll_i2info(inode);
+       LASSERT(lli->lli_size_sem_owner == current);
+       lli->lli_size_sem_owner = NULL;
+       up(&lli->lli_size_sem);
+}
+
+void ll_update_inode(struct inode *inode, struct lustre_md *md)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct mdt_body *body = md->body;
+       struct lov_stripe_md *lsm = md->lsm;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+       LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+       if (lsm != NULL) {
+               if (!lli->lli_has_smd &&
+                   !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+                       cl_file_inode_init(inode, md);
+
+               lli->lli_maxbytes = lsm->lsm_maxbytes;
+               if (lli->lli_maxbytes > MAX_LFS_FILESIZE)
+                       lli->lli_maxbytes = MAX_LFS_FILESIZE;
+       }
+
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+               if (body->valid & OBD_MD_FLRMTPERM)
+                       ll_update_remote_perm(inode, md->remote_perm);
+       }
+#ifdef CONFIG_FS_POSIX_ACL
+       else if (body->valid & OBD_MD_FLACL) {
+               spin_lock(&lli->lli_lock);
+               if (lli->lli_posix_acl)
+                       posix_acl_release(lli->lli_posix_acl);
+               lli->lli_posix_acl = md->posix_acl;
+               spin_unlock(&lli->lli_lock);
+       }
+#endif
+       inode->i_ino = cl_fid_build_ino(&body->fid1, ll_need_32bit_api(sbi));
+       inode->i_generation = cl_fid_build_gen(&body->fid1);
+
+       if (body->valid & OBD_MD_FLATIME) {
+               if (body->atime > LTIME_S(inode->i_atime))
+                       LTIME_S(inode->i_atime) = body->atime;
+               lli->lli_lvb.lvb_atime = body->atime;
+       }
+       if (body->valid & OBD_MD_FLMTIME) {
+               if (body->mtime > LTIME_S(inode->i_mtime)) {
+                       CDEBUG(D_INODE, "setting ino %lu mtime from %lu "
+                              "to "LPU64"\n", inode->i_ino,
+                              LTIME_S(inode->i_mtime), body->mtime);
+                       LTIME_S(inode->i_mtime) = body->mtime;
+               }
+               lli->lli_lvb.lvb_mtime = body->mtime;
+       }
+       if (body->valid & OBD_MD_FLCTIME) {
+               if (body->ctime > LTIME_S(inode->i_ctime))
+                       LTIME_S(inode->i_ctime) = body->ctime;
+               lli->lli_lvb.lvb_ctime = body->ctime;
+       }
+       if (body->valid & OBD_MD_FLMODE)
+               inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
+       if (body->valid & OBD_MD_FLTYPE)
+               inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
+       LASSERT(inode->i_mode != 0);
+       if (S_ISREG(inode->i_mode)) {
+               inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, LL_MAX_BLKSIZE_BITS);
+       } else {
+               inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+       }
+       if (body->valid & OBD_MD_FLUID)
+               inode->i_uid = body->uid;
+       if (body->valid & OBD_MD_FLGID)
+               inode->i_gid = body->gid;
+       if (body->valid & OBD_MD_FLFLAGS)
+               inode->i_flags = ll_ext_to_inode_flags(body->flags);
+       if (body->valid & OBD_MD_FLNLINK)
+               set_nlink(inode, body->nlink);
+       if (body->valid & OBD_MD_FLRDEV)
+               inode->i_rdev = old_decode_dev(body->rdev);
+
+       if (body->valid & OBD_MD_FLID) {
+               /* FID shouldn't be changed! */
+               if (fid_is_sane(&lli->lli_fid)) {
+                       LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1),
+                                "Trying to change FID "DFID
+                                " to the "DFID", inode %lu/%u(%p)\n",
+                                PFID(&lli->lli_fid), PFID(&body->fid1),
+                                inode->i_ino, inode->i_generation, inode);
+               } else
+                       lli->lli_fid = body->fid1;
+       }
+
+       LASSERT(fid_seq(&lli->lli_fid) != 0);
+
+       if (body->valid & OBD_MD_FLSIZE) {
+               if (exp_connect_som(ll_i2mdexp(inode)) &&
+                   S_ISREG(inode->i_mode)) {
+                       struct lustre_handle lockh;
+                       ldlm_mode_t mode;
+
+                       /* As it is possible a blocking AST has been processed
+                        * by this time, we need to check that there is an
+                        * UPDATE lock on the client and set LLIF_MDS_SIZE_LOCK
+                        * while holding it. */
+                       mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE,
+                                              &lockh, LDLM_FL_CBPENDING);
+                       if (mode) {
+                               if (lli->lli_flags & (LLIF_DONE_WRITING |
+                                                     LLIF_EPOCH_PENDING |
+                                                     LLIF_SOM_DIRTY)) {
+                                       CERROR("ino %lu flags %u still has "
+                                              "size authority! do not trust "
+                                              "the size got from MDS\n",
+                                              inode->i_ino, lli->lli_flags);
+                               } else {
+                                       /* Use old size assignment to avoid
+                                        * deadlock bz14138 & bz14326 */
+                                       i_size_write(inode, body->size);
+                                       lli->lli_flags |= LLIF_MDS_SIZE_LOCK;
+                               }
+                               ldlm_lock_decref(&lockh, mode);
+                       }
+               } else {
+                       /* Use old size assignment to avoid
+                        * deadlock bz14138 & bz14326 */
+                       i_size_write(inode, body->size);
+
+                       CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n",
+                              inode->i_ino, (unsigned long long)body->size);
+               }
+
+               if (body->valid & OBD_MD_FLBLOCKS)
+                       inode->i_blocks = body->blocks;
+       }
+
+       if (body->valid & OBD_MD_FLMDSCAPA) {
+               LASSERT(md->mds_capa);
+               ll_add_capa(inode, md->mds_capa);
+       }
+       if (body->valid & OBD_MD_FLOSSCAPA) {
+               LASSERT(md->oss_capa);
+               ll_add_capa(inode, md->oss_capa);
+       }
+}
+
+void ll_read_inode2(struct inode *inode, void *opaque)
+{
+       struct lustre_md *md = opaque;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
+              PFID(&lli->lli_fid), inode);
+
+       LASSERT(!lli->lli_has_smd);
+
+       /* Core attributes from the MDS first.  This is a new inode, and
+        * the VFS doesn't zero times in the core inode so we have to do
+        * it ourselves.  They will be overwritten by either MDS or OST
+        * attributes - we just need to make sure they aren't newer. */
+       LTIME_S(inode->i_mtime) = 0;
+       LTIME_S(inode->i_atime) = 0;
+       LTIME_S(inode->i_ctime) = 0;
+       inode->i_rdev = 0;
+       ll_update_inode(inode, md);
+
+       /* OIDEBUG(inode); */
+
+       /* initializing backing dev info. */
+       inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi;
+
+
+       if (S_ISREG(inode->i_mode)) {
+               struct ll_sb_info *sbi = ll_i2sbi(inode);
+               inode->i_op = &ll_file_inode_operations;
+               inode->i_fop = sbi->ll_fop;
+               inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops;
+               EXIT;
+       } else if (S_ISDIR(inode->i_mode)) {
+               inode->i_op = &ll_dir_inode_operations;
+               inode->i_fop = &ll_dir_operations;
+               EXIT;
+       } else if (S_ISLNK(inode->i_mode)) {
+               inode->i_op = &ll_fast_symlink_inode_operations;
+               EXIT;
+       } else {
+               inode->i_op = &ll_special_inode_operations;
+
+               init_special_inode(inode, inode->i_mode,
+                                  inode->i_rdev);
+
+               EXIT;
+       }
+}
+
+void ll_delete_inode(struct inode *inode)
+{
+       struct cl_inode_info *lli = cl_i2info(inode);
+       ENTRY;
+
+       if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL)
+               /* discard all dirty pages before truncating them, required by
+                * osc_extent implementation at LU-1030. */
+               cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_DISCARD);
+
+       truncate_inode_pages(&inode->i_data, 0);
+
+       /* Workaround for LU-118 */
+       if (inode->i_data.nrpages) {
+               TREE_READ_LOCK_IRQ(&inode->i_data);
+               TREE_READ_UNLOCK_IRQ(&inode->i_data);
+               LASSERTF(inode->i_data.nrpages == 0,
+                        "inode=%lu/%u(%p) nrpages=%lu, see "
+                        "http://jira.whamcloud.com/browse/LU-118\n",
+                        inode->i_ino, inode->i_generation, inode,
+                        inode->i_data.nrpages);
+       }
+       /* Workaround end */
+
+       ll_clear_inode(inode);
+       clear_inode(inode);
+
+       EXIT;
+}
+
+int ll_iocontrol(struct inode *inode, struct file *file,
+                unsigned int cmd, unsigned long arg)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *req = NULL;
+       int rc, flags = 0;
+       ENTRY;
+
+       switch(cmd) {
+       case FSFILT_IOC_GETFLAGS: {
+               struct mdt_body *body;
+               struct md_op_data *op_data;
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+                                            0, 0, LUSTRE_OPC_ANY,
+                                            NULL);
+               if (IS_ERR(op_data))
+                       RETURN(PTR_ERR(op_data));
+
+               op_data->op_valid = OBD_MD_FLFLAGS;
+               rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+               ll_finish_md_op_data(op_data);
+               if (rc) {
+                       CERROR("failure %d inode %lu\n", rc, inode->i_ino);
+                       RETURN(-abs(rc));
+               }
+
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+               flags = body->flags;
+
+               ptlrpc_req_finished(req);
+
+               RETURN(put_user(flags, (int *)arg));
+       }
+       case FSFILT_IOC_SETFLAGS: {
+               struct lov_stripe_md *lsm;
+               struct obd_info oinfo = { { { 0 } } };
+               struct md_op_data *op_data;
+
+               if (get_user(flags, (int *)arg))
+                       RETURN(-EFAULT);
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, NULL);
+               if (IS_ERR(op_data))
+                       RETURN(PTR_ERR(op_data));
+
+               ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = flags;
+               op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+               rc = md_setattr(sbi->ll_md_exp, op_data,
+                               NULL, 0, NULL, 0, &req, NULL);
+               ll_finish_md_op_data(op_data);
+               ptlrpc_req_finished(req);
+               if (rc)
+                       RETURN(rc);
+
+               inode->i_flags = ll_ext_to_inode_flags(flags);
+
+               lsm = ccc_inode_lsm_get(inode);
+               if (lsm == NULL)
+                       RETURN(0);
+
+               OBDO_ALLOC(oinfo.oi_oa);
+               if (!oinfo.oi_oa) {
+                       ccc_inode_lsm_put(inode, lsm);
+                       RETURN(-ENOMEM);
+               }
+               oinfo.oi_md = lsm;
+               oinfo.oi_oa->o_oi = lsm->lsm_oi;
+               oinfo.oi_oa->o_flags = flags;
+               oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS |
+                                      OBD_MD_FLGROUP;
+               oinfo.oi_capa = ll_mdscapa_get(inode);
+               obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid);
+               rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
+               capa_put(oinfo.oi_capa);
+               OBDO_FREE(oinfo.oi_oa);
+               ccc_inode_lsm_put(inode, lsm);
+
+               if (rc && rc != -EPERM && rc != -EACCES)
+                       CERROR("osc_setattr_async fails: rc = %d\n", rc);
+
+               RETURN(rc);
+       }
+       default:
+               RETURN(-ENOSYS);
+       }
+
+       RETURN(0);
+}
+
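+/* Ask both the MD and DT exports to flush the calling user's security
+ * context. */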
+int ll_flush_ctx(struct inode *inode)
+{
+       struct ll_sb_info  *sbi = ll_i2sbi(inode);
+
+       CDEBUG(D_SEC, "flush context for user %d\n", current_uid());
+
+       obd_set_info_async(NULL, sbi->ll_md_exp,
+                          sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
+                          0, NULL, NULL);
+       obd_set_info_async(NULL, sbi->ll_dt_exp,
+                          sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
+                          0, NULL, NULL);
+       return 0;
+}
+
+/* umount -f client means force down, don't save state */
+void ll_umount_begin(struct super_block *sb)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct obd_device *obd;
+       struct obd_ioctl_data *ioc_data;
+       ENTRY;
+
+
+       CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
+              sb->s_count, atomic_read(&sb->s_active));
+
+       obd = class_exp2obd(sbi->ll_md_exp);
+       if (obd == NULL) {
+               CERROR("Invalid MDC connection handle "LPX64"\n",
+                      sbi->ll_md_exp->exp_handle.h_cookie);
+               EXIT;
+               return;
+       }
+       obd->obd_force = 1;
+
+       obd = class_exp2obd(sbi->ll_dt_exp);
+       if (obd == NULL) {
+               CERROR("Invalid LOV connection handle "LPX64"\n",
+                      sbi->ll_dt_exp->exp_handle.h_cookie);
+               EXIT;
+               return;
+       }
+       obd->obd_force = 1;
+
+       OBD_ALLOC_PTR(ioc_data);
+       if (ioc_data) {
+               obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp,
+                             sizeof *ioc_data, ioc_data, NULL);
+
+               obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp,
+                             sizeof *ioc_data, ioc_data, NULL);
+
+               OBD_FREE_PTR(ioc_data);
+       }
+
+
+       /* Really, we'd like to wait until there are no requests outstanding,
+        * and then continue.  For now, we just invalidate the requests,
+        * schedule() and sleep one second if needed, and hope.
+        */
+       schedule();
+
+       EXIT;
+}
+
+int ll_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       char *profilenm = get_profile_name(sb);
+       int err;
+       __u32 read_only;
+
+       if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+               read_only = *flags & MS_RDONLY;
+               err = obd_set_info_async(NULL, sbi->ll_md_exp,
+                                        sizeof(KEY_READ_ONLY),
+                                        KEY_READ_ONLY, sizeof(read_only),
+                                        &read_only, NULL);
+               if (err) {
+                       LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
+                                     profilenm, read_only ?
+                                     "read-only" : "read-write", err);
+                       return err;
+               }
+
+               if (read_only)
+                       sb->s_flags |= MS_RDONLY;
+               else
+                       sb->s_flags &= ~MS_RDONLY;
+
+               if (sbi->ll_flags & LL_SBI_VERBOSE)
+                       LCONSOLE_WARN("Remounted %s %s\n", profilenm,
+                                     read_only ?  "read-only" : "read-write");
+       }
+       return 0;
+}
+
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+                 struct super_block *sb, struct lookup_intent *it)
+{
+       struct ll_sb_info *sbi = NULL;
+       struct lustre_md md;
+       int rc;
+       ENTRY;
+
+       LASSERT(*inode || sb);
+       sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
+       rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
+                             sbi->ll_md_exp, &md);
+       if (rc)
+               RETURN(rc);
+
+       if (*inode) {
+               ll_update_inode(*inode, &md);
+       } else {
+               LASSERT(sb != NULL);
+
+               /*
+                * At this point the server returns to the client the same FID
+                * that the client generated at create time, so using ->fid1 is
+                * okay here.
+                */
+               LASSERT(fid_is_sane(&md.body->fid1));
+
+               *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1,
+                                                     ll_need_32bit_api(sbi)),
+                                &md);
+               if (*inode == NULL || IS_ERR(*inode)) {
+#ifdef CONFIG_FS_POSIX_ACL
+                       if (md.posix_acl) {
+                               posix_acl_release(md.posix_acl);
+                               md.posix_acl = NULL;
+                       }
+#endif
+                       rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM;
+                       *inode = NULL;
+                       CERROR("new_inode -fatal: rc %d\n", rc);
+                       GOTO(out, rc);
+               }
+       }
+
+       /* Handle a piggybacked layout lock.
+        * A layout lock can be piggybacked on getattr and open requests.
+        * The lsm can be applied to the inode only if it comes with a layout
+        * lock, otherwise the correct layout may be overwritten, for example:
+        * 1. proc1: the MDT returns an lsm but does not grant the layout lock
+        * 2. the layout is changed by another client
+        * 3. proc2: refreshes the layout, layout lock granted
+        * 4. proc1: applies the now-stale layout */
+       if (it != NULL && it->d.lustre.it_lock_mode != 0) {
+               struct lustre_handle lockh;
+               struct ldlm_lock *lock;
+
+               lockh.cookie = it->d.lustre.it_lock_handle;
+               lock = ldlm_handle2lock(&lockh);
+               LASSERT(lock != NULL);
+               if (ldlm_has_layout(lock)) {
+                       struct cl_object_conf conf;
+
+                       memset(&conf, 0, sizeof(conf));
+                       conf.coc_opc = OBJECT_CONF_SET;
+                       conf.coc_inode = *inode;
+                       conf.coc_lock = lock;
+                       conf.u.coc_md = &md;
+                       (void)ll_layout_conf(*inode, &conf);
+               }
+               LDLM_LOCK_PUT(lock);
+       }
+
+out:
+       if (md.lsm != NULL)
+               obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+       md_free_lustre_md(sbi->ll_md_exp, &md);
+       RETURN(rc);
+}
+
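+/* Handle the IOC_OBD_STATFS ioctl: validate the user-supplied ioctl buffer
+ * layout and forward the statfs request to the selected MD or DT export. */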
+int ll_obd_statfs(struct inode *inode, void *arg)
+{
+       struct ll_sb_info *sbi = NULL;
+       struct obd_export *exp;
+       char *buf = NULL;
+       struct obd_ioctl_data *data = NULL;
+       __u32 type;
+       __u32 flags;
+       int len = 0, rc;
+
+       if (!inode || !(sbi = ll_i2sbi(inode)))
+               GOTO(out_statfs, rc = -EINVAL);
+
+       rc = obd_ioctl_getdata(&buf, &len, arg);
+       if (rc)
+               GOTO(out_statfs, rc);
+
+       data = (void*)buf;
+       if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
+           !data->ioc_pbuf1 || !data->ioc_pbuf2)
+               GOTO(out_statfs, rc = -EINVAL);
+
+       if (data->ioc_inllen1 != sizeof(__u32) ||
+           data->ioc_inllen2 != sizeof(__u32) ||
+           data->ioc_plen1 != sizeof(struct obd_statfs) ||
+           data->ioc_plen2 != sizeof(struct obd_uuid))
+               GOTO(out_statfs, rc = -EINVAL);
+
+       memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
+       if (type & LL_STATFS_LMV)
+               exp = sbi->ll_md_exp;
+       else if (type & LL_STATFS_LOV)
+               exp = sbi->ll_dt_exp;
+       else
+               GOTO(out_statfs, rc = -ENODEV);
+
+       flags = (type & LL_STATFS_NODELAY) ? OBD_STATFS_NODELAY : 0;
+       rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, &flags);
+       if (rc)
+               GOTO(out_statfs, rc);
+out_statfs:
+       if (buf)
+               obd_ioctl_freedata(buf, len);
+       return rc;
+}
+
+int ll_process_config(struct lustre_cfg *lcfg)
+{
+       char *ptr;
+       void *sb;
+       struct lprocfs_static_vars lvars;
+       unsigned long x;
+       int rc = 0;
+
+       lprocfs_llite_init_vars(&lvars);
+
+       /* The instance name contains the sb: lustre-client-aacfe000 */
+       ptr = strrchr(lustre_cfg_string(lcfg, 0), '-');
+       if (!ptr || !*(++ptr))
+               return -EINVAL;
+       if (sscanf(ptr, "%lx", &x) != 1)
+               return -EINVAL;
+       sb = (void *)x;
+       /* This better be a real Lustre superblock! */
+       LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC);
+
+       /* Note we have not called client_common_fill_super yet, so
+          proc fns must be able to handle that! */
+       rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
+                                     lcfg, sb);
+       if (rc > 0)
+               rc = 0;
+       return(rc);
+}
+
+/* Prepare the md_op_data hint before passing it down to the MD stack. */
+struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data,
+                                      struct inode *i1, struct inode *i2,
+                                      const char *name, int namelen,
+                                      int mode, __u32 opc, void *data)
+{
+       LASSERT(i1 != NULL);
+
+       if (namelen > ll_i2sbi(i1)->ll_namelen)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       if (op_data == NULL)
+               OBD_ALLOC_PTR(op_data);
+
+       if (op_data == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       ll_i2gids(op_data->op_suppgids, i1, i2);
+       op_data->op_fid1 = *ll_inode2fid(i1);
+       op_data->op_capa1 = ll_mdscapa_get(i1);
+
+       if (i2) {
+               op_data->op_fid2 = *ll_inode2fid(i2);
+               op_data->op_capa2 = ll_mdscapa_get(i2);
+       } else {
+               fid_zero(&op_data->op_fid2);
+               op_data->op_capa2 = NULL;
+       }
+
+       op_data->op_name = name;
+       op_data->op_namelen = namelen;
+       op_data->op_mode = mode;
+       op_data->op_mod_time = cfs_time_current_sec();
+       op_data->op_fsuid = current_fsuid();
+       op_data->op_fsgid = current_fsgid();
+       op_data->op_cap = cfs_curproc_cap_pack();
+       op_data->op_bias = 0;
+       op_data->op_cli_flags = 0;
+       if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) &&
+            filename_is_volatile(name, namelen, NULL))
+               op_data->op_bias |= MDS_CREATE_VOLATILE;
+       op_data->op_opc = opc;
+       op_data->op_mds = 0;
+       op_data->op_data = data;
+
+       /* If the file is being opened after mknod() (normally due to NFS)
+        * try to use the default stripe data from parent directory for
+        * allocating OST objects.  Try to pass the parent FID to MDS. */
+       if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) &&
+           !ll_i2info(i2)->lli_has_smd) {
+               struct ll_inode_info *lli = ll_i2info(i2);
+
+               spin_lock(&lli->lli_lock);
+               if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid)))
+                       op_data->op_fid1 = lli->lli_pfid;
+               spin_unlock(&lli->lli_lock);
+               /* We ignore the parent's capability temporarily. */
+       }
+
+       /* When called by ll_setattr_raw, file is i1. */
+       if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags)
+               op_data->op_bias |= MDS_DATA_MODIFIED;
+
+       return op_data;
+}
+
+void ll_finish_md_op_data(struct md_op_data *op_data)
+{
+       capa_put(op_data->op_capa1);
+       capa_put(op_data->op_capa2);
+       OBD_FREE_PTR(op_data);
+}
+
+int ll_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+       struct ll_sb_info *sbi;
+
+       LASSERT((seq != NULL) && (dentry != NULL));
+       sbi = ll_s2sbi(dentry->d_sb);
+
+       if (sbi->ll_flags & LL_SBI_NOLCK)
+               seq_puts(seq, ",nolock");
+
+       if (sbi->ll_flags & LL_SBI_FLOCK)
+               seq_puts(seq, ",flock");
+
+       if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+               seq_puts(seq, ",localflock");
+
+       if (sbi->ll_flags & LL_SBI_USER_XATTR)
+               seq_puts(seq, ",user_xattr");
+
+       if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+               seq_puts(seq, ",lazystatfs");
+
+       if (sbi->ll_flags & LL_SBI_USER_FID2PATH)
+               seq_puts(seq, ",user_fid2path");
+
+       RETURN(0);
+}
+
+/**
+ * Get the obd name for the given cmd and copy it out to user space.
+ */
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct obd_device *obd;
+       ENTRY;
+
+       if (cmd == OBD_IOC_GETDTNAME)
+               obd = class_exp2obd(sbi->ll_dt_exp);
+       else if (cmd == OBD_IOC_GETMDNAME)
+               obd = class_exp2obd(sbi->ll_md_exp);
+       else
+               RETURN(-EINVAL);
+
+       if (!obd)
+               RETURN(-ENOENT);
+
+       if (copy_to_user((void *)arg, obd->obd_name,
+                            strlen(obd->obd_name) + 1))
+               RETURN(-EFAULT);
+
+       RETURN(0);
+}
+
+/**
+ * Get the Lustre file system name for \a sb.  If \a buf is provided
+ * (non-NULL), the fsname will be returned in this buffer; otherwise, a static
+ * buffer will be used to store the fsname and returned to the caller.
+ */
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen)
+{
+       static char fsname_static[MTI_NAME_MAXLEN];
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       char *ptr;
+       int len;
+
+       if (buf == NULL) {
+               /* This means the caller wants to use the static buffer
+                * and does not care about races.  Usually this is used
+                * in the error reporting path. */
+               buf = fsname_static;
+               buflen = sizeof(fsname_static);
+       }
+
+       len = strlen(lsi->lsi_lmd->lmd_profile);
+       ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+       if (ptr && (strcmp(ptr, "-client") == 0))
+               len -= 7;
+
+       if (unlikely(len >= buflen))
+               len = buflen - 1;
+       strncpy(buf, lsi->lsi_lmd->lmd_profile, len);
+       buf[len] = '\0';
+
+       return buf;
+}
+
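+/* Resolve a dentry to a pathname relative to the current root; only used
+ * for the diagnostic warning below. */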
+static char* ll_d_path(struct dentry *dentry, char *buf, int bufsize)
+{
+       char *path = NULL;
+
+       struct path p;
+
+       p.dentry = dentry;
+       p.mnt = current->fs->root.mnt;
+       path_get(&p);
+       path = d_path(&p, buf, bufsize);
+       path_put(&p);
+
+       return path;
+}
+
+void ll_dirty_page_discard_warn(struct page *page, int ioret)
+{
+       char *buf, *path = NULL;
+       struct dentry *dentry = NULL;
+       struct ccc_object *obj = cl_inode2ccc(page->mapping->host);
+
+       /* This can be called inside a spinlock, so use GFP_ATOMIC. */
+       buf = (char *)__get_free_page(GFP_ATOMIC);
+       if (buf != NULL) {
+               dentry = d_find_alias(page->mapping->host);
+               if (dentry != NULL)
+                       path = ll_d_path(dentry, buf, PAGE_SIZE);
+       }
+
+       CWARN("%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted "
+             "(rc %d)\n", ll_get_fsname(page->mapping->host->i_sb, NULL, 0),
+             s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev,
+             PFID(&obj->cob_header.coh_lu.loh_fid),
+             (path && !IS_ERR(path)) ? path : "", ioret);
+
+       if (dentry != NULL)
+               dput(dentry);
+
+       if (buf != NULL)
+               free_page((unsigned long)buf);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
new file mode 100644 (file)
index 0000000..d9590d8
--- /dev/null
@@ -0,0 +1,507 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                      int *type);
+
+static struct vm_operations_struct ll_file_vm_ops;
+
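+/* Translate a user address range within a mapped vma into the corresponding
+ * byte extent of the backing file, for use in ldlm lock requests. */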
+void policy_from_vma(ldlm_policy_data_t *policy,
+                           struct vm_area_struct *vma, unsigned long addr,
+                           size_t count)
+{
+       policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
+                                (vma->vm_pgoff << PAGE_CACHE_SHIFT);
+       policy->l_extent.end = (policy->l_extent.start + count - 1) |
+                              ~CFS_PAGE_MASK;
+}
+
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+                              size_t count)
+{
+       struct vm_area_struct *vma, *ret = NULL;
+       ENTRY;
+
+       /* mmap_sem must have been held by caller. */
+       LASSERT(!down_write_trylock(&mm->mmap_sem));
+
+       for(vma = find_vma(mm, addr);
+           vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
+               if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
+                   vma->vm_flags & VM_SHARED) {
+                       ret = vma;
+                       break;
+               }
+       }
+       RETURN(ret);
+}
+
+/**
+ * API-independent part of page fault initialization.
+ * \param vma - virtual memory area addressed by the page fault
+ * \param env - corresponding lu_env for processing
+ * \param nest - nesting level
+ * \param index - page index corresponding to the fault.
+ * \param ra_flags - vma readahead flags.
+ *
+ * \return allocated and initialized env for the fault operation.
+ * \retval EINVAL if the env can't be allocated
+ * \return other error codes from cl_io_init.
+ */
+struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
+                              struct lu_env **env_ret,
+                              struct cl_env_nest *nest,
+                              pgoff_t index, unsigned long *ra_flags)
+{
+       struct file       *file  = vma->vm_file;
+       struct inode      *inode = file->f_dentry->d_inode;
+       struct cl_io      *io;
+       struct cl_fault_io *fio;
+       struct lu_env     *env;
+       ENTRY;
+
+       *env_ret = NULL;
+       if (ll_file_nolock(file))
+               RETURN(ERR_PTR(-EOPNOTSUPP));
+
+       /*
+        * A page fault can occur while Lustre IO is already active for the
+        * current thread, e.g. when doing a read/write against a user-level
+        * buffer that is itself mapped from a Lustre file.  To avoid stomping
+        * on the existing context, a nested environment is allocated.
+        */
+       env = cl_env_nested_get(nest);
+       if (IS_ERR(env))
+                RETURN(ERR_PTR(-EINVAL));
+
+       *env_ret = env;
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = ll_i2info(inode)->lli_clob;
+       LASSERT(io->ci_obj != NULL);
+
+       fio = &io->u.ci_fault;
+       fio->ft_index      = index;
+       fio->ft_executable = vma->vm_flags&VM_EXEC;
+
+       /*
+        * Disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+        * the kernel will not read other pages not covered by ldlm in
+        * filemap_nopage.  We do our readahead in ll_readpage.
+        */
+       if (ra_flags != NULL)
+               *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+       vma->vm_flags &= ~VM_SEQ_READ;
+       vma->vm_flags |= VM_RAND_READ;
+
+       CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
+              fio->ft_index, fio->ft_executable);
+
+       if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
+               struct ccc_io *cio = ccc_env_io(env);
+               struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+               LASSERT(cio->cui_cl.cis_io == io);
+
+               /* The mmap lock must be MANDATORY
+                * because it has to cache pages. */
+               io->ci_lockreq = CILR_MANDATORY;
+
+               cio->cui_fd  = fd;
+       }
+
+       return io;
+}
+
+/* Code shared by the page_mkwrite methods for RHEL5 and RHEL6 kernels */
+static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
+                           bool *retry)
+{
+       struct lu_env      *env;
+       struct cl_io        *io;
+       struct vvp_io      *vio;
+       struct cl_env_nest       nest;
+       int                   result;
+       sigset_t             set;
+       struct inode         *inode;
+       struct ll_inode_info     *lli;
+       ENTRY;
+
+       LASSERT(vmpage != NULL);
+
+       io = ll_fault_io_init(vma, &env,  &nest, vmpage->index, NULL);
+       if (IS_ERR(io))
+               GOTO(out, result = PTR_ERR(io));
+
+       result = io->ci_result;
+       if (result < 0)
+               GOTO(out, result);
+
+       io->u.ci_fault.ft_mkwrite = 1;
+       io->u.ci_fault.ft_writable = 1;
+
+       vio = vvp_env_io(env);
+       vio->u.fault.ft_vma    = vma;
+       vio->u.fault.ft_vmpage = vmpage;
+
+       set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+       /* we grab lli_trunc_sem to exclude truncate case.
+        * Otherwise, we could add dirty pages into osc cache
+        * while truncate is on-going. */
+       inode = ccc_object_inode(io->ci_obj);
+       lli = ll_i2info(inode);
+       down_read(&lli->lli_trunc_sem);
+
+       result = cl_io_loop(env, io);
+
+       up_read(&lli->lli_trunc_sem);
+
+       cfs_restore_sigs(set);
+
+       if (result == 0) {
+               struct inode *inode = vma->vm_file->f_dentry->d_inode;
+               struct ll_inode_info *lli = ll_i2info(inode);
+
+               lock_page(vmpage);
+               if (vmpage->mapping == NULL) {
+                       unlock_page(vmpage);
+
+                       /* The page was truncated and the lock was cancelled;
+                        * return -ENODATA so that VM_FAULT_NOPAGE will be
+                        * returned to handle_mm_fault(). */
+                       if (result == 0)
+                               result = -ENODATA;
+               } else if (!PageDirty(vmpage)) {
+                       /* Race: the page has been cleaned by ptlrpcd after it
+                        * was unlocked.  It has to be added into the dirty
+                        * cache again, otherwise this soon-to-be-dirty page
+                        * won't consume any grants; even worse, if the page is
+                        * being transferred, the RPC checksum will be broken.
+                        */
+                       unlock_page(vmpage);
+
+                       CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has "
+                              "been written out, retry.\n",
+                              vmpage, vmpage->index);
+
+                       *retry = true;
+                       result = -EAGAIN;
+               }
+
+               if (result == 0) {
+                       spin_lock(&lli->lli_lock);
+                       lli->lli_flags |= LLIF_DATA_MODIFIED;
+                       spin_unlock(&lli->lli_lock);
+               }
+       }
+       EXIT;
+
+out:
+       cl_io_fini(env, io);
+       cl_env_nested_put(&nest, env);
+
+       CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
+
+       LASSERT(ergo(result == 0, PageLocked(vmpage)));
+       return(result);
+}
+
+
+
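+/* Map an internal errno-style result to the VM_FAULT_* codes expected by
+ * the VM fault handlers. */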
+static inline int to_fault_error(int result)
+{
+       switch(result) {
+       case 0:
+               result = VM_FAULT_LOCKED;
+               break;
+       case -EFAULT:
+               result = VM_FAULT_NOPAGE;
+               break;
+       case -ENOMEM:
+               result = VM_FAULT_OOM;
+               break;
+       default:
+               result = VM_FAULT_SIGBUS;
+               break;
+       }
+       return result;
+}
+
+/**
+ * Lustre implementation of a vm_operations_struct::fault() method, called by
+ * the VM to serve a page fault (in both kernel and user space).
+ *
+ * \param vma - virtual memory area struct related to the page fault
+ * \param vmf - structure describing the fault type and address
+ *
+ * \return allocated and filled _locked_ page for the address
+ * \retval VM_FAULT_ERROR on general error
+ * \retval NOPAGE_OOM if there is no memory to allocate a new page
+ */
+static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct lu_env      *env;
+       struct cl_io        *io;
+       struct vvp_io      *vio = NULL;
+       struct page          *vmpage;
+       unsigned long       ra_flags;
+       struct cl_env_nest       nest;
+       int                   result;
+       int                   fault_ret = 0;
+       ENTRY;
+
+       io = ll_fault_io_init(vma, &env,  &nest, vmf->pgoff, &ra_flags);
+       if (IS_ERR(io))
+               RETURN(to_fault_error(PTR_ERR(io)));
+
+       result = io->ci_result;
+       if (result == 0) {
+               vio = vvp_env_io(env);
+               vio->u.fault.ft_vma       = vma;
+               vio->u.fault.ft_vmpage    = NULL;
+               vio->u.fault.fault.ft_vmf = vmf;
+
+               result = cl_io_loop(env, io);
+
+               fault_ret = vio->u.fault.fault.ft_flags;
+               vmpage = vio->u.fault.ft_vmpage;
+               if (result != 0 && vmpage != NULL) {
+                       page_cache_release(vmpage);
+                       vmf->page = NULL;
+               }
+       }
+       cl_io_fini(env, io);
+       cl_env_nested_put(&nest, env);
+
+       vma->vm_flags |= ra_flags;
+       if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
+               fault_ret |= to_fault_error(result);
+
+       CDEBUG(D_MMAP, "%s fault %d/%d\n",
+              current->comm, fault_ret, result);
+       RETURN(fault_ret);
+}
+
+static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       int count = 0;
+       bool printed = false;
+       int result;
+       sigset_t set;
+
+       /* Only SIGKILL and SIGTERM are allowed during fault/nopage/mkwrite
+        * so that the process can be killed by an admin but other signals
+        * do not cause a segfault. */
+       set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+restart:
+       result = ll_fault0(vma, vmf);
+       LASSERT(!(result & VM_FAULT_LOCKED));
+       if (result == 0) {
+               struct page *vmpage = vmf->page;
+
+               /* check if this page has been truncated */
+               lock_page(vmpage);
+               if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
+                       unlock_page(vmpage);
+                       page_cache_release(vmpage);
+                       vmf->page = NULL;
+
+                       if (!printed && ++count > 16) {
+                               CWARN("the page is under heavy contention, "
+                                     "maybe your app (%s) needs revising :-)\n",
+                                     current->comm);
+                               printed = true;
+                       }
+
+                       goto restart;
+               }
+
+               result |= VM_FAULT_LOCKED;
+       }
+       cfs_restore_sigs(set);
+       return result;
+}
+
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       int count = 0;
+       bool printed = false;
+       bool retry;
+       int result;
+
+       do {
+               retry = false;
+               result = ll_page_mkwrite0(vma, vmf->page, &retry);
+
+               if (!printed && ++count > 16) {
+                       CWARN("app(%s): the page %lu of file %lu is under heavy"
+                             " contention.\n",
+                             current->comm, vmf->pgoff,
+                             vma->vm_file->f_dentry->d_inode->i_ino);
+                       printed = true;
+               }
+       } while (retry);
+
+       switch(result) {
+       case 0:
+               LASSERT(PageLocked(vmf->page));
+               result = VM_FAULT_LOCKED;
+               break;
+       case -ENODATA:
+       case -EFAULT:
+               result = VM_FAULT_NOPAGE;
+               break;
+       case -ENOMEM:
+               result = VM_FAULT_OOM;
+               break;
+       case -EAGAIN:
+               result = VM_FAULT_RETRY;
+               break;
+       default:
+               result = VM_FAULT_SIGBUS;
+               break;
+       }
+
+       return result;
+}
+
+/**
+ *  To avoid cancelling the locks covering a mmapped region under lock cache
+ *  pressure, we track the mapped vma count in ccc_object::cob_mmap_cnt.
+ */
+static void ll_vm_open(struct vm_area_struct * vma)
+{
+       struct inode *inode    = vma->vm_file->f_dentry->d_inode;
+       struct ccc_object *vob = cl_inode2ccc(inode);
+
+       ENTRY;
+       LASSERT(vma->vm_file);
+       LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+       atomic_inc(&vob->cob_mmap_cnt);
+       EXIT;
+}
+
+/**
+ * Dual to ll_vm_open().
+ */
+static void ll_vm_close(struct vm_area_struct *vma)
+{
+       struct inode      *inode = vma->vm_file->f_dentry->d_inode;
+       struct ccc_object *vob   = cl_inode2ccc(inode);
+
+       ENTRY;
+       LASSERT(vma->vm_file);
+       atomic_dec(&vob->cob_mmap_cnt);
+       LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+       EXIT;
+}
+
+
+/* return the user space pointer that maps to a file offset via a vma */
+static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
+{
+       return vma->vm_start + (byte - ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT));
+
+}
+
+/* XXX put nice comment here.  talk about __free_pte -> dirty pages and
+ * nopage's reference passing to the pte */
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
+{
+       int rc = -ENOENT;
+       ENTRY;
+
+       LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
+       if (mapping_mapped(mapping)) {
+               rc = 0;
+               unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
+                                   last - first + 1, 0);
+       }
+
+       RETURN(rc);
+}
+
+static struct vm_operations_struct ll_file_vm_ops = {
+       .fault                  = ll_fault,
+       .page_mkwrite           = ll_page_mkwrite,
+       .open                   = ll_vm_open,
+       .close                  = ll_vm_close,
+};
+
+int ll_file_mmap(struct file *file, struct vm_area_struct * vma)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       int rc;
+       ENTRY;
+
+       if (ll_file_nolock(file))
+               RETURN(-EOPNOTSUPP);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
+       rc = generic_file_mmap(file, vma);
+       if (rc == 0) {
+               vma->vm_ops = &ll_file_vm_ops;
+               vma->vm_ops->open(vma);
+               /* update the inode's size and mtime */
+               rc = ll_glimpse_size(inode);
+       }
+
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c
new file mode 100644 (file)
index 0000000..28cc41e
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/llite_nfs.c
+ *
+ * NFS export of Lustre Light File System
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Huang Hua <huanghua@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/exportfs.h>
+
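+/* Hash a name (e.g. a client UUID string) into a 32-bit value. */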
+__u32 get_uuid2int(const char *name, int len)
+{
+       __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9;
+       while (len--) {
+               __u32 key = key1 + (key0 ^ (*name++ * 7152373));
+               if (key & 0x80000000)
+                       key -= 0x7fffffff;
+               key1 = key0;
+               key0 = key;
+       }
+       return (key0 << 1);
+}
+
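+/* Match callback for ilookup5(): compare the inode's Lustre FID with the
+ * FID passed in @opaque. */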
+static int ll_nfs_test_inode(struct inode *inode, void *opaque)
+{
+       return lu_fid_eq(&ll_i2info(inode)->lli_fid,
+                        (struct lu_fid *)opaque);
+}
+
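+/* Find the inode for @fid: check the inode cache first; on a miss, fetch
+ * the attributes from the MDS and instantiate a new inode. */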
+struct inode *search_inode_for_lustre(struct super_block *sb,
+                                     const struct lu_fid *fid)
+{
+       struct ll_sb_info     *sbi = ll_s2sbi(sb);
+       struct ptlrpc_request *req = NULL;
+       struct inode      *inode = NULL;
+       int                eadatalen = 0;
+       unsigned long         hash = cl_fid_build_ino(fid,
+                                                     ll_need_32bit_api(sbi));
+       struct  md_op_data    *op_data;
+       int                rc;
+       ENTRY;
+
+       CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid));
+
+       inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid);
+       if (inode)
+               RETURN(inode);
+
+       rc = ll_get_max_mdsize(sbi, &eadatalen);
+       if (rc)
+               RETURN(ERR_PTR(rc));
+
+       /* Because the inode is NULL, ll_prep_md_op_data() cannot be used
+        * here, so we allocate op_data ourselves. */
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       op_data->op_fid1 = *fid;
+       op_data->op_mode = eadatalen;
+       op_data->op_valid = OBD_MD_FLEASIZE;
+
+       /* mds_fid2dentry ignores f_type */
+       rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+       OBD_FREE_PTR(op_data);
+       if (rc) {
+               CERROR("can't get object attrs, fid "DFID", rc %d\n",
+                      PFID(fid), rc);
+               RETURN(ERR_PTR(rc));
+       }
+       rc = ll_prep_inode(&inode, req, sb, NULL);
+       ptlrpc_req_finished(req);
+       if (rc)
+               RETURN(ERR_PTR(rc));
+
+       RETURN(inode);
+}
+
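+/* Layout of a Lustre NFS file handle: the child's FID followed by the FID
+ * of its parent directory. */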
+struct lustre_nfs_fid {
+       struct lu_fid   lnf_child;
+       struct lu_fid   lnf_parent;
+};
+
+static struct dentry *
+ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent)
+{
+       struct inode  *inode;
+       struct dentry *result;
+       ENTRY;
+
+       CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid));
+       if (!fid_is_sane(fid))
+               RETURN(ERR_PTR(-ESTALE));
+
+       inode = search_inode_for_lustre(sb, fid);
+       if (IS_ERR(inode))
+               RETURN(ERR_PTR(PTR_ERR(inode)));
+
+       if (is_bad_inode(inode)) {
+               /* we didn't find the right inode. */
+               iput(inode);
+               RETURN(ERR_PTR(-ESTALE));
+       }
+
+       /*
+        * This is an anonymous dentry without OST objects created yet.
+        * We have to find the parent to tell the MDS how to initialize
+        * the LOV objects.
+        */
+       if (S_ISREG(inode->i_mode) && !ll_i2info(inode)->lli_has_smd &&
+           parent != NULL) {
+               struct ll_inode_info *lli = ll_i2info(inode);
+
+               spin_lock(&lli->lli_lock);
+               lli->lli_pfid = *parent;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       result = d_obtain_alias(inode);
+       if (IS_ERR(result))
+               RETURN(result);
+
+       ll_dops_init(result, 1, 0);
+
+       RETURN(result);
+}
+
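+/* File handle type value returned by ll_encode_fh() and expected back by
+ * the fh_to_dentry()/fh_to_parent() callbacks. */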
+#define LUSTRE_NFS_FID   0x97
+
+/**
+ * \a connectable - whether nfsd will connect the dentry itself or whether
+ *               this should be done by Lustre
+ *
+ * The return value is file handle type:
+ * 1 -- contains child file handle;
+ * 2 -- contains child file handle and parent file handle;
+ * 255 -- error.
+ */
+static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen,
+                       struct inode *parent)
+{
+       struct lustre_nfs_fid *nfs_fid = (void *)fh;
+       ENTRY;
+
+       CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n",
+             inode->i_ino, PFID(ll_inode2fid(inode)), *plen,
+             (int)sizeof(struct lustre_nfs_fid));
+
+       if (*plen < sizeof(struct lustre_nfs_fid) / 4)
+               RETURN(255);
+
+       nfs_fid->lnf_child = *ll_inode2fid(inode);
+       nfs_fid->lnf_parent = *ll_inode2fid(parent);
+       *plen = sizeof(struct lustre_nfs_fid) / 4;
+
+       RETURN(LUSTRE_NFS_FID);
+}
+
+static int ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen,
+                                  loff_t hash, u64 ino, unsigned type)
+{
+       /* It is a hack to access lde_fid for comparison with lgd_fid,
+        * so the input 'name' must be part of the 'lu_dirent'. */
+       struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name);
+       struct ll_getname_data *lgd = cookie;
+       struct lu_fid fid;
+
+       fid_le_to_cpu(&fid, &lde->lde_fid);
+       if (lu_fid_eq(&fid, &lgd->lgd_fid)) {
+               memcpy(lgd->lgd_name, name, namelen);
+               lgd->lgd_name[namelen] = 0;
+               lgd->lgd_found = 1;
+       }
+       return lgd->lgd_found;
+}
+
+static int ll_get_name(struct dentry *dentry, char *name,
+                      struct dentry *child)
+{
+       struct inode *dir = dentry->d_inode;
+       struct ll_getname_data lgd;
+       __u64 offset = 0;
+       int rc;
+       ENTRY;
+
+       if (!dir || !S_ISDIR(dir->i_mode))
+               GOTO(out, rc = -ENOTDIR);
+
+       if (!dir->i_fop)
+               GOTO(out, rc = -EINVAL);
+
+       lgd.lgd_name = name;
+       lgd.lgd_fid = ll_i2info(child->d_inode)->lli_fid;
+       lgd.lgd_found = 0;
+
+       mutex_lock(&dir->i_mutex);
+       rc = ll_dir_read(dir, &offset, &lgd, ll_nfs_get_name_filldir);
+       mutex_unlock(&dir->i_mutex);
+       if (!rc && !lgd.lgd_found)
+               rc = -ENOENT;
+       EXIT;
+
+out:
+       return rc;
+}
+
+static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                                     int fh_len, int fh_type)
+{
+       struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+       if (fh_type != LUSTRE_NFS_FID)
+               RETURN(ERR_PTR(-EPROTO));
+
+       RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent));
+}
+
+static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid,
+                                     int fh_len, int fh_type)
+{
+       struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+       if (fh_type != LUSTRE_NFS_FID)
+               RETURN(ERR_PTR(-EPROTO));
+
+       RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL));
+}
+
+static struct dentry *ll_get_parent(struct dentry *dchild)
+{
+       struct ptlrpc_request *req = NULL;
+       struct inode      *dir = dchild->d_inode;
+       struct ll_sb_info     *sbi;
+       struct dentry    *result = NULL;
+       struct mdt_body       *body;
+       static char        dotdot[] = "..";
+       struct md_op_data     *op_data;
+       int                rc;
+       int                   lmmsize;
+       ENTRY;
+
+       LASSERT(dir && S_ISDIR(dir->i_mode));
+
+       sbi = ll_s2sbi(dir->i_sb);
+
+       CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n",
+                       dir->i_ino, PFID(ll_inode2fid(dir)));
+
+       rc = ll_get_max_mdsize(sbi, &lmmsize);
+       if (rc != 0)
+               RETURN(ERR_PTR(rc));
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot,
+                                    strlen(dotdot), lmmsize,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN((void *)op_data);
+
+       rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+       ll_finish_md_op_data(op_data);
+       if (rc) {
+               CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
+               RETURN(ERR_PTR(rc));
+       }
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body->valid & OBD_MD_FLID);
+
+       CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n",
+               PFID(ll_inode2fid(dir)), PFID(&body->fid1));
+
+       result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL);
+
+       ptlrpc_req_finished(req);
+       RETURN(result);
+}
+
+struct export_operations lustre_export_operations = {
+       .get_parent = ll_get_parent,
+       .encode_fh  = ll_encode_fh,
+       .get_name   = ll_get_name,
+       .fh_to_dentry = ll_fh_to_dentry,
+       .fh_to_parent = ll_fh_to_parent,
+};
diff --git a/drivers/staging/lustre/lustre/llite/llite_rmtacl.c b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c
new file mode 100644 (file)
index 0000000..4c61036
--- /dev/null
@@ -0,0 +1,301 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_rmtacl.c
+ *
+ * Lustre Remote User Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <lustre_lite.h>
+#include <lustre_eacl.h>
+#include "llite_internal.h"
+
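+/* Map an id onto one of the RCE_HASHES (resp. EE_HASHES) hash buckets. */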
+static inline __u32 rce_hashfunc(uid_t id)
+{
+       return id & (RCE_HASHES - 1);
+}
+
+static inline __u32 ee_hashfunc(uid_t id)
+{
+       return id & (EE_HASHES - 1);
+}
+
+obd_valid rce_ops2valid(int ops)
+{
+       switch (ops) {
+       case RMT_LSETFACL:
+               return OBD_MD_FLRMTLSETFACL;
+       case RMT_LGETFACL:
+               return OBD_MD_FLRMTLGETFACL;
+       case RMT_RSETFACL:
+               return OBD_MD_FLRMTRSETFACL;
+       case RMT_RGETFACL:
+               return OBD_MD_FLRMTRGETFACL;
+       default:
+               return 0;
+       }
+}
+
+static struct rmtacl_ctl_entry *rce_alloc(pid_t key, int ops)
+{
+       struct rmtacl_ctl_entry *rce;
+
+       OBD_ALLOC_PTR(rce);
+       if (!rce)
+               return NULL;
+
+       INIT_LIST_HEAD(&rce->rce_list);
+       rce->rce_key = key;
+       rce->rce_ops = ops;
+
+       return rce;
+}
+
+static void rce_free(struct rmtacl_ctl_entry *rce)
+{
+       if (!list_empty(&rce->rce_list))
+               list_del(&rce->rce_list);
+
+       OBD_FREE_PTR(rce);
+}
+
+static struct rmtacl_ctl_entry *__rct_search(struct rmtacl_ctl_table *rct,
+                                          pid_t key)
+{
+       struct rmtacl_ctl_entry *rce;
+       struct list_head *head = &rct->rct_entries[rce_hashfunc(key)];
+
+       list_for_each_entry(rce, head, rce_list)
+               if (rce->rce_key == key)
+                       return rce;
+
+       return NULL;
+}
+
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key)
+{
+       struct rmtacl_ctl_entry *rce;
+
+       spin_lock(&rct->rct_lock);
+       rce = __rct_search(rct, key);
+       spin_unlock(&rct->rct_lock);
+       return rce;
+}
+
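+/* Insert a new remote-ACL control entry for @key, dropping any stale entry
+ * found with the same key. */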
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops)
+{
+       struct rmtacl_ctl_entry *rce, *e;
+
+       rce = rce_alloc(key, ops);
+       if (rce == NULL)
+               return -ENOMEM;
+
+       spin_lock(&rct->rct_lock);
+       e = __rct_search(rct, key);
+       if (unlikely(e != NULL)) {
+               CWARN("Unexpected stale rmtacl_entry found: "
+                     "[key: %d] [ops: %d]\n", (int)key, ops);
+               rce_free(e);
+       }
+       list_add_tail(&rce->rce_list, &rct->rct_entries[rce_hashfunc(key)]);
+       spin_unlock(&rct->rct_lock);
+
+       return 0;
+}
+
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key)
+{
+       struct rmtacl_ctl_entry *rce;
+
+       spin_lock(&rct->rct_lock);
+       rce = __rct_search(rct, key);
+       if (rce)
+               rce_free(rce);
+       spin_unlock(&rct->rct_lock);
+
+       return rce ? 0 : -ENOENT;
+}
+
+void rct_init(struct rmtacl_ctl_table *rct)
+{
+       int i;
+
+       spin_lock_init(&rct->rct_lock);
+       for (i = 0; i < RCE_HASHES; i++)
+               INIT_LIST_HEAD(&rct->rct_entries[i]);
+}
+
+void rct_fini(struct rmtacl_ctl_table *rct)
+{
+       struct rmtacl_ctl_entry *rce;
+       int i;
+
+       spin_lock(&rct->rct_lock);
+       for (i = 0; i < RCE_HASHES; i++)
+               while (!list_empty(&rct->rct_entries[i])) {
+                       rce = list_entry(rct->rct_entries[i].next,
+                                            struct rmtacl_ctl_entry, rce_list);
+                       rce_free(rce);
+               }
+       spin_unlock(&rct->rct_lock);
+}
+
+
+static struct eacl_entry *ee_alloc(pid_t key, struct lu_fid *fid, int type,
+                                  ext_acl_xattr_header *header)
+{
+       struct eacl_entry *ee;
+
+       OBD_ALLOC_PTR(ee);
+       if (!ee)
+               return NULL;
+
+       INIT_LIST_HEAD(&ee->ee_list);
+       ee->ee_key = key;
+       ee->ee_fid = *fid;
+       ee->ee_type = type;
+       ee->ee_acl = header;
+
+       return ee;
+}
+
+void ee_free(struct eacl_entry *ee)
+{
+       if (!list_empty(&ee->ee_list))
+               list_del(&ee->ee_list);
+
+       if (ee->ee_acl)
+               lustre_ext_acl_xattr_free(ee->ee_acl);
+
+       OBD_FREE_PTR(ee);
+}
+
+static struct eacl_entry *__et_search_del(struct eacl_table *et, pid_t key,
+                                       struct lu_fid *fid, int type)
+{
+       struct eacl_entry *ee;
+       struct list_head *head = &et->et_entries[ee_hashfunc(key)];
+
+       LASSERT(fid != NULL);
+       list_for_each_entry(ee, head, ee_list)
+               if (ee->ee_key == key) {
+                       if (lu_fid_eq(&ee->ee_fid, fid) &&
+                           ee->ee_type == type) {
+                               list_del_init(&ee->ee_list);
+                               return ee;
+                       }
+               }
+
+       return NULL;
+}
+
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+                                struct lu_fid *fid, int type)
+{
+       struct eacl_entry *ee;
+
+       spin_lock(&et->et_lock);
+       ee = __et_search_del(et, key, fid, type);
+       spin_unlock(&et->et_lock);
+       return ee;
+}
+
+void et_search_free(struct eacl_table *et, pid_t key)
+{
+       struct eacl_entry *ee, *next;
+       struct list_head *head = &et->et_entries[ee_hashfunc(key)];
+
+       spin_lock(&et->et_lock);
+       list_for_each_entry_safe(ee, next, head, ee_list)
+               if (ee->ee_key == key)
+                       ee_free(ee);
+
+       spin_unlock(&et->et_lock);
+}
+
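+/* Insert a new extended-ACL entry, dropping any stale entry with the same
+ * key, fid and type. */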
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+          ext_acl_xattr_header *header)
+{
+       struct eacl_entry *ee, *e;
+
+       ee = ee_alloc(key, fid, type, header);
+       if (ee == NULL)
+               return -ENOMEM;
+
+       spin_lock(&et->et_lock);
+       e = __et_search_del(et, key, fid, type);
+       if (unlikely(e != NULL)) {
+               CWARN("Unexpected stale eacl_entry found: "
+                     "[key: %d] [fid: "DFID"] [type: %d]\n",
+                     (int)key, PFID(fid), type);
+               ee_free(e);
+       }
+       list_add_tail(&ee->ee_list, &et->et_entries[ee_hashfunc(key)]);
+       spin_unlock(&et->et_lock);
+
+       return 0;
+}
+
+void et_init(struct eacl_table *et)
+{
+       int i;
+
+       spin_lock_init(&et->et_lock);
+       for (i = 0; i < EE_HASHES; i++)
+               INIT_LIST_HEAD(&et->et_entries[i]);
+}
+
+void et_fini(struct eacl_table *et)
+{
+       struct eacl_entry *ee;
+       int i;
+
+       spin_lock(&et->et_lock);
+       for (i = 0; i < EE_HASHES; i++)
+               while (!list_empty(&et->et_entries[i])) {
+                       ee = list_entry(et->et_entries[i].next,
+                                           struct eacl_entry, ee_list);
+                       ee_free(ee);
+               }
+       spin_unlock(&et->et_lock);
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
new file mode 100644 (file)
index 0000000..b72f257
--- /dev/null
@@ -0,0 +1,869 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ *  linux/drivers/block/loop.c
+ *
+ *  Written by Theodore Ts'o, 3/29/93
+ *
+ * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
+ * permitted under the GNU General Public License.
+ *
+ * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
+ * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
+ *
+ * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
+ *
+ * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
+ *
+ * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
+ *
+ * Loadable modules and other fixes by AK, 1998
+ *
+ * Maximum number of loop devices now dynamic via max_loop module parameter.
+ * Russell Kroll <rkroll@exploits.org> 19990701
+ *
+ * Maximum number of loop devices when compiled-in now selectable by passing
+ * max_loop=<1-255> to the kernel on boot.
+ * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
+ *
+ * Completely rewrite request handling to be make_request_fn style and
+ * non blocking, pushing work to a helper thread. Lots of fixes from
+ * Al Viro too.
+ * Jens Axboe <axboe@suse.de>, Nov 2000
+ *
+ * Support up to 256 loop devices
+ * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
+ *
+ * Support for falling back on the write file operation when the address space
+ * operations prepare_write and/or commit_write are not available on the
+ * backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
+ * Still To Fix:
+ * - Advisory locking is ignored here.
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>         /* for invalidate_bdev() */
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/pagevec.h>
+
+#include <asm/uaccess.h>
+
+#include <lustre_lib.h>
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+#define LLOOP_MAX_SEGMENTS     LNET_MAX_IOV
+
+/* Possible states of device */
+enum {
+       LLOOP_UNBOUND,
+       LLOOP_BOUND,
+       LLOOP_RUNDOWN,
+};
+
+struct lloop_device {
+       int               lo_number;
+       int               lo_refcnt;
+       loff_t         lo_offset;
+       loff_t         lo_sizelimit;
+       int               lo_flags;
+       int             (*ioctl)(struct lloop_device *, int cmd,
+                                   unsigned long arg);
+
+       struct file      *lo_backing_file;
+       struct block_device *lo_device;
+       unsigned             lo_blocksize;
+
+       int               old_gfp_mask;
+
+       spinlock_t              lo_lock;
+       struct bio              *lo_bio;
+       struct bio              *lo_biotail;
+       int                     lo_state;
+       struct semaphore        lo_sem;
+       struct mutex            lo_ctl_mutex;
+       atomic_t         lo_pending;
+       wait_queue_head_t         lo_bh_wait;
+
+       struct request_queue *lo_queue;
+
+       const struct lu_env *lo_env;
+       struct cl_io     lo_io;
+       struct ll_dio_pages  lo_pvec;
+
+       /* data to handle bio for lustre. */
+       struct lo_request_data {
+               struct page *lrd_pages[LLOOP_MAX_SEGMENTS];
+               loff_t       lrd_offsets[LLOOP_MAX_SEGMENTS];
+       } lo_requests[1];
+};
+
+/*
+ * Loop flags
+ */
+enum {
+       LO_FLAGS_READ_ONLY       = 1,
+};
+
+static int lloop_major;
+#define MAX_LOOP_DEFAULT  16
+static int max_loop = MAX_LOOP_DEFAULT;
+static struct lloop_device *loop_dev;
+static struct gendisk **disks;
+static struct mutex lloop_mutex;
+static void *ll_iocontrol_magic = NULL;
+
+static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
+{
+       loff_t size, offset, loopsize;
+
+       /* Compute loopsize in bytes */
+       size = i_size_read(file->f_mapping->host);
+       offset = lo->lo_offset;
+       loopsize = size - offset;
+       if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
+               loopsize = lo->lo_sizelimit;
+
+       /*
+        * Unfortunately, if we want to do I/O on the device,
+        * the number of 512-byte sectors has to fit into a sector_t.
+        */
+       return loopsize >> 9;
+}
+
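+/* Service a chain of bios against the backing Lustre file: collect their
+ * pages into lo_pvec and issue them through the cl_io direct I/O path. */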
+static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
+{
+       const struct lu_env  *env   = lo->lo_env;
+       struct cl_io     *io    = &lo->lo_io;
+       struct inode     *inode = lo->lo_backing_file->f_dentry->d_inode;
+       struct cl_object     *obj = ll_i2info(inode)->lli_clob;
+       pgoff_t        offset;
+       int                ret;
+       int                i;
+       int                rw;
+       obd_count            page_count = 0;
+       struct bio_vec       *bvec;
+       struct bio         *bio;
+       ssize_t        bytes;
+
+       struct ll_dio_pages  *pvec = &lo->lo_pvec;
+       struct page      **pages = pvec->ldp_pages;
+       loff_t         *offsets = pvec->ldp_offsets;
+
+       truncate_inode_pages(inode->i_mapping, 0);
+
+       /* initialize the IO */
+       memset(io, 0, sizeof(*io));
+       io->ci_obj = obj;
+       ret = cl_io_init(env, io, CIT_MISC, obj);
+       if (ret)
+               return io->ci_result;
+       io->ci_lockreq = CILR_NEVER;
+
+       LASSERT(head != NULL);
+       rw = head->bi_rw;
+       for (bio = head; bio != NULL; bio = bio->bi_next) {
+               LASSERT(rw == bio->bi_rw);
+
+               offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
+               bio_for_each_segment(bvec, bio, i) {
+                       BUG_ON(bvec->bv_offset != 0);
+                       BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE);
+
+                       pages[page_count] = bvec->bv_page;
+                       offsets[page_count] = offset;
+                       page_count++;
+                       offset += bvec->bv_len;
+               }
+               LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
+       }
+
+       ll_stats_ops_tally(ll_i2sbi(inode),
+                       (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
+                       page_count);
+
+       pvec->ldp_size = page_count << PAGE_CACHE_SHIFT;
+       pvec->ldp_nr = page_count;
+
+       /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to
+        * write those pages to the OST. An even worse case is that more pages
+        * would be asked to be written out to swap space, and then finally get
+        * here again.
+        * Unfortunately this is NOT easy to fix.
+        * Thoughts on solution:
+        * 0. Define a reserved pool for cl_pages, which could be a list of
+        *    pre-allocated cl_pages;
+        * 1. Define a new operation in cl_object_operations{}, say clo_depth,
+        *    which measures how many layers for this lustre object. Generally
+        *    speaking, the depth would be 2, one for llite, and one for lovsub.
+        *    However, for SNS, there will be more since we need additional page
+        *    to store parity;
+        * 2. Reserve the # of (page_count * depth) cl_pages from the reserved
+        *    pool. Afterwards, clio would allocate the pages from the reserved
+        *    pool; this guarantees we needn't allocate the cl_pages from the
+        *    generic cl_page slab cache.
+        *    Of course, if there are NOT enough pages in the pool, we might
+        *    be asked to write fewer pages at a time; this purely depends on
+        *    the implementation. Anyway, we should be careful to avoid deadlocks.
+        */
+       mutex_lock(&inode->i_mutex);
+       bytes = ll_direct_rw_pages(env, io, rw, inode, pvec);
+       mutex_unlock(&inode->i_mutex);
+       cl_io_fini(env, io);
+       return (bytes == pvec->ldp_size) ? 0 : (int)bytes;
+}
+
+/*
+ * Add bio to back of pending list
+ */
+static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&lo->lo_lock, flags);
+       if (lo->lo_biotail) {
+               lo->lo_biotail->bi_next = bio;
+               lo->lo_biotail = bio;
+       } else {
+               lo->lo_bio = lo->lo_biotail = bio;
+       }
+       spin_unlock_irqrestore(&lo->lo_lock, flags);
+
+       atomic_inc(&lo->lo_pending);
+       if (waitqueue_active(&lo->lo_bh_wait))
+               wake_up(&lo->lo_bh_wait);
+}
+
+/*
+ * Grab first pending buffer
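+ *
+ * Consecutive bios going in the same direction are coalesced, up to
+ * LLOOP_MAX_SEGMENTS pages in total, so they can be handled in one cl_io.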
+ */
+static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req)
+{
+       struct bio *first;
+       struct bio **bio;
+       unsigned int count = 0;
+       unsigned int page_count = 0;
+       int rw;
+
+       spin_lock_irq(&lo->lo_lock);
+       first = lo->lo_bio;
+       if (unlikely(first == NULL)) {
+               spin_unlock_irq(&lo->lo_lock);
+               return 0;
+       }
+
+       /* TODO: need to split the bio, too bad. */
+       LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS);
+
+       rw = first->bi_rw;
+       bio = &lo->lo_bio;
+       while (*bio && (*bio)->bi_rw == rw) {
+               CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt %u\n",
+                      (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+                      page_count, (*bio)->bi_vcnt);
+               if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
+                       break;
+
+               page_count += (*bio)->bi_vcnt;
+               count++;
+               bio = &(*bio)->bi_next;
+       }
+       if (*bio) {
+               /* Some of the bios can't be merged. */
+               lo->lo_bio = *bio;
+               *bio = NULL;
+       } else {
+               /* Hit the end of queue */
+               lo->lo_biotail = NULL;
+               lo->lo_bio = NULL;
+       }
+       *req = first;
+       spin_unlock_irq(&lo->lo_lock);
+       return count;
+}
+
+static ll_mrf_ret
+loop_make_request(struct request_queue *q, struct bio *old_bio)
+{
+       struct lloop_device *lo = q->queuedata;
+       int rw = bio_rw(old_bio);
+       int inactive;
+
+       if (!lo)
+               goto err;
+
+       CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
+              (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
+
+       spin_lock_irq(&lo->lo_lock);
+       inactive = (lo->lo_state != LLOOP_BOUND);
+       spin_unlock_irq(&lo->lo_lock);
+       if (inactive)
+               goto err;
+
+       if (rw == WRITE) {
+               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
+                       goto err;
+       } else if (rw == READA) {
+               rw = READ;
+       } else if (rw != READ) {
+               CERROR("lloop: unknown command (%x)\n", rw);
+               goto err;
+       }
+       loop_add_bio(lo, old_bio);
+       LL_MRF_RETURN(0);
+err:
+       cfs_bio_io_error(old_bio, old_bio->bi_size);
+       LL_MRF_RETURN(0);
+}
+
+
+static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
+{
+       int ret;
+       ret = do_bio_lustrebacked(lo, bio);
+       while (bio) {
+               struct bio *tmp = bio->bi_next;
+               bio->bi_next = NULL;
+               cfs_bio_endio(bio, bio->bi_size, ret);
+               bio = tmp;
+       }
+}
+
+static inline int loop_active(struct lloop_device *lo)
+{
+       return atomic_read(&lo->lo_pending) ||
+               (lo->lo_state == LLOOP_RUNDOWN);
+}
+
+/*
+ * worker thread that handles reads/writes to file backed loop devices,
+ * to avoid blocking in our make_request_fn.
+ */
+static int loop_thread(void *data)
+{
+       struct lloop_device *lo = data;
+       struct bio *bio;
+       unsigned int count;
+       unsigned long times = 0;
+       unsigned long total_count = 0;
+
+       struct lu_env *env;
+       int refcheck;
+       int ret = 0;
+
+       set_user_nice(current, -20);
+
+       lo->lo_state = LLOOP_BOUND;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               GOTO(out, ret = PTR_ERR(env));
+
+       lo->lo_env = env;
+       memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec));
+       lo->lo_pvec.ldp_pages   = lo->lo_requests[0].lrd_pages;
+       lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets;
+
+       /*
+        * up sem, we are running
+        */
+       up(&lo->lo_sem);
+
+       for (;;) {
+               wait_event(lo->lo_bh_wait, loop_active(lo));
+               if (!atomic_read(&lo->lo_pending)) {
+                       int exiting = 0;
+                       spin_lock_irq(&lo->lo_lock);
+                       exiting = (lo->lo_state == LLOOP_RUNDOWN);
+                       spin_unlock_irq(&lo->lo_lock);
+                       if (exiting)
+                               break;
+               }
+
+               bio = NULL;
+               count = loop_get_bio(lo, &bio);
+               if (!count) {
+                       CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
+                       continue;
+               }
+
+               total_count += count;
+               if (total_count < count) {     /* overflow */
+                       total_count = count;
+                       times = 1;
+               } else {
+                       times++;
+               }
+               if ((times & 127) == 0) {
+                       CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n",
+                              total_count, times, total_count / times);
+               }
+
+               LASSERT(bio != NULL);
+               LASSERT(count <= atomic_read(&lo->lo_pending));
+               loop_handle_bio(lo, bio);
+               atomic_sub(count, &lo->lo_pending);
+       }
+       cl_env_put(env, &refcheck);
+
+out:
+       up(&lo->lo_sem);
+       return ret;
+}
+
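+/* Bind a regular Lustre file to this loop device, set up the request queue
+ * and start the per-device worker thread. */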
+static int loop_set_fd(struct lloop_device *lo, struct file *unused,
+                      struct block_device *bdev, struct file *file)
+{
+       struct inode     *inode;
+       struct address_space *mapping;
+       int                lo_flags = 0;
+       int                error;
+       loff_t          size;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       error = -EBUSY;
+       if (lo->lo_state != LLOOP_UNBOUND)
+               goto out;
+
+       mapping = file->f_mapping;
+       inode = mapping->host;
+
+       error = -EINVAL;
+       if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC)
+               goto out;
+
+       if (!(file->f_mode & FMODE_WRITE))
+               lo_flags |= LO_FLAGS_READ_ONLY;
+
+       size = get_loop_size(lo, file);
+
+       if ((loff_t)(sector_t)size != size) {
+               error = -EFBIG;
+               goto out;
+       }
+
+       /* Remove all pages from the page cache so that no dirty pages remain. */
+       truncate_inode_pages(mapping, 0);
+
+       set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+
+       lo->lo_blocksize = PAGE_CACHE_SIZE;
+       lo->lo_device = bdev;
+       lo->lo_flags = lo_flags;
+       lo->lo_backing_file = file;
+       lo->ioctl = NULL;
+       lo->lo_sizelimit = 0;
+       lo->old_gfp_mask = mapping_gfp_mask(mapping);
+       mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+       lo->lo_bio = lo->lo_biotail = NULL;
+
+       /*
+        * set queue make_request_fn, and add limits based on lower level
+        * device
+        */
+       blk_queue_make_request(lo->lo_queue, loop_make_request);
+       lo->lo_queue->queuedata = lo;
+
+       /* queue parameters */
+       CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8)));
+       blk_queue_logical_block_size(lo->lo_queue,
+                                    (unsigned short)PAGE_CACHE_SIZE);
+       blk_queue_max_hw_sectors(lo->lo_queue,
+                                LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9));
+       blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+
+       set_capacity(disks[lo->lo_number], size);
+       bd_set_size(bdev, size << 9);
+
+       set_blocksize(bdev, lo->lo_blocksize);
+
+       kthread_run(loop_thread, lo, "lloop%d", lo->lo_number);
+       down(&lo->lo_sem);
+       return 0;
+
+out:
+       /* This is safe: open() is still holding a reference. */
+       module_put(THIS_MODULE);
+       return error;
+}
+
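+/* Unbind the backing file: ask the worker thread to run down, then reset
+ * the device back to the unbound state. */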
+static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
+                      int count)
+{
+       struct file *filp = lo->lo_backing_file;
+       int gfp = lo->old_gfp_mask;
+
+       if (lo->lo_state != LLOOP_BOUND)
+               return -ENXIO;
+
+       if (lo->lo_refcnt > count)      /* we needed one fd for the ioctl */
+               return -EBUSY;
+
+       if (filp == NULL)
+               return -EINVAL;
+
+       spin_lock_irq(&lo->lo_lock);
+       lo->lo_state = LLOOP_RUNDOWN;
+       spin_unlock_irq(&lo->lo_lock);
+       wake_up(&lo->lo_bh_wait);
+
+       down(&lo->lo_sem);
+       lo->lo_backing_file = NULL;
+       lo->ioctl = NULL;
+       lo->lo_device = NULL;
+       lo->lo_offset = 0;
+       lo->lo_sizelimit = 0;
+       lo->lo_flags = 0;
+       ll_invalidate_bdev(bdev, 0);
+       set_capacity(disks[lo->lo_number], 0);
+       bd_set_size(bdev, 0);
+       mapping_set_gfp_mask(filp->f_mapping, gfp);
+       lo->lo_state = LLOOP_UNBOUND;
+       fput(filp);
+       /* This is safe: open() is still holding a reference. */
+       module_put(THIS_MODULE);
+       return 0;
+}
+
+static int lo_open(struct block_device *bdev, fmode_t mode)
+{
+       struct lloop_device *lo = bdev->bd_disk->private_data;
+
+       mutex_lock(&lo->lo_ctl_mutex);
+       lo->lo_refcnt++;
+       mutex_unlock(&lo->lo_ctl_mutex);
+
+       return 0;
+}
+
+static int lo_release(struct gendisk *disk, fmode_t mode)
+{
+       struct lloop_device *lo = disk->private_data;
+
+       mutex_lock(&lo->lo_ctl_mutex);
+       --lo->lo_refcnt;
+       mutex_unlock(&lo->lo_ctl_mutex);
+
+       return 0;
+}
+
+/* lloop device node's ioctl function. */
+static int lo_ioctl(struct block_device *bdev, fmode_t mode,
+                   unsigned int cmd, unsigned long arg)
+{
+       struct lloop_device *lo = bdev->bd_disk->private_data;
+       struct inode *inode = NULL;
+       int err = 0;
+
+       mutex_lock(&lloop_mutex);
+       switch (cmd) {
+       case LL_IOC_LLOOP_DETACH: {
+               err = loop_clr_fd(lo, bdev, 2);
+               if (err == 0)
+                       ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+               break;
+       }
+
+       case LL_IOC_LLOOP_INFO: {
+               struct lu_fid fid;
+
+               LASSERT(lo->lo_backing_file != NULL);
+               if (inode == NULL)
+                       inode = lo->lo_backing_file->f_dentry->d_inode;
+               if (lo->lo_state == LLOOP_BOUND)
+                       fid = ll_i2info(inode)->lli_fid;
+               else
+                       fid_zero(&fid);
+
+               if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid)))
+                       err = -EFAULT;
+               break;
+       }
+
+       default:
+               err = -EINVAL;
+               break;
+       }
+       mutex_unlock(&lloop_mutex);
+
+       return err;
+}
+
+static struct block_device_operations lo_fops = {
+       .owner =        THIS_MODULE,
+       .open =  lo_open,
+       .release =      lo_release,
+       .ioctl =        lo_ioctl,
+};
+
+/* dynamic iocontrol callback.
+ * This callback is registered in lloop_init and will be called by
+ * ll_iocontrol_call.
+ *
+ * This is an llite regular-file ioctl function. It is responsible for
+ * attaching a file to, or detaching it from, an lloop device by device number.
+ */
+static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
+                                  unsigned int cmd, unsigned long arg,
+                                  void *magic, int *rcp)
+{
+       struct lloop_device *lo = NULL;
+       struct block_device *bdev = NULL;
+       int err = 0;
+       dev_t dev;
+
+       if (magic != ll_iocontrol_magic)
+               return LLIOC_CONT;
+
+       if (disks == NULL)
+               GOTO(out1, err = -ENODEV);
+
+       CWARN("Enter lloop_ioctl\n");
+
+       mutex_lock(&lloop_mutex);
+       switch (cmd) {
+       case LL_IOC_LLOOP_ATTACH: {
+               struct lloop_device *lo_free = NULL;
+               int i;
+
+               for (i = 0; i < max_loop; i++, lo = NULL) {
+                       lo = &loop_dev[i];
+                       if (lo->lo_state == LLOOP_UNBOUND) {
+                               if (!lo_free)
+                                       lo_free = lo;
+                               continue;
+                       }
+                       if (lo->lo_backing_file->f_dentry->d_inode ==
+                           file->f_dentry->d_inode)
+                               break;
+               }
+               if (lo || !lo_free)
+                       GOTO(out, err = -EBUSY);
+
+               lo = lo_free;
+               dev = MKDEV(lloop_major, lo->lo_number);
+
+               /* bail out if the user-space pointer is not writable */
+               if (put_user((long)old_encode_dev(dev), (long*)arg))
+                       GOTO(out, err = -EFAULT);
+
+               bdev = blkdev_get_by_dev(dev, file->f_mode, NULL);
+               if (IS_ERR(bdev))
+                       GOTO(out, err = PTR_ERR(bdev));
+
+               get_file(file);
+               err = loop_set_fd(lo, NULL, bdev, file);
+               if (err) {
+                       fput(file);
+                       ll_blkdev_put(bdev, 0);
+               }
+
+               break;
+       }
+
+       case LL_IOC_LLOOP_DETACH_BYDEV: {
+               int minor;
+
+               dev = old_decode_dev(arg);
+               if (MAJOR(dev) != lloop_major)
+                       GOTO(out, err = -EINVAL);
+
+               minor = MINOR(dev);
+               if (minor > max_loop - 1)
+                       GOTO(out, err = -EINVAL);
+
+               lo = &loop_dev[minor];
+               if (lo->lo_state != LLOOP_BOUND)
+                       GOTO(out, err = -EINVAL);
+
+               bdev = lo->lo_device;
+               err = loop_clr_fd(lo, bdev, 1);
+               if (err == 0)
+                       ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+
+               break;
+       }
+
+       default:
+               err = -EINVAL;
+               break;
+       }
+
+out:
+       mutex_unlock(&lloop_mutex);
+out1:
+       if (rcp)
+               *rcp = err;
+       return LLIOC_STOP;
+}
+
+static int __init lloop_init(void)
+{
+       int     i;
+       unsigned int cmdlist[] = {
+               LL_IOC_LLOOP_ATTACH,
+               LL_IOC_LLOOP_DETACH_BYDEV,
+       };
+
+       if (max_loop < 1 || max_loop > 256) {
+               max_loop = MAX_LOOP_DEFAULT;
+               CWARN("lloop: invalid max_loop (must be between"
+                     " 1 and 256), using default (%u)\n", max_loop);
+       }
+
+       lloop_major = register_blkdev(0, "lloop");
+       if (lloop_major < 0)
+               return -EIO;
+
+       CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n",
+              lloop_major, max_loop);
+
+       ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
+       if (ll_iocontrol_magic == NULL)
+               goto out_mem1;
+
+       OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev));
+       if (!loop_dev)
+               goto out_mem1;
+
+       OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks));
+       if (!disks)
+               goto out_mem2;
+
+       for (i = 0; i < max_loop; i++) {
+               disks[i] = alloc_disk(1);
+               if (!disks[i])
+                       goto out_mem3;
+       }
+
+       mutex_init(&lloop_mutex);
+
+       for (i = 0; i < max_loop; i++) {
+               struct lloop_device *lo = &loop_dev[i];
+               struct gendisk *disk = disks[i];
+
+               lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
+               if (!lo->lo_queue)
+                       goto out_mem4;
+
+               mutex_init(&lo->lo_ctl_mutex);
+               sema_init(&lo->lo_sem, 0);
+               init_waitqueue_head(&lo->lo_bh_wait);
+               lo->lo_number = i;
+               spin_lock_init(&lo->lo_lock);
+               disk->major = lloop_major;
+               disk->first_minor = i;
+               disk->fops = &lo_fops;
+               sprintf(disk->disk_name, "lloop%d", i);
+               disk->private_data = lo;
+               disk->queue = lo->lo_queue;
+       }
+
+       /* We cannot fail after we call this, so another loop! */
+       for (i = 0; i < max_loop; i++)
+               add_disk(disks[i]);
+       return 0;
+
+out_mem4:
+       while (i--)
+               blk_cleanup_queue(loop_dev[i].lo_queue);
+       i = max_loop;
+out_mem3:
+       while (i--)
+               put_disk(disks[i]);
+       OBD_FREE(disks, max_loop * sizeof(*disks));
+out_mem2:
+       OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+out_mem1:
+       unregister_blkdev(lloop_major, "lloop");
+       ll_iocontrol_unregister(ll_iocontrol_magic);
+       CERROR("lloop: ran out of memory\n");
+       return -ENOMEM;
+}
+
+static void lloop_exit(void)
+{
+       int i;
+
+       ll_iocontrol_unregister(ll_iocontrol_magic);
+       for (i = 0; i < max_loop; i++) {
+               del_gendisk(disks[i]);
+               blk_cleanup_queue(loop_dev[i].lo_queue);
+               put_disk(disks[i]);
+       }
+       if (ll_unregister_blkdev(lloop_major, "lloop"))
+               CWARN("lloop: cannot unregister blkdev\n");
+       else
+               CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major);
+
+       OBD_FREE(disks, max_loop * sizeof(*disks));
+       OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+}
+
+module_init(lloop_init);
+module_exit(lloop_exit);
+
+CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum number of lloop devices");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre virtual block device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c
new file mode 100644 (file)
index 0000000..22e19a6
--- /dev/null
@@ -0,0 +1,1397 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/version.h>
+#include <lustre_lite.h>
+#include <lprocfs_status.h>
+#include <linux/seq_file.h>
+#include <obd_support.h>
+
+#include "llite_internal.h"
+
+struct proc_dir_entry *proc_lustre_fs_root;
+
+#ifdef LPROCFS
+/* /proc/lustre/llite mount point registration */
+extern struct file_operations vvp_dump_pgcache_file_ops;
+struct file_operations ll_rw_extents_stats_fops;
+struct file_operations ll_rw_extents_stats_pp_fops;
+struct file_operations ll_rw_offset_stats_fops;
+
+static int ll_rd_blksize(char *page, char **start, off_t off, int count,
+                        int *eof, void *data)
+{
+       struct super_block *sb = (struct super_block *)data;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               *eof = 1;
+               rc = snprintf(page, count, "%u\n", osfs.os_bsize);
+       }
+
+       return rc;
+}
+
+static int ll_rd_kbytestotal(char *page, char **start, off_t off, int count,
+                            int *eof, void *data)
+{
+       struct super_block *sb = (struct super_block *)data;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_blocks;
+
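+               /* Scale the block count by the block size in KiB (os_bsize is
+                * assumed to be a power of two) to report the size in KiB. */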
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+       return rc;
+}
+
+static int ll_rd_kbytesfree(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct super_block *sb = (struct super_block *)data;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bfree;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+       return rc;
+}
+
+static int ll_rd_kbytesavail(char *page, char **start, off_t off, int count,
+                            int *eof, void *data)
+{
+       struct super_block *sb = (struct super_block *)data;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bavail;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+       return rc;
+}
+
+static int ll_rd_filestotal(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct super_block *sb = (struct super_block *)data;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", osfs.os_files);
+       }
+       return rc;
+}
+
+static int ll_rd_filesfree(char *page, char **start, off_t off, int count,
+                          int *eof, void *data)
+{
+       struct super_block *sb = (struct super_block *)data;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", osfs.os_ffree);
+       }
+       return rc;
+}
+
+static int ll_rd_client_type(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+       int rc;
+
+       LASSERT(sbi != NULL);
+
+       *eof = 1;
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+               rc = snprintf(page, count, "remote client\n");
+       else
+               rc = snprintf(page, count, "local client\n");
+
+       return rc;
+}
+
+static int ll_rd_fstype(char *page, char **start, off_t off, int count,
+                       int *eof, void *data)
+{
+       struct super_block *sb = (struct super_block *)data;
+
+       LASSERT(sb != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%s\n", sb->s_type->name);
+}
+
+static int ll_rd_sb_uuid(char *page, char **start, off_t off, int count,
+                        int *eof, void *data)
+{
+       struct super_block *sb = (struct super_block *)data;
+
+       LASSERT(sb != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid);
+}
+
+static int ll_rd_site_stats(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+
+       /*
+        * See description of statistical counters in struct cl_site, and
+        * struct lu_site.
+        */
+       return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site),
+                                  page, count);
+}
+
+static int ll_rd_max_readahead_mb(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       long pages_number;
+       int mult;
+
+       spin_lock(&sbi->ll_lock);
+       pages_number = sbi->ll_ra_info.ra_max_pages;
+       spin_unlock(&sbi->ll_lock);
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       return lprocfs_read_frac_helper(page, count, pages_number, mult);
+}
+
+static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int mult, rc, pages_number;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       if (pages_number < 0 || pages_number > num_physpages / 2) {
+               CERROR("can't set file readahead more than %lu MB\n",
+                      num_physpages >> (20 - PAGE_CACHE_SHIFT + 1)); /* 1/2 of RAM */
+               return -ERANGE;
+       }
+
+       spin_lock(&sbi->ll_lock);
+       sbi->ll_ra_info.ra_max_pages = pages_number;
+       spin_unlock(&sbi->ll_lock);
+
+       return count;
+}
+
+static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off,
+                                          int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       long pages_number;
+       int mult;
+
+       spin_lock(&sbi->ll_lock);
+       pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
+       spin_unlock(&sbi->ll_lock);
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       return lprocfs_read_frac_helper(page, count, pages_number, mult);
+}
+
+static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer,
+                                         unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int mult, rc, pages_number;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       if (pages_number < 0 ||
+           pages_number > sbi->ll_ra_info.ra_max_pages) {
+               CERROR("can't set file readahead more than "
+                      "max_read_ahead_mb %lu MB\n",
+                      sbi->ll_ra_info.ra_max_pages);
+               return -ERANGE;
+       }
+
+       spin_lock(&sbi->ll_lock);
+       sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+       spin_unlock(&sbi->ll_lock);
+
+       return count;
+}
+
+static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
+                                        int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       long pages_number;
+       int mult;
+
+       spin_lock(&sbi->ll_lock);
+       pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages;
+       spin_unlock(&sbi->ll_lock);
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       return lprocfs_read_frac_helper(page, count, pages_number, mult);
+}
+
+static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
+                                        unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int mult, rc, pages_number;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       /* Cap this at the current max readahead window size; the readahead
+        * algorithm does this anyway so it's pointless to set it larger. */
+       if (pages_number < 0 ||
+           pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
+               CERROR("can't set max_read_ahead_whole_mb more than "
+                      "max_read_ahead_per_file_mb: %lu\n",
+                       sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT));
+               return -ERANGE;
+       }
+
+       spin_lock(&sbi->ll_lock);
+       sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number;
+       spin_unlock(&sbi->ll_lock);
+
+       return count;
+}
+
+static int ll_rd_max_cached_mb(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+       struct super_block     *sb    = data;
+       struct ll_sb_info      *sbi   = ll_s2sbi(sb);
+       struct cl_client_cache *cache = &sbi->ll_cache;
+       int shift = 20 - PAGE_CACHE_SHIFT;
+       int max_cached_mb;
+       int unused_mb;
+
+       *eof = 1;
+       max_cached_mb = cache->ccc_lru_max >> shift;
+       unused_mb = atomic_read(&cache->ccc_lru_left) >> shift;
+       return snprintf(page, count,
+                       "users: %d\n"
+                       "max_cached_mb: %d\n"
+                       "used_mb: %d\n"
+                       "unused_mb: %d\n"
+                       "reclaim_count: %u\n",
+                       atomic_read(&cache->ccc_users),
+                       max_cached_mb,
+                       max_cached_mb - unused_mb,
+                       unused_mb,
+                       cache->ccc_lru_shrinkers);
+}
+
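+/*
+ * Resize the client LRU cache.  Growing just adds free slots; shrinking
+ * first takes slots back from the free pool and then asks the OSCs to
+ * release pages until the remaining deficit is covered.
+ */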
+static int ll_wr_max_cached_mb(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct cl_client_cache *cache = &sbi->ll_cache;
+       int mult, rc, pages_number;
+       int diff = 0;
+       int nrpages = 0;
+       ENTRY;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       buffer = lprocfs_find_named_value(buffer, "max_cached_mb:", &count);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               RETURN(rc);
+
+       if (pages_number < 0 || pages_number > num_physpages) {
+               CERROR("%s: can't set max cache more than %lu MB\n",
+                      ll_get_fsname(sb, NULL, 0),
+                      num_physpages >> (20 - PAGE_CACHE_SHIFT));
+               RETURN(-ERANGE);
+       }
+
+       if (sbi->ll_dt_exp == NULL)
+               RETURN(-ENODEV);
+
+       spin_lock(&sbi->ll_lock);
+       diff = pages_number - cache->ccc_lru_max;
+       spin_unlock(&sbi->ll_lock);
+
+       /* easy - add more LRU slots. */
+       if (diff >= 0) {
+               atomic_add(diff, &cache->ccc_lru_left);
+               GOTO(out, rc = 0);
+       }
+
+       diff = -diff;
+       while (diff > 0) {
+               int tmp;
+
+               /* reduce LRU budget from free slots. */
+               do {
+                       int ov, nv;
+
+                       ov = atomic_read(&cache->ccc_lru_left);
+                       if (ov == 0)
+                               break;
+
+                       nv = ov > diff ? ov - diff : 0;
+                       rc = cfs_atomic_cmpxchg(&cache->ccc_lru_left, ov, nv);
+                       if (likely(ov == rc)) {
+                               diff -= ov - nv;
+                               nrpages += ov - nv;
+                               break;
+                       }
+               } while (1);
+
+               if (diff <= 0)
+                       break;
+
+               /* difficult - have to ask OSCs to drop LRU slots. */
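+               /* ask for twice the remaining deficit in each round */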
+               tmp = diff << 1;
+               rc = obd_set_info_async(NULL, sbi->ll_dt_exp,
+                               sizeof(KEY_CACHE_LRU_SHRINK),
+                               KEY_CACHE_LRU_SHRINK,
+                               sizeof(tmp), &tmp, NULL);
+               if (rc < 0)
+                       break;
+       }
+
+out:
+       if (rc >= 0) {
+               spin_lock(&sbi->ll_lock);
+               cache->ccc_lru_max = pages_number;
+               spin_unlock(&sbi->ll_lock);
+               rc = count;
+       } else {
+               atomic_add(nrpages, &cache->ccc_lru_left);
+       }
+       return rc;
+}
+
+static int ll_rd_checksum(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return snprintf(page, count, "%u\n",
+                       (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0);
+}
+
+static int ll_wr_checksum(struct file *file, const char *buffer,
+                         unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int val, rc;
+
+       if (!sbi->ll_dt_exp)
+               /* Not set up yet */
+               return -EAGAIN;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+       if (val)
+               sbi->ll_flags |= LL_SBI_CHECKSUM;
+       else
+               sbi->ll_flags &= ~LL_SBI_CHECKSUM;
+
+       rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+                               KEY_CHECKSUM, sizeof(val), &val, NULL);
+       if (rc)
+               CWARN("Failed to set OSC checksum flags: %d\n", rc);
+
+       return count;
+}
+
+static int ll_rd_max_rw_chunk(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+
+       return snprintf(page, count, "%lu\n", ll_s2sbi(sb)->ll_max_rw_chunk);
+}
+
+static int ll_wr_max_rw_chunk(struct file *file, const char *buffer,
+                         unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       int rc, val;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+       ll_s2sbi(sb)->ll_max_rw_chunk = val;
+       return count;
+}
+
+static int ll_rd_track_id(char *page, int count, void *data,
+                         enum stats_track_type type)
+{
+       struct super_block *sb = data;
+
+       if (ll_s2sbi(sb)->ll_stats_track_type == type) {
+               return snprintf(page, count, "%d\n",
+                               ll_s2sbi(sb)->ll_stats_track_id);
+
+       } else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) {
+               return snprintf(page, count, "0 (all)\n");
+       } else {
+               return snprintf(page, count, "untracked\n");
+       }
+}
+
+static int ll_wr_track_id(const char *buffer, unsigned long count, void *data,
+                         enum stats_track_type type)
+{
+       struct super_block *sb = data;
+       int rc, pid;
+
+       rc = lprocfs_write_helper(buffer, count, &pid);
+       if (rc)
+               return rc;
+       ll_s2sbi(sb)->ll_stats_track_id = pid;
+       if (pid == 0)
+               ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL;
+       else
+               ll_s2sbi(sb)->ll_stats_track_type = type;
+       lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats);
+       return count;
+}
+
+static int ll_rd_track_pid(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+       return (ll_rd_track_id(page, count, data, STATS_TRACK_PID));
+}
+
+static int ll_wr_track_pid(struct file *file, const char *buffer,
+                         unsigned long count, void *data)
+{
+       return (ll_wr_track_id(buffer, count, data, STATS_TRACK_PID));
+}
+
+static int ll_rd_track_ppid(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+       return (ll_rd_track_id(page, count, data, STATS_TRACK_PPID));
+}
+
+static int ll_wr_track_ppid(struct file *file, const char *buffer,
+                         unsigned long count, void *data)
+{
+       return (ll_wr_track_id(buffer, count, data, STATS_TRACK_PPID));
+}
+
+static int ll_rd_track_gid(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+       return (ll_rd_track_id(page, count, data, STATS_TRACK_GID));
+}
+
+static int ll_wr_track_gid(struct file *file, const char *buffer,
+                         unsigned long count, void *data)
+{
+       return (ll_wr_track_id(buffer, count, data, STATS_TRACK_GID));
+}
+
+static int ll_rd_statahead_max(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return snprintf(page, count, "%u\n", sbi->ll_sa_max);
+}
+
+static int ll_wr_statahead_max(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val >= 0 && val <= LL_SA_RPC_MAX)
+               sbi->ll_sa_max = val;
+       else
+               CERROR("Bad statahead_max value %d. Valid values are in the "
+                      "range [0, %d]\n", val, LL_SA_RPC_MAX);
+
+       return count;
+}
+
+static int ll_rd_statahead_agl(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return snprintf(page, count, "%u\n",
+                       sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0);
+}
+
+static int ll_wr_statahead_agl(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val)
+               sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+       else
+               sbi->ll_flags &= ~LL_SBI_AGL_ENABLED;
+
+       return count;
+}
+
+static int ll_rd_statahead_stats(char *page, char **start, off_t off,
+                                int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return snprintf(page, count,
+                       "statahead total: %u\n"
+                       "statahead wrong: %u\n"
+                       "agl total: %u\n",
+                       atomic_read(&sbi->ll_sa_total),
+                       atomic_read(&sbi->ll_sa_wrong),
+                       atomic_read(&sbi->ll_agl_total));
+}
+
+static int ll_rd_lazystatfs(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return snprintf(page, count, "%u\n",
+                       (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 1 : 0);
+}
+
+static int ll_wr_lazystatfs(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val)
+               sbi->ll_flags |= LL_SBI_LAZYSTATFS;
+       else
+               sbi->ll_flags &= ~LL_SBI_LAZYSTATFS;
+
+       return count;
+}
+
+static int ll_rd_maxea_size(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct super_block *sb = data;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       unsigned int ealen;
+       int rc;
+
+       rc = ll_get_max_mdsize(sbi, &ealen);
+       if (rc)
+               return rc;
+
+       return snprintf(page, count, "%u\n", ealen);
+}
+
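+/* Print the name of every bit currently set in sbi->ll_flags, using the
+ * LL_SBI_FLAGS string table. */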
+static int ll_rd_sbi_flags(char *page, char **start, off_t off,
+                               int count, int *eof, void *data)
+{
+       const char *str[] = LL_SBI_FLAGS;
+       struct super_block *sb = data;
+       int flags = ll_s2sbi(sb)->ll_flags;
+       int i = 0;
+       int rc = 0;
+
+       while (flags != 0) {
+               if (ARRAY_SIZE(str) <= i) {
+                       CERROR("%s: Revise array LL_SBI_FLAGS to match sbi "
+                               "flags please.\n", ll_get_fsname(sb, NULL, 0));
+                       return -EINVAL;
+               }
+
+               if (flags & 0x1)
+                       rc += snprintf(page + rc, count - rc, "%s ", str[i]);
+               flags >>= 1;
+               ++i;
+       }
+       if (rc > 0)
+               rc += snprintf(page + rc, count - rc, "\b\n");
+       return rc;
+}
+
+static int ll_rd_unstable_stats(char *page, char **start, off_t off,
+                             int count, int *eof, void *data)
+{
+       struct super_block      *sb    = data;
+       struct ll_sb_info       *sbi   = ll_s2sbi(sb);
+       struct cl_client_cache  *cache = &sbi->ll_cache;
+       int pages, mb, rc;
+
+       pages = atomic_read(&cache->ccc_unstable_nr);
+       mb    = (pages * PAGE_CACHE_SIZE) >> 20;
+
+       rc = snprintf(page, count, "unstable_pages: %8d\n"
+                                  "unstable_mb:    %8d\n", pages, mb);
+
+       return rc;
+}
+
+static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
+       { "uuid",        ll_rd_sb_uuid,   0, 0 },
+       //{ "mntpt_path",   ll_rd_path,      0, 0 },
+       { "fstype",       ll_rd_fstype,    0, 0 },
+       { "site",        ll_rd_site_stats,       0, 0 },
+       { "blocksize",    ll_rd_blksize,          0, 0 },
+       { "kbytestotal",  ll_rd_kbytestotal,      0, 0 },
+       { "kbytesfree",   ll_rd_kbytesfree,       0, 0 },
+       { "kbytesavail",  ll_rd_kbytesavail,      0, 0 },
+       { "filestotal",   ll_rd_filestotal,       0, 0 },
+       { "filesfree",    ll_rd_filesfree,      0, 0 },
+       { "client_type",  ll_rd_client_type,      0, 0 },
+       //{ "filegroups",   lprocfs_rd_filegroups,  0, 0 },
+       { "max_read_ahead_mb", ll_rd_max_readahead_mb,
+                              ll_wr_max_readahead_mb, 0 },
+       { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb,
+                                       ll_wr_max_readahead_per_file_mb, 0 },
+       { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
+                                    ll_wr_max_read_ahead_whole_mb, 0 },
+       { "max_cached_mb",    ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
+       { "checksum_pages",   ll_rd_checksum, ll_wr_checksum, 0 },
+       { "max_rw_chunk",     ll_rd_max_rw_chunk, ll_wr_max_rw_chunk, 0 },
+       { "stats_track_pid",  ll_rd_track_pid, ll_wr_track_pid, 0 },
+       { "stats_track_ppid", ll_rd_track_ppid, ll_wr_track_ppid, 0 },
+       { "stats_track_gid",  ll_rd_track_gid, ll_wr_track_gid, 0 },
+       { "statahead_max",    ll_rd_statahead_max, ll_wr_statahead_max, 0 },
+       { "statahead_agl",    ll_rd_statahead_agl, ll_wr_statahead_agl, 0 },
+       { "statahead_stats",  ll_rd_statahead_stats, 0, 0 },
+       { "lazystatfs",       ll_rd_lazystatfs, ll_wr_lazystatfs, 0 },
+       { "max_easize",       ll_rd_maxea_size, 0, 0 },
+       { "sbi_flags",  ll_rd_sbi_flags, 0, 0 },
+       { "unstable_stats",   ll_rd_unstable_stats, 0, 0},
+       { 0 }
+};
+
+#define MAX_STRING_SIZE 128
+
+struct llite_file_opcode {
+       __u32       opcode;
+       __u32       type;
+       const char *opname;
+} llite_opcode_table[LPROC_LL_FILE_OPCODES] = {
+       /* file operation */
+       { LPROC_LL_DIRTY_HITS,     LPROCFS_TYPE_REGS, "dirty_pages_hits" },
+       { LPROC_LL_DIRTY_MISSES,   LPROCFS_TYPE_REGS, "dirty_pages_misses" },
+       { LPROC_LL_READ_BYTES,     LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+                                  "read_bytes" },
+       { LPROC_LL_WRITE_BYTES,    LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+                                  "write_bytes" },
+       { LPROC_LL_BRW_READ,       LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+                                  "brw_read" },
+       { LPROC_LL_BRW_WRITE,      LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+                                  "brw_write" },
+       { LPROC_LL_OSC_READ,       LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+                                  "osc_read" },
+       { LPROC_LL_OSC_WRITE,      LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+                                  "osc_write" },
+       { LPROC_LL_IOCTL,         LPROCFS_TYPE_REGS, "ioctl" },
+       { LPROC_LL_OPEN,           LPROCFS_TYPE_REGS, "open" },
+       { LPROC_LL_RELEASE,     LPROCFS_TYPE_REGS, "close" },
+       { LPROC_LL_MAP,     LPROCFS_TYPE_REGS, "mmap" },
+       { LPROC_LL_LLSEEK,       LPROCFS_TYPE_REGS, "seek" },
+       { LPROC_LL_FSYNC,         LPROCFS_TYPE_REGS, "fsync" },
+       { LPROC_LL_READDIR,     LPROCFS_TYPE_REGS, "readdir" },
+       /* inode operation */
+       { LPROC_LL_SETATTR,     LPROCFS_TYPE_REGS, "setattr" },
+       { LPROC_LL_TRUNC,         LPROCFS_TYPE_REGS, "truncate" },
+       { LPROC_LL_FLOCK,         LPROCFS_TYPE_REGS, "flock" },
+       { LPROC_LL_GETATTR,     LPROCFS_TYPE_REGS, "getattr" },
+       /* dir inode operation */
+       { LPROC_LL_CREATE,       LPROCFS_TYPE_REGS, "create" },
+       { LPROC_LL_LINK,           LPROCFS_TYPE_REGS, "link" },
+       { LPROC_LL_UNLINK,       LPROCFS_TYPE_REGS, "unlink" },
+       { LPROC_LL_SYMLINK,     LPROCFS_TYPE_REGS, "symlink" },
+       { LPROC_LL_MKDIR,         LPROCFS_TYPE_REGS, "mkdir" },
+       { LPROC_LL_RMDIR,         LPROCFS_TYPE_REGS, "rmdir" },
+       { LPROC_LL_MKNOD,         LPROCFS_TYPE_REGS, "mknod" },
+       { LPROC_LL_RENAME,       LPROCFS_TYPE_REGS, "rename" },
+       /* special inode operation */
+       { LPROC_LL_STAFS,         LPROCFS_TYPE_REGS, "statfs" },
+       { LPROC_LL_ALLOC_INODE,    LPROCFS_TYPE_REGS, "alloc_inode" },
+       { LPROC_LL_SETXATTR,       LPROCFS_TYPE_REGS, "setxattr" },
+       { LPROC_LL_GETXATTR,       LPROCFS_TYPE_REGS, "getxattr" },
+       { LPROC_LL_LISTXATTR,      LPROCFS_TYPE_REGS, "listxattr" },
+       { LPROC_LL_REMOVEXATTR,    LPROCFS_TYPE_REGS, "removexattr" },
+       { LPROC_LL_INODE_PERM,     LPROCFS_TYPE_REGS, "inode_permission" },
+};
+
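+/*
+ * Account @count occurrences of operation @op in the per-mount stats,
+ * subject to the configured tracking filter (all, PID, PPID or GID).
+ */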
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count)
+{
+       if (!sbi->ll_stats)
+               return;
+       if (sbi->ll_stats_track_type == STATS_TRACK_ALL)
+               lprocfs_counter_add(sbi->ll_stats, op, count);
+       else if (sbi->ll_stats_track_type == STATS_TRACK_PID &&
+                sbi->ll_stats_track_id == current->pid)
+               lprocfs_counter_add(sbi->ll_stats, op, count);
+       else if (sbi->ll_stats_track_type == STATS_TRACK_PPID &&
+                sbi->ll_stats_track_id == current->parent->pid)
+               lprocfs_counter_add(sbi->ll_stats, op, count);
+       else if (sbi->ll_stats_track_type == STATS_TRACK_GID &&
+                sbi->ll_stats_track_id == current_gid())
+               lprocfs_counter_add(sbi->ll_stats, op, count);
+}
+EXPORT_SYMBOL(ll_stats_ops_tally);
+
+static const char *ra_stat_string[] = {
+       [RA_STAT_HIT] = "hits",
+       [RA_STAT_MISS] = "misses",
+       [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive",
+       [RA_STAT_MISS_IN_WINDOW] = "miss inside window",
+       [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page",
+       [RA_STAT_FAILED_MATCH] = "failed lock match",
+       [RA_STAT_DISCARDED] = "read but discarded",
+       [RA_STAT_ZERO_LEN] = "zero length file",
+       [RA_STAT_ZERO_WINDOW] = "zero size window",
+       [RA_STAT_EOF] = "read-ahead to EOF",
+       [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
+       [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
+};
+
+
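+/*
+ * Create the /proc tree for one llite mount: a "<fsname>-<sb>" directory
+ * holding the statistics files and the tunables defined above, plus
+ * common_name and uuid entries for the MDC and OSC devices backing this
+ * mount.
+ */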
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+                               struct super_block *sb, char *osc, char *mdc)
+{
+       struct lprocfs_vars lvars[2];
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct obd_device *obd;
+       char name[MAX_STRING_SIZE + 1], *ptr;
+       int err, id, len, rc;
+       ENTRY;
+
+       memset(lvars, 0, sizeof(lvars));
+
+       name[MAX_STRING_SIZE] = '\0';
+       lvars[0].name = name;
+
+       LASSERT(sbi != NULL);
+       LASSERT(mdc != NULL);
+       LASSERT(osc != NULL);
+
+       /* Get fsname, stripping a trailing "-client" from the profile */
+       len = strlen(lsi->lsi_lmd->lmd_profile);
+       ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+       if (ptr && (strcmp(ptr, "-client") == 0))
+               len -= 7;
+
+       /* Mount info */
+       snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len,
+                lsi->lsi_lmd->lmd_profile, sb);
+
+       sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL);
+       if (IS_ERR(sbi->ll_proc_root)) {
+               err = PTR_ERR(sbi->ll_proc_root);
+               sbi->ll_proc_root = NULL;
+               RETURN(err);
+       }
+
+       rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444,
+                               &vvp_dump_pgcache_file_ops, sbi);
+       if (rc)
+               CWARN("Error adding the dump_page_cache file\n");
+
+       rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644,
+                               &ll_rw_extents_stats_fops, sbi);
+       if (rc)
+               CWARN("Error adding the extents_stats file\n");
+
+       rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process",
+                               0644, &ll_rw_extents_stats_pp_fops, sbi);
+       if (rc)
+               CWARN("Error adding the extents_stats_per_process file\n");
+
+       rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644,
+                               &ll_rw_offset_stats_fops, sbi);
+       if (rc)
+               CWARN("Error adding the offset_stats file\n");
+
+       /* File operations stats */
+       sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES,
+                                           LPROCFS_STATS_FLAG_NONE);
+       if (sbi->ll_stats == NULL)
+               GOTO(out, err = -ENOMEM);
+       /* do counter init */
+       for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) {
+               __u32 type = llite_opcode_table[id].type;
+               void *ptr = NULL;
+               if (type & LPROCFS_TYPE_REGS)
+                       ptr = "regs";
+               else if (type & LPROCFS_TYPE_BYTES)
+                       ptr = "bytes";
+               else if (type & LPROCFS_TYPE_PAGES)
+                       ptr = "pages";
+               lprocfs_counter_init(sbi->ll_stats,
+                                    llite_opcode_table[id].opcode,
+                                    (type & LPROCFS_CNTR_AVGMINMAX),
+                                    llite_opcode_table[id].opname, ptr);
+       }
+       err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats);
+       if (err)
+               GOTO(out, err);
+
+       sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string),
+                                              LPROCFS_STATS_FLAG_NONE);
+       if (sbi->ll_ra_stats == NULL)
+               GOTO(out, err = -ENOMEM);
+
+       for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++)
+               lprocfs_counter_init(sbi->ll_ra_stats, id, 0,
+                                    ra_stat_string[id], "pages");
+       err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats",
+                                    sbi->ll_ra_stats);
+       if (err)
+               GOTO(out, err);
+
+       err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb);
+       if (err)
+               GOTO(out, err);
+
+       /* MDC info */
+       obd = class_name2obd(mdc);
+
+       LASSERT(obd != NULL);
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+       LASSERT(obd->obd_type->typ_name != NULL);
+
+       snprintf(name, MAX_STRING_SIZE, "%s/common_name",
+                obd->obd_type->typ_name);
+       lvars[0].read_fptr = lprocfs_rd_name;
+       err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd);
+       if (err)
+               GOTO(out, err);
+
+       snprintf(name, MAX_STRING_SIZE, "%s/uuid", obd->obd_type->typ_name);
+       lvars[0].read_fptr = lprocfs_rd_uuid;
+       err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd);
+       if (err)
+               GOTO(out, err);
+
+       /* OSC */
+       obd = class_name2obd(osc);
+
+       LASSERT(obd != NULL);
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+       LASSERT(obd->obd_type->typ_name != NULL);
+
+       snprintf(name, MAX_STRING_SIZE, "%s/common_name",
+                obd->obd_type->typ_name);
+       lvars[0].read_fptr = lprocfs_rd_name;
+       err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd);
+       if (err)
+               GOTO(out, err);
+
+       snprintf(name, MAX_STRING_SIZE, "%s/uuid", obd->obd_type->typ_name);
+       lvars[0].read_fptr = lprocfs_rd_uuid;
+       err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd);
+out:
+       if (err) {
+               lprocfs_remove(&sbi->ll_proc_root);
+               lprocfs_free_stats(&sbi->ll_ra_stats);
+               lprocfs_free_stats(&sbi->ll_stats);
+       }
+       RETURN(err);
+}
+
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi)
+{
+       if (sbi->ll_proc_root) {
+               lprocfs_remove(&sbi->ll_proc_root);
+               lprocfs_free_stats(&sbi->ll_ra_stats);
+               lprocfs_free_stats(&sbi->ll_stats);
+       }
+}
+#undef MAX_STRING_SIZE
+
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
+
+static void ll_display_extents_info(struct ll_rw_extents_info *io_extents,
+                                  struct seq_file *seq, int which)
+{
+       unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
+       unsigned long start, end, r, w;
+       char *unitp = "KMGTPEZY";
+       int i, units = 10;
+       struct per_process_info *pp_info = &io_extents->pp_extents[which];
+
+       read_cum = 0;
+       write_cum = 0;
+       start = 0;
+
+       for (i = 0; i < LL_HIST_MAX; i++) {
+               read_tot += pp_info->pp_r_hist.oh_buckets[i];
+               write_tot += pp_info->pp_w_hist.oh_buckets[i];
+       }
+
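+       /* print one line per bucket; the unit letter advances from 'K'
+        * upward every time the bucket boundary passes 1024 of the current
+        * unit */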
+       for (i = 0; i < LL_HIST_MAX; i++) {
+               r = pp_info->pp_r_hist.oh_buckets[i];
+               w = pp_info->pp_w_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               end = 1 << (i + LL_HIST_START - units);
+               seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu  | "
+                          "%14lu %4lu %4lu\n", start, *unitp, end, *unitp,
+                          (i == LL_HIST_MAX - 1) ? '+' : ' ',
+                          r, pct(r, read_tot), pct(read_cum, read_tot),
+                          w, pct(w, write_tot), pct(write_cum, write_tot));
+               start = end;
+               if (start == 1<<10) {
+                       start = 1;
+                       units += 10;
+                       unitp++;
+               }
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+}
+
+static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+       int k;
+
+       do_gettimeofday(&now);
+
+       if (!sbi->ll_rw_stats_on) {
+               seq_printf(seq, "disabled\n"
+                               "write anything in this file to activate, "
+                               "then 0 or \"[D/d]isabled\" to deactivate\n");
+               return 0;
+       }
+       seq_printf(seq, "snapshot_time:  %lu.%lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+       seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
+       seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
+                  "extents", "calls", "%", "cum%",
+                  "calls", "%", "cum%");
+       spin_lock(&sbi->ll_pp_extent_lock);
+       for (k = 0; k < LL_PROCESS_HIST_MAX; k++) {
+               if (io_extents->pp_extents[k].pid != 0) {
+                       seq_printf(seq, "\nPID: %d\n",
+                                  io_extents->pp_extents[k].pid);
+                       ll_display_extents_info(io_extents, seq, k);
+               }
+       }
+       spin_unlock(&sbi->ll_pp_extent_lock);
+       return 0;
+}
+
+static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file,
+                                               const char *buf, size_t len,
+                                               loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+       int i;
+       int value = 1, rc = 0;
+
+       rc = lprocfs_write_helper(buf, len, &value);
+       if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                      strcmp(buf, "Disabled") == 0))
+               value = 0;
+
+       if (value == 0)
+               sbi->ll_rw_stats_on = 0;
+       else
+               sbi->ll_rw_stats_on = 1;
+
+       spin_lock(&sbi->ll_pp_extent_lock);
+       for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+               io_extents->pp_extents[i].pid = 0;
+               lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist);
+               lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist);
+       }
+       spin_unlock(&sbi->ll_pp_extent_lock);
+       return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats_pp);
+
+static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+       do_gettimeofday(&now);
+
+       if (!sbi->ll_rw_stats_on) {
+               seq_printf(seq, "disabled\n"
+                               "write anything in this file to activate, "
+                               "then 0 or \"[D/d]isabled\" to deactivate\n");
+               return 0;
+       }
+       seq_printf(seq, "snapshot_time:  %lu.%lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+
+       seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
+       seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
+                  "extents", "calls", "%", "cum%",
+                  "calls", "%", "cum%");
+       spin_lock(&sbi->ll_lock);
+       ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX);
+       spin_unlock(&sbi->ll_lock);
+
+       return 0;
+}
+
+static ssize_t ll_rw_extents_stats_seq_write(struct file *file, const char *buf,
+                                       size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+       int i;
+       int value = 1, rc = 0;
+
+       rc = lprocfs_write_helper(buf, len, &value);
+       if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                      strcmp(buf, "Disabled") == 0))
+               value = 0;
+
+       if (value == 0)
+               sbi->ll_rw_stats_on = 0;
+       else
+               sbi->ll_rw_stats_on = 1;
+       spin_lock(&sbi->ll_pp_extent_lock);
+       for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
+               io_extents->pp_extents[i].pid = 0;
+               lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist);
+               lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist);
+       }
+       spin_unlock(&sbi->ll_pp_extent_lock);
+
+       return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats);
+
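+/*
+ * Record a single read or write of @count bytes at offset @pos: update the
+ * per-process extent size histogram and the per-process offset tracking
+ * used by the offset_stats file.
+ */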
+void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+                      struct ll_file_data *file, loff_t pos,
+                      size_t count, int rw)
+{
+       int i, cur = -1;
+       struct ll_rw_process_info *process;
+       struct ll_rw_process_info *offset;
+       int *off_count = &sbi->ll_rw_offset_entry_count;
+       int *process_count = &sbi->ll_offset_process_count;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+       if (!sbi->ll_rw_stats_on)
+               return;
+       process = sbi->ll_rw_process_info;
+       offset = sbi->ll_rw_offset_info;
+
+       spin_lock(&sbi->ll_pp_extent_lock);
+       /* Extent statistics */
+       for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+               if (io_extents->pp_extents[i].pid == pid) {
+                       cur = i;
+                       break;
+               }
+       }
+
+       if (cur == -1) {
+               /* new process */
+               sbi->ll_extent_process_count =
+                       (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX;
+               cur = sbi->ll_extent_process_count;
+               io_extents->pp_extents[cur].pid = pid;
+               lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist);
+               lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist);
+       }
+
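+       /* find the smallest histogram bucket that can hold this I/O size,
+        * capping at the last bucket */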
+       for (i = 0; (count >= (1 << LL_HIST_START << i)) &&
+            (i < (LL_HIST_MAX - 1)); i++);
+       if (rw == 0) {
+               io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++;
+               io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++;
+       } else {
+               io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++;
+               io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++;
+       }
+       spin_unlock(&sbi->ll_pp_extent_lock);
+
+       spin_lock(&sbi->ll_process_lock);
+       /* Offset statistics */
+       for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+               if (process[i].rw_pid == pid) {
+                       if (process[i].rw_last_file != file) {
+                               process[i].rw_range_start = pos;
+                               process[i].rw_last_file_pos = pos + count;
+                               process[i].rw_smallest_extent = count;
+                               process[i].rw_largest_extent = count;
+                               process[i].rw_offset = 0;
+                               process[i].rw_last_file = file;
+                               spin_unlock(&sbi->ll_process_lock);
+                               return;
+                       }
+                       if (process[i].rw_last_file_pos != pos) {
+                               *off_count =
+                                   (*off_count + 1) % LL_OFFSET_HIST_MAX;
+                               offset[*off_count].rw_op = process[i].rw_op;
+                               offset[*off_count].rw_pid = pid;
+                               offset[*off_count].rw_range_start =
+                                       process[i].rw_range_start;
+                               offset[*off_count].rw_range_end =
+                                       process[i].rw_last_file_pos;
+                               offset[*off_count].rw_smallest_extent =
+                                       process[i].rw_smallest_extent;
+                               offset[*off_count].rw_largest_extent =
+                                       process[i].rw_largest_extent;
+                               offset[*off_count].rw_offset =
+                                       process[i].rw_offset;
+                               process[i].rw_op = rw;
+                               process[i].rw_range_start = pos;
+                               process[i].rw_smallest_extent = count;
+                               process[i].rw_largest_extent = count;
+                               process[i].rw_offset = pos -
+                                       process[i].rw_last_file_pos;
+                       }
+                       if (process[i].rw_smallest_extent > count)
+                               process[i].rw_smallest_extent = count;
+                       if (process[i].rw_largest_extent < count)
+                               process[i].rw_largest_extent = count;
+                       process[i].rw_last_file_pos = pos + count;
+                       spin_unlock(&sbi->ll_process_lock);
+                       return;
+               }
+       }
+       *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX;
+       process[*process_count].rw_pid = pid;
+       process[*process_count].rw_op = rw;
+       process[*process_count].rw_range_start = pos;
+       process[*process_count].rw_last_file_pos = pos + count;
+       process[*process_count].rw_smallest_extent = count;
+       process[*process_count].rw_largest_extent = count;
+       process[*process_count].rw_offset = 0;
+       process[*process_count].rw_last_file = file;
+       spin_unlock(&sbi->ll_process_lock);
+}
+
+static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_process_info *offset = sbi->ll_rw_offset_info;
+       struct ll_rw_process_info *process = sbi->ll_rw_process_info;
+       int i;
+
+       do_gettimeofday(&now);
+
+       if (!sbi->ll_rw_stats_on) {
+               seq_printf(seq, "disabled\n"
+                               "write anything in this file to activate, "
+                               "then 0 or \"[D/d]isabled\" to deactivate\n");
+               return 0;
+       }
+       spin_lock(&sbi->ll_process_lock);
+
+       seq_printf(seq, "snapshot_time:  %lu.%lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+       seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n",
+                  "R/W", "PID", "RANGE START", "RANGE END",
+                  "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET");
+       /* We stored the discontiguous offsets here; print them first */
+       for (i = 0; i < LL_OFFSET_HIST_MAX; i++) {
+               if (offset[i].rw_pid != 0)
+                       seq_printf(seq, "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu\n",
+                                  offset[i].rw_op ? 'W' : 'R',
+                                  offset[i].rw_pid,
+                                  offset[i].rw_range_start,
+                                  offset[i].rw_range_end,
+                                  (unsigned long)offset[i].rw_smallest_extent,
+                                  (unsigned long)offset[i].rw_largest_extent,
+                                  offset[i].rw_offset);
+       }
+       /* Then print the current offsets for each process */
+       for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+               if (process[i].rw_pid != 0)
+                       seq_printf(seq, "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu\n",
+                                  process[i].rw_op ? 'W' : 'R',
+                                  process[i].rw_pid,
+                                  process[i].rw_range_start,
+                                  process[i].rw_last_file_pos,
+                                  (unsigned long)process[i].rw_smallest_extent,
+                                  (unsigned long)process[i].rw_largest_extent,
+                                  process[i].rw_offset);
+       }
+       spin_unlock(&sbi->ll_process_lock);
+
+       return 0;
+}
+
+static ssize_t ll_rw_offset_stats_seq_write(struct file *file, const char *buf,
+                                      size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_process_info *process_info = sbi->ll_rw_process_info;
+       struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info;
+       int value = 1, rc = 0;
+
+       rc = lprocfs_write_helper(buf, len, &value);
+
+       if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                          strcmp(buf, "Disabled") == 0))
+               value = 0;
+
+       if (value == 0)
+               sbi->ll_rw_stats_on = 0;
+       else
+               sbi->ll_rw_stats_on = 1;
+
+       spin_lock(&sbi->ll_process_lock);
+       sbi->ll_offset_process_count = 0;
+       sbi->ll_rw_offset_entry_count = 0;
+       memset(process_info, 0, sizeof(struct ll_rw_process_info) *
+              LL_PROCESS_HIST_MAX);
+       memset(offset_info, 0, sizeof(struct ll_rw_process_info) *
+              LL_OFFSET_HIST_MAX);
+       spin_unlock(&sbi->ll_process_lock);
+
+       return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_offset_stats);
+
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars  = NULL;
+       lvars->obd_vars     = lprocfs_llite_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c
new file mode 100644 (file)
index 0000000..e6b3f54
--- /dev/null
@@ -0,0 +1,1279 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include "llite_internal.h"
+
+static int ll_create_it(struct inode *, struct dentry *,
+                       int, struct lookup_intent *);
+
+/*
+ * Check if we have something mounted at the named dchild.
+ * In such a case there will always be a dentry present.
+ */
+static int ll_d_mountpoint(struct dentry *dparent, struct dentry *dchild,
+                          struct qstr *name)
+{
+       int mounted = 0;
+
+       if (unlikely(dchild)) {
+               mounted = d_mountpoint(dchild);
+       } else if (dparent) {
+               dchild = d_lookup(dparent, name);
+               if (dchild) {
+                       mounted = d_mountpoint(dchild);
+                       dput(dchild);
+               }
+       }
+       return mounted;
+}
+
+int ll_unlock(__u32 mode, struct lustre_handle *lockh)
+{
+       ENTRY;
+
+       ldlm_lock_decref(lockh, mode);
+
+       RETURN(0);
+}
+
+
+/* called from iget5_locked->find_inode() under inode_lock spinlock */
+static int ll_test_inode(struct inode *inode, void *opaque)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct lustre_md     *md = opaque;
+
+       if (unlikely(!(md->body->valid & OBD_MD_FLID))) {
+               CERROR("MDS body missing FID\n");
+               return 0;
+       }
+
+       if (!lu_fid_eq(&lli->lli_fid, &md->body->fid1))
+               return 0;
+
+       return 1;
+}
+
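+/*
+ * Initialize a newly allocated inode from the MDS reply: store the FID and
+ * the file type bits of i_mode, then set up the llite inode info.
+ */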
+static int ll_set_inode(struct inode *inode, void *opaque)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct mdt_body *body = ((struct lustre_md *)opaque)->body;
+
+       if (unlikely(!(body->valid & OBD_MD_FLID))) {
+               CERROR("MDS body missing FID\n");
+               return -EINVAL;
+       }
+
+       lli->lli_fid = body->fid1;
+       if (unlikely(!(body->valid & OBD_MD_FLTYPE))) {
+               CERROR("Can not initialize inode "DFID" without object type: "
+                      "valid = "LPX64"\n", PFID(&lli->lli_fid), body->valid);
+               return -EINVAL;
+       }
+
+       inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mode & S_IFMT);
+       if (unlikely(inode->i_mode == 0)) {
+               CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid));
+               return -EINVAL;
+       }
+
+       ll_lli_init(lli);
+
+       return 0;
+}
+
+
+/*
+ * Get an inode by inode number (already instantiated by the intent lookup).
+ * Returns inode or NULL
+ */
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+                     struct lustre_md *md)
+{
+       struct inode     *inode;
+       ENTRY;
+
+       LASSERT(hash != 0);
+       inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md);
+
+       if (inode) {
+               if (inode->i_state & I_NEW) {
+                       int rc = 0;
+
+                       ll_read_inode2(inode, md);
+                       if (S_ISREG(inode->i_mode) &&
+                           ll_i2info(inode)->lli_clob == NULL) {
+                               CDEBUG(D_INODE,
+                                       "%s: apply lsm %p to inode "DFID".\n",
+                                       ll_get_fsname(sb, NULL, 0), md->lsm,
+                                       PFID(ll_inode2fid(inode)));
+                               rc = cl_file_inode_init(inode, md);
+                       }
+                       if (rc != 0) {
+                               make_bad_inode(inode);
+                               unlock_new_inode(inode);
+                               iput(inode);
+                               inode = ERR_PTR(rc);
+                       } else
+                               unlock_new_inode(inode);
+               } else if (!(inode->i_state & (I_FREEING | I_CLEAR)))
+                       ll_update_inode(inode, md);
+               CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n",
+                      inode, PFID(&md->body->fid1));
+       }
+       RETURN(inode);
+}
+
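+/* Invalidate every negative (inode-less) child dentry hanging off the
+ * aliases of @dir.  Called when a directory's UPDATE lock is cancelled. */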
+static void ll_invalidate_negative_children(struct inode *dir)
+{
+       struct dentry *dentry, *tmp_subdir;
+       struct ll_d_hlist_node *p;
+
+       ll_lock_dcache(dir);
+       ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry, d_alias) {
+               spin_lock(&dentry->d_lock);
+               if (!list_empty(&dentry->d_subdirs)) {
+                       struct dentry *child;
+
+                       list_for_each_entry_safe(child, tmp_subdir,
+                                                &dentry->d_subdirs,
+                                                d_u.d_child) {
+                               if (child->d_inode == NULL)
+                                       d_lustre_invalidate(child);
+                       }
+               }
+               spin_unlock(&dentry->d_lock);
+       }
+       ll_unlock_dcache(dir);
+}
+
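+/*
+ * Blocking callback for metadata locks.  On conflict the lock is cancelled
+ * asynchronously; on cancellation the cached state it protected (open
+ * handles, layout, size lock, dentries) is invalidated according to the
+ * inodebits covered by the lock.
+ */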
+int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                      void *data, int flag)
+{
+       int rc;
+       struct lustre_handle lockh;
+       ENTRY;
+
+       switch (flag) {
+       case LDLM_CB_BLOCKING:
+               ldlm_lock2handle(lock, &lockh);
+               rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+               if (rc < 0) {
+                       CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
+                       RETURN(rc);
+               }
+               break;
+       case LDLM_CB_CANCELING: {
+               struct inode *inode = ll_inode_from_resource_lock(lock);
+               struct ll_inode_info *lli;
+               __u64 bits = lock->l_policy_data.l_inodebits.bits;
+               struct lu_fid *fid;
+               ldlm_mode_t mode = lock->l_req_mode;
+
+               /* Inode is set to lock->l_resource->lr_lvb_inode
+                * for mdc - bug 24555 */
+               LASSERT(lock->l_ast_data == NULL);
+
+               /* Invalidate all dentries associated with this inode */
+               if (inode == NULL)
+                       break;
+
+               LASSERT(lock->l_flags & LDLM_FL_CANCELING);
+               /* For OPEN locks we differentiate between lock modes
+                * LCK_CR, LCK_CW, LCK_PR - bug 22891 */
+               if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+                           MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
+                       ll_have_md_lock(inode, &bits, LCK_MINMODE);
+
+               if (bits & MDS_INODELOCK_OPEN)
+                       ll_have_md_lock(inode, &bits, mode);
+
+               fid = ll_inode2fid(inode);
+               if (lock->l_resource->lr_name.name[0] != fid_seq(fid) ||
+                   lock->l_resource->lr_name.name[1] != fid_oid(fid) ||
+                   lock->l_resource->lr_name.name[2] != fid_ver(fid)) {
+                       LDLM_ERROR(lock, "data mismatch with object "
+                                  DFID" (%p)", PFID(fid), inode);
+               }
+
+               if (bits & MDS_INODELOCK_OPEN) {
+                       int flags = 0;
+                       switch (lock->l_req_mode) {
+                       case LCK_CW:
+                               flags = FMODE_WRITE;
+                               break;
+                       case LCK_PR:
+                               flags = FMODE_EXEC;
+                               break;
+                       case LCK_CR:
+                               flags = FMODE_READ;
+                               break;
+                       default:
+                               CERROR("Unexpected lock mode for OPEN lock "
+                                      "%d, inode %ld\n", lock->l_req_mode,
+                                      inode->i_ino);
+                       }
+                       ll_md_real_close(inode, flags);
+               }
+
+               lli = ll_i2info(inode);
+               if (bits & MDS_INODELOCK_LAYOUT) {
+                       struct cl_object_conf conf = { { 0 } };
+
+                       conf.coc_opc = OBJECT_CONF_INVALIDATE;
+                       conf.coc_inode = inode;
+                       rc = ll_layout_conf(inode, &conf);
+                       if (rc)
+                               CDEBUG(D_INODE, "invalidating layout %d.\n", rc);
+               }
+
+               if (bits & MDS_INODELOCK_UPDATE)
+                       lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+
+               if (S_ISDIR(inode->i_mode) &&
+                    (bits & MDS_INODELOCK_UPDATE)) {
+                       CDEBUG(D_INODE, "invalidating inode %lu\n",
+                              inode->i_ino);
+                       truncate_inode_pages(inode->i_mapping, 0);
+                       ll_invalidate_negative_children(inode);
+               }
+
+               if (inode->i_sb->s_root &&
+                   inode != inode->i_sb->s_root->d_inode &&
+                   (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)))
+                       ll_invalidate_aliases(inode);
+               iput(inode);
+               break;
+       }
+       default:
+               LBUG();
+       }
+
+       RETURN(0);
+}
+
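+/* Return the inode's group id if the current task is a member of that
+ * group, otherwise -1 (no supplementary group). */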
+__u32 ll_i2suppgid(struct inode *i)
+{
+       if (current_is_in_group(i->i_gid))
+               return (__u32)i->i_gid;
+       else
+               return (__u32)(-1);
+}
+
+/* Pack the required supplementary groups into the supplied groups array.
+ * If we don't need to use the groups from the target inode(s) then we
+ * instead pack one or more groups from the user's supplementary group
+ * array in case it might be useful.  Not needed if doing an MDS-side upcall. */
+void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2)
+{
+#if 0
+       int i;
+#endif
+
+       LASSERT(i1 != NULL);
+       LASSERT(suppgids != NULL);
+
+       suppgids[0] = ll_i2suppgid(i1);
+
+       if (i2)
+               suppgids[1] = ll_i2suppgid(i2);
+       else
+               suppgids[1] = -1;
+
+#if 0
+       for (i = 0; i < current_ngroups; i++) {
+               if (suppgids[0] == -1) {
+                       if (current_groups[i] != suppgids[1])
+                               suppgids[0] = current_groups[i];
+                       continue;
+               }
+               if (suppgids[1] == -1) {
+                       if (current_groups[i] != suppgids[0])
+                               suppgids[1] = current_groups[i];
+                       continue;
+               }
+               break;
+       }
+#endif
+}
+
+/*
+ * try to reuse three types of dentry:
+ * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid
+ *    by concurrent .revalidate).
+ * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may
+ *    be cleared by others calling d_lustre_revalidate).
+ * 3. DISCONNECTED alias.
+ */
+static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry)
+{
+       struct dentry *alias, *discon_alias, *invalid_alias;
+       struct ll_d_hlist_node *p;
+
+       if (ll_d_hlist_empty(&inode->i_dentry))
+               return NULL;
+
+       discon_alias = invalid_alias = NULL;
+
+       ll_lock_dcache(inode);
+       ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+               LASSERT(alias != dentry);
+
+               spin_lock(&alias->d_lock);
+               if (alias->d_flags & DCACHE_DISCONNECTED)
+                       /* LASSERT(last_discon == NULL); LU-405, bz 20055 */
+                       discon_alias = alias;
+               else if (alias->d_parent == dentry->d_parent         &&
+                        alias->d_name.hash == dentry->d_name.hash       &&
+                        alias->d_name.len == dentry->d_name.len         &&
+                        memcmp(alias->d_name.name, dentry->d_name.name,
+                               dentry->d_name.len) == 0)
+                       invalid_alias = alias;
+               spin_unlock(&alias->d_lock);
+
+               if (invalid_alias)
+                       break;
+       }
+       alias = invalid_alias ?: discon_alias ?: NULL;
+       if (alias) {
+               spin_lock(&alias->d_lock);
+               dget_dlock(alias);
+               spin_unlock(&alias->d_lock);
+       }
+       ll_unlock_dcache(inode);
+
+       return alias;
+}
+
+/*
+ * Similar to d_splice_alias(), but lustre treats invalid alias
+ * similar to DCACHE_DISCONNECTED, and tries to use it anyway.
+ */
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de)
+{
+       struct dentry *new;
+
+       if (inode) {
+               new = ll_find_alias(inode, de);
+               if (new) {
+                       ll_dops_init(new, 1, 1);
+                       d_move(new, de);
+                       iput(inode);
+                       CDEBUG(D_DENTRY,
+                              "Reuse dentry %p inode %p refc %d flags %#x\n",
+                             new, new->d_inode, d_refcount(new), new->d_flags);
+                       return new;
+               }
+       }
+       ll_dops_init(de, 1, 1);
+       __d_lustre_invalidate(de);
+       d_add(de, inode);
+       CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n",
+              de, de->d_inode, d_refcount(de), de->d_flags);
+       return de;
+}
+
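+/* Intent lookup callback: build the inode from the MDS reply (unless the
+ * lookup was negative), splice the dentry into the dcache, and revalidate it
+ * when the matching LOOKUP lock (or, for a negative entry, a lock on the
+ * parent) is granted. */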
+int ll_lookup_it_finish(struct ptlrpc_request *request,
+                       struct lookup_intent *it, void *data)
+{
+       struct it_cb_data *icbd = data;
+       struct dentry **de = icbd->icbd_childp;
+       struct inode *parent = icbd->icbd_parent;
+       struct inode *inode = NULL;
+       __u64 bits = 0;
+       int rc;
+       ENTRY;
+
+       /* NB 1 request reference will be taken away by ll_intent_lock()
+        * when I return */
+       CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it,
+              it->d.lustre.it_disposition);
+       if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+               rc = ll_prep_inode(&inode, request, (*de)->d_sb, it);
+               if (rc)
+                       RETURN(rc);
+
+               ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits);
+
+               /* We used to query real size from OSTs here, but actually
+                  this is not needed. For stat() calls size would be updated
+                  from subsequent do_revalidate()->ll_inode_revalidate_it() in
+                  2.4 and
+                  vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6
+                  Everybody else who needs correct file size would call
+                  ll_glimpse_size or some equivalent themselves anyway.
+                  Also see bug 7198. */
+       }
+
+       /* Only hash *de if it is unhashed (new dentry).
+        * Atomic_open may pass in hashed dentries for open.
+        */
+       if (d_unhashed(*de))
+               *de = ll_splice_alias(inode, *de);
+
+       if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+               /* we have lookup lock - unhide dentry */
+               if (bits & MDS_INODELOCK_LOOKUP)
+                       d_lustre_revalidate(*de);
+       } else if (!it_disposition(it, DISP_OPEN_CREATE)) {
+               /* If file created on server, don't depend on parent UPDATE
+                * lock to unhide it. It is left hidden and next lookup can
+                * find it in ll_splice_alias.
+                */
+               /* Check that parent has UPDATE lock. */
+               struct lookup_intent parent_it = {
+                                       .it_op = IT_GETATTR,
+                                       .d.lustre.it_lock_handle = 0 };
+
+               if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it,
+                                      &ll_i2info(parent)->lli_fid, NULL)) {
+                       d_lustre_revalidate(*de);
+                       ll_intent_release(&parent_it);
+               }
+       }
+
+       RETURN(0);
+}
+
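+/* Perform an intent-based lookup of @dentry in @parent: pack the intent into
+ * an md_intent_lock() RPC, then let ll_lookup_it_finish() build and splice
+ * the resulting inode/dentry.  Returns the dentry to use, or NULL when the
+ * passed-in dentry was used as-is. */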
+static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
+                                  struct lookup_intent *it, int lookup_flags)
+{
+       struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+       struct dentry *save = dentry, *retval;
+       struct ptlrpc_request *req = NULL;
+       struct md_op_data *op_data;
+       struct it_cb_data icbd;
+       __u32 opc;
+       int rc;
+       ENTRY;
+
+       if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen)
+               RETURN(ERR_PTR(-ENAMETOOLONG));
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
+              dentry->d_name.len, dentry->d_name.name, parent->i_ino,
+              parent->i_generation, parent, LL_IT2STR(it));
+
+       if (d_mountpoint(dentry))
+               CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
+
+       ll_frob_intent(&it, &lookup_it);
+
+       /* As do_lookup is called before follow_mount, the root dentry may be
+        * left invalid; revalidate it here. */
+       if (parent->i_sb->s_root && (parent->i_sb->s_root->d_inode == parent) &&
+           (it->it_op & (IT_OPEN | IT_CREAT))) {
+               rc = ll_inode_revalidate_it(parent->i_sb->s_root, it,
+                                           MDS_INODELOCK_LOOKUP);
+               if (rc)
+                       RETURN(ERR_PTR(rc));
+       }
+
+       if (it->it_op == IT_GETATTR) {
+               rc = ll_statahead_enter(parent, &dentry, 0);
+               if (rc == 1) {
+                       if (dentry == save)
+                               GOTO(out, retval = NULL);
+                       GOTO(out, retval = dentry);
+               }
+       }
+
+       icbd.icbd_childp = &dentry;
+       icbd.icbd_parent = parent;
+
+       if (it->it_op & IT_CREAT ||
+           (it->it_op & IT_OPEN && it->it_create_mode & O_CREAT))
+               opc = LUSTRE_OPC_CREATE;
+       else
+               opc = LUSTRE_OPC_ANY;
+
+       op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name,
+                                    dentry->d_name.len, lookup_flags, opc,
+                                    NULL);
+       if (IS_ERR(op_data))
+               RETURN((void *)op_data);
+
+       /* enforce umask if acl disabled or MDS doesn't support umask */
+       if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
+               it->it_create_mode &= ~current_umask();
+
+       rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it,
+                           lookup_flags, &req, ll_md_blocking_ast, 0);
+       ll_finish_md_op_data(op_data);
+       if (rc < 0)
+               GOTO(out, retval = ERR_PTR(rc));
+
+       rc = ll_lookup_it_finish(req, it, &icbd);
+       if (rc != 0) {
+               ll_intent_release(it);
+               GOTO(out, retval = ERR_PTR(rc));
+       }
+
+       if ((it->it_op & IT_OPEN) && dentry->d_inode &&
+           !S_ISREG(dentry->d_inode->i_mode) &&
+           !S_ISDIR(dentry->d_inode->i_mode)) {
+               ll_release_openhandle(dentry, it);
+       }
+       ll_lookup_finish_locks(it, dentry);
+
+       if (dentry == save)
+               GOTO(out, retval = NULL);
+       else
+               GOTO(out, retval = dentry);
+ out:
+       if (req)
+               ptlrpc_req_finished(req);
+       if (it->it_op == IT_GETATTR && (retval == NULL || retval == dentry))
+               ll_statahead_mark(parent, dentry);
+       return retval;
+}
+
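+/* ->lookup() entry point: plain lookups are given an IT_GETATTR intent; no
+ * intent is packed for LOOKUP_PARENT/LOOKUP_OPEN/LOOKUP_CREATE lookups, and
+ * (CREATE && !OPEN) is short-circuited with a negative dentry, leaving the
+ * real work to ->create/->atomic_open. */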
+static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
+                                  unsigned int flags)
+{
+       struct lookup_intent *itp, it = { .it_op = IT_GETATTR };
+       struct dentry *de;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),flags=%u\n",
+              dentry->d_name.len, dentry->d_name.name, parent->i_ino,
+              parent->i_generation, parent, flags);
+
+       /* Optimize away (CREATE && !OPEN). Let .create handle the race. */
+       if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN)) {
+               ll_dops_init(dentry, 1, 1);
+               __d_lustre_invalidate(dentry);
+               d_add(dentry, NULL);
+               return NULL;
+       }
+
+       if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE))
+               itp = NULL;
+       else
+               itp = &it;
+       de = ll_lookup_it(parent, dentry, itp, 0);
+
+       if (itp != NULL)
+               ll_intent_release(itp);
+
+       return de;
+}
+
+/*
+ * For cached negative dentry and new dentry, handle lookup/create/open
+ * together.
+ */
+static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
+                         struct file *file, unsigned open_flags,
+                         umode_t mode, int *opened)
+{
+       struct lookup_intent *it;
+       struct dentry *de;
+       long long lookup_flags = LOOKUP_OPEN;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),file %p,"
+                          "open_flags %x,mode %x opened %d\n",
+              dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+              dir->i_generation, dir, file, open_flags, mode, *opened);
+
+       OBD_ALLOC(it, sizeof(*it));
+       if (!it)
+               RETURN(-ENOMEM);
+
+       it->it_op = IT_OPEN;
+       if (mode) {
+               it->it_op |= IT_CREAT;
+               lookup_flags |= LOOKUP_CREATE;
+       }
+       it->it_create_mode = (mode & S_IALLUGO) | S_IFREG;
+       it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags);
+
+       /* Dentry added to dcache tree in ll_lookup_it */
+       de = ll_lookup_it(dir, dentry, it, lookup_flags);
+       if (IS_ERR(de))
+               rc = PTR_ERR(de);
+       else if (de != NULL)
+               dentry = de;
+
+       if (!rc) {
+               if (it_disposition(it, DISP_OPEN_CREATE)) {
+                       /* Dentry instantiated in ll_create_it. */
+                       rc = ll_create_it(dir, dentry, mode, it);
+                       if (rc) {
+                               /* We dget in ll_splice_alias. */
+                               if (de != NULL)
+                                       dput(de);
+                               goto out_release;
+                       }
+
+                       *opened |= FILE_CREATED;
+               }
+               if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) {
+                       /* Open dentry. */
+                       if (S_ISFIFO(dentry->d_inode->i_mode)) {
+                               /* We cannot call open here as it would
+                                * deadlock.
+                                */
+                               if (it_disposition(it, DISP_ENQ_OPEN_REF))
+                                       ptlrpc_req_finished(
+                                                      (struct ptlrpc_request *)
+                                                         it->d.lustre.it_data);
+                               rc = finish_no_open(file, de);
+                       } else {
+                               file->private_data = it;
+                               rc = finish_open(file, dentry, NULL, opened);
+                               /* We dget in ll_splice_alias. finish_open takes
+                                * care of dget for fd open.
+                                */
+                               if (de != NULL)
+                                       dput(de);
+                       }
+               } else {
+                       rc = finish_no_open(file, de);
+               }
+       }
+
+out_release:
+       ll_intent_release(it);
+       OBD_FREE(it, sizeof(*it));
+
+       RETURN(rc);
+}
+
+
+/* We depend on "mode" being set with the proper file type/umask by now */
+static struct inode *ll_create_node(struct inode *dir, const char *name,
+                                   int namelen, const void *data, int datalen,
+                                   int mode, __u64 extra,
+                                   struct lookup_intent *it)
+{
+       struct inode *inode = NULL;
+       struct ptlrpc_request *request = NULL;
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       int rc;
+       ENTRY;
+
+       LASSERT(it && it->d.lustre.it_disposition);
+
+       LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF));
+       request = it->d.lustre.it_data;
+       it_clear_disposition(it, DISP_ENQ_CREATE_REF);
+       rc = ll_prep_inode(&inode, request, dir->i_sb, it);
+       if (rc)
+               GOTO(out, inode = ERR_PTR(rc));
+
+       LASSERT(ll_d_hlist_empty(&inode->i_dentry));
+
+       /* We asked for a lock on the directory, but were granted a
+        * lock on the inode.  Since we finally have an inode pointer,
+        * stuff it in the lock. */
+       CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n",
+              inode, inode->i_ino, inode->i_generation);
+       ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+       EXIT;
+ out:
+       ptlrpc_req_finished(request);
+       return inode;
+}
+
+/*
+ * By the time this is called, we already have created the directory cache
+ * entry for the new file, but it is so far negative - it has no inode.
+ *
+ * We defer creating the OBD object(s) until open, to keep the intent and
+ * non-intent code paths similar, and also because we do not have the MDS
+ * inode number before calling ll_create_node() (which is needed for LOV),
+ * so we would need to do yet another RPC to the MDS to store the LOV EA
+ * data on the MDS.  If needed, we would pass the PACKED lmm as data and
+ * lmm_size in datalen (the MDS still has code which will handle that).
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode,
+                       struct lookup_intent *it)
+{
+       struct inode *inode;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
+              dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+              dir->i_generation, dir, LL_IT2STR(it));
+
+       rc = it_open_error(DISP_OPEN_CREATE, it);
+       if (rc)
+               RETURN(rc);
+
+       inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len,
+                              NULL, 0, mode, 0, it);
+       if (IS_ERR(inode))
+               RETURN(PTR_ERR(inode));
+
+       if (filename_is_volatile(dentry->d_name.name, dentry->d_name.len, NULL))
+               ll_i2info(inode)->lli_volatile = true;
+
+       d_instantiate(dentry, inode);
+       RETURN(0);
+}
+
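+/* Propagate mtime/ctime from the MDS reply body to the local inode, only
+ * ever moving the timestamps forward. */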
+static void ll_update_times(struct ptlrpc_request *request,
+                           struct inode *inode)
+{
+       struct mdt_body *body = req_capsule_server_get(&request->rq_pill,
+                                                      &RMF_MDT_BODY);
+
+       LASSERT(body);
+       if (body->valid & OBD_MD_FLMTIME &&
+           body->mtime > LTIME_S(inode->i_mtime)) {
+               CDEBUG(D_INODE, "setting ino %lu mtime from %lu to "LPU64"\n",
+                      inode->i_ino, LTIME_S(inode->i_mtime), body->mtime);
+               LTIME_S(inode->i_mtime) = body->mtime;
+       }
+       if (body->valid & OBD_MD_FLCTIME &&
+           body->ctime > LTIME_S(inode->i_ctime))
+               LTIME_S(inode->i_ctime) = body->ctime;
+}
+
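+/* Common helper for mknod/mkdir/symlink: send an md_create() RPC to the MDS
+ * and, when @dchild is supplied, instantiate it with the new inode from the
+ * reply. */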
+static int ll_new_node(struct inode *dir, struct qstr *name,
+                      const char *tgt, int mode, int rdev,
+                      struct dentry *dchild, __u32 opc)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       struct inode *inode = NULL;
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       int tgt_len = 0;
+       int err;
+
+       ENTRY;
+       if (unlikely(tgt != NULL))
+               tgt_len = strlen(tgt) + 1;
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name,
+                                    name->len, 0, opc, NULL);
+       if (IS_ERR(op_data))
+               GOTO(err_exit, err = PTR_ERR(op_data));
+
+       err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode,
+                       current_fsuid(), current_fsgid(),
+                       cfs_curproc_cap_pack(), rdev, &request);
+       ll_finish_md_op_data(op_data);
+       if (err)
+               GOTO(err_exit, err);
+
+       ll_update_times(request, dir);
+
+       if (dchild) {
+               err = ll_prep_inode(&inode, request, dchild->d_sb, NULL);
+               if (err)
+                    GOTO(err_exit, err);
+
+               d_instantiate(dchild, inode);
+       }
+       EXIT;
+err_exit:
+       ptlrpc_req_finished(request);
+
+       return err;
+}
+
+static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode,
+                           unsigned rdev, struct dentry *dchild)
+{
+       int err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p) mode %o dev %x\n",
+              name->len, name->name, dir->i_ino, dir->i_generation, dir,
+              mode, rdev);
+
+       if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+               mode &= ~current_umask();
+
+       switch (mode & S_IFMT) {
+       case 0:
+               mode |= S_IFREG; /* for mode = 0 case, fallthrough */
+       case S_IFREG:
+       case S_IFCHR:
+       case S_IFBLK:
+       case S_IFIFO:
+       case S_IFSOCK:
+               err = ll_new_node(dir, name, NULL, mode, rdev, dchild,
+                                 LUSTRE_OPC_MKNOD);
+               break;
+       case S_IFDIR:
+               err = -EPERM;
+               break;
+       default:
+               err = -EINVAL;
+       }
+
+       if (!err)
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1);
+
+       RETURN(err);
+}
+
+/*
+ * Plain create. Intent create is handled in atomic_open.
+ */
+static int ll_create_nd(struct inode *dir, struct dentry *dentry,
+                       umode_t mode, bool want_excl)
+{
+       int rc;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),"
+                          "flags=%u, excl=%d\n",
+              dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+              dir->i_generation, dir, mode, want_excl);
+
+       rc = ll_mknod_generic(dir, &dentry->d_name, mode, 0, dentry);
+
+       ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, unhashed %d\n",
+              dentry->d_name.len, dentry->d_name.name, d_unhashed(dentry));
+
+       return rc;
+}
+
+static int ll_symlink_generic(struct inode *dir, struct qstr *name,
+                             const char *tgt, struct dentry *dchild)
+{
+       int err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),target=%.*s\n",
+              name->len, name->name, dir->i_ino, dir->i_generation,
+              dir, 3000, tgt);
+
+       err = ll_new_node(dir, name, (char *)tgt, S_IFLNK | S_IRWXUGO,
+                         0, dchild, LUSTRE_OPC_SYMLINK);
+
+       if (!err)
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1);
+
+       RETURN(err);
+}
+
+static int ll_link_generic(struct inode *src,  struct inode *dir,
+                          struct qstr *name, struct dentry *dchild)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       int err;
+
+       ENTRY;
+       CDEBUG(D_VFSTRACE,
+              "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%.*s\n",
+              src->i_ino, src->i_generation, src, dir->i_ino,
+              dir->i_generation, dir, name->len, name->name);
+
+       op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len,
+                                    0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       err = md_link(sbi->ll_md_exp, op_data, &request);
+       ll_finish_md_op_data(op_data);
+       if (err)
+               GOTO(out, err);
+
+       ll_update_times(request, dir);
+       ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1);
+       EXIT;
+out:
+       ptlrpc_req_finished(request);
+       RETURN(err);
+}
+
+static int ll_mkdir_generic(struct inode *dir, struct qstr *name,
+                           int mode, struct dentry *dchild)
+
+{
+       int err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+              name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+       if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+               mode &= ~current_umask();
+       mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR;
+       err = ll_new_node(dir, name, NULL, mode, 0, dchild, LUSTRE_OPC_MKDIR);
+
+       if (!err)
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1);
+
+       RETURN(err);
+}
+
+/* Try to find the child dentry by its name.
+   If found, put the result fid into @fid. */
+static void ll_get_child_fid(struct inode * dir, struct qstr *name,
+                            struct lu_fid *fid)
+{
+       struct dentry *parent, *child;
+
+       parent = ll_d_hlist_entry(dir->i_dentry, struct dentry, d_alias);
+       child = d_lookup(parent, name);
+       if (child) {
+               if (child->d_inode)
+                       *fid = *ll_inode2fid(child->d_inode);
+               dput(child);
+       }
+}
+
+static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent,
+                           struct dentry *dchild, struct qstr *name)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+              name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+       if (unlikely(ll_d_mountpoint(dparent, dchild, name)))
+               RETURN(-EBUSY);
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len,
+                                    S_IFDIR, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       ll_get_child_fid(dir, name, &op_data->op_fid3);
+       op_data->op_fid2 = op_data->op_fid3;
+       rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+       ll_finish_md_op_data(op_data);
+       if (rc == 0) {
+               ll_update_times(request, dir);
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+       }
+
+       ptlrpc_req_finished(request);
+       RETURN(rc);
+}
+
+/**
+ * Remove dir entry
+ */
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+              namelen, name, dir->i_ino, dir->i_generation, dir);
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name),
+                                    S_IFDIR, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+       op_data->op_cli_flags |= CLI_RM_ENTRY;
+       rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+       ll_finish_md_op_data(op_data);
+       if (rc == 0) {
+               ll_update_times(request, dir);
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+       }
+
+       ptlrpc_req_finished(request);
+       RETURN(rc);
+}
+
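+/* If the MDS reply carries the file's LOV EA (returned when the last link to
+ * the file was removed), unpack it and destroy the corresponding objects on
+ * the OSTs. */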
+int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
+{
+       struct mdt_body *body;
+       struct lov_mds_md *eadata;
+       struct lov_stripe_md *lsm = NULL;
+       struct obd_trans_info oti = { 0 };
+       struct obdo *oa;
+       struct obd_capa *oc = NULL;
+       int rc;
+       ENTRY;
+
+       /* req is swabbed so this is safe */
+       body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+       if (!(body->valid & OBD_MD_FLEASIZE))
+               RETURN(0);
+
+       if (body->eadatasize == 0) {
+               CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
+               GOTO(out, rc = -EPROTO);
+       }
+
+       /* The MDS sent back the EA because we unlinked the last reference
+        * to this file. Use this EA to unlink the objects on the OST.
+        * It's opaque so we don't swab here; we leave it to obd_unpackmd() to
+        * check it is complete and sensible. */
+       eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD,
+                                             body->eadatasize);
+       LASSERT(eadata != NULL);
+
+       rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize);
+       if (rc < 0) {
+               CERROR("obd_unpackmd: %d\n", rc);
+               GOTO(out, rc);
+       }
+       LASSERT(rc >= sizeof(*lsm));
+
+       OBDO_ALLOC(oa);
+       if (oa == NULL)
+               GOTO(out_free_memmd, rc = -ENOMEM);
+
+       oa->o_oi = lsm->lsm_oi;
+       oa->o_mode = body->mode & S_IFMT;
+       oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP;
+
+       if (body->valid & OBD_MD_FLCOOKIE) {
+               oa->o_valid |= OBD_MD_FLCOOKIE;
+               oti.oti_logcookies =
+                       req_capsule_server_sized_get(&request->rq_pill,
+                                                    &RMF_LOGCOOKIES,
+                                                  sizeof(struct llog_cookie) *
+                                                    lsm->lsm_stripe_count);
+               if (oti.oti_logcookies == NULL) {
+                       oa->o_valid &= ~OBD_MD_FLCOOKIE;
+                       body->valid &= ~OBD_MD_FLCOOKIE;
+               }
+       }
+
+       if (body->valid & OBD_MD_FLOSSCAPA) {
+               rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc);
+               if (rc)
+                       GOTO(out_free_memmd, rc);
+       }
+
+       rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti,
+                        ll_i2mdexp(dir), oc);
+       capa_put(oc);
+       if (rc)
+               CERROR("obd destroy objid "DOSTID" error %d\n",
+                      POSTID(&lsm->lsm_oi), rc);
+out_free_memmd:
+       obd_free_memmd(ll_i2dtexp(dir), &lsm);
+       OBDO_FREE(oa);
+out:
+       return rc;
+}
+
+/* ll_unlink_generic() doesn't update the inode with the new link count.
+ * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there
+ * is any lock existing. They will recycle dentries and inodes based upon locks
+ * too. b=20433 */
+static int ll_unlink_generic(struct inode *dir, struct dentry *dparent,
+                            struct dentry *dchild, struct qstr *name)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       int rc;
+       ENTRY;
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+              name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+       /*
+        * XXX: unlinking a bind mountpoint may end up calling here;
+        * just check for it as vfs_unlink() does.
+        */
+       if (unlikely(ll_d_mountpoint(dparent, dchild, name)))
+               RETURN(-EBUSY);
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name,
+                                    name->len, 0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       ll_get_child_fid(dir, name, &op_data->op_fid3);
+       op_data->op_fid2 = op_data->op_fid3;
+       rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+       ll_finish_md_op_data(op_data);
+       if (rc)
+               GOTO(out, rc);
+
+       ll_update_times(request, dir);
+       ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+
+       rc = ll_objects_destroy(request, dir);
+ out:
+       ptlrpc_req_finished(request);
+       RETURN(rc);
+}
+
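+/* Rename through a single md_rename() RPC; on success update the timestamps
+ * of both directories and, if the reply carries a LOV EA (an existing target
+ * was unlinked by the rename), destroy its OST objects. */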
+static int ll_rename_generic(struct inode *src, struct dentry *src_dparent,
+                            struct dentry *src_dchild, struct qstr *src_name,
+                            struct inode *tgt, struct dentry *tgt_dparent,
+                            struct dentry *tgt_dchild, struct qstr *tgt_name)
+{
+       struct ptlrpc_request *request = NULL;
+       struct ll_sb_info *sbi = ll_i2sbi(src);
+       struct md_op_data *op_data;
+       int err;
+       ENTRY;
+       CDEBUG(D_VFSTRACE,"VFS Op:oldname=%.*s,src_dir=%lu/%u(%p),newname=%.*s,"
+              "tgt_dir=%lu/%u(%p)\n", src_name->len, src_name->name,
+              src->i_ino, src->i_generation, src, tgt_name->len,
+              tgt_name->name, tgt->i_ino, tgt->i_generation, tgt);
+
+       if (unlikely(ll_d_mountpoint(src_dparent, src_dchild, src_name) ||
+           ll_d_mountpoint(tgt_dparent, tgt_dchild, tgt_name)))
+               RETURN(-EBUSY);
+
+       op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       ll_get_child_fid(src, src_name, &op_data->op_fid3);
+       ll_get_child_fid(tgt, tgt_name, &op_data->op_fid4);
+       err = md_rename(sbi->ll_md_exp, op_data,
+                       src_name->name, src_name->len,
+                       tgt_name->name, tgt_name->len, &request);
+       ll_finish_md_op_data(op_data);
+       if (!err) {
+               ll_update_times(request, src);
+               ll_update_times(request, tgt);
+               ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1);
+               err = ll_objects_destroy(request, src);
+       }
+
+       ptlrpc_req_finished(request);
+
+       RETURN(err);
+}
+
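+/* Thin wrappers adapting the VFS inode_operations prototypes to the
+ * *_generic() helpers above. */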
+static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode,
+                   dev_t rdev)
+{
+       return ll_mknod_generic(dir, &dchild->d_name, mode,
+                               old_encode_dev(rdev), dchild);
+}
+
+static int ll_unlink(struct inode * dir, struct dentry *dentry)
+{
+       return ll_unlink_generic(dir, NULL, dentry, &dentry->d_name);
+}
+
+static int ll_mkdir(struct inode *dir, struct dentry *dentry, ll_umode_t mode)
+{
+       return ll_mkdir_generic(dir, &dentry->d_name, mode, dentry);
+}
+
+static int ll_rmdir(struct inode *dir, struct dentry *dentry)
+{
+       return ll_rmdir_generic(dir, NULL, dentry, &dentry->d_name);
+}
+
+static int ll_symlink(struct inode *dir, struct dentry *dentry,
+                     const char *oldname)
+{
+       return ll_symlink_generic(dir, &dentry->d_name, oldname, dentry);
+}
+
+static int ll_link(struct dentry *old_dentry, struct inode *dir,
+                  struct dentry *new_dentry)
+{
+       return ll_link_generic(old_dentry->d_inode, dir, &new_dentry->d_name,
+                              new_dentry);
+}
+
+static int ll_rename(struct inode *old_dir, struct dentry *old_dentry,
+                    struct inode *new_dir, struct dentry *new_dentry)
+{
+       int err;
+       err = ll_rename_generic(old_dir, NULL,
+                                old_dentry, &old_dentry->d_name,
+                                new_dir, NULL, new_dentry,
+                                &new_dentry->d_name);
+       if (!err)
+               d_move(old_dentry, new_dentry);
+       return err;
+}
+
+struct inode_operations ll_dir_inode_operations = {
+       .mknod          = ll_mknod,
+       .atomic_open    = ll_atomic_open,
+       .lookup         = ll_lookup_nd,
+       .create         = ll_create_nd,
+       /* We need all these non-raw things for NFSD, to not patch it. */
+       .unlink         = ll_unlink,
+       .mkdir          = ll_mkdir,
+       .rmdir          = ll_rmdir,
+       .symlink        = ll_symlink,
+       .link           = ll_link,
+       .rename         = ll_rename,
+       .setattr        = ll_setattr,
+       .getattr        = ll_getattr,
+       .permission     = ll_inode_permission,
+       .setxattr       = ll_setxattr,
+       .getxattr       = ll_getxattr,
+       .listxattr      = ll_listxattr,
+       .removexattr    = ll_removexattr,
+       .get_acl        = ll_get_acl,
+};
+
+struct inode_operations ll_special_inode_operations = {
+       .setattr        = ll_setattr,
+       .getattr        = ll_getattr,
+       .permission     = ll_inode_permission,
+       .setxattr       = ll_setxattr,
+       .getxattr       = ll_getxattr,
+       .listxattr      = ll_listxattr,
+       .removexattr    = ll_removexattr,
+       .get_acl        = ll_get_acl,
+};
diff --git a/drivers/staging/lustre/lustre/llite/remote_perm.c b/drivers/staging/lustre/lustre/llite/remote_perm.c
new file mode 100644 (file)
index 0000000..68b2dc4
--- /dev/null
@@ -0,0 +1,333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/remote_perm.c
+ *
+ * Lustre Permission Cache for Remote Client
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include "llite_internal.h"
+
+struct kmem_cache *ll_remote_perm_cachep = NULL;
+struct kmem_cache *ll_rmtperm_hash_cachep = NULL;
+
+static inline struct ll_remote_perm *alloc_ll_remote_perm(void)
+{
+       struct ll_remote_perm *lrp;
+
+       OBD_SLAB_ALLOC_PTR_GFP(lrp, ll_remote_perm_cachep, GFP_KERNEL);
+       if (lrp)
+               INIT_HLIST_NODE(&lrp->lrp_list);
+       return lrp;
+}
+
+static inline void free_ll_remote_perm(struct ll_remote_perm *lrp)
+{
+       if (!lrp)
+               return;
+
+       if (!hlist_unhashed(&lrp->lrp_list))
+               hlist_del(&lrp->lrp_list);
+       OBD_SLAB_FREE(lrp, ll_remote_perm_cachep, sizeof(*lrp));
+}
+
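+/* Allocate and initialize the fixed-size hash table used to cache remote
+ * permission entries for an inode. */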
+struct hlist_head *alloc_rmtperm_hash(void)
+{
+       struct hlist_head *hash;
+       int i;
+
+       OBD_SLAB_ALLOC_GFP(hash, ll_rmtperm_hash_cachep,
+                          REMOTE_PERM_HASHSIZE * sizeof(*hash),
+                          GFP_IOFS);
+       if (!hash)
+               return NULL;
+
+       for (i = 0; i < REMOTE_PERM_HASHSIZE; i++)
+               INIT_HLIST_HEAD(hash + i);
+
+       return hash;
+}
+
+void free_rmtperm_hash(struct hlist_head *hash)
+{
+       int i;
+       struct ll_remote_perm *lrp;
+       struct hlist_node *next;
+
+       if (!hash)
+               return;
+
+       for (i = 0; i < REMOTE_PERM_HASHSIZE; i++)
+               hlist_for_each_entry_safe(lrp, next, hash + i,
+                                             lrp_list)
+                       free_ll_remote_perm(lrp);
+       OBD_SLAB_FREE(hash, ll_rmtperm_hash_cachep,
+                     REMOTE_PERM_HASHSIZE * sizeof(*hash));
+}
+
+static inline int remote_perm_hashfunc(uid_t uid)
+{
+       return uid & (REMOTE_PERM_HASHSIZE - 1);
+}
+
+/* NB: setxid permission is not checked here; instead it is done on the MDT
+ * when the client fetches the remote permission. */
+static int do_check_remote_perm(struct ll_inode_info *lli, int mask)
+{
+       struct hlist_head *head;
+       struct ll_remote_perm *lrp;
+       int found = 0, rc;
+       ENTRY;
+
+       if (!lli->lli_remote_perms)
+               RETURN(-ENOENT);
+
+       head = lli->lli_remote_perms + remote_perm_hashfunc(current_uid());
+
+       spin_lock(&lli->lli_lock);
+       hlist_for_each_entry(lrp, head, lrp_list) {
+               if (lrp->lrp_uid != current_uid())
+                       continue;
+               if (lrp->lrp_gid != current_gid())
+                       continue;
+               if (lrp->lrp_fsuid != current_fsuid())
+                       continue;
+               if (lrp->lrp_fsgid != current_fsgid())
+                       continue;
+               found = 1;
+               break;
+       }
+
+       if (!found)
+               GOTO(out, rc = -ENOENT);
+
+       CDEBUG(D_SEC, "found remote perm: %u/%u/%u/%u - %#x\n",
+              lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+              lrp->lrp_access_perm);
+       rc = ((lrp->lrp_access_perm & mask) == mask) ? 0 : -EACCES;
+
+out:
+       spin_unlock(&lli->lli_lock);
+       return rc;
+}
+
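+/* Insert or update the cached remote permission entry matching the
+ * uid/gid/fsuid/fsgid in @perm; the hash table is allocated lazily on the
+ * first update. */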
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_remote_perm *lrp = NULL, *tmp = NULL;
+       struct hlist_head *head, *perm_hash = NULL;
+       ENTRY;
+
+       LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT);
+
+#if 0
+       if (perm->rp_uid != current->uid ||
+           perm->rp_gid != current->gid ||
+           perm->rp_fsuid != current->fsuid ||
+           perm->rp_fsgid != current->fsgid) {
+               /* user might setxid in this small period */
+               CDEBUG(D_SEC,
+                      "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n",
+                      perm->rp_uid, perm->rp_gid, perm->rp_fsuid,
+                      perm->rp_fsgid, current->uid, current->gid,
+                      current->fsuid, current->fsgid);
+               RETURN(-EAGAIN);
+       }
+#endif
+
+       if (!lli->lli_remote_perms) {
+               perm_hash = alloc_rmtperm_hash();
+               if (perm_hash == NULL) {
+                       CERROR("alloc lli_remote_perms failed!\n");
+                       RETURN(-ENOMEM);
+               }
+       }
+
+       spin_lock(&lli->lli_lock);
+
+       if (!lli->lli_remote_perms)
+               lli->lli_remote_perms = perm_hash;
+       else if (perm_hash)
+               free_rmtperm_hash(perm_hash);
+
+       head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid);
+
+again:
+       hlist_for_each_entry(tmp, head, lrp_list) {
+               if (tmp->lrp_uid != perm->rp_uid)
+                       continue;
+               if (tmp->lrp_gid != perm->rp_gid)
+                       continue;
+               if (tmp->lrp_fsuid != perm->rp_fsuid)
+                       continue;
+               if (tmp->lrp_fsgid != perm->rp_fsgid)
+                       continue;
+               if (lrp)
+                       free_ll_remote_perm(lrp);
+               lrp = tmp;
+               break;
+       }
+
+       if (!lrp) {
+               spin_unlock(&lli->lli_lock);
+               lrp = alloc_ll_remote_perm();
+               if (!lrp) {
+                       CERROR("alloc memory for ll_remote_perm failed!\n");
+                       RETURN(-ENOMEM);
+               }
+               spin_lock(&lli->lli_lock);
+               goto again;
+       }
+
+       lrp->lrp_access_perm = perm->rp_access_perm;
+       if (lrp != tmp) {
+               lrp->lrp_uid     = perm->rp_uid;
+               lrp->lrp_gid     = perm->rp_gid;
+               lrp->lrp_fsuid       = perm->rp_fsuid;
+               lrp->lrp_fsgid       = perm->rp_fsgid;
+               hlist_add_head(&lrp->lrp_list, head);
+       }
+       lli->lli_rmtperm_time = cfs_time_current();
+       spin_unlock(&lli->lli_lock);
+
+       CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n",
+              lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+              lrp->lrp_access_perm);
+
+       RETURN(0);
+}
+
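+/* Check @mask against the cached remote permissions; on a cache miss fetch
+ * the permission from the MDT, update the cache and retry (bounded to avoid
+ * looping forever). */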
+int lustre_check_remote_perm(struct inode *inode, int mask)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *req = NULL;
+       struct mdt_remote_perm *perm;
+       struct obd_capa *oc;
+       cfs_time_t save;
+       int i = 0, rc;
+       ENTRY;
+
+       do {
+               save = lli->lli_rmtperm_time;
+               rc = do_check_remote_perm(lli, mask);
+               if (!rc || (rc != -ENOENT && i))
+                       break;
+
+               might_sleep();
+
+               mutex_lock(&lli->lli_rmtperm_mutex);
+               /* check again */
+               if (save != lli->lli_rmtperm_time) {
+                       rc = do_check_remote_perm(lli, mask);
+                       if (!rc || (rc != -ENOENT && i)) {
+                               mutex_unlock(&lli->lli_rmtperm_mutex);
+                               break;
+                       }
+               }
+
+               if (i++ > 5) {
+                       CERROR("check remote perm falls in dead loop!\n");
+                       LBUG();
+               }
+
+               oc = ll_mdscapa_get(inode);
+               rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+                                       ll_i2suppgid(inode), &req);
+               capa_put(oc);
+               if (rc) {
+                       mutex_unlock(&lli->lli_rmtperm_mutex);
+                       break;
+               }
+
+               perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL,
+                                                  lustre_swab_mdt_remote_perm);
+               if (unlikely(perm == NULL)) {
+                       mutex_unlock(&lli->lli_rmtperm_mutex);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               rc = ll_update_remote_perm(inode, perm);
+               mutex_unlock(&lli->lli_rmtperm_mutex);
+               if (rc == -ENOMEM)
+                       break;
+
+               ptlrpc_req_finished(req);
+               req = NULL;
+       } while (1);
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+#if 0  /* NB: remote perms can't be freed in ll_mdc_blocking_ast of UPDATE lock,
+       * because it will fail sanity test 48.
+       */
+void ll_free_remote_perms(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct hlist_head *hash = lli->lli_remote_perms;
+       struct ll_remote_perm *lrp;
+       struct hlist_node *node, *next;
+       int i;
+
+       LASSERT(hash);
+
+       spin_lock(&lli->lli_lock);
+
+       for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) {
+               hlist_for_each_entry_safe(lrp, node, next, hash + i,
+                                             lrp_list)
+                       free_ll_remote_perm(lrp);
+       }
+
+       spin_unlock(&lli->lli_lock);
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
new file mode 100644 (file)
index 0000000..0a0ac26
--- /dev/null
@@ -0,0 +1,1307 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/rw.c
+ *
+ * Lustre Lite I/O page cache routines shared by different kernel revs
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/writeback.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+/* current_is_kswapd() */
+#include <linux/swap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include <obd_cksum.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/**
+ * Finalizes cl-data before exiting typical address_space operation. Dual to
+ * ll_cl_init().
+ */
+static void ll_cl_fini(struct ll_cl_context *lcc)
+{
+       struct lu_env  *env  = lcc->lcc_env;
+       struct cl_io   *io   = lcc->lcc_io;
+       struct cl_page *page = lcc->lcc_page;
+
+       LASSERT(lcc->lcc_cookie == current);
+       LASSERT(env != NULL);
+
+       if (page != NULL) {
+               lu_ref_del(&page->cp_reference, "cl_io", io);
+               cl_page_put(env, page);
+       }
+
+       if (io && lcc->lcc_created) {
+               cl_io_end(env, io);
+               cl_io_unlock(env, io);
+               cl_io_iter_fini(env, io);
+               cl_io_fini(env, io);
+       }
+       cl_env_put(env, &lcc->lcc_refcheck);
+}
+
+/**
+ * Initializes common cl-data at the typical address_space operation entry
+ * point.
+ */
+static struct ll_cl_context *ll_cl_init(struct file *file,
+                                       struct page *vmpage, int create)
+{
+       struct ll_cl_context *lcc;
+       struct lu_env    *env;
+       struct cl_io     *io;
+       struct cl_object *clob;
+       struct ccc_io    *cio;
+
+       int refcheck;
+       int result = 0;
+
+       clob = ll_i2info(vmpage->mapping->host)->lli_clob;
+       LASSERT(clob != NULL);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return ERR_PTR(PTR_ERR(env));
+
+       lcc = &vvp_env_info(env)->vti_io_ctx;
+       memset(lcc, 0, sizeof(*lcc));
+       lcc->lcc_env = env;
+       lcc->lcc_refcheck = refcheck;
+       lcc->lcc_cookie = current;
+
+       cio = ccc_env_io(env);
+       io = cio->cui_cl.cis_io;
+       if (io == NULL && create) {
+               struct inode *inode = vmpage->mapping->host;
+               loff_t pos;
+
+               if (mutex_trylock(&inode->i_mutex)) {
+                       mutex_unlock(&(inode)->i_mutex);
+
+                       /* this is too bad. Someone is trying to write the
+                        * page w/o holding inode mutex. This means we can
+                        * add dirty pages into cache during truncate */
+                       CERROR("Proc %s is dirting page w/o inode lock, this"
+                              "will break truncate.\n", current->comm);
+                       libcfs_debug_dumpstack(NULL);
+                       LBUG();
+                       return ERR_PTR(-EIO);
+               }
+
+               /*
+                * Loop-back driver calls ->prepare_write() and ->sendfile()
+                * methods directly, bypassing file system ->write() operation,
+                * so cl_io has to be created here.
+                */
+               io = ccc_env_thread_io(env);
+               ll_io_init(io, file, 1);
+
+               /* No lock at all for this kind of IO - we can't do it because
+                * we have held page lock, it would cause deadlock.
+                * XXX: This causes poor performance to loop device - One page
+                *      per RPC.
+                *      In order to get better performance, users should use
+                *      lloop driver instead.
+                */
+               io->ci_lockreq = CILR_NEVER;
+
+               pos = (vmpage->index << PAGE_CACHE_SHIFT);
+
+               /* Create a temp IO to serve write. */
+               result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE);
+               if (result == 0) {
+                       cio->cui_fd = LUSTRE_FPRIVATE(file);
+                       cio->cui_iov = NULL;
+                       cio->cui_nrsegs = 0;
+                       result = cl_io_iter_init(env, io);
+                       if (result == 0) {
+                               result = cl_io_lock(env, io);
+                               if (result == 0)
+                                       result = cl_io_start(env, io);
+                       }
+               } else
+                       result = io->ci_result;
+               lcc->lcc_created = 1;
+       }
+
+       lcc->lcc_io = io;
+       if (io == NULL)
+               result = -EIO;
+       if (result == 0) {
+               struct cl_page   *page;
+
+               LASSERT(io != NULL);
+               LASSERT(io->ci_state == CIS_IO_GOING);
+               LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file));
+               page = cl_page_find(env, clob, vmpage->index, vmpage,
+                                   CPT_CACHEABLE);
+               if (!IS_ERR(page)) {
+                       lcc->lcc_page = page;
+                       lu_ref_add(&page->cp_reference, "cl_io", io);
+                       result = 0;
+               } else
+                       result = PTR_ERR(page);
+       }
+       if (result) {
+               ll_cl_fini(lcc);
+               lcc = ERR_PTR(result);
+       }
+
+       CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %d %p %p\n",
+              vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result,
+              env, io);
+       return lcc;
+}
+
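+/* Return the per-thread ll_cl_context previously set up by ll_cl_init();
+ * the env reference taken here is dropped immediately because ll_cl_init()
+ * still holds one. */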
+static struct ll_cl_context *ll_cl_get(void)
+{
+       struct ll_cl_context *lcc;
+       struct lu_env *env;
+       int refcheck;
+
+       env = cl_env_get(&refcheck);
+       LASSERT(!IS_ERR(env));
+       lcc = &vvp_env_info(env)->vti_io_ctx;
+       LASSERT(env == lcc->lcc_env);
+       LASSERT(current == lcc->lcc_cookie);
+       cl_env_put(env, &refcheck);
+
+       /* env was obtained in ll_cl_init(), so it is still usable. */
+       return lcc;
+}
+
+/**
+ * ->prepare_write() address space operation called by generic_file_write()
+ * for every page during write.
+ */
+int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from,
+                    unsigned to)
+{
+       struct ll_cl_context *lcc;
+       int result;
+       ENTRY;
+
+       lcc = ll_cl_init(file, vmpage, 1);
+       if (!IS_ERR(lcc)) {
+               struct lu_env  *env = lcc->lcc_env;
+               struct cl_io   *io  = lcc->lcc_io;
+               struct cl_page *page = lcc->lcc_page;
+
+               cl_page_assume(env, io, page);
+
+               result = cl_io_prepare_write(env, io, page, from, to);
+               if (result == 0) {
+                       /*
+                        * Add a reference, so that page is not evicted from
+                        * the cache until ->commit_write() is called.
+                        */
+                       cl_page_get(page);
+                       lu_ref_add(&page->cp_reference, "prepare_write",
+                                  current);
+               } else {
+                       cl_page_unassume(env, io, page);
+                       ll_cl_fini(lcc);
+               }
+               /* returning 0 in prepare assumes commit must be called
+                * afterwards */
+       } else {
+               result = PTR_ERR(lcc);
+       }
+       RETURN(result);
+}
+
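+/* ->commit_write() counterpart of ll_prepare_write(): push the written range
+ * through cl_io_commit_write() and drop the page reference taken in
+ * ll_prepare_write(). */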
+int ll_commit_write(struct file *file, struct page *vmpage, unsigned from,
+                   unsigned to)
+{
+       struct ll_cl_context *lcc;
+       struct lu_env    *env;
+       struct cl_io     *io;
+       struct cl_page   *page;
+       int result = 0;
+       ENTRY;
+
+       lcc  = ll_cl_get();
+       env  = lcc->lcc_env;
+       page = lcc->lcc_page;
+       io   = lcc->lcc_io;
+
+       LASSERT(cl_page_is_owned(page, io));
+       LASSERT(from <= to);
+       if (from != to) /* handle short write case. */
+               result = cl_io_commit_write(env, io, page, from, to);
+       if (cl_page_is_owned(page, io))
+               cl_page_unassume(env, io, page);
+
+       /*
+        * Release reference acquired by ll_prepare_write().
+        */
+       lu_ref_del(&page->cp_reference, "prepare_write", current);
+       cl_page_put(env, page);
+       ll_cl_fini(lcc);
+       RETURN(result);
+}
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt)
+{
+       __u64 opc;
+
+       opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
+       return ll_osscapa_get(inode, opc);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
+
+/**
+ * Get readahead pages from the filesystem readahead pool of the client for a
+ * thread.
+ *
+ * \param sbi   superblock for filesystem readahead state ll_ra_info
+ * \param ria   per-thread readahead state
+ * \param pages number of pages requested for readahead for the thread.
+ *
+ * WARNING: This algorithm is used to reduce contention on sbi->ll_lock.
+ * It should work well if ra_max_pages is much greater than the single
+ * file's read-ahead window, and there are not too many threads contending
+ * for these readahead pages.
+ *
+ * TODO: There may be a 'global sync problem' if many threads are trying
+ * to get an ra budget that is larger than the remaining readahead pages
+ * and reach here at exactly the same time. They will compute \a ret to
+ * consume the remaining pages, but will fail at atomic_add_return() and
+ * get a zero ra window, although there is still ra space remaining. - Jay */
+
+static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
+                                    struct ra_io_arg *ria,
+                                    unsigned long pages)
+{
+       struct ll_ra_info *ra = &sbi->ll_ra_info;
+       long ret;
+       ENTRY;
+
+       /* If read-ahead pages left are less than 1M, do not do read-ahead,
+        * otherwise it will form small read RPCs (< 1M), which hurt server
+        * performance a lot. */
+       ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages);
+       if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
+               GOTO(out, ret = 0);
+
+       /* If the non-strided (ria_pages == 0) readahead window
+        * (ria_start + ret) has grown across an RPC boundary, then trim
+        * readahead size by the amount beyond the RPC so it ends on an
+        * RPC boundary. If the readahead window is already ending on
+        * an RPC boundary (beyond_rpc == 0), or smaller than a full
+        * RPC (beyond_rpc < ret) the readahead size is unchanged.
+        * The (beyond_rpc != 0) check is skipped since the conditional
+        * branch is more expensive than subtracting zero from the result.
+        *
+        * Strided read is left unaligned to avoid small fragments beyond
+        * the RPC boundary from needing an extra read RPC. */
+       if (ria->ria_pages == 0) {
+               long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
+               if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
+                       ret -= beyond_rpc;
+       }
+
+       if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
+               atomic_sub(ret, &ra->ra_cur_pages);
+               ret = 0;
+       }
+
+out:
+       RETURN(ret);
+}
+
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
+{
+       struct ll_ra_info *ra = &sbi->ll_ra_info;
+       atomic_sub(len, &ra->ra_cur_pages);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
+{
+       LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
+       lprocfs_counter_incr(sbi->ll_ra_stats, which);
+}
+
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(mapping->host);
+       ll_ra_stats_inc_sbi(sbi, which);
+}
+
+#define RAS_CDEBUG(ras) \
+       CDEBUG(D_READA,                                               \
+              "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu"    \
+              "csr %lu sf %lu sp %lu sl %lu \n",                           \
+              ras->ras_last_readpage, ras->ras_consecutive_requests,   \
+              ras->ras_consecutive_pages, ras->ras_window_start,           \
+              ras->ras_window_len, ras->ras_next_readahead,             \
+              ras->ras_requests, ras->ras_request_index,                   \
+              ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
+              ras->ras_stride_pages, ras->ras_stride_length)
+
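+/* Return true if @index falls inside the window [point - before, point + after],
+ * clamping the bounds on unsigned wrap-around. */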
+static int index_in_window(unsigned long index, unsigned long point,
+                          unsigned long before, unsigned long after)
+{
+       unsigned long start = point - before, end = point + after;
+
+       if (start > point)
+              start = 0;
+       if (end < point)
+              end = ~0;
+
+       return start <= index && index <= end;
+}
+
+static struct ll_readahead_state *ll_ras_get(struct file *f)
+{
+       struct ll_file_data       *fd;
+
+       fd = LUSTRE_FPRIVATE(f);
+       return &fd->fd_ras;
+}
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
+{
+       struct ll_readahead_state *ras;
+
+       ras = ll_ras_get(f);
+
+       spin_lock(&ras->ras_lock);
+       ras->ras_requests++;
+       ras->ras_request_index = 0;
+       ras->ras_consecutive_requests++;
+       rar->lrr_reader = current;
+
+       list_add(&rar->lrr_linkage, &ras->ras_read_beads);
+       spin_unlock(&ras->ras_lock);
+}
+
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
+{
+       struct ll_readahead_state *ras;
+
+       ras = ll_ras_get(f);
+
+       spin_lock(&ras->ras_lock);
+       list_del_init(&rar->lrr_linkage);
+       spin_unlock(&ras->ras_lock);
+}
+
+static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
+{
+       struct ll_ra_read *scan;
+
+       list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) {
+               if (scan->lrr_reader == current)
+                       return scan;
+       }
+       return NULL;
+}
+
+struct ll_ra_read *ll_ra_read_get(struct file *f)
+{
+       struct ll_readahead_state *ras;
+       struct ll_ra_read        *bead;
+
+       ras = ll_ras_get(f);
+
+       spin_lock(&ras->ras_lock);
+       bead = ll_ra_read_get_locked(ras);
+       spin_unlock(&ras->ras_lock);
+       return bead;
+}
+
+static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+                             struct cl_page_list *queue, struct cl_page *page,
+                             struct page *vmpage)
+{
+       struct ccc_page *cp;
+       int           rc;
+
+       ENTRY;
+
+       rc = 0;
+       cl_page_assume(env, io, page);
+       lu_ref_add(&page->cp_reference, "ra", current);
+       cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+       if (!cp->cpg_defer_uptodate && !PageUptodate(vmpage)) {
+               rc = cl_page_is_under_lock(env, io, page);
+               if (rc == -EBUSY) {
+                       cp->cpg_defer_uptodate = 1;
+                       cp->cpg_ra_used = 0;
+                       cl_page_list_add(queue, page);
+                       rc = 1;
+               } else {
+                       cl_page_delete(env, page);
+                       rc = -ENOLCK;
+               }
+       } else {
+               /* skip completed pages */
+               cl_page_unassume(env, io, page);
+       }
+       lu_ref_del(&page->cp_reference, "ra", current);
+       cl_page_put(env, page);
+       RETURN(rc);
+}
+
+/**
+ * Initiates read-ahead of a page with given index.
+ *
+ * \retval     +ve: page was added to \a queue.
+ *
+ * \retval -ENOLCK: there is no extent lock for this part of a file, stop
+ *               read-ahead.
+ *
+ * \retval  -ve, 0: page wasn't added to \a queue for other reason.
+ */
+static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+                             struct cl_page_list *queue,
+                             pgoff_t index, struct address_space *mapping)
+{
+       struct page      *vmpage;
+       struct cl_object *clob  = ll_i2info(mapping->host)->lli_clob;
+       struct cl_page   *page;
+       enum ra_stat      which = _NR_RA_STAT; /* keep gcc happy */
+       unsigned int      gfp_mask;
+       int            rc    = 0;
+       const char       *msg   = NULL;
+
+       ENTRY;
+
+       gfp_mask = GFP_HIGHUSER & ~__GFP_WAIT;
+#ifdef __GFP_NOWARN
+       gfp_mask |= __GFP_NOWARN;
+#endif
+       vmpage = grab_cache_page_nowait(mapping, index);
+       if (vmpage != NULL) {
+               /* Check if vmpage was truncated or reclaimed */
+               if (vmpage->mapping == mapping) {
+                       page = cl_page_find(env, clob, vmpage->index,
+                                           vmpage, CPT_CACHEABLE);
+                       if (!IS_ERR(page)) {
+                               rc = cl_read_ahead_page(env, io, queue,
+                                                       page, vmpage);
+                               if (rc == -ENOLCK) {
+                                       which = RA_STAT_FAILED_MATCH;
+                                       msg   = "lock match failed";
+                               }
+                       } else {
+                               which = RA_STAT_FAILED_GRAB_PAGE;
+                               msg   = "cl_page_find failed";
+                       }
+               } else {
+                       which = RA_STAT_WRONG_GRAB_PAGE;
+                       msg   = "g_c_p_n returned invalid page";
+               }
+               if (rc != 1)
+                       unlock_page(vmpage);
+               page_cache_release(vmpage);
+       } else {
+               which = RA_STAT_FAILED_GRAB_PAGE;
+               msg   = "g_c_p_n failed";
+       }
+       if (msg != NULL) {
+               ll_ra_stats_inc(mapping, which);
+               CDEBUG(D_READA, "%s\n", msg);
+       }
+       RETURN(rc);
+}
+
+#define RIA_DEBUG(ria)                                                \
+       CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n",       \
+       ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
+       ria->ria_pages)
+
+/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
+ * know what the actual RPC size is.  If this needs to change, it makes more
+ * sense to tune the i_blkbits value for the file based on the OSTs it is
+ * striped over, rather than having a constant value for all files here. */
+
+/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)).
+ * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPCs are enabled
+ * by default, this should be adjusted together with max_read_ahead_mb
+ * and max_read_ahead_per_file_mb; otherwise the readahead budget can be used
+ * up quickly, which will affect read performance significantly. See LU-2816 */
+#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+static inline int stride_io_mode(struct ll_readahead_state *ras)
+{
+       return ras->ras_consecutive_stride_requests > 1;
+}
+/* The function calculates how many pages will be read in
+ * [off, off + length] within a stride I/O area where
+ * stride_offset = st_off, stride_length = st_len,
+ * stride_pages = st_pgs
+ *
+ *   |------------------|*****|------------------|*****|------------|*****|....
+ * st_off
+ *   |--- st_pgs     ---|
+ *   |-----     st_len   -----|
+ *
+ *           How many pages it should read in such pattern
+ *           |-------------------------------------------------------------|
+ *           off
+ *           |<------            length                      ------->|
+ *
+ *       =   |<----->|  +  |-------------------------------------| +   |---|
+ *          start_left          st_pgs * i                 end_left
+ */
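+/* Worked example (hypothetical values): st_off = 0, st_len = 16, st_pgs = 4,
+ * off = 2, length = 32 covers pages 2..33.  start_left = 4 - (2 % 16) = 2
+ * (pages 2-3), one full stride in between contributes st_pgs = 4 pages
+ * (16-19), and end_left = 34 % 16 = 2 (pages 32-33), so pg_count = 8. */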
+static unsigned long
+stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs,
+               unsigned long off, unsigned long length)
+{
+       __u64 start = off > st_off ? off - st_off : 0;
+       __u64 end = off + length > st_off ? off + length - st_off : 0;
+       unsigned long start_left = 0;
+       unsigned long end_left = 0;
+       unsigned long pg_count;
+
+       if (st_len == 0 || length == 0 || end == 0)
+               return length;
+
+       start_left = do_div(start, st_len);
+       if (start_left < st_pgs)
+               start_left = st_pgs - start_left;
+       else
+               start_left = 0;
+
+       end_left = do_div(end, st_len);
+       if (end_left > st_pgs)
+               end_left = st_pgs;
+
+       CDEBUG(D_READA, "start "LPU64", end "LPU64" start_left %lu end_left %lu \n",
+              start, end, start_left, end_left);
+
+       if (start == end)
+               pg_count = end_left - (st_pgs - start_left);
+       else
+               pg_count = start_left + st_pgs * (end - start - 1) + end_left;
+
+       CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu "
+              "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count);
+
+       return pg_count;
+}
+
+static int ria_page_count(struct ra_io_arg *ria)
+{
+       __u64 length = ria->ria_end >= ria->ria_start ?
+                      ria->ria_end - ria->ria_start + 1 : 0;
+
+       return stride_pg_count(ria->ria_stoff, ria->ria_length,
+                              ria->ria_pages, ria->ria_start,
+                              length);
+}
+
+/* Check whether the index is in the defined read-ahead window */
+static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
+{
+       /* If ria_length == ria_pages, it means non-stride I/O mode and
+        * idx is always inside the read-ahead window in this case.
+        * For stride I/O mode, just check whether idx falls inside
+        * the ria_pages part of a stride. */
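+       /* E.g. (hypothetical values): with ria_stoff = 0, ria_length = 16 and
+        * ria_pages = 4, idx = 18 gives 18 % 16 = 2 < 4, i.e. inside a data
+        * chunk, while idx = 22 gives 6 >= 4, i.e. inside the stride gap. */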
+       return ria->ria_length == 0 || ria->ria_length == ria->ria_pages ||
+              (idx >= ria->ria_stoff && (idx - ria->ria_stoff) %
+               ria->ria_length < ria->ria_pages);
+}
+
+static int ll_read_ahead_pages(const struct lu_env *env,
+                              struct cl_io *io, struct cl_page_list *queue,
+                              struct ra_io_arg *ria,
+                              unsigned long *reserved_pages,
+                              struct address_space *mapping,
+                              unsigned long *ra_end)
+{
+       int rc, count = 0, stride_ria;
+       unsigned long page_idx;
+
+       LASSERT(ria != NULL);
+       RIA_DEBUG(ria);
+
+       stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
+       for (page_idx = ria->ria_start; page_idx <= ria->ria_end &&
+                       *reserved_pages > 0; page_idx++) {
+               if (ras_inside_ra_window(page_idx, ria)) {
+                       /* If the page is inside the read-ahead window*/
+                       rc = ll_read_ahead_page(env, io, queue,
+                                               page_idx, mapping);
+                       if (rc == 1) {
+                               (*reserved_pages)--;
+                               count ++;
+                       } else if (rc == -ENOLCK)
+                               break;
+               } else if (stride_ria) {
+                       /* If the page is not in the read-ahead window and we
+                        * are in stride read-ahead mode, check whether it
+                        * should skip the stride gap */
+                       pgoff_t offset;
+                       /* FIXME: This assertion is only valid for forward
+                        * read-ahead; it will be fixed when backward
+                        * read-ahead is implemented */
+                       LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu "
+                               "rs %lu re %lu ro %lu rl %lu rp %lu\n", page_idx,
+                               ria->ria_start, ria->ria_end, ria->ria_stoff,
+                               ria->ria_length, ria->ria_pages);
+                       offset = page_idx - ria->ria_stoff;
+                       offset = offset % (ria->ria_length);
+                       if (offset > ria->ria_pages) {
+                               page_idx += ria->ria_length - offset;
+                               CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
+                                      ria->ria_length - offset);
+                               continue;
+                       }
+               }
+       }
+       *ra_end = page_idx;
+       return count;
+}
+
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+                struct ll_readahead_state *ras, struct address_space *mapping,
+                struct cl_page_list *queue, int flags)
+{
+       struct vvp_io *vio = vvp_env_io(env);
+       struct vvp_thread_info *vti = vvp_env_info(env);
+       struct cl_attr *attr = ccc_env_thread_attr(env);
+       unsigned long start = 0, end = 0, reserved;
+       unsigned long ra_end, len;
+       struct inode *inode;
+       struct ll_ra_read *bead;
+       struct ra_io_arg *ria = &vti->vti_ria;
+       struct ll_inode_info *lli;
+       struct cl_object *clob;
+       int ret = 0;
+       __u64 kms;
+       ENTRY;
+
+       inode = mapping->host;
+       lli = ll_i2info(inode);
+       clob = lli->lli_clob;
+
+       memset(ria, 0, sizeof *ria);
+
+       cl_object_attr_lock(clob);
+       ret = cl_object_attr_get(env, clob, attr);
+       cl_object_attr_unlock(clob);
+
+       if (ret != 0)
+               RETURN(ret);
+       kms = attr->cat_kms;
+       if (kms == 0) {
+               ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN);
+               RETURN(0);
+       }
+
+       spin_lock(&ras->ras_lock);
+       if (vio->cui_ra_window_set)
+               bead = &vio->cui_bead;
+       else
+               bead = NULL;
+
+       /* Enlarge the RA window to encompass the full read */
+       if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
+           bead->lrr_start + bead->lrr_count) {
+               ras->ras_window_len = bead->lrr_start + bead->lrr_count -
+                                     ras->ras_window_start;
+       }
+       /* Reserve a part of the read-ahead window that we'll be issuing */
+       if (ras->ras_window_len) {
+               start = ras->ras_next_readahead;
+               end = ras->ras_window_start + ras->ras_window_len - 1;
+       }
+       if (end != 0) {
+               unsigned long rpc_boundary;
+               /*
+                * Align RA window to an optimal boundary.
+                *
+                * XXX This would be better to align to cl_max_pages_per_rpc
+                * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
+                * be aligned to the RAID stripe size in the future and that
+                * is more important than the RPC size.
+                */
+               /* Note: we only trim the window to the RPC boundary, instead
+                * of extending it to the boundary, to avoid reading too many
+                * pages during random reads. */
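+               /* E.g. (hypothetical values): with PTLRPC_MAX_BRW_PAGES of 256,
+                * start = 100 and end = 300, rpc_boundary = 301 & ~255 = 256,
+                * then 255 after the decrement; 255 > start, so the window is
+                * trimmed to end at page 255, the last page of an RPC. */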
+               rpc_boundary = ((end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1)));
+               if (rpc_boundary > 0)
+                       rpc_boundary--;
+
+               if (rpc_boundary  > start)
+                       end = rpc_boundary;
+
+               /* Truncate RA window to end of file */
+               end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
+
+               ras->ras_next_readahead = max(end, end + 1);
+               RAS_CDEBUG(ras);
+       }
+       ria->ria_start = start;
+       ria->ria_end = end;
+       /* If stride I/O mode is detected, get the stride window */
+       if (stride_io_mode(ras)) {
+               ria->ria_stoff = ras->ras_stride_offset;
+               ria->ria_length = ras->ras_stride_length;
+               ria->ria_pages = ras->ras_stride_pages;
+       }
+       spin_unlock(&ras->ras_lock);
+
+       if (end == 0) {
+               ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW);
+               RETURN(0);
+       }
+       len = ria_page_count(ria);
+       if (len == 0)
+               RETURN(0);
+
+       reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len);
+       if (reserved < len)
+               ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
+
+       CDEBUG(D_READA, "reserved page %lu ra_cur %d ra_max %lu\n", reserved,
+              atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
+              ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
+
+       ret = ll_read_ahead_pages(env, io, queue,
+                                 ria, &reserved, mapping, &ra_end);
+
+       LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
+       if (reserved != 0)
+               ll_ra_count_put(ll_i2sbi(inode), reserved);
+
+       if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT))
+               ll_ra_stats_inc(mapping, RA_STAT_EOF);
+
+       /* If we didn't get to the end of the region we reserved from
+        * the ras we need to go back and update the ras so that the
+        * next read-ahead tries from where we left off.  We only do so
+        * if the region we failed to issue read-ahead on is still ahead
+        * of the app and behind the next index to start read-ahead from. */
+       CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n",
+              ra_end, end, ria->ria_end);
+
+       if (ra_end != end + 1) {
+               spin_lock(&ras->ras_lock);
+               if (ra_end < ras->ras_next_readahead &&
+                   index_in_window(ra_end, ras->ras_window_start, 0,
+                                   ras->ras_window_len)) {
+                       ras->ras_next_readahead = ra_end;
+                       RAS_CDEBUG(ras);
+               }
+               spin_unlock(&ras->ras_lock);
+       }
+
+       RETURN(ret);
+}
+
+static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
+                         unsigned long index)
+{
+       ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+}
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_reset(struct inode *inode, struct ll_readahead_state *ras,
+                     unsigned long index)
+{
+       ras->ras_last_readpage = index;
+       ras->ras_consecutive_requests = 0;
+       ras->ras_consecutive_pages = 0;
+       ras->ras_window_len = 0;
+       ras_set_start(inode, ras, index);
+       ras->ras_next_readahead = max(ras->ras_window_start, index);
+
+       RAS_CDEBUG(ras);
+}
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_stride_reset(struct ll_readahead_state *ras)
+{
+       ras->ras_consecutive_stride_requests = 0;
+       ras->ras_stride_length = 0;
+       ras->ras_stride_pages = 0;
+       RAS_CDEBUG(ras);
+}
+
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
+{
+       spin_lock_init(&ras->ras_lock);
+       ras_reset(inode, ras, 0);
+       ras->ras_requests = 0;
+       INIT_LIST_HEAD(&ras->ras_read_beads);
+}
+
+/*
+ * Check whether the read request is in the stride window.
+ * If it is in the stride window, return 1, otherwise return 0.
+ */
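+/* E.g. (hypothetical values): with ras_stride_length = 16, ras_stride_pages = 4
+ * and ras_last_readpage = 19, a read at index 32 has stride_gap = 12, which
+ * equals stride_length - stride_pages, so it matches the stride pattern once
+ * ras_consecutive_pages has reached ras_stride_pages. */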
+static int index_in_stride_window(struct ll_readahead_state *ras,
+                                 unsigned long index)
+{
+       unsigned long stride_gap;
+
+       if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 ||
+           ras->ras_stride_pages == ras->ras_stride_length)
+               return 0;
+
+       stride_gap = index - ras->ras_last_readpage - 1;
+
+       /* If it is contiguous read */
+       if (stride_gap == 0)
+               return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
+
+       /* Otherwise check the stride by itself */
+       return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
+               ras->ras_consecutive_pages == ras->ras_stride_pages;
+}
+
+static void ras_update_stride_detector(struct ll_readahead_state *ras,
+                                      unsigned long index)
+{
+       unsigned long stride_gap = index - ras->ras_last_readpage - 1;
+
+       if (!stride_io_mode(ras) && (stride_gap != 0 ||
+            ras->ras_consecutive_stride_requests == 0)) {
+               ras->ras_stride_pages = ras->ras_consecutive_pages;
+               ras->ras_stride_length = stride_gap +
+                                        ras->ras_consecutive_pages;
+       }
+       LASSERT(ras->ras_request_index == 0);
+       LASSERT(ras->ras_consecutive_stride_requests == 0);
+
+       if (index <= ras->ras_last_readpage) {
+               /* Reset the stride window for a backward or repeated read;
+                * stride detection only handles forward reads */
+               ras_stride_reset(ras);
+               return;
+       }
+
+       ras->ras_stride_pages = ras->ras_consecutive_pages;
+       ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages;
+
+       RAS_CDEBUG(ras);
+       return;
+}
+
+static unsigned long
+stride_page_count(struct ll_readahead_state *ras, unsigned long len)
+{
+       return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length,
+                              ras->ras_stride_pages, ras->ras_stride_offset,
+                              len);
+}
+
+/* The stride read-ahead window will be increased by inc_len according to
+ * the stride I/O pattern */
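+/* Worked example (hypothetical values): with stride_offset = 0,
+ * stride_length = 16, stride_pages = 4, window_start = 0, window_len = 16 and
+ * inc_len = 8: left = 16 % 16 = 0, then left becomes 8, step = 2, and the
+ * window grows to 16 + 2 * 16 = 48 pages, i.e. two more whole strides
+ * covering the 8 extra data pages requested. */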
+static void ras_stride_increase_window(struct ll_readahead_state *ras,
+                                      struct ll_ra_info *ra,
+                                      unsigned long inc_len)
+{
+       unsigned long left, step, window_len;
+       unsigned long stride_len;
+
+       LASSERT(ras->ras_stride_length > 0);
+       LASSERTF(ras->ras_window_start + ras->ras_window_len
+                >= ras->ras_stride_offset, "window_start %lu, window_len %lu"
+                " stride_offset %lu\n", ras->ras_window_start,
+                ras->ras_window_len, ras->ras_stride_offset);
+
+       stride_len = ras->ras_window_start + ras->ras_window_len -
+                    ras->ras_stride_offset;
+
+       left = stride_len % ras->ras_stride_length;
+       window_len = ras->ras_window_len - left;
+
+       if (left < ras->ras_stride_pages)
+               left += inc_len;
+       else
+               left = ras->ras_stride_pages + inc_len;
+
+       LASSERT(ras->ras_stride_pages != 0);
+
+       step = left / ras->ras_stride_pages;
+       left %= ras->ras_stride_pages;
+
+       window_len += step * ras->ras_stride_length + left;
+
+       if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
+               ras->ras_window_len = window_len;
+
+       RAS_CDEBUG(ras);
+}
+
+static void ras_increase_window(struct inode *inode,
+                               struct ll_readahead_state *ras,
+                               struct ll_ra_info *ra)
+{
+       /* The stretch of the read-ahead window should be aligned to the max
+        * rpc_size, but the current clio architecture does not support
+        * retrieving such information from the lower layer. FIXME later
+        */
+       if (stride_io_mode(ras))
+               ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
+       else
+               ras->ras_window_len = min(ras->ras_window_len +
+                                         RAS_INCREASE_STEP(inode),
+                                         ra->ra_max_pages_per_file);
+}
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+               struct ll_readahead_state *ras, unsigned long index,
+               unsigned hit)
+{
+       struct ll_ra_info *ra = &sbi->ll_ra_info;
+       int zero = 0, stride_detect = 0, ra_miss = 0;
+       ENTRY;
+
+       spin_lock(&ras->ras_lock);
+
+       ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
+
+       /* Reset the read-ahead window in two cases.  First, when the app seeks
+        * or reads to some other part of the file.  Second, if we get a
+        * read-ahead miss that we think we've previously issued.  This can
+        * be a symptom of there being so many read-ahead pages that the VM is
+        * reclaiming them before we get to them. */
+       if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
+               zero = 1;
+               ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE);
+       } else if (!hit && ras->ras_window_len &&
+                  index < ras->ras_next_readahead &&
+                  index_in_window(index, ras->ras_window_start, 0,
+                                  ras->ras_window_len)) {
+               ra_miss = 1;
+               ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
+       }
+
+       /* On the second access to a file smaller than the tunable
+        * ra_max_read_ahead_whole_pages, trigger RA on all pages in the
+        * file up to ra_max_pages_per_file.  This is simply a best effort
+        * and only occurs once per open file.  Normal RA behavior resumes
+        * for subsequent IO.  The mmap case does not increment
+        * ras_requests and thus can never trigger this behavior. */
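+       /* E.g. (hypothetical values): for a 2MB file with 4kB pages, kms_pages
+        * is 512; if ra_max_read_ahead_whole_pages allows at least that many
+        * pages, the whole file is read ahead on the second read request. */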
+       if (ras->ras_requests == 2 && !ras->ras_request_index) {
+               __u64 kms_pages;
+
+               kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+                           PAGE_CACHE_SHIFT;
+
+               CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
+                      ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
+
+               if (kms_pages &&
+                   kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+                       ras->ras_window_start = 0;
+                       ras->ras_last_readpage = 0;
+                       ras->ras_next_readahead = 0;
+                       ras->ras_window_len = min(ra->ra_max_pages_per_file,
+                               ra->ra_max_read_ahead_whole_pages);
+                       GOTO(out_unlock, 0);
+               }
+       }
+       if (zero) {
+               /* Check whether it is in stride I/O mode */
+               if (!index_in_stride_window(ras, index)) {
+                       if (ras->ras_consecutive_stride_requests == 0 &&
+                           ras->ras_request_index == 0) {
+                               ras_update_stride_detector(ras, index);
+                               ras->ras_consecutive_stride_requests++;
+                       } else {
+                               ras_stride_reset(ras);
+                       }
+                       ras_reset(inode, ras, index);
+                       ras->ras_consecutive_pages++;
+                       GOTO(out_unlock, 0);
+               } else {
+                       ras->ras_consecutive_pages = 0;
+                       ras->ras_consecutive_requests = 0;
+                       if (++ras->ras_consecutive_stride_requests > 1)
+                               stride_detect = 1;
+                       RAS_CDEBUG(ras);
+               }
+       } else {
+               if (ra_miss) {
+                       if (index_in_stride_window(ras, index) &&
+                           stride_io_mode(ras)) {
+                               /* If stride-RA hits a cache miss, the stride
+                                * detector will not be reset, to avoid the
+                                * overhead of re-detecting read-ahead mode */
+                               if (index != ras->ras_last_readpage + 1)
+                                       ras->ras_consecutive_pages = 0;
+                               ras_reset(inode, ras, index);
+                               RAS_CDEBUG(ras);
+                       } else {
+                               /* Reset both stride window and normal RA
+                                * window */
+                               ras_reset(inode, ras, index);
+                               ras->ras_consecutive_pages++;
+                               ras_stride_reset(ras);
+                               GOTO(out_unlock, 0);
+                       }
+               } else if (stride_io_mode(ras)) {
+                       /* If this is a contiguous read but we are currently in
+                        * stride I/O mode, check whether the stride step is
+                        * still valid; if not, reset the stride ra window */
+                       if (!index_in_stride_window(ras, index)) {
+                               /* Shrink stride read-ahead window to be zero */
+                               ras_stride_reset(ras);
+                               ras->ras_window_len = 0;
+                               ras->ras_next_readahead = index;
+                       }
+               }
+       }
+       ras->ras_consecutive_pages++;
+       ras->ras_last_readpage = index;
+       ras_set_start(inode, ras, index);
+
+       if (stride_io_mode(ras))
+               /* Stride readahead is sensitive to the read-ahead offset,
+                * so we use the original offset here instead of
+                * ras_window_start, which is RPC-aligned */
+               ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+       else
+               ras->ras_next_readahead = max(ras->ras_window_start,
+                                             ras->ras_next_readahead);
+       RAS_CDEBUG(ras);
+
+       /* Trigger RA in the mmap case where ras_consecutive_requests
+        * is not incremented and thus can't be used to trigger RA */
+       if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) {
+               ras->ras_window_len = RAS_INCREASE_STEP(inode);
+               GOTO(out_unlock, 0);
+       }
+
+       /* Initially reset the stride window offset to next_readahead */
+       if (ras->ras_consecutive_stride_requests == 2 && stride_detect) {
+               /**
+                * Once stride IO mode is detected, next_readahead should be
+                * reset to make sure next_readahead > stride offset
+                */
+               ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+               ras->ras_stride_offset = index;
+               ras->ras_window_len = RAS_INCREASE_STEP(inode);
+       }
+
+       /* The initial ras_window_len is set to the request size.  To avoid
+        * uselessly reading and discarding pages for random IO the window is
+        * only increased once per consecutive request received. */
+       if ((ras->ras_consecutive_requests > 1 || stride_detect) &&
+           !ras->ras_request_index)
+               ras_increase_window(inode, ras, ra);
+       EXIT;
+out_unlock:
+       RAS_CDEBUG(ras);
+       ras->ras_request_index++;
+       spin_unlock(&ras->ras_lock);
+       return;
+}
+
+int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
+{
+       struct inode           *inode = vmpage->mapping->host;
+       struct ll_inode_info   *lli   = ll_i2info(inode);
+       struct lu_env     *env;
+       struct cl_io       *io;
+       struct cl_page   *page;
+       struct cl_object       *clob;
+       struct cl_env_nest      nest;
+       bool redirtied = false;
+       bool unlocked = false;
+       int result;
+       ENTRY;
+
+       LASSERT(PageLocked(vmpage));
+       LASSERT(!PageWriteback(vmpage));
+
+       LASSERT(ll_i2dtexp(inode) != NULL);
+
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               GOTO(out, result = PTR_ERR(env));
+
+       clob  = ll_i2info(inode)->lli_clob;
+       LASSERT(clob != NULL);
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = clob;
+       io->ci_ignore_layout = 1;
+       result = cl_io_init(env, io, CIT_MISC, clob);
+       if (result == 0) {
+               page = cl_page_find(env, clob, vmpage->index,
+                                   vmpage, CPT_CACHEABLE);
+               if (!IS_ERR(page)) {
+                       lu_ref_add(&page->cp_reference, "writepage",
+                                  current);
+                       cl_page_assume(env, io, page);
+                       result = cl_page_flush(env, io, page);
+                       if (result != 0) {
+                               /*
+                                * Re-dirty page on error so it retries write,
+                                * but not in case when IO has actually
+                                * occurred and completed with an error.
+                                */
+                               if (!PageError(vmpage)) {
+                                       redirty_page_for_writepage(wbc, vmpage);
+                                       result = 0;
+                                       redirtied = true;
+                               }
+                       }
+                       cl_page_disown(env, io, page);
+                       unlocked = true;
+                       lu_ref_del(&page->cp_reference,
+                                  "writepage", current);
+                       cl_page_put(env, page);
+               } else {
+                       result = PTR_ERR(page);
+               }
+       }
+       cl_io_fini(env, io);
+
+       if (redirtied && wbc->sync_mode == WB_SYNC_ALL) {
+               loff_t offset = cl_offset(clob, vmpage->index);
+
+               /* Flush page failed because the extent is being written out.
+                * Wait for the write of extent to be finished to avoid
+                * breaking kernel which assumes ->writepage should mark
+                * PageWriteback or clean the page. */
+               result = cl_sync_file_range(inode, offset,
+                                           offset + PAGE_CACHE_SIZE - 1,
+                                           CL_FSYNC_LOCAL);
+               if (result > 0) {
+                       /* Actually we may have written more than one page;
+                        * deduct this page because the caller will count it. */
+                       wbc->nr_to_write -= result - 1;
+                       result = 0;
+               }
+       }
+
+       cl_env_nested_put(&nest, env);
+       GOTO(out, result);
+
+out:
+       if (result < 0) {
+               if (!lli->lli_async_rc)
+                       lli->lli_async_rc = result;
+               SetPageError(vmpage);
+               if (!unlocked)
+                       unlock_page(vmpage);
+       }
+       return result;
+}
+
+int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+       struct inode *inode = mapping->host;
+       loff_t start;
+       loff_t end;
+       enum cl_fsync_mode mode;
+       int range_whole = 0;
+       int result;
+       ENTRY;
+
+       if (wbc->range_cyclic) {
+               start = mapping->writeback_index << PAGE_CACHE_SHIFT;
+               end = OBD_OBJECT_EOF;
+       } else {
+               start = wbc->range_start;
+               end = wbc->range_end;
+               if (end == LLONG_MAX) {
+                       end = OBD_OBJECT_EOF;
+                       range_whole = start == 0;
+               }
+       }
+
+       mode = CL_FSYNC_NONE;
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               mode = CL_FSYNC_LOCAL;
+
+       result = cl_sync_file_range(inode, start, end, mode);
+       if (result > 0) {
+               wbc->nr_to_write -= result;
+               result = 0;
+       }
+
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
+               if (end == OBD_OBJECT_EOF)
+                       end = i_size_read(inode);
+               mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1;
+       }
+       RETURN(result);
+}
+
+int ll_readpage(struct file *file, struct page *vmpage)
+{
+       struct ll_cl_context *lcc;
+       int result;
+       ENTRY;
+
+       lcc = ll_cl_init(file, vmpage, 0);
+       if (!IS_ERR(lcc)) {
+               struct lu_env  *env  = lcc->lcc_env;
+               struct cl_io   *io   = lcc->lcc_io;
+               struct cl_page *page = lcc->lcc_page;
+
+               LASSERT(page->cp_type == CPT_CACHEABLE);
+               if (likely(!PageUptodate(vmpage))) {
+                       cl_page_assume(env, io, page);
+                       result = cl_io_read_page(env, io, page);
+               } else {
+                       /* Page from a non-object file. */
+                       unlock_page(vmpage);
+                       result = 0;
+               }
+               ll_cl_fini(lcc);
+       } else {
+               unlock_page(vmpage);
+               result = PTR_ERR(lcc);
+       }
+       RETURN(result);
+}
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
new file mode 100644 (file)
index 0000000..27e4e64
--- /dev/null
@@ -0,0 +1,586 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/rw26.c
+ *
+ * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <asm/uaccess.h>
+
+#include <linux/migrate.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/**
+ * Implements the Linux VM address_space::invalidatepage() method. This method
+ * is called when the page is truncated from a file, either as a result of an
+ * explicit truncate, or when the inode is removed from memory (as a result of
+ * final iput(), umount, or memory pressure induced icache shrinking).
+ *
+ * [0, offset] bytes of the page remain valid (this is the case of a non-page-
+ * aligned truncate). Lustre leaves the partially truncated page in the cache,
+ * relying on struct inode::i_size to limit further accesses.
+ */
+static void ll_invalidatepage(struct page *vmpage, unsigned long offset)
+{
+       struct inode     *inode;
+       struct lu_env    *env;
+       struct cl_page   *page;
+       struct cl_object *obj;
+
+       int refcheck;
+
+       LASSERT(PageLocked(vmpage));
+       LASSERT(!PageWriteback(vmpage));
+
+       /*
+        * It is safe not to check anything in invalidatepage/releasepage
+        * below because they run with the page locked and all our I/O
+        * happens with the page locked too
+        */
+       if (offset == 0) {
+               env = cl_env_get(&refcheck);
+               if (!IS_ERR(env)) {
+                       inode = vmpage->mapping->host;
+                       obj = ll_i2info(inode)->lli_clob;
+                       if (obj != NULL) {
+                               page = cl_vmpage_page(vmpage, obj);
+                               if (page != NULL) {
+                                       lu_ref_add(&page->cp_reference,
+                                                  "delete", vmpage);
+                                       cl_page_delete(env, page);
+                                       lu_ref_del(&page->cp_reference,
+                                                  "delete", vmpage);
+                                       cl_page_put(env, page);
+                               }
+                       } else
+                               LASSERT(vmpage->private == 0);
+                       cl_env_put(env, &refcheck);
+               }
+       }
+}
+
+#ifdef HAVE_RELEASEPAGE_WITH_INT
+#define RELEASEPAGE_ARG_TYPE int
+#else
+#define RELEASEPAGE_ARG_TYPE gfp_t
+#endif
+static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask)
+{
+       struct cl_env_nest nest;
+       struct lu_env     *env;
+       struct cl_object  *obj;
+       struct cl_page    *page;
+       struct address_space *mapping;
+       int result;
+
+       LASSERT(PageLocked(vmpage));
+       if (PageWriteback(vmpage) || PageDirty(vmpage))
+               return 0;
+
+       mapping = vmpage->mapping;
+       if (mapping == NULL)
+               return 1;
+
+       obj = ll_i2info(mapping->host)->lli_clob;
+       if (obj == NULL)
+               return 1;
+
+       /* 1 for page allocator, 1 for cl_page and 1 for page cache */
+       if (page_count(vmpage) > 3)
+               return 0;
+
+       /* TODO: determine what gfp should be used by @gfp_mask. */
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               /* If we can't allocate an env we won't call cl_page_put()
+                * later on, which means the page refcount cannot be dropped
+                * through cl_page, so ask the kernel not to free this page. */
+               return 0;
+
+       page = cl_vmpage_page(vmpage, obj);
+       result = page == NULL;
+       if (page != NULL) {
+               if (!cl_page_in_use(page)) {
+                       result = 1;
+                       cl_page_delete(env, page);
+               }
+               cl_page_put(env, page);
+       }
+       cl_env_nested_put(&nest, env);
+       return result;
+}
+
+static int ll_set_page_dirty(struct page *vmpage)
+{
+#if 0
+       struct cl_page    *page = vvp_vmpage_page_transient(vmpage);
+       struct vvp_object *obj  = cl_inode2vvp(vmpage->mapping->host);
+       struct vvp_page   *cpg;
+
+       /*
+        * XXX should page method be called here?
+        */
+       LASSERT(&obj->co_cl == page->cp_obj);
+       cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type));
+       /*
+        * XXX cannot do much here, because page is possibly not locked:
+        * sys_munmap()->...
+        *     ->unmap_page_range()->zap_pte_range()->set_page_dirty().
+        */
+       vvp_write_pending(obj, cpg);
+#endif
+       RETURN(__set_page_dirty_nobuffers(vmpage));
+}
+
+#define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL
+
+static inline int ll_get_user_pages(int rw, unsigned long user_addr,
+                                   size_t size, struct page ***pages,
+                                   int *max_pages)
+{
+       int result = -ENOMEM;
+
+       /* set an arbitrary limit to prevent arithmetic overflow */
+       if (size > MAX_DIRECTIO_SIZE) {
+               *pages = NULL;
+               return -EFBIG;
+       }
+
+       *max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       *max_pages -= user_addr >> PAGE_CACHE_SHIFT;
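+       /* E.g. (hypothetical values) with 4kB pages: an 8kB buffer starting at
+        * user_addr 0x1234 spans three pages, so max_pages is
+        * ((0x1234 + 8192 + 4095) >> 12) - (0x1234 >> 12) = 4 - 1 = 3. */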
+
+       OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages));
+       if (*pages) {
+               down_read(&current->mm->mmap_sem);
+               result = get_user_pages(current, current->mm, user_addr,
+                                       *max_pages, (rw == READ), 0, *pages,
+                                       NULL);
+               up_read(&current->mm->mmap_sem);
+               if (unlikely(result <= 0))
+                       OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages));
+       }
+
+       return result;
+}
+
+/*  ll_free_user_pages - tear down page struct array
+ *  @pages: array of page struct pointers underlying target buffer */
+static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
+{
+       int i;
+
+       for (i = 0; i < npages; i++) {
+               if (pages[i] == NULL)
+                       break;
+               if (do_dirty)
+                       set_page_dirty_lock(pages[i]);
+               page_cache_release(pages[i]);
+       }
+
+       OBD_FREE_LARGE(pages, npages * sizeof(*pages));
+}
+
+ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+                          int rw, struct inode *inode,
+                          struct ll_dio_pages *pv)
+{
+       struct cl_page    *clp;
+       struct cl_2queue  *queue;
+       struct cl_object  *obj = io->ci_obj;
+       int i;
+       ssize_t rc = 0;
+       loff_t file_offset  = pv->ldp_start_offset;
+       long size          = pv->ldp_size;
+       int page_count      = pv->ldp_nr;
+       struct page **pages = pv->ldp_pages;
+       long page_size      = cl_page_size(obj);
+       bool do_io;
+       int  io_pages       = 0;
+       ENTRY;
+
+       queue = &io->ci_queue;
+       cl_2queue_init(queue);
+       for (i = 0; i < page_count; i++) {
+               if (pv->ldp_offsets)
+                   file_offset = pv->ldp_offsets[i];
+
+               LASSERT(!(file_offset & (page_size - 1)));
+               clp = cl_page_find(env, obj, cl_index(obj, file_offset),
+                                  pv->ldp_pages[i], CPT_TRANSIENT);
+               if (IS_ERR(clp)) {
+                       rc = PTR_ERR(clp);
+                       break;
+               }
+
+               rc = cl_page_own(env, io, clp);
+               if (rc) {
+                       LASSERT(clp->cp_state == CPS_FREEING);
+                       cl_page_put(env, clp);
+                       break;
+               }
+
+               do_io = true;
+
+               /* Check the page type: if the page is a host (cached) page,
+                * then copy to/from it directly */
+               if (clp->cp_type == CPT_CACHEABLE) {
+                       struct page *vmpage = cl_page_vmpage(env, clp);
+                       struct page *src_page;
+                       struct page *dst_page;
+                       void       *src;
+                       void       *dst;
+
+                       src_page = (rw == WRITE) ? pages[i] : vmpage;
+                       dst_page = (rw == WRITE) ? vmpage : pages[i];
+
+                       src = ll_kmap_atomic(src_page, KM_USER0);
+                       dst = ll_kmap_atomic(dst_page, KM_USER1);
+                       memcpy(dst, src, min(page_size, size));
+                       ll_kunmap_atomic(dst, KM_USER1);
+                       ll_kunmap_atomic(src, KM_USER0);
+
+                       /* make sure page will be added to the transfer by
+                        * cl_io_submit()->...->vvp_page_prep_write(). */
+                       if (rw == WRITE)
+                               set_page_dirty(vmpage);
+
+                       if (rw == READ) {
+                               /* Do not issue the page for read, since it
+                                * may re-read a ra page which does not have
+                                * the uptodate bit set. */
+                               cl_page_disown(env, io, clp);
+                               do_io = false;
+                       }
+               }
+
+               if (likely(do_io)) {
+                       cl_2queue_add(queue, clp);
+
+                       /*
+                        * Set page clip to tell transfer formation engine
+                        * that page has to be sent even if it is beyond KMS.
+                        */
+                       cl_page_clip(env, clp, 0, min(size, page_size));
+
+                       ++io_pages;
+               }
+
+               /* drop the reference count for cl_page_find */
+               cl_page_put(env, clp);
+               size -= page_size;
+               file_offset += page_size;
+       }
+
+       if (rc == 0 && io_pages) {
+               rc = cl_io_submit_sync(env, io,
+                                      rw == READ ? CRT_READ : CRT_WRITE,
+                                      queue, 0);
+       }
+       if (rc == 0)
+               rc = pv->ldp_size;
+
+       cl_2queue_discard(env, io, queue);
+       cl_2queue_disown(env, io, queue);
+       cl_2queue_fini(env, queue);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ll_direct_rw_pages);
+
+static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
+                                  int rw, struct inode *inode,
+                                  struct address_space *mapping,
+                                  size_t size, loff_t file_offset,
+                                  struct page **pages, int page_count)
+{
+    struct ll_dio_pages pvec = { .ldp_pages    = pages,
+                                .ldp_nr           = page_count,
+                                .ldp_size       = size,
+                                .ldp_offsets      = NULL,
+                                .ldp_start_offset = file_offset
+                              };
+
+    return ll_direct_rw_pages(env, io, rw, inode, &pvec);
+}
+
+#ifdef KMALLOC_MAX_SIZE
+#define MAX_MALLOC KMALLOC_MAX_SIZE
+#else
+#define MAX_MALLOC (128 * 1024)
+#endif
+
+/* This is the maximum size of a single O_DIRECT request, based on the
+ * kmalloc limit.  We need to fit all of the brw_page structs, each one
+ * representing PAGE_SIZE worth of user data, into a single buffer, and
+ * then truncate this to be a full-sized RPC.  For 4kB PAGE_SIZE this is
+ * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */
+#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \
+                     ~(DT_MAX_BRW_SIZE - 1))
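+/* E.g.: assuming sizeof(struct brw_page) is 24 bytes (hypothetical), a 128kB
+ * kmalloc fits 131072 / 24 = 5461 brw_page entries, describing roughly 22MB
+ * of 4kB pages, which is then rounded down to a DT_MAX_BRW_SIZE multiple. */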
+static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
+                              const struct iovec *iov, loff_t file_offset,
+                              unsigned long nr_segs)
+{
+       struct lu_env *env;
+       struct cl_io *io;
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       struct ccc_object *obj = cl_inode2ccc(inode);
+       long count = iov_length(iov, nr_segs);
+       long tot_bytes = 0, result = 0;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       unsigned long seg = 0;
+       long size = MAX_DIO_SIZE;
+       int refcheck;
+       ENTRY;
+
+       if (!lli->lli_has_smd)
+               RETURN(-EBADF);
+
+       /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
+       if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), size=%lu (max %lu), "
+              "offset=%lld=%llx, pages %lu (max %lu)\n",
+              inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE,
+              file_offset, file_offset, count >> PAGE_CACHE_SHIFT,
+              MAX_DIO_SIZE >> PAGE_CACHE_SHIFT);
+
+       /* Check that all user buffers are aligned as well */
+       for (seg = 0; seg < nr_segs; seg++) {
+               if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) ||
+                   (iov[seg].iov_len & ~CFS_PAGE_MASK))
+                       RETURN(-EINVAL);
+       }
+
+       env = cl_env_get(&refcheck);
+       LASSERT(!IS_ERR(env));
+       io = ccc_env_io(env)->cui_cl.cis_io;
+       LASSERT(io != NULL);
+
+       /* 0. Need locking between buffered and direct access, and against
+        *    races with size changes by concurrent truncates and writes.
+        * 1. Need the inode mutex to operate on transient pages.
+        */
+       if (rw == READ)
+               mutex_lock(&inode->i_mutex);
+
+       LASSERT(obj->cob_transient_pages == 0);
+       for (seg = 0; seg < nr_segs; seg++) {
+               long iov_left = iov[seg].iov_len;
+               unsigned long user_addr = (unsigned long)iov[seg].iov_base;
+
+               if (rw == READ) {
+                       if (file_offset >= i_size_read(inode))
+                               break;
+                       if (file_offset + iov_left > i_size_read(inode))
+                               iov_left = i_size_read(inode) - file_offset;
+               }
+
+               while (iov_left > 0) {
+                       struct page **pages;
+                       int page_count, max_pages = 0;
+                       long bytes;
+
+                       bytes = min(size, iov_left);
+                       page_count = ll_get_user_pages(rw, user_addr, bytes,
+                                                      &pages, &max_pages);
+                       if (likely(page_count > 0)) {
+                               if (unlikely(page_count <  max_pages))
+                                       bytes = page_count << PAGE_CACHE_SHIFT;
+                               result = ll_direct_IO_26_seg(env, io, rw, inode,
+                                                            file->f_mapping,
+                                                            bytes, file_offset,
+                                                            pages, page_count);
+                               ll_free_user_pages(pages, max_pages, rw==READ);
+                       } else if (page_count == 0) {
+                               GOTO(out, result = -EFAULT);
+                       } else {
+                               result = page_count;
+                       }
+                       if (unlikely(result <= 0)) {
+                               /* If we can't allocate a large enough buffer
+                                * for the request, shrink it to a smaller
+                                * PAGE_SIZE multiple and try again.
+                                * We should always be able to kmalloc for a
+                                * page worth of page pointers = 4MB on i386. */
+                               if (result == -ENOMEM &&
+                                   size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
+                                          PAGE_CACHE_SIZE) {
+                                       size = ((((size / 2) - 1) |
+                                                ~CFS_PAGE_MASK) + 1) &
+                                               CFS_PAGE_MASK;
+                                       CDEBUG(D_VFSTRACE,"DIO size now %lu\n",
+                                              size);
+                                       continue;
+                               }
+
+                               GOTO(out, result);
+                       }
+
+                       tot_bytes += result;
+                       file_offset += result;
+                       iov_left -= result;
+                       user_addr += result;
+               }
+       }
+out:
+       LASSERT(obj->cob_transient_pages == 0);
+       if (rw == READ)
+               mutex_unlock(&inode->i_mutex);
+
+       if (tot_bytes > 0) {
+               if (rw == WRITE) {
+                       struct lov_stripe_md *lsm;
+
+                       lsm = ccc_inode_lsm_get(inode);
+                       LASSERT(lsm != NULL);
+                       lov_stripe_lock(lsm);
+                       obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0);
+                       lov_stripe_unlock(lsm);
+                       ccc_inode_lsm_put(inode, lsm);
+               }
+       }
+
+       cl_env_put(env, &refcheck);
+       RETURN(tot_bytes ? : result);
+}
+
+static int ll_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata)
+{
+       pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+       struct page *page;
+       int rc;
+       unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+       ENTRY;
+
+       page = grab_cache_page_write_begin(mapping, index, flags);
+       if (!page)
+               RETURN(-ENOMEM);
+
+       *pagep = page;
+
+       rc = ll_prepare_write(file, page, from, from + len);
+       if (rc) {
+               unlock_page(page);
+               page_cache_release(page);
+       }
+       RETURN(rc);
+}
+
+static int ll_write_end(struct file *file, struct address_space *mapping,
+                       loff_t pos, unsigned len, unsigned copied,
+                       struct page *page, void *fsdata)
+{
+       unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+       int rc;
+
+       rc = ll_commit_write(file, page, from, from + copied);
+       unlock_page(page);
+       page_cache_release(page);
+
+       return rc ?: copied;
+}
+
+#ifdef CONFIG_MIGRATION
+int ll_migratepage(struct address_space *mapping,
+                  struct page *newpage, struct page *page,
+                  enum migrate_mode mode)
+{
+       /* Always fail page migration until we have a proper implementation */
+       return -EIO;
+}
+#endif
+
+#ifndef MS_HAS_NEW_AOPS
+struct address_space_operations ll_aops = {
+       .readpage       = ll_readpage,
+//     .readpages      = ll_readpages,
+       .direct_IO      = ll_direct_IO_26,
+       .writepage      = ll_writepage,
+       .writepages     = ll_writepages,
+       .set_page_dirty = ll_set_page_dirty,
+       .write_begin    = ll_write_begin,
+       .write_end      = ll_write_end,
+       .invalidatepage = ll_invalidatepage,
+       .releasepage    = (void *)ll_releasepage,
+#ifdef CONFIG_MIGRATION
+       .migratepage    = ll_migratepage,
+#endif
+       .bmap      = NULL
+};
+#else
+struct address_space_operations_ext ll_aops = {
+       .orig_aops.readpage       = ll_readpage,
+//     .orig_aops.readpages      = ll_readpages,
+       .orig_aops.direct_IO      = ll_direct_IO_26,
+       .orig_aops.writepage      = ll_writepage,
+       .orig_aops.writepages     = ll_writepages,
+       .orig_aops.set_page_dirty = ll_set_page_dirty,
+       .orig_aops.prepare_write  = ll_prepare_write,
+       .orig_aops.commit_write   = ll_commit_write,
+       .orig_aops.invalidatepage = ll_invalidatepage,
+       .orig_aops.releasepage    = ll_releasepage,
+#ifdef CONFIG_MIGRATION
+       .orig_aops.migratepage    = ll_migratepage,
+#endif
+       .orig_aops.bmap    = NULL,
+       .write_begin    = ll_write_begin,
+       .write_end      = ll_write_end
+};
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c
new file mode 100644 (file)
index 0000000..7747f8f
--- /dev/null
@@ -0,0 +1,1722 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include "llite_internal.h"
+
+#define SA_OMITTED_ENTRY_MAX 8ULL
+
+typedef enum {
+       /** negative values are for error cases */
+       SA_ENTRY_INIT = 0,      /** init entry */
+       SA_ENTRY_SUCC = 1,      /** stat succeeded */
+       SA_ENTRY_INVA = 2,      /** invalid entry */
+       SA_ENTRY_DEST = 3,      /** entry to be destroyed */
+} se_stat_t;
+
+struct ll_sa_entry {
+       /* link into sai->sai_entries */
+       struct list_head              se_link;
+       /* link into sai->sai_entries_{received,stated} */
+       struct list_head              se_list;
+       /* link into sai hash table locally */
+       struct list_head              se_hash;
+       /* entry reference count */
+       atomic_t            se_refcount;
+       /* entry index in the sai */
+       __u64              se_index;
+       /* low layer ldlm lock handle */
+       __u64              se_handle;
+       /* entry status */
+       se_stat_t              se_stat;
+       /* entry size, contains name */
+       int                  se_size;
+       /* pointer to async getattr enqueue info */
+       struct md_enqueue_info *se_minfo;
+       /* pointer to the async getattr request */
+       struct ptlrpc_request  *se_req;
+       /* pointer to the target inode */
+       struct inode       *se_inode;
+       /* entry name */
+       struct qstr          se_qstr;
+};
+
+static unsigned int sai_generation = 0;
+static DEFINE_SPINLOCK(sai_generation_lock);
+
+static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry)
+{
+       return list_empty(&entry->se_hash);
+}
+
+/*
+ * The entry can only be released by the caller; it is necessary to hold the lock.
+ */
+static inline int ll_sa_entry_stated(struct ll_sa_entry *entry)
+{
+       smp_rmb();
+       return (entry->se_stat != SA_ENTRY_INIT);
+}
+
+static inline int ll_sa_entry_hash(int val)
+{
+       return val & LL_SA_CACHE_MASK;
+}
+
+/*
+ * Insert the entry into the SA hash table.
+ */
+static inline void
+ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       int i = ll_sa_entry_hash(entry->se_qstr.hash);
+
+       spin_lock(&sai->sai_cache_lock[i]);
+       list_add_tail(&entry->se_hash, &sai->sai_cache[i]);
+       spin_unlock(&sai->sai_cache_lock[i]);
+}
+
+/*
+ * Remove entry from SA table.
+ */
+static inline void
+ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       int i = ll_sa_entry_hash(entry->se_qstr.hash);
+
+       spin_lock(&sai->sai_cache_lock[i]);
+       list_del_init(&entry->se_hash);
+       spin_unlock(&sai->sai_cache_lock[i]);
+}
+
+static inline int agl_should_run(struct ll_statahead_info *sai,
+                                struct inode *inode)
+{
+       return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid);
+}
+
+static inline struct ll_sa_entry *
+sa_first_received_entry(struct ll_statahead_info *sai)
+{
+       return list_entry(sai->sai_entries_received.next,
+                             struct ll_sa_entry, se_list);
+}
+
+static inline struct ll_inode_info *
+agl_first_entry(struct ll_statahead_info *sai)
+{
+       return list_entry(sai->sai_entries_agl.next,
+                             struct ll_inode_info, lli_agl_list);
+}
+
+static inline int sa_sent_full(struct ll_statahead_info *sai)
+{
+       return atomic_read(&sai->sai_cache_count) >= sai->sai_max;
+}
+
+static inline int sa_received_empty(struct ll_statahead_info *sai)
+{
+       return list_empty(&sai->sai_entries_received);
+}
+
+static inline int agl_list_empty(struct ll_statahead_info *sai)
+{
+       return list_empty(&sai->sai_entries_agl);
+}
+
+/**
+ * The hit ratio is considered low when either
+ * (1) the hit ratio is less than 80%, or
+ * (2) there are more than 8 consecutive misses.
+ */
+static inline int sa_low_hit(struct ll_statahead_info *sai)
+{
+       return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) ||
+               (sai->sai_consecutive_miss > 8));
+}
+
+/*
+ * If the given index falls behind the statahead window by more than
+ * SA_OMITTED_ENTRY_MAX, then it is old.
+ */
+static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index)
+{
+       return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX <
+                sai->sai_index);
+}
+
+/*
+ * Insert the new entry at the tail of sai_entries during initialization.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index,
+                 const char *name, int len)
+{
+       struct ll_inode_info *lli;
+       struct ll_sa_entry   *entry;
+       int                entry_size;
+       char             *dname;
+       ENTRY;
+
+       entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4;
+       OBD_ALLOC(entry, entry_size);
+       if (unlikely(entry == NULL))
+               RETURN(ERR_PTR(-ENOMEM));
+
+       CDEBUG(D_READA, "alloc sa entry %.*s(%p) index "LPU64"\n",
+              len, name, entry, index);
+
+       entry->se_index = index;
+
+       /*
+        * Statahead entry reference rules:
+        *
+        * 1) When a statahead entry is initialized, its reference count is
+        *    set to 2. One reference is used by the directory scanner: when
+        *    the scanner searches the statahead cache for a given name, it
+        *    can perform a lockless hash lookup (only the scanner can remove
+        *    an entry from the hash list), and once the entry is found it
+        *    need not call "atomic_inc()" on the entry reference, which
+        *    improves performance. After using the statahead entry, the
+        *    scanner calls "atomic_dec()" to drop the reference held since
+        *    initialization. If it is the last reference, the statahead
+        *    entry is freed.
+        *
+        * 2) All other threads, including the statahead thread and the
+        *    ptlrpcd thread, must hold a reference on the entry while
+        *    processing it, to guarantee that it will not be released by the
+        *    directory scanner. After processing the entry, these threads
+        *    drop their reference. If it is the last reference, the entry is
+        *    freed.
+        *
+        *    The second reference taken when the statahead entry is
+        *    initialized is used by the statahead thread, following rule 2).
+        */
+       atomic_set(&entry->se_refcount, 2);
+       entry->se_stat = SA_ENTRY_INIT;
+       entry->se_size = entry_size;
+       dname = (char *)entry + sizeof(struct ll_sa_entry);
+       memcpy(dname, name, len);
+       dname[len] = 0;
+       entry->se_qstr.hash = full_name_hash(name, len);
+       entry->se_qstr.len = len;
+       entry->se_qstr.name = dname;
+
+       lli = ll_i2info(sai->sai_inode);
+       spin_lock(&lli->lli_sa_lock);
+       list_add_tail(&entry->se_link, &sai->sai_entries);
+       INIT_LIST_HEAD(&entry->se_list);
+       ll_sa_entry_enhash(sai, entry);
+       spin_unlock(&lli->lli_sa_lock);
+
+       atomic_inc(&sai->sai_cache_count);
+
+       RETURN(entry);
+}
+
+/*
+ * Used by the directory scanner to look up an entry by name.
+ *
+ * Only the caller can remove the entry from the hash, so it is unnecessary to
+ * hold the hash lock. It is the caller's duty to release the initial refcount
+ * on the entry, so it is also unnecessary to increase the refcount here.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr)
+{
+       struct ll_sa_entry *entry;
+       int i = ll_sa_entry_hash(qstr->hash);
+
+       list_for_each_entry(entry, &sai->sai_cache[i], se_hash) {
+               if (entry->se_qstr.hash == qstr->hash &&
+                   entry->se_qstr.len == qstr->len &&
+                   memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0)
+                       return entry;
+       }
+       return NULL;
+}
+
+/*
+ * Used by the async getattr request callback to find an entry by index.
+ *
+ * Called with lli_sa_lock held to prevent others from changing the list
+ * during the search. The entry refcount is increased before returning to
+ * guarantee that the entry cannot be freed by others.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index)
+{
+       struct ll_sa_entry *entry;
+
+       list_for_each_entry(entry, &sai->sai_entries, se_link) {
+               if (entry->se_index == index) {
+                       LASSERT(atomic_read(&entry->se_refcount) > 0);
+                       atomic_inc(&entry->se_refcount);
+                       return entry;
+               }
+               if (entry->se_index > index)
+                       break;
+       }
+       return NULL;
+}
+
+static void ll_sa_entry_cleanup(struct ll_statahead_info *sai,
+                                struct ll_sa_entry *entry)
+{
+       struct md_enqueue_info *minfo = entry->se_minfo;
+       struct ptlrpc_request  *req   = entry->se_req;
+
+       if (minfo) {
+               entry->se_minfo = NULL;
+               ll_intent_release(&minfo->mi_it);
+               iput(minfo->mi_dir);
+               OBD_FREE_PTR(minfo);
+       }
+
+       if (req) {
+               entry->se_req = NULL;
+               ptlrpc_req_finished(req);
+       }
+}
+
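+/*
+ * Drop one reference on the entry. The last put releases any pending
+ * intent/request and inode, then frees the entry itself.
+ */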
+static void ll_sa_entry_put(struct ll_statahead_info *sai,
+                            struct ll_sa_entry *entry)
+{
+       if (atomic_dec_and_test(&entry->se_refcount)) {
+               CDEBUG(D_READA, "free sa entry %.*s(%p) index "LPU64"\n",
+                      entry->se_qstr.len, entry->se_qstr.name, entry,
+                      entry->se_index);
+
+               LASSERT(list_empty(&entry->se_link));
+               LASSERT(list_empty(&entry->se_list));
+               LASSERT(ll_sa_entry_unhashed(entry));
+
+               ll_sa_entry_cleanup(sai, entry);
+               if (entry->se_inode)
+                       iput(entry->se_inode);
+
+               OBD_FREE(entry, entry->se_size);
+               atomic_dec(&sai->sai_cache_count);
+       }
+}
+
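+/*
+ * Unhash the entry, mark it SA_ENTRY_DEST and unlink it from the sai lists
+ * under lli_sa_lock, then drop a reference on it.
+ */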
+static inline void
+do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+       LASSERT(!ll_sa_entry_unhashed(entry));
+       LASSERT(!list_empty(&entry->se_link));
+
+       ll_sa_entry_unhash(sai, entry);
+
+       spin_lock(&lli->lli_sa_lock);
+       entry->se_stat = SA_ENTRY_DEST;
+       list_del_init(&entry->se_link);
+       if (likely(!list_empty(&entry->se_list)))
+               list_del_init(&entry->se_list);
+       spin_unlock(&lli->lli_sa_lock);
+
+       ll_sa_entry_put(sai, entry);
+}
+
+/*
+ * Delete the entry from the sai_entries_stated list at fini time, and also
+ * drop entries that have fallen behind the statahead window.
+ */
+static void
+ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       struct ll_sa_entry *pos, *next;
+
+       if (entry)
+               do_sa_entry_fini(sai, entry);
+
+       /* drop old entry, only 'scanner' process does this, no need to lock */
+       list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) {
+               if (!is_omitted_entry(sai, pos->se_index))
+                       break;
+               do_sa_entry_fini(sai, pos);
+       }
+}
+
+/*
+ * Called with lli_sa_lock held.
+ */
+static void
+do_sa_entry_to_stated(struct ll_statahead_info *sai,
+                     struct ll_sa_entry *entry, se_stat_t stat)
+{
+       struct ll_sa_entry *se;
+       struct list_head         *pos = &sai->sai_entries_stated;
+
+       if (!list_empty(&entry->se_list))
+               list_del_init(&entry->se_list);
+
+       list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) {
+               if (se->se_index < entry->se_index) {
+                       pos = &se->se_list;
+                       break;
+               }
+       }
+
+       list_add(&entry->se_list, pos);
+       entry->se_stat = stat;
+}
+
+/*
+ * Move the entry to sai_entries_stated, keeping the list sorted by index.
+ * \retval 1    -- entry to be destroyed.
+ * \retval 0    -- entry is inserted into stated list.
+ */
+static int
+ll_sa_entry_to_stated(struct ll_statahead_info *sai,
+                     struct ll_sa_entry *entry, se_stat_t stat)
+{
+       struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+       int                ret = 1;
+
+       ll_sa_entry_cleanup(sai, entry);
+
+       spin_lock(&lli->lli_sa_lock);
+       if (likely(entry->se_stat != SA_ENTRY_DEST)) {
+               do_sa_entry_to_stated(sai, entry, stat);
+               ret = 0;
+       }
+       spin_unlock(&lli->lli_sa_lock);
+
+       return ret;
+}
+
+/*
+ * Insert inode into the list of sai_entries_agl.
+ */
+static void ll_agl_add(struct ll_statahead_info *sai,
+                      struct inode *inode, int index)
+{
+       struct ll_inode_info *child  = ll_i2info(inode);
+       struct ll_inode_info *parent = ll_i2info(sai->sai_inode);
+       int                added  = 0;
+
+       spin_lock(&child->lli_agl_lock);
+       if (child->lli_agl_index == 0) {
+               child->lli_agl_index = index;
+               spin_unlock(&child->lli_agl_lock);
+
+               LASSERT(list_empty(&child->lli_agl_list));
+
+               igrab(inode);
+               spin_lock(&parent->lli_agl_lock);
+               if (agl_list_empty(sai))
+                       added = 1;
+               list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl);
+               spin_unlock(&parent->lli_agl_lock);
+       } else {
+               spin_unlock(&child->lli_agl_lock);
+       }
+
+       if (added > 0)
+               wake_up(&sai->sai_agl_thread.t_ctl_waitq);
+}
+
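+/*
+ * Allocate and initialize a statahead info structure with a single
+ * reference and a fresh, non-zero generation number.
+ */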
+static struct ll_statahead_info *ll_sai_alloc(void)
+{
+       struct ll_statahead_info *sai;
+       int                    i;
+       ENTRY;
+
+       OBD_ALLOC_PTR(sai);
+       if (!sai)
+               RETURN(NULL);
+
+       atomic_set(&sai->sai_refcount, 1);
+
+       spin_lock(&sai_generation_lock);
+       sai->sai_generation = ++sai_generation;
+       if (unlikely(sai_generation == 0))
+               sai->sai_generation = ++sai_generation;
+       spin_unlock(&sai_generation_lock);
+
+       sai->sai_max = LL_SA_RPC_MIN;
+       sai->sai_index = 1;
+       init_waitqueue_head(&sai->sai_waitq);
+       init_waitqueue_head(&sai->sai_thread.t_ctl_waitq);
+       init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq);
+
+       INIT_LIST_HEAD(&sai->sai_entries);
+       INIT_LIST_HEAD(&sai->sai_entries_received);
+       INIT_LIST_HEAD(&sai->sai_entries_stated);
+       INIT_LIST_HEAD(&sai->sai_entries_agl);
+
+       for (i = 0; i < LL_SA_CACHE_SIZE; i++) {
+               INIT_LIST_HEAD(&sai->sai_cache[i]);
+               spin_lock_init(&sai->sai_cache_lock[i]);
+       }
+       atomic_set(&sai->sai_cache_count, 0);
+
+       RETURN(sai);
+}
+
+static inline struct ll_statahead_info *
+ll_sai_get(struct ll_statahead_info *sai)
+{
+       atomic_inc(&sai->sai_refcount);
+       return sai;
+}
+
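+/*
+ * Release a reference on the sai. The last put detaches it from the
+ * directory's ll_inode_info, finalizes all remaining entries and frees it.
+ */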
+static void ll_sai_put(struct ll_statahead_info *sai)
+{
+       struct inode     *inode = sai->sai_inode;
+       struct ll_inode_info *lli   = ll_i2info(inode);
+       ENTRY;
+
+       if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
+               struct ll_sa_entry *entry, *next;
+
+               if (unlikely(atomic_read(&sai->sai_refcount) > 0)) {
+                       /* This is a race case: the interpret callback just
+                        * took a reference count. */
+                       spin_unlock(&lli->lli_sa_lock);
+                       RETURN_EXIT;
+               }
+
+               LASSERT(lli->lli_opendir_key == NULL);
+               LASSERT(thread_is_stopped(&sai->sai_thread));
+               LASSERT(thread_is_stopped(&sai->sai_agl_thread));
+
+               lli->lli_sai = NULL;
+               lli->lli_opendir_pid = 0;
+               spin_unlock(&lli->lli_sa_lock);
+
+               if (sai->sai_sent > sai->sai_replied)
+                       CDEBUG(D_READA,"statahead for dir "DFID" does not "
+                             "finish: [sent:"LPU64"] [replied:"LPU64"]\n",
+                             PFID(&lli->lli_fid),
+                             sai->sai_sent, sai->sai_replied);
+
+               list_for_each_entry_safe(entry, next,
+                                            &sai->sai_entries, se_link)
+                       do_sa_entry_fini(sai, entry);
+
+               LASSERT(list_empty(&sai->sai_entries));
+               LASSERT(sa_received_empty(sai));
+               LASSERT(list_empty(&sai->sai_entries_stated));
+
+               LASSERT(atomic_read(&sai->sai_cache_count) == 0);
+               LASSERT(agl_list_empty(sai));
+
+               iput(inode);
+               OBD_FREE_PTR(sai);
+       }
+
+       EXIT;
+}
+
+/* Do NOT forget to drop the inode refcount that was taken when the inode
+ * was added to sai_entries_agl. */
+static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
+{
+       struct ll_inode_info *lli   = ll_i2info(inode);
+       __u64            index = lli->lli_agl_index;
+       int                rc;
+       ENTRY;
+
+       LASSERT(list_empty(&lli->lli_agl_list));
+
+       /* AGL may fall behind statahead by one entry */
+       if (is_omitted_entry(sai, index + 1)) {
+               lli->lli_agl_index = 0;
+               iput(inode);
+               RETURN_EXIT;
+       }
+
+       /* Someone is in glimpse (sync or async), do nothing. */
+       rc = down_write_trylock(&lli->lli_glimpse_sem);
+       if (rc == 0) {
+               lli->lli_agl_index = 0;
+               iput(inode);
+               RETURN_EXIT;
+       }
+
+       /*
+        * Someone triggered a glimpse within the last second.
+        * 1) The former glimpse succeeded with a glimpse lock granted by the
+        *    OST; if the lock is still cached on the client, AGL needs to do
+        *    nothing. If it was cancelled by another client, AGL may not be
+        *    able to obtain a new lock, since no glimpse callback is
+        *    triggered by AGL.
+        * 2) The former glimpse succeeded, but the OST did not grant a
+        *    glimpse lock. In that case it is quite possible that the OST
+        *    will not grant a glimpse lock for AGL either.
+        * 3) The former glimpse failed. Compared with the other two cases
+        *    this is relatively rare; AGL can ignore it without much effect
+        *    on performance.
+        */
+       if (lli->lli_glimpse_time != 0 &&
+           cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) {
+               up_write(&lli->lli_glimpse_sem);
+               lli->lli_agl_index = 0;
+               iput(inode);
+               RETURN_EXIT;
+       }
+
+       CDEBUG(D_READA, "Handling (init) async glimpse: inode = "
+              DFID", idx = "LPU64"\n", PFID(&lli->lli_fid), index);
+
+       cl_agl(inode);
+       lli->lli_agl_index = 0;
+       lli->lli_glimpse_time = cfs_time_current();
+       up_write(&lli->lli_glimpse_sem);
+
+       CDEBUG(D_READA, "Handled (init) async glimpse: inode = "
+              DFID", idx = "LPU64", rc = %d\n",
+              PFID(&lli->lli_fid), index, rc);
+
+       iput(inode);
+
+       EXIT;
+}
+
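+/*
+ * Take the first entry off sai_entries_received, instantiate its inode from
+ * the async getattr reply, then move the entry to the stated list and wake
+ * up the waiter for it, if any.
+ */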
+static void ll_post_statahead(struct ll_statahead_info *sai)
+{
+       struct inode       *dir   = sai->sai_inode;
+       struct inode       *child;
+       struct ll_inode_info   *lli   = ll_i2info(dir);
+       struct ll_sa_entry     *entry;
+       struct md_enqueue_info *minfo;
+       struct lookup_intent   *it;
+       struct ptlrpc_request  *req;
+       struct mdt_body *body;
+       int                  rc    = 0;
+       ENTRY;
+
+       spin_lock(&lli->lli_sa_lock);
+       if (unlikely(sa_received_empty(sai))) {
+               spin_unlock(&lli->lli_sa_lock);
+               RETURN_EXIT;
+       }
+       entry = sa_first_received_entry(sai);
+       atomic_inc(&entry->se_refcount);
+       list_del_init(&entry->se_list);
+       spin_unlock(&lli->lli_sa_lock);
+
+       LASSERT(entry->se_handle != 0);
+
+       minfo = entry->se_minfo;
+       it = &minfo->mi_it;
+       req = entry->se_req;
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       child = entry->se_inode;
+       if (child == NULL) {
+               /*
+                * lookup.
+                */
+               LASSERT(fid_is_zero(&minfo->mi_data.op_fid2));
+
+               /* XXX: No fid in reply, this is probably a cross-ref case.
+                * SA can't handle it yet. */
+               if (body->valid & OBD_MD_MDS)
+                       GOTO(out, rc = -EAGAIN);
+       } else {
+               /*
+                * revalidate.
+                */
+               /* unlinked and re-created with the same name */
+               if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))){
+                       entry->se_inode = NULL;
+                       iput(child);
+                       child = NULL;
+               }
+       }
+
+       it->d.lustre.it_lock_handle = entry->se_handle;
+       rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL);
+       if (rc != 1)
+               GOTO(out, rc = -EAGAIN);
+
+       rc = ll_prep_inode(&child, req, dir->i_sb, it);
+       if (rc)
+               GOTO(out, rc);
+
+       CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+              child, child->i_ino, child->i_generation);
+       ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL);
+
+       entry->se_inode = child;
+
+       if (agl_should_run(sai, child))
+               ll_agl_add(sai, child, entry->se_index);
+
+       EXIT;
+
+out:
+       /* "ll_sa_entry_to_stated()" will drop the related ldlm ibits lock
+        * reference by calling "ll_intent_drop_lock()", whether or not the
+        * above operations failed. Do not worry about calling
+        * "ll_intent_drop_lock()" more than once. */
+       rc = ll_sa_entry_to_stated(sai, entry,
+                                  rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+       if (rc == 0 && entry->se_index == sai->sai_index_wait)
+               wake_up(&sai->sai_waitq);
+       ll_sa_entry_put(sai, entry);
+}
+
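+/*
+ * Completion callback of the async getattr RPC: stash the request and intent
+ * in the matching sa entry (or mark the entry invalid on error) and wake the
+ * statahead thread when needed.
+ */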
+static int ll_statahead_interpret(struct ptlrpc_request *req,
+                                 struct md_enqueue_info *minfo, int rc)
+{
+       struct lookup_intent     *it  = &minfo->mi_it;
+       struct inode         *dir = minfo->mi_dir;
+       struct ll_inode_info     *lli = ll_i2info(dir);
+       struct ll_statahead_info *sai = NULL;
+       struct ll_sa_entry       *entry;
+       int                    wakeup;
+       ENTRY;
+
+       if (it_disposition(it, DISP_LOOKUP_NEG))
+               rc = -ENOENT;
+
+       spin_lock(&lli->lli_sa_lock);
+       /* stale entry */
+       if (unlikely(lli->lli_sai == NULL ||
+                    lli->lli_sai->sai_generation != minfo->mi_generation)) {
+               spin_unlock(&lli->lli_sa_lock);
+               GOTO(out, rc = -ESTALE);
+       } else {
+               sai = ll_sai_get(lli->lli_sai);
+               if (unlikely(!thread_is_running(&sai->sai_thread))) {
+                       sai->sai_replied++;
+                       spin_unlock(&lli->lli_sa_lock);
+                       GOTO(out, rc = -EBADFD);
+               }
+
+               entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata);
+               if (entry == NULL) {
+                       sai->sai_replied++;
+                       spin_unlock(&lli->lli_sa_lock);
+                       GOTO(out, rc = -EIDRM);
+               }
+
+               if (rc != 0) {
+                       do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA);
+                       wakeup = (entry->se_index == sai->sai_index_wait);
+               } else {
+                       entry->se_minfo = minfo;
+                       entry->se_req = ptlrpc_request_addref(req);
+                       /* Release the async ibits lock ASAP to avoid deadlock
+                        * when the statahead thread enqueues a lock on the
+                        * parent for readpage while another thread enqueues a
+                        * lock on the child with the parent's lock held, for
+                        * example: unlink. */
+                       entry->se_handle = it->d.lustre.it_lock_handle;
+                       ll_intent_drop_lock(it);
+                       wakeup = sa_received_empty(sai);
+                       list_add_tail(&entry->se_list,
+                                         &sai->sai_entries_received);
+               }
+               sai->sai_replied++;
+               spin_unlock(&lli->lli_sa_lock);
+
+               ll_sa_entry_put(sai, entry);
+               if (wakeup)
+                       wake_up(&sai->sai_thread.t_ctl_waitq);
+       }
+
+       EXIT;
+
+out:
+       if (rc != 0) {
+               ll_intent_release(it);
+               iput(dir);
+               OBD_FREE_PTR(minfo);
+       }
+       if (sai != NULL)
+               ll_sai_put(sai);
+       return rc;
+}
+
+static void sa_args_fini(struct md_enqueue_info *minfo,
+                        struct ldlm_enqueue_info *einfo)
+{
+       LASSERT(minfo && einfo);
+       iput(minfo->mi_dir);
+       capa_put(minfo->mi_data.op_capa1);
+       capa_put(minfo->mi_data.op_capa2);
+       OBD_FREE_PTR(minfo);
+       OBD_FREE_PTR(einfo);
+}
+
+/**
+ * There is a race condition between "capa_put" and "ll_statahead_interpret"
+ * when accessing "op_data.op_capa[1,2]":
+ * "capa_put" releases the "op_data.op_capa[1,2]" reference count after
+ * calling "md_intent_getattr_async". But "ll_statahead_interpret" may run
+ * first and fill "op_data.op_capa[1,2]" with POISON, causing "capa_put" to
+ * access an invalid "ocapa". So reserve "op_data.op_capa[1,2]" in "pcapa"
+ * here before calling "md_intent_getattr_async".
+ */
+static int sa_args_init(struct inode *dir, struct inode *child,
+                       struct ll_sa_entry *entry, struct md_enqueue_info **pmi,
+                       struct ldlm_enqueue_info **pei,
+                       struct obd_capa **pcapa)
+{
+       struct qstr           *qstr = &entry->se_qstr;
+       struct ll_inode_info     *lli  = ll_i2info(dir);
+       struct md_enqueue_info   *minfo;
+       struct ldlm_enqueue_info *einfo;
+       struct md_op_data       *op_data;
+
+       OBD_ALLOC_PTR(einfo);
+       if (einfo == NULL)
+               return -ENOMEM;
+
+       OBD_ALLOC_PTR(minfo);
+       if (minfo == NULL) {
+               OBD_FREE_PTR(einfo);
+               return -ENOMEM;
+       }
+
+       op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name,
+                                    qstr->len, 0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data)) {
+               OBD_FREE_PTR(einfo);
+               OBD_FREE_PTR(minfo);
+               return PTR_ERR(op_data);
+       }
+
+       minfo->mi_it.it_op = IT_GETATTR;
+       minfo->mi_dir = igrab(dir);
+       minfo->mi_cb = ll_statahead_interpret;
+       minfo->mi_generation = lli->lli_sai->sai_generation;
+       minfo->mi_cbdata = entry->se_index;
+
+       einfo->ei_type   = LDLM_IBITS;
+       einfo->ei_mode   = it_to_lock_mode(&minfo->mi_it);
+       einfo->ei_cb_bl  = ll_md_blocking_ast;
+       einfo->ei_cb_cp  = ldlm_completion_ast;
+       einfo->ei_cb_gl  = NULL;
+       einfo->ei_cbdata = NULL;
+
+       *pmi = minfo;
+       *pei = einfo;
+       pcapa[0] = op_data->op_capa1;
+       pcapa[1] = op_data->op_capa2;
+
+       return 0;
+}
+
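+/*
+ * Issue an async getattr request for a name that has no dentry in the
+ * dcache yet.
+ */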
+static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry)
+{
+       struct md_enqueue_info   *minfo;
+       struct ldlm_enqueue_info *einfo;
+       struct obd_capa   *capas[2];
+       int                    rc;
+       ENTRY;
+
+       rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas);
+       if (rc)
+               RETURN(rc);
+
+       rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+       if (!rc) {
+               capa_put(capas[0]);
+               capa_put(capas[1]);
+       } else {
+               sa_args_fini(minfo, einfo);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Similar to ll_revalidate_it().
+ * \retval      1 -- dentry valid
+ * \retval      0 -- will send stat-ahead request
+ * \retval others -- preparing the stat-ahead request failed
+ */
+static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry,
+                           struct dentry *dentry)
+{
+       struct inode         *inode = dentry->d_inode;
+       struct lookup_intent      it = { .it_op = IT_GETATTR,
+                                        .d.lustre.it_lock_handle = 0 };
+       struct md_enqueue_info   *minfo;
+       struct ldlm_enqueue_info *einfo;
+       struct obd_capa   *capas[2];
+       int rc;
+       ENTRY;
+
+       if (unlikely(inode == NULL))
+               RETURN(1);
+
+       if (d_mountpoint(dentry))
+               RETURN(1);
+
+       if (unlikely(dentry == dentry->d_sb->s_root))
+               RETURN(1);
+
+       entry->se_inode = igrab(inode);
+       rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),NULL);
+       if (rc == 1) {
+               entry->se_handle = it.d.lustre.it_lock_handle;
+               ll_intent_release(&it);
+               RETURN(1);
+       }
+
+       rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas);
+       if (rc) {
+               entry->se_inode = NULL;
+               iput(inode);
+               RETURN(rc);
+       }
+
+       rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+       if (!rc) {
+               capa_put(capas[0]);
+               capa_put(capas[1]);
+       } else {
+               entry->se_inode = NULL;
+               iput(inode);
+               sa_args_fini(minfo, einfo);
+       }
+
+       RETURN(rc);
+}
+
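+/*
+ * Stat-ahead one directory entry: allocate an sa entry, then either send an
+ * async lookup (no cached dentry) or revalidate the existing dentry, and
+ * account for the result.
+ */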
+static void ll_statahead_one(struct dentry *parent, const char* entry_name,
+                            int entry_name_len)
+{
+       struct inode         *dir    = parent->d_inode;
+       struct ll_inode_info     *lli    = ll_i2info(dir);
+       struct ll_statahead_info *sai    = lli->lli_sai;
+       struct dentry       *dentry = NULL;
+       struct ll_sa_entry       *entry;
+       int                    rc;
+       int                    rc1;
+       ENTRY;
+
+       entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name,
+                                 entry_name_len);
+       if (IS_ERR(entry))
+               RETURN_EXIT;
+
+       dentry = d_lookup(parent, &entry->se_qstr);
+       if (!dentry) {
+               rc = do_sa_lookup(dir, entry);
+       } else {
+               rc = do_sa_revalidate(dir, entry, dentry);
+               if (rc == 1 && agl_should_run(sai, dentry->d_inode))
+                       ll_agl_add(sai, dentry->d_inode, entry->se_index);
+       }
+
+       if (dentry != NULL)
+               dput(dentry);
+
+       if (rc) {
+               rc1 = ll_sa_entry_to_stated(sai, entry,
+                                       rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+               if (rc1 == 0 && entry->se_index == sai->sai_index_wait)
+                       wake_up(&sai->sai_waitq);
+       } else {
+               sai->sai_sent++;
+       }
+
+       sai->sai_index++;
+       /* drop one refcount on entry by ll_sa_entry_alloc */
+       ll_sa_entry_put(sai, entry);
+
+       EXIT;
+}
+
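+/*
+ * Per-directory AGL thread: takes inodes off sai_entries_agl and triggers an
+ * async glimpse on each until told to stop; on exit it drops the references
+ * of any inodes left on the list.
+ */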
+static int ll_agl_thread(void *arg)
+{
+       struct dentry       *parent = (struct dentry *)arg;
+       struct inode         *dir    = parent->d_inode;
+       struct ll_inode_info     *plli   = ll_i2info(dir);
+       struct ll_inode_info     *clli;
+       struct ll_sb_info       *sbi    = ll_i2sbi(dir);
+       struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
+       struct ptlrpc_thread     *thread = &sai->sai_agl_thread;
+       struct l_wait_info      lwi    = { 0 };
+       ENTRY;
+
+       CDEBUG(D_READA, "agl thread started: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+
+       atomic_inc(&sbi->ll_agl_total);
+       spin_lock(&plli->lli_agl_lock);
+       sai->sai_agl_valid = 1;
+       thread_set_flags(thread, SVC_RUNNING);
+       spin_unlock(&plli->lli_agl_lock);
+       wake_up(&thread->t_ctl_waitq);
+
+       while (1) {
+               l_wait_event(thread->t_ctl_waitq,
+                            !agl_list_empty(sai) ||
+                            !thread_is_running(thread),
+                            &lwi);
+
+               if (!thread_is_running(thread))
+                       break;
+
+               spin_lock(&plli->lli_agl_lock);
+               /* The statahead thread may have helped to process AGL
+                * entries, so check whether the list is empty again. */
+               if (!agl_list_empty(sai)) {
+                       clli = agl_first_entry(sai);
+                       list_del_init(&clli->lli_agl_list);
+                       spin_unlock(&plli->lli_agl_lock);
+                       ll_agl_trigger(&clli->lli_vfs_inode, sai);
+               } else {
+                       spin_unlock(&plli->lli_agl_lock);
+               }
+       }
+
+       spin_lock(&plli->lli_agl_lock);
+       sai->sai_agl_valid = 0;
+       while (!agl_list_empty(sai)) {
+               clli = agl_first_entry(sai);
+               list_del_init(&clli->lli_agl_list);
+               spin_unlock(&plli->lli_agl_lock);
+               clli->lli_agl_index = 0;
+               iput(&clli->lli_vfs_inode);
+               spin_lock(&plli->lli_agl_lock);
+       }
+       thread_set_flags(thread, SVC_STOPPED);
+       spin_unlock(&plli->lli_agl_lock);
+       wake_up(&thread->t_ctl_waitq);
+       ll_sai_put(sai);
+       CDEBUG(D_READA, "agl thread stopped: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+       RETURN(0);
+}
+
+static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai)
+{
+       struct ptlrpc_thread *thread = &sai->sai_agl_thread;
+       struct l_wait_info    lwi    = { 0 };
+       struct ll_inode_info  *plli;
+       task_t        *task;
+       ENTRY;
+
+       CDEBUG(D_READA, "start agl thread: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+
+       plli = ll_i2info(parent->d_inode);
+       task = kthread_run(ll_agl_thread, parent,
+                              "ll_agl_%u", plli->lli_opendir_pid);
+       if (IS_ERR(task)) {
+               CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task));
+               thread_set_flags(thread, SVC_STOPPED);
+               RETURN_EXIT;
+       }
+
+       l_wait_event(thread->t_ctl_waitq,
+                    thread_is_running(thread) || thread_is_stopped(thread),
+                    &lwi);
+       EXIT;
+}
+
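+/*
+ * Per-directory statahead thread: walks the directory pages, issues a
+ * stat-ahead for each name, and helps drain received replies and AGL entries
+ * while the send window is full.
+ */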
+static int ll_statahead_thread(void *arg)
+{
+       struct dentry       *parent = (struct dentry *)arg;
+       struct inode         *dir    = parent->d_inode;
+       struct ll_inode_info     *plli   = ll_i2info(dir);
+       struct ll_inode_info     *clli;
+       struct ll_sb_info       *sbi    = ll_i2sbi(dir);
+       struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
+       struct ptlrpc_thread     *thread = &sai->sai_thread;
+       struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread;
+       struct page           *page;
+       __u64                pos    = 0;
+       int                    first  = 0;
+       int                    rc     = 0;
+       struct ll_dir_chain       chain;
+       struct l_wait_info      lwi    = { 0 };
+       ENTRY;
+
+       CDEBUG(D_READA, "statahead thread started: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+
+       if (sbi->ll_flags & LL_SBI_AGL_ENABLED)
+               ll_start_agl(parent, sai);
+
+       atomic_inc(&sbi->ll_sa_total);
+       spin_lock(&plli->lli_sa_lock);
+       thread_set_flags(thread, SVC_RUNNING);
+       spin_unlock(&plli->lli_sa_lock);
+       wake_up(&thread->t_ctl_waitq);
+
+       ll_dir_chain_init(&chain);
+       page = ll_get_dir_page(dir, pos, &chain);
+
+       while (1) {
+               struct lu_dirpage *dp;
+               struct lu_dirent  *ent;
+
+               if (IS_ERR(page)) {
+                       rc = PTR_ERR(page);
+                       CDEBUG(D_READA, "error reading dir "DFID" at "LPU64
+                              "/"LPU64": [rc %d] [parent %u]\n",
+                              PFID(ll_inode2fid(dir)), pos, sai->sai_index,
+                              rc, plli->lli_opendir_pid);
+                       GOTO(out, rc);
+               }
+
+               dp = page_address(page);
+               for (ent = lu_dirent_start(dp); ent != NULL;
+                    ent = lu_dirent_next(ent)) {
+                       __u64 hash;
+                       int namelen;
+                       char *name;
+
+                       hash = le64_to_cpu(ent->lde_hash);
+                       if (unlikely(hash < pos))
+                               /*
+                                * Skip until we find target hash value.
+                                */
+                               continue;
+
+                       namelen = le16_to_cpu(ent->lde_namelen);
+                       if (unlikely(namelen == 0))
+                               /*
+                                * Skip dummy record.
+                                */
+                               continue;
+
+                       name = ent->lde_name;
+                       if (name[0] == '.') {
+                               if (namelen == 1) {
+                                       /*
+                                        * skip "."
+                                        */
+                                       continue;
+                               } else if (name[1] == '.' && namelen == 2) {
+                                       /*
+                                        * skip ".."
+                                        */
+                                       continue;
+                               } else if (!sai->sai_ls_all) {
+                                       /*
+                                        * skip hidden files.
+                                        */
+                                       sai->sai_skip_hidden++;
+                                       continue;
+                               }
+                       }
+
+                       /*
+                        * don't stat-ahead first entry.
+                        */
+                       if (unlikely(++first == 1))
+                               continue;
+
+keep_it:
+                       l_wait_event(thread->t_ctl_waitq,
+                                    !sa_sent_full(sai) ||
+                                    !sa_received_empty(sai) ||
+                                    !agl_list_empty(sai) ||
+                                    !thread_is_running(thread),
+                                    &lwi);
+
+interpret_it:
+                       while (!sa_received_empty(sai))
+                               ll_post_statahead(sai);
+
+                       if (unlikely(!thread_is_running(thread))) {
+                               ll_release_page(page, 0);
+                               GOTO(out, rc = 0);
+                       }
+
+                       /* If there is no window for metadata statahead, but
+                        * there are some AGL entries to be triggered, then
+                        * help to process the AGL entries. */
+                       if (sa_sent_full(sai)) {
+                               spin_lock(&plli->lli_agl_lock);
+                               while (!agl_list_empty(sai)) {
+                                       clli = agl_first_entry(sai);
+                                       list_del_init(&clli->lli_agl_list);
+                                       spin_unlock(&plli->lli_agl_lock);
+                                       ll_agl_trigger(&clli->lli_vfs_inode,
+                                                      sai);
+
+                                       if (!sa_received_empty(sai))
+                                               goto interpret_it;
+
+                                       if (unlikely(
+                                               !thread_is_running(thread))) {
+                                               ll_release_page(page, 0);
+                                               GOTO(out, rc = 0);
+                                       }
+
+                                       if (!sa_sent_full(sai))
+                                               goto do_it;
+
+                                       spin_lock(&plli->lli_agl_lock);
+                               }
+                               spin_unlock(&plli->lli_agl_lock);
+
+                               goto keep_it;
+                       }
+
+do_it:
+                       ll_statahead_one(parent, name, namelen);
+               }
+               pos = le64_to_cpu(dp->ldp_hash_end);
+               if (pos == MDS_DIR_END_OFF) {
+                       /*
+                        * End of directory reached.
+                        */
+                       ll_release_page(page, 0);
+                       while (1) {
+                               l_wait_event(thread->t_ctl_waitq,
+                                            !sa_received_empty(sai) ||
+                                            sai->sai_sent == sai->sai_replied||
+                                            !thread_is_running(thread),
+                                            &lwi);
+
+                               while (!sa_received_empty(sai))
+                                       ll_post_statahead(sai);
+
+                               if (unlikely(!thread_is_running(thread)))
+                                       GOTO(out, rc = 0);
+
+                               if (sai->sai_sent == sai->sai_replied &&
+                                   sa_received_empty(sai))
+                                       break;
+                       }
+
+                       spin_lock(&plli->lli_agl_lock);
+                       while (!agl_list_empty(sai) &&
+                              thread_is_running(thread)) {
+                               clli = agl_first_entry(sai);
+                               list_del_init(&clli->lli_agl_list);
+                               spin_unlock(&plli->lli_agl_lock);
+                               ll_agl_trigger(&clli->lli_vfs_inode, sai);
+                               spin_lock(&plli->lli_agl_lock);
+                       }
+                       spin_unlock(&plli->lli_agl_lock);
+
+                       GOTO(out, rc = 0);
+               } else if (1) {
+                       /*
+                        * chain is exhausted.
+                        * Normal case: continue to the next page.
+                        */
+                       ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+                                             LDF_COLLIDE);
+                       sai->sai_in_readpage = 1;
+                       page = ll_get_dir_page(dir, pos, &chain);
+                       sai->sai_in_readpage = 0;
+               } else {
+                       LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+                       ll_release_page(page, 1);
+                       /*
+                        * go into overflow page.
+                        */
+               }
+       }
+       EXIT;
+
+out:
+       if (sai->sai_agl_valid) {
+               spin_lock(&plli->lli_agl_lock);
+               thread_set_flags(agl_thread, SVC_STOPPING);
+               spin_unlock(&plli->lli_agl_lock);
+               wake_up(&agl_thread->t_ctl_waitq);
+
+               CDEBUG(D_READA, "stop agl thread: [pid %d]\n",
+                      current_pid());
+               l_wait_event(agl_thread->t_ctl_waitq,
+                            thread_is_stopped(agl_thread),
+                            &lwi);
+       } else {
+               /* Set agl_thread flags anyway. */
+               thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+       }
+       ll_dir_chain_fini(&chain);
+       spin_lock(&plli->lli_sa_lock);
+       if (!sa_received_empty(sai)) {
+               thread_set_flags(thread, SVC_STOPPING);
+               spin_unlock(&plli->lli_sa_lock);
+
+               /* To release the resources held by received entries. */
+               while (!sa_received_empty(sai))
+                       ll_post_statahead(sai);
+
+               spin_lock(&plli->lli_sa_lock);
+       }
+       thread_set_flags(thread, SVC_STOPPED);
+       spin_unlock(&plli->lli_sa_lock);
+       wake_up(&sai->sai_waitq);
+       wake_up(&thread->t_ctl_waitq);
+       ll_sai_put(sai);
+       dput(parent);
+       CDEBUG(D_READA, "statahead thread stopped: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+       return rc;
+}
+
+/**
+ * called in ll_file_release().
+ */
+void ll_stop_statahead(struct inode *dir, void *key)
+{
+       struct ll_inode_info *lli = ll_i2info(dir);
+
+       if (unlikely(key == NULL))
+               return;
+
+       spin_lock(&lli->lli_sa_lock);
+       if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) {
+               spin_unlock(&lli->lli_sa_lock);
+               return;
+       }
+
+       lli->lli_opendir_key = NULL;
+
+       if (lli->lli_sai) {
+               struct l_wait_info lwi = { 0 };
+               struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread;
+
+               if (!thread_is_stopped(thread)) {
+                       thread_set_flags(thread, SVC_STOPPING);
+                       spin_unlock(&lli->lli_sa_lock);
+                       wake_up(&thread->t_ctl_waitq);
+
+                       CDEBUG(D_READA, "stop statahead thread: [pid %d]\n",
+                              current_pid());
+                       l_wait_event(thread->t_ctl_waitq,
+                                    thread_is_stopped(thread),
+                                    &lwi);
+               } else {
+                       spin_unlock(&lli->lli_sa_lock);
+               }
+
+               /*
+                * Put the reference held since the first statahead_enter.
+                * It may not be the last reference, because some statahead
+                * requests may still be in flight.
+                */
+               ll_sai_put(lli->lli_sai);
+       } else {
+               lli->lli_opendir_pid = 0;
+               spin_unlock(&lli->lli_sa_lock);
+       }
+}
+
+enum {
+       /**
+        * not first dirent, or is "."
+        */
+       LS_NONE_FIRST_DE = 0,
+       /**
+        * the first non-hidden dirent
+        */
+       LS_FIRST_DE,
+       /**
+        * the first hidden dirent, that is "."
+        */
+       LS_FIRST_DOT_DE
+};
+
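+/*
+ * Scan the directory to check whether the given dentry is its first (dot or
+ * non-dot) entry; see the LS_* values above for the possible results.
+ */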
+static int is_first_dirent(struct inode *dir, struct dentry *dentry)
+{
+       struct ll_dir_chain   chain;
+       struct qstr       *target = &dentry->d_name;
+       struct page       *page;
+       __u64            pos    = 0;
+       int                dot_de;
+       int                rc     = LS_NONE_FIRST_DE;
+       ENTRY;
+
+       ll_dir_chain_init(&chain);
+       page = ll_get_dir_page(dir, pos, &chain);
+
+       while (1) {
+               struct lu_dirpage *dp;
+               struct lu_dirent  *ent;
+
+               if (IS_ERR(page)) {
+                       struct ll_inode_info *lli = ll_i2info(dir);
+
+                       rc = PTR_ERR(page);
+                       CERROR("error reading dir "DFID" at "LPU64": "
+                              "[rc %d] [parent %u]\n",
+                              PFID(ll_inode2fid(dir)), pos,
+                              rc, lli->lli_opendir_pid);
+                       break;
+               }
+
+               dp = page_address(page);
+               for (ent = lu_dirent_start(dp); ent != NULL;
+                    ent = lu_dirent_next(ent)) {
+                       __u64 hash;
+                       int namelen;
+                       char *name;
+
+                       hash = le64_to_cpu(ent->lde_hash);
+                       /* ll_get_dir_page() can return any page containing
+                        * the given hash, which may not be the start hash. */
+                       if (unlikely(hash < pos))
+                               continue;
+
+                       namelen = le16_to_cpu(ent->lde_namelen);
+                       if (unlikely(namelen == 0))
+                               /*
+                                * skip dummy record.
+                                */
+                               continue;
+
+                       name = ent->lde_name;
+                       if (name[0] == '.') {
+                               if (namelen == 1)
+                                       /*
+                                        * skip "."
+                                        */
+                                       continue;
+                               else if (name[1] == '.' && namelen == 2)
+                                       /*
+                                        * skip ".."
+                                        */
+                                       continue;
+                               else
+                                       dot_de = 1;
+                       } else {
+                               dot_de = 0;
+                       }
+
+                       if (dot_de && target->name[0] != '.') {
+                               CDEBUG(D_READA, "%.*s skip hidden file %.*s\n",
+                                      target->len, target->name,
+                                      namelen, name);
+                               continue;
+                       }
+
+                       if (target->len != namelen ||
+                           memcmp(target->name, name, namelen) != 0)
+                               rc = LS_NONE_FIRST_DE;
+                       else if (!dot_de)
+                               rc = LS_FIRST_DE;
+                       else
+                               rc = LS_FIRST_DOT_DE;
+
+                       ll_release_page(page, 0);
+                       GOTO(out, rc);
+               }
+               pos = le64_to_cpu(dp->ldp_hash_end);
+               if (pos == MDS_DIR_END_OFF) {
+                       /*
+                        * End of directory reached.
+                        */
+                       ll_release_page(page, 0);
+                       break;
+               } else if (1) {
+                       /*
+                        * chain is exhausted
+                        * Normal case: continue to the next page.
+                        */
+                       ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+                                             LDF_COLLIDE);
+                       page = ll_get_dir_page(dir, pos, &chain);
+               } else {
+                       /*
+                        * go into overflow page.
+                        */
+                       LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+                       ll_release_page(page, 1);
+               }
+       }
+       EXIT;
+
+out:
+       ll_dir_chain_fini(&chain);
+       return rc;
+}
+
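+/*
+ * Account a statahead hit or miss for the entry being consumed: on a hit,
+ * grow the statahead window; on a persistently low hit ratio, ask the
+ * statahead thread to stop.
+ */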
+static void
+ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       struct ptlrpc_thread *thread = &sai->sai_thread;
+       struct ll_sb_info    *sbi    = ll_i2sbi(sai->sai_inode);
+       int                hit;
+       ENTRY;
+
+       if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC)
+               hit = 1;
+       else
+               hit = 0;
+
+       ll_sa_entry_fini(sai, entry);
+       if (hit) {
+               sai->sai_hit++;
+               sai->sai_consecutive_miss = 0;
+               sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max);
+       } else {
+               struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+               sai->sai_miss++;
+               sai->sai_consecutive_miss++;
+               if (sa_low_hit(sai) && thread_is_running(thread)) {
+                       atomic_inc(&sbi->ll_sa_wrong);
+                       CDEBUG(D_READA, "Statahead for dir "DFID" hit "
+                              "ratio too low: hit/miss "LPU64"/"LPU64
+                              ", sent/replied "LPU64"/"LPU64", stopping "
+                              "statahead thread: pid %d\n",
+                              PFID(&lli->lli_fid), sai->sai_hit,
+                              sai->sai_miss, sai->sai_sent,
+                              sai->sai_replied, current_pid());
+                       spin_lock(&lli->lli_sa_lock);
+                       if (!thread_is_stopped(thread))
+                               thread_set_flags(thread, SVC_STOPPING);
+                       spin_unlock(&lli->lli_sa_lock);
+               }
+       }
+
+       if (!thread_is_stopped(thread))
+               wake_up(&thread->t_ctl_waitq);
+
+       EXIT;
+}
+
+/**
+ * Start the statahead thread if this is the first dir entry.
+ * Otherwise, if a thread has already been started, wait until it is ahead of
+ * us.
+ * \retval 1       -- found the entry with its lock in the cache; the caller
+ *                 needs to do nothing.
+ * \retval 0       -- found the entry in the cache but without a lock; the
+ *                 caller needs to refresh it from the MDS.
+ * \retval others  -- the caller must proceed as for a non-statahead lookup.
+ */
+int do_statahead_enter(struct inode *dir, struct dentry **dentryp,
+                      int only_unplug)
+{
+       struct ll_inode_info     *lli   = ll_i2info(dir);
+       struct ll_statahead_info *sai   = lli->lli_sai;
+       struct dentry       *parent;
+       struct ll_sa_entry       *entry;
+       struct ptlrpc_thread     *thread;
+       struct l_wait_info      lwi   = { 0 };
+       int                    rc    = 0;
+       struct ll_inode_info     *plli;
+       ENTRY;
+
+       LASSERT(lli->lli_opendir_pid == current_pid());
+
+       if (sai) {
+               thread = &sai->sai_thread;
+               if (unlikely(thread_is_stopped(thread) &&
+                            list_empty(&sai->sai_entries_stated))) {
+                       /* to release resource */
+                       ll_stop_statahead(dir, lli->lli_opendir_key);
+                       RETURN(-EAGAIN);
+               }
+
+               if ((*dentryp)->d_name.name[0] == '.') {
+                       if (sai->sai_ls_all ||
+                           sai->sai_miss_hidden >= sai->sai_skip_hidden) {
+                               /*
+                                * The hidden dentry is the first one, or the
+                                * statahead thread did not skip so many hidden
+                                * dentries before "sai_ls_all" was enabled as
+                                * below.
+                                */
+                       } else {
+                               if (!sai->sai_ls_all)
+                                       /*
+                                        * Maybe the hidden dentry is not the
+                                        * first one and "sai_ls_all" was not
+                                        * set, so "ls -al" was missed. Enable
+                                        * "sai_ls_all" for such a case.
+                                        */
+                                       sai->sai_ls_all = 1;
+
+                               /*
+                                * Such a "getattr" was skipped before
+                                * "sai_ls_all" was enabled as above.
+                                */
+                               sai->sai_miss_hidden++;
+                               RETURN(-EAGAIN);
+                       }
+               }
+
+               entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name);
+               if (entry == NULL || only_unplug) {
+                       ll_sai_unplug(sai, entry);
+                       RETURN(entry ? 1 : -EAGAIN);
+               }
+
+               /* if statahead is busy in readdir, help it do post-work */
+               while (!ll_sa_entry_stated(entry) &&
+                      sai->sai_in_readpage &&
+                      !sa_received_empty(sai))
+                       ll_post_statahead(sai);
+
+               if (!ll_sa_entry_stated(entry)) {
+                       sai->sai_index_wait = entry->se_index;
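+                       /* wait below (with a 30-second timeout) for the
+                        * statahead thread to stat this entry or to stop */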
+                       lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
+                                              LWI_ON_SIGNAL_NOOP, NULL);
+                       rc = l_wait_event(sai->sai_waitq,
+                                         ll_sa_entry_stated(entry) ||
+                                         thread_is_stopped(thread),
+                                         &lwi);
+                       if (rc < 0) {
+                               ll_sai_unplug(sai, entry);
+                               RETURN(-EAGAIN);
+                       }
+               }
+
+               if (entry->se_stat == SA_ENTRY_SUCC &&
+                   entry->se_inode != NULL) {
+                       struct inode *inode = entry->se_inode;
+                       struct lookup_intent it = { .it_op = IT_GETATTR,
+                                                   .d.lustre.it_lock_handle =
+                                                    entry->se_handle };
+                       __u64 bits;
+
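+                       /* check whether the lock prefetched by the statahead
+                        * thread (entry->se_handle) is still valid */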
+                       rc = md_revalidate_lock(ll_i2mdexp(dir), &it,
+                                               ll_inode2fid(inode), &bits);
+                       if (rc == 1) {
+                               if ((*dentryp)->d_inode == NULL) {
+                                       *dentryp = ll_splice_alias(inode,
+                                                                  *dentryp);
+                               } else if ((*dentryp)->d_inode != inode) {
+                                       /* revalidate, but inode is recreated */
+                                       CDEBUG(D_READA,
+                                             "stale dentry %.*s inode %lu/%u, "
+                                             "statahead inode %lu/%u\n",
+                                             (*dentryp)->d_name.len,
+                                             (*dentryp)->d_name.name,
+                                             (*dentryp)->d_inode->i_ino,
+                                             (*dentryp)->d_inode->i_generation,
+                                             inode->i_ino,
+                                             inode->i_generation);
+                                       ll_sai_unplug(sai, entry);
+                                       RETURN(-ESTALE);
+                               } else {
+                                       iput(inode);
+                               }
+                               entry->se_inode = NULL;
+
+                               if ((bits & MDS_INODELOCK_LOOKUP) &&
+                                   d_lustre_invalid(*dentryp))
+                                       d_lustre_revalidate(*dentryp);
+                               ll_intent_release(&it);
+                       }
+               }
+
+               ll_sai_unplug(sai, entry);
+               RETURN(rc);
+       }
+
+       /* I am the "lli_opendir_pid" owner; only I can set "lli_sai". */
+       rc = is_first_dirent(dir, *dentryp);
+       if (rc == LS_NONE_FIRST_DE)
+               /* It is not an "ls -{a}l" operation; no need for statahead. */
+               GOTO(out, rc = -EAGAIN);
+
+       sai = ll_sai_alloc();
+       if (sai == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);
+       sai->sai_inode = igrab(dir);
+       if (unlikely(sai->sai_inode == NULL)) {
+               CWARN("Do not start stat ahead on dying inode "DFID"\n",
+                     PFID(&lli->lli_fid));
+               GOTO(out, rc = -ESTALE);
+       }
+
+       /* take a parent reference here; it is released in ll_statahead_thread() */
+       parent = dget((*dentryp)->d_parent);
+       if (unlikely(sai->sai_inode != parent->d_inode)) {
+               struct ll_inode_info *nlli = ll_i2info(parent->d_inode);
+
+               CWARN("Race condition, someone changed %.*s just now: "
+                     "old parent "DFID", new parent "DFID"\n",
+                     (*dentryp)->d_name.len, (*dentryp)->d_name.name,
+                     PFID(&lli->lli_fid), PFID(&nlli->lli_fid));
+               dput(parent);
+               iput(sai->sai_inode);
+               GOTO(out, rc = -EAGAIN);
+       }
+
+       CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+
+       lli->lli_sai = sai;
+
+       plli = ll_i2info(parent->d_inode);
+       rc = PTR_ERR(kthread_run(ll_statahead_thread, parent,
+                                "ll_sa_%u", plli->lli_opendir_pid));
+       thread = &sai->sai_thread;
+       if (IS_ERR_VALUE(rc)) {
+               CERROR("can't start ll_sa thread, rc: %d\n", rc);
+               dput(parent);
+               lli->lli_opendir_key = NULL;
+               thread_set_flags(thread, SVC_STOPPED);
+               thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+               ll_sai_put(sai);
+               LASSERT(lli->lli_sai == NULL);
+               RETURN(-EAGAIN);
+       }
+
+       l_wait_event(thread->t_ctl_waitq,
+                    thread_is_running(thread) || thread_is_stopped(thread),
+                    &lwi);
+
+       /*
+        * We don't stat-ahead for the first dirent since we are already in
+        * lookup.
+        */
+       RETURN(-EAGAIN);
+
+out:
+       if (sai != NULL)
+               OBD_FREE_PTR(sai);
+       spin_lock(&lli->lli_sa_lock);
+       lli->lli_opendir_key = NULL;
+       lli->lli_opendir_pid = 0;
+       spin_unlock(&lli->lli_sa_lock);
+       return rc;
+}
diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c
new file mode 100644 (file)
index 0000000..4101c52
--- /dev/null
@@ -0,0 +1,226 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <lprocfs_status.h>
+#include "llite_internal.h"
+
+static struct kmem_cache *ll_inode_cachep;
+
+static struct inode *ll_alloc_inode(struct super_block *sb)
+{
+       struct ll_inode_info *lli;
+       ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1);
+       OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, __GFP_IO);
+       if (lli == NULL)
+               return NULL;
+
+       inode_init_once(&lli->lli_vfs_inode);
+       return &lli->lli_vfs_inode;
+}
+
+static void ll_inode_destroy_callback(struct rcu_head *head)
+{
+       struct inode *inode = container_of(head, struct inode, i_rcu);
+       struct ll_inode_info *ptr = ll_i2info(inode);
+       OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep);
+}
+
+static void ll_destroy_inode(struct inode *inode)
+{
+       call_rcu(&inode->i_rcu, ll_inode_destroy_callback);
+}
+
+int ll_init_inodecache(void)
+{
+       ll_inode_cachep = kmem_cache_create("lustre_inode_cache",
+                                              sizeof(struct ll_inode_info),
+                                              0, SLAB_HWCACHE_ALIGN, NULL);
+       if (ll_inode_cachep == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+void ll_destroy_inodecache(void)
+{
+       kmem_cache_destroy(ll_inode_cachep);
+}
+
+/* exported operations */
+struct super_operations lustre_super_operations =
+{
+       .alloc_inode   = ll_alloc_inode,
+       .destroy_inode = ll_destroy_inode,
+       .evict_inode   = ll_delete_inode,
+       .put_super     = ll_put_super,
+       .statfs = ll_statfs,
+       .umount_begin  = ll_umount_begin,
+       .remount_fs    = ll_remount_fs,
+       .show_options  = ll_show_options,
+};
+
+
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg));
+
+int vvp_global_init(void);
+void vvp_global_fini(void);
+
+static int __init init_lustre_lite(void)
+{
+       int i, rc, seed[2];
+       struct timeval tv;
+       lnet_process_id_t lnet_id;
+
+       CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1);
+
+       /* print the address of _any_ initialized kernel symbol from this
+        * module, to allow debugging with a gdb that doesn't support data
+        * symbols from modules. */
+       CDEBUG(D_INFO, "Lustre client module (%p).\n",
+              &lustre_super_operations);
+
+       rc = ll_init_inodecache();
+       if (rc)
+               return -ENOMEM;
+       ll_file_data_slab = kmem_cache_create("ll_file_data",
+                                                sizeof(struct ll_file_data), 0,
+                                                SLAB_HWCACHE_ALIGN, NULL);
+       if (ll_file_data_slab == NULL) {
+               ll_destroy_inodecache();
+               return -ENOMEM;
+       }
+
+       ll_remote_perm_cachep = kmem_cache_create("ll_remote_perm_cache",
+                                                 sizeof(struct ll_remote_perm),
+                                                     0, 0, NULL);
+       if (ll_remote_perm_cachep == NULL) {
+               kmem_cache_destroy(ll_file_data_slab);
+               ll_file_data_slab = NULL;
+               ll_destroy_inodecache();
+               return -ENOMEM;
+       }
+
+       ll_rmtperm_hash_cachep = kmem_cache_create("ll_rmtperm_hash_cache",
+                                                  REMOTE_PERM_HASHSIZE *
+                                                  sizeof(struct list_head),
+                                                  0, 0, NULL);
+       if (ll_rmtperm_hash_cachep == NULL) {
+               kmem_cache_destroy(ll_remote_perm_cachep);
+               ll_remote_perm_cachep = NULL;
+               kmem_cache_destroy(ll_file_data_slab);
+               ll_file_data_slab = NULL;
+               ll_destroy_inodecache();
+               return -ENOMEM;
+       }
+
+       proc_lustre_fs_root = proc_lustre_root ?
+                             lprocfs_register("llite", proc_lustre_root, NULL, NULL) : NULL;
+
+       lustre_register_client_fill_super(ll_fill_super);
+       lustre_register_kill_super_cb(ll_kill_super);
+
+       lustre_register_client_process_config(ll_process_config);
+
+       cfs_get_random_bytes(seed, sizeof(seed));
+
+       /* Nodes with small feet have little entropy;
+        * the NID for this node gives the most entropy in the low bits */
+       for (i = 0; ; i++) {
+               if (LNetGetId(i, &lnet_id) == -ENOENT)
+                       break;
+               if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND)
+                       seed[0] ^= LNET_NIDADDR(lnet_id.nid);
+       }
+
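+       /* mix the NID-derived entropy with the current time to seed the PRNG */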
+       do_gettimeofday(&tv);
+       cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+       init_timer(&ll_capa_timer);
+       ll_capa_timer.function = ll_capa_timer_callback;
+       rc = ll_capa_thread_start();
+       /*
+        * XXX normal cleanup is needed here.
+        */
+       if (rc == 0)
+               rc = vvp_global_init();
+
+       return rc;
+}
+
+static void __exit exit_lustre_lite(void)
+{
+       vvp_global_fini();
+       del_timer(&ll_capa_timer);
+       ll_capa_thread_stop();
+       LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0,
+                "client remaining capa count %d\n",
+                capa_count[CAPA_SITE_CLIENT]);
+
+       lustre_register_client_fill_super(NULL);
+       lustre_register_kill_super_cb(NULL);
+
+       lustre_register_client_process_config(NULL);
+
+       ll_destroy_inodecache();
+
+       kmem_cache_destroy(ll_rmtperm_hash_cachep);
+       ll_rmtperm_hash_cachep = NULL;
+
+       kmem_cache_destroy(ll_remote_perm_cachep);
+       ll_remote_perm_cachep = NULL;
+
+       kmem_cache_destroy(ll_file_data_slab);
+       if (proc_lustre_fs_root)
+               lprocfs_remove(&proc_lustre_fs_root);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Lite Client File System");
+MODULE_LICENSE("GPL");
+
+module_init(init_lustre_lite);
+module_exit(exit_lustre_lite);
diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
new file mode 100644 (file)
index 0000000..5260e98
--- /dev/null
@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/version.h>
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+static int ll_readlink_internal(struct inode *inode,
+                               struct ptlrpc_request **request, char **symname)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       int rc, symlen = i_size_read(inode) + 1;
+       struct mdt_body *body;
+       struct md_op_data *op_data;
+       ENTRY;
+
+       *request = NULL;
+
+       if (lli->lli_symlink_name) {
+               int print_limit = min_t(int, PAGE_SIZE - 128, symlen);
+
+               *symname = lli->lli_symlink_name;
+               /* If the total CDEBUG() size is larger than a page, it
+                * will print a warning to the console; avoid this by
+                * printing just the last part of the symlink. */
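+               /* For illustration (hypothetical sizes): with a 4096-byte
+                * page and a cached target longer than 3968 bytes,
+                * print_limit above is 3968, so the message shows "..."
+                * followed by only the last 3968 bytes of the target. */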
+               CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n",
+                      print_limit < symlen ? "..." : "", print_limit,
+                      (*symname) + symlen - print_limit, symlen);
+               RETURN(0);
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       op_data->op_valid = OBD_MD_LINKNAME;
+       rc = md_getattr(sbi->ll_md_exp, op_data, request);
+       ll_finish_md_op_data(op_data);
+       if (rc) {
+               if (rc != -ENOENT)
+                       CERROR("inode %lu: rc = %d\n", inode->i_ino, rc);
+               GOTO (failed, rc);
+       }
+
+       body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+       if ((body->valid & OBD_MD_LINKNAME) == 0) {
+               CERROR("OBD_MD_LINKNAME not set on reply\n");
+               GOTO(failed, rc = -EPROTO);
+       }
+
+       LASSERT(symlen != 0);
+       if (body->eadatasize != symlen) {
+               CERROR("inode %lu: symlink length %d not expected %d\n",
+                       inode->i_ino, body->eadatasize - 1, symlen - 1);
+               GOTO(failed, rc = -EPROTO);
+       }
+
+       *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD);
+       if (*symname == NULL ||
+           strnlen(*symname, symlen) != symlen - 1) {
+               /* not full/NULL terminated */
+               CERROR("inode %lu: symlink not NULL terminated string "
+                       "of length %d\n", inode->i_ino, symlen - 1);
+               GOTO(failed, rc = -EPROTO);
+       }
+
+       OBD_ALLOC(lli->lli_symlink_name, symlen);
+       /* do not return an error if we cannot cache the symlink locally */
+       if (lli->lli_symlink_name) {
+               memcpy(lli->lli_symlink_name, *symname, symlen);
+               *symname = lli->lli_symlink_name;
+       }
+       RETURN(0);
+
+failed:
+       RETURN (rc);
+}
+
+static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ptlrpc_request *request;
+       char *symname;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op\n");
+
+       ll_inode_size_lock(inode);
+       rc = ll_readlink_internal(inode, &request, &symname);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = vfs_readlink(dentry, buffer, buflen, symname);
+ out:
+       ptlrpc_req_finished(request);
+       ll_inode_size_unlock(inode);
+       RETURN(rc);
+}
+
+static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ptlrpc_request *request = NULL;
+       int rc;
+       char *symname;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op\n");
+       /* Limit the recursive symlink depth to 5 instead of the default
+        * 8 links when the kernel has a 4k stack, to prevent stack overflow.
+        * For 8k stacks we need to limit it to 7 for local servers. */
+       if (THREAD_SIZE < 8192 && current->link_count >= 6) {
+               rc = -ELOOP;
+       } else if (THREAD_SIZE == 8192 && current->link_count >= 8) {
+               rc = -ELOOP;
+       } else {
+               ll_inode_size_lock(inode);
+               rc = ll_readlink_internal(inode, &request, &symname);
+               ll_inode_size_unlock(inode);
+       }
+       if (rc) {
+               ptlrpc_req_finished(request);
+               request = NULL;
+               symname = ERR_PTR(rc);
+       }
+
+       nd_set_link(nd, symname);
+       /* symname may contain a pointer to the request message buffer,
+        * so we delay releasing the request until ll_put_link().
+        */
+       RETURN(request);
+}
+
+static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+       ptlrpc_req_finished(cookie);
+}
+
+struct inode_operations ll_fast_symlink_inode_operations = {
+       .readlink       = ll_readlink,
+       .setattr        = ll_setattr,
+       .follow_link    = ll_follow_link,
+       .put_link       = ll_put_link,
+       .getattr        = ll_getattr,
+       .permission     = ll_inode_permission,
+       .setxattr       = ll_setxattr,
+       .getxattr       = ll_getxattr,
+       .listxattr      = ll_listxattr,
+       .removexattr    = ll_removexattr,
+};
diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c
new file mode 100644 (file)
index 0000000..60daf75
--- /dev/null
@@ -0,0 +1,547 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_device and cl_device_type implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/*
+ * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to the historical
+ * "llite_" (var. "ll_") prefix.
+ */
+
+struct kmem_cache *vvp_thread_kmem;
+static struct kmem_cache *vvp_session_kmem;
+static struct lu_kmem_descr vvp_caches[] = {
+       {
+               .ckd_cache = &vvp_thread_kmem,
+               .ckd_name  = "vvp_thread_kmem",
+               .ckd_size  = sizeof (struct vvp_thread_info),
+       },
+       {
+               .ckd_cache = &vvp_session_kmem,
+               .ckd_name  = "vvp_session_kmem",
+               .ckd_size  = sizeof (struct vvp_session)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+static void *vvp_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+       struct vvp_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, vvp_thread_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void vvp_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct vvp_thread_info *info = data;
+       OBD_SLAB_FREE_PTR(info, vvp_thread_kmem);
+}
+
+static void *vvp_session_key_init(const struct lu_context *ctx,
+                                 struct lu_context_key *key)
+{
+       struct vvp_session *session;
+
+       OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, __GFP_IO);
+       if (session == NULL)
+               session = ERR_PTR(-ENOMEM);
+       return session;
+}
+
+static void vvp_session_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+       struct vvp_session *session = data;
+       OBD_SLAB_FREE_PTR(session, vvp_session_kmem);
+}
+
+
+struct lu_context_key vvp_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = vvp_key_init,
+       .lct_fini = vvp_key_fini
+};
+
+struct lu_context_key vvp_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = vvp_session_key_init,
+       .lct_fini = vvp_session_key_fini
+};
+
+/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key);
+
+static const struct lu_device_operations vvp_lu_ops = {
+       .ldo_object_alloc      = vvp_object_alloc
+};
+
+static const struct cl_device_operations vvp_cl_ops = {
+       .cdo_req_init = ccc_req_init
+};
+
+static struct lu_device *vvp_device_alloc(const struct lu_env *env,
+                                         struct lu_device_type *t,
+                                         struct lustre_cfg *cfg)
+{
+       return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops);
+}
+
+static const struct lu_device_type_operations vvp_device_type_ops = {
+       .ldto_init = vvp_type_init,
+       .ldto_fini = vvp_type_fini,
+
+       .ldto_start = vvp_type_start,
+       .ldto_stop  = vvp_type_stop,
+
+       .ldto_device_alloc = vvp_device_alloc,
+       .ldto_device_free  = ccc_device_free,
+       .ldto_device_init  = ccc_device_init,
+       .ldto_device_fini  = ccc_device_fini
+};
+
+struct lu_device_type vvp_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_VVP_NAME,
+       .ldt_ops      = &vvp_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/**
+ * Initialize the VVP slab caches and register the VVP device type.
+ */
+int vvp_global_init(void)
+{
+       int result;
+
+       result = lu_kmem_init(vvp_caches);
+       if (result == 0) {
+               result = ccc_global_init(&vvp_device_type);
+               if (result != 0)
+                       lu_kmem_fini(vvp_caches);
+       }
+       return result;
+}
+
+void vvp_global_fini(void)
+{
+       ccc_global_fini(&vvp_device_type);
+       lu_kmem_fini(vvp_caches);
+}
+
+
+/*****************************************************************************
+ *
+ * mirror obd-devices into cl devices.
+ *
+ */
+
+int cl_sb_init(struct super_block *sb)
+{
+       struct ll_sb_info *sbi;
+       struct cl_device  *cl;
+       struct lu_env     *env;
+       int rc = 0;
+       int refcheck;
+
+       sbi  = ll_s2sbi(sb);
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               cl = cl_type_setup(env, NULL, &vvp_device_type,
+                                  sbi->ll_dt_exp->exp_obd->obd_lu_dev);
+               if (!IS_ERR(cl)) {
+                       cl2ccc_dev(cl)->cdv_sb = sb;
+                       sbi->ll_cl = cl;
+                       sbi->ll_site = cl2lu_dev(cl)->ld_site;
+               }
+               cl_env_put(env, &refcheck);
+       } else
+               rc = PTR_ERR(env);
+       RETURN(rc);
+}
+
+int cl_sb_fini(struct super_block *sb)
+{
+       struct ll_sb_info *sbi;
+       struct lu_env     *env;
+       struct cl_device  *cld;
+       int             refcheck;
+       int             result;
+
+       ENTRY;
+       sbi = ll_s2sbi(sb);
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               cld = sbi->ll_cl;
+
+               if (cld != NULL) {
+                       cl_stack_fini(env, cld);
+                       sbi->ll_cl = NULL;
+                       sbi->ll_site = NULL;
+               }
+               cl_env_put(env, &refcheck);
+               result = 0;
+       } else {
+               CERROR("Cannot cleanup cl-stack due to memory shortage.\n");
+               result = PTR_ERR(env);
+       }
+       /*
+        * If the mount failed (sbi->ll_cl == NULL) and there are no other
+        * mounts, stop the device types manually (this usually happens
+        * automatically when the last device is destroyed).
+        */
+       lu_types_stop();
+       RETURN(result);
+}
+
+/****************************************************************************
+ *
+ * /proc/fs/lustre/llite/$MNT/dump_page_cache
+ *
+ ****************************************************************************/
+
+/*
+ * To represent the contents of the page cache as a byte stream, the following
+ * information is encoded in the 64-bit offset:
+ *
+ *       - file hash bucket in lu_site::ls_hash[]       28 bits
+ *
+ *       - how far the file is from the bucket head      4 bits
+ *
+ *       - page index                                   32 bits
+ *
+ * The first two fields uniquely identify a file in the cache.
+ */
+
+#define PGC_OBJ_SHIFT (32 + 4)
+#define PGC_DEPTH_SHIFT (32)
+
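+/*
+ * For illustration (hypothetical values): bucket 0x5, depth 0x2, index 0x10
+ * pack into pos = (0x5ULL << 36) | (0x2ULL << 32) | 0x10 = 0x5200000010;
+ * vvp_pgcache_id_unpack() below recovers the same three fields from that pos.
+ */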
+struct vvp_pgcache_id {
+       unsigned                 vpi_bucket;
+       unsigned                 vpi_depth;
+       uint32_t                 vpi_index;
+
+       unsigned                 vpi_curdep;
+       struct lu_object_header *vpi_obj;
+};
+
+static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id)
+{
+       CLASSERT(sizeof(pos) == sizeof(__u64));
+
+       id->vpi_index  = pos & 0xffffffff;
+       id->vpi_depth  = (pos >> PGC_DEPTH_SHIFT) & 0xf;
+       id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT);
+}
+
+static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id)
+{
+       return
+               ((__u64)id->vpi_index) |
+               ((__u64)id->vpi_depth  << PGC_DEPTH_SHIFT) |
+               ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT);
+}
+
+static int vvp_pgcache_obj_get(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                              struct hlist_node *hnode, void *data)
+{
+       struct vvp_pgcache_id   *id  = data;
+       struct lu_object_header *hdr = cfs_hash_object(hs, hnode);
+
+       if (id->vpi_curdep-- > 0)
+               return 0; /* continue */
+
+       if (lu_object_is_dying(hdr))
+               return 1;
+
+       cfs_hash_get(hs, hnode);
+       id->vpi_obj = hdr;
+       return 1;
+}
+
+static struct cl_object *vvp_pgcache_obj(const struct lu_env *env,
+                                        struct lu_device *dev,
+                                        struct vvp_pgcache_id *id)
+{
+       LASSERT(lu_device_is_cl(dev));
+
+       id->vpi_depth &= 0xf;
+       id->vpi_obj    = NULL;
+       id->vpi_curdep = id->vpi_depth;
+
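+       /* walk hash bucket vpi_bucket, skipping vpi_depth objects before
+        * grabbing a reference on the next live one (see vvp_pgcache_obj_get()
+        * above) */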
+       cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket,
+                               vvp_pgcache_obj_get, id);
+       if (id->vpi_obj != NULL) {
+               struct lu_object *lu_obj;
+
+               lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type);
+               if (lu_obj != NULL) {
+                       lu_object_ref_add(lu_obj, "dump", current);
+                       return lu2cl(lu_obj);
+               }
+               lu_object_put(env, lu_object_top(id->vpi_obj));
+
+       } else if (id->vpi_curdep > 0) {
+               id->vpi_depth = 0xf;
+       }
+       return NULL;
+}
+
+static loff_t vvp_pgcache_find(const struct lu_env *env,
+                              struct lu_device *dev, loff_t pos)
+{
+       struct cl_object     *clob;
+       struct lu_site       *site;
+       struct vvp_pgcache_id id;
+
+       site = dev->ld_site;
+       vvp_pgcache_id_unpack(pos, &id);
+
+       while (1) {
+               if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash))
+                       return ~0ULL;
+               clob = vvp_pgcache_obj(env, dev, &id);
+               if (clob != NULL) {
+                       struct cl_object_header *hdr;
+                       int                   nr;
+                       struct cl_page    *pg;
+
+                       /* got an object. Find next page. */
+                       hdr = cl_object_header(clob);
+
+                       spin_lock(&hdr->coh_page_guard);
+                       nr = radix_tree_gang_lookup(&hdr->coh_tree,
+                                                   (void **)&pg,
+                                                   id.vpi_index, 1);
+                       if (nr > 0) {
+                               id.vpi_index = pg->cp_index;
+                               /* Can't support files over 16T */
+                               nr = !(pg->cp_index > 0xffffffff);
+                       }
+                       spin_unlock(&hdr->coh_page_guard);
+
+                       lu_object_ref_del(&clob->co_lu, "dump", current);
+                       cl_object_put(env, clob);
+                       if (nr > 0)
+                               return vvp_pgcache_id_pack(&id);
+               }
+               /* to the next object. */
+               ++id.vpi_depth;
+               id.vpi_depth &= 0xf;
+               if (id.vpi_depth == 0 && ++id.vpi_bucket == 0)
+                       return ~0ULL;
+               id.vpi_index = 0;
+       }
+}
+
+#define seq_page_flag(seq, page, flag, has_flags) do {           \
+       if (test_bit(PG_##flag, &(page)->flags)) {                \
+               seq_printf(seq, "%s"#flag, has_flags ? "|" : "");       \
+               has_flags = 1;                                    \
+       }                                                              \
+} while (0)
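+
+/*
+ * For illustration: with several flags set, the bracketed field emitted by
+ * vvp_pgcache_page_show() below looks like "[locked|uptodate|dirty]"; with
+ * no flags set it prints "[-]".
+ */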
+
+static void vvp_pgcache_page_show(const struct lu_env *env,
+                                 struct seq_file *seq, struct cl_page *page)
+{
+       struct ccc_page *cpg;
+       struct page      *vmpage;
+       int           has_flags;
+
+       cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+       vmpage = cpg->cpg_page;
+       seq_printf(seq, " %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [",
+                  0 /* gen */,
+                  cpg, page,
+                  "none",
+                  cpg->cpg_write_queued ? "wq" : "- ",
+                  cpg->cpg_defer_uptodate ? "du" : "- ",
+                  PageWriteback(vmpage) ? "wb" : "-",
+                  vmpage, vmpage->mapping->host->i_ino,
+                  vmpage->mapping->host->i_generation,
+                  vmpage->mapping->host, vmpage->index,
+                  page_count(vmpage));
+       has_flags = 0;
+       seq_page_flag(seq, vmpage, locked, has_flags);
+       seq_page_flag(seq, vmpage, error, has_flags);
+       seq_page_flag(seq, vmpage, referenced, has_flags);
+       seq_page_flag(seq, vmpage, uptodate, has_flags);
+       seq_page_flag(seq, vmpage, dirty, has_flags);
+       seq_page_flag(seq, vmpage, writeback, has_flags);
+       seq_printf(seq, "%s]\n", has_flags ? "" : "-");
+}
+
+static int vvp_pgcache_show(struct seq_file *f, void *v)
+{
+       loff_t             pos;
+       struct ll_sb_info       *sbi;
+       struct cl_object        *clob;
+       struct lu_env      *env;
+       struct cl_page    *page;
+       struct cl_object_header *hdr;
+       struct vvp_pgcache_id    id;
+       int                   refcheck;
+       int                   result;
+
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               pos = *(loff_t *) v;
+               vvp_pgcache_id_unpack(pos, &id);
+               sbi = f->private;
+               clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id);
+               if (clob != NULL) {
+                       hdr = cl_object_header(clob);
+
+                       spin_lock(&hdr->coh_page_guard);
+                       page = cl_page_lookup(hdr, id.vpi_index);
+                       spin_unlock(&hdr->coh_page_guard);
+
+                       seq_printf(f, "%8x@"DFID": ",
+                                  id.vpi_index, PFID(&hdr->coh_lu.loh_fid));
+                       if (page != NULL) {
+                               vvp_pgcache_page_show(env, f, page);
+                               cl_page_put(env, page);
+                       } else
+                               seq_puts(f, "missing\n");
+                       lu_object_ref_del(&clob->co_lu, "dump", current);
+                       cl_object_put(env, clob);
+               } else
+                       seq_printf(f, "%llx missing\n", pos);
+               cl_env_put(env, &refcheck);
+               result = 0;
+       } else
+               result = PTR_ERR(env);
+       return result;
+}
+
+static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos)
+{
+       struct ll_sb_info *sbi;
+       struct lu_env     *env;
+       int             refcheck;
+
+       sbi = f->private;
+
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               sbi = f->private;
+               if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT)
+                       pos = ERR_PTR(-EFBIG);
+               else {
+                       *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev,
+                                               *pos);
+                       if (*pos == ~0ULL)
+                               pos = NULL;
+               }
+               cl_env_put(env, &refcheck);
+       }
+       return pos;
+}
+
+static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos)
+{
+       struct ll_sb_info *sbi;
+       struct lu_env     *env;
+       int             refcheck;
+
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               sbi = f->private;
+               *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1);
+               if (*pos == ~0ULL)
+                       pos = NULL;
+               cl_env_put(env, &refcheck);
+       }
+       return pos;
+}
+
+static void vvp_pgcache_stop(struct seq_file *f, void *v)
+{
+       /* Nothing to do */
+}
+
+static struct seq_operations vvp_pgcache_ops = {
+       .start = vvp_pgcache_start,
+       .next  = vvp_pgcache_next,
+       .stop  = vvp_pgcache_stop,
+       .show  = vvp_pgcache_show
+};
+
+static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp)
+{
+       struct proc_dir_entry *dp  = PDE(inode);
+       struct ll_sb_info     *sbi = dp->data;
+       struct seq_file       *seq;
+       int                 result;
+
+       result = seq_open(filp, &vvp_pgcache_ops);
+       if (result == 0) {
+               seq = filp->private_data;
+               seq->private = sbi;
+       }
+       return result;
+}
+
+struct file_operations vvp_dump_pgcache_file_ops = {
+       .owner   = THIS_MODULE,
+       .open    = vvp_dump_pgcache_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h
new file mode 100644 (file)
index 0000000..c82bf17
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal definitions for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef VVP_INTERNAL_H
+#define VVP_INTERNAL_H
+
+
+#include <cl_object.h>
+#include "llite_internal.h"
+
+int           vvp_io_init     (const struct lu_env *env,
+                                  struct cl_object *obj, struct cl_io *io);
+int           vvp_lock_init   (const struct lu_env *env,
+                                  struct cl_object *obj, struct cl_lock *lock,
+                                  const struct cl_io *io);
+int              vvp_page_init   (const struct lu_env *env,
+                                  struct cl_object *obj,
+                                  struct cl_page *page, struct page *vmpage);
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *hdr,
+                                  struct lu_device *dev);
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+extern struct kmem_cache *vvp_thread_kmem;
+
+#endif /* VVP_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
new file mode 100644 (file)
index 0000000..8504d44
--- /dev/null
@@ -0,0 +1,1175 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+                               const struct cl_io_slice *slice);
+
+/**
+ * True if \a io is a normal io, false for sendfile() / splice_{read|write}.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io)
+{
+       struct vvp_io *vio = vvp_env_io(env);
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+       return vio->cui_io_subtype == IO_NORMAL;
+}
+
+/**
+ * For layout swapping: the file's layout may have changed.
+ * To avoid populating pages into a wrong stripe, we have to verify the
+ * correctness of the layout. This works because layout-swapping processes
+ * have to acquire the group lock.
+ */
+static bool can_populate_pages(const struct lu_env *env, struct cl_io *io,
+                               struct inode *inode)
+{
+       struct ll_inode_info    *lli = ll_i2info(inode);
+       struct ccc_io           *cio = ccc_env_io(env);
+       bool rc = true;
+
+       switch (io->ci_type) {
+       case CIT_READ:
+       case CIT_WRITE:
+               /* no lock is needed here to check lli_layout_gen, as we hold
+                * the extent lock and the GROUP lock must be held to swap the
+                * layout */
+               if (lli->lli_layout_gen != cio->cui_layout_gen) {
+                       io->ci_need_restart = 1;
+                       /* this will return application a short read/write */
+                       io->ci_continue = 0;
+                       rc = false;
+               }
+       case CIT_FAULT:
+               /* fault is okay because we already have a page. */
+       default:
+               break;
+       }
+
+       return rc;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+static int vvp_io_fault_iter_init(const struct lu_env *env,
+                                 const struct cl_io_slice *ios)
+{
+       struct vvp_io *vio   = cl2vvp_io(env, ios);
+       struct inode  *inode = ccc_object_inode(ios->cis_obj);
+
+       LASSERT(inode ==
+               cl2ccc_io(env, ios)->cui_fd->fd_file->f_dentry->d_inode);
+       vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime);
+       return 0;
+}
+
+static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       struct cl_io     *io  = ios->cis_io;
+       struct cl_object *obj = io->ci_obj;
+       struct ccc_io    *cio = cl2ccc_io(env, ios);
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       CDEBUG(D_VFSTRACE, "ignore/verify layout %d/%d, layout version %d.\n",
+               io->ci_ignore_layout, io->ci_verify_layout, cio->cui_layout_gen);
+
+       if (!io->ci_ignore_layout && io->ci_verify_layout) {
+               __u32 gen = 0;
+
+               /* check layout version */
+               ll_layout_refresh(ccc_object_inode(obj), &gen);
+               io->ci_need_restart = cio->cui_layout_gen != gen;
+               if (io->ci_need_restart)
+                       CDEBUG(D_VFSTRACE, "layout changed from %d to %d.\n",
+                               cio->cui_layout_gen, gen);
+       }
+}
+
+static void vvp_io_fault_fini(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct cl_io   *io   = ios->cis_io;
+       struct cl_page *page = io->u.ci_fault.ft_page;
+
+       CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+
+       if (page != NULL) {
+               lu_ref_del(&page->cp_reference, "fault", io);
+               cl_page_put(env, page);
+               io->u.ci_fault.ft_page = NULL;
+       }
+       vvp_io_fini(env, ios);
+}
+
+enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
+{
+       /*
+        * we only want to hold PW locks if the mmap() can generate
+        * writes back to the file and that only happens in shared
+        * writable vmas
+        */
+       if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+               return CLM_WRITE;
+       return CLM_READ;
+}
+
+static int vvp_mmap_locks(const struct lu_env *env,
+                         struct ccc_io *vio, struct cl_io *io)
+{
+       struct ccc_thread_info *cti = ccc_env_info(env);
+       struct mm_struct       *mm = current->mm;
+       struct vm_area_struct  *vma;
+       struct cl_lock_descr   *descr = &cti->cti_descr;
+       ldlm_policy_data_t      policy;
+       unsigned long      addr;
+       unsigned long      seg;
+       ssize_t          count;
+       int                  result;
+       ENTRY;
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+       if (!cl_is_normalio(env, io))
+               RETURN(0);
+
+       if (vio->cui_iov == NULL) /* nfs or loop back device write */
+               RETURN(0);
+
+       /* No MM (e.g. NFS)? Then no VMAs either. */
+       if (mm == NULL)
+               RETURN(0);
+
+       for (seg = 0; seg < vio->cui_nrsegs; seg++) {
+               const struct iovec *iv = &vio->cui_iov[seg];
+
+               addr = (unsigned long)iv->iov_base;
+               count = iv->iov_len;
+               if (count == 0)
+                       continue;
+
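+               /* cover whole pages: add the in-page offset to the count,
+                * then round the start address down to a page boundary */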
+               count += addr & (~CFS_PAGE_MASK);
+               addr &= CFS_PAGE_MASK;
+
+               down_read(&mm->mmap_sem);
+               while ((vma = our_vma(mm, addr, count)) != NULL) {
+                       struct inode *inode = vma->vm_file->f_dentry->d_inode;
+                       int flags = CEF_MUST;
+
+                       if (ll_file_nolock(vma->vm_file)) {
+                               /*
+                                * For the nolock case, a lockless lock will
+                                * be generated.
+                                */
+                               flags = CEF_NEVER;
+                       }
+
+                       /*
+                        * XXX: Required lock mode can be weakened: CIT_WRITE
+                        * io only ever reads the user-level buffer, and
+                        * CIT_READ only writes to it.
+                        */
+                       policy_from_vma(&policy, vma, addr, count);
+                       descr->cld_mode = vvp_mode_from_vma(vma);
+                       descr->cld_obj = ll_i2info(inode)->lli_clob;
+                       descr->cld_start = cl_index(descr->cld_obj,
+                                                   policy.l_extent.start);
+                       descr->cld_end = cl_index(descr->cld_obj,
+                                                 policy.l_extent.end);
+                       descr->cld_enq_flags = flags;
+                       result = cl_io_lock_alloc_add(env, io, descr);
+
+                       CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+                              descr->cld_mode, descr->cld_start,
+                              descr->cld_end);
+
+                       if (result < 0)
+                               RETURN(result);
+
+                       if (vma->vm_end - addr >= count)
+                               break;
+
+                       count -= vma->vm_end - addr;
+                       addr = vma->vm_end;
+               }
+               up_read(&mm->mmap_sem);
+       }
+       RETURN(0);
+}
+
+static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
+                         enum cl_lock_mode mode, loff_t start, loff_t end)
+{
+       struct ccc_io *cio = ccc_env_io(env);
+       int result;
+       int ast_flags = 0;
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+       ENTRY;
+
+       ccc_io_update_iov(env, cio, io);
+
+       if (io->u.ci_rw.crw_nonblock)
+               ast_flags |= CEF_NONBLOCK;
+       result = vvp_mmap_locks(env, cio, io);
+       if (result == 0)
+               result = ccc_io_one_lock(env, io, ast_flags, mode, start, end);
+       RETURN(result);
+}
+
+static int vvp_io_read_lock(const struct lu_env *env,
+                           const struct cl_io_slice *ios)
+{
+       struct cl_io     *io  = ios->cis_io;
+       struct ll_inode_info *lli = ll_i2info(ccc_object_inode(io->ci_obj));
+       int result;
+
+       ENTRY;
+       /* XXX: Layer violation, we shouldn't see lsm at llite level. */
+       if (lli->lli_has_smd) /* lsm-less file doesn't need to lock */
+               result = vvp_io_rw_lock(env, io, CLM_READ,
+                                       io->u.ci_rd.rd.crw_pos,
+                                       io->u.ci_rd.rd.crw_pos +
+                                       io->u.ci_rd.rd.crw_count - 1);
+       else
+               result = 0;
+       RETURN(result);
+}
+
+static int vvp_io_fault_lock(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct cl_io *io   = ios->cis_io;
+       struct vvp_io *vio = cl2vvp_io(env, ios);
+       /*
+        * XXX LDLM_FL_CBPENDING
+        */
+       return ccc_io_one_lock_index
+               (env, io, 0, vvp_mode_from_vma(vio->u.fault.ft_vma),
+                io->u.ci_fault.ft_index, io->u.ci_fault.ft_index);
+}
+
+static int vvp_io_write_lock(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct cl_io *io = ios->cis_io;
+       loff_t start;
+       loff_t end;
+
+       if (io->u.ci_wr.wr_append) {
+               start = 0;
+               end   = OBD_OBJECT_EOF;
+       } else {
+               start = io->u.ci_wr.wr.crw_pos;
+               end   = start + io->u.ci_wr.wr.crw_count - 1;
+       }
+       return vvp_io_rw_lock(env, io, CLM_WRITE, start, end);
+}
+
+static int vvp_io_setattr_iter_init(const struct lu_env *env,
+                                   const struct cl_io_slice *ios)
+{
+       return 0;
+}
+
+/**
+ * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io.
+ *
+ * Handles "lockless io" mode when extent locking is done by server.
+ */
+static int vvp_io_setattr_lock(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+       struct ccc_io *cio = ccc_env_io(env);
+       struct cl_io  *io  = ios->cis_io;
+       __u64 new_size;
+       __u32 enqflags = 0;
+
+       if (cl_io_is_trunc(io)) {
+               new_size = io->u.ci_setattr.sa_attr.lvb_size;
+               if (new_size == 0)
+                       enqflags = CEF_DISCARD_DATA;
+       } else {
+               if ((io->u.ci_setattr.sa_attr.lvb_mtime >=
+                    io->u.ci_setattr.sa_attr.lvb_ctime) ||
+                   (io->u.ci_setattr.sa_attr.lvb_atime >=
+                    io->u.ci_setattr.sa_attr.lvb_ctime))
+                       return 0;
+               new_size = 0;
+       }
+       cio->u.setattr.cui_local_lock = SETATTR_EXTENT_LOCK;
+       return ccc_io_one_lock(env, io, enqflags, CLM_WRITE,
+                              new_size, OBD_OBJECT_EOF);
+}
+
+static int vvp_do_vmtruncate(struct inode *inode, size_t size)
+{
+       int     result;
+       /*
+        * Only ll_inode_size_lock is taken at this level.
+        */
+       ll_inode_size_lock(inode);
+       result = inode_newsize_ok(inode, size);
+       if (result < 0) {
+               ll_inode_size_unlock(inode);
+               return result;
+       }
+       truncate_setsize(inode, size);
+       ll_inode_size_unlock(inode);
+       return result;
+}
+
+static int vvp_io_setattr_trunc(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               struct inode *inode, loff_t size)
+{
+       inode_dio_wait(inode);
+       return 0;
+}
+
+static int vvp_io_setattr_time(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+       struct cl_io       *io    = ios->cis_io;
+       struct cl_object   *obj   = io->ci_obj;
+       struct cl_attr     *attr  = ccc_env_thread_attr(env);
+       int result;
+       unsigned valid = CAT_CTIME;
+
+       cl_object_attr_lock(obj);
+       attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime;
+       if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) {
+               attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime;
+               valid |= CAT_ATIME;
+       }
+       if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) {
+               attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime;
+               valid |= CAT_MTIME;
+       }
+       result = cl_object_attr_set(env, obj, attr, valid);
+       cl_object_attr_unlock(obj);
+
+       return result;
+}
+
+static int vvp_io_setattr_start(const struct lu_env *env,
+                               const struct cl_io_slice *ios)
+{
+       struct cl_io    *io    = ios->cis_io;
+       struct inode    *inode = ccc_object_inode(io->ci_obj);
+
+       mutex_lock(&inode->i_mutex);
+       if (cl_io_is_trunc(io))
+               return vvp_io_setattr_trunc(env, ios, inode,
+                                           io->u.ci_setattr.sa_attr.lvb_size);
+       else
+               return vvp_io_setattr_time(env, ios);
+}
+
+static void vvp_io_setattr_end(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+       struct cl_io *io    = ios->cis_io;
+       struct inode *inode = ccc_object_inode(io->ci_obj);
+
+       if (cl_io_is_trunc(io)) {
+               /* Truncate in-memory pages - they must be clean
+                * because osc has already been notified to destroy the
+                * osc_extents. */
+               vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
+               inode_dio_write_done(inode);
+       }
+       mutex_unlock(&inode->i_mutex);
+}
+
+static void vvp_io_setattr_fini(const struct lu_env *env,
+                               const struct cl_io_slice *ios)
+{
+       vvp_io_fini(env, ios);
+}
+
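+/* Thin wrappers delegating buffered reads and writes to the kernel's
+ * generic AIO paths, using the iocb and iovec cached in struct ccc_io. */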
+static ssize_t lustre_generic_file_read(struct file *file,
+                                       struct ccc_io *vio, loff_t *ppos)
+{
+       return generic_file_aio_read(vio->cui_iocb, vio->cui_iov,
+                                    vio->cui_nrsegs, *ppos);
+}
+
+static ssize_t lustre_generic_file_write(struct file *file,
+                                       struct ccc_io *vio, loff_t *ppos)
+{
+       return generic_file_aio_write(vio->cui_iocb, vio->cui_iov,
+                                     vio->cui_nrsegs, *ppos);
+}
+
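+/*
+ * CIT_READ start: check the read against the file size, set up the
+ * read-ahead window and dispatch to the normal or splice read path.
+ */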
+static int vvp_io_read_start(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct vvp_io     *vio   = cl2vvp_io(env, ios);
+       struct ccc_io     *cio   = cl2ccc_io(env, ios);
+       struct cl_io      *io    = ios->cis_io;
+       struct cl_object  *obj   = io->ci_obj;
+       struct inode      *inode = ccc_object_inode(obj);
+       struct ll_ra_read *bead  = &vio->cui_bead;
+       struct file       *file  = cio->cui_fd->fd_file;
+
+       int     result;
+       loff_t  pos = io->u.ci_rd.rd.crw_pos;
+       long    cnt = io->u.ci_rd.rd.crw_count;
+       long    tot = cio->cui_tot_count;
+       int     exceed = 0;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+
+       if (!can_populate_pages(env, io, inode))
+               return 0;
+
+       result = ccc_prep_size(env, obj, io, pos, tot, &exceed);
+       if (result != 0)
+               return result;
+       else if (exceed != 0)
+               goto out;
+
+       LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+                       "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
+                       inode->i_ino, cnt, pos, i_size_read(inode));
+
+       /* turn off the kernel's read-ahead */
+       cio->cui_fd->fd_file->f_ra.ra_pages = 0;
+
+       /* initialize read-ahead window once per syscall */
+       if (!vio->cui_ra_window_set) {
+               vio->cui_ra_window_set = 1;
+               bead->lrr_start = cl_index(obj, pos);
+               /*
+                * XXX: explicit PAGE_CACHE_SIZE
+                */
+               bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1);
+               ll_ra_read_in(file, bead);
+       }
+
+       /* BUG: 5972 */
+       file_accessed(file);
+       switch (vio->cui_io_subtype) {
+       case IO_NORMAL:
+                result = lustre_generic_file_read(file, cio, &pos);
+                break;
+       case IO_SPLICE:
+               result = generic_file_splice_read(file, &pos,
+                               vio->u.splice.cui_pipe, cnt,
+                               vio->u.splice.cui_flags);
+               /* LU-1109: do splice read stripe by stripe, otherwise it
+                * may make nfsd stuck if this read occupies all internal
+                * pipe buffers. */
+               io->ci_continue = 0;
+               break;
+       default:
+               CERROR("Wrong IO type %u\n", vio->cui_io_subtype);
+               LBUG();
+       }
+
+out:
+       if (result >= 0) {
+               if (result < cnt)
+                       io->ci_continue = 0;
+               io->ci_nob += result;
+               ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+                                 cio->cui_fd, pos, result, 0);
+               result = 0;
+       }
+       return result;
+}
+
+static void vvp_io_read_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       struct vvp_io *vio = cl2vvp_io(env, ios);
+       struct ccc_io *cio = cl2ccc_io(env, ios);
+
+       if (vio->cui_ra_window_set)
+               ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead);
+
+       vvp_io_fini(env, ios);
+}
+
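+/*
+ * CIT_WRITE start: reposition at EOF for append writes, hand the iovec to
+ * the generic write path and account the bytes written.
+ */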
+static int vvp_io_write_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct ccc_io      *cio   = cl2ccc_io(env, ios);
+       struct cl_io       *io    = ios->cis_io;
+       struct cl_object   *obj   = io->ci_obj;
+       struct inode       *inode = ccc_object_inode(obj);
+       struct file     *file  = cio->cui_fd->fd_file;
+       ssize_t result = 0;
+       loff_t pos = io->u.ci_wr.wr.crw_pos;
+       size_t cnt = io->u.ci_wr.wr.crw_count;
+
+       ENTRY;
+
+       if (!can_populate_pages(env, io, inode))
+               return 0;
+
+       if (cl_io_is_append(io)) {
+               /*
+                * PARALLEL IO: This has to be changed for parallel IO doing
+                * out-of-order writes.
+                */
+               pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode);
+               cio->cui_iocb->ki_pos = pos;
+       }
+
+       CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);
+
+       if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */
+               result = 0;
+       else
+               result = lustre_generic_file_write(file, cio, &pos);
+
+       if (result > 0) {
+               if (result < cnt)
+                       io->ci_continue = 0;
+               io->ci_nob += result;
+               ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+                                 cio->cui_fd, pos, result, 0);
+               result = 0;
+       }
+       RETURN(result);
+}
+
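+/*
+ * Call the kernel's filemap_fault() and translate its VM_FAULT_* result
+ * into an errno; on success a locked page is left in cfio->ft_vmpage.
+ */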
+static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
+{
+       struct vm_fault *vmf = cfio->fault.ft_vmf;
+
+       cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf);
+
+       if (vmf->page) {
+               LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
+                              vmf->virtual_address);
+               if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) {
+                       lock_page(vmf->page);
+                       cfio->fault.ft_flags |= VM_FAULT_LOCKED;
+               }
+
+               cfio->ft_vmpage = vmf->page;
+               return 0;
+       }
+
+       if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) {
+               CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address);
+               return -EFAULT;
+       }
+
+       if (cfio->fault.ft_flags & VM_FAULT_OOM) {
+               CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address);
+               return -ENOMEM;
+       }
+
+       if (cfio->fault.ft_flags & VM_FAULT_RETRY)
+               return -EAGAIN;
+
+       CERROR("unknown error in page fault %d!\n", cfio->fault.ft_flags);
+       return -EINVAL;
+}
+
+
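+/*
+ * CIT_FAULT start: obtain a locked vmpage (via filemap_fault() or from
+ * mkwrite), check for truncate races and attach the matching cl_page to
+ * the fault IO.
+ */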
+static int vvp_io_fault_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct vvp_io       *vio     = cl2vvp_io(env, ios);
+       struct cl_io    *io      = ios->cis_io;
+       struct cl_object    *obj     = io->ci_obj;
+       struct inode    *inode   = ccc_object_inode(obj);
+       struct cl_fault_io  *fio     = &io->u.ci_fault;
+       struct vvp_fault_io *cfio    = &vio->u.fault;
+       loff_t         offset;
+       int               result  = 0;
+       struct page       *vmpage  = NULL;
+       struct cl_page      *page;
+       loff_t         size;
+       pgoff_t       last; /* last page in a file data region */
+
+       if (fio->ft_executable &&
+           LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+               CWARN("binary "DFID
+                     " changed while waiting for the page fault lock\n",
+                     PFID(lu_object_fid(&obj->co_lu)));
+
+       /* offset of the last byte on the page */
+       offset = cl_offset(obj, fio->ft_index + 1) - 1;
+       LASSERT(cl_index(obj, offset) == fio->ft_index);
+       result = ccc_prep_size(env, obj, io, 0, offset + 1, NULL);
+       if (result != 0)
+               return result;
+
+       /* must return locked page */
+       if (fio->ft_mkwrite) {
+               LASSERT(cfio->ft_vmpage != NULL);
+               lock_page(cfio->ft_vmpage);
+       } else {
+               result = vvp_io_kernel_fault(cfio);
+               if (result != 0)
+                       return result;
+       }
+
+       vmpage = cfio->ft_vmpage;
+       LASSERT(PageLocked(vmpage));
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+               ll_invalidate_page(vmpage);
+
+       size = i_size_read(inode);
+       /* Though we have already held a cl_lock upon this page, it
+        * can still be truncated locally. */
+       if (unlikely((vmpage->mapping != inode->i_mapping) ||
+                    (page_offset(vmpage) > size))) {
+               CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
+
+               /* return +1 to stop cl_io_loop(); ll_fault() will catch
+                * it and retry. */
+               GOTO(out, result = +1);
+       }
+
+
+       if (fio->ft_mkwrite) {
+               pgoff_t last_index;
+               /*
+                * Capture the size while holding the lli_trunc_sem from
+                * above; we want to make sure that we complete the mkwrite
+                * action while holding this lock. We also need to make sure
+                * that we are not past the end of the file.
+                */
+               last_index = cl_index(obj, size - 1);
+               if (last_index < fio->ft_index) {
+                       CDEBUG(D_PAGE,
+                               "llite: mkwrite and truncate race happened: "
+                               "%p: 0x%lx 0x%lx\n",
+                               vmpage->mapping, fio->ft_index, last_index);
+                       /*
+                        * We need to return if we are past the end of the
+                        * file. This will propagate up the call stack to
+                        * ll_page_mkwrite, where we will return
+                        * VM_FAULT_NOPAGE. Any non-negative value returned
+                        * here will be silently converted to 0. If
+                        * vmpage->mapping is null, the error code would be
+                        * converted back to ENODATA in ll_page_mkwrite0.
+                        * Thus we return -ENODATA to handle both cases.
+                        */
+                       GOTO(out, result = -ENODATA);
+               }
+       }
+
+       page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
+       if (IS_ERR(page))
+               GOTO(out, result = PTR_ERR(page));
+
+       /* if the page is going to be written, we should add it into the
+        * cache earlier. */
+       if (fio->ft_mkwrite) {
+               wait_on_page_writeback(vmpage);
+               if (set_page_dirty(vmpage)) {
+                       struct ccc_page *cp;
+
+                       /* vvp_page_assume() calls wait_on_page_writeback(). */
+                       cl_page_assume(env, io, page);
+
+                       cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+                       vvp_write_pending(cl2ccc(obj), cp);
+
+                       /* Do not set the Dirty bit here, so that if IO is
+                        * started before the page is really made dirty, we
+                        * still have a chance to detect it. */
+                       result = cl_page_cache_add(env, io, page, CRT_WRITE);
+                       LASSERT(cl_page_is_owned(page, io));
+
+                       vmpage = NULL;
+                       if (result < 0) {
+                               cl_page_unmap(env, io, page);
+                               cl_page_discard(env, io, page);
+                               cl_page_disown(env, io, page);
+
+                               cl_page_put(env, page);
+
+                               /* we're in big trouble, what can we do now? */
+                               if (result == -EDQUOT)
+                                       result = -ENOSPC;
+                               GOTO(out, result);
+                       } else
+                               cl_page_disown(env, io, page);
+               }
+       }
+
+       last = cl_index(obj, size - 1);
+       /*
+        * The ft_index is only used in the case of a mkwrite action.
+        * We need to check that our assertions are correct, since we
+        * should have caught this above.
+        */
+       LASSERT(!fio->ft_mkwrite || fio->ft_index <= last);
+       if (fio->ft_index == last)
+               /*
+                * Last page is mapped partially.
+                */
+               fio->ft_nob = size - cl_offset(obj, fio->ft_index);
+       else
+               fio->ft_nob = cl_page_size(obj);
+
+       lu_ref_add(&page->cp_reference, "fault", io);
+       fio->ft_page = page;
+       EXIT;
+
+out:
+       /* return unlocked vmpage to avoid deadlocking */
+       if (vmpage != NULL)
+               unlock_page(vmpage);
+       cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+       return result;
+}
+
+static int vvp_io_fsync_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       /* we should mark the TOWRITE bit on each dirty page in the radix
+        * tree to verify pages have been written, but this is difficult
+        * because of races. */
+       return 0;
+}
+
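+/*
+ * cio_read_page(): update the per-file read-ahead state, add the page to
+ * the IO queue and submit read-ahead pages along with it.
+ */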
+static int vvp_io_read_page(const struct lu_env *env,
+                           const struct cl_io_slice *ios,
+                           const struct cl_page_slice *slice)
+{
+       struct cl_io          *io     = ios->cis_io;
+       struct cl_object          *obj    = slice->cpl_obj;
+       struct ccc_page    *cp     = cl2ccc_page(slice);
+       struct cl_page      *page   = slice->cpl_page;
+       struct inode          *inode  = ccc_object_inode(obj);
+       struct ll_sb_info        *sbi    = ll_i2sbi(inode);
+       struct ll_file_data       *fd     = cl2ccc_io(env, ios)->cui_fd;
+       struct ll_readahead_state *ras    = &fd->fd_ras;
+       struct page             *vmpage = cp->cpg_page;
+       struct cl_2queue          *queue  = &io->ci_queue;
+       int rc;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+       LASSERT(slice->cpl_obj == obj);
+
+       ENTRY;
+
+       if (sbi->ll_ra_info.ra_max_pages_per_file &&
+           sbi->ll_ra_info.ra_max_pages)
+               ras_update(sbi, inode, ras, page->cp_index,
+                          cp->cpg_defer_uptodate);
+
+       /* Sanity check whether the page is protected by a lock. */
+       rc = cl_page_is_under_lock(env, io, page);
+       if (rc != -EBUSY) {
+               CL_PAGE_HEADER(D_WARNING, env, page, "%s: %d\n",
+                              rc == -ENODATA ? "without a lock" :
+                              "match failed", rc);
+               if (rc != -ENODATA)
+                       RETURN(rc);
+       }
+
+       if (cp->cpg_defer_uptodate) {
+               cp->cpg_ra_used = 1;
+               cl_page_export(env, page, 1);
+       }
+       /*
+        * Add the page into the queue even when it is marked uptodate above;
+        * this will unlock it automatically as part of cl_page_list_disown().
+        */
+       cl_2queue_add(queue, page);
+       if (sbi->ll_ra_info.ra_max_pages_per_file &&
+           sbi->ll_ra_info.ra_max_pages)
+               ll_readahead(env, io, ras,
+                            vmpage->mapping, &queue->c2_qin, fd->fd_flags);
+
+       RETURN(0);
+}
+
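+/* Submit a single page for synchronous transfer and wait for it to
+ * complete. */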
+static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
+                           struct cl_page *page, struct ccc_page *cp,
+                           enum cl_req_type crt)
+{
+       struct cl_2queue  *queue;
+       int result;
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+       queue = &io->ci_queue;
+       cl_2queue_init_page(queue, page);
+
+       result = cl_io_submit_sync(env, io, crt, queue, 0);
+       LASSERT(cl_page_is_owned(page, io));
+
+       if (crt == CRT_READ)
+               /*
+                * In the CRT_WRITE case the page is left locked even in
+                * case of error.
+                */
+               cl_page_list_disown(env, io, &queue->c2_qin);
+       cl_2queue_fini(env, queue);
+
+       return result;
+}
+
+/**
+ * Prepare partially written-to page for a write.
+ */
+static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io,
+                                 struct cl_object *obj, struct cl_page *pg,
+                                 struct ccc_page *cp,
+                                 unsigned from, unsigned to)
+{
+       struct cl_attr *attr   = ccc_env_thread_attr(env);
+       loff_t    offset = cl_offset(obj, pg->cp_index);
+       int          result;
+
+       cl_object_attr_lock(obj);
+       result = cl_object_attr_get(env, obj, attr);
+       cl_object_attr_unlock(obj);
+       if (result == 0) {
+               /*
+                * If we are writing to a new page, no need to read old data.
+                * The extent locking will have updated the KMS, and for our
+                * purposes here we can treat it like i_size.
+                */
+               if (attr->cat_kms <= offset) {
+                       char *kaddr = ll_kmap_atomic(cp->cpg_page, KM_USER0);
+
+                       memset(kaddr, 0, cl_page_size(obj));
+                       ll_kunmap_atomic(kaddr, KM_USER0);
+               } else if (cp->cpg_defer_uptodate)
+                       cp->cpg_ra_used = 1;
+               else
+                       result = vvp_page_sync_io(env, io, pg, cp, CRT_READ);
+               /*
+                * In older implementations, obdo_refresh_inode is called here
+                * to update the inode because the write might modify the
+                * object info at OST. However, this has been proven useless,
+                * since LVB functions will be called when a user space
+                * program tries to retrieve inode attributes.  Also, see
+                * bug 15909 for details. -jay
+                */
+               if (result == 0)
+                       cl_page_export(env, pg, 1);
+       }
+       return result;
+}
+
+static int vvp_io_prepare_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+       struct cl_object *obj    = slice->cpl_obj;
+       struct ccc_page  *cp     = cl2ccc_page(slice);
+       struct cl_page   *pg     = slice->cpl_page;
+       struct page       *vmpage = cp->cpg_page;
+
+       int result;
+
+       ENTRY;
+
+       LINVRNT(cl_page_is_vmlocked(env, pg));
+       LASSERT(vmpage->mapping->host == ccc_object_inode(obj));
+
+       result = 0;
+
+       CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to);
+       if (!PageUptodate(vmpage)) {
+               /*
+                * We're completely overwriting an existing page, so _don't_
+                * set it up to date until commit_write
+                */
+               if (from == 0 && to == PAGE_CACHE_SIZE) {
+                       CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n");
+                       POISON_PAGE(page, 0x11);
+               } else
+                       result = vvp_io_prepare_partial(env, ios->cis_io, obj,
+                                                       pg, cp, from, to);
+       } else
+               CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n");
+       RETURN(result);
+}
+
+static int vvp_io_commit_write(const struct lu_env *env,
+                              const struct cl_io_slice *ios,
+                              const struct cl_page_slice *slice,
+                              unsigned from, unsigned to)
+{
+       struct cl_object  *obj    = slice->cpl_obj;
+       struct cl_io      *io     = ios->cis_io;
+       struct ccc_page   *cp     = cl2ccc_page(slice);
+       struct cl_page    *pg     = slice->cpl_page;
+       struct inode      *inode  = ccc_object_inode(obj);
+       struct ll_sb_info *sbi    = ll_i2sbi(inode);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct page     *vmpage = cp->cpg_page;
+
+       int    result;
+       int    tallyop;
+       loff_t size;
+
+       ENTRY;
+
+       LINVRNT(cl_page_is_vmlocked(env, pg));
+       LASSERT(vmpage->mapping->host == inode);
+
+       LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "committing page write\n");
+       CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to);
+
+       /*
+        * queue a write for some time in the future the first time we
+        * dirty the page.
+        *
+        * This is different from what other file systems do: they usually
+        * just mark page (and some of its buffers) dirty and rely on
+        * balance_dirty_pages() to start a write-back. Lustre wants write-back
+        * to be started earlier for the following reasons:
+        *
+        *     (1) with a large number of clients we need to limit the amount
+        *     of cached data on the clients a lot;
+        *
+        *     (2) large compute jobs generally want compute-only then io-only
+        *     and the IO should complete as quickly as possible;
+        *
+        *     (3) IO is batched up to the RPC size and is async until the
+        *     client max cache is hit
+        *     (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
+        *
+        */
+       if (!PageDirty(vmpage)) {
+               tallyop = LPROC_LL_DIRTY_MISSES;
+               result = cl_page_cache_add(env, io, pg, CRT_WRITE);
+               if (result == 0) {
+                       /* page was added into cache successfully. */
+                       set_page_dirty(vmpage);
+                       vvp_write_pending(cl2ccc(obj), cp);
+               } else if (result == -EDQUOT) {
+                       pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
+                       bool need_clip = true;
+
+                       /*
+                        * Client ran out of disk space grant. Possible
+                        * strategies are:
+                        *
+                        *     (a) do a sync write, renewing grant;
+                        *
+                        *     (b) stop writing on this stripe, switch to the
+                        *     next one.
+                        *
+                        * (b) is a part of "parallel io" design that is the
+                        * ultimate goal. (a) is what "old" client did, and
+                        * what the new code continues to do for the time
+                        * being.
+                        */
+                       if (last_index > pg->cp_index) {
+                               to = PAGE_CACHE_SIZE;
+                               need_clip = false;
+                       } else if (last_index == pg->cp_index) {
+                               int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
+                               if (to < size_to)
+                                       to = size_to;
+                       }
+                       if (need_clip)
+                               cl_page_clip(env, pg, 0, to);
+                       result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE);
+                       if (result)
+                               CERROR("Write page %lu of inode %p failed %d\n",
+                                      pg->cp_index, inode, result);
+               }
+       } else {
+               tallyop = LPROC_LL_DIRTY_HITS;
+               result = 0;
+       }
+       ll_stats_ops_tally(sbi, tallyop, 1);
+
+       /* The inode should be marked DIRTY even if no new page was marked
+        * DIRTY, because a page might not have been flushed between two
+        * modifications. It is important that the file is marked DIRTY as
+        * soon as the I/O is done: when the cache is flushed, the file could
+        * already be closed and it would be too late to warn the MDT.
+        * It is acceptable for the file to be marked DIRTY even if the I/O
+        * is dropped for some reason before being flushed to the OST.
+        */
+       if (result == 0) {
+               spin_lock(&lli->lli_lock);
+               lli->lli_flags |= LLIF_DATA_MODIFIED;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       size = cl_offset(obj, pg->cp_index) + to;
+
+       ll_inode_size_lock(inode);
+       if (result == 0) {
+               if (size > i_size_read(inode)) {
+                       cl_isize_write_nolock(inode, size);
+                       CDEBUG(D_VFSTRACE, DFID" updating i_size %lu\n",
+                              PFID(lu_object_fid(&obj->co_lu)),
+                              (unsigned long)size);
+               }
+               cl_page_export(env, pg, 1);
+       } else {
+               if (size > i_size_read(inode))
+                       cl_page_discard(env, io, pg);
+       }
+       ll_inode_size_unlock(inode);
+       RETURN(result);
+}
+
+static const struct cl_io_operations vvp_io_ops = {
+       .op = {
+               [CIT_READ] = {
+                       .cio_fini      = vvp_io_read_fini,
+                       .cio_lock      = vvp_io_read_lock,
+                       .cio_start     = vvp_io_read_start,
+                       .cio_advance   = ccc_io_advance
+               },
+               [CIT_WRITE] = {
+                       .cio_fini      = vvp_io_fini,
+                       .cio_lock      = vvp_io_write_lock,
+                       .cio_start     = vvp_io_write_start,
+                       .cio_advance   = ccc_io_advance
+               },
+               [CIT_SETATTR] = {
+                       .cio_fini       = vvp_io_setattr_fini,
+                       .cio_iter_init  = vvp_io_setattr_iter_init,
+                       .cio_lock       = vvp_io_setattr_lock,
+                       .cio_start      = vvp_io_setattr_start,
+                       .cio_end        = vvp_io_setattr_end
+               },
+               [CIT_FAULT] = {
+                       .cio_fini      = vvp_io_fault_fini,
+                       .cio_iter_init = vvp_io_fault_iter_init,
+                       .cio_lock      = vvp_io_fault_lock,
+                       .cio_start     = vvp_io_fault_start,
+                       .cio_end       = ccc_io_end
+               },
+               [CIT_FSYNC] = {
+                       .cio_start  = vvp_io_fsync_start,
+                       .cio_fini   = vvp_io_fini
+               },
+               [CIT_MISC] = {
+                       .cio_fini   = vvp_io_fini
+               }
+       },
+       .cio_read_page     = vvp_io_read_page,
+       .cio_prepare_write = vvp_io_prepare_write,
+       .cio_commit_write  = vvp_io_commit_write
+};
+
+int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_io *io)
+{
+       struct vvp_io      *vio   = vvp_env_io(env);
+       struct ccc_io      *cio   = ccc_env_io(env);
+       struct inode       *inode = ccc_object_inode(obj);
+       int              result;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+       ENTRY;
+
+       CL_IO_SLICE_CLEAN(cio, cui_cl);
+       cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops);
+       vio->cui_ra_window_set = 0;
+       result = 0;
+       if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) {
+               size_t count;
+               struct ll_inode_info *lli = ll_i2info(inode);
+
+               count = io->u.ci_rw.crw_count;
+               /* "If nbyte is 0, read() will return 0 and have no other
+                *  results."  -- Single Unix Spec */
+               if (count == 0)
+                       result = 1;
+               else {
+                       cio->cui_tot_count = count;
+                       cio->cui_tot_nrsegs = 0;
+               }
+               /* for read/write, we store the jobid in the inode, and
+                * it'll be fetched by osc when building RPC.
+                *
+                * it's not accurate if the file is shared by different
+                * jobs.
+                */
+               lustre_get_jobid(lli->lli_jobid);
+       } else if (io->ci_type == CIT_SETATTR) {
+               if (!cl_io_is_trunc(io))
+                       io->ci_lockreq = CILR_MANDATORY;
+       }
+
+       /* ignore layout change for generic CIT_MISC but not for glimpse.
+        * io context for glimpse must set ci_verify_layout to true,
+        * see cl_glimpse_size0() for details. */
+       if (io->ci_type == CIT_MISC && !io->ci_verify_layout)
+               io->ci_ignore_layout = 1;
+
+       /* Enqueue layout lock and get layout version. We need to do this
+        * even for operations that require an open file, such as read and
+        * write, because the layout lock might not be granted in IT_OPEN. */
+       if (result == 0 && !io->ci_ignore_layout)
+               result = ll_layout_refresh(inode, &cio->cui_layout_gen);
+
+       RETURN(result);
+}
+
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+                               const struct cl_io_slice *slice)
+{
+       /* Calling just for the assertions */
+       cl2ccc_io(env, slice);
+       return vvp_env_io(env);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c
new file mode 100644 (file)
index 0000000..9b8712b
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp lock functions.
+ *
+ */
+
+/**
+ * Estimates lock value for the purpose of managing the lock cache during
+ * memory shortages.
+ *
+ * Locks for memory mapped files are almost infinitely precious, others are
+ * junk. "Mapped locks" are heavy, but not infinitely heavy, so that they are
+ * ordered within themselves by weights assigned from other layers.
+ */
+static unsigned long vvp_lock_weigh(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice)
+{
+       struct ccc_object *cob = cl2ccc(slice->cls_obj);
+
+       ENTRY;
+       RETURN(atomic_read(&cob->cob_mmap_cnt) > 0 ? ~0UL >> 2 : 0);
+}
+
+static const struct cl_lock_operations vvp_lock_ops = {
+       .clo_delete    = ccc_lock_delete,
+       .clo_fini      = ccc_lock_fini,
+       .clo_enqueue   = ccc_lock_enqueue,
+       .clo_wait      = ccc_lock_wait,
+       .clo_unuse     = ccc_lock_unuse,
+       .clo_fits_into = ccc_lock_fits_into,
+       .clo_state     = ccc_lock_state,
+       .clo_weigh     = vvp_lock_weigh
+};
+
+int vvp_lock_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_lock *lock, const struct cl_io *io)
+{
+       return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c
new file mode 100644 (file)
index 0000000..01edc5b
--- /dev/null
@@ -0,0 +1,186 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_object implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
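+/* Debugging helper: print VVP-level object state (pending list, transient
+ * page and mmap counters) together with basic inode attributes. */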
+static int vvp_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *o)
+{
+       struct ccc_object    *obj   = lu2ccc(o);
+       struct inode     *inode = obj->cob_inode;
+       struct ll_inode_info *lli;
+
+       (*p)(env, cookie, "(%s %d %d) inode: %p ",
+            list_empty(&obj->cob_pending_list) ? "-" : "+",
+            obj->cob_transient_pages, atomic_read(&obj->cob_mmap_cnt),
+            inode);
+       if (inode) {
+               lli = ll_i2info(inode);
+               (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID,
+                    inode->i_ino, inode->i_generation, inode->i_mode,
+                    inode->i_nlink, atomic_read(&inode->i_count),
+                    lli->lli_clob, PFID(&lli->lli_fid));
+       }
+       return 0;
+}
+
+static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_attr *attr)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       /*
+        * lov overwrites most of these fields in
+        * lov_attr_get()->...lov_merge_lvb_kms(), except when inode
+        * attributes are newer.
+        */
+
+       attr->cat_size = i_size_read(inode);
+       attr->cat_mtime = LTIME_S(inode->i_mtime);
+       attr->cat_atime = LTIME_S(inode->i_atime);
+       attr->cat_ctime = LTIME_S(inode->i_ctime);
+       attr->cat_blocks = inode->i_blocks;
+       attr->cat_uid = inode->i_uid;
+       attr->cat_gid = inode->i_gid;
+       /* KMS is not known by this layer */
+       return 0; /* layers below have to fill in the rest */
+}
+
+static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_attr *attr, unsigned valid)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       if (valid & CAT_UID)
+               inode->i_uid = attr->cat_uid;
+       if (valid & CAT_GID)
+               inode->i_gid = attr->cat_gid;
+       if (valid & CAT_ATIME)
+               LTIME_S(inode->i_atime) = attr->cat_atime;
+       if (valid & CAT_MTIME)
+               LTIME_S(inode->i_mtime) = attr->cat_mtime;
+       if (valid & CAT_CTIME)
+               LTIME_S(inode->i_ctime) = attr->cat_ctime;
+       if (0 && valid & CAT_SIZE)
+               cl_isize_write_nolock(inode, attr->cat_size);
+       /* not currently necessary */
+       if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE))
+               mark_inode_dirty(inode);
+       return 0;
+}
+
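+/*
+ * Handle layout configuration changes by recording or invalidating the
+ * layout generation cached in the Lustre inode.
+ */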
+int vvp_conf_set(const struct lu_env *env, struct cl_object *obj,
+               const struct cl_object_conf *conf)
+{
+       struct ll_inode_info *lli = ll_i2info(conf->coc_inode);
+
+       if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+               lli->lli_layout_gen = LL_LAYOUT_GEN_NONE;
+               return 0;
+       }
+
+       if (conf->coc_opc != OBJECT_CONF_SET)
+               return 0;
+
+       if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL) {
+               CDEBUG(D_VFSTRACE, "layout lock change: %u -> %u\n",
+                       lli->lli_layout_gen,
+                       conf->u.coc_md->lsm->lsm_layout_gen);
+
+               lli->lli_has_smd = true;
+               lli->lli_layout_gen = conf->u.coc_md->lsm->lsm_layout_gen;
+       } else {
+               CDEBUG(D_VFSTRACE, "layout lock destroyed: %u.\n",
+                       lli->lli_layout_gen);
+
+               lli->lli_has_smd = false;
+               lli->lli_layout_gen = LL_LAYOUT_GEN_EMPTY;
+       }
+       return 0;
+}
+
+static const struct cl_object_operations vvp_ops = {
+       .coo_page_init = vvp_page_init,
+       .coo_lock_init = vvp_lock_init,
+       .coo_io_init   = vvp_io_init,
+       .coo_attr_get  = vvp_attr_get,
+       .coo_attr_set  = vvp_attr_set,
+       .coo_conf_set  = vvp_conf_set,
+       .coo_glimpse   = ccc_object_glimpse
+};
+
+static const struct lu_object_operations vvp_lu_obj_ops = {
+       .loo_object_init  = ccc_object_init,
+       .loo_object_free  = ccc_object_free,
+       .loo_object_print = vvp_object_print
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode)
+{
+       struct cl_inode_info *lli = cl_i2info(inode);
+       struct cl_object     *obj = lli->lli_clob;
+       struct lu_object     *lu;
+
+       LASSERT(obj != NULL);
+       lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type);
+       LASSERT(lu != NULL);
+       return lu2ccc(lu);
+}
+
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *hdr,
+                                  struct lu_device *dev)
+{
+       return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c
new file mode 100644 (file)
index 0000000..4568e69
--- /dev/null
@@ -0,0 +1,558 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
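+/* Drop the page cache reference taken on the vmpage in vvp_page_init(). */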
+static void vvp_page_fini_common(struct ccc_page *cp)
+{
+       struct page *vmpage = cp->cpg_page;
+
+       LASSERT(vmpage != NULL);
+       page_cache_release(vmpage);
+}
+
+static void vvp_page_fini(const struct lu_env *env,
+                         struct cl_page_slice *slice)
+{
+       struct ccc_page *cp = cl2ccc_page(slice);
+       struct page *vmpage  = cp->cpg_page;
+
+       /*
+        * vmpage->private was already cleared when page was moved into
+        * VPG_FREEING state.
+        */
+       LASSERT((struct cl_page *)vmpage->private != slice->cpl_page);
+       vvp_page_fini_common(cp);
+}
+
+static int vvp_page_own(const struct lu_env *env,
+                       const struct cl_page_slice *slice, struct cl_io *io,
+                       int nonblock)
+{
+       struct ccc_page *vpg    = cl2ccc_page(slice);
+       struct page      *vmpage = vpg->cpg_page;
+
+       LASSERT(vmpage != NULL);
+       if (nonblock) {
+               if (!trylock_page(vmpage))
+                       return -EAGAIN;
+
+               if (unlikely(PageWriteback(vmpage))) {
+                       unlock_page(vmpage);
+                       return -EAGAIN;
+               }
+
+               return 0;
+       }
+
+       lock_page(vmpage);
+       wait_on_page_writeback(vmpage);
+       return 0;
+}
+
+static void vvp_page_assume(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *unused)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+       wait_on_page_writeback(vmpage);
+}
+
+static void vvp_page_unassume(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *unused)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+}
+
+static void vvp_page_disown(const struct lu_env *env,
+                           const struct cl_page_slice *slice, struct cl_io *io)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+
+       unlock_page(cl2vm_page(slice));
+}
+
+static void vvp_page_discard(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *unused)
+{
+       struct page        *vmpage  = cl2vm_page(slice);
+       struct address_space *mapping;
+       struct ccc_page      *cpg     = cl2ccc_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+
+       mapping = vmpage->mapping;
+
+       if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used)
+               ll_ra_stats_inc(mapping, RA_STAT_DISCARDED);
+
+       /*
+        * truncate_complete_page() calls
+        * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete().
+        */
+       truncate_complete_page(mapping, vmpage);
+}
+
+static int vvp_page_unmap(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         struct cl_io *unused)
+{
+       struct page *vmpage = cl2vm_page(slice);
+       __u64       offset;
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+
+       offset = vmpage->index << PAGE_CACHE_SHIFT;
+
+       /*
+        * XXX is it safe to call this with the page lock held?
+        */
+       ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE);
+       return 0;
+}
+
+static void vvp_page_delete(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+       struct page       *vmpage = cl2vm_page(slice);
+       struct inode     *inode  = vmpage->mapping->host;
+       struct cl_object *obj    = slice->cpl_obj;
+
+       LASSERT(PageLocked(vmpage));
+       LASSERT((struct cl_page *)vmpage->private == slice->cpl_page);
+       LASSERT(inode == ccc_object_inode(obj));
+
+       vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice));
+       ClearPagePrivate(vmpage);
+       vmpage->private = 0;
+       /*
+        * Reference from vmpage to cl_page is removed, but the reference back
+        * is still here. It is removed later in vvp_page_fini().
+        */
+}
+
+static void vvp_page_export(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           int uptodate)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+       if (uptodate)
+               SetPageUptodate(vmpage);
+       else
+               ClearPageUptodate(vmpage);
+}
+
+static int vvp_page_is_vmlocked(const struct lu_env *env,
+                               const struct cl_page_slice *slice)
+{
+       return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA;
+}
+
+static int vvp_page_prep_read(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *unused)
+{
+       ENTRY;
+       /* Skip the page already marked as PG_uptodate. */
+       RETURN(PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0);
+}
+
+static int vvp_page_prep_write(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *unused)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(PageLocked(vmpage));
+       LASSERT(!PageDirty(vmpage));
+
+       set_page_writeback(vmpage);
+       vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice));
+
+       return 0;
+}
+
+/**
+ * Handles page transfer errors at VM level.
+ *
+ * This takes inode as a separate argument, because inode on which error is to
+ * be set can be different from \a vmpage inode in case of direct-io.
+ */
+static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret)
+{
+       struct ccc_object *obj = cl_inode2ccc(inode);
+
+       if (ioret == 0) {
+               ClearPageError(vmpage);
+               obj->cob_discard_page_warned = 0;
+       } else {
+               SetPageError(vmpage);
+               if (ioret == -ENOSPC)
+                       set_bit(AS_ENOSPC, &inode->i_mapping->flags);
+               else
+                       set_bit(AS_EIO, &inode->i_mapping->flags);
+
+               if ((ioret == -ESHUTDOWN || ioret == -EINTR) &&
+                    obj->cob_discard_page_warned == 0) {
+                       obj->cob_discard_page_warned = 1;
+                       ll_dirty_page_discard_warn(vmpage, ioret);
+               }
+       }
+}
+
+static void vvp_page_completion_read(const struct lu_env *env,
+                                    const struct cl_page_slice *slice,
+                                    int ioret)
+{
+       struct ccc_page *cp     = cl2ccc_page(slice);
+       struct page      *vmpage = cp->cpg_page;
+       struct cl_page  *page   = cl_page_top(slice->cpl_page);
+       struct inode    *inode  = ccc_object_inode(page->cp_obj);
+       ENTRY;
+
+       LASSERT(PageLocked(vmpage));
+       CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret);
+
+       if (cp->cpg_defer_uptodate)
+               ll_ra_count_put(ll_i2sbi(inode), 1);
+
+       if (ioret == 0)  {
+               if (!cp->cpg_defer_uptodate)
+                       cl_page_export(env, page, 1);
+       } else
+               cp->cpg_defer_uptodate = 0;
+
+       if (page->cp_sync_io == NULL)
+               unlock_page(vmpage);
+
+       EXIT;
+}
+
+static void vvp_page_completion_write(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     int ioret)
+{
+       struct ccc_page *cp     = cl2ccc_page(slice);
+       struct cl_page  *pg     = slice->cpl_page;
+       struct page      *vmpage = cp->cpg_page;
+       ENTRY;
+
+       LASSERT(ergo(pg->cp_sync_io != NULL, PageLocked(vmpage)));
+       LASSERT(PageWriteback(vmpage));
+
+       CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
+
+       /*
+        * TODO: Actually it makes sense to add the page back into the oap
+        * pending list, so that we don't need to take the page out of the
+        * SoM write pending list if we just hit a recoverable error
+        * (-ENOMEM, etc.).
+        * To implement this, we just need to return a non-zero value from
+        * the ->cpo_completion method. The underlying transfer should be
+        * notified and should then re-add the page into the pending
+        * transfer queue.  -jay
+        */
+
+       cp->cpg_write_queued = 0;
+       vvp_write_complete(cl2ccc(slice->cpl_obj), cp);
+
+       /*
+        * Only mark the page with an error when it's an async write,
+        * because applications won't wait for the IO to finish.
+        */
+       if (pg->cp_sync_io == NULL)
+               vvp_vmpage_error(ccc_object_inode(pg->cp_obj), vmpage, ioret);
+
+       end_page_writeback(vmpage);
+       EXIT;
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank a page from the transfer cache and to send it out as
+ * a part of a transfer. This function try-locks the page. If the try-lock
+ * fails, the page is owned by some concurrent IO and should be skipped
+ * (this is a bad, but hopefully rare, situation, as it usually results in
+ * the transfer being shorter than possible).
+ *
+ * \retval 0      success, page can be placed into transfer
+ *
+ * \retval -EAGAIN page is either used by concurrent IO or has been
+ * truncated. Skip it.
+ */
+static int vvp_page_make_ready(const struct lu_env *env,
+                              const struct cl_page_slice *slice)
+{
+       struct page *vmpage = cl2vm_page(slice);
+       struct cl_page *pg = slice->cpl_page;
+       int result = 0;
+
+       lock_page(vmpage);
+       if (clear_page_dirty_for_io(vmpage)) {
+               LASSERT(pg->cp_state == CPS_CACHED);
+               /* This actually clears the dirty bit in the radix
+                * tree. */
+               set_page_writeback(vmpage);
+               vvp_write_pending(cl2ccc(slice->cpl_obj),
+                               cl2ccc_page(slice));
+               CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
+       } else if (pg->cp_state == CPS_PAGEOUT) {
+               /* is it possible for osc_flush_async_page() to already
+                * make it ready? */
+               result = -EALREADY;
+       } else {
+               CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpected page state %d.\n",
+                             pg->cp_state);
+               LBUG();
+       }
+       unlock_page(vmpage);
+       RETURN(result);
+}
+
+static int vvp_page_print(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         void *cookie, lu_printer_t printer)
+{
+       struct ccc_page *vp = cl2ccc_page(slice);
+       struct page      *vmpage = vp->cpg_page;
+
+       (*printer)(env, cookie, LUSTRE_VVP_NAME"-page@%p(%d:%d:%d) "
+                  "vm@%p ",
+                  vp, vp->cpg_defer_uptodate, vp->cpg_ra_used,
+                  vp->cpg_write_queued, vmpage);
+       if (vmpage != NULL) {
+               (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru",
+                          (long)vmpage->flags, page_count(vmpage),
+                          page_mapcount(vmpage), vmpage->private,
+                          page_index(vmpage),
+                          list_empty(&vmpage->lru) ? "not-" : "");
+       }
+       (*printer)(env, cookie, "\n");
+       return 0;
+}
+
+static const struct cl_page_operations vvp_page_ops = {
+       .cpo_own           = vvp_page_own,
+       .cpo_assume     = vvp_page_assume,
+       .cpo_unassume      = vvp_page_unassume,
+       .cpo_disown     = vvp_page_disown,
+       .cpo_vmpage     = ccc_page_vmpage,
+       .cpo_discard       = vvp_page_discard,
+       .cpo_delete     = vvp_page_delete,
+       .cpo_unmap       = vvp_page_unmap,
+       .cpo_export     = vvp_page_export,
+       .cpo_is_vmlocked   = vvp_page_is_vmlocked,
+       .cpo_fini         = vvp_page_fini,
+       .cpo_print       = vvp_page_print,
+       .cpo_is_under_lock = ccc_page_is_under_lock,
+       .io = {
+               [CRT_READ] = {
+                       .cpo_prep       = vvp_page_prep_read,
+                       .cpo_completion  = vvp_page_completion_read,
+                       .cpo_make_ready  = ccc_fail,
+               },
+               [CRT_WRITE] = {
+                       .cpo_prep       = vvp_page_prep_write,
+                       .cpo_completion  = vvp_page_completion_write,
+                       .cpo_make_ready  = vvp_page_make_ready,
+               }
+       }
+};
+
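+/* Transient pages are only manipulated with the inode's i_mutex held;
+ * assert that here. */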
+static void vvp_transient_page_verify(const struct cl_page *page)
+{
+       struct inode *inode = ccc_object_inode(page->cp_obj);
+
+       LASSERT(!mutex_trylock(&inode->i_mutex));
+}
+
+static int vvp_transient_page_own(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *unused, int nonblock)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+       return 0;
+}
+
+static void vvp_transient_page_assume(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *unused)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_unassume(const struct lu_env *env,
+                                       const struct cl_page_slice *slice,
+                                       struct cl_io *unused)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_disown(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *unused)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_discard(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *unused)
+{
+       struct cl_page *page = slice->cpl_page;
+
+       vvp_transient_page_verify(slice->cpl_page);
+
+       /*
+        * For a transient page, remove it from the radix tree.
+        */
+       cl_page_delete(env, page);
+}
+
+static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
+                                         const struct cl_page_slice *slice)
+{
+       struct inode    *inode = ccc_object_inode(slice->cpl_obj);
+       int     locked;
+
+       locked = !mutex_trylock(&inode->i_mutex);
+       if (!locked)
+               mutex_unlock(&inode->i_mutex);
+       return locked ? -EBUSY : -ENODATA;
+}
+
+static void
+vvp_transient_page_completion(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             int ioret)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_fini(const struct lu_env *env,
+                                   struct cl_page_slice *slice)
+{
+       struct ccc_page *cp = cl2ccc_page(slice);
+       struct cl_page *clp = slice->cpl_page;
+       struct ccc_object *clobj = cl2ccc(clp->cp_obj);
+
+       vvp_page_fini_common(cp);
+       LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+       clobj->cob_transient_pages--;
+}
+
+static const struct cl_page_operations vvp_transient_page_ops = {
+       .cpo_own           = vvp_transient_page_own,
+       .cpo_assume        = vvp_transient_page_assume,
+       .cpo_unassume      = vvp_transient_page_unassume,
+       .cpo_disown        = vvp_transient_page_disown,
+       .cpo_discard       = vvp_transient_page_discard,
+       .cpo_vmpage        = ccc_page_vmpage,
+       .cpo_fini          = vvp_transient_page_fini,
+       .cpo_is_vmlocked   = vvp_transient_page_is_vmlocked,
+       .cpo_print         = vvp_page_print,
+       .cpo_is_under_lock = ccc_page_is_under_lock,
+       .io = {
+               [CRT_READ] = {
+                       .cpo_prep        = ccc_transient_page_prep,
+                       .cpo_completion  = vvp_transient_page_completion,
+               },
+               [CRT_WRITE] = {
+                       .cpo_prep        = ccc_transient_page_prep,
+                       .cpo_completion  = vvp_transient_page_completion,
+               }
+       }
+};
+
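+/*
+ * Attach the VVP slice to a cl_page: a cacheable page is linked to its VM
+ * page through ->private and uses vvp_page_ops, while a transient page uses
+ * vvp_transient_page_ops and is counted on the owning ccc_object.
+ */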
+int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_page *page, struct page *vmpage)
+{
+       struct ccc_page *cpg = cl_object_page_slice(obj, page);
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       cpg->cpg_page = vmpage;
+       page_cache_get(vmpage);
+
+       INIT_LIST_HEAD(&cpg->cpg_pending_linkage);
+       if (page->cp_type == CPT_CACHEABLE) {
+               SetPagePrivate(vmpage);
+               vmpage->private = (unsigned long)page;
+               cl_page_slice_add(page, &cpg->cpg_cl, obj,
+                               &vvp_page_ops);
+       } else {
+               struct ccc_object *clobj = cl2ccc(obj);
+
+               LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+               cl_page_slice_add(page, &cpg->cpg_cl, obj,
+                               &vvp_transient_page_ops);
+               clobj->cob_transient_pages++;
+       }
+       return 0;
+}
diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c
new file mode 100644 (file)
index 0000000..4176264
--- /dev/null
@@ -0,0 +1,578 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/selinux.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_eacl.h>
+
+#include "llite_internal.h"
+
+#define XATTR_USER_T            (1)
+#define XATTR_TRUSTED_T         (2)
+#define XATTR_SECURITY_T        (3)
+#define XATTR_ACL_ACCESS_T      (4)
+#define XATTR_ACL_DEFAULT_T     (5)
+#define XATTR_LUSTRE_T          (6)
+#define XATTR_OTHER_T           (7)
+
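+/* Classify an xattr name into one of the XATTR_*_T types above by prefix. */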
+static
+int get_xattr_type(const char *name)
+{
+       if (!strcmp(name, POSIX_ACL_XATTR_ACCESS))
+               return XATTR_ACL_ACCESS_T;
+
+       if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT))
+               return XATTR_ACL_DEFAULT_T;
+
+       if (!strncmp(name, XATTR_USER_PREFIX,
+                    sizeof(XATTR_USER_PREFIX) - 1))
+               return XATTR_USER_T;
+
+       if (!strncmp(name, XATTR_TRUSTED_PREFIX,
+                    sizeof(XATTR_TRUSTED_PREFIX) - 1))
+               return XATTR_TRUSTED_T;
+
+       if (!strncmp(name, XATTR_SECURITY_PREFIX,
+                    sizeof(XATTR_SECURITY_PREFIX) - 1))
+               return XATTR_SECURITY_T;
+
+       if (!strncmp(name, XATTR_LUSTRE_PREFIX,
+                    sizeof(XATTR_LUSTRE_PREFIX) - 1))
+               return XATTR_LUSTRE_T;
+
+       return XATTR_OTHER_T;
+}
+
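+/*
+ * Reject xattr types that are disabled by mount flags (ACLs, user xattrs),
+ * that require CAP_SYS_ADMIN (trusted xattrs), or that are not recognized.
+ */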
+static
+int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
+{
+       if ((xattr_type == XATTR_ACL_ACCESS_T ||
+            xattr_type == XATTR_ACL_DEFAULT_T) &&
+          !(sbi->ll_flags & LL_SBI_ACL))
+               return -EOPNOTSUPP;
+
+       if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
+               return -EOPNOTSUPP;
+       if (xattr_type == XATTR_TRUSTED_T && !cfs_capable(CFS_CAP_SYS_ADMIN))
+               return -EPERM;
+       if (xattr_type == XATTR_OTHER_T)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+static
+int ll_setxattr_common(struct inode *inode, const char *name,
+                      const void *value, size_t size,
+                      int flags, __u64 valid)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *req;
+       int xattr_type, rc;
+       struct obd_capa *oc;
+       posix_acl_xattr_header *new_value = NULL;
+       struct rmtacl_ctl_entry *rce = NULL;
+       ext_acl_xattr_header *acl = NULL;
+       const char *pv = value;
+       ENTRY;
+
+       xattr_type = get_xattr_type(name);
+       rc = xattr_type_filter(sbi, xattr_type);
+       if (rc)
+               RETURN(rc);
+
+       /* b10667: ignore lustre special xattr for now */
+       if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) ||
+           (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0))
+               RETURN(0);
+
+       /* b15587: ignore security.capability xattr for now */
+       if ((xattr_type == XATTR_SECURITY_T &&
+           strcmp(name, "security.capability") == 0))
+               RETURN(0);
+
+       /* LU-549:  Disable security.selinux when selinux is disabled */
+       if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+           strcmp(name, "security.selinux") == 0)
+               RETURN(-EOPNOTSUPP);
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+           (xattr_type == XATTR_ACL_ACCESS_T ||
+           xattr_type == XATTR_ACL_DEFAULT_T)) {
+               rce = rct_search(&sbi->ll_rct, current_pid());
+               if (rce == NULL ||
+                   (rce->rce_ops != RMT_LSETFACL &&
+                   rce->rce_ops != RMT_RSETFACL))
+                       RETURN(-EOPNOTSUPP);
+
+               if (rce->rce_ops == RMT_LSETFACL) {
+                       struct eacl_entry *ee;
+
+                       ee = et_search_del(&sbi->ll_et, current_pid(),
+                                          ll_inode2fid(inode), xattr_type);
+                       LASSERT(ee != NULL);
+                       if (valid & OBD_MD_FLXATTR) {
+                               acl = lustre_acl_xattr_merge2ext(
+                                               (posix_acl_xattr_header *)value,
+                                               size, ee->ee_acl);
+                               if (IS_ERR(acl)) {
+                                       ee_free(ee);
+                                       RETURN(PTR_ERR(acl));
+                               }
+                               size = CFS_ACL_XATTR_SIZE(
+                                               le32_to_cpu(acl->a_count),
+                                               ext_acl_xattr);
+                               pv = (const char *)acl;
+                       }
+                       ee_free(ee);
+               } else if (rce->rce_ops == RMT_RSETFACL) {
+                       size = lustre_posix_acl_xattr_filter(
+                                               (posix_acl_xattr_header *)value,
+                                               size, &new_value);
+                       if (unlikely(size < 0))
+                               RETURN(size);
+
+                       pv = (const char *)new_value;
+               } else
+                       RETURN(-EOPNOTSUPP);
+
+               valid |= rce_ops2valid(rce->rce_ops);
+       }
+#endif
+       oc = ll_mdscapa_get(inode);
+       rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+                        valid, name, pv, size, 0, flags, ll_i2suppgid(inode),
+                        &req);
+       capa_put(oc);
+#ifdef CONFIG_FS_POSIX_ACL
+       if (new_value != NULL)
+               lustre_posix_acl_xattr_free(new_value, size);
+       if (acl != NULL)
+               lustre_ext_acl_xattr_free(acl);
+#endif
+       if (rc) {
+               if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+                       LCONSOLE_INFO("Disabling user_xattr feature because "
+                                     "it is not supported on the server\n");
+                       sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+               }
+               RETURN(rc);
+       }
+
+       ptlrpc_req_finished(req);
+       RETURN(0);
+}
+
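+/*
+ * VFS ->setxattr: "lov" xattrs are translated into stripe-setting calls,
+ * LMA and link xattrs are ignored, everything else goes through
+ * ll_setxattr_common().
+ */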
+int ll_setxattr(struct dentry *dentry, const char *name,
+               const void *value, size_t size, int flags)
+{
+       struct inode *inode = dentry->d_inode;
+
+       LASSERT(inode);
+       LASSERT(name);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+              inode->i_ino, inode->i_generation, inode, name);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1);
+
+       if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+                    sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+            strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+           (strncmp(name, XATTR_LUSTRE_PREFIX,
+                    sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+            strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
+               struct lov_user_md *lump = (struct lov_user_md *)value;
+               int rc = 0;
+
+               /* Attributes that are saved via getxattr will always have
+                * the stripe_offset as 0.  Instead, the MDS should be
+                * allowed to pick the starting OST index.   b=17846 */
+               if (lump != NULL && lump->lmm_stripe_offset == 0)
+                       lump->lmm_stripe_offset = -1;
+
+               if (lump != NULL && S_ISREG(inode->i_mode)) {
+                       struct file f;
+                       int flags = FMODE_WRITE;
+                       int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ?
+                               sizeof(*lump) : sizeof(struct lov_user_md_v3);
+
+                       f.f_dentry = dentry;
+                       rc = ll_lov_setstripe_ea_info(inode, &f, flags, lump,
+                                                     lum_size);
+                       /* b10667: rc is always 0 here for now */
+                       rc = 0;
+               } else if (S_ISDIR(inode->i_mode)) {
+                       rc = ll_dir_setstripe(inode, lump, 0);
+               }
+
+               return rc;
+
+       } else if (strcmp(name, XATTR_NAME_LMA) == 0 ||
+                  strcmp(name, XATTR_NAME_LINK) == 0)
+               return 0;
+
+       return ll_setxattr_common(inode, name, value, size, flags,
+                                 OBD_MD_FLXATTR);
+}
+
+int ll_removexattr(struct dentry *dentry, const char *name)
+{
+       struct inode *inode = dentry->d_inode;
+
+       LASSERT(inode);
+       LASSERT(name);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+              inode->i_ino, inode->i_generation, inode, name);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1);
+       return ll_setxattr_common(inode, name, NULL, 0, 0,
+                                 OBD_MD_FLXATTRRM);
+}
+
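+/*
+ * Common helper for getxattr and listxattr: when @name is NULL the request
+ * lists all xattr names, otherwise it fetches the named xattr's value.
+ */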
+static
+int ll_getxattr_common(struct inode *inode, const char *name,
+                      void *buffer, size_t size, __u64 valid)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *req = NULL;
+       struct mdt_body *body;
+       int xattr_type, rc;
+       void *xdata;
+       struct obd_capa *oc;
+       struct rmtacl_ctl_entry *rce = NULL;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+              inode->i_ino, inode->i_generation, inode);
+
+       /* listxattr has slightly different behavior from that of ext3:
+        * without 'user_xattr', ext3 still lists all xattr names but
+        * filters out "^user..*"; we list them all for simplicity.
+        */
+       if (!name) {
+               xattr_type = XATTR_OTHER_T;
+               goto do_getxattr;
+       }
+
+       xattr_type = get_xattr_type(name);
+       rc = xattr_type_filter(sbi, xattr_type);
+       if (rc)
+               RETURN(rc);
+
+       /* b15587: ignore security.capability xattr for now */
+       if ((xattr_type == XATTR_SECURITY_T &&
+           strcmp(name, "security.capability") == 0))
+               RETURN(-ENODATA);
+
+       /* LU-549:  Disable security.selinux when selinux is disabled */
+       if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+           strcmp(name, "security.selinux") == 0)
+               RETURN(-EOPNOTSUPP);
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+           (xattr_type == XATTR_ACL_ACCESS_T ||
+           xattr_type == XATTR_ACL_DEFAULT_T)) {
+               rce = rct_search(&sbi->ll_rct, current_pid());
+               if (rce == NULL ||
+                   (rce->rce_ops != RMT_LSETFACL &&
+                   rce->rce_ops != RMT_LGETFACL &&
+                   rce->rce_ops != RMT_RSETFACL &&
+                   rce->rce_ops != RMT_RGETFACL))
+                       RETURN(-EOPNOTSUPP);
+       }
+
+       /* The posix ACL is protected by the LOOKUP lock. By the time we get
+        * here we have just resolved the path to the target inode, so there
+        * is a good chance that the cached ACL is up to date.
+        */
+       if (xattr_type == XATTR_ACL_ACCESS_T &&
+           !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+               struct ll_inode_info *lli = ll_i2info(inode);
+               struct posix_acl *acl;
+
+               spin_lock(&lli->lli_lock);
+               acl = posix_acl_dup(lli->lli_posix_acl);
+               spin_unlock(&lli->lli_lock);
+
+               if (!acl)
+                       RETURN(-ENODATA);
+
+               rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+               posix_acl_release(acl);
+               RETURN(rc);
+       }
+       if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode))
+               RETURN(-ENODATA);
+#endif
+
+do_getxattr:
+       oc = ll_mdscapa_get(inode);
+       rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+                        valid | (rce ? rce_ops2valid(rce->rce_ops) : 0),
+                        name, NULL, 0, size, 0, &req);
+       capa_put(oc);
+       if (rc) {
+               if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+                       LCONSOLE_INFO("Disabling user_xattr feature because "
+                                     "it is not supported on the server\n");
+                       sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+               }
+               RETURN(rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body);
+
+       /* only detect the xattr size */
+       if (size == 0)
+               GOTO(out, rc = body->eadatasize);
+
+       if (size < body->eadatasize) {
+               CERROR("server bug: replied size %u > %u\n",
+                      body->eadatasize, (int)size);
+               GOTO(out, rc = -ERANGE);
+       }
+
+       if (body->eadatasize == 0)
+               GOTO(out, rc = -ENODATA);
+
+       /* no need to swab the xattr data */
+       xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA,
+                                            body->eadatasize);
+       if (!xdata)
+               GOTO(out, rc = -EFAULT);
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (body->eadatasize >= 0 && rce && rce->rce_ops == RMT_LSETFACL) {
+               ext_acl_xattr_header *acl;
+
+               acl = lustre_posix_acl_xattr_2ext((posix_acl_xattr_header *)xdata,
+                                                 body->eadatasize);
+               if (IS_ERR(acl))
+                       GOTO(out, rc = PTR_ERR(acl));
+
+               rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode),
+                           xattr_type, acl);
+               if (unlikely(rc < 0)) {
+                       lustre_ext_acl_xattr_free(acl);
+                       GOTO(out, rc);
+               }
+       }
+#endif
+
+       if (body->eadatasize == 0) {
+               rc = -ENODATA;
+       } else {
+               LASSERT(buffer);
+               memcpy(buffer, xdata, body->eadatasize);
+               rc = body->eadatasize;
+       }
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+                   void *buffer, size_t size)
+{
+       struct inode *inode = dentry->d_inode;
+
+       LASSERT(inode);
+       LASSERT(name);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+              inode->i_ino, inode->i_generation, inode, name);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1);
+
+       if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+                    sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+            strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+           (strncmp(name, XATTR_LUSTRE_PREFIX,
+                    sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+            strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
+               struct lov_stripe_md *lsm;
+               struct lov_user_md *lump;
+               struct lov_mds_md *lmm = NULL;
+               struct ptlrpc_request *request = NULL;
+               int rc = 0, lmmsize = 0;
+
+               if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+                       return -ENODATA;
+
+               if (size == 0 && S_ISDIR(inode->i_mode)) {
+                       /* XXX directory EA size is fixed for now, optimize to
+                        * save an RPC transfer */
+                       GOTO(out, rc = sizeof(struct lov_user_md));
+               }
+
+               lsm = ccc_inode_lsm_get(inode);
+               if (lsm == NULL) {
+                       if (S_ISDIR(inode->i_mode)) {
+                               rc = ll_dir_getstripe(inode, &lmm,
+                                                     &lmmsize, &request);
+                       } else {
+                               rc = -ENODATA;
+                       }
+               } else {
+                       /* The LSM is already present after the lookup/getattr
+                        * call; we will need to grab the layout lock once it
+                        * is implemented */
+                       rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm);
+                       lmmsize = rc;
+               }
+               ccc_inode_lsm_put(inode, lsm);
+
+               if (rc < 0)
+                      GOTO(out, rc);
+
+               if (size == 0) {
+                       /* We used to return the maximum buffer size from
+                        * ll_get_max_mdsize() here, but some apps (such as
+                        * rsync 3.0.x) care about the exact xattr value
+                        * size */
+                       rc = lmmsize;
+                       GOTO(out, rc);
+               }
+
+               if (size < lmmsize) {
+                       CERROR("server bug: replied size %d > %d for %s (%s)\n",
+                              lmmsize, (int)size, dentry->d_name.name, name);
+                       GOTO(out, rc = -ERANGE);
+               }
+
+               lump = (struct lov_user_md *)buffer;
+               memcpy(lump, lmm, lmmsize);
+               /* do not return layout gen for getxattr otherwise it would
+                * confuse tar --xattr by recognizing layout gen as stripe
+                * offset when the file is restored. See LU-2809. */
+               lump->lmm_layout_gen = 0;
+
+               rc = lmmsize;
+out:
+               if (request)
+                       ptlrpc_req_finished(request);
+               else if (lmm)
+                       obd_free_diskmd(ll_i2dtexp(inode), &lmm);
+               return(rc);
+       }
+
+       return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR);
+}
+
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+       struct inode *inode = dentry->d_inode;
+       int rc = 0, rc2 = 0;
+       struct lov_mds_md *lmm = NULL;
+       struct ptlrpc_request *request = NULL;
+       int lmmsize;
+
+       LASSERT(inode);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+              inode->i_ino, inode->i_generation, inode);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1);
+
+       rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       if (buffer != NULL) {
+               struct ll_sb_info *sbi = ll_i2sbi(inode);
+               char *xattr_name = buffer;
+               int xlen, rem = rc;
+
+               while (rem > 0) {
+                       xlen = strnlen(xattr_name, rem - 1) + 1;
+                       rem -= xlen;
+                       if (xattr_type_filter(sbi,
+                                       get_xattr_type(xattr_name)) == 0) {
+                               /* this xattr type is OK: skip it and
+                                * leave it in the buffer
+                                */
+                               xattr_name += xlen;
+                               continue;
+                       }
+                       /* move up remaining xattrs in buffer
+                        * removing the xattr that is not OK
+                        */
+                       memmove(xattr_name, xattr_name + xlen, rem);
+                       rc -= xlen;
+               }
+       }
+       if (S_ISREG(inode->i_mode)) {
+               if (!ll_i2info(inode)->lli_has_smd)
+                       rc2 = -1;
+       } else if (S_ISDIR(inode->i_mode)) {
+               rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+       }
+
+       if (rc2 < 0) {
+               GOTO(out, rc2 = 0);
+       } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+               const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1;
+               const size_t name_len   = sizeof("lov") - 1;
+               const size_t total_len  = prefix_len + name_len + 1;
+
+               if (buffer && (rc + total_len) <= size) {
+                       buffer += rc;
+                       memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len);
+                       memcpy(buffer + prefix_len, "lov", name_len);
+                       buffer[prefix_len + name_len] = '\0';
+               }
+               rc2 = total_len;
+       }
+out:
+       ptlrpc_req_finished(request);
+       rc = rc + rc2;
+
+       return rc;
+}
diff --git a/drivers/staging/lustre/lustre/lmv/Makefile b/drivers/staging/lustre/lustre/lmv/Makefile
new file mode 100644 (file)
index 0000000..8cc81ad
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += lmv.o
+lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/drivers/staging/lustre/lustre/lmv/lmv_fld.c
new file mode 100644 (file)
index 0000000..a4805ae
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_fid.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "lmv_internal.h"
+
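+/*
+ * Look up the MDT index that holds @fid through the FLD client and verify
+ * the result against the configured target count.
+ */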
+int lmv_fld_lookup(struct lmv_obd *lmv,
+                  const struct lu_fid *fid,
+                  mdsno_t *mds)
+{
+       int rc;
+       ENTRY;
+
+       /* FIXME: Currently ZFS unfortunately still uses a local seq for ROOT,
+        * and this fid_is_local check should be removed once LU-2240 is fixed */
+       LASSERTF((fid_seq_in_fldb(fid_seq(fid)) ||
+                 fid_seq_is_local_file(fid_seq(fid))) &&
+                fid_is_sane(fid), DFID" is insane!\n", PFID(fid));
+
+       rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds,
+                              LU_SEQ_RANGE_MDT, NULL);
+       if (rc) {
+               CERROR("Error while looking for mds number. Seq "LPX64
+                      ", err = %d\n", fid_seq(fid), rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n",
+              *mds, PFID(fid));
+
+       if (*mds >= lmv->desc.ld_tgt_count) {
+               CERROR("FLD lookup got invalid mds #%x (max: %x) "
+                      "for fid="DFID"\n", *mds, lmv->desc.ld_tgt_count,
+                      PFID(fid));
+               rc = -EINVAL;
+       }
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/drivers/staging/lustre/lustre/lmv/lmv_intent.c
new file mode 100644 (file)
index 0000000..7eefab5
--- /dev/null
@@ -0,0 +1,328 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+#include <linux/lustre_intent.h>
+
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "lmv_internal.h"
+
+static int lmv_intent_remote(struct obd_export *exp, void *lmm,
+                            int lmmsize, struct lookup_intent *it,
+                            const struct lu_fid *parent_fid, int flags,
+                            struct ptlrpc_request **reqp,
+                            ldlm_blocking_callback cb_blocking,
+                            __u64 extra_lock_flags)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd          *lmv = &obd->u.lmv;
+       struct ptlrpc_request   *req = NULL;
+       struct lustre_handle    plock;
+       struct md_op_data       *op_data;
+       struct lmv_tgt_desc     *tgt;
+       struct mdt_body         *body;
+       int                     pmode;
+       int                     rc = 0;
+       ENTRY;
+
+       body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+
+       LASSERT((body->valid & OBD_MD_MDS));
+
+       /*
+        * Unfortunately, we have to lie to MDC/MDS to retrieve the
+        * attributes llite needs and to provide proper locking.
+        */
+       if (it->it_op & IT_LOOKUP)
+               it->it_op = IT_GETATTR;
+
+       /*
+        * We got LOOKUP lock, but we really need attrs.
+        */
+       pmode = it->d.lustre.it_lock_mode;
+       if (pmode) {
+               plock.cookie = it->d.lustre.it_lock_handle;
+               it->d.lustre.it_lock_mode = 0;
+               it->d.lustre.it_data = NULL;
+       }
+
+       LASSERT(fid_is_sane(&body->fid1));
+
+       tgt = lmv_find_target(lmv, &body->fid1);
+       if (IS_ERR(tgt))
+               GOTO(out, rc = PTR_ERR(tgt));
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       op_data->op_fid1 = body->fid1;
+       /* Send the parent FID to the remote MDT */
+       if (parent_fid != NULL) {
+               /* The parent fid is only for remote open to
+                * check whether the open is from OBF,
+                * see mdt_cross_open */
+               LASSERT(it->it_op & IT_OPEN);
+               op_data->op_fid2 = *parent_fid;
+               /* Add object FID to op_fid3, in case it needs to check stale
+                * (M_CHECK_STALE), see mdc_finish_intent_lock */
+               op_data->op_fid3 = body->fid1;
+       }
+
+       op_data->op_bias = MDS_CROSS_REF;
+       CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n",
+              PFID(&body->fid1), tgt->ltd_idx);
+
+       it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+       rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+                           flags, &req, cb_blocking, extra_lock_flags);
+       if (rc)
+               GOTO(out_free_op_data, rc);
+
+       /*
+        * LLite needs LOOKUP lock to track dentry revocation in order to
+        * maintain dcache consistency. Thus drop UPDATE|PERM lock here
+        * and put LOOKUP in request.
+        */
+       if (it->d.lustre.it_lock_mode != 0) {
+               it->d.lustre.it_remote_lock_handle =
+                                       it->d.lustre.it_lock_handle;
+               it->d.lustre.it_remote_lock_mode = it->d.lustre.it_lock_mode;
+       }
+
+       it->d.lustre.it_lock_handle = plock.cookie;
+       it->d.lustre.it_lock_mode = pmode;
+
+       EXIT;
+out_free_op_data:
+       OBD_FREE_PTR(op_data);
+out:
+       if (rc && pmode)
+               ldlm_lock_decref(&plock, pmode);
+
+       ptlrpc_req_finished(*reqp);
+       *reqp = req;
+       return rc;
+}
+
+/*
+ * IT_OPEN is intended to open (and possibly create) an object. The parent
+ * (pid) may be a split directory.
+ */
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd          *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       struct mdt_body         *body;
+       int                     rc;
+       ENTRY;
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       /* If we are ready to open the file by FID, there is no need to
+        * allocate a FID at all, otherwise it will confuse the MDT */
+       if ((it->it_op & IT_CREAT) &&
+           !(it->it_flags & MDS_OPEN_BY_FID)) {
+               /*
+                * For an open with IT_CREAT (unless opening by FID), allocate
+                * a new fid and set up the FLD for it.
+                */
+               op_data->op_fid3 = op_data->op_fid2;
+               rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+               if (rc != 0)
+                       RETURN(rc);
+       }
+
+       CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID","
+              " name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+              PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx);
+
+       rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags,
+                           reqp, cb_blocking, extra_lock_flags);
+       if (rc != 0)
+               RETURN(rc);
+       /*
+        * Nothing is found, do not access body->fid1 as it is zero and thus
+        * pointless.
+        */
+       if ((it->d.lustre.it_disposition & DISP_LOOKUP_NEG) &&
+           !(it->d.lustre.it_disposition & DISP_OPEN_CREATE) &&
+           !(it->d.lustre.it_disposition & DISP_OPEN_OPEN))
+               RETURN(rc);
+
+       body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+       /*
+        * Not cross-ref case, just get out of here.
+        */
+       if (likely(!(body->valid & OBD_MD_MDS)))
+               RETURN(0);
+
+       /*
+        * Okay, MDS has returned success. The name has probably been resolved
+        * to a remote inode.
+        */
+       rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags,
+                              reqp, cb_blocking, extra_lock_flags);
+       if (rc != 0) {
+               LASSERT(rc < 0);
+               /*
+                * It is possible that some userspace application will try to
+                * open a file as a directory and we will get -ENOTDIR here. As
+                * this is a normal situation, we should not print an error
+                * here, only debug info.
+                */
+               CDEBUG(D_INODE, "Can't handle remote %s: dir "DFID"("DFID"):"
+                      "%*s: %d\n", LL_IT2STR(it), PFID(&op_data->op_fid2),
+                      PFID(&op_data->op_fid1), op_data->op_namelen,
+                      op_data->op_name, rc);
+               RETURN(rc);
+       }
+
+       RETURN(rc);
+}
+
+/*
+ * Handler for: getattr, lookup and revalidate cases.
+ */
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+                     void *lmm, int lmmsize, struct lookup_intent *it,
+                     int flags, struct ptlrpc_request **reqp,
+                     ldlm_blocking_callback cb_blocking,
+                     __u64 extra_lock_flags)
+{
+       struct obd_device      *obd = exp->exp_obd;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc    *tgt = NULL;
+       struct mdt_body *body;
+       int                  rc = 0;
+       ENTRY;
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       if (!fid_is_sane(&op_data->op_fid2))
+               fid_zero(&op_data->op_fid2);
+
+       CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID
+              ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+              PFID(&op_data->op_fid2),
+              op_data->op_name ? op_data->op_name : "<NULL>",
+              tgt->ltd_idx);
+
+       op_data->op_bias &= ~MDS_CROSS_REF;
+
+       rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+                            flags, reqp, cb_blocking, extra_lock_flags);
+
+       if (rc < 0 || *reqp == NULL)
+               RETURN(rc);
+
+       /*
+        * MDS has returned success. The name has probably been resolved to a
+        * remote inode. Let's check this.
+        */
+       body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+       /* Not cross-ref case, just get out of here. */
+       if (likely(!(body->valid & OBD_MD_MDS)))
+               RETURN(0);
+
+       rc = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp,
+                              cb_blocking, extra_lock_flags);
+
+       RETURN(rc);
+}
+
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags)
+{
+       struct obd_device *obd = exp->exp_obd;
+       int             rc;
+       ENTRY;
+
+       LASSERT(it != NULL);
+       LASSERT(fid_is_sane(&op_data->op_fid1));
+
+       CDEBUG(D_INODE, "INTENT LOCK '%s' for '%*s' on "DFID"\n",
+              LL_IT2STR(it), op_data->op_namelen, op_data->op_name,
+              PFID(&op_data->op_fid1));
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT))
+               rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it,
+                                      flags, reqp, cb_blocking,
+                                      extra_lock_flags);
+       else if (it->it_op & IT_OPEN)
+               rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it,
+                                    flags, reqp, cb_blocking,
+                                    extra_lock_flags);
+       else
+               LBUG();
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h
new file mode 100644 (file)
index 0000000..f75b0a9
--- /dev/null
@@ -0,0 +1,159 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LMV_INTERNAL_H_
+#define _LMV_INTERNAL_H_
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+
+#define LMV_MAX_TGT_COUNT 128
+
+#define lmv_init_lock(lmv)   mutex_lock(&lmv->init_mutex);
+#define lmv_init_unlock(lmv) mutex_unlock(&lmv->init_mutex);
+
+#define LL_IT2STR(it)                                  \
+       ((it) ? ldlm_it2str((it)->it_op) : "0")
+
+int lmv_check_connect(struct obd_device *obd);
+
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags);
+
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+                     void *lmm, int lmmsize, struct lookup_intent *it,
+                     int flags, struct ptlrpc_request **reqp,
+                     ldlm_blocking_callback cb_blocking,
+                     __u64 extra_lock_flags);
+
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags);
+
+int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+                    void *, int);
+int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid,
+                  mdsno_t *mds);
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
+                   mdsno_t mds);
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+                 struct md_op_data *op_data);
+
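+/*
+ * Return the LMV stripe descriptor (mea) carried in a reply, or NULL if the
+ * reply does not describe a striped directory or the magic is unknown.
+ */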
+static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req)
+{
+       struct mdt_body  *body;
+       struct lmv_stripe_md    *mea;
+
+       LASSERT(req != NULL);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+       if (!body || !S_ISDIR(body->mode) || !body->eadatasize)
+               return NULL;
+
+       mea = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD,
+                                          body->eadatasize);
+       LASSERT(mea != NULL);
+
+       if (mea->mea_count == 0)
+               return NULL;
+       if (mea->mea_magic != MEA_MAGIC_LAST_CHAR &&
+           mea->mea_magic != MEA_MAGIC_ALL_CHARS &&
+           mea->mea_magic != MEA_MAGIC_HASH_SEGMENT)
+               return NULL;
+
+       return mea;
+}
+
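+/* EA size needed for an lmv_stripe_md with one FID per configured target. */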
+static inline int lmv_get_easize(struct lmv_obd *lmv)
+{
+       return sizeof(struct lmv_stripe_md) +
+               lmv->desc.ld_tgt_count *
+               sizeof(struct lu_fid);
+}
+
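+/* Find the target descriptor whose MDT index matches @mds. */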
+static inline struct lmv_tgt_desc *
+lmv_get_target(struct lmv_obd *lmv, mdsno_t mds)
+{
+       int count = lmv->desc.ld_tgt_count;
+       int i;
+
+       for (i = 0; i < count; i++) {
+               if (lmv->tgts[i] == NULL)
+                       continue;
+
+               if (lmv->tgts[i]->ltd_idx == mds)
+                       return lmv->tgts[i];
+       }
+
+       return ERR_PTR(-ENODEV);
+}
+
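+/* Resolve @fid to its target, consulting the FLD when there are several. */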
+static inline struct lmv_tgt_desc *
+lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid)
+{
+       mdsno_t mds = 0;
+       int rc;
+
+       if (lmv->desc.ld_tgt_count > 1) {
+               rc = lmv_fld_lookup(lmv, fid, &mds);
+               if (rc)
+                       return ERR_PTR(rc);
+       }
+
+       return lmv_get_target(lmv, mds);
+}
+
+struct lmv_tgt_desc *lmv_locate_mds(struct lmv_obd *lmv,
+                                   struct md_op_data *op_data,
+                                   struct lu_fid *fid);
+/* lproc_lmv.c */
+#ifdef LPROCFS
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+extern struct file_operations lmv_proc_target_fops;
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
new file mode 100644 (file)
index 0000000..a13eead
--- /dev/null
@@ -0,0 +1,2734 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+
+#include <lustre/lustre_idl.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre_lite.h>
+#include <lustre_fid.h>
+#include "lmv_internal.h"
+
+static void lmv_activate_target(struct lmv_obd *lmv,
+                               struct lmv_tgt_desc *tgt,
+                               int activate)
+{
+       if (tgt->ltd_active == activate)
+               return;
+
+       tgt->ltd_active = activate;
+       lmv->desc.ld_active_tgt_count += (activate ? 1 : -1);
+}
+
+/**
+ * Error codes:
+ *
+ *  -EINVAL  : UUID can't be found in the LMV's target list
+ *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ *  -EBADF   : The UUID is found, but the OBD is of the wrong type (!)
+ */
+static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
+                             int activate)
+{
+       struct lmv_tgt_desc    *tgt;
+       struct obd_device      *obd;
+       int                  i;
+       int                  rc = 0;
+       ENTRY;
+
+       CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
+              lmv, uuid->uuid, activate);
+
+       spin_lock(&lmv->lmv_lock);
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+               if (tgt == NULL || tgt->ltd_exp == NULL)
+                       continue;
+
+               CDEBUG(D_INFO, "Target idx %d is %s conn "LPX64"\n", i,
+                      tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
+
+               if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+                       break;
+       }
+
+       if (i == lmv->desc.ld_tgt_count)
+               GOTO(out_lmv_lock, rc = -EINVAL);
+
+       obd = class_exp2obd(tgt->ltd_exp);
+       if (obd == NULL)
+               GOTO(out_lmv_lock, rc = -ENOTCONN);
+
+       CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
+              obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
+              obd->obd_type->typ_name, i);
+       LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
+
+       if (tgt->ltd_active == activate) {
+               CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
+                      activate ? "" : "in");
+               GOTO(out_lmv_lock, rc);
+       }
+
+       CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd,
+              activate ? "" : "in");
+       lmv_activate_target(lmv, tgt, activate);
+       EXIT;
+
+ out_lmv_lock:
+       spin_unlock(&lmv->lmv_lock);
+       return rc;
+}
+
+struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
+{
+       struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+
+       return obd_get_uuid(lmv->tgts[0]->ltd_exp);
+}
+
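+/*
+ * Handle notifications from MDC targets: activate or deactivate the matching
+ * target, record connect data, and pass the event up to the observer.
+ */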
+static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
+                     enum obd_notify_event ev, void *data)
+{
+       struct obd_connect_data *conn_data;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct obd_uuid  *uuid;
+       int                   rc = 0;
+       ENTRY;
+
+       if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
+               CERROR("unexpected notification of %s %s!\n",
+                      watched->obd_type->typ_name,
+                      watched->obd_name);
+               RETURN(-EINVAL);
+       }
+
+       uuid = &watched->u.cli.cl_target_uuid;
+       if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
+               /*
+                * Set MDC as active before notifying the observer, so the
+                * observer can use the MDC normally.
+                */
+               rc = lmv_set_mdc_active(lmv, uuid,
+                                       ev == OBD_NOTIFY_ACTIVE);
+               if (rc) {
+                       CERROR("%sactivation of %s failed: %d\n",
+                              ev == OBD_NOTIFY_ACTIVE ? "" : "de",
+                              uuid->uuid, rc);
+                       RETURN(rc);
+               }
+       } else if (ev == OBD_NOTIFY_OCD) {
+               conn_data = &watched->u.cli.cl_import->imp_connect_data;
+               /*
+                * XXX: Make sure that the ocd_connect_flags from all targets
+                * are the same. Otherwise one of the MDTs is running the wrong
+                * version or something like that.  --umka
+                */
+               obd->obd_self_export->exp_connect_data = *conn_data;
+       }
+#if 0
+       else if (ev == OBD_NOTIFY_DISCON) {
+               /*
+                * For disconnect event, flush fld cache for failout MDS case.
+                */
+               fld_client_flush(&lmv->lmv_fld);
+       }
+#endif
+       /*
+        * Pass the notification up the chain.
+        */
+       if (obd->obd_observer)
+               rc = obd_notify(obd->obd_observer, watched, ev, data);
+
+       RETURN(rc);
+}
+
+/**
+ * This is a fake connect function. Its purpose is to initialize the lmv and
+ * tell the caller that everything is okay. The real connection will be
+ * performed later.
+ */
+static int lmv_connect(const struct lu_env *env,
+                      struct obd_export **exp, struct obd_device *obd,
+                      struct obd_uuid *cluuid, struct obd_connect_data *data,
+                      void *localdata)
+{
+       struct proc_dir_entry *lmv_proc_dir;
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct lustre_handle  conn = { 0 };
+       int                 rc = 0;
+       ENTRY;
+
+       /*
+        * We don't want to actually do the underlying connections more than
+        * once, so keep track.
+        */
+       lmv->refcount++;
+       if (lmv->refcount > 1) {
+               *exp = NULL;
+               RETURN(0);
+       }
+
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc) {
+               CERROR("class_connect() returned %d\n", rc);
+               RETURN(rc);
+       }
+
+       *exp = class_conn2export(&conn);
+       class_export_get(*exp);
+
+       lmv->exp = *exp;
+       lmv->connected = 0;
+       lmv->cluuid = *cluuid;
+
+       if (data)
+               lmv->conn_data = *data;
+
+       lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry,
+                                       NULL, NULL);
+       if (IS_ERR(lmv_proc_dir)) {
+               CERROR("could not register /proc/fs/lustre/%s/%s/target_obds.",
+                      obd->obd_type->typ_name, obd->obd_name);
+               lmv_proc_dir = NULL;
+       }
+
+       /*
+        * All real clients should perform the actual connection right away,
+        * because it is possible that LMV will not get the opportunity to
+        * connect its targets and MDC code will be called directly, for
+        * instance while reading the ../mdc/../kbytesfree procfs file, etc.
+        */
+       if (data->ocd_connect_flags & OBD_CONNECT_REAL)
+               rc = lmv_check_connect(obd);
+
+       if (rc) {
+               if (lmv_proc_dir)
+                       lprocfs_remove(&lmv_proc_dir);
+       }
+
+       RETURN(rc);
+}
+
+static void lmv_set_timeouts(struct obd_device *obd)
+{
+       struct lmv_tgt_desc   *tgt;
+       struct lmv_obd  *lmv;
+       int                 i;
+
+       lmv = &obd->u.lmv;
+       if (lmv->server_timeout == 0)
+               return;
+
+       if (lmv->connected == 0)
+               return;
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+               if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0)
+                       continue;
+
+               obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS),
+                                  KEY_INTERMDS, 0, NULL, NULL);
+       }
+}
+
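+/*
+ * Track the largest EA/default-EA/cookie sizes seen so far and push them to
+ * every connected, active MDC target whenever any of them grows.
+ */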
+static int lmv_init_ea_size(struct obd_export *exp, int easize,
+                           int def_easize, int cookiesize)
+{
+       struct obd_device   *obd = exp->exp_obd;
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       int               i;
+       int               rc = 0;
+       int               change = 0;
+       ENTRY;
+
+       if (lmv->max_easize < easize) {
+               lmv->max_easize = easize;
+               change = 1;
+       }
+       if (lmv->max_def_easize < def_easize) {
+               lmv->max_def_easize = def_easize;
+               change = 1;
+       }
+       if (lmv->max_cookiesize < cookiesize) {
+               lmv->max_cookiesize = cookiesize;
+               change = 1;
+       }
+       if (change == 0)
+               RETURN(0);
+
+       if (lmv->connected == 0)
+               RETURN(0);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL ||
+                   lmv->tgts[i]->ltd_exp == NULL ||
+                   lmv->tgts[i]->ltd_active == 0) {
+                       CWARN("%s: NULL export for %d\n", obd->obd_name, i);
+                       continue;
+               }
+
+               rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize,
+                                    cookiesize);
+               if (rc) {
+                       CERROR("%s: md_init_ea_size() failed on MDT target %d: rc = %d\n",
+                              obd->obd_name, i, rc);
+                       break;
+               }
+       }
+       RETURN(rc);
+}
+
+#define MAX_STRING_SIZE 128
+
+int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+       struct proc_dir_entry   *lmv_proc_dir;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct obd_uuid  *cluuid = &lmv->cluuid;
+       struct obd_uuid   lmv_mdc_uuid = { "LMV_MDC_UUID" };
+       struct obd_device       *mdc_obd;
+       struct obd_export       *mdc_exp;
+       struct lu_fld_target     target;
+       int                   rc;
+       ENTRY;
+
+       mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
+                                       &obd->obd_uuid);
+       if (!mdc_obd) {
+               CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n",
+               mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+               tgt->ltd_uuid.uuid, obd->obd_uuid.uuid,
+               cluuid->uuid);
+
+       if (!mdc_obd->obd_set_up) {
+               CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
+               RETURN(-EINVAL);
+       }
+
+       rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid,
+                        &lmv->conn_data, NULL);
+       if (rc) {
+               CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc);
+               RETURN(rc);
+       }
+
+       /*
+        * Init fid sequence client for this mdc and add new fld target.
+        */
+       rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA);
+       if (rc)
+               RETURN(rc);
+
+       target.ft_srv = NULL;
+       target.ft_exp = mdc_exp;
+       target.ft_idx = tgt->ltd_idx;
+
+       fld_client_add_target(&lmv->lmv_fld, &target);
+
+       rc = obd_register_observer(mdc_obd, obd);
+       if (rc) {
+               obd_disconnect(mdc_exp);
+               CERROR("target %s register_observer error %d\n",
+                      tgt->ltd_uuid.uuid, rc);
+               RETURN(rc);
+       }
+
+       if (obd->obd_observer) {
+               /*
+                * Tell the observer about the new target.
+                */
+               rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd,
+                               OBD_NOTIFY_ACTIVE,
+                               (void *)(tgt - lmv->tgts[0]));
+               if (rc) {
+                       obd_disconnect(mdc_exp);
+                       RETURN(rc);
+               }
+       }
+
+       tgt->ltd_active = 1;
+       tgt->ltd_exp = mdc_exp;
+       lmv->desc.ld_active_tgt_count++;
+
+       md_init_ea_size(tgt->ltd_exp, lmv->max_easize,
+                       lmv->max_def_easize, lmv->max_cookiesize);
+
+       CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
+               mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+               atomic_read(&obd->obd_refcount));
+
+       lmv_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+       if (lmv_proc_dir) {
+               struct proc_dir_entry *mdc_symlink;
+
+               LASSERT(mdc_obd->obd_type != NULL);
+               LASSERT(mdc_obd->obd_type->typ_name != NULL);
+               mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name,
+                                                 lmv_proc_dir,
+                                                 "../../../%s/%s",
+                                                 mdc_obd->obd_type->typ_name,
+                                                 mdc_obd->obd_name);
+               if (mdc_symlink == NULL) {
+                       CERROR("Could not register LMV target /proc/fs/lustre/%s/%s/target_obds/%s\n",
+                              obd->obd_type->typ_name, obd->obd_name,
+                              mdc_obd->obd_name);
+                       lprocfs_remove(&lmv_proc_dir);
+                       lmv_proc_dir = NULL;
+               }
+       }
+       RETURN(0);
+}
+
+static void lmv_del_target(struct lmv_obd *lmv, int index)
+{
+       if (lmv->tgts[index] == NULL)
+               return;
+
+       OBD_FREE_PTR(lmv->tgts[index]);
+       lmv->tgts[index] = NULL;
+       return;
+}
+
+static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+                          __u32 index, int gen)
+{
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc *tgt;
+       int               rc = 0;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
+
+       lmv_init_lock(lmv);
+
+       if (lmv->desc.ld_tgt_count == 0) {
+               struct obd_device *mdc_obd;
+
+               mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
+                                               &obd->obd_uuid);
+               if (!mdc_obd) {
+                       lmv_init_unlock(lmv);
+                       CERROR("%s: Target %s not attached: rc = %d\n",
+                              obd->obd_name, uuidp->uuid, -EINVAL);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
+               tgt = lmv->tgts[index];
+               CERROR("%s: UUID %s already assigned at LMV target index %d: rc = %d\n",
+                      obd->obd_name,
+                      obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
+               lmv_init_unlock(lmv);
+               RETURN(-EEXIST);
+       }
+
+       if (index >= lmv->tgts_size) {
+               /* We need to reallocate the lmv target array. */
+               struct lmv_tgt_desc **newtgts, **old = NULL;
+               __u32 newsize = 1;
+               __u32 oldsize = 0;
+
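+               /* Grow the target array in powers of two until it covers
+                * the requested index. */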
+               while (newsize < index + 1)
+                       newsize = newsize << 1;
+               OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+               if (newtgts == NULL) {
+                       lmv_init_unlock(lmv);
+                       RETURN(-ENOMEM);
+               }
+
+               if (lmv->tgts_size) {
+                       memcpy(newtgts, lmv->tgts,
+                              sizeof(*newtgts) * lmv->tgts_size);
+                       old = lmv->tgts;
+                       oldsize = lmv->tgts_size;
+               }
+
+               lmv->tgts = newtgts;
+               lmv->tgts_size = newsize;
+               smp_rmb();
+               if (old)
+                       OBD_FREE(old, sizeof(*old) * oldsize);
+
+               CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts,
+                      lmv->tgts_size);
+       }
+
+       OBD_ALLOC_PTR(tgt);
+       if (!tgt) {
+               lmv_init_unlock(lmv);
+               RETURN(-ENOMEM);
+       }
+
+       mutex_init(&tgt->ltd_fid_mutex);
+       tgt->ltd_idx = index;
+       tgt->ltd_uuid = *uuidp;
+       tgt->ltd_active = 0;
+       lmv->tgts[index] = tgt;
+       if (index >= lmv->desc.ld_tgt_count)
+               lmv->desc.ld_tgt_count = index + 1;
+
+       if (lmv->connected) {
+               rc = lmv_connect_mdc(obd, tgt);
+               if (rc) {
+                       spin_lock(&lmv->lmv_lock);
+                       lmv->desc.ld_tgt_count--;
+                       memset(tgt, 0, sizeof(*tgt));
+                       spin_unlock(&lmv->lmv_lock);
+               } else {
+                       int easize = sizeof(struct lmv_stripe_md) +
+                                    lmv->desc.ld_tgt_count *
+                                    sizeof(struct lu_fid);
+                       lmv_init_ea_size(obd->obd_self_export, easize, 0, 0);
+               }
+       }
+
+       lmv_init_unlock(lmv);
+       RETURN(rc);
+}
+
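+/*
+ * Connect every configured target on first use.  Later calls return
+ * immediately once lmv->connected is set; on failure, all targets that were
+ * already connected are disconnected again.
+ */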
+int lmv_check_connect(struct obd_device *obd)
+{
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc  *tgt;
+       int                i;
+       int                rc;
+       int                easize;
+       ENTRY;
+
+       if (lmv->connected)
+               RETURN(0);
+
+       lmv_init_lock(lmv);
+       if (lmv->connected) {
+               lmv_init_unlock(lmv);
+               RETURN(0);
+       }
+
+       if (lmv->desc.ld_tgt_count == 0) {
+               lmv_init_unlock(lmv);
+               CERROR("%s: no targets configured.\n", obd->obd_name);
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
+              lmv->cluuid.uuid, obd->obd_name);
+
+       LASSERT(lmv->tgts != NULL);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+               if (tgt == NULL)
+                       continue;
+               rc = lmv_connect_mdc(obd, tgt);
+               if (rc)
+                       GOTO(out_disc, rc);
+       }
+
+       lmv_set_timeouts(obd);
+       class_export_put(lmv->exp);
+       lmv->connected = 1;
+       easize = lmv_get_easize(lmv);
+       lmv_init_ea_size(obd->obd_self_export, easize, 0, 0);
+       lmv_init_unlock(lmv);
+       RETURN(0);
+
+ out_disc:
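+       /* Roll back: disconnect the targets that were connected above. */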
+       while (i-- > 0) {
+               int rc2;
+               tgt = lmv->tgts[i];
+               if (tgt == NULL)
+                       continue;
+               tgt->ltd_active = 0;
+               if (tgt->ltd_exp) {
+                       --lmv->desc.ld_active_tgt_count;
+                       rc2 = obd_disconnect(tgt->ltd_exp);
+                       if (rc2) {
+                               CERROR("LMV target %s disconnect on "
+                                      "MDC idx %d: error %d\n",
+                                      tgt->ltd_uuid.uuid, i, rc2);
+                       }
+               }
+       }
+       class_disconnect(lmv->exp);
+       lmv_init_unlock(lmv);
+       RETURN(rc);
+}
+
+static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+       struct proc_dir_entry  *lmv_proc_dir;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct obd_device      *mdc_obd;
+       int                  rc;
+       ENTRY;
+
+       LASSERT(tgt != NULL);
+       LASSERT(obd != NULL);
+
+       mdc_obd = class_exp2obd(tgt->ltd_exp);
+
+       if (mdc_obd) {
+               mdc_obd->obd_force = obd->obd_force;
+               mdc_obd->obd_fail = obd->obd_fail;
+               mdc_obd->obd_no_recov = obd->obd_no_recov;
+       }
+
+       lmv_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+       if (lmv_proc_dir) {
+               struct proc_dir_entry *mdc_symlink;
+
+               mdc_symlink = lprocfs_srch(lmv_proc_dir, mdc_obd->obd_name);
+               if (mdc_symlink) {
+                       lprocfs_remove(&mdc_symlink);
+               } else {
+                       CERROR("/proc/fs/lustre/%s/%s/target_obds/%s missing\n",
+                              obd->obd_type->typ_name, obd->obd_name,
+                              mdc_obd->obd_name);
+               }
+       }
+       rc = obd_fid_fini(tgt->ltd_exp->exp_obd);
+       if (rc)
+               CERROR("Can't finalize fids factory\n");
+
+       CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n",
+              tgt->ltd_exp->exp_obd->obd_name,
+              tgt->ltd_exp->exp_obd->obd_uuid.uuid);
+
+       obd_register_observer(tgt->ltd_exp->exp_obd, NULL);
+       rc = obd_disconnect(tgt->ltd_exp);
+       if (rc) {
+               if (tgt->ltd_active) {
+                       CERROR("Target %s disconnect error %d\n",
+                              tgt->ltd_uuid.uuid, rc);
+               }
+       }
+
+       lmv_activate_target(lmv, tgt, 0);
+       tgt->ltd_exp = NULL;
+       RETURN(0);
+}
+
+static int lmv_disconnect(struct obd_export *exp)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct proc_dir_entry *lmv_proc_dir;
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       int                 rc;
+       int                 i;
+       ENTRY;
+
+       if (!lmv->tgts)
+               goto out_local;
+
+       /*
+        * Only disconnect the underlying layers on the final disconnect.
+        */
+       lmv->refcount--;
+       if (lmv->refcount != 0)
+               goto out_local;
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+                       continue;
+
+               lmv_disconnect_mdc(obd, lmv->tgts[i]);
+       }
+
+       lmv_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+       if (lmv_proc_dir) {
+               lprocfs_remove(&lmv_proc_dir);
+       } else {
+               CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n",
+                      obd->obd_type->typ_name, obd->obd_name);
+       }
+
+out_local:
+       /*
+        * This is the case when no real connection is established by
+        * lmv_check_connect().
+        */
+       if (!lmv->connected)
+               class_export_put(exp);
+       rc = class_disconnect(exp);
+       if (lmv->refcount == 0)
+               lmv->connected = 0;
+       RETURN(rc);
+}
+
+static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg)
+{
+       struct obd_device       *obddev = class_exp2obd(exp);
+       struct lmv_obd          *lmv = &obddev->u.lmv;
+       struct getinfo_fid2path *gf;
+       struct lmv_tgt_desc     *tgt;
+       struct getinfo_fid2path *remote_gf = NULL;
+       int                     remote_gf_size = 0;
+       int                     rc;
+
+       gf = (struct getinfo_fid2path *)karg;
+       tgt = lmv_find_target(lmv, &gf->gf_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
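+       /* Build the path segment by segment; -EREMOTE from the MDT means the
+        * remaining part of the path lives on another MDT. */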
+repeat_fid2path:
+       rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg);
+       if (rc != 0 && rc != -EREMOTE)
+               GOTO(out_fid2path, rc);
+
+       /* If remote_gf != NULL, it means we have just built the path
+        * on the remote MDT, so copy this path segment to gf. */
+       if (remote_gf != NULL) {
+               struct getinfo_fid2path *ori_gf;
+               char *ptr;
+
+               ori_gf = (struct getinfo_fid2path *)karg;
+               if (strlen(ori_gf->gf_path) +
+                   strlen(gf->gf_path) > ori_gf->gf_pathlen)
+                       GOTO(out_fid2path, rc = -EOVERFLOW);
+
+               ptr = ori_gf->gf_path;
+
+               memmove(ptr + strlen(gf->gf_path) + 1, ptr,
+                       strlen(ori_gf->gf_path));
+
+               strncpy(ptr, gf->gf_path, strlen(gf->gf_path));
+               ptr += strlen(gf->gf_path);
+               *ptr = '/';
+       }
+
+       CDEBUG(D_INFO, "%s: get path %s "DFID" rec: "LPU64" ln: %u\n",
+              tgt->ltd_exp->exp_obd->obd_name,
+              gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno,
+              gf->gf_linkno);
+
+       if (rc == 0)
+               GOTO(out_fid2path, rc);
+
+       /* sigh, we have to go to another MDT to continue building the path */
+       if (remote_gf == NULL) {
+               remote_gf_size = sizeof(*remote_gf) + PATH_MAX;
+               OBD_ALLOC(remote_gf, remote_gf_size);
+               if (remote_gf == NULL)
+                       GOTO(out_fid2path, rc = -ENOMEM);
+               remote_gf->gf_pathlen = PATH_MAX;
+       }
+
+       if (!fid_is_sane(&gf->gf_fid)) {
+               CERROR("%s: invalid FID "DFID": rc = %d\n",
+                      tgt->ltd_exp->exp_obd->obd_name,
+                      PFID(&gf->gf_fid), -EINVAL);
+               GOTO(out_fid2path, rc = -EINVAL);
+       }
+
+       tgt = lmv_find_target(lmv, &gf->gf_fid);
+       if (IS_ERR(tgt))
+               GOTO(out_fid2path, rc = -EINVAL);
+
+       remote_gf->gf_fid = gf->gf_fid;
+       remote_gf->gf_recno = -1;
+       remote_gf->gf_linkno = -1;
+       memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen);
+       gf = remote_gf;
+       goto repeat_fid2path;
+
+out_fid2path:
+       if (remote_gf != NULL)
+               OBD_FREE(remote_gf, remote_gf_size);
+       RETURN(rc);
+}
+
+static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
+                        int len, void *karg, void *uarg)
+{
+       struct obd_device    *obddev = class_exp2obd(exp);
+       struct lmv_obd       *lmv = &obddev->u.lmv;
+       int                i = 0;
+       int                rc = 0;
+       int                set = 0;
+       int                count = lmv->desc.ld_tgt_count;
+       ENTRY;
+
+       if (count == 0)
+               RETURN(-ENOTTY);
+
+       switch (cmd) {
+       case IOC_OBD_STATFS: {
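+               /* Return the UUID and statfs of the single target selected
+                * by the index passed in ioc_inlbuf2. */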
+               struct obd_ioctl_data *data = karg;
+               struct obd_device *mdc_obd;
+               struct obd_statfs stat_buf = {0};
+               __u32 index;
+
+               memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+               if (index >= count)
+                       RETURN(-ENODEV);
+
+               if (lmv->tgts[index] == NULL ||
+                   lmv->tgts[index]->ltd_active == 0)
+                       RETURN(-ENODATA);
+
+               mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp);
+               if (!mdc_obd)
+                       RETURN(-EINVAL);
+
+               /* copy UUID */
+               if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd),
+                                    min((int) data->ioc_plen2,
+                                        (int) sizeof(struct obd_uuid))))
+                       RETURN(-EFAULT);
+
+               rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               0);
+               if (rc)
+                       RETURN(rc);
+               if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+                                    min((int) data->ioc_plen1,
+                                        (int) sizeof(stat_buf))))
+                       RETURN(-EFAULT);
+               break;
+       }
+       case OBD_IOC_QUOTACTL: {
+               struct if_quotactl *qctl = karg;
+               struct lmv_tgt_desc *tgt = NULL;
+               struct obd_quotactl *oqctl;
+
+               if (qctl->qc_valid == QC_MDTIDX) {
+                       if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+                               RETURN(-EINVAL);
+
+                       tgt = lmv->tgts[qctl->qc_idx];
+                       if (tgt == NULL || tgt->ltd_exp == NULL)
+                               RETURN(-EINVAL);
+               } else if (qctl->qc_valid == QC_UUID) {
+                       for (i = 0; i < count; i++) {
+                               tgt = lmv->tgts[i];
+                               if (tgt == NULL)
+                                       continue;
+                               if (!obd_uuid_equals(&tgt->ltd_uuid,
+                                                    &qctl->obd_uuid))
+                                       continue;
+
+                               if (tgt->ltd_exp == NULL)
+                                       RETURN(-EINVAL);
+
+                               break;
+                       }
+               } else {
+                       RETURN(-EINVAL);
+               }
+
+               if (i >= count)
+                       RETURN(-EAGAIN);
+
+               LASSERT(tgt && tgt->ltd_exp);
+               OBD_ALLOC_PTR(oqctl);
+               if (!oqctl)
+                       RETURN(-ENOMEM);
+
+               QCTL_COPY(oqctl, qctl);
+               rc = obd_quotactl(tgt->ltd_exp, oqctl);
+               if (rc == 0) {
+                       QCTL_COPY(qctl, oqctl);
+                       qctl->qc_valid = QC_MDTIDX;
+                       qctl->obd_uuid = tgt->ltd_uuid;
+               }
+               OBD_FREE_PTR(oqctl);
+               break;
+       }
+       case OBD_IOC_CHANGELOG_SEND:
+       case OBD_IOC_CHANGELOG_CLEAR: {
+               struct ioc_changelog *icc = karg;
+
+               if (icc->icc_mdtindex >= count)
+                       RETURN(-ENODEV);
+
+               if (lmv->tgts[icc->icc_mdtindex] == NULL ||
+                   lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL ||
+                   lmv->tgts[icc->icc_mdtindex]->ltd_active == 0)
+                       RETURN(-ENODEV);
+               rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp,
+                                  sizeof(*icc), icc, NULL);
+               break;
+       }
+       case LL_IOC_GET_CONNECT_FLAGS: {
+               if (lmv->tgts[0] == NULL)
+                       RETURN(-ENODATA);
+               rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg);
+               break;
+       }
+       case OBD_IOC_FID2PATH: {
+               rc = lmv_fid2path(exp, len, karg, uarg);
+               break;
+       }
+       case LL_IOC_HSM_STATE_GET:
+       case LL_IOC_HSM_STATE_SET:
+       case LL_IOC_HSM_ACTION:
+       case LL_IOC_LOV_SWAP_LAYOUTS: {
+               struct md_op_data       *op_data = karg;
+               struct lmv_tgt_desc     *tgt1, *tgt2;
+
+               tgt1 = lmv_find_target(lmv, &op_data->op_fid1);
+               if (IS_ERR(tgt1))
+                       RETURN(PTR_ERR(tgt1));
+
+               tgt2 = lmv_find_target(lmv, &op_data->op_fid2);
+               if (IS_ERR(tgt2))
+                       RETURN(PTR_ERR(tgt2));
+
+               if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL))
+                       RETURN(-EINVAL);
+
+               /* only files on same MDT can have their layouts swapped */
+               if (tgt1->ltd_idx != tgt2->ltd_idx)
+                       RETURN(-EPERM);
+
+               rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
+               break;
+       }
+       default:
+               for (i = 0; i < count; i++) {
+                       struct obd_device *mdc_obd;
+                       int err;
+
+                       if (lmv->tgts[i] == NULL ||
+                           lmv->tgts[i]->ltd_exp == NULL)
+                               continue;
+                       /* ll_umount_begin() sets the force flag on the LMV,
+                        * not on the MDCs, so pass it through here. */
+                       mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp);
+                       mdc_obd->obd_force = obddev->obd_force;
+                       err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len,
+                                           karg, uarg);
+                       if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+                               RETURN(err);
+                       } else if (err) {
+                               if (lmv->tgts[i]->ltd_active) {
+                                       CERROR("error: iocontrol MDC %s on MDT"
+                                              " idx %d cmd %x: err = %d\n",
+                                               lmv->tgts[i]->ltd_uuid.uuid,
+                                               i, cmd, err);
+                                       if (!rc)
+                                               rc = err;
+                               }
+                       } else
+                               set = 1;
+               }
+               if (!set && !rc)
+                       rc = -EIO;
+       }
+       RETURN(rc);
+}
+
+#if 0
+static int lmv_all_chars_policy(int count, const char *name,
+                               int len)
+{
+       unsigned int c = 0;
+
+       while (len > 0)
+               c += name[--len];
+       c = c % count;
+       return c;
+}
+
+static int lmv_nid_policy(struct lmv_obd *lmv)
+{
+       struct obd_import *imp;
+       __u32         id;
+
+       /*
+        * XXX: To get nid we assume that underlying obd device is mdc.
+        */
+       imp = class_exp2cliimp(lmv->tgts[0].ltd_exp);
+       id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32);
+       return id % lmv->desc.ld_tgt_count;
+}
+
+static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+                         placement_policy_t placement)
+{
+       switch (placement) {
+       case PLACEMENT_CHAR_POLICY:
+               return lmv_all_chars_policy(lmv->desc.ld_tgt_count,
+                                           op_data->op_name,
+                                           op_data->op_namelen);
+       case PLACEMENT_NID_POLICY:
+               return lmv_nid_policy(lmv);
+
+       default:
+               break;
+       }
+
+       CERROR("Unsupported placement policy %x\n", placement);
+       return -EINVAL;
+}
+#endif
+
+/**
+ * This is the _inode_ placement policy function (not the name-based one).
+ */
+static int lmv_placement_policy(struct obd_device *obd,
+                               struct md_op_data *op_data,
+                               mdsno_t *mds)
+{
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       ENTRY;
+
+       LASSERT(mds != NULL);
+
+       if (lmv->desc.ld_tgt_count == 1) {
+               *mds = 0;
+               RETURN(0);
+       }
+
+       /**
+        * If stripe_offset is provided during setdirstripe
+        * (setdirstripe -i xx), MDS xx will be chosen.
+        */
+       if (op_data->op_cli_flags & CLI_SET_MEA) {
+               struct lmv_user_md *lum;
+
+               lum = (struct lmv_user_md *)op_data->op_data;
+               if (lum->lum_type == LMV_STRIPE_TYPE &&
+                   lum->lum_stripe_offset != -1) {
+                       if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) {
+                               CERROR("%s: Stripe_offset %d >= MDT count %d: rc = %d\n",
+                                      obd->obd_name,
+                                      lum->lum_stripe_offset,
+                                      lmv->desc.ld_tgt_count, -ERANGE);
+                               RETURN(-ERANGE);
+                       }
+                       *mds = lum->lum_stripe_offset;
+                       RETURN(0);
+               }
+       }
+
+       /* Allocate new fid on target according to operation type and parent
+        * home mds. */
+       *mds = op_data->op_mds;
+       RETURN(0);
+}
+
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
+                   mdsno_t mds)
+{
+       struct lmv_tgt_desc     *tgt;
+       int                      rc;
+       ENTRY;
+
+       tgt = lmv_get_target(lmv, mds);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       /*
+        * New sequence allocation and FLD setup should be atomic; otherwise
+        * the server may find that the sequence in the newly allocated fid is
+        * not yet known.
+        */
+       mutex_lock(&tgt->ltd_fid_mutex);
+
+       if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL)
+               GOTO(out, rc = -ENODEV);
+
+       /*
+        * Ask the underlying tgt layer to allocate a new fid.
+        */
+       rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL);
+       if (rc > 0) {
+               LASSERT(fid_is_sane(fid));
+               rc = 0;
+       }
+
+       EXIT;
+out:
+       mutex_unlock(&tgt->ltd_fid_mutex);
+       return rc;
+}
+
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+                 struct md_op_data *op_data)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       mdsno_t         mds = 0;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(op_data != NULL);
+       LASSERT(fid != NULL);
+
+       rc = lmv_placement_policy(obd, op_data, &mds);
+       if (rc) {
+               CERROR("Can't get target for allocating fid, "
+                      "rc %d\n", rc);
+               RETURN(rc);
+       }
+
+       rc = __lmv_fid_alloc(lmv, fid, mds);
+       if (rc) {
+               CERROR("Can't alloc new fid, rc %d\n", rc);
+               RETURN(rc);
+       }
+
+       RETURN(rc);
+}
+
+static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       struct lprocfs_static_vars  lvars;
+       struct lmv_desc     *desc;
+       int                      rc;
+       ENTRY;
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+               CERROR("LMV setup requires a descriptor\n");
+               RETURN(-EINVAL);
+       }
+
+       desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1);
+       if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+               CERROR("Lmv descriptor size wrong: %d > %d\n",
+                      (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+               RETURN(-EINVAL);
+       }
+
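+       /* Start with room for 32 targets; lmv_add_target() reallocates the
+        * array when a higher index shows up. */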
+       OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32);
+       if (lmv->tgts == NULL)
+               RETURN(-ENOMEM);
+       lmv->tgts_size = 32;
+
+       obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
+       lmv->desc.ld_tgt_count = 0;
+       lmv->desc.ld_active_tgt_count = 0;
+       lmv->max_cookiesize = 0;
+       lmv->max_def_easize = 0;
+       lmv->max_easize = 0;
+       lmv->lmv_placement = PLACEMENT_CHAR_POLICY;
+
+       spin_lock_init(&lmv->lmv_lock);
+       mutex_init(&lmv->init_mutex);
+
+       lprocfs_lmv_init_vars(&lvars);
+
+       lprocfs_obd_setup(obd, lvars.obd_vars);
+#ifdef LPROCFS
+       {
+               rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+                                       0444, &lmv_proc_target_fops, obd);
+               if (rc)
+                       CWARN("%s: error adding LMV target_obd file: rc = %d\n",
+                              obd->obd_name, rc);
+       }
+#endif
+       rc = fld_client_init(&lmv->lmv_fld, obd->obd_name,
+                            LUSTRE_CLI_FLD_HASH_DHT);
+       if (rc) {
+               CERROR("Can't init FLD, err %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       RETURN(0);
+
+out:
+       return rc;
+}
+
+static int lmv_cleanup(struct obd_device *obd)
+{
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       ENTRY;
+
+       fld_client_fini(&lmv->lmv_fld);
+       if (lmv->tgts != NULL) {
+               int i;
+               for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                       if (lmv->tgts[i] == NULL)
+                               continue;
+                       lmv_del_target(lmv, i);
+               }
+               OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size);
+               lmv->tgts_size = 0;
+       }
+       RETURN(0);
+}
+
+static int lmv_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+       struct lustre_cfg       *lcfg = buf;
+       struct obd_uuid         obd_uuid;
+       int                     gen;
+       __u32                   index;
+       int                     rc;
+       ENTRY;
+
+       switch (lcfg->lcfg_command) {
+       case LCFG_ADD_MDC:
+               /* modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDT0000_UUID
+                * 2:0  3:1  4:lustre-MDT0000-mdc_UUID */
+               if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
+                       GOTO(out, rc = -EINVAL);
+
+               obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
+
+               if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1)
+                       GOTO(out, rc = -EINVAL);
+               if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1)
+                       GOTO(out, rc = -EINVAL);
+               rc = lmv_add_target(obd, &obd_uuid, index, gen);
+               GOTO(out, rc);
+       default:
+               CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+               GOTO(out, rc = -EINVAL);
+       }
+out:
+       RETURN(rc);
+}
+
+static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
+                     struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct obd_statfs     *temp;
+       int                 rc = 0;
+       int                 i;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       OBD_ALLOC(temp, sizeof(*temp));
+       if (temp == NULL)
+               RETURN(-ENOMEM);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+                       continue;
+
+               rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp,
+                               max_age, flags);
+               if (rc) {
+                       CERROR("can't stat MDS #%d (%s), error %d\n", i,
+                              lmv->tgts[i]->ltd_exp->exp_obd->obd_name,
+                              rc);
+                       GOTO(out_free_temp, rc);
+               }
+
+               if (i == 0) {
+                       *osfs = *temp;
+                       /* If the statfs is from mount, it only needs to
+                        * retrieve the necessary information from MDT0,
+                        * i.e. mount does not need the merged osfs from
+                        * all of the MDTs. This also allows clients to
+                        * be mounted as long as MDT0 is in service. */
+                       if (flags & OBD_STATFS_FOR_MDT0)
+                               GOTO(out_free_temp, rc);
+               } else {
+                       osfs->os_bavail += temp->os_bavail;
+                       osfs->os_blocks += temp->os_blocks;
+                       osfs->os_ffree += temp->os_ffree;
+                       osfs->os_files += temp->os_files;
+               }
+       }
+
+       EXIT;
+out_free_temp:
+       OBD_FREE(temp, sizeof(*temp));
+       return rc;
+}
+
+static int lmv_getstatus(struct obd_export *exp,
+                        struct lu_fid *fid,
+                        struct obd_capa **pc)
+{
+       struct obd_device    *obd = exp->exp_obd;
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       int                rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       rc = md_getstatus(lmv->tgts[0]->ltd_exp, fid, pc);
+       RETURN(rc);
+}
+
+static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+                       struct obd_capa *oc, obd_valid valid, const char *name,
+                       const char *input, int input_size, int output_size,
+                       int flags, struct ptlrpc_request **request)
+{
+       struct obd_device      *obd = exp->exp_obd;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc    *tgt;
+       int                  rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_getxattr(tgt->ltd_exp, fid, oc, valid, name, input,
+                        input_size, output_size, flags, request);
+
+       RETURN(rc);
+}
+
+static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+                       struct obd_capa *oc, obd_valid valid, const char *name,
+                       const char *input, int input_size, int output_size,
+                       int flags, __u32 suppgid,
+                       struct ptlrpc_request **request)
+{
+       struct obd_device      *obd = exp->exp_obd;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc    *tgt;
+       int                  rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input,
+                        input_size, output_size, flags, suppgid,
+                        request);
+
+       RETURN(rc);
+}
+
+static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
+                      struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       if (op_data->op_flags & MF_GET_MDT_IDX) {
+               op_data->op_mds = tgt->ltd_idx;
+               RETURN(0);
+       }
+
+       rc = md_getattr(tgt->ltd_exp, op_data, request);
+
+       RETURN(rc);
+}
+
+static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
+{
+       struct obd_device   *obd = exp->exp_obd;
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       int               i;
+       int               rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+       /*
+        * With DNE every object can have two locks in different namespaces:
+        * a lookup lock in the namespace of the MDT storing the direntry and
+        * an update/open lock in the namespace of the MDT storing the inode.
+        */
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+                       continue;
+               md_null_inode(lmv->tgts[i]->ltd_exp, fid);
+       }
+
+       RETURN(0);
+}
+
+static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+                          ldlm_iterator_t it, void *data)
+{
+       struct obd_device   *obd = exp->exp_obd;
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       int               i;
+       int               rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+       /*
+        * With DNE every object can have two locks in different namespaces:
+        * a lookup lock in the namespace of the MDT storing the direntry and
+        * an update/open lock in the namespace of the MDT storing the inode.
+        */
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+                       continue;
+               rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       RETURN(rc);
+}
+
+
+static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
+                    struct md_open_data *mod, struct ptlrpc_request **request)
+{
+       struct obd_device     *obd = exp->exp_obd;
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc   *tgt;
+       int                 rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
+       rc = md_close(tgt->ltd_exp, op_data, mod, request);
+       RETURN(rc);
+}
+
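+/*
+ * Find the target MDT for @fid and record its index in op_data->op_mds.
+ */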
+struct lmv_tgt_desc
+*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+               struct lu_fid *fid)
+{
+       struct lmv_tgt_desc *tgt;
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               return tgt;
+
+       op_data->op_mds = tgt->ltd_idx;
+
+       return tgt;
+}
+
+int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
+              const void *data, int datalen, int mode, __u32 uid,
+              __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+              struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       if (!lmv->desc.ld_active_tgt_count)
+               RETURN(-EIO);
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n",
+              op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+              op_data->op_mds);
+
+       op_data->op_flags |= MF_MDC_CANCEL_FID1;
+       rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
+                      cap_effective, rdev, request);
+
+       if (rc == 0) {
+               if (*request == NULL)
+                       RETURN(rc);
+               CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2));
+       }
+       RETURN(rc);
+}
+
+static int lmv_done_writing(struct obd_export *exp,
+                           struct md_op_data *op_data,
+                           struct md_open_data *mod)
+{
+       struct obd_device     *obd = exp->exp_obd;
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc   *tgt;
+       int                 rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_done_writing(tgt->ltd_exp, op_data, mod);
+       RETURN(rc);
+}
+
+static int
+lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+                  struct lookup_intent *it, struct md_op_data *op_data,
+                  struct lustre_handle *lockh, void *lmm, int lmmsize,
+                  int extra_lock_flags)
+{
+       struct ptlrpc_request      *req = it->d.lustre.it_data;
+       struct obd_device         *obd = exp->exp_obd;
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       struct lustre_handle    plock;
+       struct lmv_tgt_desc     *tgt;
+       struct md_op_data         *rdata;
+       struct lu_fid          fid1;
+       struct mdt_body     *body;
+       int                      rc = 0;
+       int                      pmode;
+       ENTRY;
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+
+       if (!(body->valid & OBD_MD_MDS))
+               RETURN(0);
+
+       CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n",
+              LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1));
+
+       /*
+        * We got LOOKUP lock, but we really need attrs.
+        */
+       pmode = it->d.lustre.it_lock_mode;
+       LASSERT(pmode != 0);
+       memcpy(&plock, lockh, sizeof(plock));
+       it->d.lustre.it_lock_mode = 0;
+       it->d.lustre.it_data = NULL;
+       fid1 = body->fid1;
+
+       it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+       ptlrpc_req_finished(req);
+
+       tgt = lmv_find_target(lmv, &fid1);
+       if (IS_ERR(tgt))
+               GOTO(out, rc = PTR_ERR(tgt));
+
+       OBD_ALLOC_PTR(rdata);
+       if (rdata == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rdata->op_fid1 = fid1;
+       rdata->op_bias = MDS_CROSS_REF;
+
+       rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh,
+                       lmm, lmmsize, NULL, extra_lock_flags);
+       OBD_FREE_PTR(rdata);
+       EXIT;
+out:
+       ldlm_lock_decref(&plock, pmode);
+       return rc;
+}
+
+static int
+lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+           struct lookup_intent *it, struct md_op_data *op_data,
+           struct lustre_handle *lockh, void *lmm, int lmmsize,
+           struct ptlrpc_request **req, __u64 extra_lock_flags)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd     *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc      *tgt;
+       int                    rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n",
+              LL_IT2STR(it), PFID(&op_data->op_fid1));
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n",
+              LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx);
+
+       rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh,
+                       lmm, lmmsize, req, extra_lock_flags);
+
+       if (rc == 0 && it && it->it_op == IT_OPEN) {
+               rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh,
+                                       lmm, lmmsize, extra_lock_flags);
+       }
+       RETURN(rc);
+}
+
+static int
+lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+                struct ptlrpc_request **request)
+{
+       struct ptlrpc_request   *req = NULL;
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       struct mdt_body  *body;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
+              op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+              tgt->ltd_idx);
+
+       rc = md_getattr_name(tgt->ltd_exp, op_data, request);
+       if (rc != 0)
+               RETURN(rc);
+
+       body = req_capsule_server_get(&(*request)->rq_pill,
+                                     &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+
+       if (body->valid & OBD_MD_MDS) {
+               struct lu_fid rid = body->fid1;
+               CDEBUG(D_INODE, "Request attrs for "DFID"\n",
+                      PFID(&rid));
+
+               tgt = lmv_find_target(lmv, &rid);
+               if (IS_ERR(tgt)) {
+                       ptlrpc_req_finished(*request);
+                       RETURN(PTR_ERR(tgt));
+               }
+
+               op_data->op_fid1 = rid;
+               op_data->op_valid |= OBD_MD_FLCROSSREF;
+               op_data->op_namelen = 0;
+               op_data->op_name = NULL;
+               rc = md_getattr_name(tgt->ltd_exp, op_data, &req);
+               ptlrpc_req_finished(*request);
+               *request = req;
+       }
+
+       RETURN(rc);
+}
+
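+/* Map an MF_MDC_CANCEL_FIDx flag to the corresponding fid in md_op_data. */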
+#define md_op_data_fid(op_data, fl)                 \
+       (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \
+        fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \
+        fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \
+        fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \
+        NULL)
+
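+/*
+ * Cancel unused locks on @fid on its home MDT, unless that MDT is the one
+ * handling the main operation (op_tgt); in that case just record the flag
+ * so the cancel can be handled as part of the main request.
+ */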
+static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data,
+                           int op_tgt, ldlm_mode_t mode, int bits, int flag)
+{
+       struct lu_fid     *fid = md_op_data_fid(op_data, flag);
+       struct obd_device      *obd = exp->exp_obd;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc    *tgt;
+       ldlm_policy_data_t      policy = {{0}};
+       int                  rc = 0;
+       ENTRY;
+
+       if (!fid_is_sane(fid))
+               RETURN(0);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       if (tgt->ltd_idx != op_tgt) {
+               CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
+               policy.l_inodebits.bits = bits;
+               rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
+                                     mode, LCF_ASYNC, NULL);
+       } else {
+               CDEBUG(D_INODE,
+                      "EARLY_CANCEL skip operation target %d on "DFID"\n",
+                      op_tgt, PFID(fid));
+               op_data->op_flags |= flag;
+               rc = 0;
+       }
+
+       RETURN(rc);
+}
+
+/*
+ * llite passes the fid of the target inode in op_data->op_fid1 and the fid
+ * of the directory in op_data->op_fid2.
+ */
+static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
+                   struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       LASSERT(op_data->op_namelen != 0);
+
+       CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n",
+              PFID(&op_data->op_fid2), op_data->op_namelen,
+              op_data->op_name, PFID(&op_data->op_fid1));
+
+       op_data->op_fsuid = current_fsuid();
+       op_data->op_fsgid = current_fsgid();
+       op_data->op_cap = cfs_curproc_cap_pack();
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       /*
+        * Cancel UPDATE lock on child (fid1).
+        */
+       op_data->op_flags |= MF_MDC_CANCEL_FID2;
+       rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
+                             MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
+       if (rc != 0)
+               RETURN(rc);
+
+       rc = md_link(tgt->ltd_exp, op_data, request);
+
+       RETURN(rc);
+}
+
+static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
+                     const char *old, int oldlen, const char *new, int newlen,
+                     struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *src_tgt;
+       struct lmv_tgt_desc     *tgt_tgt;
+       int                     rc;
+       ENTRY;
+
+       LASSERT(oldlen != 0);
+
+       CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n",
+              oldlen, old, PFID(&op_data->op_fid1),
+              newlen, new, PFID(&op_data->op_fid2));
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       op_data->op_fsuid = current_fsuid();
+       op_data->op_fsgid = current_fsgid();
+       op_data->op_cap = cfs_curproc_cap_pack();
+       src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(src_tgt))
+               RETURN(PTR_ERR(src_tgt));
+
+       tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+       if (IS_ERR(tgt_tgt))
+               RETURN(PTR_ERR(tgt_tgt));
+       /*
+        * LOOKUP lock on src child (fid3) should also be cancelled for
+        * src_tgt in mdc_rename.
+        */
+       op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+       /*
+        * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its
+        * own target.
+        */
+       rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+                             LCK_EX, MDS_INODELOCK_UPDATE,
+                             MF_MDC_CANCEL_FID2);
+
+       /*
+        * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt.
+        */
+       if (rc == 0) {
+               rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+                                     LCK_EX, MDS_INODELOCK_LOOKUP,
+                                     MF_MDC_CANCEL_FID4);
+       }
+
+       /*
+        * Cancel all the locks on tgt child (fid4).
+        */
+       if (rc == 0)
+               rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+                                     LCK_EX, MDS_INODELOCK_FULL,
+                                     MF_MDC_CANCEL_FID4);
+
+       if (rc == 0)
+               rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen,
+                              new, newlen, request);
+       RETURN(rc);
+}
+
+static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data,
+                      void *ea, int ealen, void *ea2, int ea2len,
+                      struct ptlrpc_request **request,
+                      struct md_open_data **mod)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc = 0;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n",
+              PFID(&op_data->op_fid1), op_data->op_attr.ia_valid);
+
+       op_data->op_flags |= MF_MDC_CANCEL_FID1;
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2,
+                       ea2len, request, mod);
+
+       RETURN(rc);
+}
+
+static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
+                   struct obd_capa *oc, struct ptlrpc_request **request)
+{
+       struct obd_device        *obd = exp->exp_obd;
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc       *tgt;
+       int                     rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_sync(tgt->ltd_exp, fid, oc, request);
+       RETURN(rc);
+}
+
+/*
+ * Adjust a set of pages, each page containing an array of lu_dirpages,
+ * so that each page can be used as a single logical lu_dirpage.
+ *
+ * A lu_dirpage is laid out as follows, where s = ldp_hash_start,
+ * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a
+ * struct lu_dirent.  It has size up to LU_PAGE_SIZE. The ldp_hash_end
+ * value is used as a cookie to request the next lu_dirpage in a
+ * directory listing that spans multiple pages (two in this example):
+ *   ________
+ *  |        |
+ * .|--------v-------   -----.
+ * |s|e|f|p|ent|ent| ... |ent|
+ * '--|--------------   -----'   Each CFS_PAGE contains a single
+ *    '------.            lu_dirpage.
+ * .---------v-------   -----.
+ * |s|e|f|p|ent| 0 | ... | 0 |
+ * '-----------------   -----'
+ *
+ * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is
+ * larger than LU_PAGE_SIZE, a single host page may contain multiple
+ * lu_dirpages. After reading the lu_dirpages from the MDS, the
+ * ldp_hash_end of the first lu_dirpage refers to the one immediately
+ * after it in the same CFS_PAGE (arrows simplified for brevity, but
+ * in general e0==s1, e1==s2, etc.):
+ *
+ * .--------------------   -----.
+ * |s0|e0|f0|p|ent|ent| ... |ent|
+ * |---v----------------   -----|
+ * |s1|e1|f1|p|ent|ent| ... |ent|
+ * |---v----------------   -----|  Here, each CFS_PAGE contains
+ *          ...                 multiple lu_dirpages.
+ * |---v----------------   -----|
+ * |s'|e'|f'|p|ent|ent| ... |ent|
+ * '---|----------------   -----'
+ *     v
+ * .----------------------------.
+ * |          next CFS_PAGE     |
+ *
+ * This structure is transformed into a single logical lu_dirpage as follows:
+ *
+ * - Replace e0 with e' so the request for the next lu_dirpage gets the page
+ *   labeled 'next CFS_PAGE'.
+ *
+ * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether
+ *   a hash collision with the next page exists.
+ *
+ * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span
+ *   to the first entry of the next lu_dirpage.
+ */
+#if PAGE_CACHE_SIZE > LU_PAGE_SIZE
+static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs)
+{
+       int i;
+
+       for (i = 0; i < ncfspgs; i++) {
+               struct lu_dirpage       *dp = kmap(pages[i]);
+               struct lu_dirpage       *first = dp;
+               struct lu_dirent        *end_dirent = NULL;
+               struct lu_dirent        *ent;
+               __u64                   hash_end = dp->ldp_hash_end;
+               __u32                   flags = dp->ldp_flags;
+
+               for (; nlupgs > 1; nlupgs--) {
+                       ent = lu_dirent_start(dp);
+                       for (end_dirent = ent; ent != NULL;
+                            end_dirent = ent, ent = lu_dirent_next(ent));
+
+                       /* Advance dp to next lu_dirpage. */
+                       dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE);
+
+                       /* Check if we've reached the end of the CFS_PAGE. */
+                       if (!((unsigned long)dp & ~CFS_PAGE_MASK))
+                               break;
+
+                       /* Save the hash and flags of this lu_dirpage. */
+                       hash_end = dp->ldp_hash_end;
+                       flags = dp->ldp_flags;
+
+                       /* Check if lu_dirpage contains no entries. */
+                       if (!end_dirent)
+                               break;
+
+                       /* Enlarge the end entry's lde_reclen from 0 so that
+                        * it spans to the first entry of the next lu_dirpage. */
+                       LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0);
+                       end_dirent->lde_reclen =
+                               cpu_to_le16((char *)(dp->ldp_entries) -
+                                           (char *)end_dirent);
+               }
+
+               first->ldp_hash_end = hash_end;
+               first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE);
+               first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE);
+
+               kunmap(pages[i]);
+       }
+}
+#else
+#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0)
+#endif /* PAGE_CACHE_SIZE > LU_PAGE_SIZE */
+
+static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data,
+                       struct page **pages, struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd          *lmv = &obd->u.lmv;
+       __u64                   offset = op_data->op_offset;
+       int                     rc;
+       int                     ncfspgs; /* pages read in PAGE_CACHE_SIZE */
+       int                     nlupgs; /* pages read in LU_PAGE_SIZE */
+       struct lmv_tgt_desc     *tgt;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "READPAGE at "LPX64" from "DFID"\n",
+              offset, PFID(&op_data->op_fid1));
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_readpage(tgt->ltd_exp, op_data, pages, request);
+       if (rc != 0)
+               RETURN(rc);
+
+       ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_CACHE_SIZE - 1)
+                >> PAGE_CACHE_SHIFT;
+       nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
+       LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
+       LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages);
+
+       CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs,
+              op_data->op_npages);
+
+       lmv_adjust_dirpages(pages, ncfspgs, nlupgs);
+
+       RETURN(rc);
+}
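+
+/*
+ * Worked example (illustrative only, not part of the original patch):
+ * assuming LU_PAGE_SHIFT is 12 (4 KiB lu_dirpages) and a host with 64 KiB
+ * pages (PAGE_CACHE_SHIFT of 16), a reply with bd_nob_transferred == 65536
+ * gives
+ *
+ *     ncfspgs = (65536 + 65536 - 1) >> 16 = 1
+ *     nlupgs  = 65536 >> 12               = 16
+ *
+ * i.e. one CFS_PAGE holding sixteen lu_dirpages, which is exactly the case
+ * that lmv_adjust_dirpages() collapses into a single logical lu_dirpage.
+ */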
+
+static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data,
+                     struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt = NULL;
+       struct mdt_body         *body;
+       int                  rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+retry:
+       /* Send unlink requests to the MDT where the child is located */
+       if (likely(!fid_is_zero(&op_data->op_fid2)))
+               tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+       else
+               tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       op_data->op_fsuid = current_fsuid();
+       op_data->op_fsgid = current_fsgid();
+       op_data->op_cap = cfs_curproc_cap_pack();
+
+       /*
+        * If the child's fid is given, cancel unused locks for it if it is
+        * from a different export than the parent's.
+        *
+        * The LOOKUP lock for the child (fid3) should also be cancelled on
+        * the parent tgt in mdc_unlink().
+        */
+       op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+       /*
+        * Cancel FULL locks on child (fid3).
+        */
+       rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
+                             MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
+
+       if (rc != 0)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n",
+              PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx);
+
+       rc = md_unlink(tgt->ltd_exp, op_data, request);
+       if (rc != 0 && rc != -EREMOTE)
+               RETURN(rc);
+
+       body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+
+       /* Not cross-ref case, just get out of here. */
+       if (likely(!(body->valid & OBD_MD_MDS)))
+               RETURN(0);
+
+       CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n",
+              exp->exp_obd->obd_name, PFID(&body->fid1));
+
+       /* This is a remote object; try the remote MDT. Note that it may
+        * retry more than once here. Consider the following case:
+        * /mnt/lustre is the root on MDT0, remote1 is on MDT1.
+        *
+        * 1. Initially client A does not know where remote1 is; it sends
+        *    the unlink RPC to MDT0, MDT0 returns -EREMOTE, and A resends
+        *    the unlink RPC to MDT1 (1st retry).
+        *
+        * 2. While that unlink RPC is in flight, client B does
+        *    mv /mnt/lustre/remote1 /mnt/lustre/remote2 and creates a new
+        *    remote1, but on MDT0.
+        *
+        * 3. MDT1 gets the unlink RPC (from A), takes the remote lock on
+        *    /mnt/lustre, looks up the fid of remote1, finds that it is a
+        *    remote dir again, and replies -EREMOTE again.
+        *
+        * 4. A then resends the unlink RPC to MDT0 (2nd retry).
+        *
+        * In theory this could retry indefinitely, but it should be a very
+        * rare case. */
+       op_data->op_fid2 = body->fid1;
+       ptlrpc_req_finished(*request);
+       *request = NULL;
+
+       goto retry;
+}
+
+static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       struct lmv_obd *lmv = &obd->u.lmv;
+       int rc = 0;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY:
+               /* XXX: here should be calling obd_precleanup() down to
+                * stack. */
+               break;
+       case OBD_CLEANUP_EXPORTS:
+               fld_client_proc_fini(&lmv->lmv_fld);
+               lprocfs_obd_cleanup(obd);
+               break;
+       default:
+               break;
+       }
+       RETURN(rc);
+}
+
+static int lmv_get_info(const struct lu_env *env, struct obd_export *exp,
+                       __u32 keylen, void *key, __u32 *vallen, void *val,
+                       struct lov_stripe_md *lsm)
+{
+       struct obd_device       *obd;
+       struct lmv_obd    *lmv;
+       int                   rc = 0;
+       ENTRY;
+
+       obd = class_exp2obd(exp);
+       if (obd == NULL) {
+               CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+
+       lmv = &obd->u.lmv;
+       if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) {
+               struct lmv_tgt_desc *tgt;
+               int i;
+
+               rc = lmv_check_connect(obd);
+               if (rc)
+                       RETURN(rc);
+
+               LASSERT(*vallen == sizeof(__u32));
+               for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                       tgt = lmv->tgts[i];
+                       /*
+                        * All tgts should be connected when this gets called.
+                        */
+                       if (tgt == NULL || tgt->ltd_exp == NULL)
+                               continue;
+
+                       if (!obd_get_info(env, tgt->ltd_exp, keylen, key,
+                                         vallen, val, NULL))
+                               RETURN(0);
+               }
+               RETURN(-EINVAL);
+       } else if (KEY_IS(KEY_MAX_EASIZE) || KEY_IS(KEY_CONN_DATA)) {
+               rc = lmv_check_connect(obd);
+               if (rc)
+                       RETURN(rc);
+
+               /*
+                * Forward this request to the first MDS; it should know the
+                * LOV desc.
+                */
+               rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key,
+                                 vallen, val, NULL);
+               if (!rc && KEY_IS(KEY_CONN_DATA))
+                       exp->exp_connect_data = *(struct obd_connect_data *)val;
+               RETURN(rc);
+       } else if (KEY_IS(KEY_TGT_COUNT)) {
+               *((int *)val) = lmv->desc.ld_tgt_count;
+               RETURN(0);
+       }
+
+       CDEBUG(D_IOCTL, "Invalid key\n");
+       RETURN(-EINVAL);
+}
+
+int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                      obd_count keylen, void *key, obd_count vallen,
+                      void *val, struct ptlrpc_request_set *set)
+{
+       struct lmv_tgt_desc    *tgt;
+       struct obd_device      *obd;
+       struct lmv_obd   *lmv;
+       int rc = 0;
+       ENTRY;
+
+       obd = class_exp2obd(exp);
+       if (obd == NULL) {
+               CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+       lmv = &obd->u.lmv;
+
+       if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) {
+               int i, err = 0;
+
+               for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                       tgt = lmv->tgts[i];
+
+                       if (tgt == NULL || tgt->ltd_exp == NULL)
+                               continue;
+
+                       err = obd_set_info_async(env, tgt->ltd_exp,
+                                                keylen, key, vallen, val, set);
+                       if (err && rc == 0)
+                               rc = err;
+               }
+
+               RETURN(rc);
+       }
+
+       RETURN(-EINVAL);
+}
+
+int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+              struct lov_stripe_md *lsm)
+{
+       struct obd_device        *obd = class_exp2obd(exp);
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_stripe_md      *meap;
+       struct lmv_stripe_md      *lsmp;
+       int                     mea_size;
+       int                     i;
+       ENTRY;
+
+       mea_size = lmv_get_easize(lmv);
+       if (!lmmp)
+               RETURN(mea_size);
+
+       if (*lmmp && !lsm) {
+               OBD_FREE_LARGE(*lmmp, mea_size);
+               *lmmp = NULL;
+               RETURN(0);
+       }
+
+       if (*lmmp == NULL) {
+               OBD_ALLOC_LARGE(*lmmp, mea_size);
+               if (*lmmp == NULL)
+                       RETURN(-ENOMEM);
+       }
+
+       if (!lsm)
+               RETURN(mea_size);
+
+       lsmp = (struct lmv_stripe_md *)lsm;
+       meap = (struct lmv_stripe_md *)*lmmp;
+
+       if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR &&
+           lsmp->mea_magic != MEA_MAGIC_ALL_CHARS)
+               RETURN(-EINVAL);
+
+       meap->mea_magic = cpu_to_le32(lsmp->mea_magic);
+       meap->mea_count = cpu_to_le32(lsmp->mea_count);
+       meap->mea_master = cpu_to_le32(lsmp->mea_master);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               meap->mea_ids[i] = lsmp->mea_ids[i];
+               fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]);
+       }
+
+       RETURN(mea_size);
+}
+
+int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+                struct lov_mds_md *lmm, int lmm_size)
+{
+       struct obd_device         *obd = class_exp2obd(exp);
+       struct lmv_stripe_md      **tmea = (struct lmv_stripe_md **)lsmp;
+       struct lmv_stripe_md       *mea = (struct lmv_stripe_md *)lmm;
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       int                      mea_size;
+       int                      i;
+       __u32                  magic;
+       ENTRY;
+
+       mea_size = lmv_get_easize(lmv);
+       if (lsmp == NULL)
+               return mea_size;
+
+       if (*lsmp != NULL && lmm == NULL) {
+               OBD_FREE_LARGE(*tmea, mea_size);
+               *lsmp = NULL;
+               RETURN(0);
+       }
+
+       LASSERT(mea_size == lmm_size);
+
+       OBD_ALLOC_LARGE(*tmea, mea_size);
+       if (*tmea == NULL)
+               RETURN(-ENOMEM);
+
+       if (!lmm)
+               RETURN(mea_size);
+
+       if (mea->mea_magic == MEA_MAGIC_LAST_CHAR ||
+           mea->mea_magic == MEA_MAGIC_ALL_CHARS ||
+           mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) {
+               magic = le32_to_cpu(mea->mea_magic);
+       } else {
+               /*
+                * Old-format mea is not handled here.
+                */
+               CERROR("Unsupported old EA format found\n");
+               LBUG();
+       }
+
+       (*tmea)->mea_magic = magic;
+       (*tmea)->mea_count = le32_to_cpu(mea->mea_count);
+       (*tmea)->mea_master = le32_to_cpu(mea->mea_master);
+
+       for (i = 0; i < (*tmea)->mea_count; i++) {
+               (*tmea)->mea_ids[i] = mea->mea_ids[i];
+               fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]);
+       }
+       RETURN(mea_size);
+}
+
+static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+                            ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                            ldlm_cancel_flags_t flags, void *opaque)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       int                   rc = 0;
+       int                   err;
+       int                   i;
+       ENTRY;
+
+       LASSERT(fid != NULL);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL ||
+                   lmv->tgts[i]->ltd_active == 0)
+                       continue;
+
+               err = md_cancel_unused(lmv->tgts[i]->ltd_exp, fid,
+                                      policy, mode, flags, opaque);
+               if (!rc)
+                       rc = err;
+       }
+       RETURN(rc);
+}
+
+int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+                     __u64 *bits)
+{
+       struct lmv_obd    *lmv = &exp->exp_obd->u.lmv;
+       int                   rc;
+       ENTRY;
+
+       rc =  md_set_lock_data(lmv->tgts[0]->ltd_exp, lockh, data, bits);
+       RETURN(rc);
+}
+
+ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags,
+                          const struct lu_fid *fid, ldlm_type_t type,
+                          ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                          struct lustre_handle *lockh)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       ldlm_mode_t           rc;
+       int                   i;
+       ENTRY;
+
+       CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
+
+       /*
+        * With CMD every object can have two locks in different namespaces:
+        * a lookup lock in the namespace of the MDS storing the direntry, and
+        * an update/open lock in the namespace of the MDS storing the inode.
+        * Thus we check all targets, not only the one the fid was created on.
+        */
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL ||
+                   lmv->tgts[i]->ltd_exp == NULL ||
+                   lmv->tgts[i]->ltd_active == 0)
+                       continue;
+
+               rc = md_lock_match(lmv->tgts[i]->ltd_exp, flags, fid,
+                                  type, policy, mode, lockh);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       RETURN(0);
+}
+
+int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+                     struct obd_export *dt_exp, struct obd_export *md_exp,
+                     struct lustre_md *md)
+{
+       struct lmv_obd    *lmv = &exp->exp_obd->u.lmv;
+
+       return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md);
+}
+
+int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       ENTRY;
+
+       if (md->mea)
+               obd_free_memmd(exp, (void *)&md->mea);
+       RETURN(md_free_lustre_md(lmv->tgts[0]->ltd_exp, md));
+}
+
+int lmv_set_open_replay_data(struct obd_export *exp,
+                            struct obd_client_handle *och,
+                            struct ptlrpc_request *open_req)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       ENTRY;
+
+       tgt = lmv_find_target(lmv, &och->och_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       RETURN(md_set_open_replay_data(tgt->ltd_exp, och, open_req));
+}
+
+int lmv_clear_open_replay_data(struct obd_export *exp,
+                              struct obd_client_handle *och)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       ENTRY;
+
+       tgt = lmv_find_target(lmv, &och->och_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       RETURN(md_clear_open_replay_data(tgt->ltd_exp, och));
+}
+
+static int lmv_get_remote_perm(struct obd_export *exp,
+                              const struct lu_fid *fid,
+                              struct obd_capa *oc, __u32 suppgid,
+                              struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_get_remote_perm(tgt->ltd_exp, fid, oc, suppgid, request);
+       RETURN(rc);
+}
+
+static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+                         renew_capa_cb_t cb)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &oc->c_capa.lc_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_renew_capa(tgt->ltd_exp, oc, cb);
+       RETURN(rc);
+}
+
+int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+                   const struct req_msg_field *field, struct obd_capa **oc)
+{
+       struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+
+       return md_unpack_capa(lmv->tgts[0]->ltd_exp, req, field, oc);
+}
+
+int lmv_intent_getattr_async(struct obd_export *exp,
+                            struct md_enqueue_info *minfo,
+                            struct ldlm_enqueue_info *einfo)
+{
+       struct md_op_data       *op_data = &minfo->mi_data;
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt = NULL;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_intent_getattr_async(tgt->ltd_exp, minfo, einfo);
+       RETURN(rc);
+}
+
+int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+                       struct lu_fid *fid, __u64 *bits)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits);
+       RETURN(rc);
+}
+
+/**
+ * For lmv, we only need to send the request to the master MDT; the master
+ * MDT will process it together with the other slave MDTs. The only exception
+ * is Q_GETOQUOTA, for which we fetch data directly from the slave MDTs.
+ */
+int lmv_quotactl(struct obd_device *unused, struct obd_export *exp,
+                struct obd_quotactl *oqctl)
+{
+       struct obd_device   *obd = class_exp2obd(exp);
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc *tgt = lmv->tgts[0];
+       int               rc = 0, i;
+       __u64           curspace, curinodes;
+       ENTRY;
+
+       if (!lmv->desc.ld_tgt_count || !tgt->ltd_active) {
+               CERROR("master lmv inactive\n");
+               RETURN(-EIO);
+       }
+
+       if (oqctl->qc_cmd != Q_GETOQUOTA) {
+               rc = obd_quotactl(tgt->ltd_exp, oqctl);
+               RETURN(rc);
+       }
+
+       curspace = curinodes = 0;
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               int err;
+               tgt = lmv->tgts[i];
+
+               if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0)
+                       continue;
+               if (!tgt->ltd_active) {
+                       CDEBUG(D_HA, "mdt %d is inactive.\n", i);
+                       continue;
+               }
+
+               err = obd_quotactl(tgt->ltd_exp, oqctl);
+               if (err) {
+                       CERROR("getquota on mdt %d failed. %d\n", i, err);
+                       if (!rc)
+                               rc = err;
+               } else {
+                       curspace += oqctl->qc_dqblk.dqb_curspace;
+                       curinodes += oqctl->qc_dqblk.dqb_curinodes;
+               }
+       }
+       oqctl->qc_dqblk.dqb_curspace = curspace;
+       oqctl->qc_dqblk.dqb_curinodes = curinodes;
+
+       RETURN(rc);
+}
+
+int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp,
+                  struct obd_quotactl *oqctl)
+{
+       struct obd_device   *obd = class_exp2obd(exp);
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc *tgt;
+       int               i, rc = 0;
+       ENTRY;
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               int err;
+               tgt = lmv->tgts[i];
+               if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) {
+                       CERROR("lmv idx %d inactive\n", i);
+                       RETURN(-EIO);
+               }
+
+               err = obd_quotacheck(tgt->ltd_exp, oqctl);
+               if (err && !rc)
+                       rc = err;
+       }
+
+       RETURN(rc);
+}
+
+struct obd_ops lmv_obd_ops = {
+       .o_owner                = THIS_MODULE,
+       .o_setup                = lmv_setup,
+       .o_cleanup            = lmv_cleanup,
+       .o_precleanup      = lmv_precleanup,
+       .o_process_config       = lmv_process_config,
+       .o_connect            = lmv_connect,
+       .o_disconnect      = lmv_disconnect,
+       .o_statfs              = lmv_statfs,
+       .o_get_info          = lmv_get_info,
+       .o_set_info_async       = lmv_set_info_async,
+       .o_packmd              = lmv_packmd,
+       .o_unpackmd          = lmv_unpackmd,
+       .o_notify              = lmv_notify,
+       .o_get_uuid          = lmv_get_uuid,
+       .o_iocontrol        = lmv_iocontrol,
+       .o_quotacheck      = lmv_quotacheck,
+       .o_quotactl          = lmv_quotactl
+};
+
+struct md_ops lmv_md_ops = {
+       .m_getstatus        = lmv_getstatus,
+       .m_null_inode           = lmv_null_inode,
+       .m_find_cbdata    = lmv_find_cbdata,
+       .m_close                = lmv_close,
+       .m_create              = lmv_create,
+       .m_done_writing  = lmv_done_writing,
+       .m_enqueue            = lmv_enqueue,
+       .m_getattr            = lmv_getattr,
+       .m_getxattr          = lmv_getxattr,
+       .m_getattr_name  = lmv_getattr_name,
+       .m_intent_lock    = lmv_intent_lock,
+       .m_link          = lmv_link,
+       .m_rename              = lmv_rename,
+       .m_setattr            = lmv_setattr,
+       .m_setxattr          = lmv_setxattr,
+       .m_sync          = lmv_sync,
+       .m_readpage          = lmv_readpage,
+       .m_unlink              = lmv_unlink,
+       .m_init_ea_size  = lmv_init_ea_size,
+       .m_cancel_unused        = lmv_cancel_unused,
+       .m_set_lock_data        = lmv_set_lock_data,
+       .m_lock_match      = lmv_lock_match,
+       .m_get_lustre_md        = lmv_get_lustre_md,
+       .m_free_lustre_md       = lmv_free_lustre_md,
+       .m_set_open_replay_data = lmv_set_open_replay_data,
+       .m_clear_open_replay_data = lmv_clear_open_replay_data,
+       .m_renew_capa      = lmv_renew_capa,
+       .m_unpack_capa    = lmv_unpack_capa,
+       .m_get_remote_perm      = lmv_get_remote_perm,
+       .m_intent_getattr_async = lmv_intent_getattr_async,
+       .m_revalidate_lock      = lmv_revalidate_lock
+};
+
+int __init lmv_init(void)
+{
+       struct lprocfs_static_vars lvars;
+       int                     rc;
+
+       lprocfs_lmv_init_vars(&lvars);
+
+       rc = class_register_type(&lmv_obd_ops, &lmv_md_ops,
+                                lvars.module_vars, LUSTRE_LMV_NAME, NULL);
+       return rc;
+}
+
+static void lmv_exit(void)
+{
+       class_unregister_type(LUSTRE_LMV_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
+MODULE_LICENSE("GPL");
+
+module_init(lmv_init);
+module_exit(lmv_exit);
diff --git a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c
new file mode 100644 (file)
index 0000000..4bbe024
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/seq_file.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+
+#ifndef LPROCFS
+static struct lprocfs_vars lprocfs_module_vars[] = { {0} };
+static struct lprocfs_vars lprocfs_obd_vars[] = { {0} };
+#else
+static int lmv_rd_numobd(char *page, char **start, off_t off, int count,
+                        int *eof, void *data)
+{
+       struct obd_device       *dev = (struct obd_device*)data;
+       struct lmv_desc  *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lmv.desc;
+       *eof = 1;
+       return snprintf(page, count, "%u\n", desc->ld_tgt_count);
+
+}
+
+static const char *placement_name[] = {
+       [PLACEMENT_CHAR_POLICY] = "CHAR",
+       [PLACEMENT_NID_POLICY]  = "NID",
+       [PLACEMENT_INVAL_POLICY]  = "INVAL"
+};
+
+static placement_policy_t placement_name2policy(char *name, int len)
+{
+       int                  i;
+
+       for (i = 0; i < PLACEMENT_MAX_POLICY; i++) {
+               if (!strncmp(placement_name[i], name, len))
+                       return i;
+       }
+       return PLACEMENT_INVAL_POLICY;
+}
+
+static const char *placement_policy2name(placement_policy_t placement)
+{
+       LASSERT(placement < PLACEMENT_MAX_POLICY);
+       return placement_name[placement];
+}
+
+static int lmv_rd_placement(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct obd_device       *dev = (struct obd_device*)data;
+       struct lmv_obd    *lmv;
+
+       LASSERT(dev != NULL);
+       lmv = &dev->u.lmv;
+       *eof = 1;
+       return snprintf(page, count, "%s\n",
+                       placement_policy2name(lmv->lmv_placement));
+
+}
+
+#define MAX_POLICY_STRING_SIZE 64
+
+static int lmv_wr_placement(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct obd_device       *dev = (struct obd_device *)data;
+       char                 dummy[MAX_POLICY_STRING_SIZE + 1];
+       int                   len = count;
+       placement_policy_t       policy;
+       struct lmv_obd    *lmv;
+
+       if (len > MAX_POLICY_STRING_SIZE)
+               len = MAX_POLICY_STRING_SIZE;
+
+       /* Only copy what the user actually wrote, so that short writes do
+        * not read past the end of the user buffer. */
+       if (copy_from_user(dummy, buffer, len))
+               return -EFAULT;
+
+       LASSERT(dev != NULL);
+       lmv = &dev->u.lmv;
+
+       if (len > 0 && dummy[len - 1] == '\n')
+               len--;
+       dummy[len] = '\0';
+
+       policy = placement_name2policy(dummy, len);
+       if (policy != PLACEMENT_INVAL_POLICY) {
+               spin_lock(&lmv->lmv_lock);
+               lmv->lmv_placement = policy;
+               spin_unlock(&lmv->lmv_lock);
+       } else {
+               CERROR("Invalid placement policy \"%s\"!\n", dummy);
+               return -EINVAL;
+       }
+       return count;
+}
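+
+/*
+ * Usage sketch (illustrative only, not part of the original patch): with
+ * LPROCFS enabled, the placement policy can be switched at runtime by
+ * writing one of the names from placement_name[] to the "placement" proc
+ * file registered below; the exact device path depends on the mount, e.g.:
+ *
+ *     echo NID > /proc/fs/lustre/lmv/<fsname>-clilmv-*/placement
+ *     cat /proc/fs/lustre/lmv/<fsname>-clilmv-*/placement
+ */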
+
+static int lmv_rd_activeobd(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct obd_device       *dev = (struct obd_device*)data;
+       struct lmv_desc  *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lmv.desc;
+       *eof = 1;
+       return snprintf(page, count, "%u\n", desc->ld_active_tgt_count);
+}
+
+static int lmv_rd_desc_uuid(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct obd_device       *dev = (struct obd_device*) data;
+       struct lmv_obd    *lmv;
+
+       LASSERT(dev != NULL);
+       lmv = &dev->u.lmv;
+       *eof = 1;
+       return snprintf(page, count, "%s\n", lmv->desc.ld_uuid.uuid);
+}
+
+static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct obd_device       *dev = p->private;
+       struct lmv_obd    *lmv = &dev->u.lmv;
+       return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos];
+}
+
+static void lmv_tgt_seq_stop(struct seq_file *p, void *v)
+{
+       return;
+}
+
+static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct obd_device       *dev = p->private;
+       struct lmv_obd    *lmv = &dev->u.lmv;
+       ++*pos;
+       return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos];
+}
+
+static int lmv_tgt_seq_show(struct seq_file *p, void *v)
+{
+       struct lmv_tgt_desc     *tgt = v;
+
+       if (tgt == NULL)
+               return 0;
+       return seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_idx,
+                         tgt->ltd_uuid.uuid, tgt->ltd_active ? "" : "IN");
+}
+
+struct seq_operations lmv_tgt_sops = {
+       .start           = lmv_tgt_seq_start,
+       .stop             = lmv_tgt_seq_stop,
+       .next             = lmv_tgt_seq_next,
+       .show             = lmv_tgt_seq_show,
+};
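+
+/*
+ * Illustrative output (not part of the original patch): reading the seq_file
+ * built on lmv_tgt_sops yields one line per target in the format used by
+ * lmv_tgt_seq_show() above, for example (target UUIDs are hypothetical):
+ *
+ *     0: lustre-MDT0000_UUID ACTIVE
+ *     1: lustre-MDT0001_UUID INACTIVE
+ */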
+
+static int lmv_target_seq_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry   *dp = PDE(inode);
+       struct seq_file  *seq;
+       int                  rc;
+
+       rc = seq_open(file, &lmv_tgt_sops);
+       if (rc)
+               return rc;
+
+       seq = file->private_data;
+       seq->private = dp->data;
+
+       return 0;
+}
+
+struct lprocfs_vars lprocfs_lmv_obd_vars[] = {
+       { "numobd",          lmv_rd_numobd,       0, 0 },
+       { "placement",    lmv_rd_placement,       lmv_wr_placement, 0 },
+       { "activeobd",    lmv_rd_activeobd,       0, 0 },
+       { "uuid",              lprocfs_rd_uuid, 0, 0 },
+       { "desc_uuid",    lmv_rd_desc_uuid,       0, 0 },
+       { 0 }
+};
+
+static struct lprocfs_vars lprocfs_lmv_module_vars[] = {
+       { "num_refs",      lprocfs_rd_numrefs,     0, 0 },
+       { 0 }
+};
+
+struct file_operations lmv_proc_target_fops = {
+       .owner          = THIS_MODULE,
+       .open            = lmv_target_seq_open,
+       .read            = seq_read,
+       .llseek        = seq_lseek,
+       .release              = seq_release,
+};
+
+#endif /* LPROCFS */
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars    = lprocfs_lmv_module_vars;
+       lvars->obd_vars       = lprocfs_lmv_obd_vars;
+}
diff --git a/drivers/staging/lustre/lustre/lov/Makefile b/drivers/staging/lustre/lustre/lov/Makefile
new file mode 100644 (file)
index 0000000..67eaec2
--- /dev/null
@@ -0,0 +1,9 @@
+obj-$(CONFIG_LUSTRE_FS) += lov.o
+lov-y := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o \
+        lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o  \
+        lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o      \
+        lovsub_lock.o lovsub_io.o lov_pool.o
+
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
new file mode 100644 (file)
index 0000000..28801b8
--- /dev/null
@@ -0,0 +1,820 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#ifndef LOV_CL_INTERNAL_H
+#define LOV_CL_INTERNAL_H
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <cl_object.h>
+#include "lov_internal.h"
+
+/** \defgroup lov lov
+ * Logical object volume layer. This layer implements data striping (raid0).
+ *
+ * At the lov layer top-entity (object, page, lock, io) is connected to one or
+ * more sub-entities: top-object, representing a file is connected to a set of
+ * sub-objects, each representing a stripe, file-level top-lock is connected
+ * to a set of per-stripe sub-locks, top-page is connected to a (single)
+ * sub-page, and a top-level IO is connected to a set of (potentially
+ * concurrent) sub-IO's.
+ *
+ * Sub-objects, sub-pages, and sub-ios each have a well-defined top entity
+ * (top-object, top-page, and top-io respectively), while a single sub-lock
+ * can be part of multiple top-locks.
+ *
+ * Reference counting models are different for different types of entities:
+ *
+ *     - top-object keeps a reference to its sub-objects, and destroys them
+ *       when it is destroyed.
+ *
+ *     - top-page keeps a reference to its sub-page, and destroys it when it
+ *       is destroyed.
+ *
+ *     - a sub-lock keeps a reference to its top-locks. A top-lock keeps a
+ *       reference (and a hold, see cl_lock_hold()) on its sub-locks while it
+ *       is actively using them (that is, in cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When
+ *       moving into cl_lock_state::CLS_CACHED state, top-lock releases a
+ *       hold. From this moment top-lock has only a 'weak' reference to its
+ *       sub-locks. This reference is protected by top-lock
+ *       cl_lock::cll_guard, and will be automatically cleared by the sub-lock
+ *       when the latter is destroyed. When a sub-lock is canceled, a
+ *       reference to it is removed from the top-lock array, and top-lock is
+ *       moved into CLS_NEW state. It is guaranteed that all sub-locks exist
+ *       while their top-lock is in CLS_HELD or CLS_CACHED states.
+ *
+ *     - IO's are not reference counted.
+ *
+ * To implement a connection between top and sub entities, lov layer is split
+ * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both
+ * implementing the full set of cl-interfaces. For example, a top-object has
+ * vvp and lov layers, and its sub-objects have lovsub and osc layers. The
+ * lovsub layer is used to track the child-parent relationship.
+ *
+ * @{
+ */
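+
+/*
+ * Illustrative sketch (not part of the original patch): for a file striped
+ * over three OSTs, the layering described above gives roughly the following
+ * object stack, with the top-object holding a reference on each sub-object:
+ *
+ *     vvp / lov   top-object
+ *                   +-- lovsub / osc   sub-object (stripe 0)
+ *                   +-- lovsub / osc   sub-object (stripe 1)
+ *                   +-- lovsub / osc   sub-object (stripe 2)
+ */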
+
+struct lovsub_device;
+struct lovsub_object;
+struct lovsub_lock;
+
+enum lov_device_flags {
+       LOV_DEV_INITIALIZED = 1 << 0
+};
+
+/*
+ * Upper half.
+ */
+
+/**
+ * Resources that are used in the memory-cleaning path, and whose allocation
+ * cannot fail even when memory is tight. They are preallocated in sufficient
+ * quantities in lov_device::ld_emrg[], and access to them is serialized by
+ * lov_device::ld_mutex.
+ */
+struct lov_device_emerg {
+       /**
+        * Page list used to submit IO when memory is under pressure.
+        */
+       struct cl_page_list emrg_page_list;
+       /**
+        * sub-io's shared by all threads accessing this device when memory is
+        * too low to allocate sub-io's dynamically.
+        */
+       struct cl_io    emrg_subio;
+       /**
+        * Environments used by sub-io's in
+        * lov_device_emerg::emrg_subio.
+        */
+       struct lu_env      *emrg_env;
+       /**
+        * Refchecks for lov_device_emerg::emrg_env.
+        *
+        * \see cl_env_get()
+        */
+       int              emrg_refcheck;
+};
+
+struct lov_device {
+       /*
+        * XXX Locking of lov-private data is missing.
+        */
+       struct cl_device          ld_cl;
+       struct lov_obd     *ld_lov;
+       /** size of lov_device::ld_target[] array */
+       __u32                ld_target_nr;
+       struct lovsub_device    **ld_target;
+       __u32                ld_flags;
+
+       /** Emergency resources used in memory-cleansing paths. */
+       struct lov_device_emerg **ld_emrg;
+       /**
+        * Serializes access to lov_device::ld_emrg in low-memory
+        * conditions.
+        */
+       struct mutex              ld_mutex;
+};
+
+/**
+ * Layout type.
+ */
+enum lov_layout_type {
+       /** empty file without body */
+       LLT_EMPTY,
+       /** striped file */
+       LLT_RAID0,
+       LLT_NR
+};
+
+/**
+ * lov-specific file state.
+ *
+ * lov object has particular layout type, determining how top-object is built
+ * on top of sub-objects. Layout type can change dynamically. When this
+ * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode,
+ * all state pertaining to the old layout type is destroyed, and new state is
+ * constructed. All object methods take said semaphore in the shared mode,
+ * providing serialization against transition between layout types.
+ *
+ * To avoid multiple `if' or `switch' statements, selecting behavior for the
+ * current layout type, object methods perform double-dispatch, invoking
+ * function corresponding to the current layout type.
+ */
+struct lov_object {
+       struct cl_object       lo_cl;
+       /**
+        * Serializes object operations with transitions between layout types.
+        *
+        * This semaphore is taken in shared mode by all object methods, and
+        * is taken in exclusive mode when object type is changed.
+        *
+        * \see lov_object::lo_type
+        */
+       struct rw_semaphore     lo_type_guard;
+       /**
+        * Type of an object. Protected by lov_object::lo_type_guard.
+        */
+       enum lov_layout_type    lo_type;
+       /**
+        * True if layout is invalid. This bit is cleared when layout lock
+        * is lost.
+        */
+       bool                    lo_layout_invalid;
+       /**
+        * How many IOs are in flight on this object. The layout can be
+        * changed only if there is no active IO.
+        */
+       atomic_t               lo_active_ios;
+       /**
+        * Waitq used to wait until no one else is using lo_lsm
+        */
+       wait_queue_head_t              lo_waitq;
+       /**
+        * Layout metadata. NULL if empty layout.
+        */
+       struct lov_stripe_md  *lo_lsm;
+
+       union lov_layout_state {
+               struct lov_layout_raid0 {
+                       unsigned               lo_nr;
+                       /**
+                        * When this is true, lov_object::lo_attr contains
+                        * valid, up-to-date attributes for the top-level
+                        * object. This field is reset to 0 when attributes of
+                        * any sub-object change.
+                        */
+                       int                    lo_attr_valid;
+                       /**
+                        * Array of sub-objects. Allocated when top-object is
+                        * created (lov_init_raid0()).
+                        *
+                        * Top-object is a strict master of its sub-objects:
+                        * it is created before them, and outlives its
+                        * children (the latter is necessary so that basic
+                        * functions like cl_object_top() always
+                        * work). Top-object keeps a reference on every
+                        * sub-object.
+                        *
+                        * When top-object is destroyed (lov_delete_raid0())
+                        * it releases its reference to a sub-object and waits
+                        * until the latter is finally destroyed.
+                        */
+                       struct lovsub_object **lo_sub;
+                       /**
+                        * protect lo_sub
+                        */
+                       spinlock_t              lo_sub_lock;
+                       /**
+                        * Cached object attribute, built from sub-object
+                        * attributes.
+                        */
+                       struct cl_attr   lo_attr;
+               } raid0;
+               struct lov_layout_state_empty {
+               } empty;
+       } u;
+       /**
+        * Thread that acquired lov_object::lo_type_guard in an exclusive
+        * mode.
+        */
+       task_t      *lo_owner;
+};
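+
+/*
+ * Illustrative sketch (not part of the original patch) of the double-dispatch
+ * described above struct lov_object: a method would typically take
+ * lo_type_guard in shared mode and index an operations table by
+ * lov_object::lo_type (the table and method names here are hypothetical):
+ *
+ *     down_read(&lov->lo_type_guard);
+ *     rc = lov_dispatch[lov->lo_type]->llo_some_method(env, lov, ...);
+ *     up_read(&lov->lo_type_guard);
+ */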
+
+/**
+ * Flags that top-lock can set on each of its sub-locks.
+ */
+enum lov_sub_flags {
+       /** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */
+       LSF_HELD = 1 << 0
+};
+
+/**
+ * State lov_lock keeps for each sub-lock.
+ */
+struct lov_lock_sub {
+       /** sub-lock itself */
+       struct lovsub_lock  *sub_lock;
+       /** An array of per-sub-lock flags, taken from enum lov_sub_flags */
+       unsigned             sub_flags;
+       int               sub_stripe;
+       struct cl_lock_descr sub_descr;
+       struct cl_lock_descr sub_got;
+};
+
+/**
+ * lov-specific lock state.
+ */
+struct lov_lock {
+       struct cl_lock_slice   lls_cl;
+       /** Number of sub-locks in this lock */
+       int                 lls_nr;
+       /**
+        * Number of existing sub-locks.
+        */
+       unsigned               lls_nr_filled;
+       /**
+        * Set when a sub-lock was canceled while the top-lock was being
+        * used or was unused.
+        */
+       unsigned int           lls_cancel_race:1;
+       /**
+        * An array of sub-locks
+        *
+        * There are two issues with managing sub-locks:
+        *
+        *     - sub-locks are concurrently canceled, and
+        *
+        *     - sub-locks are shared with other top-locks.
+        *
+        * To manage cancellation, top-lock acquires a hold on a sublock
+        * (lov_sublock_adopt()) when the latter is inserted into
+        * lov_lock::lls_sub[]. This hold is released (lov_sublock_release())
+        * when top-lock is going into CLS_CACHED state or destroyed. Hold
+        * prevents sub-lock from cancellation.
+        *
+        * Sub-lock sharing means, among other things, that top-lock that is
+        * in the process of creation (i.e., not yet inserted into lock list)
+        * is already accessible to other threads once at least one of its
+        * sub-locks is created, see lov_lock_sub_init().
+        *
+        * Sub-lock can be in one of the following states:
+        *
+        *     - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such
+        *       sub-lock was either never created (top-lock is in CLS_NEW
+        *       state), or it was created, then canceled, then destroyed
+        *       (lov_lock_unlink() cleared sub-lock pointer in the top-lock).
+        *
+        *     - sub-lock exists and is on
+        *       hold. (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a
+        *       normal state of a sub-lock in CLS_HELD and CLS_CACHED states
+        *       of a top-lock.
+        *
+        *     - sub-lock exists, but is not held by the top-lock. This
+        *       happens after top-lock released a hold on sub-locks before
+        *       going into cache (lov_lock_unuse()).
+        *
+        * \todo To support wide-striping, array has to be replaced with a set
+        * of queues to avoid scanning.
+        */
+       struct lov_lock_sub   *lls_sub;
+       /**
+        * Original description with which lock was enqueued.
+        */
+       struct cl_lock_descr   lls_orig;
+};
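+
+/*
+ * Illustrative sketch (not part of the original patch): the three sub-lock
+ * states listed above can be told apart from lov_lock::lls_sub[] roughly as
+ * follows:
+ *
+ *     struct lov_lock_sub *lls = &lck->lls_sub[i];
+ *
+ *     if (lls->sub_lock == NULL)
+ *             ; // never created, or canceled and then destroyed
+ *     else if (lls->sub_flags & LSF_HELD)
+ *             ; // on hold: normal for CLS_HELD/CLS_CACHED top-locks
+ *     else
+ *             ; // exists, but the hold was released (lov_lock_unuse())
+ */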
+
+struct lov_page {
+       struct cl_page_slice lps_cl;
+       int               lps_invalid;
+};
+
+/*
+ * Bottom half.
+ */
+
+struct lovsub_device {
+       struct cl_device   acid_cl;
+       struct lov_device *acid_super;
+       int             acid_idx;
+       struct cl_device  *acid_next;
+};
+
+struct lovsub_object {
+       struct cl_object_header lso_header;
+       struct cl_object        lso_cl;
+       struct lov_object      *lso_super;
+       int                  lso_index;
+};
+
+/**
+ * A link between a top-lock and a sub-lock. Separate data-structure is
+ * necessary, because top-locks and sub-locks are in M:N relationship.
+ *
+ * \todo This can be optimized for the (by far) most frequent case of a single
+ * top-lock per sub-lock.
+ */
+struct lov_lock_link {
+       struct lov_lock *lll_super;
+       /** An index within parent lock. */
+       int           lll_idx;
+       /**
+        * A linkage into per sub-lock list of all corresponding top-locks,
+        * hanging off lovsub_lock::lss_parents.
+        */
+       struct list_head       lll_list;
+};
+
+/**
+ * Lock state at lovsub layer.
+ */
+struct lovsub_lock {
+       struct cl_lock_slice  lss_cl;
+       /**
+        * List of top-locks that have given sub-lock as their part. Protected
+        * by cl_lock::cll_guard mutex.
+        */
+       struct list_head            lss_parents;
+       /**
+        * Top-lock that initiated current operation on this sub-lock. This is
+        * only set during top-to-bottom lock operations like enqueue, and is
+        * used to optimize state change notification. Protected by
+        * cl_lock::cll_guard mutex.
+        *
+        * \see lovsub_lock_state_one().
+        */
+       struct cl_lock       *lss_active;
+};
+
+/**
+ * Describe the environment settings for sublocks.
+ */
+struct lov_sublock_env {
+       const struct lu_env *lse_env;
+       struct cl_io    *lse_io;
+       struct lov_io_sub   *lse_sub;
+};
+
+struct lovsub_page {
+       struct cl_page_slice lsb_cl;
+};
+
+
+struct lov_thread_info {
+       struct cl_object_conf   lti_stripe_conf;
+       struct lu_fid      lti_fid;
+       struct cl_lock_descr    lti_ldescr;
+       struct ost_lvb    lti_lvb;
+       struct cl_2queue        lti_cl2q;
+       struct cl_lock_closure  lti_closure;
+       wait_queue_t      lti_waiter;
+};
+
+/**
+ * State that lov_io maintains for every sub-io.
+ */
+struct lov_io_sub {
+       int               sub_stripe;
+       /**
+        * sub-io for a stripe. Ideally sub-io's can be stopped and resumed
+        * independently, with lov acting as a scheduler to maximize overall
+        * throughput.
+        */
+       struct cl_io    *sub_io;
+       /**
+        * Linkage into a list (hanging off lov_io::lis_active) of all
+        * sub-io's active for the current IO iteration.
+        */
+       struct list_head           sub_linkage;
+       /**
+        * true, iff cl_io_init() was successfully executed against
+        * lov_io_sub::sub_io.
+        */
+       int               sub_io_initialized;
+       /**
+        * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't
+        * allocated, but borrowed from a per-device emergency pool.
+        */
+       int               sub_borrowed;
+       /**
+        * environment, in which sub-io executes.
+        */
+       struct lu_env *sub_env;
+       /**
+        * environment's refcheck.
+        *
+        * \see cl_env_get()
+        */
+       int               sub_refcheck;
+       int               sub_refcheck2;
+       int               sub_reenter;
+       void            *sub_cookie;
+};
+
+/**
+ * IO state private for LOV.
+ */
+struct lov_io {
+       /** super-class */
+       struct cl_io_slice lis_cl;
+       /**
+        * Pointer to the object slice. This is a duplicate of
+        * lov_io::lis_cl::cis_object.
+        */
+       struct lov_object *lis_object;
+       /**
+        * Original end-of-io position for this IO, set by the upper layer as
+        * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this,
+        * changes pos and count to fit IO into a single stripe and uses saved
+        * value to determine when IO iterations have to stop.
+        *
+        * This is used only for CIT_READ and CIT_WRITE io's.
+        */
+       loff_t       lis_io_endpos;
+
+       /**
+        * starting position within a file, for the current io loop iteration
+        * (stripe), used by ci_io_loop().
+        */
+       obd_off     lis_pos;
+       /**
+        * end position within a file, for the current stripe io. This is
+        * exclusive (i.e., next offset after last byte affected by io).
+        */
+       obd_off     lis_endpos;
+
+       int             lis_mem_frozen;
+       int             lis_stripe_count;
+       int             lis_active_subios;
+
+       /**
+        * the index of ls_single_subio in ls_subios array
+        */
+       int             lis_single_subio_index;
+       struct cl_io       lis_single_subio;
+
+       /**
+        * size of ls_subios array, actually the highest stripe #
+        */
+       int             lis_nr_subios;
+       struct lov_io_sub *lis_subs;
+       /**
+        * List of active sub-io's.
+        */
+       struct list_head         lis_active;
+};
+
+struct lov_session {
+       struct lov_io     ls_io;
+       struct lov_sublock_env ls_subenv;
+};
+
+/**
+ * State of transfer for lov.
+ */
+struct lov_req {
+       struct cl_req_slice lr_cl;
+};
+
+/**
+ * State of transfer for lovsub.
+ */
+struct lovsub_req {
+       struct cl_req_slice lsrq_cl;
+};
+
+extern struct lu_device_type lov_device_type;
+extern struct lu_device_type lovsub_device_type;
+
+extern struct lu_context_key lov_key;
+extern struct lu_context_key lov_session_key;
+
+extern struct kmem_cache *lov_lock_kmem;
+extern struct kmem_cache *lov_object_kmem;
+extern struct kmem_cache *lov_thread_kmem;
+extern struct kmem_cache *lov_session_kmem;
+extern struct kmem_cache *lov_req_kmem;
+
+extern struct kmem_cache *lovsub_lock_kmem;
+extern struct kmem_cache *lovsub_object_kmem;
+extern struct kmem_cache *lovsub_req_kmem;
+
+extern struct kmem_cache *lov_lock_link_kmem;
+
+int   lov_object_init     (const struct lu_env *env, struct lu_object *obj,
+                          const struct lu_object_conf *conf);
+int   lovsub_object_init  (const struct lu_env *env, struct lu_object *obj,
+                          const struct lu_object_conf *conf);
+int   lov_lock_init       (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init       (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_io *io);
+int   lovsub_lock_init    (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_lock *lock, const struct cl_io *io);
+
+int   lov_lock_init_raid0 (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_lock *lock, const struct cl_io *io);
+int   lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init_raid0   (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_io *io);
+int   lov_io_init_empty   (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_io *io);
+void  lov_lock_unlink     (const struct lu_env *env, struct lov_lock_link *link,
+                          struct lovsub_lock *sub);
+
+struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio,
+                              int stripe);
+void  lov_sub_put           (struct lov_io_sub *sub);
+int   lov_sublock_modify  (const struct lu_env *env, struct lov_lock *lov,
+                          struct lovsub_lock *sublock,
+                          const struct cl_lock_descr *d, int idx);
+
+
+int   lov_page_init       (const struct lu_env *env, struct cl_object *ob,
+                          struct cl_page *page, struct page *vmpage);
+int   lovsub_page_init    (const struct lu_env *env, struct cl_object *ob,
+                          struct cl_page *page, struct page *vmpage);
+
+int   lov_page_init_empty (const struct lu_env *env,
+                          struct cl_object *obj,
+                          struct cl_page *page, struct page *vmpage);
+int   lov_page_init_raid0 (const struct lu_env *env,
+                          struct cl_object *obj,
+                          struct cl_page *page, struct page *vmpage);
+struct lu_object *lov_object_alloc   (const struct lu_env *env,
+                                     const struct lu_object_header *hdr,
+                                     struct lu_device *dev);
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+                                     const struct lu_object_header *hdr,
+                                     struct lu_device *dev);
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+                                        struct lov_lock *lck,
+                                        struct lovsub_lock *sub);
+struct lov_io_sub    *lov_page_subio    (const struct lu_env *env,
+                                        struct lov_io *lio,
+                                        const struct cl_page_slice *slice);
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm);
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov);
+
+#define lov_foreach_target(lov, var)               \
+       for (var = 0; var < lov_targets_nr(lov); ++var)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ * Accessors.
+ *
+ */
+
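+/* Return the per-session LOV state stored in the environment's session
+ * context. */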
+static inline struct lov_session *lov_env_session(const struct lu_env *env)
+{
+       struct lov_session *ses;
+
+       ses = lu_context_key_get(env->le_ses, &lov_session_key);
+       LASSERT(ses != NULL);
+       return ses;
+}
+
+static inline struct lov_io *lov_env_io(const struct lu_env *env)
+{
+       return &lov_env_session(env)->ls_io;
+}
+
+static inline int lov_is_object(const struct lu_object *obj)
+{
+       return obj->lo_dev->ld_type == &lov_device_type;
+}
+
+static inline int lovsub_is_object(const struct lu_object *obj)
+{
+       return obj->lo_dev->ld_type == &lovsub_device_type;
+}
+
+static inline struct lu_device *lov2lu_dev(struct lov_device *lov)
+{
+       return &lov->ld_cl.cd_lu_dev;
+}
+
+static inline struct lov_device *lu2lov_dev(const struct lu_device *d)
+{
+       LINVRNT(d->ld_type == &lov_device_type);
+       return container_of0(d, struct lov_device, ld_cl.cd_lu_dev);
+}
+
+static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub)
+{
+       return &lovsub->acid_cl;
+}
+
+static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub)
+{
+       return &lovsub2cl_dev(lovsub)->cd_lu_dev;
+}
+
+static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d)
+{
+       LINVRNT(d->ld_type == &lovsub_device_type);
+       return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev);
+}
+
+static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d)
+{
+       LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type);
+       return container_of0(d, struct lovsub_device, acid_cl);
+}
+
+static inline struct lu_object *lov2lu(struct lov_object *lov)
+{
+       return &lov->lo_cl.co_lu;
+}
+
+static inline struct cl_object *lov2cl(struct lov_object *lov)
+{
+       return &lov->lo_cl;
+}
+
+static inline struct lov_object *lu2lov(const struct lu_object *obj)
+{
+       LINVRNT(lov_is_object(obj));
+       return container_of0(obj, struct lov_object, lo_cl.co_lu);
+}
+
+static inline struct lov_object *cl2lov(const struct cl_object *obj)
+{
+       LINVRNT(lov_is_object(&obj->co_lu));
+       return container_of0(obj, struct lov_object, lo_cl);
+}
+
+static inline struct lu_object *lovsub2lu(struct lovsub_object *los)
+{
+       return &los->lso_cl.co_lu;
+}
+
+static inline struct cl_object *lovsub2cl(struct lovsub_object *los)
+{
+       return &los->lso_cl;
+}
+
+static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj)
+{
+       LINVRNT(lovsub_is_object(&obj->co_lu));
+       return container_of0(obj, struct lovsub_object, lso_cl);
+}
+
+static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj)
+{
+       LINVRNT(lovsub_is_object(obj));
+       return container_of0(obj, struct lovsub_object, lso_cl.co_lu);
+}
+
+static inline struct lovsub_lock *
+cl2lovsub_lock(const struct cl_lock_slice *slice)
+{
+       LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu));
+       return container_of(slice, struct lovsub_lock, lss_cl);
+}
+
+static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+
+       slice = cl_lock_at(lock, &lovsub_device_type);
+       LASSERT(slice != NULL);
+       return cl2lovsub_lock(slice);
+}
+
+static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice)
+{
+       LINVRNT(lov_is_object(&slice->cls_obj->co_lu));
+       return container_of(slice, struct lov_lock, lls_cl);
+}
+
+static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice)
+{
+       LINVRNT(lov_is_object(&slice->cpl_obj->co_lu));
+       return container_of0(slice, struct lov_page, lps_cl);
+}
+
+static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice)
+{
+       return container_of0(slice, struct lov_req, lr_cl);
+}
+
+static inline struct lovsub_page *
+cl2lovsub_page(const struct cl_page_slice *slice)
+{
+       LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu));
+       return container_of0(slice, struct lovsub_page, lsb_cl);
+}
+
+static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice)
+{
+       return container_of0(slice, struct lovsub_req, lsrq_cl);
+}
+
+static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice)
+{
+       return slice->cpl_page->cp_child;
+}
+
+static inline struct lov_io *cl2lov_io(const struct lu_env *env,
+                               const struct cl_io_slice *ios)
+{
+       struct lov_io *lio;
+
+       lio = container_of(ios, struct lov_io, lis_cl);
+       LASSERT(lio == lov_env_io(env));
+       return lio;
+}
+
+static inline int lov_targets_nr(const struct lov_device *lov)
+{
+       return lov->ld_lov->desc.ld_tgt_count;
+}
+
+static inline struct lov_thread_info *lov_env_info(const struct lu_env *env)
+{
+       struct lov_thread_info *info;
+
+       info = lu_context_key_get(&env->le_ctx, &lov_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov)
+{
+       LASSERT(lov->lo_type == LLT_RAID0);
+       LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC ||
+               lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3);
+       return &lov->u.raid0;
+}
+
+/** @} lov */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lov/lov_dev.c b/drivers/staging/lustre/lustre/lov/lov_dev.c
new file mode 100644 (file)
index 0000000..f94f8d9
--- /dev/null
@@ -0,0 +1,533 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "lov_cl_internal.h"
+
+struct kmem_cache *lov_lock_kmem;
+struct kmem_cache *lov_object_kmem;
+struct kmem_cache *lov_thread_kmem;
+struct kmem_cache *lov_session_kmem;
+struct kmem_cache *lov_req_kmem;
+
+struct kmem_cache *lovsub_lock_kmem;
+struct kmem_cache *lovsub_object_kmem;
+struct kmem_cache *lovsub_req_kmem;
+
+struct kmem_cache *lov_lock_link_kmem;
+
+/** Lock class of lov_device::ld_mutex. */
+struct lock_class_key cl_lov_device_mutex_class;
+
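+/* Slab caches used by the LOV and LOVSUB layers; the table is terminated
+ * by an entry with a NULL ->ckd_cache pointer. */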
+struct lu_kmem_descr lov_caches[] = {
+       {
+               .ckd_cache = &lov_lock_kmem,
+               .ckd_name  = "lov_lock_kmem",
+               .ckd_size  = sizeof (struct lov_lock)
+       },
+       {
+               .ckd_cache = &lov_object_kmem,
+               .ckd_name  = "lov_object_kmem",
+               .ckd_size  = sizeof (struct lov_object)
+       },
+       {
+               .ckd_cache = &lov_thread_kmem,
+               .ckd_name  = "lov_thread_kmem",
+               .ckd_size  = sizeof (struct lov_thread_info)
+       },
+       {
+               .ckd_cache = &lov_session_kmem,
+               .ckd_name  = "lov_session_kmem",
+               .ckd_size  = sizeof (struct lov_session)
+       },
+       {
+               .ckd_cache = &lov_req_kmem,
+               .ckd_name  = "lov_req_kmem",
+               .ckd_size  = sizeof (struct lov_req)
+       },
+       {
+               .ckd_cache = &lovsub_lock_kmem,
+               .ckd_name  = "lovsub_lock_kmem",
+               .ckd_size  = sizeof (struct lovsub_lock)
+       },
+       {
+               .ckd_cache = &lovsub_object_kmem,
+               .ckd_name  = "lovsub_object_kmem",
+               .ckd_size  = sizeof (struct lovsub_object)
+       },
+       {
+               .ckd_cache = &lovsub_req_kmem,
+               .ckd_name  = "lovsub_req_kmem",
+               .ckd_size  = sizeof (struct lovsub_req)
+       },
+       {
+               .ckd_cache = &lov_lock_link_kmem,
+               .ckd_name  = "lov_lock_link_kmem",
+               .ckd_size  = sizeof (struct lov_lock_link)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+/*****************************************************************************
+ *
+ * Lov transfer operations.
+ *
+ */
+
+static void lov_req_completion(const struct lu_env *env,
+                              const struct cl_req_slice *slice, int ioret)
+{
+       struct lov_req *lr;
+
+       ENTRY;
+       lr = cl2lov_req(slice);
+       OBD_SLAB_FREE_PTR(lr, lov_req_kmem);
+       EXIT;
+}
+
+static const struct cl_req_operations lov_req_ops = {
+       .cro_completion = lov_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov device and device type functions.
+ *
+ */
+
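+/* Allocate per-thread LOV scratch state for a lu_context; an ERR_PTR is
+ * returned on allocation failure. */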
+static void *lov_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+       struct lov_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, __GFP_IO);
+       if (info != NULL)
+               INIT_LIST_HEAD(&info->lti_closure.clc_list);
+       else
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void lov_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct lov_thread_info *info = data;
+       LINVRNT(list_empty(&info->lti_closure.clc_list));
+       OBD_SLAB_FREE_PTR(info, lov_thread_kmem);
+}
+
+struct lu_context_key lov_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = lov_key_init,
+       .lct_fini = lov_key_fini
+};
+
+static void *lov_session_key_init(const struct lu_context *ctx,
+                                 struct lu_context_key *key)
+{
+       struct lov_session *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void lov_session_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+       struct lov_session *info = data;
+       OBD_SLAB_FREE_PTR(info, lov_session_kmem);
+}
+
+struct lu_context_key lov_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = lov_session_key_init,
+       .lct_fini = lov_session_key_fini
+};
+
+/* type constructor/destructor: lov_type_{init,fini,start,stop}() */
+LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key);
+
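+/* Tear down the cl-device stacks of all sub-devices attached to this LOV
+ * device. */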
+static struct lu_device *lov_device_fini(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       int i;
+       struct lov_device *ld = lu2lov_dev(d);
+
+       LASSERT(ld->ld_lov != NULL);
+       if (ld->ld_target == NULL)
+               RETURN(NULL);
+
+       lov_foreach_target(ld, i) {
+               struct lovsub_device *lsd;
+
+               lsd = ld->ld_target[i];
+               if (lsd != NULL) {
+                       cl_stack_fini(env, lovsub2cl_dev(lsd));
+                       ld->ld_target[i] = NULL;
+               }
+       }
+       RETURN(NULL);
+}
+
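+/* Build a lovsub cl-device stack for every configured target; on failure
+ * the partially initialized stacks are torn down again. */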
+static int lov_device_init(const struct lu_env *env, struct lu_device *d,
+                          const char *name, struct lu_device *next)
+{
+       struct lov_device *ld = lu2lov_dev(d);
+       int i;
+       int rc = 0;
+
+       LASSERT(d->ld_site != NULL);
+       if (ld->ld_target == NULL)
+               RETURN(rc);
+
+       lov_foreach_target(ld, i) {
+               struct lovsub_device *lsd;
+               struct cl_device     *cl;
+               struct lov_tgt_desc  *desc;
+
+               desc = ld->ld_lov->lov_tgts[i];
+               if (desc == NULL)
+                       continue;
+
+               cl = cl_type_setup(env, d->ld_site, &lovsub_device_type,
+                                  desc->ltd_obd->obd_lu_dev);
+               if (IS_ERR(cl)) {
+                       rc = PTR_ERR(cl);
+                       break;
+               }
+               lsd = cl2lovsub_dev(cl);
+               lsd->acid_idx = i;
+               lsd->acid_super = ld;
+               ld->ld_target[i] = lsd;
+       }
+
+       if (rc)
+               lov_device_fini(env, d);
+       else
+               ld->ld_flags |= LOV_DEV_INITIALIZED;
+
+       RETURN(rc);
+}
+
+static int lov_req_init(const struct lu_env *env, struct cl_device *dev,
+                       struct cl_req *req)
+{
+       struct lov_req *lr;
+       int result;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lr, lov_req_kmem, __GFP_IO);
+       if (lr != NULL) {
+               cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+static const struct cl_device_operations lov_cl_ops = {
+       .cdo_req_init = lov_req_init
+};
+
+static void lov_emerg_free(struct lov_device_emerg **emrg, int nr)
+{
+       int i;
+
+       for (i = 0; i < nr; ++i) {
+               struct lov_device_emerg *em;
+
+               em = emrg[i];
+               if (em != NULL) {
+                       LASSERT(em->emrg_page_list.pl_nr == 0);
+                       if (em->emrg_env != NULL)
+                               cl_env_put(em->emrg_env, &em->emrg_refcheck);
+                       OBD_FREE_PTR(em);
+               }
+       }
+       OBD_FREE(emrg, nr * sizeof emrg[0]);
+}
+
+static struct lu_device *lov_device_free(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct lov_device *ld = lu2lov_dev(d);
+       const int         nr = ld->ld_target_nr;
+
+       cl_device_fini(lu2cl_dev(d));
+       if (ld->ld_target != NULL)
+               OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]);
+       if (ld->ld_emrg != NULL)
+               lov_emerg_free(ld->ld_emrg, nr);
+       OBD_FREE_PTR(ld);
+       return NULL;
+}
+
+static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev,
+                             __u32 index)
+{
+       struct lov_device *ld = lu2lov_dev(dev);
+       ENTRY;
+
+       if (ld->ld_target[index] != NULL) {
+               cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index]));
+               ld->ld_target[index] = NULL;
+       }
+       EXIT;
+}
+
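+/* Pre-allocate one emergency environment and page list per target.  These
+ * are borrowed by lov_io_sub_init() instead of allocating a fresh
+ * environment when lio->lis_mem_frozen is set. */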
+static struct lov_device_emerg **lov_emerg_alloc(int nr)
+{
+       struct lov_device_emerg **emerg;
+       int i;
+       int result;
+
+       OBD_ALLOC(emerg, nr * sizeof emerg[0]);
+       if (emerg == NULL)
+               return ERR_PTR(-ENOMEM);
+       for (result = i = 0; i < nr && result == 0; i++) {
+               struct lov_device_emerg *em;
+
+               OBD_ALLOC_PTR(em);
+               if (em != NULL) {
+                       emerg[i] = em;
+                       cl_page_list_init(&em->emrg_page_list);
+                       em->emrg_env = cl_env_alloc(&em->emrg_refcheck,
+                                                   LCT_REMEMBER|LCT_NOREF);
+                       if (!IS_ERR(em->emrg_env))
+                               em->emrg_env->le_ctx.lc_cookie = 0x2;
+                       else {
+                               result = PTR_ERR(em->emrg_env);
+                               em->emrg_env = NULL;
+                       }
+               } else
+                       result = -ENOMEM;
+       }
+       if (result != 0) {
+               lov_emerg_free(emerg, nr);
+               emerg = ERR_PTR(result);
+       }
+       return emerg;
+}
+
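+/* Grow the ld_target and ld_emrg arrays under ld_mutex so that they can
+ * hold lov_tgt_size entries, preserving the existing ones. */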
+static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev)
+{
+       int   result;
+       __u32 tgt_size;
+       __u32 sub_size;
+
+       ENTRY;
+       result = 0;
+       tgt_size = dev->ld_lov->lov_tgt_size;
+       sub_size = dev->ld_target_nr;
+       if (sub_size < tgt_size) {
+               struct lovsub_device    **newd;
+               struct lov_device_emerg **emerg;
+               const size_t          sz   = sizeof newd[0];
+
+               emerg = lov_emerg_alloc(tgt_size);
+               if (IS_ERR(emerg))
+                       RETURN(PTR_ERR(emerg));
+
+               OBD_ALLOC(newd, tgt_size * sz);
+               if (newd != NULL) {
+                       mutex_lock(&dev->ld_mutex);
+                       if (sub_size > 0) {
+                               memcpy(newd, dev->ld_target, sub_size * sz);
+                               OBD_FREE(dev->ld_target, sub_size * sz);
+                       }
+                       dev->ld_target    = newd;
+                       dev->ld_target_nr = tgt_size;
+
+                       if (dev->ld_emrg != NULL)
+                               lov_emerg_free(dev->ld_emrg, sub_size);
+                       dev->ld_emrg = emerg;
+                       mutex_unlock(&dev->ld_mutex);
+               } else {
+                       lov_emerg_free(emerg, tgt_size);
+                       result = -ENOMEM;
+               }
+       }
+       RETURN(result);
+}
+
+static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev,
+                            __u32 index)
+{
+       struct obd_device    *obd = dev->ld_obd;
+       struct lov_device    *ld  = lu2lov_dev(dev);
+       struct lov_tgt_desc  *tgt;
+       struct lovsub_device *lsd;
+       struct cl_device     *cl;
+       int rc;
+       ENTRY;
+
+       obd_getref(obd);
+
+       tgt = obd->u.lov.lov_tgts[index];
+       LASSERT(tgt != NULL);
+       LASSERT(tgt->ltd_obd != NULL);
+
+       if (!tgt->ltd_obd->obd_set_up) {
+               CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid));
+               obd_putref(obd);
+               RETURN(-EINVAL);
+       }
+
+       rc = lov_expand_targets(env, ld);
+       if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) {
+               LASSERT(dev->ld_site != NULL);
+
+               cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type,
+                                  tgt->ltd_obd->obd_lu_dev);
+               if (!IS_ERR(cl)) {
+                       lsd = cl2lovsub_dev(cl);
+                       lsd->acid_idx = index;
+                       lsd->acid_super = ld;
+                       ld->ld_target[index] = lsd;
+               } else {
+                       rc = PTR_ERR(cl);
+                       CERROR("add failed (%d), deleting %s\n", rc,
+                              obd_uuid2str(&tgt->ltd_uuid));
+                       lov_cl_del_target(env, dev, index);
+               }
+       }
+       obd_putref(obd);
+       RETURN(rc);
+}
+
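+/* Translate LOV configuration commands into cl-layer target setup and
+ * teardown after the generic lov_process_config_base() handling. */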
+static int lov_process_config(const struct lu_env *env,
+                             struct lu_device *d, struct lustre_cfg *cfg)
+{
+       struct obd_device *obd = d->ld_obd;
+       int cmd;
+       int rc;
+       int gen;
+       __u32 index;
+
+       obd_getref(obd);
+
+       cmd = cfg->lcfg_command;
+       rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen);
+       if (rc == 0) {
+               switch (cmd) {
+               case LCFG_LOV_ADD_OBD:
+               case LCFG_LOV_ADD_INA:
+                       rc = lov_cl_add_target(env, d, index);
+                       if (rc != 0)
+                               lov_del_target(d->ld_obd, index, 0, 0);
+                       break;
+               case LCFG_LOV_DEL_OBD:
+                       lov_cl_del_target(env, d, index);
+                       break;
+               }
+       }
+       obd_putref(obd);
+       RETURN(rc);
+}
+
+static const struct lu_device_operations lov_lu_ops = {
+       .ldo_object_alloc      = lov_object_alloc,
+       .ldo_process_config    = lov_process_config,
+};
+
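+/* Allocate and minimally initialize a LOV device; the per-target
+ * sub-devices are attached later by lov_device_init(). */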
+static struct lu_device *lov_device_alloc(const struct lu_env *env,
+                                         struct lu_device_type *t,
+                                         struct lustre_cfg *cfg)
+{
+       struct lu_device *d;
+       struct lov_device *ld;
+       struct obd_device *obd;
+       int rc;
+
+       OBD_ALLOC_PTR(ld);
+       if (ld == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       cl_device_init(&ld->ld_cl, t);
+       d = lov2lu_dev(ld);
+       d->ld_ops       = &lov_lu_ops;
+       ld->ld_cl.cd_ops = &lov_cl_ops;
+
+       mutex_init(&ld->ld_mutex);
+       lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class);
+
+       /* setup the LOV OBD */
+       obd = class_name2obd(lustre_cfg_string(cfg, 0));
+       LASSERT(obd != NULL);
+       rc = lov_setup(obd, cfg);
+       if (rc) {
+               lov_device_free(env, d);
+               RETURN(ERR_PTR(rc));
+       }
+
+       ld->ld_lov = &obd->u.lov;
+       RETURN(d);
+}
+
+static const struct lu_device_type_operations lov_device_type_ops = {
+       .ldto_init = lov_type_init,
+       .ldto_fini = lov_type_fini,
+
+       .ldto_start = lov_type_start,
+       .ldto_stop  = lov_type_stop,
+
+       .ldto_device_alloc = lov_device_alloc,
+       .ldto_device_free  = lov_device_free,
+
+       .ldto_device_init    = lov_device_init,
+       .ldto_device_fini    = lov_device_fini
+};
+
+struct lu_device_type lov_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_LOV_NAME,
+       .ldt_ops      = &lov_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD
+};
+EXPORT_SYMBOL(lov_device_type);
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_ea.c b/drivers/staging/lustre/lustre/lov/lov_ea.c
new file mode 100644 (file)
index 0000000..481e863
--- /dev/null
@@ -0,0 +1,334 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_ea.c
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <asm/div64.h>
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_idl.h>
+
+#include "lov_internal.h"
+
+struct lovea_unpack_args {
+       struct lov_stripe_md *lsm;
+       int                cursor;
+};
+
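+/* Sanity-check the fields shared by the V1 and V3 on-disk LOV layouts:
+ * stripe count, object id, striping pattern and stripe size. */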
+static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
+                                __u16 stripe_count)
+{
+       if (stripe_count == 0 || stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+               CERROR("bad stripe count %d\n", stripe_count);
+               lov_dump_lmm(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       if (lmm_oi_id(&lmm->lmm_oi) == 0) {
+               CERROR("zero object id\n");
+               lov_dump_lmm(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
+               CERROR("bad striping pattern\n");
+               lov_dump_lmm(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       if (lmm->lmm_stripe_size == 0 ||
+            (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) {
+               CERROR("bad stripe size %u\n",
+                      le32_to_cpu(lmm->lmm_stripe_size));
+               lov_dump_lmm(D_WARNING, lmm);
+               return -EINVAL;
+       }
+       return 0;
+}
+
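+/* Allocate an in-memory stripe descriptor with stripe_count lov_oinfo
+ * slots; *size is set to the total allocation size. */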
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size)
+{
+       struct lov_stripe_md *lsm;
+       struct lov_oinfo     *loi;
+       int                i, oinfo_ptrs_size;
+
+       LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT);
+
+       oinfo_ptrs_size = sizeof(struct lov_oinfo *) * stripe_count;
+       *size = sizeof(struct lov_stripe_md) + oinfo_ptrs_size;
+
+       OBD_ALLOC_LARGE(lsm, *size);
+       if (!lsm)
+               return NULL;
+
+       for (i = 0; i < stripe_count; i++) {
+               OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, __GFP_IO);
+               if (loi == NULL)
+                       goto err;
+               lsm->lsm_oinfo[i] = loi;
+       }
+       lsm->lsm_stripe_count = stripe_count;
+       return lsm;
+
+err:
+       while (--i >= 0)
+               OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, sizeof(*loi));
+       OBD_FREE_LARGE(lsm, *size);
+       return NULL;
+}
+
+void lsm_free_plain(struct lov_stripe_md *lsm)
+{
+       __u16 stripe_count = lsm->lsm_stripe_count;
+       int i;
+
+       for (i = 0; i < stripe_count; i++)
+               OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab,
+                             sizeof(struct lov_oinfo));
+       OBD_FREE_LARGE(lsm, sizeof(struct lov_stripe_md) +
+                      stripe_count * sizeof(struct lov_oinfo *));
+}
+
+static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
+                               struct lov_mds_md *lmm)
+{
+       /*
+        * This assumes the first fields of lov_mds_md_v1 and
+        * lov_mds_md_v3 are the same.
+        */
+       lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi);
+       lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
+       lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+       lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen);
+       lsm->lsm_pool_name[0] = '\0';
+}
+
+static void
+lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
+                          obd_off *lov_off, obd_off *swidth)
+{
+       if (swidth)
+               *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static void
+lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
+                          obd_off *lov_off, obd_off *swidth)
+{
+       if (swidth)
+               *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
+                            struct obd_export *md_exp)
+{
+       return 0;
+}
+
+/* Find minimum stripe maxbytes value.  For inactive or
+ * reconnecting targets use LUSTRE_STRIPE_MAXBYTES. */
+static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes)
+{
+       struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import;
+
+       if (imp == NULL || !tgt->ltd_active) {
+               *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+               return;
+       }
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_FULL &&
+           (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) &&
+           imp->imp_connect_data.ocd_maxbytes > 0) {
+               if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes)
+                       *stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+       } else {
+               *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+       }
+       spin_unlock(&imp->imp_lock);
+}
+
+static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes,
+                            __u16 *stripe_count)
+{
+       if (lmm_bytes < sizeof(*lmm)) {
+               CERROR("lov_mds_md_v1 too small: %d, need at least %d\n",
+                      lmm_bytes, (int)sizeof(*lmm));
+               return -EINVAL;
+       }
+
+       *stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+
+       if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) {
+               CERROR("LOV EA V1 too small: %d, need %d\n",
+                      lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1));
+               lov_dump_lmm_v1(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
+}
+
+int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                   struct lov_mds_md_v1 *lmm)
+{
+       struct lov_oinfo *loi;
+       int i;
+       __u64 stripe_maxbytes = OBD_OBJECT_EOF;
+
+       lsm_unpackmd_common(lsm, lmm);
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               /* XXX LOV STACKING call down to osc_unpackmd() */
+               loi = lsm->lsm_oinfo[i];
+               ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+               loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+               loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+               if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+                       CERROR("OST index %d more than OST count %d\n",
+                              loi->loi_ost_idx, lov->desc.ld_tgt_count);
+                       lov_dump_lmm_v1(D_WARNING, lmm);
+                       return -EINVAL;
+               }
+               if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                       CERROR("OST index %d missing\n", loi->loi_ost_idx);
+                       lov_dump_lmm_v1(D_WARNING, lmm);
+                       return -EINVAL;
+               }
+               /* calculate the minimum stripe max bytes */
+               lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+                                &stripe_maxbytes);
+       }
+
+       lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+
+       return 0;
+}
+
+const struct lsm_operations lsm_v1_ops = {
+       .lsm_free             = lsm_free_plain,
+       .lsm_destroy          = lsm_destroy_plain,
+       .lsm_stripe_by_index  = lsm_stripe_by_index_plain,
+       .lsm_stripe_by_offset = lsm_stripe_by_offset_plain,
+       .lsm_lmm_verify       = lsm_lmm_verify_v1,
+       .lsm_unpackmd         = lsm_unpackmd_v1,
+};
+
+static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes,
+                            __u16 *stripe_count)
+{
+       struct lov_mds_md_v3 *lmm;
+
+       lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+       if (lmm_bytes < sizeof(*lmm)) {
+               CERROR("lov_mds_md_v3 too small: %d, need at least %d\n",
+                      lmm_bytes, (int)sizeof(*lmm));
+               return -EINVAL;
+       }
+
+       *stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+
+       if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) {
+               CERROR("LOV EA V3 too small: %d, need %d\n",
+                      lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3));
+               lov_dump_lmm_v3(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes,
+                                    *stripe_count);
+}
+
+int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                   struct lov_mds_md *lmmv1)
+{
+       struct lov_mds_md_v3 *lmm;
+       struct lov_oinfo *loi;
+       int i;
+       __u64 stripe_maxbytes = OBD_OBJECT_EOF;
+       int cplen = 0;
+
+       lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+       lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm);
+       cplen = strlcpy(lsm->lsm_pool_name, lmm->lmm_pool_name,
+                       sizeof(lsm->lsm_pool_name));
+       if (cplen >= sizeof(lsm->lsm_pool_name))
+               return -E2BIG;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               /* XXX LOV STACKING call down to osc_unpackmd() */
+               loi = lsm->lsm_oinfo[i];
+               ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+               loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+               loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+               if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+                       CERROR("OST index %d more than OST count %d\n",
+                              loi->loi_ost_idx, lov->desc.ld_tgt_count);
+                       lov_dump_lmm_v3(D_WARNING, lmm);
+                       return -EINVAL;
+               }
+               if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                       CERROR("OST index %d missing\n", loi->loi_ost_idx);
+                       lov_dump_lmm_v3(D_WARNING, lmm);
+                       return -EINVAL;
+               }
+               /* calculate the minimum stripe max bytes */
+               lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+                                &stripe_maxbytes);
+       }
+
+       lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+
+       return 0;
+}
+
+const struct lsm_operations lsm_v3_ops = {
+       .lsm_free             = lsm_free_plain,
+       .lsm_destroy          = lsm_destroy_plain,
+       .lsm_stripe_by_index  = lsm_stripe_by_index_plain,
+       .lsm_stripe_by_offset = lsm_stripe_by_offset_plain,
+       .lsm_lmm_verify       = lsm_lmm_verify_v3,
+       .lsm_unpackmd         = lsm_unpackmd_v3,
+};
diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h
new file mode 100644 (file)
index 0000000..146d5e3
--- /dev/null
@@ -0,0 +1,322 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LOV_INTERNAL_H
+#define LOV_INTERNAL_H
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_user.h>
+
+struct lov_lock_handles {
+       struct portals_handle   llh_handle;
+       atomic_t            llh_refcount;
+       int                  llh_stripe_count;
+       struct lustre_handle    llh_handles[0];
+};
+
+struct lov_request {
+       struct obd_info   rq_oi;
+       struct lov_request_set  *rq_rqset;
+
+       struct list_head               rq_link;
+
+       int                   rq_idx;   /* index in lov->tgts array */
+       int                   rq_stripe;     /* stripe number */
+       int                   rq_complete;
+       int                   rq_rc;
+       int                   rq_buflen;     /* length of sub_md */
+
+       obd_count               rq_oabufs;
+       obd_count               rq_pgaidx;
+};
+
+struct lov_request_set {
+       struct ldlm_enqueue_info        *set_ei;
+       struct obd_info                 *set_oi;
+       atomic_t                        set_refcount;
+       struct obd_export               *set_exp;
+       /* XXX: There is @set_exp already, however obd_statfs gets obd_device
+          only. */
+       struct obd_device               *set_obd;
+       int                             set_count;
+       atomic_t                        set_completes;
+       atomic_t                        set_success;
+       atomic_t                        set_finish_checked;
+       struct llog_cookie              *set_cookies;
+       int                             set_cookie_sent;
+       struct obd_trans_info           *set_oti;
+       obd_count                       set_oabufs;
+       struct brw_page                 *set_pga;
+       struct lov_lock_handles         *set_lockh;
+       struct list_head                        set_list;
+       wait_queue_head_t                       set_waitq;
+       spinlock_t                      set_lock;
+};
+
+extern struct kmem_cache *lov_oinfo_slab;
+
+void lov_finish_set(struct lov_request_set *set);
+
+static inline void lov_get_reqset(struct lov_request_set *set)
+{
+       LASSERT(set != NULL);
+       LASSERT(atomic_read(&set->set_refcount) > 0);
+       atomic_inc(&set->set_refcount);
+}
+
+static inline void lov_put_reqset(struct lov_request_set *set)
+{
+       if (atomic_dec_and_test(&set->set_refcount))
+               lov_finish_set(set);
+}
+
+static inline struct lov_lock_handles *
+lov_handle2llh(struct lustre_handle *handle)
+{
+       LASSERT(handle != NULL);
+       return class_handle2object(handle->cookie);
+}
+
+static inline void lov_llh_put(struct lov_lock_handles *llh)
+{
+       CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh,
+              atomic_read(&llh->llh_refcount) - 1);
+       LASSERT(atomic_read(&llh->llh_refcount) > 0 &&
+               atomic_read(&llh->llh_refcount) < 0x5a5a);
+       if (atomic_dec_and_test(&llh->llh_refcount)) {
+               class_handle_unhash(&llh->llh_handle);
+               /* The structure may still be held by other threads because
+                * of RCU.  -jxiong */
+               if (atomic_read(&llh->llh_refcount))
+                       return;
+
+               OBD_FREE_RCU(llh, sizeof *llh +
+                            sizeof(*llh->llh_handles) * llh->llh_stripe_count,
+                            &llh->llh_handle);
+       }
+}
+
+#define lov_uuid2str(lv, index) \
+       (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid)
+
+/* lov_merge.c */
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid,
+                    struct lov_stripe_md *lsm, int stripeno, int *set);
+int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
+                 struct ost_lvb *lvb, int kms_only);
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+                  obd_off size, int shrink);
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+                     struct ost_lvb *lvb, __u64 *kms_place);
+
+/* lov_offset.c */
+obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+                        int stripeno);
+int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+                     int stripeno, obd_off *obd_off);
+obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
+                          int stripeno);
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+                         obd_off start, obd_off end,
+                         obd_off *obd_start, obd_off *obd_end);
+int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off);
+
+/* lov_qos.c */
+#define LOV_USES_ASSIGNED_STRIPE       0
+#define LOV_USES_DEFAULT_STRIPE         1
+int qos_add_tgt(struct obd_device *obd, __u32 index);
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt);
+void qos_shrink_lsm(struct lov_request_set *set);
+int qos_prep_create(struct obd_export *exp, struct lov_request_set *set);
+void qos_update(struct lov_obd *lov);
+void qos_statfs_done(struct lov_obd *lov);
+void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait);
+int qos_remedy_create(struct lov_request_set *set, struct lov_request *req);
+
+/* lov_request.c */
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set);
+int lov_set_finished(struct lov_request_set *set, int idempotent);
+void lov_update_set(struct lov_request_set *set,
+                   struct lov_request *req, int rc);
+int lov_update_common_set(struct lov_request_set *set,
+                         struct lov_request *req, int rc);
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx);
+int lov_prep_create_set(struct obd_export *exp, struct obd_info *oifo,
+                       struct lov_stripe_md **ea, struct obdo *src_oa,
+                       struct obd_trans_info *oti,
+                       struct lov_request_set **reqset);
+int cb_create_update(void *cookie, int rc);
+int lov_fini_create_set(struct lov_request_set *set, struct lov_stripe_md **ea);
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+                    obd_count oa_bufs, struct brw_page *pga,
+                    struct obd_trans_info *oti,
+                    struct lov_request_set **reqset);
+int lov_fini_brw_set(struct lov_request_set *set);
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct lov_request_set **reqset);
+int lov_fini_getattr_set(struct lov_request_set *set);
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct obdo *src_oa, struct lov_stripe_md *lsm,
+                        struct obd_trans_info *oti,
+                        struct lov_request_set **reqset);
+int lov_update_destroy_set(struct lov_request_set *set,
+                          struct lov_request *req, int rc);
+int lov_fini_destroy_set(struct lov_request_set *set);
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct obd_trans_info *oti,
+                        struct lov_request_set **reqset);
+int lov_update_setattr_set(struct lov_request_set *set,
+                          struct lov_request *req, int rc);
+int lov_fini_setattr_set(struct lov_request_set *set);
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+                      struct obd_trans_info *oti,
+                      struct lov_request_set **reqset);
+int lov_fini_punch_set(struct lov_request_set *set);
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *obd_info,
+                     obd_off start, obd_off end,
+                     struct lov_request_set **reqset);
+int lov_fini_sync_set(struct lov_request_set *set);
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct ldlm_enqueue_info *einfo,
+                        struct lov_request_set **reqset);
+int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc,
+                        struct ptlrpc_request_set *rqset);
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+                      struct lov_stripe_md *lsm,
+                      ldlm_policy_data_t *policy, __u32 mode,
+                      struct lustre_handle *lockh,
+                      struct lov_request_set **reqset);
+int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags);
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+                       struct lov_stripe_md *lsm,
+                       __u32 mode, struct lustre_handle *lockh,
+                       struct lov_request_set **reqset);
+int lov_fini_cancel_set(struct lov_request_set *set);
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+                       struct lov_request_set **reqset);
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+                      int success);
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+                   int success);
+int lov_fini_statfs_set(struct lov_request_set *set);
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc);
+
+/* lov_obd.c */
+void lov_fix_desc(struct lov_desc *desc);
+void lov_fix_desc_stripe_size(__u64 *val);
+void lov_fix_desc_stripe_count(__u32 *val);
+void lov_fix_desc_pattern(__u32 *val);
+void lov_fix_desc_qos_maxage(__u32 *val);
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count);
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+                   struct obd_connect_data *data);
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+                           __u32 *indexp, int *genp);
+int lov_del_target(struct obd_device *obd, __u32 index,
+                  struct obd_uuid *uuidp, int gen);
+/* lov_log.c */
+int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                 struct obd_device *tgt, int *idx);
+int lov_llog_finish(struct obd_device *obd, int count);
+
+/* lov_pack.c */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm,
+              struct lov_stripe_md *lsm);
+int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+                struct lov_mds_md *lmm, int lmm_bytes);
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+                 struct lov_stripe_md **lsmp, struct lov_user_md *lump);
+int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
+             struct lov_user_md *lump);
+int lov_getstripe(struct obd_export *exp,
+                 struct lov_stripe_md *lsm, struct lov_user_md *lump);
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+                   int pattern, int magic);
+int lov_free_memmd(struct lov_stripe_md **lsmp);
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm);
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm);
+void lov_dump_lmm(int level, void *lmm);
+
+/* lov_ea.c */
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size);
+void lsm_free_plain(struct lov_stripe_md *lsm);
+
+int lovea_destroy_object(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                        struct obdo *oa, void *data);
+/* lproc_lov.c */
+extern struct file_operations lov_proc_target_fops;
+#ifdef LPROCFS
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+/* lov_cl.c */
+extern struct lu_device_type lov_device_type;
+
+/* pools */
+extern cfs_hash_ops_t pool_hash_operations;
+/* ost_pool methods */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count);
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count);
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count);
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx);
+int lov_ost_pool_free(struct ost_pool *op);
+
+/* high level pool methods */
+int lov_pool_new(struct obd_device *obd, char *poolname);
+int lov_pool_del(struct obd_device *obd, char *poolname);
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname);
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
+void lov_dump_pool(int level, struct pool_desc *pool);
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
+void lov_pool_putref(struct pool_desc *pool);
+
+static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm)
+{
+       LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+       atomic_inc(&lsm->lsm_refc);
+       return lsm;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lov/lov_io.c b/drivers/staging/lustre/lustre/lov/lov_io.c
new file mode 100644 (file)
index 0000000..1a87abd
--- /dev/null
@@ -0,0 +1,967 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+static inline void lov_sub_enter(struct lov_io_sub *sub)
+{
+       sub->sub_reenter++;
+}
+static inline void lov_sub_exit(struct lov_io_sub *sub)
+{
+       sub->sub_reenter--;
+}
+
+static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio,
+                           struct lov_io_sub *sub)
+{
+       ENTRY;
+       if (sub->sub_io != NULL) {
+               if (sub->sub_io_initialized) {
+                       lov_sub_enter(sub);
+                       cl_io_fini(sub->sub_env, sub->sub_io);
+                       lov_sub_exit(sub);
+                       sub->sub_io_initialized = 0;
+                       lio->lis_active_subios--;
+               }
+               if (sub->sub_stripe == lio->lis_single_subio_index)
+                       lio->lis_single_subio_index = -1;
+               else if (!sub->sub_borrowed)
+                       OBD_FREE_PTR(sub->sub_io);
+               sub->sub_io = NULL;
+       }
+       if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) {
+               if (!sub->sub_borrowed)
+                       cl_env_put(sub->sub_env, &sub->sub_refcheck);
+               sub->sub_env = NULL;
+       }
+       EXIT;
+}
+
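+/* Copy the relevant parameters of the parent (file-level) io into a
+ * per-stripe sub-io, converting file offsets into per-stripe offsets
+ * where necessary. */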
+static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio,
+                              int stripe, loff_t start, loff_t end)
+{
+       struct lov_stripe_md *lsm    = lio->lis_object->lo_lsm;
+       struct cl_io     *parent = lio->lis_cl.cis_io;
+
+       switch (io->ci_type) {
+       case CIT_SETATTR: {
+               io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr;
+               io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid;
+               io->u.ci_setattr.sa_capa = parent->u.ci_setattr.sa_capa;
+               if (cl_io_is_trunc(io)) {
+                       loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size;
+
+                       new_size = lov_size_to_stripe(lsm, new_size, stripe);
+                       io->u.ci_setattr.sa_attr.lvb_size = new_size;
+               }
+               break;
+       }
+       case CIT_FAULT: {
+               struct cl_object *obj = parent->ci_obj;
+               loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index);
+
+               io->u.ci_fault = parent->u.ci_fault;
+               off = lov_size_to_stripe(lsm, off, stripe);
+               io->u.ci_fault.ft_index = cl_index(obj, off);
+               break;
+       }
+       case CIT_FSYNC: {
+               io->u.ci_fsync.fi_start = start;
+               io->u.ci_fsync.fi_end = end;
+               io->u.ci_fsync.fi_capa = parent->u.ci_fsync.fi_capa;
+               io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid;
+               io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode;
+               break;
+       }
+       case CIT_READ:
+       case CIT_WRITE: {
+               io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent);
+               if (cl_io_is_append(parent)) {
+                       io->u.ci_wr.wr_append = 1;
+               } else {
+                       io->u.ci_rw.crw_pos = start;
+                       io->u.ci_rw.crw_count = end - start;
+               }
+               break;
+       }
+       default:
+               break;
+       }
+}
+
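+/* Set up the cl_io for a single stripe, either borrowing the emergency
+ * environment when lis_mem_frozen is set or obtaining a fresh one. */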
+static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
+                          struct lov_io_sub *sub)
+{
+       struct lov_object *lov = lio->lis_object;
+       struct lov_device *ld  = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev);
+       struct cl_io      *sub_io;
+       struct cl_object  *sub_obj;
+       struct cl_io      *io  = lio->lis_cl.cis_io;
+
+       int stripe = sub->sub_stripe;
+       int result;
+
+       LASSERT(sub->sub_io == NULL);
+       LASSERT(sub->sub_env == NULL);
+       LASSERT(sub->sub_stripe < lio->lis_stripe_count);
+       ENTRY;
+
+       result = 0;
+       sub->sub_io_initialized = 0;
+       sub->sub_borrowed = 0;
+
+       if (lio->lis_mem_frozen) {
+               LASSERT(mutex_is_locked(&ld->ld_mutex));
+               sub->sub_io  = &ld->ld_emrg[stripe]->emrg_subio;
+               sub->sub_env = ld->ld_emrg[stripe]->emrg_env;
+               sub->sub_borrowed = 1;
+       } else {
+               void *cookie;
+
+               /* obtain new environment */
+               cookie = cl_env_reenter();
+               sub->sub_env = cl_env_get(&sub->sub_refcheck);
+               cl_env_reexit(cookie);
+               if (IS_ERR(sub->sub_env))
+                       result = PTR_ERR(sub->sub_env);
+
+               if (result == 0) {
+                       /*
+                        * First sub-io. Use ->lis_single_subio to
+                        * avoid dynamic allocation.
+                        */
+                       if (lio->lis_active_subios == 0) {
+                               sub->sub_io = &lio->lis_single_subio;
+                               lio->lis_single_subio_index = stripe;
+                       } else {
+                               OBD_ALLOC_PTR(sub->sub_io);
+                               if (sub->sub_io == NULL)
+                                       result = -ENOMEM;
+                       }
+               }
+       }
+
+       if (result == 0) {
+               sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]);
+               sub_io  = sub->sub_io;
+
+               sub_io->ci_obj    = sub_obj;
+               sub_io->ci_result = 0;
+
+               sub_io->ci_parent  = io;
+               sub_io->ci_lockreq = io->ci_lockreq;
+               sub_io->ci_type    = io->ci_type;
+               sub_io->ci_no_srvlock = io->ci_no_srvlock;
+
+               lov_sub_enter(sub);
+               result = cl_io_sub_init(sub->sub_env, sub_io,
+                                       io->ci_type, sub_obj);
+               lov_sub_exit(sub);
+               if (result >= 0) {
+                       lio->lis_active_subios++;
+                       sub->sub_io_initialized = 1;
+                       result = 0;
+               }
+       }
+       if (result != 0)
+               lov_io_sub_fini(env, lio, sub);
+       RETURN(result);
+}
+
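+/* Return the (lazily initialized) sub-io for the given stripe with its
+ * reenter count raised; the caller releases it with lov_sub_put(). */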
+struct lov_io_sub *lov_sub_get(const struct lu_env *env,
+                              struct lov_io *lio, int stripe)
+{
+       int rc;
+       struct lov_io_sub *sub = &lio->lis_subs[stripe];
+
+       LASSERT(stripe < lio->lis_stripe_count);
+       ENTRY;
+
+       if (!sub->sub_io_initialized) {
+               sub->sub_stripe = stripe;
+               rc = lov_io_sub_init(env, lio, sub);
+       } else
+               rc = 0;
+       if (rc == 0)
+               lov_sub_enter(sub);
+       else
+               sub = ERR_PTR(rc);
+       RETURN(sub);
+}
+
+void lov_sub_put(struct lov_io_sub *sub)
+{
+       lov_sub_exit(sub);
+}
+
+/*****************************************************************************
+ *
+ * Lov io operations.
+ *
+ */
+
+static int lov_page_stripe(const struct cl_page *page)
+{
+       struct lovsub_object *subobj;
+
+       ENTRY;
+       subobj = lu2lovsub(
+               lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header,
+                                &lovsub_device_type));
+       LASSERT(subobj != NULL);
+       RETURN(subobj->lso_index);
+}
+
+struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio,
+                                 const struct cl_page_slice *slice)
+{
+       struct lov_stripe_md *lsm  = lio->lis_object->lo_lsm;
+       struct cl_page       *page = slice->cpl_page;
+       int stripe;
+
+       LASSERT(lio->lis_cl.cis_io != NULL);
+       LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object);
+       LASSERT(lsm != NULL);
+       LASSERT(lio->lis_nr_subios > 0);
+       ENTRY;
+
+       stripe = lov_page_stripe(page);
+       RETURN(lov_sub_get(env, lio, stripe));
+}
+
+
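+/**
+ * Allocates the array of per-stripe sub-io slots for this io; the sub-ios
+ * themselves are set up lazily by lov_sub_get().
+ */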
+static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
+                            struct cl_io *io)
+{
+       struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+       int result;
+
+       LASSERT(lio->lis_object != NULL);
+       ENTRY;
+
+       /*
+        * Needs to be optimized: we can't afford to allocate a piece of
+        * memory when writing a page. -jay
+        */
+       OBD_ALLOC_LARGE(lio->lis_subs,
+                       lsm->lsm_stripe_count * sizeof lio->lis_subs[0]);
+       if (lio->lis_subs != NULL) {
+               lio->lis_nr_subios = lio->lis_stripe_count;
+               lio->lis_single_subio_index = -1;
+               lio->lis_active_subios = 0;
+               result = 0;
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
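+/**
+ * Derives the file extent [lis_pos, lis_endpos) covered by the io from the
+ * type-specific parameters of \a io.
+ */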
+static void lov_io_slice_init(struct lov_io *lio,
+                             struct lov_object *obj, struct cl_io *io)
+{
+       ENTRY;
+
+       io->ci_result = 0;
+       lio->lis_object = obj;
+
+       LASSERT(obj->lo_lsm != NULL);
+       lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count;
+
+       switch (io->ci_type) {
+       case CIT_READ:
+       case CIT_WRITE:
+               lio->lis_pos = io->u.ci_rw.crw_pos;
+               lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+               lio->lis_io_endpos = lio->lis_endpos;
+               if (cl_io_is_append(io)) {
+                       LASSERT(io->ci_type == CIT_WRITE);
+                       lio->lis_pos = 0;
+                       lio->lis_endpos = OBD_OBJECT_EOF;
+               }
+               break;
+
+       case CIT_SETATTR:
+               if (cl_io_is_trunc(io))
+                       lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size;
+               else
+                       lio->lis_pos = 0;
+               lio->lis_endpos = OBD_OBJECT_EOF;
+               break;
+
+       case CIT_FAULT: {
+               pgoff_t index = io->u.ci_fault.ft_index;
+               lio->lis_pos = cl_offset(io->ci_obj, index);
+               lio->lis_endpos = cl_offset(io->ci_obj, index + 1);
+               break;
+       }
+
+       case CIT_FSYNC: {
+               lio->lis_pos = io->u.ci_fsync.fi_start;
+               lio->lis_endpos = io->u.ci_fsync.fi_end;
+               break;
+       }
+
+       case CIT_MISC:
+               lio->lis_pos = 0;
+               lio->lis_endpos = OBD_OBJECT_EOF;
+               break;
+
+       default:
+               LBUG();
+       }
+
+       EXIT;
+}
+
+static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       struct lov_io *lio = cl2lov_io(env, ios);
+       struct lov_object *lov = cl2lov(ios->cis_obj);
+       int i;
+
+       ENTRY;
+       if (lio->lis_subs != NULL) {
+               for (i = 0; i < lio->lis_nr_subios; i++)
+                       lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+               OBD_FREE_LARGE(lio->lis_subs,
+                        lio->lis_nr_subios * sizeof lio->lis_subs[0]);
+               lio->lis_nr_subios = 0;
+       }
+
+       LASSERT(atomic_read(&lov->lo_active_ios) > 0);
+       if (atomic_dec_and_test(&lov->lo_active_ios))
+               wake_up_all(&lov->lo_waitq);
+       EXIT;
+}
+
+static obd_off lov_offset_mod(obd_off val, int delta)
+{
+       if (val != OBD_OBJECT_EOF)
+               val += delta;
+       return val;
+}
+
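+/**
+ * For every stripe intersecting [lis_pos, lis_endpos), inherits the parent
+ * io parameters into the stripe's sub-io, initializes its iteration and
+ * links it into the list of active sub-ios.
+ */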
+static int lov_io_iter_init(const struct lu_env *env,
+                           const struct cl_io_slice *ios)
+{
+       struct lov_io   *lio = cl2lov_io(env, ios);
+       struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+       struct lov_io_sub    *sub;
+       obd_off endpos;
+       obd_off start;
+       obd_off end;
+       int stripe;
+       int rc = 0;
+
+       ENTRY;
+       endpos = lov_offset_mod(lio->lis_endpos, -1);
+       for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) {
+               if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos,
+                                          endpos, &start, &end))
+                       continue;
+
+               end = lov_offset_mod(end, +1);
+               sub = lov_sub_get(env, lio, stripe);
+               if (!IS_ERR(sub)) {
+                       lov_io_sub_inherit(sub->sub_io, lio, stripe,
+                                          start, end);
+                       rc = cl_io_iter_init(sub->sub_env, sub->sub_io);
+                       lov_sub_put(sub);
+                       CDEBUG(D_VFSTRACE, "shrink: %d ["LPU64", "LPU64")\n",
+                              stripe, start, end);
+               } else
+                       rc = PTR_ERR(sub);
+
+               if (!rc)
+                       list_add_tail(&sub->sub_linkage, &lio->lis_active);
+               else
+                       break;
+       }
+       RETURN(rc);
+}
+
+static int lov_io_rw_iter_init(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+       struct lov_io   *lio = cl2lov_io(env, ios);
+       struct cl_io     *io  = ios->cis_io;
+       struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+       loff_t start = io->u.ci_rw.crw_pos;
+       loff_t next;
+       unsigned long ssize = lsm->lsm_stripe_size;
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+       ENTRY;
+
+       /* fast path for common case. */
+       if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) {
+
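+               /*
+                * Clip this iteration to the stripe-sized chunk that contains
+                * crw_pos: "next" is the file offset where that chunk ends
+                * (~0ull on 64-bit overflow), and ci_continue tells the
+                * caller whether further iterations are needed to reach
+                * lis_io_endpos.
+                */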
+               lov_do_div64(start, ssize);
+               next = (start + 1) * ssize;
+               if (next <= start * ssize)
+                       next = ~0ull;
+
+               io->ci_continue = next < lio->lis_io_endpos;
+               io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos,
+                                             next) - io->u.ci_rw.crw_pos;
+               lio->lis_pos    = io->u.ci_rw.crw_pos;
+               lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+               CDEBUG(D_VFSTRACE, "stripe: "LPU64" chunk: ["LPU64", "LPU64") "
+                      LPU64"\n", (__u64)start, lio->lis_pos, lio->lis_endpos,
+                      (__u64)lio->lis_io_endpos);
+       }
+       /*
+        * XXX The following call should be optimized: we know that
+        * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe.
+        */
+       RETURN(lov_io_iter_init(env, ios));
+}
+
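+/**
+ * Calls \a iofunc on every active sub-io, stopping at the first failure.
+ * The first non-zero sub-io result is propagated to the parent io's
+ * ci_result.
+ */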
+static int lov_io_call(const struct lu_env *env, struct lov_io *lio,
+                      int (*iofunc)(const struct lu_env *, struct cl_io *))
+{
+       struct cl_io *parent = lio->lis_cl.cis_io;
+       struct lov_io_sub *sub;
+       int rc = 0;
+
+       ENTRY;
+       list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+               lov_sub_enter(sub);
+               rc = iofunc(sub->sub_env, sub->sub_io);
+               lov_sub_exit(sub);
+               if (rc)
+                       break;
+
+               if (parent->ci_result == 0)
+                       parent->ci_result = sub->sub_io->ci_result;
+       }
+       RETURN(rc);
+}
+
+static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       ENTRY;
+       RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock));
+}
+
+static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       ENTRY;
+       RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start));
+}
+
+static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+       ENTRY;
+       /*
+        * It's possible that lov_io_start() wasn't called against this
+        * sub-io, either because a previous sub-io failed or because the
+        * upper layer completed the IO.
+        */
+       if (io->ci_state == CIS_IO_GOING)
+               cl_io_end(env, io);
+       else
+               io->ci_state = CIS_IO_FINISHED;
+       RETURN(0);
+}
+
+static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+       cl_io_iter_fini(env, io);
+       RETURN(0);
+}
+
+static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+       cl_io_unlock(env, io);
+       RETURN(0);
+}
+
+static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       int rc;
+
+       rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper);
+       LASSERT(rc == 0);
+}
+
+static void lov_io_iter_fini(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct lov_io *lio = cl2lov_io(env, ios);
+       int rc;
+
+       ENTRY;
+       rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper);
+       LASSERT(rc == 0);
+       while (!list_empty(&lio->lis_active))
+               list_del_init(lio->lis_active.next);
+       EXIT;
+}
+
+static void lov_io_unlock(const struct lu_env *env,
+                         const struct cl_io_slice *ios)
+{
+       int rc;
+
+       ENTRY;
+       rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper);
+       LASSERT(rc == 0);
+       EXIT;
+}
+
+
+static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld,
+                                             struct cl_page_list *qin,
+                                             int idx, int alloc)
+{
+       return alloc ? &qin[idx] : &ld->ld_emrg[idx]->emrg_page_list;
+}
+
+/**
+ * lov implementation of the cl_io_operations::cio_submit() method. It takes
+ * a list of pages in \a queue, splits it into per-stripe sub-lists, invokes
+ * cl_io_submit() on underlying devices to submit the sub-lists, and then
+ * splices everything back.
+ *
+ * The major complication of this function is the need to handle memory
+ * cleansing: cl_io_submit() is called to write out pages as a part of VM
+ * memory reclamation, and hence it may not fail due to memory shortages
+ * (the system dead-locks otherwise). To deal with this, some resources
+ * (sub-lists, sub-environment, etc.) are allocated per-device on "startup"
+ * (i.e., in a non-memory-cleansing context), and in case of memory shortage,
+ * these pre-allocated resources are used by lov_io_submit() under the
+ * lov_device::ld_mutex mutex.
+ */
+static int lov_io_submit(const struct lu_env *env,
+                        const struct cl_io_slice *ios,
+                        enum cl_req_type crt, struct cl_2queue *queue)
+{
+       struct lov_io     *lio = cl2lov_io(env, ios);
+       struct lov_object      *obj = lio->lis_object;
+       struct lov_device       *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev);
+       struct cl_page_list    *qin = &queue->c2_qin;
+       struct cl_2queue      *cl2q = &lov_env_info(env)->lti_cl2q;
+       struct cl_page_list *stripes_qin = NULL;
+       struct cl_page *page;
+       struct cl_page *tmp;
+       int stripe;
+
+#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc)
+
+       int rc = 0;
+       int alloc =
+               !(current->flags & PF_MEMALLOC);
+       ENTRY;
+       if (lio->lis_active_subios == 1) {
+               int idx = lio->lis_single_subio_index;
+               struct lov_io_sub *sub;
+
+               LASSERT(idx < lio->lis_nr_subios);
+               sub = lov_sub_get(env, lio, idx);
+               LASSERT(!IS_ERR(sub));
+               LASSERT(sub->sub_io == &lio->lis_single_subio);
+               rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+                                    crt, queue);
+               lov_sub_put(sub);
+               RETURN(rc);
+       }
+
+       LASSERT(lio->lis_subs != NULL);
+       if (alloc) {
+               OBD_ALLOC_LARGE(stripes_qin,
+                               sizeof(*stripes_qin) * lio->lis_nr_subios);
+               if (stripes_qin == NULL)
+                       RETURN(-ENOMEM);
+
+               for (stripe = 0; stripe < lio->lis_nr_subios; stripe++)
+                       cl_page_list_init(&stripes_qin[stripe]);
+       } else {
+               /*
+                * If we get here, it means pageout & swap don't help.
+                * In order not to make things worse, don't even try to
+                * allocate the memory with __GFP_NOWARN. -jay
+                */
+               mutex_lock(&ld->ld_mutex);
+               lio->lis_mem_frozen = 1;
+       }
+
+       cl_2queue_init(cl2q);
+       cl_page_list_for_each_safe(page, tmp, qin) {
+               stripe = lov_page_stripe(page);
+               cl_page_list_move(QIN(stripe), qin, page);
+       }
+
+       for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+               struct lov_io_sub   *sub;
+               struct cl_page_list *sub_qin = QIN(stripe);
+
+               if (list_empty(&sub_qin->pl_pages))
+                       continue;
+
+               cl_page_list_splice(sub_qin, &cl2q->c2_qin);
+               sub = lov_sub_get(env, lio, stripe);
+               if (!IS_ERR(sub)) {
+                       rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+                                            crt, cl2q);
+                       lov_sub_put(sub);
+               } else
+                       rc = PTR_ERR(sub);
+               cl_page_list_splice(&cl2q->c2_qin,  &queue->c2_qin);
+               cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout);
+               if (rc != 0)
+                       break;
+       }
+
+       for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+               struct cl_page_list *sub_qin = QIN(stripe);
+
+               if (list_empty(&sub_qin->pl_pages))
+                       continue;
+
+               cl_page_list_splice(sub_qin, qin);
+       }
+
+       if (alloc) {
+               OBD_FREE_LARGE(stripes_qin,
+                        sizeof(*stripes_qin) * lio->lis_nr_subios);
+       } else {
+               int i;
+
+               for (i = 0; i < lio->lis_nr_subios; i++) {
+                       struct cl_io *cio = lio->lis_subs[i].sub_io;
+
+                       if (cio && cio == &ld->ld_emrg[i]->emrg_subio)
+                               lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+               }
+               lio->lis_mem_frozen = 0;
+               mutex_unlock(&ld->ld_mutex);
+       }
+
+       RETURN(rc);
+#undef QIN
+}
+
+static int lov_io_prepare_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+       struct lov_io     *lio      = cl2lov_io(env, ios);
+       struct cl_page    *sub_page = lov_sub_page(slice);
+       struct lov_io_sub *sub;
+       int result;
+
+       ENTRY;
+       sub = lov_page_subio(env, lio, slice);
+       if (!IS_ERR(sub)) {
+               result = cl_io_prepare_write(sub->sub_env, sub->sub_io,
+                                            sub_page, from, to);
+               lov_sub_put(sub);
+       } else
+               result = PTR_ERR(sub);
+       RETURN(result);
+}
+
+static int lov_io_commit_write(const struct lu_env *env,
+                              const struct cl_io_slice *ios,
+                              const struct cl_page_slice *slice,
+                              unsigned from, unsigned to)
+{
+       struct lov_io     *lio      = cl2lov_io(env, ios);
+       struct cl_page    *sub_page = lov_sub_page(slice);
+       struct lov_io_sub *sub;
+       int result;
+
+       ENTRY;
+       sub = lov_page_subio(env, lio, slice);
+       if (!IS_ERR(sub)) {
+               result = cl_io_commit_write(sub->sub_env, sub->sub_io,
+                                           sub_page, from, to);
+               lov_sub_put(sub);
+       } else
+               result = PTR_ERR(sub);
+       RETURN(result);
+}
+
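+/**
+ * Start handler for CIT_FAULT: copies the fault transfer size to the sub-io
+ * of the stripe that holds the faulting page, then proceeds with the
+ * generic lov_io_start().
+ */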
+static int lov_io_fault_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct cl_fault_io *fio;
+       struct lov_io      *lio;
+       struct lov_io_sub  *sub;
+
+       ENTRY;
+       fio = &ios->cis_io->u.ci_fault;
+       lio = cl2lov_io(env, ios);
+       sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page));
+       sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob;
+       lov_sub_put(sub);
+       RETURN(lov_io_start(env, ios));
+}
+
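+/**
+ * End handler for CIT_FSYNC: ends every active sub-io and accumulates the
+ * number of pages written by the successful ones into fi_nr_written.
+ */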
+static void lov_io_fsync_end(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct lov_io *lio = cl2lov_io(env, ios);
+       struct lov_io_sub *sub;
+       unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written;
+       ENTRY;
+
+       *written = 0;
+       list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+               struct cl_io *subio = sub->sub_io;
+
+               lov_sub_enter(sub);
+               lov_io_end_wrapper(sub->sub_env, subio);
+               lov_sub_exit(sub);
+
+               if (subio->ci_result == 0)
+                       *written += subio->u.ci_fsync.fi_nr_written;
+       }
+       RETURN_EXIT;
+}
+
+static const struct cl_io_operations lov_io_ops = {
+       .op = {
+               [CIT_READ] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_rw_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_start,
+                       .cio_end       = lov_io_end
+               },
+               [CIT_WRITE] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_rw_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_start,
+                       .cio_end       = lov_io_end
+               },
+               [CIT_SETATTR] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_start,
+                       .cio_end       = lov_io_end
+               },
+               [CIT_FAULT] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_fault_start,
+                       .cio_end       = lov_io_end
+               },
+               [CIT_FSYNC] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_start,
+                       .cio_end       = lov_io_fsync_end
+               },
+               [CIT_MISC] = {
+                       .cio_fini   = lov_io_fini
+               }
+       },
+       .req_op = {
+                [CRT_READ] = {
+                        .cio_submit    = lov_io_submit
+                },
+                [CRT_WRITE] = {
+                        .cio_submit    = lov_io_submit
+                }
+        },
+       .cio_prepare_write = lov_io_prepare_write,
+       .cio_commit_write  = lov_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Empty lov io operations.
+ *
+ */
+
+static void lov_empty_io_fini(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct lov_object *lov = cl2lov(ios->cis_obj);
+       ENTRY;
+
+       if (atomic_dec_and_test(&lov->lo_active_ios))
+               wake_up_all(&lov->lo_waitq);
+       EXIT;
+}
+
+static void lov_empty_impossible(const struct lu_env *env,
+                                struct cl_io_slice *ios)
+{
+       LBUG();
+}
+
+#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible)
+
+/**
+ * An io operation vector for files without stripes.
+ */
+static const struct cl_io_operations lov_empty_io_ops = {
+       .op = {
+               [CIT_READ] = {
+                       .cio_fini       = lov_empty_io_fini,
+#if 0
+                       .cio_iter_init  = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_lock       = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_start      = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_end        = LOV_EMPTY_IMPOSSIBLE
+#endif
+               },
+               [CIT_WRITE] = {
+                       .cio_fini      = lov_empty_io_fini,
+                       .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_end       = LOV_EMPTY_IMPOSSIBLE
+               },
+               [CIT_SETATTR] = {
+                       .cio_fini      = lov_empty_io_fini,
+                       .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_end       = LOV_EMPTY_IMPOSSIBLE
+               },
+               [CIT_FAULT] = {
+                       .cio_fini      = lov_empty_io_fini,
+                       .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_end       = LOV_EMPTY_IMPOSSIBLE
+               },
+               [CIT_FSYNC] = {
+                       .cio_fini   = lov_empty_io_fini
+               },
+               [CIT_MISC] = {
+                       .cio_fini   = lov_empty_io_fini
+               }
+       },
+       .req_op = {
+                [CRT_READ] = {
+                        .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+                },
+                [CRT_WRITE] = {
+                        .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+                }
+        },
+       .cio_commit_write = LOV_EMPTY_IMPOSSIBLE
+};
+
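+/**
+ * Initializes the lov io slice for a striped (RAID0) object: computes the
+ * io extent, allocates the sub-io array and, on success, adds the slice to
+ * the top-level io.
+ */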
+int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj,
+                     struct cl_io *io)
+{
+       struct lov_io       *lio = lov_env_io(env);
+       struct lov_object   *lov = cl2lov(obj);
+
+       ENTRY;
+       INIT_LIST_HEAD(&lio->lis_active);
+       lov_io_slice_init(lio, lov, io);
+       if (io->ci_result == 0) {
+               io->ci_result = lov_io_subio_init(env, lio, io);
+               if (io->ci_result == 0) {
+                       cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
+                       atomic_inc(&lov->lo_active_ios);
+               }
+       }
+       RETURN(io->ci_result);
+}
+
+int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
+                     struct cl_io *io)
+{
+       struct lov_object *lov = cl2lov(obj);
+       struct lov_io *lio = lov_env_io(env);
+       int result;
+       ENTRY;
+
+       lio->lis_object = lov;
+       switch (io->ci_type) {
+       default:
+               LBUG();
+       case CIT_MISC:
+       case CIT_READ:
+               result = 0;
+               break;
+       case CIT_FSYNC:
+       case CIT_SETATTR:
+               result = +1;
+               break;
+       case CIT_WRITE:
+               result = -EBADF;
+               break;
+       case CIT_FAULT:
+               result = -EFAULT;
+               CERROR("Page fault on a file without stripes: "DFID"\n",
+                      PFID(lu_object_fid(&obj->co_lu)));
+               break;
+       }
+       if (result == 0) {
+               cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops);
+               atomic_inc(&lov->lo_active_ios);
+       }
+
+       io->ci_result = result < 0 ? result : 0;
+       RETURN(result != 0);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_lock.c b/drivers/staging/lustre/lustre/lov/lov_lock.c
new file mode 100644 (file)
index 0000000..bdf3334
--- /dev/null
@@ -0,0 +1,1253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+                                              struct cl_lock *parent);
+
+static int lov_lock_unuse(const struct lu_env *env,
+                         const struct cl_lock_slice *slice);
+/*****************************************************************************
+ *
+ * Lov lock operations.
+ *
+ */
+
+static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env,
+                                                  struct cl_lock *parent,
+                                                  struct lov_lock_sub *lls)
+{
+       struct lov_sublock_env *subenv;
+       struct lov_io     *lio    = lov_env_io(env);
+       struct cl_io       *io     = lio->lis_cl.cis_io;
+       struct lov_io_sub      *sub;
+
+       subenv = &lov_env_session(env)->ls_subenv;
+
+       /*
+        * FIXME: We tend to use the subio's env & io to call the sublock
+        * lock operations because the osc lock sometimes stores some control
+        * variables in the thread's IO information (currently only lockless
+        * information). However, if the lock's host (object) is different
+        * from the object of the current IO, we have no way to get the
+        * subenv and subio because they are not initialized at all. As a
+        * temporary fix, in this case we still borrow the parent's env to
+        * call sublock operations.
+        */
+       if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) {
+               subenv->lse_env = env;
+               subenv->lse_io  = io;
+               subenv->lse_sub = NULL;
+       } else {
+               sub = lov_sub_get(env, lio, lls->sub_stripe);
+               if (!IS_ERR(sub)) {
+                       subenv->lse_env = sub->sub_env;
+                       subenv->lse_io  = sub->sub_io;
+                       subenv->lse_sub = sub;
+               } else {
+                       subenv = (void*)sub;
+               }
+       }
+       return subenv;
+}
+
+static void lov_sublock_env_put(struct lov_sublock_env *subenv)
+{
+       if (subenv && subenv->lse_sub)
+               lov_sub_put(subenv->lse_sub);
+}
+
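+/**
+ * Attaches a newly created sub-lock to the top-lock at index \a idx: links
+ * it into the sub-lock's list of parents, takes a reference on the top-lock
+ * on behalf of the child, marks the slot as held and adds a lock user to
+ * the sub-lock.
+ */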
+static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck,
+                             struct cl_lock *sublock, int idx,
+                             struct lov_lock_link *link)
+{
+       struct lovsub_lock *lsl;
+       struct cl_lock     *parent = lck->lls_cl.cls_lock;
+       int              rc;
+
+       LASSERT(cl_lock_is_mutexed(parent));
+       LASSERT(cl_lock_is_mutexed(sublock));
+       ENTRY;
+
+       lsl = cl2sub_lock(sublock);
+       /*
+        * check that sub-lock doesn't have lock link to this top-lock.
+        */
+       LASSERT(lov_lock_link_find(env, lck, lsl) == NULL);
+       LASSERT(idx < lck->lls_nr);
+
+       lck->lls_sub[idx].sub_lock = lsl;
+       lck->lls_nr_filled++;
+       LASSERT(lck->lls_nr_filled <= lck->lls_nr);
+       list_add_tail(&link->lll_list, &lsl->lss_parents);
+       link->lll_idx = idx;
+       link->lll_super = lck;
+       cl_lock_get(parent);
+       lu_ref_add(&parent->cll_reference, "lov-child", sublock);
+       lck->lls_sub[idx].sub_flags |= LSF_HELD;
+       cl_lock_user_add(env, sublock);
+
+       rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx);
+       LASSERT(rc == 0); /* there is no way this can fail, currently */
+       EXIT;
+}
+
+static struct cl_lock *lov_sublock_alloc(const struct lu_env *env,
+                                        const struct cl_io *io,
+                                        struct lov_lock *lck,
+                                        int idx, struct lov_lock_link **out)
+{
+       struct cl_lock       *sublock;
+       struct cl_lock       *parent;
+       struct lov_lock_link *link;
+
+       LASSERT(idx < lck->lls_nr);
+       ENTRY;
+
+       OBD_SLAB_ALLOC_PTR_GFP(link, lov_lock_link_kmem, __GFP_IO);
+       if (link != NULL) {
+               struct lov_sublock_env *subenv;
+               struct lov_lock_sub  *lls;
+               struct cl_lock_descr *descr;
+
+               parent = lck->lls_cl.cls_lock;
+               lls    = &lck->lls_sub[idx];
+               descr  = &lls->sub_got;
+
+               subenv = lov_sublock_env_get(env, parent, lls);
+               if (!IS_ERR(subenv)) {
+                       /* CAVEAT: Don't try to add a field in lov_lock_sub
+                        * to remember the subio. This is because a lock can
+                        * be cached, but that is not true for IO. This
+                        * further means a sublock might be referenced in a
+                        * different IO context. -jay */
+
+                       sublock = cl_lock_hold(subenv->lse_env, subenv->lse_io,
+                                              descr, "lov-parent", parent);
+                       lov_sublock_env_put(subenv);
+               } else {
+                       /* an error occurred */
+                       sublock = (void *)subenv;
+               }
+
+               if (!IS_ERR(sublock))
+                       *out = link;
+               else
+                       OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+       } else
+               sublock = ERR_PTR(-ENOMEM);
+       RETURN(sublock);
+}
+
+static void lov_sublock_unlock(const struct lu_env *env,
+                              struct lovsub_lock *lsl,
+                              struct cl_lock_closure *closure,
+                              struct lov_sublock_env *subenv)
+{
+       ENTRY;
+       lov_sublock_env_put(subenv);
+       lsl->lss_active = NULL;
+       cl_lock_disclosure(env, closure);
+       EXIT;
+}
+
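+/**
+ * Locks the sub-lock's child lock through a closure. If the child is being
+ * freed or was cancelled, the stale link is removed and CLO_REPEAT is
+ * returned; otherwise the sub-lock environment is returned through \a lsep
+ * when requested.
+ */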
+static int lov_sublock_lock(const struct lu_env *env,
+                           struct lov_lock *lck,
+                           struct lov_lock_sub *lls,
+                           struct cl_lock_closure *closure,
+                           struct lov_sublock_env **lsep)
+{
+       struct lovsub_lock *sublock;
+       struct cl_lock     *child;
+       int              result = 0;
+       ENTRY;
+
+       LASSERT(list_empty(&closure->clc_list));
+
+       sublock = lls->sub_lock;
+       child = sublock->lss_cl.cls_lock;
+       result = cl_lock_closure_build(env, child, closure);
+       if (result == 0) {
+               struct cl_lock *parent = closure->clc_origin;
+
+               LASSERT(cl_lock_is_mutexed(child));
+               sublock->lss_active = parent;
+
+               if (unlikely((child->cll_state == CLS_FREEING) ||
+                            (child->cll_flags & CLF_CANCELLED))) {
+                       struct lov_lock_link *link;
+                       /*
+                        * we could race with lock deletion which temporarily
+                        * put the lock in freeing state, bug 19080.
+                        */
+                       LASSERT(!(lls->sub_flags & LSF_HELD));
+
+                       link = lov_lock_link_find(env, lck, sublock);
+                       LASSERT(link != NULL);
+                       lov_lock_unlink(env, link, sublock);
+                       lov_sublock_unlock(env, sublock, closure, NULL);
+                       lck->lls_cancel_race = 1;
+                       result = CLO_REPEAT;
+               } else if (lsep) {
+                       struct lov_sublock_env *subenv;
+                       subenv = lov_sublock_env_get(env, parent, lls);
+                       if (IS_ERR(subenv)) {
+                               lov_sublock_unlock(env, sublock,
+                                                  closure, NULL);
+                               result = PTR_ERR(subenv);
+                       } else {
+                               *lsep = subenv;
+                       }
+               }
+       }
+       RETURN(result);
+}
+
+/**
+ * Updates the result of a top-lock operation from a result of sub-lock
+ * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate
+ * over sub-locks and lov_subresult() is used to calculate the return value
+ * of a top-operation. To this end, possible return values of sub-operations
+ * are ordered as
+ *
+ *     - 0            success
+ *     - CLO_WAIT     wait for event
+ *     - CLO_REPEAT   repeat top-operation
+ *     - -ne          fundamental error
+ *
+ * Top-level return code can only go down through this list. CLO_REPEAT
+ * overwrites CLO_WAIT, because lock mutex was released and sleeping condition
+ * has to be rechecked by the upper layer.
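+ *
+ * For example, combining CLO_WAIT from one sub-lock with CLO_REPEAT from
+ * another yields CLO_REPEAT, and a negative error code from any sub-lock
+ * overrides both.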
+ */
+static int lov_subresult(int result, int rc)
+{
+       int result_rank;
+       int rc_rank;
+
+       ENTRY;
+
+       LASSERTF(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT,
+                "result = %d", result);
+       LASSERTF(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT,
+                "rc = %d\n", rc);
+       CLASSERT(CLO_WAIT < CLO_REPEAT);
+
+       /* calculate ranks in the ordering above */
+       result_rank = result < 0 ? 1 + CLO_REPEAT : result;
+       rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc;
+
+       if (result_rank < rc_rank)
+               result = rc;
+       RETURN(result);
+}
+
+/**
+ * Creates sub-locks for a given lov_lock for the first time.
+ *
+ * Goes through all sub-objects of top-object, and creates sub-locks on every
+ * sub-object intersecting with top-lock extent. This is complicated by the
+ * fact that top-lock (that is being created) can be accessed concurrently
+ * through already created sub-locks (possibly shared with other top-locks).
+ */
+static int lov_lock_sub_init(const struct lu_env *env,
+                            struct lov_lock *lck, const struct cl_io *io)
+{
+       int result = 0;
+       int i;
+       int nr;
+       obd_off start;
+       obd_off end;
+       obd_off file_start;
+       obd_off file_end;
+
+       struct lov_object       *loo    = cl2lov(lck->lls_cl.cls_obj);
+       struct lov_layout_raid0 *r0     = lov_r0(loo);
+       struct cl_lock    *parent = lck->lls_cl.cls_lock;
+
+       ENTRY;
+
+       lck->lls_orig = parent->cll_descr;
+       file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start);
+       file_end   = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1;
+
+       for (i = 0, nr = 0; i < r0->lo_nr; i++) {
+               /*
+                * XXX for wide striping a smarter algorithm is desirable,
+                * breaking out of the loop early.
+                */
+               if (lov_stripe_intersects(loo->lo_lsm, i,
+                                         file_start, file_end, &start, &end))
+                       nr++;
+       }
+       LASSERT(nr > 0);
+       OBD_ALLOC_LARGE(lck->lls_sub, nr * sizeof lck->lls_sub[0]);
+       if (lck->lls_sub == NULL)
+               RETURN(-ENOMEM);
+
+       lck->lls_nr = nr;
+       /*
+        * First, fill in sub-lock descriptions in
+        * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc()
+        * (called below in this function, and by lov_lock_enqueue()) to
+        * create sub-locks. At this moment, no other thread can access
+        * top-lock.
+        */
+       for (i = 0, nr = 0; i < r0->lo_nr; ++i) {
+               if (lov_stripe_intersects(loo->lo_lsm, i,
+                                         file_start, file_end, &start, &end)) {
+                       struct cl_lock_descr *descr;
+
+                       descr = &lck->lls_sub[nr].sub_descr;
+
+                       LASSERT(descr->cld_obj == NULL);
+                       descr->cld_obj   = lovsub2cl(r0->lo_sub[i]);
+                       descr->cld_start = cl_index(descr->cld_obj, start);
+                       descr->cld_end   = cl_index(descr->cld_obj, end);
+                       descr->cld_mode  = parent->cll_descr.cld_mode;
+                       descr->cld_gid   = parent->cll_descr.cld_gid;
+                       descr->cld_enq_flags   = parent->cll_descr.cld_enq_flags;
+                       /* XXX has no effect */
+                       lck->lls_sub[nr].sub_got = *descr;
+                       lck->lls_sub[nr].sub_stripe = i;
+                       nr++;
+               }
+       }
+       LASSERT(nr == lck->lls_nr);
+       /*
+        * Then, create sub-locks. Once at least one sub-lock was created,
+        * top-lock can be reached by other threads.
+        */
+       for (i = 0; i < lck->lls_nr; ++i) {
+               struct cl_lock       *sublock;
+               struct lov_lock_link *link;
+
+               if (lck->lls_sub[i].sub_lock == NULL) {
+                       sublock = lov_sublock_alloc(env, io, lck, i, &link);
+                       if (IS_ERR(sublock)) {
+                               result = PTR_ERR(sublock);
+                               break;
+                       }
+                       cl_lock_get_trust(sublock);
+                       cl_lock_mutex_get(env, sublock);
+                       cl_lock_mutex_get(env, parent);
+                       /*
+                        * recheck under mutex that sub-lock wasn't created
+                        * concurrently, and that top-lock is still alive.
+                        */
+                       if (lck->lls_sub[i].sub_lock == NULL &&
+                           parent->cll_state < CLS_FREEING) {
+                               lov_sublock_adopt(env, lck, sublock, i, link);
+                               cl_lock_mutex_put(env, parent);
+                       } else {
+                               OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+                               cl_lock_mutex_put(env, parent);
+                               cl_lock_unhold(env, sublock,
+                                              "lov-parent", parent);
+                       }
+                       cl_lock_mutex_put(env, sublock);
+                       cl_lock_put(env, sublock);
+               }
+       }
+       /*
+        * Some sub-locks can be missing at this point. This is not a problem,
+        * because enqueue will create them anyway. The main duty of this
+        * function is to fill in sub-lock descriptions in a race-free manner.
+        */
+       RETURN(result);
+}
+
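+/**
+ * Drops the LSF_HELD hold on sub-lock \a i. If the sub-lock is about to be
+ * cancelled or destroyed, the parent mutex is released around the unhold
+ * and the result is upgraded to CLO_REPEAT.
+ */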
+static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck,
+                              int i, int deluser, int rc)
+{
+       struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+       LASSERT(cl_lock_is_mutexed(parent));
+       ENTRY;
+
+       if (lck->lls_sub[i].sub_flags & LSF_HELD) {
+               struct cl_lock    *sublock;
+               int dying;
+
+               LASSERT(lck->lls_sub[i].sub_lock != NULL);
+               sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+               LASSERT(cl_lock_is_mutexed(sublock));
+
+               lck->lls_sub[i].sub_flags &= ~LSF_HELD;
+               if (deluser)
+                       cl_lock_user_del(env, sublock);
+               /*
+                * If the last hold is released, and cancellation is pending
+                * for a sub-lock, release parent mutex, to avoid keeping it
+                * while sub-lock is being paged out.
+                */
+               dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM ||
+                        sublock->cll_descr.cld_mode == CLM_GROUP ||
+                        (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) &&
+                       sublock->cll_holds == 1;
+               if (dying)
+                       cl_lock_mutex_put(env, parent);
+               cl_lock_unhold(env, sublock, "lov-parent", parent);
+               if (dying) {
+                       cl_lock_mutex_get(env, parent);
+                       rc = lov_subresult(rc, CLO_REPEAT);
+               }
+               /*
+                * From now on lck->lls_sub[i].sub_lock is a "weak" pointer,
+                * not backed by a reference on a
+                * sub-lock. lovsub_lock_delete() will clear
+                * lck->lls_sub[i].sub_lock under semaphores, just before
+                * sub-lock is destroyed.
+                */
+       }
+       RETURN(rc);
+}
+
+static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck,
+                            int i)
+{
+       struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+       LASSERT(cl_lock_is_mutexed(parent));
+       ENTRY;
+
+       if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) {
+               struct cl_lock *sublock;
+
+               LASSERT(lck->lls_sub[i].sub_lock != NULL);
+               sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+               LASSERT(cl_lock_is_mutexed(sublock));
+               LASSERT(sublock->cll_state != CLS_FREEING);
+
+               lck->lls_sub[i].sub_flags |= LSF_HELD;
+
+               cl_lock_get_trust(sublock);
+               cl_lock_hold_add(env, sublock, "lov-parent", parent);
+               cl_lock_user_add(env, sublock);
+               cl_lock_put(env, sublock);
+       }
+       EXIT;
+}
+
+static void lov_lock_fini(const struct lu_env *env,
+                         struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck;
+       int i;
+
+       ENTRY;
+       lck = cl2lov_lock(slice);
+       LASSERT(lck->lls_nr_filled == 0);
+       if (lck->lls_sub != NULL) {
+               for (i = 0; i < lck->lls_nr; ++i)
+                       /*
+                        * No sub-locks exist at this point, as a sub-lock has
+                        * a reference on its parent.
+                        */
+                       LASSERT(lck->lls_sub[i].sub_lock == NULL);
+               OBD_FREE_LARGE(lck->lls_sub,
+                              lck->lls_nr * sizeof lck->lls_sub[0]);
+       }
+       OBD_SLAB_FREE_PTR(lck, lov_lock_kmem);
+       EXIT;
+}
+
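+/**
+ * Waits for a conflicting enqueue on \a sublock with the top-lock mutex
+ * dropped, then asks the caller to repeat, since the top-lock state may
+ * have changed in the meantime.
+ */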
+static int lov_lock_enqueue_wait(const struct lu_env *env,
+                                struct lov_lock *lck,
+                                struct cl_lock *sublock)
+{
+       struct cl_lock *lock = lck->lls_cl.cls_lock;
+       int          result;
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+
+       cl_lock_mutex_put(env, lock);
+       result = cl_lock_enqueue_wait(env, sublock, 0);
+       cl_lock_mutex_get(env, lock);
+       RETURN(result ?: CLO_REPEAT);
+}
+
+/**
+ * Tries to advance a state machine of a given sub-lock toward enqueuing of
+ * the top-lock.
+ *
+ * \retval 0 if state-transition can proceed
+ * \retval -ve otherwise.
+ */
+static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck,
+                               struct cl_lock *sublock,
+                               struct cl_io *io, __u32 enqflags, int last)
+{
+       int result;
+       ENTRY;
+
+       /* first, try to enqueue a sub-lock ... */
+       result = cl_enqueue_try(env, sublock, io, enqflags);
+       if ((sublock->cll_state == CLS_ENQUEUED) && !(enqflags & CEF_AGL)) {
+               /* if it is enqueued, try to `wait' on it---maybe it's already
+                * granted */
+               result = cl_wait_try(env, sublock);
+               if (result == CLO_REENQUEUED)
+                       result = CLO_WAIT;
+       }
+       /*
+        * If the CEF_ASYNC flag is set, then all sub-locks can be enqueued in
+        * parallel; otherwise the enqueue has to wait until the sub-lock is
+        * granted before proceeding to the next one.
+        */
+       if ((result == CLO_WAIT) && (sublock->cll_state <= CLS_HELD) &&
+           (enqflags & CEF_ASYNC) && (!last || (enqflags & CEF_AGL)))
+               result = 0;
+       RETURN(result);
+}
+
+/**
+ * Helper function for lov_lock_enqueue() that creates missing sub-lock.
+ */
+static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent,
+                           struct cl_io *io, struct lov_lock *lck, int idx)
+{
+       struct lov_lock_link *link;
+       struct cl_lock       *sublock;
+       int                result;
+
+       LASSERT(parent->cll_depth == 1);
+       cl_lock_mutex_put(env, parent);
+       sublock = lov_sublock_alloc(env, io, lck, idx, &link);
+       if (!IS_ERR(sublock))
+               cl_lock_mutex_get(env, sublock);
+       cl_lock_mutex_get(env, parent);
+
+       if (!IS_ERR(sublock)) {
+               cl_lock_get_trust(sublock);
+               if (parent->cll_state == CLS_QUEUING &&
+                   lck->lls_sub[idx].sub_lock == NULL) {
+                       lov_sublock_adopt(env, lck, sublock, idx, link);
+               } else {
+                       OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+                       /* another thread allocated the sub-lock, or the
+                        * enqueue is no longer going on */
+                       cl_lock_mutex_put(env, parent);
+                       cl_lock_unhold(env, sublock, "lov-parent", parent);
+                       cl_lock_mutex_get(env, parent);
+               }
+               cl_lock_mutex_put(env, sublock);
+               cl_lock_put(env, sublock);
+               result = CLO_REPEAT;
+       } else
+               result = PTR_ERR(sublock);
+       return result;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This
+ * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock
+ * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock
+ * state machines in the face of sub-locks sharing (by multiple top-locks),
+ * and concurrent sub-lock cancellations.
+ */
+static int lov_lock_enqueue(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_io *io, __u32 enqflags)
+{
+       struct cl_lock   *lock    = slice->cls_lock;
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, lock);
+       int i;
+       int result;
+       enum cl_lock_state minstate;
+
+       ENTRY;
+
+       for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct lov_lock_sub    *lls;
+               struct cl_lock   *sublock;
+               struct lov_sublock_env *subenv;
+
+               if (lock->cll_state != CLS_QUEUING) {
+                       /*
+                        * The lock might have left the QUEUING state if a
+                        * previous iteration released its mutex. Stop
+                        * enqueuing in this case and let the upper layer
+                        * decide what to do.
+                        */
+                       LASSERT(i > 0 && result != 0);
+                       break;
+               }
+
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               /*
+                * Sub-lock might have been canceled, while top-lock was
+                * cached.
+                */
+               if (sub == NULL) {
+                       result = lov_sublock_fill(env, lock, io, lck, i);
+                       /* lov_sublock_fill() released @lock mutex,
+                        * restart. */
+                       break;
+               }
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       lov_sublock_hold(env, lck, i);
+                       rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock,
+                                                 subenv->lse_io, enqflags,
+                                                 i == lck->lls_nr - 1);
+                       minstate = min(minstate, sublock->cll_state);
+                       if (rc == CLO_WAIT) {
+                               switch (sublock->cll_state) {
+                               case CLS_QUEUING:
+                                       /* take a recursive mutex; the lock is
+                                        * released in lov_lock_enqueue_wait().
+                                        */
+                                       cl_lock_mutex_get(env, sublock);
+                                       lov_sublock_unlock(env, sub, closure,
+                                                          subenv);
+                                       rc = lov_lock_enqueue_wait(env, lck,
+                                                                  sublock);
+                                       break;
+                               case CLS_CACHED:
+                                       cl_lock_get(sublock);
+                                       /* take recursive mutex of sublock */
+                                       cl_lock_mutex_get(env, sublock);
+                                       /* need to release all locks in the
+                                        * closure, otherwise it may deadlock.
+                                        * LU-2683. */
+                                       lov_sublock_unlock(env, sub, closure,
+                                                          subenv);
+                                       /* sublock and parent are held. */
+                                       rc = lov_sublock_release(env, lck, i,
+                                                                1, rc);
+                                       cl_lock_mutex_put(env, sublock);
+                                       cl_lock_put(env, sublock);
+                                       break;
+                               default:
+                                       lov_sublock_unlock(env, sub, closure,
+                                                          subenv);
+                                       break;
+                               }
+                       } else {
+                               LASSERT(sublock->cll_conflict == NULL);
+                               lov_sublock_unlock(env, sub, closure, subenv);
+                       }
+               }
+               result = lov_subresult(result, rc);
+               if (result != 0)
+                       break;
+       }
+       cl_lock_closure_fini(closure);
+       RETURN(result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT);
+}
+
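+/**
+ * Top-lock unuse: releases every held sub-lock; returns -ESTALE if a
+ * sub-lock cancellation race was detected.
+ */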
+static int lov_lock_unuse(const struct lu_env *env,
+                         const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       int i;
+       int result;
+
+       ENTRY;
+
+       for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct cl_lock   *sublock;
+               struct lov_lock_sub    *lls;
+               struct lov_sublock_env *subenv;
+
+               /* The top-lock state cannot change concurrently, because the
+                * single thread that released the last hold carries unlocking
+                * through to completion. */
+               LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               if (sub == NULL)
+                       continue;
+
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       if (lls->sub_flags & LSF_HELD) {
+                               LASSERT(sublock->cll_state == CLS_HELD ||
+                                       sublock->cll_state == CLS_ENQUEUED);
+                               rc = cl_unuse_try(subenv->lse_env, sublock);
+                               rc = lov_sublock_release(env, lck, i, 0, rc);
+                       }
+                       lov_sublock_unlock(env, sub, closure, subenv);
+               }
+               result = lov_subresult(result, rc);
+       }
+
+       if (result == 0 && lck->lls_cancel_race) {
+               lck->lls_cancel_race = 0;
+               result = -ESTALE;
+       }
+       cl_lock_closure_fini(closure);
+       RETURN(result);
+}
+
+
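+/**
+ * Cancels the top-lock by unusing or releasing every held sub-lock; a
+ * CLO_REPEAT from a sub-lock causes the same index to be examined again.
+ */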
+static void lov_lock_cancel(const struct lu_env *env,
+                          const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       int i;
+       int result;
+
+       ENTRY;
+
+       for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct cl_lock   *sublock;
+               struct lov_lock_sub    *lls;
+               struct lov_sublock_env *subenv;
+
+               /* The top-lock state cannot change concurrently, because the
+                * single thread that released the last hold carries unlocking
+                * through to completion. */
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               if (sub == NULL)
+                       continue;
+
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       if (!(lls->sub_flags & LSF_HELD)) {
+                               lov_sublock_unlock(env, sub, closure, subenv);
+                               continue;
+                       }
+
+                       switch (sublock->cll_state) {
+                       case CLS_HELD:
+                               rc = cl_unuse_try(subenv->lse_env, sublock);
+                               lov_sublock_release(env, lck, i, 0, 0);
+                               break;
+                       default:
+                               lov_sublock_release(env, lck, i, 1, 0);
+                               break;
+                       }
+                       lov_sublock_unlock(env, sub, closure, subenv);
+               }
+
+               if (rc == CLO_REPEAT) {
+                       --i;
+                       continue;
+               }
+
+               result = lov_subresult(result, rc);
+       }
+
+       if (result)
+               CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock,
+                             "lov_lock_cancel fails with %d.\n", result);
+
+       cl_lock_closure_fini(closure);
+}
+
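+/*
+ * An implementation of cl_lock_operations::clo_wait(): waits until every
+ * sub-lock has reached at least CLS_HELD.  Sub-locks reporting CLO_REENQUEUED
+ * are counted and the scan is restarted; each sub-lock can only be
+ * re-enqueued once, so the loop terminates.
+ */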
+static int lov_lock_wait(const struct lu_env *env,
+                        const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       enum cl_lock_state      minstate;
+       int                  reenqueued;
+       int                  result;
+       int                  i;
+
+       ENTRY;
+
+again:
+       for (result = 0, minstate = CLS_FREEING, i = 0, reenqueued = 0;
+            i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct cl_lock   *sublock;
+               struct lov_lock_sub    *lls;
+               struct lov_sublock_env *subenv;
+
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               LASSERT(sub != NULL);
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       LASSERT(sublock->cll_state >= CLS_ENQUEUED);
+                       if (sublock->cll_state < CLS_HELD)
+                               rc = cl_wait_try(env, sublock);
+
+                       minstate = min(minstate, sublock->cll_state);
+                       lov_sublock_unlock(env, sub, closure, subenv);
+               }
+               if (rc == CLO_REENQUEUED) {
+                       reenqueued++;
+                       rc = 0;
+               }
+               result = lov_subresult(result, rc);
+               if (result != 0)
+                       break;
+       }
+       /* Each sub-lock can only be re-enqueued once, so this will not loop
+        * forever. */
+       if (result == 0 && reenqueued != 0)
+               goto again;
+       cl_lock_closure_fini(closure);
+       RETURN(result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT);
+}
+
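+/*
+ * An implementation of cl_lock_operations::clo_use(): takes a hold on every
+ * sub-lock and brings cached sub-locks back into use with cl_use_try().
+ * A missing or still-new (CLS_NEW) sub-lock means the cached top-lock no
+ * longer matches its sub-locks, so -ESTALE is returned and the upper layer
+ * resets the lock state to NEW.
+ */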
+static int lov_lock_use(const struct lu_env *env,
+                       const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       int                  result;
+       int                  i;
+
+       LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+       ENTRY;
+
+       for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct cl_lock   *sublock;
+               struct lov_lock_sub    *lls;
+               struct lov_sublock_env *subenv;
+
+               LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               if (sub == NULL) {
+                       /*
+                        * Sub-lock might have been canceled while the
+                        * top-lock was cached.
+                        */
+                       result = -ESTALE;
+                       break;
+               }
+
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       LASSERT(sublock->cll_state != CLS_FREEING);
+                       lov_sublock_hold(env, lck, i);
+                       if (sublock->cll_state == CLS_CACHED) {
+                               rc = cl_use_try(subenv->lse_env, sublock, 0);
+                               if (rc != 0)
+                                       rc = lov_sublock_release(env, lck,
+                                                                i, 1, rc);
+                       } else if (sublock->cll_state == CLS_NEW) {
+                               /* Sub-lock might have been canceled while the
+                                * top-lock was cached. */
+                               result = -ESTALE;
+                               lov_sublock_release(env, lck, i, 1, result);
+                       }
+                       lov_sublock_unlock(env, sub, closure, subenv);
+               }
+               result = lov_subresult(result, rc);
+               if (result != 0)
+                       break;
+       }
+
+       if (lck->lls_cancel_race) {
+               /*
+                * If an unlock happened at the same time, the sub-lock state
+                * should be FREEING and lov_sublock_lock() should have
+                * returned CLO_REPEAT.  In that case return -ESTALE so that
+                * the upper layer resets the lock state to NEW.
+                */
+               lck->lls_cancel_race = 0;
+               LASSERT(result != 0);
+               result = -ESTALE;
+       }
+       cl_lock_closure_fini(closure);
+       RETURN(result);
+}
+
+#if 0
+static int lock_lock_multi_match()
+{
+       struct cl_lock    *lock    = slice->cls_lock;
+       struct cl_lock_descr    *subneed = &lov_env_info(env)->lti_ldescr;
+       struct lov_object       *loo     = cl2lov(lov->lls_cl.cls_obj);
+       struct lov_layout_raid0 *r0      = lov_r0(loo);
+       struct lov_lock_sub     *sub;
+       struct cl_object        *subobj;
+       obd_off  fstart;
+       obd_off  fend;
+       obd_off  start;
+       obd_off  end;
+       int i;
+
+       fstart = cl_offset(need->cld_obj, need->cld_start);
+       fend   = cl_offset(need->cld_obj, need->cld_end + 1) - 1;
+       subneed->cld_mode = need->cld_mode;
+       cl_lock_mutex_get(env, lock);
+       for (i = 0; i < lov->lls_nr; ++i) {
+               sub = &lov->lls_sub[i];
+               if (sub->sub_lock == NULL)
+                       continue;
+               subobj = sub->sub_descr.cld_obj;
+               if (!lov_stripe_intersects(loo->lo_lsm, sub->sub_stripe,
+                                          fstart, fend, &start, &end))
+                       continue;
+               subneed->cld_start = cl_index(subobj, start);
+               subneed->cld_end   = cl_index(subobj, end);
+               subneed->cld_obj   = subobj;
+               if (!cl_lock_ext_match(&sub->sub_got, subneed)) {
+                       result = 0;
+                       break;
+               }
+       }
+       cl_lock_mutex_put(env, lock);
+}
+#endif
+
+/**
+ * Check whether the part of the extent region \a descr that falls on the
+ * given \a stripe is covered by the sub-lock extent \a child.
+ */
+static int lov_lock_stripe_is_matching(const struct lu_env *env,
+                                      struct lov_object *lov, int stripe,
+                                      const struct cl_lock_descr *child,
+                                      const struct cl_lock_descr *descr)
+{
+       struct lov_stripe_md *lsm = lov->lo_lsm;
+       obd_off start;
+       obd_off end;
+       int result;
+
+       if (lov_r0(lov)->lo_nr == 1)
+               return cl_lock_ext_match(child, descr);
+
+       /*
+        * For a multi-stripe object:
+        * - make sure the descr only covers the child's stripe, and
+        * - check whether the extent matches.
+        */
+       start = cl_offset(&lov->lo_cl, descr->cld_start);
+       end   = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1;
+       result = end - start <= lsm->lsm_stripe_size &&
+                stripe == lov_stripe_number(lsm, start) &&
+                stripe == lov_stripe_number(lsm, end);
+       if (result) {
+               struct cl_lock_descr *subd = &lov_env_info(env)->lti_ldescr;
+               obd_off sub_start;
+               obd_off sub_end;
+
+               subd->cld_obj  = NULL;   /* don't need sub object at all */
+               subd->cld_mode = descr->cld_mode;
+               subd->cld_gid  = descr->cld_gid;
+               result = lov_stripe_intersects(lsm, stripe, start, end,
+                                              &sub_start, &sub_end);
+               LASSERT(result);
+               subd->cld_start = cl_index(child->cld_obj, sub_start);
+               subd->cld_end   = cl_index(child->cld_obj, sub_end);
+               result = cl_lock_ext_match(child, subd);
+       }
+       return result;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_fits_into() method.
+ *
+ * Checks whether a lock (given by \a slice) is suitable for \a io.
+ * Multi-stripe locks can only be used for "quick" io, like truncate or an
+ * O_APPEND write.
+ *
+ * \see ccc_lock_fits_into().
+ */
+static int lov_lock_fits_into(const struct lu_env *env,
+                             const struct cl_lock_slice *slice,
+                             const struct cl_lock_descr *need,
+                             const struct cl_io *io)
+{
+       struct lov_lock   *lov = cl2lov_lock(slice);
+       struct lov_object *obj = cl2lov(slice->cls_obj);
+       int result;
+
+       LASSERT(cl_object_same(need->cld_obj, slice->cls_obj));
+       LASSERT(lov->lls_nr > 0);
+
+       ENTRY;
+
+       /* For the top-lock it is necessary to match the enqueue flags,
+        * otherwise re-enqueueing runs into problems when a sub-lock is
+        * missing. */
+       if (need->cld_enq_flags != lov->lls_orig.cld_enq_flags)
+               return 0;
+
+       if (need->cld_mode == CLM_GROUP)
+               /*
+                * Always allow matching a group lock.
+                */
+               result = cl_lock_ext_match(&lov->lls_orig, need);
+       else if (lov->lls_nr == 1) {
+               struct cl_lock_descr *got = &lov->lls_sub[0].sub_got;
+               result = lov_lock_stripe_is_matching(env,
+                                                    cl2lov(slice->cls_obj),
+                                                    lov->lls_sub[0].sub_stripe,
+                                                    got, need);
+       } else if (io->ci_type != CIT_SETATTR && io->ci_type != CIT_MISC &&
+                  !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM)
+               /*
+                * Multi-stripe locks are only suitable for `quick' IO and for
+                * glimpse.
+                */
+               result = 0;
+       else
+               /*
+                * Most general case: multi-stripe existing lock, and
+                * (potentially) multi-stripe @need lock. Check that @need is
+                * covered by @lov's sub-locks.
+                *
+                * For now, ignore lock expansions made by the server, and
+                * match against original lock extent.
+                */
+               result = cl_lock_ext_match(&lov->lls_orig, need);
+       CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %d %d/%d: %d\n",
+              PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got),
+              lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr,
+              result);
+       RETURN(result);
+}
+
+void lov_lock_unlink(const struct lu_env *env,
+                    struct lov_lock_link *link, struct lovsub_lock *sub)
+{
+       struct lov_lock *lck    = link->lll_super;
+       struct cl_lock  *parent = lck->lls_cl.cls_lock;
+
+       LASSERT(cl_lock_is_mutexed(parent));
+       LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+       ENTRY;
+
+       list_del_init(&link->lll_list);
+       LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub);
+       /* yank this sub-lock from parent's array */
+       lck->lls_sub[link->lll_idx].sub_lock = NULL;
+       LASSERT(lck->lls_nr_filled > 0);
+       lck->lls_nr_filled--;
+       lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock);
+       cl_lock_put(env, parent);
+       OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+       EXIT;
+}
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+                                        struct lov_lock *lck,
+                                        struct lovsub_lock *sub)
+{
+       struct lov_lock_link *scan;
+
+       LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+       ENTRY;
+
+       list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+               if (scan->lll_super == lck)
+                       RETURN(scan);
+       }
+       RETURN(NULL);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked for "top-to-bottom" delete, when lock destruction starts from the
+ * top-lock, e.g., as a result of inode destruction.
+ *
+ * Unlinks the top-lock from all its sub-locks. Sub-locks are not deleted
+ * here; that is done separately elsewhere:
+ *
+ *     - for inode destruction, lov_object_delete() calls cl_object_kill() for
+ *       each sub-object, purging its locks;
+ *
+ *     - in other cases (e.g., a fatal error with a top-lock) sub-locks are
+ *       left in the cache.
+ */
+static void lov_lock_delete(const struct lu_env *env,
+                           const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       struct lov_lock_link   *link;
+       int                  rc;
+       int                  i;
+
+       LASSERT(slice->cls_lock->cll_state == CLS_FREEING);
+       ENTRY;
+
+       for (i = 0; i < lck->lls_nr; ++i) {
+               struct lov_lock_sub *lls = &lck->lls_sub[i];
+               struct lovsub_lock  *lsl = lls->sub_lock;
+
+               if (lsl == NULL) /* already removed */
+                       continue;
+
+               rc = lov_sublock_lock(env, lck, lls, closure, NULL);
+               if (rc == CLO_REPEAT) {
+                       --i;
+                       continue;
+               }
+
+               LASSERT(rc == 0);
+               LASSERT(lsl->lss_cl.cls_lock->cll_state < CLS_FREEING);
+
+               if (lls->sub_flags & LSF_HELD)
+                       lov_sublock_release(env, lck, i, 1, 0);
+
+               link = lov_lock_link_find(env, lck, lsl);
+               LASSERT(link != NULL);
+               lov_lock_unlink(env, link, lsl);
+               LASSERT(lck->lls_sub[i].sub_lock == NULL);
+
+               lov_sublock_unlock(env, lsl, closure, NULL);
+       }
+
+       cl_lock_closure_fini(closure);
+       EXIT;
+}
+
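+/*
+ * An implementation of cl_lock_operations::clo_print(): prints the number of
+ * sub-locks and, for every stripe, its index and flags followed by the
+ * sub-lock description, or "---" when the sub-lock has already been removed.
+ */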
+static int lov_lock_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t p, const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck = cl2lov_lock(slice);
+       int           i;
+
+       (*p)(env, cookie, "%d\n", lck->lls_nr);
+       for (i = 0; i < lck->lls_nr; ++i) {
+               struct lov_lock_sub *sub;
+
+               sub = &lck->lls_sub[i];
+               (*p)(env, cookie, "    %d %x: ", i, sub->sub_flags);
+               if (sub->sub_lock != NULL)
+                       cl_lock_print(env, cookie, p,
+                                     sub->sub_lock->lss_cl.cls_lock);
+               else
+                       (*p)(env, cookie, "---\n");
+       }
+       return 0;
+}
+
+static const struct cl_lock_operations lov_lock_ops = {
+       .clo_fini      = lov_lock_fini,
+       .clo_enqueue   = lov_lock_enqueue,
+       .clo_wait      = lov_lock_wait,
+       .clo_use       = lov_lock_use,
+       .clo_unuse     = lov_lock_unuse,
+       .clo_cancel    = lov_lock_cancel,
+       .clo_fits_into = lov_lock_fits_into,
+       .clo_delete    = lov_lock_delete,
+       .clo_print     = lov_lock_print
+};
+
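+/*
+ * Allocates the lov_lock slice for a striped (RAID0) object, attaches it to
+ * the top-lock with lov_lock_ops and initializes its sub-lock state through
+ * lov_lock_sub_init().
+ */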
+int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_lock *lock, const struct cl_io *io)
+{
+       struct lov_lock *lck;
+       int result;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, __GFP_IO);
+       if (lck != NULL) {
+               cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops);
+               result = lov_lock_sub_init(env, lck, io);
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+static void lov_empty_lock_fini(const struct lu_env *env,
+                               struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck = cl2lov_lock(slice);
+       OBD_SLAB_FREE_PTR(lck, lov_lock_kmem);
+}
+
+static int lov_empty_lock_print(const struct lu_env *env, void *cookie,
+                       lu_printer_t p, const struct cl_lock_slice *slice)
+{
+       (*p)(env, cookie, "empty\n");
+       return 0;
+}
+
+/* XXX: more methods will be added later. */
+static const struct cl_lock_operations lov_empty_lock_ops = {
+       .clo_fini  = lov_empty_lock_fini,
+       .clo_print = lov_empty_lock_print
+};
+
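+/*
+ * Allocates a lov_lock slice that carries no sub-lock state and attaches it
+ * with the minimal lov_empty_lock_ops (fini and print only), remembering the
+ * original lock descriptor in lls_orig.
+ */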
+int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj,
+               struct cl_lock *lock, const struct cl_io *io)
+{
+       struct lov_lock *lck;
+       int result = -ENOMEM;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, __GFP_IO);
+       if (lck != NULL) {
+               cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops);
+               lck->lls_orig = lock->cll_descr;
+               result = 0;
+       }
+       RETURN(result);
+}
+
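+/*
+ * Returns the per-environment lock closure from lov_env_info(), initialized
+ * with \a parent as its origin.  The closure must be unused when requested;
+ * callers finalize it with cl_lock_closure_fini().
+ */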
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+                                              struct cl_lock *parent)
+{
+       struct cl_lock_closure *closure;
+
+       closure = &lov_env_info(env)->lti_closure;
+       LASSERT(list_empty(&closure->clc_list));
+       cl_lock_closure_init(env, closure, parent, 1);
+       return closure;
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_log.c b/drivers/staging/lustre/lustre/lov/lov_log.c
new file mode 100644 (file)
index 0000000..63b7f8d
--- /dev/null
@@ -0,0 +1,278 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_log.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+#include <lustre_mds.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <obd_ost.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+
+#include "lov_internal.h"
+
+/* Add log records for each OSC that this object is striped over, and return
+ * cookies for each one.  We _would_ have a nice abstraction here, except that
+ * we need to keep cookies in stripe order, even if some are NULL, so that
+ * the right cookies are passed back to the right OSTs at the client side.
+ * Unset cookies should be all-zero (which will never occur naturally). */
+static int lov_llog_origin_add(const struct lu_env *env,
+                              struct llog_ctxt *ctxt,
+                              struct llog_rec_hdr *rec,
+                              struct lov_stripe_md *lsm,
+                              struct llog_cookie *logcookies, int numcookies)
+{
+       struct obd_device *obd = ctxt->loc_obd;
+       struct lov_obd *lov = &obd->u.lov;
+       int i, rc = 0, cookies = 0;
+       ENTRY;
+
+       LASSERTF(logcookies && numcookies >= lsm->lsm_stripe_count,
+                "logcookies %p, numcookies %d lsm->lsm_stripe_count %d\n",
+                logcookies, numcookies, lsm->lsm_stripe_count);
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               struct obd_device *child =
+                       lov->lov_tgts[loi->loi_ost_idx]->ltd_exp->exp_obd;
+               struct llog_ctxt *cctxt = llog_get_context(child, ctxt->loc_idx);
+
+               /* fill mds unlink/setattr log record */
+               switch (rec->lrh_type) {
+               case MDS_UNLINK_REC: {
+                       struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+                       lur->lur_oid = ostid_id(&loi->loi_oi);
+                       lur->lur_oseq = (__u32)ostid_seq(&loi->loi_oi);
+                       break;
+               }
+               case MDS_SETATTR64_REC: {
+                       struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec;
+                       lsr->lsr_oi = loi->loi_oi;
+                       break;
+               }
+               default:
+                       break;
+               }
+
+               /* inject error in llog_obd_add() below */
+               if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FAIL_LOV_LOG_ADD)) {
+                       llog_ctxt_put(cctxt);
+                       cctxt = NULL;
+               }
+               rc = llog_obd_add(env, cctxt, rec, NULL, logcookies + cookies,
+                                 numcookies - cookies);
+               llog_ctxt_put(cctxt);
+               if (rc < 0) {
+                       CERROR("Can't add llog (rc = %d) for stripe %d\n",
+                              rc, cookies);
+                       memset(logcookies + cookies, 0,
+                              sizeof(struct llog_cookie));
+                       rc = 1; /* skip this cookie */
+               }
+               /* Note that rc is always 1 if llog_obd_add was successful */
+               cookies += rc;
+       }
+       RETURN(cookies);
+}
+
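+/*
+ * Connects the matching llog context of every active LOV target (or only the
+ * target matching \a uuid, when one is given) to the catalog described by
+ * \a logid and \a gen.  The first error encountered is returned after all
+ * targets have been tried.
+ */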
+static int lov_llog_origin_connect(struct llog_ctxt *ctxt,
+                                  struct llog_logid *logid,
+                                  struct llog_gen *gen,
+                                  struct obd_uuid *uuid)
+{
+       struct obd_device *obd = ctxt->loc_obd;
+       struct lov_obd *lov = &obd->u.lov;
+       int i, rc = 0, err = 0;
+       ENTRY;
+
+       obd_getref(obd);
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               struct obd_device *child;
+               struct llog_ctxt *cctxt;
+
+               if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
+                       continue;
+               if (uuid && !obd_uuid_equals(uuid, &lov->lov_tgts[i]->ltd_uuid))
+                       continue;
+               CDEBUG(D_CONFIG, "connect %d/%d\n", i, lov->desc.ld_tgt_count);
+               child = lov->lov_tgts[i]->ltd_exp->exp_obd;
+               cctxt = llog_get_context(child, ctxt->loc_idx);
+               rc = llog_connect(cctxt, logid, gen, uuid);
+               llog_ctxt_put(cctxt);
+
+               if (rc) {
+                       CERROR("error osc_llog_connect tgt %d (%d)\n", i, rc);
+                       if (!err)
+                               err = rc;
+               }
+       }
+       obd_putref(obd);
+
+       RETURN(err);
+}
+
+/* the replicators commit callback */
+static int lov_llog_repl_cancel(const struct lu_env *env,
+                               struct llog_ctxt *ctxt,
+                               struct lov_stripe_md *lsm,
+                               int count, struct llog_cookie *cookies,
+                               int flags)
+{
+       struct lov_obd *lov;
+       struct obd_device *obd = ctxt->loc_obd;
+       int rc = 0, i;
+       ENTRY;
+
+       LASSERT(lsm != NULL);
+       LASSERT(count == lsm->lsm_stripe_count);
+
+       lov = &obd->u.lov;
+       obd_getref(obd);
+       for (i = 0; i < count; i++, cookies++) {
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               struct obd_device *child =
+                       lov->lov_tgts[loi->loi_ost_idx]->ltd_exp->exp_obd;
+               struct llog_ctxt *cctxt =
+                       llog_get_context(child, ctxt->loc_idx);
+               int err;
+
+               err = llog_cancel(env, cctxt, NULL, 1, cookies, flags);
+               llog_ctxt_put(cctxt);
+               if (err && lov->lov_tgts[loi->loi_ost_idx]->ltd_active) {
+                       CERROR("%s: objid "DOSTID" subobj "DOSTID
+                              " on OST idx %d: rc = %d\n",
+                              obd->obd_name, POSTID(&lsm->lsm_oi),
+                              POSTID(&loi->loi_oi), loi->loi_ost_idx, err);
+                       if (!rc)
+                               rc = err;
+               }
+       }
+       obd_putref(obd);
+       RETURN(rc);
+}
+
+static struct llog_operations lov_mds_ost_orig_logops = {
+       .lop_obd_add    = lov_llog_origin_add,
+       .lop_connect    = lov_llog_origin_connect,
+};
+
+static struct llog_operations lov_size_repl_logops = {
+       .lop_cancel     = lov_llog_repl_cancel,
+};
+
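+/*
+ * Sets up the LLOG_MDS_OST_ORIG_CTXT and LLOG_SIZE_REPL_CTXT contexts for
+ * the LOV device and then calls obd_llog_init() on each target (or only the
+ * target selected by \a index).  Per-target failures are logged but not
+ * fatal; a failed context setup cleans up whatever was already set up.
+ */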
+int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                 struct obd_device *disk_obd, int *index)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct obd_device *child;
+       int i, rc = 0;
+       ENTRY;
+
+       LASSERT(olg == &obd->obd_olg);
+       rc = llog_setup(NULL, obd, olg, LLOG_MDS_OST_ORIG_CTXT, disk_obd,
+                       &lov_mds_ost_orig_logops);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_setup(NULL, obd, olg, LLOG_SIZE_REPL_CTXT, disk_obd,
+                       &lov_size_repl_logops);
+       if (rc)
+               GOTO(err_cleanup, rc);
+
+       obd_getref(obd);
+       /* count may not match lov->desc.ld_tgt_count during dynamic ost add */
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               if (!lov->lov_tgts[i])
+                       continue;
+
+               if (index && i != *index)
+                       continue;
+
+               child = lov->lov_tgts[i]->ltd_obd;
+               rc = obd_llog_init(child, &child->obd_olg, disk_obd, &i);
+               if (rc)
+                       CERROR("error osc_llog_init idx %d osc '%s' tgt '%s' "
+                              "(rc=%d)\n", i, child->obd_name,
+                              disk_obd->obd_name, rc);
+               rc = 0;
+       }
+       obd_putref(obd);
+       GOTO(err_cleanup, rc);
+err_cleanup:
+       if (rc) {
+               struct llog_ctxt *ctxt =
+                       llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+               if (ctxt)
+                       llog_cleanup(NULL, ctxt);
+               ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+               if (ctxt)
+                       llog_cleanup(NULL, ctxt);
+       }
+       return rc;
+}
+
+int lov_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       /* Clean up our llogs only if the contexts have been set up
+        * (the client lov does not set them up, the mds lov does). */
+       ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       /* lov->tgt llogs are cleaned during osc_cleanup. */
+       RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_merge.c b/drivers/staging/lustre/lustre/lov/lov_merge.c
new file mode 100644 (file)
index 0000000..ddbac12
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lov_internal.h"
+
+/** Merge the lock value block (\a lvb) attributes and KMS from each of the
+ * stripes in a file into a single lvb.  The caller is expected to initialize
+ * the current atime, mtime and ctime to avoid regressing a more up-to-date
+ * time on the local client.
+ */
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+                     struct ost_lvb *lvb, __u64 *kms_place)
+{
+       __u64 size = 0;
+       __u64 kms = 0;
+       __u64 blocks = 0;
+       obd_time current_mtime = lvb->lvb_mtime;
+       obd_time current_atime = lvb->lvb_atime;
+       obd_time current_ctime = lvb->lvb_ctime;
+       int i;
+       int rc = 0;
+
+       LASSERT(spin_is_locked(&lsm->lsm_lock));
+       LASSERT(lsm->lsm_lock_owner == current_pid());
+
+       CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s="LPU64" m="LPU64
+              " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi),
+              lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime,
+              lvb->lvb_blocks);
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               obd_size lov_size, tmpsize;
+
+               if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) {
+                       rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks);
+                       continue;
+               }
+
+               tmpsize = loi->loi_kms;
+               lov_size = lov_stripe_size(lsm, tmpsize, i);
+               if (lov_size > kms)
+                       kms = lov_size;
+
+               if (loi->loi_lvb.lvb_size > tmpsize)
+                       tmpsize = loi->loi_lvb.lvb_size;
+
+               lov_size = lov_stripe_size(lsm, tmpsize, i);
+               if (lov_size > size)
+                       size = lov_size;
+               /* merge blocks, mtime, atime */
+               blocks += loi->loi_lvb.lvb_blocks;
+               if (loi->loi_lvb.lvb_mtime > current_mtime)
+                       current_mtime = loi->loi_lvb.lvb_mtime;
+               if (loi->loi_lvb.lvb_atime > current_atime)
+                       current_atime = loi->loi_lvb.lvb_atime;
+               if (loi->loi_lvb.lvb_ctime > current_ctime)
+                       current_ctime = loi->loi_lvb.lvb_ctime;
+
+               CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s="LPU64" m="LPU64
+                      " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi),
+                      loi->loi_ost_idx, loi->loi_lvb.lvb_size,
+                      loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime,
+                      loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks);
+       }
+
+       *kms_place = kms;
+       lvb->lvb_size = size;
+       lvb->lvb_blocks = blocks;
+       lvb->lvb_mtime = current_mtime;
+       lvb->lvb_atime = current_atime;
+       lvb->lvb_ctime = current_ctime;
+       RETURN(rc);
+}
+
+/** Merge the lock value block (\a lvb) attributes from each of the stripes
+ * in a file into a single lvb.  The caller is expected to initialize the
+ * current atime, mtime and ctime to avoid regressing a more up-to-date time
+ * on the local client.
+ *
+ * If \a kms_only is set then we do not consider the recently seen size (RSS)
+ * when updating the known minimum size (KMS).  Even when merging the RSS, we
+ * will take the KMS value if it is larger.  This prevents getattr from
+ * stomping on dirty cached pages which extend the file size. */
+int lov_merge_lvb(struct obd_export *exp,
+                 struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only)
+{
+       int   rc;
+       __u64 kms;
+
+       ENTRY;
+       lov_stripe_lock(lsm);
+       rc = lov_merge_lvb_kms(lsm, lvb, &kms);
+       lov_stripe_unlock(lsm);
+       if (kms_only)
+               lvb->lvb_size = kms;
+
+       CDEBUG(D_INODE, "merged for ID "DOSTID" s="LPU64" m="LPU64" a="LPU64
+              " c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi), lvb->lvb_size,
+              lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks);
+       RETURN(rc);
+}
+
+/* Must be called under the lov_stripe_lock() */
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+                  obd_off size, int shrink)
+{
+       struct lov_oinfo *loi;
+       int stripe = 0;
+       __u64 kms;
+       ENTRY;
+
+       LASSERT(spin_is_locked(&lsm->lsm_lock));
+       LASSERT(lsm->lsm_lock_owner == current_pid());
+
+       if (shrink) {
+               for (; stripe < lsm->lsm_stripe_count; stripe++) {
+                       struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+                       kms = lov_size_to_stripe(lsm, size, stripe);
+                       CDEBUG(D_INODE,
+                              "stripe %d KMS %sing "LPU64"->"LPU64"\n",
+                              stripe, kms > loi->loi_kms ? "increas":"shrink",
+                              loi->loi_kms, kms);
+                       loi_kms_set(loi, loi->loi_lvb.lvb_size = kms);
+               }
+               RETURN(0);
+       }
+
+       if (size > 0)
+               stripe = lov_stripe_number(lsm, size - 1);
+       kms = lov_size_to_stripe(lsm, size, stripe);
+       loi = lsm->lsm_oinfo[stripe];
+
+       CDEBUG(D_INODE, "stripe %d KMS %sincreasing "LPU64"->"LPU64"\n",
+              stripe, kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms);
+       if (kms > loi->loi_kms)
+               loi_kms_set(loi, kms);
+
+       RETURN(0);
+}
+
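+/*
+ * Folds the per-stripe attributes in \a src into the file-wide attributes in
+ * \a tgt: sizes are converted from stripe to file offsets and maximized,
+ * block counts, block size and data version are summed, and mtime/ctime take
+ * the newest value.  \a set tracks whether \a tgt has already been seeded
+ * from a first stripe.
+ */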
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid,
+                    struct lov_stripe_md *lsm, int stripeno, int *set)
+{
+       valid &= src->o_valid;
+
+       if (*set) {
+               if (valid & OBD_MD_FLSIZE) {
+                       /* this handles sparse files properly */
+                       obd_size lov_size;
+
+                       lov_size = lov_stripe_size(lsm, src->o_size, stripeno);
+                       if (lov_size > tgt->o_size)
+                               tgt->o_size = lov_size;
+               }
+               if (valid & OBD_MD_FLBLOCKS)
+                       tgt->o_blocks += src->o_blocks;
+               if (valid & OBD_MD_FLBLKSZ)
+                       tgt->o_blksize += src->o_blksize;
+               if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
+                       tgt->o_ctime = src->o_ctime;
+               if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
+                       tgt->o_mtime = src->o_mtime;
+               if (valid & OBD_MD_FLDATAVERSION)
+                       tgt->o_data_version += src->o_data_version;
+       } else {
+               memcpy(tgt, src, sizeof(*tgt));
+               tgt->o_oi = lsm->lsm_oi;
+               if (valid & OBD_MD_FLSIZE)
+                       tgt->o_size = lov_stripe_size(lsm, src->o_size,
+                                                     stripeno);
+       }
+
+       /* data_version needs to be valid on all stripes to be correct! */
+       if (!(valid & OBD_MD_FLDATAVERSION))
+               tgt->o_valid &= ~OBD_MD_FLDATAVERSION;
+
+       *set += 1;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_obd.c b/drivers/staging/lustre/lustre/lov/lov_obd.c
new file mode 100644 (file)
index 0000000..8089f03
--- /dev/null
@@ -0,0 +1,2923 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_obd.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+#include <lustre_mds.h>
+#include <lustre_debug.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <obd_ost.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+#include <cl_object.h>
+#include <lclient.h>
+#include <lustre/ll_fiemap.h>
+#include <lustre_log.h>
+#include <lustre_fid.h>
+
+#include "lov_internal.h"
+
+/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion.
+   Any function that expects lov_tgts to remain stationary must take a ref. */
+static void lov_getref(struct obd_device *obd)
+{
+       struct lov_obd *lov = &obd->u.lov;
+
+       /* nobody gets through here until lov_putref is done */
+       mutex_lock(&lov->lov_lock);
+       atomic_inc(&lov->lov_refcount);
+       mutex_unlock(&lov->lov_lock);
+       return;
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt);
+
+static void lov_putref(struct obd_device *obd)
+{
+       struct lov_obd *lov = &obd->u.lov;
+
+       mutex_lock(&lov->lov_lock);
+       /* ok to dec to 0 more than once -- ltd_exp's will be null */
+       if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) {
+               LIST_HEAD(kill);
+               int i;
+               struct lov_tgt_desc *tgt, *n;
+               CDEBUG(D_CONFIG, "destroying %d lov targets\n",
+                      lov->lov_death_row);
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       tgt = lov->lov_tgts[i];
+
+                       if (!tgt || !tgt->ltd_reap)
+                               continue;
+                       list_add(&tgt->ltd_kill, &kill);
+                       /* XXX - right now there is a dependency on ld_tgt_count
+                        * being the maximum tgt index for computing the
+                        * mds_max_easize. So we can't shrink it. */
+                       lov_ost_pool_remove(&lov->lov_packed, i);
+                       lov->lov_tgts[i] = NULL;
+                       lov->lov_death_row--;
+               }
+               mutex_unlock(&lov->lov_lock);
+
+               list_for_each_entry_safe(tgt, n, &kill, ltd_kill) {
+                       list_del(&tgt->ltd_kill);
+                       /* Disconnect */
+                       __lov_del_obd(obd, tgt);
+               }
+       } else {
+               mutex_unlock(&lov->lov_lock);
+       }
+}
+
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+                             enum obd_notify_event ev);
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+                     enum obd_notify_event ev, void *data);
+
+#define MAX_STRING_SIZE 128
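+/* Connects a single target at \a index: activates its import if requested,
+ * registers the LOV as its observer, performs the obd_connect() with the
+ * shared connect data and adds a proc symlink under target_obds.  An
+ * administratively disabled import is left unconnected. */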
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+                   struct obd_connect_data *data)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct obd_uuid *tgt_uuid;
+       struct obd_device *tgt_obd;
+       static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" };
+       struct obd_import *imp;
+       proc_dir_entry_t *lov_proc_dir;
+       int rc;
+       ENTRY;
+
+       if (!lov->lov_tgts[index])
+               RETURN(-EINVAL);
+
+       tgt_uuid = &lov->lov_tgts[index]->ltd_uuid;
+       tgt_obd = lov->lov_tgts[index]->ltd_obd;
+
+       if (!tgt_obd->obd_set_up) {
+               CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid));
+               RETURN(-EINVAL);
+       }
+
+       /* override the sp_me from lov */
+       tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me;
+
+       if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX))
+               data->ocd_index = index;
+
+       /*
+        * Divine LOV knows that OBDs under it are OSCs.
+        */
+       imp = tgt_obd->u.cli.cl_import;
+
+       if (activate) {
+               tgt_obd->obd_no_recov = 0;
+               /* FIXME this is probably supposed to be
+                  ptlrpc_set_import_active.  Horrible naming. */
+               ptlrpc_activate_import(imp);
+       }
+
+       rc = obd_register_observer(tgt_obd, obd);
+       if (rc) {
+               CERROR("Target %s register_observer error %d\n",
+                      obd_uuid2str(tgt_uuid), rc);
+               RETURN(rc);
+       }
+
+       if (imp->imp_invalid) {
+               CDEBUG(D_CONFIG, "not connecting OSC %s; administratively "
+                      "disabled\n", obd_uuid2str(tgt_uuid));
+               RETURN(0);
+       }
+
+       rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd,
+                        &lov_osc_uuid, data, NULL);
+       if (rc || !lov->lov_tgts[index]->ltd_exp) {
+               CERROR("Target %s connect error %d\n",
+                      obd_uuid2str(tgt_uuid), rc);
+               RETURN(-ENODEV);
+       }
+
+       lov->lov_tgts[index]->ltd_reap = 0;
+
+       CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index,
+              obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in");
+
+       lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+       if (lov_proc_dir) {
+               struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd;
+               proc_dir_entry_t *osc_symlink;
+
+               LASSERT(osc_obd != NULL);
+               LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC);
+               LASSERT(osc_obd->obd_type->typ_name != NULL);
+
+               osc_symlink = lprocfs_add_symlink(osc_obd->obd_name,
+                                                 lov_proc_dir,
+                                                 "../../../%s/%s",
+                                                 osc_obd->obd_type->typ_name,
+                                                 osc_obd->obd_name);
+               if (osc_symlink == NULL) {
+                       CERROR("could not register LOV target "
+                               "/proc/fs/lustre/%s/%s/target_obds/%s.\n",
+                               obd->obd_type->typ_name, obd->obd_name,
+                               osc_obd->obd_name);
+                       lprocfs_remove(&lov_proc_dir);
+               }
+       }
+
+       RETURN(0);
+}
+
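+/*
+ * Connects the LOV device itself and then calls lov_connect_obd() for every
+ * configured target, passing the same connect data down so the resulting
+ * flags are the lowest common denominator.  Per-target connect errors are
+ * logged but do not fail the overall connect.
+ */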
+static int lov_connect(const struct lu_env *env,
+                      struct obd_export **exp, struct obd_device *obd,
+                      struct obd_uuid *cluuid, struct obd_connect_data *data,
+                      void *localdata)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct lov_tgt_desc *tgt;
+       struct lustre_handle conn;
+       int i, rc;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects);
+
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc)
+               RETURN(rc);
+
+       *exp = class_conn2export(&conn);
+
+       /* Why should there ever be more than 1 connect? */
+       lov->lov_connects++;
+       LASSERT(lov->lov_connects == 1);
+
+       memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd));
+       if (data)
+               lov->lov_ocd = *data;
+
+       obd_getref(obd);
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               tgt = lov->lov_tgts[i];
+               if (!tgt || obd_uuid_empty(&tgt->ltd_uuid))
+                       continue;
+               /* Flags will be lowest common denominator */
+               rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd);
+               if (rc) {
+                       CERROR("%s: lov connect tgt %d failed: %d\n",
+                              obd->obd_name, i, rc);
+                       continue;
+               }
+               /* connect to administratively disabled ost */
+               if (!lov->lov_tgts[i]->ltd_exp)
+                       continue;
+
+               rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd,
+                               OBD_NOTIFY_CONNECT, (void *)&i);
+               if (rc) {
+                       CERROR("%s error sending notify %d\n",
+                              obd->obd_name, rc);
+               }
+       }
+       obd_putref(obd);
+
+       RETURN(0);
+}
+
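+/*
+ * Disconnects a single target: marks it inactive, removes its proc symlink,
+ * propagates the force/fail/no_recov flags to the underlying OSC, clears the
+ * observer and drops the export.  Disconnect errors are logged and ignored.
+ */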
+static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+       proc_dir_entry_t *lov_proc_dir;
+       struct lov_obd *lov = &obd->u.lov;
+       struct obd_device *osc_obd;
+       int rc;
+       ENTRY;
+
+       osc_obd = class_exp2obd(tgt->ltd_exp);
+       CDEBUG(D_CONFIG, "%s: disconnecting target %s\n",
+              obd->obd_name, osc_obd->obd_name);
+
+       if (tgt->ltd_active) {
+               tgt->ltd_active = 0;
+               lov->desc.ld_active_tgt_count--;
+               tgt->ltd_exp->exp_obd->obd_inactive = 1;
+       }
+
+       lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
+       if (lov_proc_dir) {
+               proc_dir_entry_t *osc_symlink;
+
+               osc_symlink = lprocfs_srch(lov_proc_dir, osc_obd->obd_name);
+               if (osc_symlink) {
+                       lprocfs_remove(&osc_symlink);
+               } else {
+                       CERROR("/proc/fs/lustre/%s/%s/target_obds/%s missing.\n",
+                              obd->obd_type->typ_name, obd->obd_name,
+                              osc_obd->obd_name);
+               }
+       }
+
+       if (osc_obd) {
+               /* Pass it on to our clients.
+                * XXX This should be an argument to disconnect,
+                * XXX not a back-door flag on the OBD.  Ah well.
+                */
+               osc_obd->obd_force = obd->obd_force;
+               osc_obd->obd_fail = obd->obd_fail;
+               osc_obd->obd_no_recov = obd->obd_no_recov;
+       }
+
+       obd_register_observer(osc_obd, NULL);
+
+       rc = obd_disconnect(tgt->ltd_exp);
+       if (rc) {
+               CERROR("Target %s disconnect error %d\n",
+                      tgt->ltd_uuid.uuid, rc);
+               rc = 0;
+       }
+
+       tgt->ltd_exp = NULL;
+       RETURN(0);
+}
+
+static int lov_disconnect(struct obd_export *exp)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct lov_obd *lov = &obd->u.lov;
+       int i, rc;
+       ENTRY;
+
+       if (!lov->lov_tgts)
+               goto out;
+
+       /* Only disconnect the underlying layers on the final disconnect. */
+       lov->lov_connects--;
+       if (lov->lov_connects != 0) {
+               /* why should there be more than 1 connect? */
+               CERROR("disconnect #%d\n", lov->lov_connects);
+               goto out;
+       }
+
+       /* Let's hold another reference so lov_del_obd doesn't spin through
+          putref every time */
+       obd_getref(obd);
+
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) {
+                       /* Disconnection is the last we know about an obd */
+                       lov_del_target(obd, i, 0, lov->lov_tgts[i]->ltd_gen);
+               }
+       }
+       obd_putref(obd);
+
+out:
+       rc = class_disconnect(exp); /* bz 9811 */
+       RETURN(rc);
+}
+
+/* Error codes:
+ *
+ *  -EINVAL  : UUID can't be found in the LOV's target list
+ *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ *  -EBADF   : The UUID is found, but the OBD is the wrong type (!)
+ *  any >= 0 : is the lov target index
+ */
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+                             enum obd_notify_event ev)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct lov_tgt_desc *tgt;
+       int index, activate, active;
+       ENTRY;
+
+       CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n",
+              lov, uuid->uuid, ev);
+
+       obd_getref(obd);
+       for (index = 0; index < lov->desc.ld_tgt_count; index++) {
+               tgt = lov->lov_tgts[index];
+               if (!tgt)
+                       continue;
+               /*
+                * LU-642: an initially inactive OSC could have missed the
+                * obd_connect, so make up for it here.
+                */
+               if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL &&
+                   obd_uuid_equals(uuid, &tgt->ltd_uuid)) {
+                       struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"};
+
+                       obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd,
+                                   &lov_osc_uuid, &lov->lov_ocd, NULL);
+               }
+               if (!tgt->ltd_exp)
+                       continue;
+
+               CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n",
+                      index, obd_uuid2str(&tgt->ltd_uuid),
+                      tgt->ltd_exp->exp_handle.h_cookie);
+               if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+                       break;
+       }
+
+       if (index == lov->desc.ld_tgt_count)
+               GOTO(out, index = -EINVAL);
+
+       if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) {
+               activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0;
+
+               if (lov->lov_tgts[index]->ltd_activate == activate) {
+                       CDEBUG(D_INFO, "OSC %s already %sactivate!\n",
+                              uuid->uuid, activate ? "" : "de");
+               } else {
+                       lov->lov_tgts[index]->ltd_activate = activate;
+                       CDEBUG(D_CONFIG, "%sactivate OSC %s\n",
+                              activate ? "" : "de", obd_uuid2str(uuid));
+               }
+
+       } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) {
+               active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0;
+
+               if (lov->lov_tgts[index]->ltd_active == active) {
+                       CDEBUG(D_INFO, "OSC %s already %sactive!\n",
+                              uuid->uuid, active ? "" : "in");
+                       GOTO(out, index);
+               } else {
+                       CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n",
+                              obd_uuid2str(uuid), active ? "" : "in");
+               }
+
+               lov->lov_tgts[index]->ltd_active = active;
+               if (active) {
+                       lov->desc.ld_active_tgt_count++;
+                       lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0;
+               } else {
+                       lov->desc.ld_active_tgt_count--;
+                       lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1;
+               }
+       } else {
+               CERROR("Unknown event(%d) for uuid %s\n", ev, uuid->uuid);
+       }
+
+ out:
+       obd_putref(obd);
+       RETURN(index);
+}
+
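+/*
+ * Notification handler for the LOV: (de)activation events from an OSC first
+ * update the target state through lov_set_osc_active() and are then passed
+ * to the observer with the target index as data.  A notification without a
+ * watched device is fanned out to every target (skipping inactive targets
+ * for sync events).
+ */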
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+                     enum obd_notify_event ev, void *data)
+{
+       int rc = 0;
+       struct lov_obd *lov = &obd->u.lov;
+       ENTRY;
+
+       down_read(&lov->lov_notify_lock);
+       if (!lov->lov_connects) {
+               up_read(&lov->lov_notify_lock);
+               RETURN(rc);
+       }
+
+       if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE ||
+           ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) {
+               struct obd_uuid *uuid;
+
+               LASSERT(watched);
+
+               if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+                       up_read(&lov->lov_notify_lock);
+                       CERROR("unexpected notification of %s %s!\n",
+                              watched->obd_type->typ_name,
+                              watched->obd_name);
+                       RETURN(-EINVAL);
+               }
+               uuid = &watched->u.cli.cl_target_uuid;
+
+               /* Set OSC as active before notifying the observer, so the
+                * observer can use the OSC normally.
+                */
+               rc = lov_set_osc_active(obd, uuid, ev);
+               if (rc < 0) {
+                       up_read(&lov->lov_notify_lock);
+                       CERROR("event(%d) of %s failed: %d\n", ev,
+                              obd_uuid2str(uuid), rc);
+                       RETURN(rc);
+               }
+               /* these events pass the lov target index as data */
+               data = &rc;
+       }
+
+       /* Pass the notification up the chain. */
+       if (watched) {
+               rc = obd_notify_observer(obd, watched, ev, data);
+       } else {
+               /* A NULL watched device means all OSCs in the lov (only used
+                * for sync events); sync events send the lov index as data. */
+               struct lov_obd *lov = &obd->u.lov;
+               int i, is_sync;
+
+               data = &i;
+               is_sync = (ev == OBD_NOTIFY_SYNC) ||
+                         (ev == OBD_NOTIFY_SYNC_NONBLOCK);
+
+               obd_getref(obd);
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       if (!lov->lov_tgts[i])
+                               continue;
+
+                       /* don't send sync event if target not
+                        * connected/activated */
+                       if (is_sync &&  !lov->lov_tgts[i]->ltd_active)
+                               continue;
+
+                       rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd,
+                                                ev, data);
+                       if (rc) {
+                               CERROR("%s: notify %s of %s failed %d\n",
+                                      obd->obd_name,
+                                      obd->obd_observer->obd_name,
+                                      lov->lov_tgts[i]->ltd_obd->obd_name,
+                                      rc);
+                       }
+               }
+               obd_putref(obd);
+       }
+
+       up_read(&lov->lov_notify_lock);
+       RETURN(rc);
+}
+
+static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+                         __u32 index, int gen, int active)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct lov_tgt_desc *tgt;
+       struct obd_device *tgt_obd;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n",
+              uuidp->uuid, index, gen, active);
+
+       if (gen <= 0) {
+               CERROR("request to add OBD %s with invalid generation: %d\n",
+                      uuidp->uuid, gen);
+               RETURN(-EINVAL);
+       }
+
+       tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME,
+                                       &obd->obd_uuid);
+       if (tgt_obd == NULL)
+               RETURN(-EINVAL);
+
+       mutex_lock(&lov->lov_lock);
+
+       if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) {
+               tgt = lov->lov_tgts[index];
+               CERROR("UUID %s already assigned at LOV target index %d\n",
+                      obd_uuid2str(&tgt->ltd_uuid), index);
+               mutex_unlock(&lov->lov_lock);
+               RETURN(-EEXIST);
+       }
+
+       if (index >= lov->lov_tgt_size) {
+               /* We need to reallocate the lov target array. */
+               struct lov_tgt_desc **newtgts, **old = NULL;
+               __u32 newsize, oldsize = 0;
+
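+               /* Grow the array geometrically (at least 2 slots, doubling
+                  until the requested index fits) to limit reallocations. */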
+               newsize = max(lov->lov_tgt_size, (__u32)2);
+               while (newsize < index + 1)
+                       newsize = newsize << 1;
+               OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+               if (newtgts == NULL) {
+                       mutex_unlock(&lov->lov_lock);
+                       RETURN(-ENOMEM);
+               }
+
+               if (lov->lov_tgt_size) {
+                       memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) *
+                              lov->lov_tgt_size);
+                       old = lov->lov_tgts;
+                       oldsize = lov->lov_tgt_size;
+               }
+
+               lov->lov_tgts = newtgts;
+               lov->lov_tgt_size = newsize;
+               smp_rmb();
+               if (old)
+                       OBD_FREE(old, sizeof(*old) * oldsize);
+
+               CDEBUG(D_CONFIG, "tgts: %p size: %d\n",
+                      lov->lov_tgts, lov->lov_tgt_size);
+       }
+
+       OBD_ALLOC_PTR(tgt);
+       if (!tgt) {
+               mutex_unlock(&lov->lov_lock);
+               RETURN(-ENOMEM);
+       }
+
+       rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
+       if (rc) {
+               mutex_unlock(&lov->lov_lock);
+               OBD_FREE_PTR(tgt);
+               RETURN(rc);
+       }
+
+       tgt->ltd_uuid = *uuidp;
+       tgt->ltd_obd = tgt_obd;
+       /* XXX - add a sanity check on the generation number. */
+       tgt->ltd_gen = gen;
+       tgt->ltd_index = index;
+       tgt->ltd_activate = active;
+       lov->lov_tgts[index] = tgt;
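+       /* ld_tgt_count tracks the highest configured index + 1; sparse
+          indices are allowed, so the count is only ever grown here. */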
+       if (index >= lov->desc.ld_tgt_count)
+               lov->desc.ld_tgt_count = index + 1;
+
+       mutex_unlock(&lov->lov_lock);
+
+       CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
+               index, tgt->ltd_gen, lov->desc.ld_tgt_count);
+
+       rc = obd_notify(obd, tgt_obd, OBD_NOTIFY_CREATE, &index);
+
+       if (lov->lov_connects == 0) {
+               /* lov_connect hasn't been called yet. We'll do the
+                  lov_connect_obd on this target when that function first
+                  runs, because we don't know the connect flags yet. */
+               RETURN(0);
+       }
+
+       obd_getref(obd);
+
+       rc = lov_connect_obd(obd, index, active, &lov->lov_ocd);
+       if (rc)
+               GOTO(out, rc);
+
+       /* connect to administratively disabled OST */
+       if (!tgt->ltd_exp)
+               GOTO(out, rc = 0);
+
+       if (lov->lov_cache != NULL) {
+               rc = obd_set_info_async(NULL, tgt->ltd_exp,
+                               sizeof(KEY_CACHE_SET), KEY_CACHE_SET,
+                               sizeof(struct cl_client_cache), lov->lov_cache,
+                               NULL);
+               if (rc < 0)
+                       GOTO(out, rc);
+       }
+
+       rc = lov_notify(obd, tgt->ltd_exp->exp_obd,
+                       active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE,
+                       (void *)&index);
+
+out:
+       if (rc) {
+               CERROR("add failed (%d), deleting %s\n", rc,
+                      obd_uuid2str(&tgt->ltd_uuid));
+               lov_del_target(obd, index, 0, 0);
+       }
+       obd_putref(obd);
+       RETURN(rc);
+}
+
+/* Schedule a target for deletion */
+int lov_del_target(struct obd_device *obd, __u32 index,
+                  struct obd_uuid *uuidp, int gen)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       int count = lov->desc.ld_tgt_count;
+       int rc = 0;
+       ENTRY;
+
+       if (index >= count) {
+               CERROR("LOV target index %d >= number of LOV OBDs %d.\n",
+                      index, count);
+               RETURN(-EINVAL);
+       }
+
+       /* to make sure there's no ongoing lov_notify() now */
+       down_write(&lov->lov_notify_lock);
+       obd_getref(obd);
+
+       if (!lov->lov_tgts[index]) {
+               CERROR("LOV target at index %d is not setup.\n", index);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) {
+               CERROR("LOV target UUID %s at index %d doesn't match %s.\n",
+                      lov_uuid2str(lov, index), index,
+                      obd_uuid2str(uuidp));
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n",
+              lov_uuid2str(lov, index), index,
+              lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp,
+              lov->lov_tgts[index]->ltd_active);
+
+       lov->lov_tgts[index]->ltd_reap = 1;
+       lov->lov_death_row++;
+       /* the target is actually deleted in obd_putref() */
+out:
+       obd_putref(obd);
+       up_write(&lov->lov_notify_lock);
+
+       RETURN(rc);
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+       struct obd_device *osc_obd;
+
+       LASSERT(tgt);
+       LASSERT(tgt->ltd_reap);
+
+       osc_obd = class_exp2obd(tgt->ltd_exp);
+
+       CDEBUG(D_CONFIG, "Removing tgt %s : %s\n",
+              tgt->ltd_uuid.uuid,
+              osc_obd ? osc_obd->obd_name : "<no obd>");
+
+       if (tgt->ltd_exp)
+               lov_disconnect_obd(obd, tgt);
+
+       OBD_FREE_PTR(tgt);
+
+       /* Manual cleanup - there are no cleanup logs to clean up the OSCs.
+          We must do it ourselves, and we can't do it from lov_cleanup
+          because we just lost our only reference to the OSC. */
+       if (osc_obd)
+               class_manual_cleanup(osc_obd);
+}
+
+void lov_fix_desc_stripe_size(__u64 *val)
+{
+       if (*val < LOV_DEFAULT_STRIPE_SIZE) {
+               LCONSOLE_WARN("Increasing default stripe size to min %u\n",
+                             LOV_DEFAULT_STRIPE_SIZE);
+               *val = LOV_DEFAULT_STRIPE_SIZE;
+       } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) {
+               *val &= ~(LOV_MIN_STRIPE_SIZE - 1);
+               LCONSOLE_WARN("Changing default stripe size to "LPU64" (a "
+                             "multiple of %u)\n",
+                             *val, LOV_MIN_STRIPE_SIZE);
+       }
+}
+
+void lov_fix_desc_stripe_count(__u32 *val)
+{
+       if (*val == 0)
+               *val = 1;
+}
+
+void lov_fix_desc_pattern(__u32 *val)
+{
+       /* from lov_setstripe */
+       if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) {
+               LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val);
+               *val = 0;
+       }
+}
+
+void lov_fix_desc_qos_maxage(__u32 *val)
+{
+       /* fix qos_maxage */
+       if (*val == 0)
+               *val = QOS_DEFAULT_MAXAGE;
+}
+
+void lov_fix_desc(struct lov_desc *desc)
+{
+       lov_fix_desc_stripe_size(&desc->ld_default_stripe_size);
+       lov_fix_desc_stripe_count(&desc->ld_default_stripe_count);
+       lov_fix_desc_pattern(&desc->ld_pattern);
+       lov_fix_desc_qos_maxage(&desc->ld_qos_maxage);
+}
+
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       struct lov_desc *desc;
+       struct lov_obd *lov = &obd->u.lov;
+       int rc;
+       ENTRY;
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+               CERROR("LOV setup requires a descriptor\n");
+               RETURN(-EINVAL);
+       }
+
+       desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1);
+
+       if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+               CERROR("descriptor size wrong: %d > %d\n",
+                      (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+               RETURN(-EINVAL);
+       }
+
+       if (desc->ld_magic != LOV_DESC_MAGIC) {
+               if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) {
+                       CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n",
+                              obd->obd_name, desc);
+                       lustre_swab_lov_desc(desc);
+               } else {
+                       CERROR("%s: Bad lov desc magic: %#x\n",
+                              obd->obd_name, desc->ld_magic);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       lov_fix_desc(desc);
+
+       desc->ld_active_tgt_count = 0;
+       lov->desc = *desc;
+       lov->lov_tgt_size = 0;
+
+       mutex_init(&lov->lov_lock);
+       atomic_set(&lov->lov_refcount, 0);
+       lov->lov_sp_me = LUSTRE_SP_CLI;
+
+       init_rwsem(&lov->lov_notify_lock);
+
+       lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS,
+                                                  HASH_POOLS_MAX_BITS,
+                                                  HASH_POOLS_BKT_BITS, 0,
+                                                  CFS_HASH_MIN_THETA,
+                                                  CFS_HASH_MAX_THETA,
+                                                  &pool_hash_operations,
+                                                  CFS_HASH_DEFAULT);
+       INIT_LIST_HEAD(&lov->lov_pool_list);
+       lov->lov_pool_count = 0;
+       rc = lov_ost_pool_init(&lov->lov_packed, 0);
+       if (rc)
+               GOTO(out, rc);
+
+       lprocfs_lov_init_vars(&lvars);
+       lprocfs_obd_setup(obd, lvars.obd_vars);
+#ifdef LPROCFS
+       {
+               int rc;
+
+               rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+                                       0444, &lov_proc_target_fops, obd);
+               if (rc)
+                       CWARN("Error adding the target_obd file\n");
+       }
+#endif
+       lov->lov_pool_proc_entry = lprocfs_register("pools",
+                                                   obd->obd_proc_entry,
+                                                   NULL, NULL);
+
+       RETURN(0);
+
+out:
+       return rc;
+}
+
+static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       struct lov_obd *lov = &obd->u.lov;
+
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY: {
+               int i;
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
+                               continue;
+                       obd_precleanup(class_exp2obd(lov->lov_tgts[i]->ltd_exp),
+                                      OBD_CLEANUP_EARLY);
+               }
+               break;
+       }
+       case OBD_CLEANUP_EXPORTS:
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+       }
+       RETURN(rc);
+}
+
+static int lov_cleanup(struct obd_device *obd)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct list_head *pos, *tmp;
+       struct pool_desc *pool;
+       ENTRY;
+
+       list_for_each_safe(pos, tmp, &lov->lov_pool_list) {
+               pool = list_entry(pos, struct pool_desc, pool_list);
+               /* free pool structs */
+               CDEBUG(D_INFO, "delete pool %p\n", pool);
+               /* In the function below, .hs_keycmp resolves to
+                * pool_hashkey_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               lov_pool_del(obd, pool->pool_name);
+       }
+       cfs_hash_putref(lov->lov_pools_hash_body);
+       lov_ost_pool_free(&lov->lov_packed);
+
+       lprocfs_obd_cleanup(obd);
+       if (lov->lov_tgts) {
+               int i;
+               obd_getref(obd);
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       if (!lov->lov_tgts[i])
+                               continue;
+
+                       /* Inactive targets may never have connected */
+                       if (lov->lov_tgts[i]->ltd_active ||
+                           atomic_read(&lov->lov_refcount))
+                               /* We should never get here - these
+                                  should have been removed in the
+                                  disconnect. */
+                               CERROR("lov tgt %d not cleaned!"
+                                      " deathrow=%d, lovrc=%d\n",
+                                      i, lov->lov_death_row,
+                                      atomic_read(&lov->lov_refcount));
+                       lov_del_target(obd, i, 0, 0);
+               }
+               obd_putref(obd);
+               OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) *
+                        lov->lov_tgt_size);
+               lov->lov_tgt_size = 0;
+       }
+       RETURN(0);
+}
+
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+                           __u32 *indexp, int *genp)
+{
+       struct obd_uuid obd_uuid;
+       int cmd;
+       int rc = 0;
+       ENTRY;
+
+       switch(cmd = lcfg->lcfg_command) {
+       case LCFG_LOV_ADD_OBD:
+       case LCFG_LOV_ADD_INA:
+       case LCFG_LOV_DEL_OBD: {
+               __u32 index;
+               int gen;
+               /* lov_modify_tgts add  0:lov_mdsA  1:ost1_UUID  2:0  3:1 */
+               if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
+                       GOTO(out, rc = -EINVAL);
+
+               obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
+
+               if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1)
+                       GOTO(out, rc = -EINVAL);
+               if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1)
+                       GOTO(out, rc = -EINVAL);
+               index = *indexp;
+               gen = *genp;
+               if (cmd == LCFG_LOV_ADD_OBD)
+                       rc = lov_add_target(obd, &obd_uuid, index, gen, 1);
+               else if (cmd == LCFG_LOV_ADD_INA)
+                       rc = lov_add_target(obd, &obd_uuid, index, gen, 0);
+               else
+                       rc = lov_del_target(obd, index, &obd_uuid, gen);
+               GOTO(out, rc);
+       }
+       case LCFG_PARAM: {
+               struct lprocfs_static_vars lvars = { 0 };
+               struct lov_desc *desc = &(obd->u.lov.desc);
+
+               if (!desc)
+                       GOTO(out, rc = -EINVAL);
+
+               lprocfs_lov_init_vars(&lvars);
+
+               rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars,
+                                             lcfg, obd);
+               if (rc > 0)
+                       rc = 0;
+               GOTO(out, rc);
+       }
+       case LCFG_POOL_NEW:
+       case LCFG_POOL_ADD:
+       case LCFG_POOL_DEL:
+       case LCFG_POOL_REM:
+               GOTO(out, rc);
+
+       default: {
+               CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+               GOTO(out, rc = -EINVAL);
+
+       }
+       }
+out:
+       RETURN(rc);
+}
+
+static int lov_recreate(struct obd_export *exp, struct obdo *src_oa,
+                       struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+       struct lov_stripe_md *obj_mdp, *lsm;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       unsigned ost_idx;
+       int rc, i;
+       ENTRY;
+
+       LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS &&
+               src_oa->o_flags & OBD_FL_RECREATE_OBJS);
+
+       OBD_ALLOC(obj_mdp, sizeof(*obj_mdp));
+       if (obj_mdp == NULL)
+               RETURN(-ENOMEM);
+
+       ost_idx = src_oa->o_nlink;
+       lsm = *ea;
+       if (lsm == NULL)
+               GOTO(out, rc = -EINVAL);
+       if (ost_idx >= lov->desc.ld_tgt_count ||
+           !lov->lov_tgts[ost_idx])
+               GOTO(out, rc = -EINVAL);
+
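+       /* Make sure the object being recreated actually has a stripe on
+          this OST and that its object id matches the request. */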
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               if (lsm->lsm_oinfo[i]->loi_ost_idx == ost_idx) {
+                       if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) !=
+                                       ostid_id(&src_oa->o_oi))
+                               GOTO(out, rc = -EINVAL);
+                       break;
+               }
+       }
+       if (i == lsm->lsm_stripe_count)
+               GOTO(out, rc = -EINVAL);
+
+       rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp,
+                       src_oa, &obj_mdp, oti);
+out:
+       OBD_FREE(obj_mdp, sizeof(*obj_mdp));
+       RETURN(rc);
+}
+
+/* the LOV expects oa->o_id to be set to the LOV object id */
+static int lov_create(const struct lu_env *env, struct obd_export *exp,
+                     struct obdo *src_oa, struct lov_stripe_md **ea,
+                     struct obd_trans_info *oti)
+{
+       struct lov_obd *lov;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(ea != NULL);
+       if (exp == NULL)
+               RETURN(-EINVAL);
+
+       if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+           src_oa->o_flags == OBD_FL_DELORPHAN) {
+               /* should not be used with LOV anymore */
+               LBUG();
+       }
+
+       lov = &exp->exp_obd->u.lov;
+       if (!lov->desc.ld_active_tgt_count)
+               RETURN(-EIO);
+
+       obd_getref(exp->exp_obd);
+       /* Recreate a specific object id at the given OST index */
+       if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+           (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) {
+               rc = lov_recreate(exp, src_oa, ea, oti);
+       }
+
+       obd_putref(exp->exp_obd);
+       RETURN(rc);
+}
+
+#define ASSERT_LSM_MAGIC(lsmp)                                           \
+do {                                                                       \
+       LASSERT((lsmp) != NULL);                                                \
+       LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 ||                    \
+                (lsmp)->lsm_magic == LOV_MAGIC_V3),                        \
+                "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic);            \
+} while (0)
+
+static int lov_destroy(const struct lu_env *env, struct obd_export *exp,
+                      struct obdo *oa, struct lov_stripe_md *lsm,
+                      struct obd_trans_info *oti, struct obd_export *md_exp,
+                      void *capa)
+{
+       struct lov_request_set *set;
+       struct obd_info oinfo;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       int rc = 0, err = 0;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       if (oa->o_valid & OBD_MD_FLCOOKIE) {
+               LASSERT(oti);
+               LASSERT(oti->oti_logcookies);
+       }
+
+       lov = &exp->exp_obd->u.lov;
+       obd_getref(exp->exp_obd);
+       rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set);
+       if (rc)
+               GOTO(out, rc);
+
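+       /* One sub-request per stripe: destroy each OST object, recording
+          the first error while continuing with the remaining stripes. */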
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               if (oa->o_valid & OBD_MD_FLCOOKIE)
+                       oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+               err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                 req->rq_oi.oi_oa, NULL, oti, NULL, capa);
+               err = lov_update_common_set(set, req, err);
+               if (err) {
+                       CERROR("%s: destroying objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name, POSTID(&oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi),
+                              req->rq_idx, err);
+                       if (!rc)
+                               rc = err;
+               }
+       }
+
+       if (rc == 0) {
+               LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+               rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
+       }
+       err = lov_fini_destroy_set(set);
+out:
+       obd_putref(exp->exp_obd);
+       RETURN(rc ? rc : err);
+}
+
+static int lov_getattr(const struct lu_env *env, struct obd_export *exp,
+                      struct obd_info *oinfo)
+{
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       int err = 0, rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+
+       rc = lov_prep_getattr_set(exp, oinfo, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+                      " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+                      POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+               rc = obd_getattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                &req->rq_oi);
+               err = lov_update_common_set(set, req, rc);
+               if (err) {
+                       CERROR("%s: getattr objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&oinfo->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi),
+                              req->rq_idx, err);
+                       break;
+               }
+       }
+
+       rc = lov_fini_getattr_set(set);
+       if (err)
+               rc = err;
+       RETURN(rc);
+}
+
+static int lov_getattr_interpret(struct ptlrpc_request_set *rqset,
+                                void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       int err;
+       ENTRY;
+
+       /* don't do attribute merge if this async op failed */
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_getattr_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+                             struct ptlrpc_request_set *rqset)
+{
+       struct lov_request_set *lovset;
+       struct lov_obd *lov;
+       struct list_head *pos;
+       struct lov_request *req;
+       int rc = 0, err;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+
+       rc = lov_prep_getattr_set(exp, oinfo, &lovset);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+              POSTID(&oinfo->oi_md->lsm_oi), oinfo->oi_md->lsm_stripe_count,
+              oinfo->oi_md->lsm_stripe_size);
+
+       list_for_each(pos, &lovset->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+                      " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+                      POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+               rc = obd_getattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                      &req->rq_oi, rqset);
+               if (rc) {
+                       CERROR("%s: getattr objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&oinfo->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi),
+                              req->rq_idx, rc);
+                       GOTO(out, rc);
+               }
+       }
+
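+       /* If requests were queued, defer the attribute merge and cleanup to
+          the interpret callback once they all complete. */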
+       if (!list_empty(&rqset->set_requests)) {
+               LASSERT(rc == 0);
+               LASSERT(rqset->set_interpret == NULL);
+               rqset->set_interpret = lov_getattr_interpret;
+               rqset->set_arg = (void *)lovset;
+               RETURN(rc);
+       }
+out:
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_getattr_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+static int lov_setattr(const struct lu_env *env, struct obd_export *exp,
+                      struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov;
+       struct list_head *pos;
+       struct lov_request *req;
+       int err = 0, rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       /* for now, we only expect the following updates here */
+       LASSERT(!(oinfo->oi_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE |
+                                           OBD_MD_FLMODE | OBD_MD_FLATIME |
+                                           OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+                                           OBD_MD_FLFLAGS | OBD_MD_FLSIZE |
+                                           OBD_MD_FLGROUP | OBD_MD_FLUID |
+                                           OBD_MD_FLGID | OBD_MD_FLFID |
+                                           OBD_MD_FLGENER)));
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               rc = obd_setattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                &req->rq_oi, NULL);
+               err = lov_update_setattr_set(set, req, rc);
+               if (err) {
+                       CERROR("%s: setattr objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&set->set_oi->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx,
+                              err);
+                       if (!rc)
+                               rc = err;
+               }
+       }
+       err = lov_fini_setattr_set(set);
+       if (!rc)
+               rc = err;
+       RETURN(rc);
+}
+
+static int lov_setattr_interpret(struct ptlrpc_request_set *rqset,
+                                void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       int err;
+       ENTRY;
+
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_setattr_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+/* If @oti is given, the request comes from the MDS and responses from the
+   OSTs are not needed. Otherwise, a client is waiting for responses. */
+static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+                            struct obd_trans_info *oti,
+                            struct ptlrpc_request_set *rqset)
+{
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+       if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) {
+               LASSERT(oti);
+               LASSERT(oti->oti_logcookies);
+       }
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+              POSTID(&oinfo->oi_md->lsm_oi),
+              oinfo->oi_md->lsm_stripe_count,
+              oinfo->oi_md->lsm_stripe_size);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+                       oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+               CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+                      " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+                      POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+               rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                      &req->rq_oi, oti, rqset);
+               if (rc) {
+                       CERROR("error: setattr objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              POSTID(&set->set_oi->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi),
+                              req->rq_idx, rc);
+                       break;
+               }
+       }
+
+       /* If we are not waiting for responses on async requests, return. */
+       if (rc || !rqset || list_empty(&rqset->set_requests)) {
+               int err;
+               if (rc)
+                       atomic_set(&set->set_completes, 0);
+               err = lov_fini_setattr_set(set);
+               RETURN(rc ? rc : err);
+       }
+
+       LASSERT(rqset->set_interpret == NULL);
+       rqset->set_interpret = lov_setattr_interpret;
+       rqset->set_arg = (void *)set;
+
+       RETURN(0);
+}
+
+static int lov_punch_interpret(struct ptlrpc_request_set *rqset,
+                              void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       int err;
+       ENTRY;
+
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_punch_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+/* FIXME: maybe we'll just make one node the authoritative attribute node, then
+ * we can send this 'punch' to just the authoritative node and the nodes
+ * that the punch will affect. */
+static int lov_punch(const struct lu_env *env, struct obd_export *exp,
+                    struct obd_info *oinfo, struct obd_trans_info *oti,
+                    struct ptlrpc_request_set *rqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov;
+       struct list_head *pos;
+       struct lov_request *req;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_punch_set(exp, oinfo, oti, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               rc = obd_punch(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                              &req->rq_oi, NULL, rqset);
+               if (rc) {
+                       CERROR("%s: punch objid "DOSTID" subobj "DOSTID
+                              " on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&set->set_oi->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx, rc);
+                       break;
+               }
+       }
+
+       if (rc || list_empty(&rqset->set_requests)) {
+               int err;
+               err = lov_fini_punch_set(set);
+               RETURN(rc ? rc : err);
+       }
+
+       LASSERT(rqset->set_interpret == NULL);
+       rqset->set_interpret = lov_punch_interpret;
+       rqset->set_arg = (void *)set;
+
+       RETURN(0);
+}
+
+static int lov_sync_interpret(struct ptlrpc_request_set *rqset,
+                             void *data, int rc)
+{
+       struct lov_request_set *lovset = data;
+       int err;
+       ENTRY;
+
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_sync_set(lovset);
+       RETURN(rc ?: err);
+}
+
+static int lov_sync(const struct lu_env *env, struct obd_export *exp,
+                   struct obd_info *oinfo, obd_off start, obd_off end,
+                   struct ptlrpc_request_set *rqset)
+{
+       struct lov_request_set *set = NULL;
+       struct lov_obd *lov;
+       struct list_head *pos;
+       struct lov_request *req;
+       int rc = 0;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+       LASSERT(rqset != NULL);
+
+       if (!exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_sync_set(exp, oinfo, start, end, &set);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INFO, "fsync objid "DOSTID" ["LPX64", "LPX64"]\n",
+              POSTID(&set->set_oi->oi_oa->o_oi), start, end);
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               rc = obd_sync(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                             &req->rq_oi, req->rq_oi.oi_policy.l_extent.start,
+                             req->rq_oi.oi_policy.l_extent.end, rqset);
+               if (rc) {
+                       CERROR("%s: fsync objid "DOSTID" subobj "DOSTID
+                              " on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&set->set_oi->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx,
+                              rc);
+                       break;
+               }
+       }
+
+       /* If we are not waiting for responses on async requests, return. */
+       if (rc || list_empty(&rqset->set_requests)) {
+               int err = lov_fini_sync_set(set);
+
+               RETURN(rc ?: err);
+       }
+
+       LASSERT(rqset->set_interpret == NULL);
+       rqset->set_interpret = lov_sync_interpret;
+       rqset->set_arg = (void *)set;
+
+       RETURN(0);
+}
+
+static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
+                        obd_count oa_bufs, struct brw_page *pga)
+{
+       struct obd_info oinfo = { { { 0 } } };
+       int i, rc = 0;
+
+       oinfo.oi_oa = lov_oinfo->oi_oa;
+
+       /* The caller just wants to know if there's a chance that this
+        * I/O can succeed */
+       for (i = 0; i < oa_bufs; i++) {
+               int stripe = lov_stripe_number(lov_oinfo->oi_md, pga[i].off);
+               int ost = lov_oinfo->oi_md->lsm_oinfo[stripe]->loi_ost_idx;
+               obd_off start, end;
+
+               if (!lov_stripe_intersects(lov_oinfo->oi_md, i, pga[i].off,
+                                          pga[i].off + pga[i].count - 1,
+                                          &start, &end))
+                       continue;
+
+               if (!lov->lov_tgts[ost] || !lov->lov_tgts[ost]->ltd_active) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", ost);
+                       return -EIO;
+               }
+
+               rc = obd_brw(OBD_BRW_CHECK, lov->lov_tgts[ost]->ltd_exp, &oinfo,
+                            1, &pga[i], NULL);
+               if (rc)
+                       break;
+       }
+       return rc;
+}
+
+static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+                  obd_count oa_bufs, struct brw_page *pga,
+                  struct obd_trans_info *oti)
+{
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int err, rc = 0;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (cmd == OBD_BRW_CHECK) {
+               rc = lov_brw_check(lov, oinfo, oa_bufs, pga);
+               RETURN(rc);
+       }
+
+       rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each (pos, &set->set_list) {
+               struct obd_export *sub_exp;
+               struct brw_page *sub_pga;
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               sub_exp = lov->lov_tgts[req->rq_idx]->ltd_exp;
+               sub_pga = set->set_pga + req->rq_pgaidx;
+               rc = obd_brw(cmd, sub_exp, &req->rq_oi, req->rq_oabufs,
+                            sub_pga, oti);
+               if (rc)
+                       break;
+               lov_update_common_set(set, req, rc);
+       }
+
+       err = lov_fini_brw_set(set);
+       if (!rc)
+               rc = err;
+       RETURN(rc);
+}
+
+static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset,
+                                void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       ENTRY;
+       rc = lov_fini_enqueue_set(lovset, lovset->set_ei->ei_mode, rc, rqset);
+       RETURN(rc);
+}
+
+static int lov_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+                      struct ldlm_enqueue_info *einfo,
+                      struct ptlrpc_request_set *rqset)
+{
+       ldlm_mode_t mode = einfo->ei_mode;
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       ldlm_error_t rc;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
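+       /* mode & -mode keeps only the lowest set bit, so this asserts that
+          a single lock mode bit is set */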
+       LASSERT(mode == (mode & -mode));
+
+       /* we should never be asked to replay a lock this way. */
+       LASSERT((oinfo->oi_flags & LDLM_FL_REPLAY) == 0);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_enqueue_set(exp, oinfo, einfo, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               rc = obd_enqueue(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                &req->rq_oi, einfo, rqset);
+               if (rc != ELDLM_OK)
+                       GOTO(out, rc);
+       }
+
+       if (rqset && !list_empty(&rqset->set_requests)) {
+               LASSERT(rc == 0);
+               LASSERT(rqset->set_interpret == NULL);
+               rqset->set_interpret = lov_enqueue_interpret;
+               rqset->set_arg = (void *)set;
+               RETURN(rc);
+       }
+out:
+       rc = lov_fini_enqueue_set(set, mode, rc, rqset);
+       RETURN(rc);
+}
+
+static int lov_change_cbdata(struct obd_export *exp,
+                            struct lov_stripe_md *lsm, ldlm_iterator_t it,
+                            void *data)
+{
+       struct lov_obd *lov;
+       int rc = 0, i;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_stripe_md submd;
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+               if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                       CDEBUG(D_HA, "lov idx %d NULL\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               submd.lsm_oi = loi->loi_oi;
+               submd.lsm_stripe_count = 0;
+               rc = obd_change_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
+                                      &submd, it, data);
+       }
+       RETURN(rc);
+}
+
+/* Find any ldlm lock of the inode in the lov.
+ * Return:  0    no lock found
+ *          1    lock found
+ *        < 0    error */
+static int lov_find_cbdata(struct obd_export *exp,
+                          struct lov_stripe_md *lsm, ldlm_iterator_t it,
+                          void *data)
+{
+       struct lov_obd *lov;
+       int rc = 0, i;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_stripe_md submd;
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+               if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                       CDEBUG(D_HA, "lov idx %d NULL\n", loi->loi_ost_idx);
+                       continue;
+               }
+               submd.lsm_oi = loi->loi_oi;
+               submd.lsm_stripe_count = 0;
+               rc = obd_find_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
+                                    &submd, it, data);
+               if (rc != 0)
+                       RETURN(rc);
+       }
+       RETURN(rc);
+}
+
+static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm,
+                     __u32 mode, struct lustre_handle *lockh)
+{
+       struct lov_request_set *set;
+       struct obd_info oinfo;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       struct lustre_handle *lov_lockhp;
+       int err = 0, rc = 0;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       LASSERT(lockh);
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_cancel_set(exp, &oinfo, lsm, mode, lockh, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+               lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+
+               rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                               req->rq_oi.oi_md, mode, lov_lockhp);
+               rc = lov_update_common_set(set, req, rc);
+               if (rc) {
+                       CERROR("%s: cancel objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi),
+                              POSTID(&req->rq_oi.oi_md->lsm_oi),
+                              req->rq_idx, rc);
+                       err = rc;
+               }
+
+       }
+       lov_fini_cancel_set(set);
+       RETURN(err);
+}
+
+static int lov_cancel_unused(struct obd_export *exp,
+                            struct lov_stripe_md *lsm,
+                            ldlm_cancel_flags_t flags, void *opaque)
+{
+       struct lov_obd *lov;
+       int rc = 0, i;
+       ENTRY;
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
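+       /* Without stripe metadata, cancel unused locks on every configured
+          target instead of just the stripes of one file. */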
+       if (lsm == NULL) {
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       int err;
+                       if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
+                               continue;
+
+                       err = obd_cancel_unused(lov->lov_tgts[i]->ltd_exp, NULL,
+                                               flags, opaque);
+                       if (!rc)
+                               rc = err;
+               }
+               RETURN(rc);
+       }
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_stripe_md submd;
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               int idx = loi->loi_ost_idx;
+               int err;
+
+               if (!lov->lov_tgts[idx]) {
+                       CDEBUG(D_HA, "lov idx %d NULL\n", idx);
+                       continue;
+               }
+
+               if (!lov->lov_tgts[idx]->ltd_active)
+                       CDEBUG(D_HA, "lov idx %d inactive\n", idx);
+
+               submd.lsm_oi = loi->loi_oi;
+               submd.lsm_stripe_count = 0;
+               err = obd_cancel_unused(lov->lov_tgts[idx]->ltd_exp,
+                                       &submd, flags, opaque);
+               if (err && lov->lov_tgts[idx]->ltd_active) {
+                       CERROR("%s: cancel unused objid "DOSTID
+                              " subobj "DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi),
+                              POSTID(&loi->loi_oi), idx, err);
+                       if (!rc)
+                               rc = err;
+               }
+       }
+       RETURN(rc);
+}
+
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       int err;
+       ENTRY;
+
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+
+       err = lov_fini_statfs_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo,
+                           __u64 max_age, struct ptlrpc_request_set *rqset)
+{
+       struct obd_device      *obd = class_exp2obd(exp);
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo != NULL);
+       LASSERT(oinfo->oi_osfs != NULL);
+
+       lov = &obd->u.lov;
+       rc = lov_prep_statfs_set(obd, oinfo, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+               rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                     &req->rq_oi, max_age, rqset);
+               if (rc)
+                       break;
+       }
+
+       if (rc || list_empty(&rqset->set_requests)) {
+               int err;
+               if (rc)
+                       atomic_set(&set->set_completes, 0);
+               err = lov_fini_statfs_set(set);
+               RETURN(rc ? rc : err);
+       }
+
+       LASSERT(rqset->set_interpret == NULL);
+       rqset->set_interpret = lov_statfs_interpret;
+       rqset->set_arg = (void *)set;
+       RETURN(0);
+}
+
+static int lov_statfs(const struct lu_env *env, struct obd_export *exp,
+                     struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+       struct ptlrpc_request_set *set = NULL;
+       struct obd_info oinfo = { { { 0 } } };
+       int rc = 0;
+       ENTRY;
+
+       /* for obdclass we forbid using obd_statfs_rqset, but prefer using async
+        * statfs requests */
+       set = ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       oinfo.oi_osfs = osfs;
+       oinfo.oi_flags = flags;
+       rc = lov_statfs_async(exp, &oinfo, max_age, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+
+       RETURN(rc);
+}
+
+static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                        void *karg, void *uarg)
+{
+       struct obd_device *obddev = class_exp2obd(exp);
+       struct lov_obd *lov = &obddev->u.lov;
+       int i = 0, rc = 0, count = lov->desc.ld_tgt_count;
+       struct obd_uuid *uuidp;
+       ENTRY;
+
+       switch (cmd) {
+       case IOC_OBD_STATFS: {
+               struct obd_ioctl_data *data = karg;
+               struct obd_device *osc_obd;
+               struct obd_statfs stat_buf = {0};
+               __u32 index;
+               __u32 flags;
+
+               memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+               if (index >= count)
+                       RETURN(-ENODEV);
+
+               if (!lov->lov_tgts[index])
+                       /* Try again with the next index */
+                       RETURN(-EAGAIN);
+               if (!lov->lov_tgts[index]->ltd_active)
+                       RETURN(-ENODATA);
+
+               osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp);
+               if (!osc_obd)
+                       RETURN(-EINVAL);
+
+               /* copy UUID */
+               if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd),
+                                    min((int) data->ioc_plen2,
+                                        (int) sizeof(struct obd_uuid))))
+                       RETURN(-EFAULT);
+
+               flags = uarg ? *(__u32*)uarg : 0;
+               /* got statfs data */
+               rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               flags);
+               if (rc)
+                       RETURN(rc);
+               if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+                                    min((int) data->ioc_plen1,
+                                        (int) sizeof(stat_buf))))
+                       RETURN(-EFAULT);
+               break;
+       }
+       case OBD_IOC_LOV_GET_CONFIG: {
+               struct obd_ioctl_data *data;
+               struct lov_desc *desc;
+               char *buf = NULL;
+               __u32 *genp;
+
+               len = 0;
+               if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+                       RETURN(-EINVAL);
+
+               data = (struct obd_ioctl_data *)buf;
+
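+               /* Verify the user-supplied buffers are large enough for the
+                  descriptor, the per-target UUIDs and the generations
+                  before copying anything back. */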
+               if (sizeof(*desc) > data->ioc_inllen1) {
+                       obd_ioctl_freedata(buf, len);
+                       RETURN(-EINVAL);
+               }
+
+               if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) {
+                       obd_ioctl_freedata(buf, len);
+                       RETURN(-EINVAL);
+               }
+
+               if (sizeof(__u32) * count > data->ioc_inllen3) {
+                       obd_ioctl_freedata(buf, len);
+                       RETURN(-EINVAL);
+               }
+
+               desc = (struct lov_desc *)data->ioc_inlbuf1;
+               memcpy(desc, &(lov->desc), sizeof(*desc));
+
+               uuidp = (struct obd_uuid *)data->ioc_inlbuf2;
+               genp = (__u32 *)data->ioc_inlbuf3;
+               /* the uuid will be empty for deleted OSTs */
+               for (i = 0; i < count; i++, uuidp++, genp++) {
+                       if (!lov->lov_tgts[i])
+                               continue;
+                       *uuidp = lov->lov_tgts[i]->ltd_uuid;
+                       *genp = lov->lov_tgts[i]->ltd_gen;
+               }
+
+               if (copy_to_user((void *)uarg, buf, len))
+                       rc = -EFAULT;
+               obd_ioctl_freedata(buf, len);
+               break;
+       }
+       case LL_IOC_LOV_SETSTRIPE:
+               rc = lov_setstripe(exp, len, karg, uarg);
+               break;
+       case LL_IOC_LOV_GETSTRIPE:
+               rc = lov_getstripe(exp, karg, uarg);
+               break;
+       case LL_IOC_LOV_SETEA:
+               rc = lov_setea(exp, karg, uarg);
+               break;
+       case OBD_IOC_QUOTACTL: {
+               struct if_quotactl *qctl = karg;
+               struct lov_tgt_desc *tgt = NULL;
+               struct obd_quotactl *oqctl;
+
+               if (qctl->qc_valid == QC_OSTIDX) {
+                       if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+                               RETURN(-EINVAL);
+
+                       tgt = lov->lov_tgts[qctl->qc_idx];
+                       if (!tgt || !tgt->ltd_exp)
+                               RETURN(-EINVAL);
+               } else if (qctl->qc_valid == QC_UUID) {
+                       for (i = 0; i < count; i++) {
+                               tgt = lov->lov_tgts[i];
+                               if (!tgt ||
+                                   !obd_uuid_equals(&tgt->ltd_uuid,
+                                                    &qctl->obd_uuid))
+                                       continue;
+
+                               if (tgt->ltd_exp == NULL)
+                                       RETURN(-EINVAL);
+
+                               break;
+                       }
+               } else {
+                       RETURN(-EINVAL);
+               }
+
+               if (i >= count)
+                       RETURN(-EAGAIN);
+
+               LASSERT(tgt && tgt->ltd_exp);
+               OBD_ALLOC_PTR(oqctl);
+               if (!oqctl)
+                       RETURN(-ENOMEM);
+
+               QCTL_COPY(oqctl, qctl);
+               rc = obd_quotactl(tgt->ltd_exp, oqctl);
+               if (rc == 0) {
+                       QCTL_COPY(qctl, oqctl);
+                       qctl->qc_valid = QC_OSTIDX;
+                       qctl->obd_uuid = tgt->ltd_uuid;
+               }
+               OBD_FREE_PTR(oqctl);
+               break;
+       }
+       default: {
+               int set = 0;
+
+               if (count == 0)
+                       RETURN(-ENOTTY);
+
+               for (i = 0; i < count; i++) {
+                       int err;
+                       struct obd_device *osc_obd;
+
+                       /* OST was disconnected */
+                       if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
+                               continue;
+
+                       /* ll_umount_begin() sets the force flag on the lov
+                        * but not the OSCs; pass it through. */
+                       osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp);
+                       osc_obd->obd_force = obddev->obd_force;
+                       err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp,
+                                           len, karg, uarg);
+                       if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+                               RETURN(err);
+                       } else if (err) {
+                               if (lov->lov_tgts[i]->ltd_active) {
+                                       CDEBUG(err == -ENOTTY ?
+                                              D_IOCTL : D_WARNING,
+                                              "iocontrol OSC %s on OST "
+                                              "idx %d cmd %x: err = %d\n",
+                                              lov_uuid2str(lov, i),
+                                              i, cmd, err);
+                                       if (!rc)
+                                               rc = err;
+                               }
+                       } else {
+                               set = 1;
+                       }
+               }
+               if (!set && !rc)
+                       rc = -EIO;
+       }
+       }
+
+       RETURN(rc);
+}
+
+#define FIEMAP_BUFFER_SIZE 4096
+
+/**
+ * A non-zero fe_logical in the first extent indicates that this is a
+ * continuation FIEMAP call. The in-OST end offset of the previous call and
+ * the OST device are carried in that first fm_extent, and the stripe on
+ * which mapping is to be restarted is derived from that device index.
+ *
+ * This function returns fm_end_offset, the in-OST offset at which mapping
+ * should be restarted. If fm_end_offset=0 is returned then the caller will
+ * re-calculate the proper offset in the next stripe.
+ * Note that the first extent is passed to lov_get_info via the value field.
+ *
+ * \param fiemap fiemap request header
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe the stripe from which mapping should restart is returned here
+ */
+obd_size fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap,
+                                  struct lov_stripe_md *lsm, obd_size fm_start,
+                                  obd_size fm_end, int *start_stripe)
+{
+       obd_size local_end = fiemap->fm_extents[0].fe_logical;
+       obd_off lun_start, lun_end;
+       obd_size fm_end_offset;
+       int stripe_no = -1, i;
+
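+       /* No extent buffer or a zero fe_logical means this is the first
+          call: start mapping from the beginning of the file. */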
+       if (fiemap->fm_extent_count == 0 ||
+           fiemap->fm_extents[0].fe_logical == 0)
+               return 0;
+
+       /* Find out stripe_no from ost_index saved in the fe_device */
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               if (lsm->lsm_oinfo[i]->loi_ost_idx ==
+                                       fiemap->fm_extents[0].fe_device) {
+                       stripe_no = i;
+                       break;
+               }
+       }
+       if (stripe_no == -1)
+               return -EINVAL;
+
+       /* If we have finished mapping on previous device, shift logical
+        * offset to start of next device */
+       if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end,
+                                  &lun_start, &lun_end)) != 0 &&
+                                  local_end < lun_end) {
+               fm_end_offset = local_end;
+               *start_stripe = stripe_no;
+       } else {
+               /* This is a special value to indicate that caller should
+                * calculate offset in next stripe. */
+               fm_end_offset = 0;
+               *start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count;
+       }
+
+       return fm_end_offset;
+}
+
+/**
+ * Calculate on which OST the mapping will end. If the length of the mapping
+ * is greater than (stripe_size * stripe_count) then the last_stripe is the
+ * one just before start_stripe. Otherwise we check which OSTs the mapping
+ * intersects to find last_stripe.
+ * This function returns the last_stripe and also sets the stripe_count
+ * over which the mapping is spread.
+ *
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe starting stripe of the mapping
+ * \param stripe_count the number of stripes across which to map is returned
+ *
+ * \retval last_stripe return the last stripe of the mapping
+ */
+int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, obd_size fm_start,
+                           obd_size fm_end, int start_stripe,
+                           int *stripe_count)
+{
+       int last_stripe;
+       obd_off obd_start, obd_end;
+       int i, j;
+
+       if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) {
+               last_stripe = (start_stripe < 1 ? lsm->lsm_stripe_count - 1 :
+                                                             start_stripe - 1);
+               *stripe_count = lsm->lsm_stripe_count;
+       } else {
+               for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count;
+                    i = (i + 1) % lsm->lsm_stripe_count, j++) {
+                       if ((lov_stripe_intersects(lsm, i, fm_start, fm_end,
+                                                  &obd_start, &obd_end)) == 0)
+                               break;
+               }
+               *stripe_count = j;
+               last_stripe = (start_stripe + j - 1) % lsm->lsm_stripe_count;
+       }
+
+       return last_stripe;
+}
+
+/**
+ * Set fe_device and copy extents from local buffer into main return buffer.
+ *
+ * \param fiemap fiemap request header
+ * \param lcl_fm_ext array of local fiemap extents to be copied
+ * \param ost_index OST index to be written into the fe_device field for each
+ *                  extent
+ * \param ext_count number of extents to be copied
+ * \param current_extent where to start copying in main extent array
+ */
+void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap,
+                                 struct ll_fiemap_extent *lcl_fm_ext,
+                                 int ost_index, unsigned int ext_count,
+                                 int current_extent)
+{
+       char *to;
+       int ext;
+
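+       /* Tag every extent with the OST index it came from and mark it as
+        * coming from a remote (network) device. */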
+       for (ext = 0; ext < ext_count; ext++) {
+               lcl_fm_ext[ext].fe_device = ost_index;
+               lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET;
+       }
+
+       /* Copy fm_extents from fm_local to the return buffer */
+       to = (char *)fiemap + fiemap_count_to_size(current_extent);
+       memcpy(to, lcl_fm_ext, ext_count * sizeof(struct ll_fiemap_extent));
+}
+
+/**
+ * Break down the FIEMAP request and send appropriate calls to individual OSTs.
+ * This also handles the restarting of FIEMAP calls in case the mapping
+ * overflows the available number of extents in a single call.
+ */
+static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key,
+                     __u32 *vallen, void *val, struct lov_stripe_md *lsm)
+{
+       struct ll_fiemap_info_key *fm_key = key;
+       struct ll_user_fiemap *fiemap = val;
+       struct ll_user_fiemap *fm_local = NULL;
+       struct ll_fiemap_extent *lcl_fm_ext;
+       int count_local;
+       unsigned int get_num_extents = 0;
+       int ost_index = 0, actual_start_stripe, start_stripe;
+       obd_size fm_start, fm_end, fm_length, fm_end_offset;
+       obd_size curr_loc;
+       int current_extent = 0, rc = 0, i;
+       int ost_eof = 0; /* EOF for object */
+       int ost_done = 0; /* done with required mapping for this OST? */
+       int last_stripe;
+       int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count;
+       unsigned int buffer_size = FIEMAP_BUFFER_SIZE;
+
+       if (lsm == NULL)
+               GOTO(out, rc = 0);
+
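+       /* Use a local bounce buffer of at most FIEMAP_BUFFER_SIZE bytes for
+        * the per-OST calls; shrink it if the caller asked for fewer extents. */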
+       if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size)
+               buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count);
+
+       OBD_ALLOC_LARGE(fm_local, buffer_size);
+       if (fm_local == NULL)
+               GOTO(out, rc = -ENOMEM);
+       lcl_fm_ext = &fm_local->fm_extents[0];
+
+       count_local = fiemap_size_to_count(buffer_size);
+
+       memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap));
+       fm_start = fiemap->fm_start;
+       fm_length = fiemap->fm_length;
+       /* Calculate start stripe, last stripe and length of mapping */
+       actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start);
+       fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size :
+                                               fm_start + fm_length - 1);
+       /* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */
+       if (fm_end > fm_key->oa.o_size)
+               fm_end = fm_key->oa.o_size;
+
+       last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end,
+                                           actual_start_stripe, &stripe_count);
+
+       fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start,
+                                                 fm_end, &start_stripe);
+       if (fm_end_offset == -EINVAL)
+               GOTO(out, rc = -EINVAL);
+
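+       /* A zero fm_extent_count means the caller only wants the number of
+        * extents, so don't request any extent data from the OSTs. */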
+       if (fiemap->fm_extent_count == 0) {
+               get_num_extents = 1;
+               count_local = 0;
+       }
+
+       /* Check each stripe */
+       for (cur_stripe = start_stripe, i = 0; i < stripe_count;
+            i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) {
+               obd_size req_fm_len; /* Stores length of required mapping */
+               obd_size len_mapped_single_call;
+               obd_off lun_start, lun_end, obd_object_end;
+               unsigned int ext_count;
+
+               cur_stripe_wrap = cur_stripe;
+
+               /* Find out range of mapping on this stripe */
+               if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end,
+                                          &lun_start, &obd_object_end)) == 0)
+                       continue;
+
+               /* If this is a continuation FIEMAP call and we are on
+                * starting stripe then lun_start needs to be set to
+                * fm_end_offset */
+               if (fm_end_offset != 0 && cur_stripe == start_stripe)
+                       lun_start = fm_end_offset;
+
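+               /* Translate the file-level end of the mapping into an
+                * object-local offset on this stripe; fm_length == ~0ULL
+                * means map to the end of the object. */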
+               if (fm_length != ~0ULL) {
+                       /* Handle fm_start + fm_length overflow */
+                       if (fm_start + fm_length < fm_start)
+                               fm_length = ~0ULL - fm_start;
+                       lun_end = lov_size_to_stripe(lsm, fm_start + fm_length,
+                                                    cur_stripe);
+               } else {
+                       lun_end = ~0ULL;
+               }
+
+               if (lun_start == lun_end)
+                       continue;
+
+               req_fm_len = obd_object_end - lun_start;
+               fm_local->fm_length = 0;
+               len_mapped_single_call = 0;
+
+               /* If the output buffer is very large and the objects have many
+                * extents we may need to loop on a single OST repeatedly */
+               ost_eof = 0;
+               ost_done = 0;
+               do {
+                       if (get_num_extents == 0) {
+                               /* Don't get too many extents. */
+                               if (current_extent + count_local >
+                                   fiemap->fm_extent_count)
+                                       count_local = fiemap->fm_extent_count -
+                                                                current_extent;
+                       }
+
+                       lun_start += len_mapped_single_call;
+                       fm_local->fm_length = req_fm_len - len_mapped_single_call;
+                       req_fm_len = fm_local->fm_length;
+                       fm_local->fm_extent_count = count_local;
+                       fm_local->fm_mapped_extents = 0;
+                       fm_local->fm_flags = fiemap->fm_flags;
+
+                       fm_key->oa.o_oi = lsm->lsm_oinfo[cur_stripe]->loi_oi;
+                       ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx;
+
+                       if (ost_index < 0 ||
+                           ost_index >= lov->desc.ld_tgt_count)
+                               GOTO(out, rc = -EINVAL);
+
+                       /* If OST is inactive, return extent with UNKNOWN flag */
+                       if (!lov->lov_tgts[ost_index]->ltd_active) {
+                               fm_local->fm_flags |= FIEMAP_EXTENT_LAST;
+                               fm_local->fm_mapped_extents = 1;
+
+                               lcl_fm_ext[0].fe_logical = lun_start;
+                               lcl_fm_ext[0].fe_length = obd_object_end -
+                                                                     lun_start;
+                               lcl_fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN;
+
+                               goto inactive_tgt;
+                       }
+
+                       fm_local->fm_start = lun_start;
+                       fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
+                       memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local));
+                       *vallen = fiemap_count_to_size(fm_local->fm_extent_count);
+                       rc = obd_get_info(NULL,
+                                         lov->lov_tgts[ost_index]->ltd_exp,
+                                         keylen, key, vallen, fm_local, lsm);
+                       if (rc != 0)
+                               GOTO(out, rc);
+
+inactive_tgt:
+                       ext_count = fm_local->fm_mapped_extents;
+                       if (ext_count == 0) {
+                               ost_done = 1;
+                               /* If last stripe has hole at the end,
+                                * then we need to return */
+                               if (cur_stripe_wrap == last_stripe) {
+                                       fiemap->fm_mapped_extents = 0;
+                                       goto finish;
+                               }
+                               break;
+                       }
+
+                       /* If we just need the number of extents then go to
+                        * the next device */
+                       if (get_num_extents) {
+                               current_extent += ext_count;
+                               break;
+                       }
+
+                       len_mapped_single_call = lcl_fm_ext[ext_count-1].fe_logical -
+                                 lun_start + lcl_fm_ext[ext_count - 1].fe_length;
+
+                       /* Have we finished mapping on this device? */
+                       if (req_fm_len <= len_mapped_single_call)
+                               ost_done = 1;
+
+                       /* Clear the EXTENT_LAST flag which can be present on
+                        * last extent */
+                       if (lcl_fm_ext[ext_count-1].fe_flags & FIEMAP_EXTENT_LAST)
+                               lcl_fm_ext[ext_count - 1].fe_flags &=
+                                                           ~FIEMAP_EXTENT_LAST;
+
+                       curr_loc = lov_stripe_size(lsm,
+                                          lcl_fm_ext[ext_count - 1].fe_logical+
+                                          lcl_fm_ext[ext_count - 1].fe_length,
+                                          cur_stripe);
+                       if (curr_loc >= fm_key->oa.o_size)
+                               ost_eof = 1;
+
+                       fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext,
+                                                    ost_index, ext_count,
+                                                    current_extent);
+
+                       current_extent += ext_count;
+
+                       /* Ran out of available extents? */
+                       if (current_extent >= fiemap->fm_extent_count)
+                               goto finish;
+               } while (ost_done == 0 && ost_eof == 0);
+
+               if (cur_stripe_wrap == last_stripe)
+                       goto finish;
+       }
+
+finish:
+       /* Indicate that we are returning device offsets unless the file has
+        * just a single stripe */
+       if (lsm->lsm_stripe_count > 1)
+               fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER;
+
+       if (get_num_extents)
+               goto skip_last_device_calc;
+
+       /* Check if we have reached the last stripe and whether mapping for that
+        * stripe is done. */
+       if (cur_stripe_wrap == last_stripe) {
+               if (ost_done || ost_eof)
+                       fiemap->fm_extents[current_extent - 1].fe_flags |=
+                                                            FIEMAP_EXTENT_LAST;
+       }
+
+skip_last_device_calc:
+       fiemap->fm_mapped_extents = current_extent;
+
+out:
+       OBD_FREE_LARGE(fm_local, buffer_size);
+       return rc;
+}
+
+static int lov_get_info(const struct lu_env *env, struct obd_export *exp,
+                       __u32 keylen, void *key, __u32 *vallen, void *val,
+                       struct lov_stripe_md *lsm)
+{
+       struct obd_device *obddev = class_exp2obd(exp);
+       struct lov_obd *lov = &obddev->u.lov;
+       int i, rc;
+       ENTRY;
+
+       if (!vallen || !val)
+               RETURN(-EFAULT);
+
+       obd_getref(obddev);
+
+       if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+               struct {
+                       char name[16];
+                       struct ldlm_lock *lock;
+               } *data = key;
+               struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name;
+               struct lov_oinfo *loi;
+               __u32 *stripe = val;
+
+               if (*vallen < sizeof(*stripe))
+                       GOTO(out, rc = -EFAULT);
+               *vallen = sizeof(*stripe);
+
+               /* XXX This is another one of those bits that will need to
+                * change if we ever actually support nested LOVs.  It uses
+                * the lock's export to find out which stripe it is. */
+               /* XXX - it's assumed all the locks for deleted OSTs have
+                * been cancelled. Also, the export for deleted OSTs will
+                * be NULL and won't match the lock's export. */
+               for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                       loi = lsm->lsm_oinfo[i];
+                       if (!lov->lov_tgts[loi->loi_ost_idx])
+                               continue;
+                       if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp ==
+                           data->lock->l_conn_export &&
+                           ostid_res_name_eq(&loi->loi_oi, res_id)) {
+                               *stripe = i;
+                               GOTO(out, rc = 0);
+                       }
+               }
+               LDLM_ERROR(data->lock, "lock on inode without such object");
+               dump_lsm(D_ERROR, lsm);
+               GOTO(out, rc = -ENXIO);
+       } else if (KEY_IS(KEY_LAST_ID)) {
+               struct obd_id_info *info = val;
+               __u32 size = sizeof(obd_id);
+               struct lov_tgt_desc *tgt;
+
+               LASSERT(*vallen == sizeof(struct obd_id_info));
+               tgt = lov->lov_tgts[info->idx];
+
+               if (!tgt || !tgt->ltd_active)
+                       GOTO(out, rc = -ESRCH);
+
+               rc = obd_get_info(env, tgt->ltd_exp, keylen, key,
+                                 &size, info->data, NULL);
+               GOTO(out, rc = 0);
+       } else if (KEY_IS(KEY_LOVDESC)) {
+               struct lov_desc *desc_ret = val;
+               *desc_ret = lov->desc;
+
+               GOTO(out, rc = 0);
+       } else if (KEY_IS(KEY_FIEMAP)) {
+               rc = lov_fiemap(lov, keylen, key, vallen, val, lsm);
+               GOTO(out, rc);
+       } else if (KEY_IS(KEY_CONNECT_FLAG)) {
+               struct lov_tgt_desc *tgt;
+               __u64 ost_idx = *((__u64*)val);
+
+               LASSERT(*vallen == sizeof(__u64));
+               LASSERT(ost_idx < lov->desc.ld_tgt_count);
+               tgt = lov->lov_tgts[ost_idx];
+
+               if (!tgt || !tgt->ltd_exp)
+                       GOTO(out, rc = -ESRCH);
+
+               *((__u64 *)val) = exp_connect_flags(tgt->ltd_exp);
+               GOTO(out, rc = 0);
+       } else if (KEY_IS(KEY_TGT_COUNT)) {
+               *((int *)val) = lov->desc.ld_tgt_count;
+               GOTO(out, rc = 0);
+       }
+
+       rc = -EINVAL;
+
+out:
+       obd_putref(obddev);
+       RETURN(rc);
+}
+
+static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                             obd_count keylen, void *key, obd_count vallen,
+                             void *val, struct ptlrpc_request_set *set)
+{
+       struct obd_device *obddev = class_exp2obd(exp);
+       struct lov_obd *lov = &obddev->u.lov;
+       obd_count count;
+       int i, rc = 0, err;
+       struct lov_tgt_desc *tgt;
+       unsigned incr, check_uuid,
+                do_inactive, no_set;
+       unsigned next_id = 0,  mds_con = 0, capa = 0;
+       ENTRY;
+
+       incr = check_uuid = do_inactive = no_set = 0;
+       if (set == NULL) {
+               no_set = 1;
+               set = ptlrpc_prep_set();
+               if (!set)
+                       RETURN(-ENOMEM);
+       }
+
+       obd_getref(obddev);
+       count = lov->desc.ld_tgt_count;
+
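+       /* Per-key setup: decide how 'val' advances between targets (incr),
+        * whether inactive OSTs still receive the update (do_inactive), and
+        * how the value is interpreted in the loop below (next_id, mds_con,
+        * capa). */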
+       if (KEY_IS(KEY_NEXT_ID)) {
+               count = vallen / sizeof(struct obd_id_info);
+               vallen = sizeof(obd_id);
+               incr = sizeof(struct obd_id_info);
+               do_inactive = 1;
+               next_id = 1;
+       } else if (KEY_IS(KEY_CHECKSUM)) {
+               do_inactive = 1;
+       } else if (KEY_IS(KEY_EVICT_BY_NID)) {
+               /* use defaults:  do_inactive = incr = 0; */
+       } else if (KEY_IS(KEY_MDS_CONN)) {
+               mds_con = 1;
+       } else if (KEY_IS(KEY_CAPA_KEY)) {
+               capa = 1;
+       } else if (KEY_IS(KEY_CACHE_SET)) {
+               LASSERT(lov->lov_cache == NULL);
+               lov->lov_cache = val;
+               do_inactive = 1;
+       }
+
+       for (i = 0; i < count; i++, val = (char *)val + incr) {
+               if (next_id) {
+                       tgt = lov->lov_tgts[((struct obd_id_info*)val)->idx];
+               } else {
+                       tgt = lov->lov_tgts[i];
+               }
+               /* OST was disconnected */
+               if (!tgt || !tgt->ltd_exp)
+                       continue;
+
+               /* OST is inactive and we don't want inactive OSCs */
+               if (!tgt->ltd_active && !do_inactive)
+                       continue;
+
+               if (mds_con) {
+                       struct mds_group_info *mgi;
+
+                       LASSERT(vallen == sizeof(*mgi));
+                       mgi = (struct mds_group_info *)val;
+
+                       /* Only want a specific OSC */
+                       if (mgi->uuid && !obd_uuid_equals(mgi->uuid,
+                                               &tgt->ltd_uuid))
+                               continue;
+
+                       err = obd_set_info_async(env, tgt->ltd_exp,
+                                        keylen, key, sizeof(int),
+                                        &mgi->group, set);
+               } else if (next_id) {
+                       err = obd_set_info_async(env, tgt->ltd_exp,
+                                        keylen, key, vallen,
+                                        ((struct obd_id_info*)val)->data, set);
+               } else if (capa) {
+                       struct mds_capa_info *info = (struct mds_capa_info*)val;
+
+                       LASSERT(vallen == sizeof(*info));
+
+                        /* Only want a specific OSC */
+                       if (info->uuid &&
+                           !obd_uuid_equals(info->uuid, &tgt->ltd_uuid))
+                               continue;
+
+                       err = obd_set_info_async(env, tgt->ltd_exp, keylen,
+                                                key, sizeof(*info->capa),
+                                                info->capa, set);
+               } else {
+                       /* Only want a specific OSC */
+                       if (check_uuid &&
+                           !obd_uuid_equals(val, &tgt->ltd_uuid))
+                               continue;
+
+                       err = obd_set_info_async(env, tgt->ltd_exp,
+                                        keylen, key, vallen, val, set);
+               }
+
+               if (!rc)
+                       rc = err;
+       }
+
+       obd_putref(obddev);
+       if (no_set) {
+               err = ptlrpc_set_wait(set);
+               if (!rc)
+                       rc = err;
+               ptlrpc_set_destroy(set);
+       }
+       RETURN(rc);
+}
+
+static int lov_extent_calc(struct obd_export *exp, struct lov_stripe_md *lsm,
+                          int cmd, __u64 *offset)
+{
+       __u32 ssize = lsm->lsm_stripe_size;
+       __u64 start;
+
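+       /* Round *offset down to the start of its stripe-sized chunk; the
+        * chunk covers [start, start + ssize - 1]. */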
+       start = *offset;
+       lov_do_div64(start, ssize);
+       start = start * ssize;
+
+       CDEBUG(D_DLMTRACE, "offset "LPU64", stripe %u, start "LPU64
+                          ", end "LPU64"\n", *offset, ssize, start,
+                          start + ssize - 1);
+       if (cmd == OBD_CALC_STRIPE_END) {
+               *offset = start + ssize - 1;
+       } else if (cmd == OBD_CALC_STRIPE_START) {
+               *offset = start;
+       } else {
+               LBUG();
+       }
+
+       RETURN(0);
+}
+
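+/* lsm_lock protects the stripe data in a lov_stripe_md; lsm_lock_owner
+ * records the owning pid so the LASSERTs below catch recursive or
+ * mismatched locking. */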
+void lov_stripe_lock(struct lov_stripe_md *md)
+{
+       LASSERT(md->lsm_lock_owner != current_pid());
+       spin_lock(&md->lsm_lock);
+       LASSERT(md->lsm_lock_owner == 0);
+       md->lsm_lock_owner = current_pid();
+}
+EXPORT_SYMBOL(lov_stripe_lock);
+
+void lov_stripe_unlock(struct lov_stripe_md *md)
+{
+       LASSERT(md->lsm_lock_owner == current_pid());
+       md->lsm_lock_owner = 0;
+       spin_unlock(&md->lsm_lock);
+}
+EXPORT_SYMBOL(lov_stripe_unlock);
+
+static int lov_quotactl(struct obd_device *obd, struct obd_export *exp,
+                       struct obd_quotactl *oqctl)
+{
+       struct lov_obd      *lov = &obd->u.lov;
+       struct lov_tgt_desc *tgt;
+       __u64           curspace = 0;
+       __u64           bhardlimit = 0;
+       int               i, rc = 0;
+       ENTRY;
+
+       if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON &&
+           oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF &&
+           oqctl->qc_cmd != Q_GETOQUOTA &&
+           oqctl->qc_cmd != Q_INITQUOTA &&
+           oqctl->qc_cmd != LUSTRE_Q_SETQUOTA &&
+           oqctl->qc_cmd != Q_FINVALIDATE) {
+               CERROR("bad quota opc %x for lov obd\n", oqctl->qc_cmd);
+               RETURN(-EFAULT);
+       }
+
+       /* for lov tgt */
+       obd_getref(obd);
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               int err;
+
+               tgt = lov->lov_tgts[i];
+
+               if (!tgt)
+                       continue;
+
+               if (!tgt->ltd_active || tgt->ltd_reap) {
+                       if (oqctl->qc_cmd == Q_GETOQUOTA &&
+                           lov->lov_tgts[i]->ltd_activate) {
+                               rc = -EREMOTEIO;
+                               CERROR("ost %d is inactive\n", i);
+                       } else {
+                               CDEBUG(D_HA, "ost %d is inactive\n", i);
+                       }
+                       continue;
+               }
+
+               err = obd_quotactl(tgt->ltd_exp, oqctl);
+               if (err) {
+                       if (tgt->ltd_active && !rc)
+                               rc = err;
+                       continue;
+               }
+
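+               /* For Q_GETOQUOTA, accumulate usage and hard limits across
+                * all OSTs; the totals are copied back into oqctl below. */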
+               if (oqctl->qc_cmd == Q_GETOQUOTA) {
+                       curspace += oqctl->qc_dqblk.dqb_curspace;
+                       bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit;
+               }
+       }
+       obd_putref(obd);
+
+       if (oqctl->qc_cmd == Q_GETOQUOTA) {
+               oqctl->qc_dqblk.dqb_curspace = curspace;
+               oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit;
+       }
+       RETURN(rc);
+}
+
+static int lov_quotacheck(struct obd_device *obd, struct obd_export *exp,
+                         struct obd_quotactl *oqctl)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       int          i, rc = 0;
+       ENTRY;
+
+       obd_getref(obd);
+
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               if (!lov->lov_tgts[i])
+                       continue;
+
+               /* Skip quota check on the administratively disabled OSTs. */
+               if (!lov->lov_tgts[i]->ltd_activate) {
+                       CWARN("lov idx %d was administratively disabled, "
+                             "skipping quotacheck on it.\n", i);
+                       continue;
+               }
+
+               if (!lov->lov_tgts[i]->ltd_active) {
+                       CERROR("lov idx %d inactive\n", i);
+                       rc = -EIO;
+                       goto out;
+               }
+       }
+
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               int err;
+
+               if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_activate)
+                       continue;
+
+               err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl);
+               if (err && !rc)
+                       rc = err;
+       }
+
+out:
+       obd_putref(obd);
+
+       RETURN(rc);
+}
+
+struct obd_ops lov_obd_ops = {
+       .o_owner                = THIS_MODULE,
+       .o_setup                = lov_setup,
+       .o_precleanup           = lov_precleanup,
+       .o_cleanup              = lov_cleanup,
+       //.o_process_config     = lov_process_config,
+       .o_connect              = lov_connect,
+       .o_disconnect           = lov_disconnect,
+       .o_statfs               = lov_statfs,
+       .o_statfs_async         = lov_statfs_async,
+       .o_packmd               = lov_packmd,
+       .o_unpackmd             = lov_unpackmd,
+       .o_create               = lov_create,
+       .o_destroy              = lov_destroy,
+       .o_getattr              = lov_getattr,
+       .o_getattr_async        = lov_getattr_async,
+       .o_setattr              = lov_setattr,
+       .o_setattr_async        = lov_setattr_async,
+       .o_brw                  = lov_brw,
+       .o_merge_lvb            = lov_merge_lvb,
+       .o_adjust_kms           = lov_adjust_kms,
+       .o_punch                = lov_punch,
+       .o_sync                 = lov_sync,
+       .o_enqueue              = lov_enqueue,
+       .o_change_cbdata        = lov_change_cbdata,
+       .o_find_cbdata          = lov_find_cbdata,
+       .o_cancel               = lov_cancel,
+       .o_cancel_unused        = lov_cancel_unused,
+       .o_iocontrol            = lov_iocontrol,
+       .o_get_info             = lov_get_info,
+       .o_set_info_async       = lov_set_info_async,
+       .o_extent_calc          = lov_extent_calc,
+       .o_llog_init            = lov_llog_init,
+       .o_llog_finish          = lov_llog_finish,
+       .o_notify               = lov_notify,
+       .o_pool_new             = lov_pool_new,
+       .o_pool_rem             = lov_pool_remove,
+       .o_pool_add             = lov_pool_add,
+       .o_pool_del             = lov_pool_del,
+       .o_getref               = lov_getref,
+       .o_putref               = lov_putref,
+       .o_quotactl             = lov_quotactl,
+       .o_quotacheck           = lov_quotacheck,
+};
+
+struct kmem_cache *lov_oinfo_slab;
+
+extern struct lu_kmem_descr lov_caches[];
+
+int __init lov_init(void)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc;
+       ENTRY;
+
+       /* print an address of _any_ initialized kernel symbol from this
+        * module, to allow debugging with gdb that doesn't support data
+        * symbols from modules. */
+       CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches);
+
+       rc = lu_kmem_init(lov_caches);
+       if (rc)
+               return rc;
+
+       lov_oinfo_slab = kmem_cache_create("lov_oinfo",
+                                             sizeof(struct lov_oinfo),
+                                             0, SLAB_HWCACHE_ALIGN, NULL);
+       if (lov_oinfo_slab == NULL) {
+               lu_kmem_fini(lov_caches);
+               return -ENOMEM;
+       }
+       lprocfs_lov_init_vars(&lvars);
+
+       rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars,
+                                LUSTRE_LOV_NAME, &lov_device_type);
+
+       if (rc) {
+               kmem_cache_destroy(lov_oinfo_slab);
+               lu_kmem_fini(lov_caches);
+       }
+
+       RETURN(rc);
+}
+
+static void /*__exit*/ lov_exit(void)
+{
+       class_unregister_type(LUSTRE_LOV_NAME);
+       kmem_cache_destroy(lov_oinfo_slab);
+
+       lu_kmem_fini(lov_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver");
+MODULE_LICENSE("GPL");
+
+cfs_module(lov, LUSTRE_VERSION_STRING, lov_init, lov_exit);
diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c
new file mode 100644 (file)
index 0000000..aa8ae80
--- /dev/null
@@ -0,0 +1,942 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+#include <lustre_debug.h>
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Layout operations.
+ *
+ */
+
+struct lov_layout_operations {
+       int (*llo_init)(const struct lu_env *env, struct lov_device *dev,
+                       struct lov_object *lov,
+                       const struct cl_object_conf *conf,
+                       union lov_layout_state *state);
+       int (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
+                          union lov_layout_state *state);
+       void (*llo_fini)(const struct lu_env *env, struct lov_object *lov,
+                        union lov_layout_state *state);
+       void (*llo_install)(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state);
+       int  (*llo_print)(const struct lu_env *env, void *cookie,
+                         lu_printer_t p, const struct lu_object *o);
+       int  (*llo_page_init)(const struct lu_env *env, struct cl_object *obj,
+                               struct cl_page *page, struct page *vmpage);
+       int  (*llo_lock_init)(const struct lu_env *env,
+                             struct cl_object *obj, struct cl_lock *lock,
+                             const struct cl_io *io);
+       int  (*llo_io_init)(const struct lu_env *env,
+                           struct cl_object *obj, struct cl_io *io);
+       int  (*llo_getattr)(const struct lu_env *env, struct cl_object *obj,
+                           struct cl_attr *attr);
+};
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov);
+
+/*****************************************************************************
+ *
+ * Lov object layout operations.
+ *
+ */
+
+static void lov_install_empty(const struct lu_env *env,
+                             struct lov_object *lov,
+                             union  lov_layout_state *state)
+{
+       /*
+        * File without objects.
+        */
+}
+
+static int lov_init_empty(const struct lu_env *env,
+                         struct lov_device *dev, struct lov_object *lov,
+                         const struct cl_object_conf *conf,
+                         union  lov_layout_state *state)
+{
+       return 0;
+}
+
+static void lov_install_raid0(const struct lu_env *env,
+                             struct lov_object *lov,
+                             union  lov_layout_state *state)
+{
+}
+
+static struct cl_object *lov_sub_find(const struct lu_env *env,
+                                     struct cl_device *dev,
+                                     const struct lu_fid *fid,
+                                     const struct cl_object_conf *conf)
+{
+       struct lu_object *o;
+
+       ENTRY;
+       o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
+       LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
+       RETURN(lu2cl(o));
+}
+
+static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
+                       struct cl_object *stripe,
+                       struct lov_layout_raid0 *r0, int idx)
+{
+       struct cl_object_header *hdr;
+       struct cl_object_header *subhdr;
+       struct cl_object_header *parent;
+       struct lov_oinfo        *oinfo;
+       int result;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) {
+               /* For sanity:test_206.
+                * Do not leave the object in cache to avoid accessing
+                * freed memory. This is because osc_object is referring to
+                * lov_oinfo of lsm_stripe_data which will be freed due to
+                * this failure. */
+               cl_object_kill(env, stripe);
+               cl_object_put(env, stripe);
+               return -EIO;
+       }
+
+       hdr    = cl_object_header(lov2cl(lov));
+       subhdr = cl_object_header(stripe);
+       parent = subhdr->coh_parent;
+
+       oinfo = lov->lo_lsm->lsm_oinfo[idx];
+       CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: ostid: "DOSTID
+              " idx: %d gen: %d\n",
+              PFID(&subhdr->coh_lu.loh_fid), subhdr, idx,
+              PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi),
+              oinfo->loi_ost_idx, oinfo->loi_ost_gen);
+
+       if (parent == NULL) {
+               subhdr->coh_parent = hdr;
+               subhdr->coh_nesting = hdr->coh_nesting + 1;
+               lu_object_ref_add(&stripe->co_lu, "lov-parent", lov);
+               r0->lo_sub[idx] = cl2lovsub(stripe);
+               r0->lo_sub[idx]->lso_super = lov;
+               r0->lo_sub[idx]->lso_index = idx;
+               result = 0;
+       } else {
+               struct lu_object  *old_obj;
+               struct lov_object *old_lov;
+               unsigned int mask = D_INODE;
+
+               old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
+               LASSERT(old_obj != NULL);
+               old_lov = cl2lov(lu2cl(old_obj));
+               if (old_lov->lo_layout_invalid) {
+                       /* the object's layout has already changed but isn't
+                        * refreshed */
+                       lu_object_unhash(env, &stripe->co_lu);
+                       result = -EAGAIN;
+               } else {
+                       mask = D_ERROR;
+                       result = -EIO;
+               }
+
+               LU_OBJECT_DEBUG(mask, env, &stripe->co_lu,
+                               "stripe %d is already owned.\n", idx);
+               LU_OBJECT_DEBUG(mask, env, old_obj, "owned.\n");
+               LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n");
+               cl_object_put(env, stripe);
+       }
+       return result;
+}
+
+static int lov_init_raid0(const struct lu_env *env,
+                         struct lov_device *dev, struct lov_object *lov,
+                         const struct cl_object_conf *conf,
+                         union  lov_layout_state *state)
+{
+       int result;
+       int i;
+
+       struct cl_object        *stripe;
+       struct lov_thread_info  *lti     = lov_env_info(env);
+       struct cl_object_conf   *subconf = &lti->lti_stripe_conf;
+       struct lov_stripe_md    *lsm     = conf->u.coc_md->lsm;
+       struct lu_fid      *ofid    = &lti->lti_fid;
+       struct lov_layout_raid0 *r0      = &state->raid0;
+
+       ENTRY;
+
+       if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) {
+               dump_lsm(D_ERROR, lsm);
+               LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n",
+                        LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic);
+       }
+
+       LASSERT(lov->lo_lsm == NULL);
+       lov->lo_lsm = lsm_addref(lsm);
+       r0->lo_nr  = lsm->lsm_stripe_count;
+       LASSERT(r0->lo_nr <= lov_targets_nr(dev));
+
+       OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+       if (r0->lo_sub != NULL) {
+               result = 0;
+               subconf->coc_inode = conf->coc_inode;
+               spin_lock_init(&r0->lo_sub_lock);
+               /*
+                * Create stripe cl_objects.
+                */
+               for (i = 0; i < r0->lo_nr && result == 0; ++i) {
+                       struct cl_device *subdev;
+                       struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
+                       int ost_idx = oinfo->loi_ost_idx;
+
+                       result = ostid_to_fid(ofid, &oinfo->loi_oi,
+                                             oinfo->loi_ost_idx);
+                       if (result != 0)
+                               GOTO(out, result);
+
+                       subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
+                       subconf->u.coc_oinfo = oinfo;
+                       LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx);
+                       /* In the function below, .hs_keycmp resolves to
+                        * lu_obj_hop_keycmp() */
+                       /* coverity[overrun-buffer-val] */
+                       stripe = lov_sub_find(env, subdev, ofid, subconf);
+                       if (!IS_ERR(stripe)) {
+                               result = lov_init_sub(env, lov, stripe, r0, i);
+                               if (result == -EAGAIN) { /* try again */
+                                       --i;
+                                       result = 0;
+                               }
+                       } else {
+                               result = PTR_ERR(stripe);
+                       }
+               }
+       } else
+               result = -ENOMEM;
+out:
+       RETURN(result);
+}
+
+static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
+{
+       LASSERT(lov->lo_type == LLT_EMPTY);
+
+       lov_layout_wait(env, lov);
+
+       cl_object_prune(env, &lov->lo_cl);
+       return 0;
+}
+
+static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
+                              struct lovsub_object *los, int idx)
+{
+       struct cl_object        *sub;
+       struct lov_layout_raid0 *r0;
+       struct lu_site    *site;
+       struct lu_site_bkt_data *bkt;
+       wait_queue_t      *waiter;
+
+       r0  = &lov->u.raid0;
+       LASSERT(r0->lo_sub[idx] == los);
+
+       sub  = lovsub2cl(los);
+       site = sub->co_lu.lo_dev->ld_site;
+       bkt  = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid);
+
+       cl_object_kill(env, sub);
+       /* release a reference to the sub-object and ... */
+       lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
+       cl_object_put(env, sub);
+
+       /* ... wait until it is actually destroyed---sub-object clears its
+        * ->lo_sub[] slot in lovsub_object_fini() */
+       if (r0->lo_sub[idx] == los) {
+               waiter = &lov_env_info(env)->lti_waiter;
+               init_waitqueue_entry_current(waiter);
+               add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               while (1) {
+                       /* this wait-queue is signaled at the end of
+                        * lu_object_free(). */
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       spin_lock(&r0->lo_sub_lock);
+                       if (r0->lo_sub[idx] == los) {
+                               spin_unlock(&r0->lo_sub_lock);
+                               waitq_wait(waiter, TASK_UNINTERRUPTIBLE);
+                       } else {
+                               spin_unlock(&r0->lo_sub_lock);
+                               set_current_state(TASK_RUNNING);
+                               break;
+                       }
+               }
+               remove_wait_queue(&bkt->lsb_marche_funebre, waiter);
+       }
+       LASSERT(r0->lo_sub[idx] == NULL);
+}
+
+static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
+{
+       struct lov_layout_raid0 *r0 = &state->raid0;
+       struct lov_stripe_md    *lsm = lov->lo_lsm;
+       int i;
+
+       ENTRY;
+
+       dump_lsm(D_INODE, lsm);
+
+       lov_layout_wait(env, lov);
+       if (r0->lo_sub != NULL) {
+               for (i = 0; i < r0->lo_nr; ++i) {
+                       struct lovsub_object *los = r0->lo_sub[i];
+
+                       if (los != NULL) {
+                               cl_locks_prune(env, &los->lso_cl, 1);
+                               /*
+                                * If top-level object is to be evicted from
+                                * the cache, so are its sub-objects.
+                                */
+                               lov_subobject_kill(env, lov, los, i);
+                       }
+               }
+       }
+       cl_object_prune(env, &lov->lo_cl);
+       RETURN(0);
+}
+
+static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
+                          union lov_layout_state *state)
+{
+       LASSERT(lov->lo_type == LLT_EMPTY);
+}
+
+static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov,
+                          union lov_layout_state *state)
+{
+       struct lov_layout_raid0 *r0 = &state->raid0;
+       ENTRY;
+
+       if (r0->lo_sub != NULL) {
+               OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+               r0->lo_sub = NULL;
+       }
+
+       dump_lsm(D_INODE, lov->lo_lsm);
+       lov_free_memmd(&lov->lo_lsm);
+
+       EXIT;
+}
+
+static int lov_print_empty(const struct lu_env *env, void *cookie,
+                          lu_printer_t p, const struct lu_object *o)
+{
+       (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+       return 0;
+}
+
+static int lov_print_raid0(const struct lu_env *env, void *cookie,
+                          lu_printer_t p, const struct lu_object *o)
+{
+       struct lov_object       *lov = lu2lov(o);
+       struct lov_layout_raid0 *r0  = lov_r0(lov);
+       struct lov_stripe_md    *lsm = lov->lo_lsm;
+       int i;
+
+       (*p)(env, cookie, "stripes: %d, %svalid, lsm{%p 0x%08X %d %u %u}: \n",
+               r0->lo_nr, lov->lo_layout_invalid ? "in" : "", lsm,
+               lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+               lsm->lsm_stripe_count, lsm->lsm_layout_gen);
+       for (i = 0; i < r0->lo_nr; ++i) {
+               struct lu_object *sub;
+
+               if (r0->lo_sub[i] != NULL) {
+                       sub = lovsub2lu(r0->lo_sub[i]);
+                       lu_object_print(env, cookie, p, sub);
+               } else
+                       (*p)(env, cookie, "sub %d absent\n", i);
+       }
+       return 0;
+}
+
+/**
+ * Implements cl_object_operations::coo_attr_get() method for an object
+ * without stripes (LLT_EMPTY layout type).
+ *
+ * The only attribute this layer is authoritative for in this case is
+ * cl_attr::cat_blocks---it is 0.
+ */
+static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj,
+                             struct cl_attr *attr)
+{
+       attr->cat_blocks = 0;
+       return 0;
+}
+
+static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj,
+                             struct cl_attr *attr)
+{
+       struct lov_object       *lov = cl2lov(obj);
+       struct lov_layout_raid0 *r0 = lov_r0(lov);
+       struct cl_attr          *lov_attr = &r0->lo_attr;
+       int                      result = 0;
+
+       ENTRY;
+
+       /* This is called without holding the type guard mutex, so it must be
+        * inside an ongoing IO, otherwise lsm may be replaced.
+        * LU-2117: it turns out there is one exception. For mmapped files,
+        * locks on those files may be requested in another file's IO context,
+        * and this function is then called from ccc_lock_state(), which would
+        * hit this assertion.
+        * Anyway, it is still okay to call attr_get without the type guard,
+        * as the layout cannot change while locks exist. */
+       /* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */
+
+       if (!r0->lo_attr_valid) {
+               struct lov_stripe_md    *lsm = lov->lo_lsm;
+               struct ost_lvb    *lvb = &lov_env_info(env)->lti_lvb;
+               __u64               kms = 0;
+
+               memset(lvb, 0, sizeof(*lvb));
+               /* XXX: timestamps can be negative by sanity:test_39m,
+                * how can it be? */
+               lvb->lvb_atime = LLONG_MIN;
+               lvb->lvb_ctime = LLONG_MIN;
+               lvb->lvb_mtime = LLONG_MIN;
+
+               /*
+                * XXX that should be replaced with a loop over sub-objects,
+                * doing cl_object_attr_get() on them. But for now, let's
+                * reuse old lov code.
+                */
+
+               /*
+                * XXX take lsm spin-lock to keep lov_merge_lvb_kms()
+                * happy. It's not needed, because new code uses
+                * ->coh_attr_guard spin-lock to protect consistency of
+                * sub-object attributes.
+                */
+               lov_stripe_lock(lsm);
+               result = lov_merge_lvb_kms(lsm, lvb, &kms);
+               lov_stripe_unlock(lsm);
+               if (result == 0) {
+                       cl_lvb2attr(lov_attr, lvb);
+                       lov_attr->cat_kms = kms;
+                       r0->lo_attr_valid = 1;
+               }
+       }
+       if (result == 0) { /* merge results */
+               attr->cat_blocks = lov_attr->cat_blocks;
+               attr->cat_size = lov_attr->cat_size;
+               attr->cat_kms = lov_attr->cat_kms;
+               if (attr->cat_atime < lov_attr->cat_atime)
+                       attr->cat_atime = lov_attr->cat_atime;
+               if (attr->cat_ctime < lov_attr->cat_ctime)
+                       attr->cat_ctime = lov_attr->cat_ctime;
+               if (attr->cat_mtime < lov_attr->cat_mtime)
+                       attr->cat_mtime = lov_attr->cat_mtime;
+       }
+       RETURN(result);
+}
+
+static const struct lov_layout_operations lov_dispatch[] = {
+       [LLT_EMPTY] = {
+               .llo_init      = lov_init_empty,
+               .llo_delete    = lov_delete_empty,
+               .llo_fini      = lov_fini_empty,
+               .llo_install   = lov_install_empty,
+               .llo_print     = lov_print_empty,
+               .llo_page_init = lov_page_init_empty,
+               .llo_lock_init = lov_lock_init_empty,
+               .llo_io_init   = lov_io_init_empty,
+               .llo_getattr   = lov_attr_get_empty
+       },
+       [LLT_RAID0] = {
+               .llo_init      = lov_init_raid0,
+               .llo_delete    = lov_delete_raid0,
+               .llo_fini      = lov_fini_raid0,
+               .llo_install   = lov_install_raid0,
+               .llo_print     = lov_print_raid0,
+               .llo_page_init = lov_page_init_raid0,
+               .llo_lock_init = lov_lock_init_raid0,
+               .llo_io_init   = lov_io_init_raid0,
+               .llo_getattr   = lov_attr_get_raid0
+       }
+};
+
+
+/**
+ * Performs a double-dispatch based on the layout type of an object.
+ */
+#define LOV_2DISPATCH_NOLOCK(obj, op, ...)                           \
+({                                                                   \
+       struct lov_object                     *__obj = (obj);     \
+       enum lov_layout_type                __llt;                \
+                                                                       \
+       __llt = __obj->lo_type;                                  \
+       LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));        \
+       lov_dispatch[__llt].op(__VA_ARGS__);                        \
+})
+
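+/* Take lo_type_guard for read unless the current thread is the configuration
+ * owner (set by lov_conf_lock() below), which already holds it for write;
+ * this keeps the layout type stable while dispatching. */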
+static inline void lov_conf_freeze(struct lov_object *lov)
+{
+       if (lov->lo_owner != current)
+               down_read(&lov->lo_type_guard);
+}
+
+static inline void lov_conf_thaw(struct lov_object *lov)
+{
+       if (lov->lo_owner != current)
+               up_read(&lov->lo_type_guard);
+}
+
+#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...)                     \
+({                                                                   \
+       struct lov_object                     *__obj = (obj);     \
+       int                                  __lock = !!(lock);      \
+       typeof(lov_dispatch[0].op(__VA_ARGS__)) __result;              \
+                                                                       \
+       if (__lock)                                                  \
+               lov_conf_freeze(__obj);                                 \
+       __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__);    \
+       if (__lock)                                                  \
+               lov_conf_thaw(__obj);                                   \
+       __result;                                                      \
+})
+
+/**
+ * Performs a locked double-dispatch based on the layout type of an object.
+ */
+#define LOV_2DISPATCH(obj, op, ...)                 \
+       LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__)
+
+#define LOV_2DISPATCH_VOID(obj, op, ...)                               \
+do {                                                               \
+       struct lov_object                     *__obj = (obj);     \
+       enum lov_layout_type                __llt;                \
+                                                                       \
+       lov_conf_freeze(__obj);                                         \
+       __llt = __obj->lo_type;                                  \
+       LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));        \
+       lov_dispatch[__llt].op(__VA_ARGS__);                        \
+       lov_conf_thaw(__obj);                                           \
+} while (0)
+
+static void lov_conf_lock(struct lov_object *lov)
+{
+       LASSERT(lov->lo_owner != current);
+       down_write(&lov->lo_type_guard);
+       LASSERT(lov->lo_owner == NULL);
+       lov->lo_owner = current;
+}
+
+static void lov_conf_unlock(struct lov_object *lov)
+{
+       lov->lo_owner = NULL;
+       up_write(&lov->lo_type_guard);
+}
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov)
+{
+       struct l_wait_info lwi = { 0 };
+       ENTRY;
+
+       while (atomic_read(&lov->lo_active_ios) > 0) {
+               CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n",
+                       PFID(lu_object_fid(lov2lu(lov))),
+                       atomic_read(&lov->lo_active_ios));
+
+               l_wait_event(lov->lo_waitq,
+                            atomic_read(&lov->lo_active_ios) == 0, &lwi);
+       }
+       RETURN(0);
+}
+
+static int lov_layout_change(const struct lu_env *unused,
+                            struct lov_object *lov,
+                            const struct cl_object_conf *conf)
+{
+       int result;
+       enum lov_layout_type llt = LLT_EMPTY;
+       union lov_layout_state *state = &lov->u;
+       const struct lov_layout_operations *old_ops;
+       const struct lov_layout_operations *new_ops;
+
+       struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
+       void *cookie;
+       struct lu_env *env;
+       int refcheck;
+       ENTRY;
+
+       LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch));
+
+       if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL)
+               llt = LLT_RAID0; /* only raid0 is supported. */
+       LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch));
+
+       cookie = cl_env_reenter();
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env)) {
+               cl_env_reexit(cookie);
+               RETURN(PTR_ERR(env));
+       }
+
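+       /* Tear down the current layout first; only if that succeeds is the
+        * new layout initialized and installed. If initializing the new
+        * layout fails, it is cleaned up and the object is left as an EMPTY
+        * file. */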
+       old_ops = &lov_dispatch[lov->lo_type];
+       new_ops = &lov_dispatch[llt];
+
+       result = old_ops->llo_delete(env, lov, &lov->u);
+       if (result == 0) {
+               old_ops->llo_fini(env, lov, &lov->u);
+
+               LASSERT(atomic_read(&lov->lo_active_ios) == 0);
+               LASSERT(hdr->coh_tree.rnode == NULL);
+               LASSERT(hdr->coh_pages == 0);
+
+               lov->lo_type = LLT_EMPTY;
+               result = new_ops->llo_init(env,
+                                       lu2lov_dev(lov->lo_cl.co_lu.lo_dev),
+                                       lov, conf, state);
+               if (result == 0) {
+                       new_ops->llo_install(env, lov, state);
+                       lov->lo_type = llt;
+               } else {
+                       new_ops->llo_delete(env, lov, state);
+                       new_ops->llo_fini(env, lov, state);
+                       /* this file becomes an EMPTY file. */
+               }
+       }
+
+       cl_env_put(env, &refcheck);
+       cl_env_reexit(cookie);
+       RETURN(result);
+}
+
+/*****************************************************************************
+ *
+ * Lov object operations.
+ *
+ */
+
+int lov_object_init(const struct lu_env *env, struct lu_object *obj,
+                   const struct lu_object_conf *conf)
+{
+       struct lov_device           *dev   = lu2lov_dev(obj->lo_dev);
+       struct lov_object           *lov   = lu2lov(obj);
+       const struct cl_object_conf  *cconf = lu2cl_conf(conf);
+       union  lov_layout_state      *set   = &lov->u;
+       const struct lov_layout_operations *ops;
+       int result;
+
+       ENTRY;
+       init_rwsem(&lov->lo_type_guard);
+       atomic_set(&lov->lo_active_ios, 0);
+       init_waitqueue_head(&lov->lo_waitq);
+
+       cl_object_page_init(lu2cl(obj), sizeof(struct lov_page));
+
+       /* no locking is necessary, as object is being created */
+       lov->lo_type = cconf->u.coc_md->lsm != NULL ? LLT_RAID0 : LLT_EMPTY;
+       ops = &lov_dispatch[lov->lo_type];
+       result = ops->llo_init(env, dev, lov, cconf, set);
+       if (result == 0)
+               ops->llo_install(env, lov, set);
+       RETURN(result);
+}
+
+static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_object_conf *conf)
+{
+       struct lov_stripe_md *lsm = NULL;
+       struct lov_object *lov = cl2lov(obj);
+       int result = 0;
+       ENTRY;
+
+       lov_conf_lock(lov);
+       if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+               lov->lo_layout_invalid = true;
+               GOTO(out, result = 0);
+       }
+
+       if (conf->coc_opc == OBJECT_CONF_WAIT) {
+               if (lov->lo_layout_invalid &&
+                   atomic_read(&lov->lo_active_ios) > 0) {
+                       lov_conf_unlock(lov);
+                       result = lov_layout_wait(env, lov);
+                       lov_conf_lock(lov);
+               }
+               GOTO(out, result);
+       }
+
+       LASSERT(conf->coc_opc == OBJECT_CONF_SET);
+
+       if (conf->u.coc_md != NULL)
+               lsm = conf->u.coc_md->lsm;
+       if ((lsm == NULL && lov->lo_lsm == NULL) ||
+           (lsm != NULL && lov->lo_lsm != NULL &&
+            lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen)) {
+               /* same version of layout */
+               lov->lo_layout_invalid = false;
+               GOTO(out, result = 0);
+       }
+
+       /* will change layout - check if there still exists active IO. */
+       if (atomic_read(&lov->lo_active_ios) > 0) {
+               lov->lo_layout_invalid = true;
+               GOTO(out, result = -EBUSY);
+       }
+
+       lov->lo_layout_invalid = lov_layout_change(env, lov, conf);
+       EXIT;
+
+out:
+       lov_conf_unlock(lov);
+       RETURN(result);
+}
+
+static void lov_object_delete(const struct lu_env *env, struct lu_object *obj)
+{
+       struct lov_object *lov = lu2lov(obj);
+
+       ENTRY;
+       LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u);
+       EXIT;
+}
+
+static void lov_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct lov_object *lov = lu2lov(obj);
+
+       ENTRY;
+       LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u);
+       lu_object_fini(obj);
+       OBD_SLAB_FREE_PTR(lov, lov_object_kmem);
+       EXIT;
+}
+
+static int lov_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *o)
+{
+       return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o);
+}
+
+int lov_page_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_page *page, struct page *vmpage)
+{
+       return LOV_2DISPATCH_NOLOCK(cl2lov(obj),
+                                   llo_page_init, env, obj, page, vmpage);
+}
+
+/**
+ * Implements cl_object_operations::clo_io_init() method for lov
+ * layer. Dispatches to the appropriate layout io initialization method.
+ */
+int lov_io_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_io *io)
+{
+       CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+       return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
+                                    !io->ci_ignore_layout, env, obj, io);
+}
+
+/**
+ * An implementation of cl_object_operations::clo_attr_get() method for lov
+ * layer. For raid0 layout this collects and merges attributes of all
+ * sub-objects.
+ */
+static int lov_attr_get(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_attr *attr)
+{
+       /* do not take lock, as this function is called under a
+        * spin-lock. Layout is protected from changing by ongoing IO. */
+       return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr);
+}
+
+static int lov_attr_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_attr *attr, unsigned valid)
+{
+       /*
+        * No dispatch is required here, as no layout implements this.
+        */
+       return 0;
+}
+
+int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_lock *lock, const struct cl_io *io)
+{
+       /* No need to lock because we've taken one refcount of layout.  */
+       return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock,
+                                   io);
+}
+
+static const struct cl_object_operations lov_ops = {
+       .coo_page_init = lov_page_init,
+       .coo_lock_init = lov_lock_init,
+       .coo_io_init   = lov_io_init,
+       .coo_attr_get  = lov_attr_get,
+       .coo_attr_set  = lov_attr_set,
+       .coo_conf_set  = lov_conf_set
+};
+
+static const struct lu_object_operations lov_lu_obj_ops = {
+       .loo_object_init      = lov_object_init,
+       .loo_object_delete    = lov_object_delete,
+       .loo_object_release   = NULL,
+       .loo_object_free      = lov_object_free,
+       .loo_object_print     = lov_object_print,
+       .loo_object_invariant = NULL
+};
+
+struct lu_object *lov_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *unused,
+                                  struct lu_device *dev)
+{
+       struct lov_object *lov;
+       struct lu_object  *obj;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, __GFP_IO);
+       if (lov != NULL) {
+               obj = lov2lu(lov);
+               lu_object_init(obj, NULL, dev);
+               lov->lo_cl.co_ops = &lov_ops;
+               lov->lo_type = -1; /* invalid, to catch uninitialized type */
+               /*
+                * object io operation vector (cl_object::co_iop) is installed
+                * later in lov_object_init(), as different vectors are used
+                * for object with different layouts.
+                */
+               obj->lo_ops = &lov_lu_obj_ops;
+       } else
+               obj = NULL;
+       RETURN(obj);
+}
+
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
+{
+       struct lov_stripe_md *lsm = NULL;
+
+       lov_conf_freeze(lov);
+       if (lov->lo_lsm != NULL) {
+               lsm = lsm_addref(lov->lo_lsm);
+               CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
+                       lsm, atomic_read(&lsm->lsm_refc),
+                       lov->lo_layout_invalid, current);
+       }
+       lov_conf_thaw(lov);
+       return lsm;
+}
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm)
+{
+       if (lsm == NULL)
+               return;
+
+       CDEBUG(D_INODE, "lsm %p decref %d by %p.\n",
+               lsm, atomic_read(&lsm->lsm_refc), current);
+
+       lov_free_memmd(&lsm);
+}
+
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj)
+{
+       struct lu_object *luobj;
+       struct lov_stripe_md *lsm = NULL;
+
+       if (clobj == NULL)
+               return NULL;
+
+       luobj = lu_object_locate(&cl_object_header(clobj)->coh_lu,
+                                &lov_device_type);
+       if (luobj != NULL)
+               lsm = lov_lsm_addref(lu2lov(luobj));
+       return lsm;
+}
+EXPORT_SYMBOL(lov_lsm_get);
+
+void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm)
+{
+       if (lsm != NULL)
+               lov_free_memmd(&lsm);
+}
+EXPORT_SYMBOL(lov_lsm_put);
+
+int lov_read_and_clear_async_rc(struct cl_object *clob)
+{
+       struct lu_object *luobj;
+       int rc = 0;
+       ENTRY;
+
+       luobj = lu_object_locate(&cl_object_header(clob)->coh_lu,
+                                &lov_device_type);
+       if (luobj != NULL) {
+               struct lov_object *lov = lu2lov(luobj);
+
+               lov_conf_freeze(lov);
+               switch (lov->lo_type) {
+               case LLT_RAID0: {
+                       struct lov_stripe_md *lsm;
+                       int i;
+
+                       lsm = lov->lo_lsm;
+                       LASSERT(lsm != NULL);
+                       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+                               if (loi->loi_ar.ar_rc && !rc)
+                                       rc = loi->loi_ar.ar_rc;
+                               loi->loi_ar.ar_rc = 0;
+                       }
+               }
+               case LLT_EMPTY:
+                       break;
+               default:
+                       LBUG();
+               }
+               lov_conf_thaw(lov);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lov_read_and_clear_async_rc);
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_offset.c b/drivers/staging/lustre/lustre/lov/lov_offset.c
new file mode 100644 (file)
index 0000000..f62b7e5
--- /dev/null
@@ -0,0 +1,267 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lov_internal.h"
+
+/* compute object size given "stripeno" and the ost size */
+obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+                        int stripeno)
+{
+       unsigned long ssize = lsm->lsm_stripe_size;
+       unsigned long stripe_size;
+       obd_off swidth;
+       obd_size lov_size;
+       int magic = lsm->lsm_magic;
+       ENTRY;
+
+       if (ost_size == 0)
+               RETURN(0);
+
+       LASSERT(lsm_op_find(magic) != NULL);
+       lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth);
+
+       /* lov_do_div64(a, b) returns a % b, and a = a / b */
+       stripe_size = lov_do_div64(ost_size, ssize);
+       if (stripe_size)
+               lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
+       else
+               lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
+
+       RETURN(lov_size);
+}
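+
+/* a rough worked example (numbers invented for this comment, assuming
+ * swidth is the full stripe width, i.e. stripe_count * ssize, as for a
+ * plain RAID0 layout): with 3 stripes and ssize = 100, an ost_size of 230
+ * on stripe 1 gives stripe_size = 230 % 100 = 30 and ost_size = 2, so
+ * lov_size = 2 * 300 + 1 * 100 + 30 = 730: the last object byte lands
+ * 30 bytes into stripe 1's third chunk, i.e. at file offset 729.  With an
+ * ost_size of exactly 200 the second branch gives
+ * lov_size = (2 - 1) * 300 + (1 + 1) * 100 = 500. */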
+
+/* we have an offset in file backed by an lov and want to find out where
+ * that offset lands in our given stripe of the file.  for the easy
+ * case where the offset is within the stripe, we just have to scale the
+ * offset down to make it relative to the stripe instead of the lov.
+ *
+ * the harder case is what to do when the offset doesn't intersect the
+ * stripe.  callers will want start offsets clamped ahead to the start
+ * of the nearest stripe in the file, and end offsets similarly clamped to
+ * the nearest ending byte of a stripe in the file:
+ *
+ * all this function does is move offsets to the nearest region of the
+ * stripe, and it does its work "mod" the full length of all the stripes.
+ * consider a file with 3 stripes:
+ *
+ *          S                                        E
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * to find stripe 1's offsets for S and E, it divides by the full stripe
+ * width and does its math in the context of a single set of stripes:
+ *
+ *          S   E
+ * -----------------------------------
+ * |    0    |     1     |     2     |
+ * -----------------------------------
+ *
+ * it'll notice that E is outside stripe 1 and clamp it to the end of the
+ * stripe, then multiply it back out by lov_off to give the real offsets in
+ * the stripe:
+ *
+ *   S            E
+ * ---------------------------------------------------------------------
+ * |    1    |     1     |     1     |    1    |     1     |     1     |
+ * ---------------------------------------------------------------------
+ *
+ * it would have done similarly and pulled S forward to the start of a 1
+ * stripe if, say, S had landed in a 0 stripe.
+ *
+ * this rounding isn't always correct.  consider an E lov offset that lands
+ * on a 0 stripe, the "mod stripe width" math will pull it forward to the
+ * start of a 1 stripe, when in fact it wanted to be rounded back to the end
+ * of a previous 1 stripe.  this logic is handled by callers and this is why:
+ *
+ * this function returns < 0 when the offset was "before" the stripe and
+ * was moved forward to the start of the stripe in question;  0 when it
+ * falls in the stripe and no shifting was done; > 0 when the offset
+ * was outside the stripe and was pulled back to its final byte. */
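+/* a rough worked illustration of the above (numbers invented for this
+ * comment, assuming swidth is the full stripe width stripe_count * ssize):
+ * with 3 stripes and ssize = 100, mapping file offset 430 onto stripe 1
+ * gives 430 % 300 = 130 and 130 - 100 = 30, so *obdoff = 1 * 100 + 30 = 130
+ * and the return value is 0; file offset 50 lies before stripe 1's chunk,
+ * so *obdoff = 0 and the return value is -1; file offset 250 lies past
+ * that chunk, so it is clamped to *obdoff = 0 * 100 + 100 = 100 and the
+ * return value is 1. */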
+int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+                     int stripeno, obd_off *obdoff)
+{
+       unsigned long ssize  = lsm->lsm_stripe_size;
+       obd_off stripe_off, this_stripe, swidth;
+       int magic = lsm->lsm_magic;
+       int ret = 0;
+
+       if (lov_off == OBD_OBJECT_EOF) {
+               *obdoff = OBD_OBJECT_EOF;
+               return 0;
+       }
+
+       LASSERT(lsm_op_find(magic) != NULL);
+
+       lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
+                                               &swidth);
+
+       /* lov_do_div64(a, b) returns a % b, and a = a / b */
+       stripe_off = lov_do_div64(lov_off, swidth);
+
+       this_stripe = (obd_off)stripeno * ssize;
+       if (stripe_off < this_stripe) {
+               stripe_off = 0;
+               ret = -1;
+       } else {
+               stripe_off -= this_stripe;
+
+               if (stripe_off >= ssize) {
+                       stripe_off = ssize;
+                       ret = 1;
+               }
+       }
+
+       *obdoff = lov_off * ssize + stripe_off;
+       return ret;
+}
+
+/* Given a whole-file size and a stripe number, give the file size which
+ * corresponds to the individual object of that stripe.
+ *
+ * This behaves basically in the same way as lov_stripe_offset, except that
+ * file sizes falling before the beginning of a stripe are clamped to the end
+ * of the previous stripe, not the beginning of the next:
+ *
+ *                                            S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * if clamped to stripe 2 becomes:
+ *
+ *                                S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ */
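+/* a rough worked illustration (same invented 3-stripe, ssize = 100 layout
+ * as above): a file size of 350 ends inside stripe 0's second chunk, so
+ * for stripe 2 we get stripe_off = 350 % 300 = 50, which is less than
+ * this_stripe = 200; file_size drops to 0 and stripe_off is set to ssize,
+ * giving an object size of 0 * 100 + 100 = 100, i.e. the end of stripe 2's
+ * first chunk. */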
+obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
+                          int stripeno)
+{
+       unsigned long ssize  = lsm->lsm_stripe_size;
+       obd_off stripe_off, this_stripe, swidth;
+       int magic = lsm->lsm_magic;
+
+       if (file_size == OBD_OBJECT_EOF)
+               return OBD_OBJECT_EOF;
+
+       LASSERT(lsm_op_find(magic) != NULL);
+       lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
+                                               &swidth);
+
+       /* lov_do_div64(a, b) returns a % b, and a = a / b */
+       stripe_off = lov_do_div64(file_size, swidth);
+
+       this_stripe = (obd_off)stripeno * ssize;
+       if (stripe_off < this_stripe) {
+               /* Move to end of previous stripe, or zero */
+               if (file_size > 0) {
+                       file_size--;
+                       stripe_off = ssize;
+               } else {
+                       stripe_off = 0;
+               }
+       } else {
+               stripe_off -= this_stripe;
+
+               if (stripe_off >= ssize) {
+                       /* Clamp to end of this stripe */
+                       stripe_off = ssize;
+               }
+       }
+
+       return (file_size * ssize + stripe_off);
+}
+
+/* given an extent in an lov and a stripe, calculate the extent of the stripe
+ * that is contained within the lov extent.  this returns true if the given
+ * stripe does intersect with the lov extent. */
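+/* continuing the invented 3-stripe, ssize = 100 illustration: the file
+ * extent 50..430 does intersect stripe 1; the start maps to obd offset 0
+ * (start_side = -1) and the end maps to obd offset 130 (end_side = 0), so
+ * the function returns 1 with the obd extent 0..130.  The extent 210..290
+ * lies entirely inside stripe 2's first chunk, so both ends clamp to obd
+ * offset 100 and the function returns 0. */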
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+                         obd_off start, obd_off end,
+                         obd_off *obd_start, obd_off *obd_end)
+{
+       int start_side, end_side;
+
+       start_side = lov_stripe_offset(lsm, start, stripeno, obd_start);
+       end_side = lov_stripe_offset(lsm, end, stripeno, obd_end);
+
+       CDEBUG(D_INODE, "["LPU64"->"LPU64"] -> [(%d) "LPU64"->"LPU64" (%d)]\n",
+              start, end, start_side, *obd_start, *obd_end, end_side);
+
+       /* this stripe doesn't intersect the file extent when neither
+        * the start nor the end intersected the stripe, and obd_start and
+        * obd_end got rounded up to the same value. */
+       if (start_side != 0 && end_side != 0 && *obd_start == *obd_end)
+               return 0;
+
+       /* as mentioned in the lov_stripe_offset commentary, end
+        * might have been shifted in the wrong direction.  This
+        * happens when an end offset is before the stripe when viewed
+        * through the "mod stripe size" math. we detect it being shifted
+        * in the wrong direction and touch it up.
+        * interestingly, this can't underflow since end must be > start
+        * if we passed through the previous check.
+        * (should we assert for that somewhere?) */
+       if (end_side != 0)
+               (*obd_end)--;
+
+       return 1;
+}
+
+/* compute which stripe number "lov_off" will be written into */
+int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
+{
+       unsigned long ssize  = lsm->lsm_stripe_size;
+       obd_off stripe_off, swidth;
+       int magic = lsm->lsm_magic;
+
+       LASSERT(lsm_op_find(magic) != NULL);
+       lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth);
+
+       stripe_off = lov_do_div64(lov_off, swidth);
+
+       /* Puts stripe_off/ssize result into stripe_off */
+       lov_do_div64(stripe_off, ssize);
+
+       return stripe_off;
+}
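+
+/* for instance, with the invented 3-stripe, ssize = 100 layout used in the
+ * comments above, lov_off = 520 gives 520 % 300 = 220 and 220 / 100 = 2,
+ * so that byte is written to stripe number 2. */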
diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c
new file mode 100644 (file)
index 0000000..8bb57aa
--- /dev/null
@@ -0,0 +1,677 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <lustre_net.h>
+#include <obd.h>
+#include <obd_lov.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_user.h>
+
+#include "lov_internal.h"
+
+static void lov_dump_lmm_common(int level, void *lmmp)
+{
+       struct lov_mds_md *lmm = lmmp;
+       struct ost_id   oi;
+
+       lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi);
+       CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n",
+              POSTID(&oi), le32_to_cpu(lmm->lmm_magic),
+              le32_to_cpu(lmm->lmm_pattern));
+       CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n",
+              le32_to_cpu(lmm->lmm_stripe_size),
+              le16_to_cpu(lmm->lmm_stripe_count),
+              le16_to_cpu(lmm->lmm_layout_gen));
+}
+
+static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod,
+                                int stripe_count)
+{
+       int i;
+
+       if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+               CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
+                      stripe_count, LOV_V1_INSANE_STRIPE_COUNT);
+       }
+
+       for (i = 0; i < stripe_count; ++i, ++lod) {
+               struct ost_id   oi;
+
+               ostid_le_to_cpu(&lod->l_ost_oi, &oi);
+               CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i,
+                      le32_to_cpu(lod->l_ost_idx), POSTID(&oi));
+       }
+}
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm)
+{
+       lov_dump_lmm_common(level, lmm);
+       lov_dump_lmm_objects(level, lmm->lmm_objects,
+                            le16_to_cpu(lmm->lmm_stripe_count));
+}
+
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm)
+{
+       lov_dump_lmm_common(level, lmm);
+       CDEBUG(level,"pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name);
+       lov_dump_lmm_objects(level, lmm->lmm_objects,
+                            le16_to_cpu(lmm->lmm_stripe_count));
+}
+
+void lov_dump_lmm(int level, void *lmm)
+{
+       int magic;
+
+       magic = ((struct lov_mds_md_v1 *)(lmm))->lmm_magic;
+       switch (magic) {
+       case LOV_MAGIC_V1:
+               return lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)(lmm));
+       case LOV_MAGIC_V3:
+               return lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)(lmm));
+       default:
+               CERROR("Cannot recognize lmm_magic %x", magic);
+       }
+       return;
+}
+
+#define LMM_ASSERT(test)                                               \
+do {                                                               \
+       if (!(test)) lov_dump_lmm(D_ERROR, lmm);                        \
+       LASSERT(test); /* so we know what assertion failed */      \
+} while(0)
+
+/* Pack LOV object metadata for disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
+ *
+ * XXX In the future, this will be enhanced to query the underlying OSC
+ *     device(s) for their EA sizes so we can stack LOVs properly.  For now
+ *     lov_mds_md_size() just assumes one obd_id per stripe.
+ */
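+/* as a rough size example (assuming lov_mds_md_size() keeps its usual
+ * definition of one lov_ost_data_v1 appended to the header per stripe):
+ * a LOV_MAGIC_V1 layout with 2 stripes packs into
+ * sizeof(struct lov_mds_md_v1) + 2 * sizeof(struct lov_ost_data_v1) bytes,
+ * i.e. the little-endian lmm header followed by one object id/index pair
+ * per stripe. */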
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+              struct lov_stripe_md *lsm)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct lov_obd *lov = &obd->u.lov;
+       struct lov_mds_md_v1 *lmmv1;
+       struct lov_mds_md_v3 *lmmv3;
+       __u16 stripe_count;
+       struct lov_ost_data_v1 *lmm_objects;
+       int lmm_size, lmm_magic;
+       int i;
+       int cplen = 0;
+       ENTRY;
+
+       if (lsm) {
+               lmm_magic = lsm->lsm_magic;
+       } else {
+               if (lmmp && *lmmp)
+                       lmm_magic = le32_to_cpu((*lmmp)->lmm_magic);
+               else
+                       /* lsm == NULL and lmmp == NULL */
+                       lmm_magic = LOV_MAGIC;
+       }
+
+       if ((lmm_magic != LOV_MAGIC_V1) &&
+           (lmm_magic != LOV_MAGIC_V3)) {
+               CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n",
+                       lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3);
+               RETURN(-EINVAL);
+
+       }
+
+       if (lsm) {
+               /* If we are just sizing the EA, limit the stripe count
+                * to the actual number of OSTs in this filesystem. */
+               if (!lmmp) {
+                       stripe_count = lov_get_stripecnt(lov, lmm_magic,
+                                                        lsm->lsm_stripe_count);
+                       lsm->lsm_stripe_count = stripe_count;
+               } else {
+                       stripe_count = lsm->lsm_stripe_count;
+               }
+       } else {
+               /* No need to allocate more than maximum supported stripes.
+                * Anyway, this is pretty inaccurate since ld_tgt_count now
+                * represents max index and we should rely on the actual number
+                * of OSTs instead */
+               stripe_count = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize,
+                                                   lmm_magic);
+               if (stripe_count > lov->desc.ld_tgt_count)
+                       stripe_count = lov->desc.ld_tgt_count;
+       }
+
+       /* XXX LOV STACKING call into osc for sizes */
+       lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+
+       if (!lmmp)
+               RETURN(lmm_size);
+
+       if (*lmmp && !lsm) {
+               stripe_count = le16_to_cpu((*lmmp)->lmm_stripe_count);
+               lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+               OBD_FREE_LARGE(*lmmp, lmm_size);
+               *lmmp = NULL;
+               RETURN(0);
+       }
+
+       if (!*lmmp) {
+               OBD_ALLOC_LARGE(*lmmp, lmm_size);
+               if (!*lmmp)
+                       RETURN(-ENOMEM);
+       }
+
+       CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n",
+              lmm_magic, lmm_size);
+
+       lmmv1 = *lmmp;
+       lmmv3 = (struct lov_mds_md_v3 *)*lmmp;
+       if (lmm_magic == LOV_MAGIC_V3)
+               lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3);
+       else
+               lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
+
+       if (!lsm)
+               RETURN(lmm_size);
+
+       /* lmmv1 and lmmv3 point to the same struct and have the
+        * same first fields
+        */
+       lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi);
+       lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
+       lmmv1->lmm_stripe_count = cpu_to_le16(stripe_count);
+       lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+       lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen);
+       if (lsm->lsm_magic == LOV_MAGIC_V3) {
+               cplen = strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name,
+                               sizeof(lmmv3->lmm_pool_name));
+               if (cplen >= sizeof(lmmv3->lmm_pool_name))
+                       RETURN(-E2BIG);
+               lmm_objects = lmmv3->lmm_objects;
+       } else {
+               lmm_objects = lmmv1->lmm_objects;
+       }
+
+       for (i = 0; i < stripe_count; i++) {
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               /* XXX LOV STACKING call down to osc_packmd() to do packing */
+               LASSERTF(ostid_id(&loi->loi_oi) != 0, "lmm_oi "DOSTID
+                        " stripe %u/%u idx %u\n", POSTID(&lmmv1->lmm_oi),
+                        i, stripe_count, loi->loi_ost_idx);
+               ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi);
+               lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
+               lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
+       }
+
+       RETURN(lmm_size);
+}
+
+/* Find the max stripecount we should use */
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count)
+{
+       __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD;
+
+       if (!stripe_count)
+               stripe_count = lov->desc.ld_default_stripe_count;
+       if (stripe_count > lov->desc.ld_active_tgt_count)
+               stripe_count = lov->desc.ld_active_tgt_count;
+       if (!stripe_count)
+               stripe_count = 1;
+
+       /* stripe count is based on whether ldiskfs can handle
+        * larger EA sizes */
+       if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE &&
+           lov->lov_ocd.ocd_max_easize)
+               max_stripes = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize,
+                                                  magic);
+
+       if (stripe_count > max_stripes)
+               stripe_count = max_stripes;
+
+       return stripe_count;
+}
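+
+/* for example (invented numbers): with ld_default_stripe_count = 0 and 4
+ * active targets, a request for 0 stripes falls through to 1 stripe, while
+ * a request for 100 stripes is first clamped to the 4 active targets and
+ * then checked against the EA-size derived maximum. */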
+
+
+static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count)
+{
+       int rc;
+
+       if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) {
+               char *buffer;
+               int sz;
+
+               CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n",
+                      le32_to_cpu(*(__u32 *)lmm), lmm_bytes);
+               sz = lmm_bytes * 2 + 1;
+               OBD_ALLOC_LARGE(buffer, sz);
+               if (buffer != NULL) {
+                       int i;
+
+                       for (i = 0; i < lmm_bytes; i++)
+                               sprintf(buffer+2*i, "%.2X", ((char *)lmm)[i]);
+                       buffer[sz - 1] = '\0';
+                       CERROR("%s\n", buffer);
+                       OBD_FREE_LARGE(buffer, sz);
+               }
+               return -EINVAL;
+       }
+       rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm,
+                                    lmm_bytes, stripe_count);
+       return rc;
+}
+
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+                   int pattern, int magic)
+{
+       int i, lsm_size;
+       ENTRY;
+
+       CDEBUG(D_INFO, "alloc lsm, stripe_count %d\n", stripe_count);
+
+       *lsmp = lsm_alloc_plain(stripe_count, &lsm_size);
+       if (!*lsmp) {
+               CERROR("can't allocate lsmp stripe_count %d\n", stripe_count);
+               RETURN(-ENOMEM);
+       }
+
+       atomic_set(&(*lsmp)->lsm_refc, 1);
+       spin_lock_init(&(*lsmp)->lsm_lock);
+       (*lsmp)->lsm_magic = magic;
+       (*lsmp)->lsm_stripe_count = stripe_count;
+       (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
+       (*lsmp)->lsm_pattern = pattern;
+       (*lsmp)->lsm_pool_name[0] = '\0';
+       (*lsmp)->lsm_layout_gen = 0;
+       (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0;
+
+       for (i = 0; i < stripe_count; i++)
+               loi_init((*lsmp)->lsm_oinfo[i]);
+
+       RETURN(lsm_size);
+}
+
+int lov_free_memmd(struct lov_stripe_md **lsmp)
+{
+       struct lov_stripe_md *lsm = *lsmp;
+       int refc;
+
+       *lsmp = NULL;
+       LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+       if ((refc = atomic_dec_return(&lsm->lsm_refc)) == 0) {
+               LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+               lsm_op_find(lsm->lsm_magic)->lsm_free(lsm);
+       }
+       return refc;
+}
+
+
+/* Unpack LOV object metadata from disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
+ */
+int lov_unpackmd(struct obd_export *exp,  struct lov_stripe_md **lsmp,
+                struct lov_mds_md *lmm, int lmm_bytes)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct lov_obd *lov = &obd->u.lov;
+       int rc = 0, lsm_size;
+       __u16 stripe_count;
+       __u32 magic;
+       ENTRY;
+
+       /* If passed an MDS struct use values from there, otherwise defaults */
+       if (lmm) {
+               rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count);
+               if (rc)
+                       RETURN(rc);
+               magic = le32_to_cpu(lmm->lmm_magic);
+       } else {
+               magic = LOV_MAGIC;
+               stripe_count = lov_get_stripecnt(lov, magic, 0);
+       }
+
+       /* If we aren't passed an lsmp struct, we just want the size */
+       if (!lsmp) {
+               /* XXX LOV STACKING call into osc for sizes */
+               LBUG();
+               RETURN(lov_stripe_md_size(stripe_count));
+       }
+       /* If we are passed an allocated struct but nothing to unpack, free */
+       if (*lsmp && !lmm) {
+               lov_free_memmd(lsmp);
+               RETURN(0);
+       }
+
+       lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0,
+                                  magic);
+       if (lsm_size < 0)
+               RETURN(lsm_size);
+
+       /* If we are passed a pointer but nothing to unpack, we only alloc */
+       if (!lmm)
+               RETURN(lsm_size);
+
+       LASSERT(lsm_op_find(magic) != NULL);
+       rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm);
+       if (rc) {
+               lov_free_memmd(lsmp);
+               RETURN(rc);
+       }
+
+       RETURN(lsm_size);
+}
+
+static int __lov_setstripe(struct obd_export *exp, int max_lmm_size,
+                          struct lov_stripe_md **lsmp,
+                          struct lov_user_md *lump)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct lov_obd *lov = &obd->u.lov;
+       char buffer[sizeof(struct lov_user_md_v3)];
+       struct lov_user_md_v3 *lumv3 = (struct lov_user_md_v3 *)&buffer[0];
+       struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&buffer[0];
+       int lmm_magic;
+       __u16 stripe_count;
+       int rc;
+       int cplen = 0;
+       ENTRY;
+
+       rc = lov_lum_swab_if_needed(lumv3, &lmm_magic, lump);
+       if (rc)
+               RETURN(rc);
+
+       /* in the rest of the tests, as *lumv1 and lumv3 have the same
+        * fields, we use lumv1 to avoid code duplication */
+
+       if (lumv1->lmm_pattern == 0) {
+               lumv1->lmm_pattern = lov->desc.ld_pattern ?
+                       lov->desc.ld_pattern : LOV_PATTERN_RAID0;
+       }
+
+       if (lumv1->lmm_pattern != LOV_PATTERN_RAID0) {
+               CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n",
+                      lumv1->lmm_pattern);
+               RETURN(-EINVAL);
+       }
+
+       /* 64kB is the largest common page size we see (ia64), and matches the
+        * check in lfs */
+       if (lumv1->lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
+               CDEBUG(D_IOCTL, "stripe size %u not multiple of %u, fixing\n",
+                      lumv1->lmm_stripe_size, LOV_MIN_STRIPE_SIZE);
+               lumv1->lmm_stripe_size = LOV_MIN_STRIPE_SIZE;
+       }
+
+       if ((lumv1->lmm_stripe_offset >= lov->desc.ld_tgt_count) &&
+           (lumv1->lmm_stripe_offset !=
+            (typeof(lumv1->lmm_stripe_offset))(-1))) {
+               CDEBUG(D_IOCTL, "stripe offset %u > number of OSTs %u\n",
+                      lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count);
+               RETURN(-EINVAL);
+       }
+       stripe_count = lov_get_stripecnt(lov, lmm_magic,
+                                        lumv1->lmm_stripe_count);
+
+       if (max_lmm_size) {
+               int max_stripes = (max_lmm_size -
+                                  lov_mds_md_size(0, lmm_magic)) /
+                                  sizeof(struct lov_ost_data_v1);
+               if (unlikely(max_stripes < stripe_count)) {
+                       CDEBUG(D_IOCTL, "stripe count reset from %d to %d\n",
+                              stripe_count, max_stripes);
+                       stripe_count = max_stripes;
+               }
+       }
+
+       if (lmm_magic == LOV_USER_MAGIC_V3) {
+               struct pool_desc *pool;
+
+               /* In the function below, .hs_keycmp resolves to
+                * pool_hashkey_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               pool = lov_find_pool(lov, lumv3->lmm_pool_name);
+               if (pool != NULL) {
+                       if (lumv3->lmm_stripe_offset !=
+                           (typeof(lumv3->lmm_stripe_offset))(-1)) {
+                               rc = lov_check_index_in_pool(
+                                       lumv3->lmm_stripe_offset, pool);
+                               if (rc < 0) {
+                                       lov_pool_putref(pool);
+                                       RETURN(-EINVAL);
+                               }
+                       }
+
+                       if (stripe_count > pool_tgt_count(pool))
+                               stripe_count = pool_tgt_count(pool);
+
+                       lov_pool_putref(pool);
+               }
+       }
+
+       rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic);
+
+       if (rc >= 0) {
+               (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lumv1->lmm_stripe_offset;
+               (*lsmp)->lsm_stripe_size = lumv1->lmm_stripe_size;
+               if (lmm_magic == LOV_USER_MAGIC_V3) {
+                       cplen = strlcpy((*lsmp)->lsm_pool_name,
+                                       lumv3->lmm_pool_name,
+                                       sizeof((*lsmp)->lsm_pool_name));
+                       if (cplen >= sizeof((*lsmp)->lsm_pool_name))
+                               rc = -E2BIG;
+               }
+               rc = 0;
+       }
+
+       RETURN(rc);
+}
+
+/* Configure object striping information on a new file.
+ *
+ * @lmmu is a pointer to a user struct with one or more of the fields set to
+ * indicate the application preference: lmm_stripe_count, lmm_stripe_size,
+ * lmm_stripe_offset, and lmm_stripe_pattern.  lmm_magic must be LOV_MAGIC.
+ * @lsmp is a pointer to an in-core stripe MD that needs to be filled in.
+ */
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+                 struct lov_stripe_md **lsmp, struct lov_user_md *lump)
+{
+       int rc;
+       mm_segment_t seg;
+
+       seg = get_fs();
+       set_fs(KERNEL_DS);
+
+       rc = __lov_setstripe(exp, max_lmm_size, lsmp, lump);
+       set_fs(seg);
+       RETURN(rc);
+}
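+
+/* a minimal user space sketch of how the preference fields above are
+ * typically filled in before this path is reached (the ioctl name and the
+ * open flag come from lustre/lustre_user.h; the values are only examples):
+ *
+ *     struct lov_user_md_v1 lum = {
+ *             .lmm_magic         = LOV_USER_MAGIC_V1,
+ *             .lmm_pattern       = 0,            // 0 = default (RAID0)
+ *             .lmm_stripe_size   = 1 << 20,      // multiple of 64kB
+ *             .lmm_stripe_count  = 2,
+ *             .lmm_stripe_offset = (__u16)-1,    // any starting OST
+ *     };
+ *     int fd = open(path, O_CREAT | O_EXCL | O_WRONLY | O_LOV_DELAY_CREATE,
+ *                   0644);
+ *     ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);
+ */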
+
+int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
+             struct lov_user_md *lump)
+{
+       int i;
+       int rc;
+       struct obd_export *oexp;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       obd_id last_id = 0;
+       struct lov_user_ost_data_v1 *lmm_objects;
+
+       ENTRY;
+
+       if (lump->lmm_magic == LOV_USER_MAGIC_V3)
+               lmm_objects = ((struct lov_user_md_v3 *)lump)->lmm_objects;
+       else
+               lmm_objects = lump->lmm_objects;
+
+       for (i = 0; i < lump->lmm_stripe_count; i++) {
+               __u32 len = sizeof(last_id);
+               oexp = lov->lov_tgts[lmm_objects[i].l_ost_idx]->ltd_exp;
+               rc = obd_get_info(NULL, oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID,
+                                 &len, &last_id, NULL);
+               if (rc)
+                       RETURN(rc);
+               if (ostid_id(&lmm_objects[i].l_ost_oi) > last_id) {
+                       CERROR("Setting EA for object > last id on"
+                              " ost idx %d "DOSTID" > "LPD64" \n",
+                              lmm_objects[i].l_ost_idx,
+                              POSTID(&lmm_objects[i].l_ost_oi), last_id);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       rc = lov_setstripe(exp, 0, lsmp, lump);
+       if (rc)
+               RETURN(rc);
+
+       for (i = 0; i < lump->lmm_stripe_count; i++) {
+               (*lsmp)->lsm_oinfo[i]->loi_ost_idx =
+                       lmm_objects[i].l_ost_idx;
+               (*lsmp)->lsm_oinfo[i]->loi_oi = lmm_objects[i].l_ost_oi;
+       }
+       RETURN(0);
+}
+
+
+/* Retrieve object striping information.
+ *
+ * @lump is a pointer to an in-core struct with lmm_stripe_count indicating
+ * the maximum number of OST indices which will fit in the user buffer.
+ * lmm_magic must be LOV_USER_MAGIC.
+ */
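+/* a matching user space sketch (LL_IOC_LOV_GETSTRIPE comes from
+ * lustre/lustre_user.h; MAX_EXPECTED_STRIPES is just a placeholder):
+ *
+ *     size_t len = sizeof(struct lov_user_md_v3) +
+ *                  MAX_EXPECTED_STRIPES * sizeof(struct lov_user_ost_data_v1);
+ *     struct lov_user_md *lump = malloc(len);
+ *
+ *     lump->lmm_magic = LOV_USER_MAGIC;
+ *     lump->lmm_stripe_count = MAX_EXPECTED_STRIPES;
+ *     ioctl(fd, LL_IOC_LOV_GETSTRIPE, lump);
+ */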
+int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
+                 struct lov_user_md *lump)
+{
+       /*
+        * XXX huge struct allocated on stack.
+        */
+       /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+       struct lov_user_md_v3 lum;
+       struct lov_mds_md *lmmk = NULL;
+       int rc, lmm_size;
+       int lum_size;
+       mm_segment_t seg;
+       ENTRY;
+
+       if (!lsm)
+               RETURN(-ENODATA);
+
+       /*
+        * "Switch to kernel segment" to allow copying from kernel space by
+        * copy_{to,from}_user().
+        */
+       seg = get_fs();
+       set_fs(KERNEL_DS);
+
+       /* we only need the header part from user space to get lmm_magic and
+        * lmm_stripe_count (the header part is common to v1 and v3) */
+       lum_size = sizeof(struct lov_user_md_v1);
+       if (copy_from_user(&lum, lump, lum_size))
+               GOTO(out_set, rc = -EFAULT);
+       else if ((lum.lmm_magic != LOV_USER_MAGIC) &&
+                (lum.lmm_magic != LOV_USER_MAGIC_V3))
+               GOTO(out_set, rc = -EINVAL);
+
+       if (lum.lmm_stripe_count &&
+           (lum.lmm_stripe_count < lsm->lsm_stripe_count)) {
+               /* Return right size of stripe to user */
+               lum.lmm_stripe_count = lsm->lsm_stripe_count;
+               rc = copy_to_user(lump, &lum, lum_size);
+               GOTO(out_set, rc = -EOVERFLOW);
+       }
+       rc = lov_packmd(exp, &lmmk, lsm);
+       if (rc < 0)
+               GOTO(out_set, rc);
+       lmm_size = rc;
+       rc = 0;
+
+       /* FIXME: Bug 1185 - copy fields properly when structs change */
+       /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */
+       CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3));
+       CLASSERT(sizeof lum.lmm_objects[0] == sizeof lmmk->lmm_objects[0]);
+
+       if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+           ((lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) ||
+           (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)))) {
+               lustre_swab_lov_mds_md(lmmk);
+               lustre_swab_lov_user_md_objects(
+                               (struct lov_user_ost_data*)lmmk->lmm_objects,
+                               lmmk->lmm_stripe_count);
+       }
+       if (lum.lmm_magic == LOV_USER_MAGIC) {
+               /* User requested v1, we need to skip lmm_pool_name */
+               if (lmmk->lmm_magic == LOV_MAGIC_V3) {
+                       memmove((char*)(&lmmk->lmm_stripe_count) +
+                               sizeof(lmmk->lmm_stripe_count),
+                               ((struct lov_mds_md_v3*)lmmk)->lmm_objects,
+                               lmmk->lmm_stripe_count *
+                               sizeof(struct lov_ost_data_v1));
+                       lmm_size -= LOV_MAXPOOLNAME;
+               }
+       } else {
+               /* if v3 we just have to update the lum_size */
+               lum_size = sizeof(struct lov_user_md_v3);
+       }
+
+       /* User wasn't expecting this many OST entries */
+       if (lum.lmm_stripe_count == 0)
+               lmm_size = lum_size;
+       else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count)
+               GOTO(out_set, rc = -EOVERFLOW);
+       /*
+        * There is a difference between lov_mds_md and lov_user_md,
+        * so we have to re-order the data before copying it to user space.
+        */
+       lum.lmm_stripe_count = lmmk->lmm_stripe_count;
+       lum.lmm_layout_gen = lmmk->lmm_layout_gen;
+       ((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen;
+       ((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count;
+       if (copy_to_user(lump, lmmk, lmm_size))
+               rc = -EFAULT;
+
+       obd_free_diskmd(exp, &lmmk);
+out_set:
+       set_fs(seg);
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_page.c b/drivers/staging/lustre/lustre/lov/lov_page.c
new file mode 100644 (file)
index 0000000..65790d6
--- /dev/null
@@ -0,0 +1,235 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lov page operations.
+ *
+ */
+
+static int lov_page_invariant(const struct cl_page_slice *slice)
+{
+       const struct cl_page  *page = slice->cpl_page;
+       const struct cl_page  *sub  = lov_sub_page(slice);
+
+       return ergo(sub != NULL,
+                   page->cp_child == sub &&
+                   sub->cp_parent == page &&
+                   page->cp_state == sub->cp_state);
+}
+
+static void lov_page_fini(const struct lu_env *env,
+                         struct cl_page_slice *slice)
+{
+       struct cl_page  *sub = lov_sub_page(slice);
+
+       LINVRNT(lov_page_invariant(slice));
+       ENTRY;
+
+       if (sub != NULL) {
+               LASSERT(sub->cp_state == CPS_FREEING);
+               lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent);
+               sub->cp_parent = NULL;
+               slice->cpl_page->cp_child = NULL;
+               cl_page_put(env, sub);
+       }
+       EXIT;
+}
+
+static int lov_page_own(const struct lu_env *env,
+                       const struct cl_page_slice *slice, struct cl_io *io,
+                       int nonblock)
+{
+       struct lov_io     *lio = lov_env_io(env);
+       struct lov_io_sub *sub;
+
+       LINVRNT(lov_page_invariant(slice));
+       LINVRNT(!cl2lov_page(slice)->lps_invalid);
+       ENTRY;
+
+       sub = lov_page_subio(env, lio, slice);
+       if (!IS_ERR(sub)) {
+               lov_sub_page(slice)->cp_owner = sub->sub_io;
+               lov_sub_put(sub);
+       } else
+               LBUG(); /* Arrgh */
+       RETURN(0);
+}
+
+static void lov_page_assume(const struct lu_env *env,
+                           const struct cl_page_slice *slice, struct cl_io *io)
+{
+       lov_page_own(env, slice, io, 0);
+}
+
+static int lov_page_cache_add(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *io)
+{
+       struct lov_io     *lio = lov_env_io(env);
+       struct lov_io_sub *sub;
+       int rc = 0;
+
+       LINVRNT(lov_page_invariant(slice));
+       LINVRNT(!cl2lov_page(slice)->lps_invalid);
+       ENTRY;
+
+       sub = lov_page_subio(env, lio, slice);
+       if (!IS_ERR(sub)) {
+               rc = cl_page_cache_add(sub->sub_env, sub->sub_io,
+                                      slice->cpl_page->cp_child, CRT_WRITE);
+               lov_sub_put(sub);
+       } else {
+               rc = PTR_ERR(sub);
+               CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, "rc = %d\n", rc);
+       }
+       RETURN(rc);
+}
+
+static int lov_page_print(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         void *cookie, lu_printer_t printer)
+{
+       struct lov_page *lp = cl2lov_page(slice);
+
+       return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p\n", lp);
+}
+
+static const struct cl_page_operations lov_page_ops = {
+       .cpo_fini   = lov_page_fini,
+       .cpo_own    = lov_page_own,
+       .cpo_assume = lov_page_assume,
+       .io = {
+               [CRT_WRITE] = {
+                       .cpo_cache_add = lov_page_cache_add
+               }
+       },
+       .cpo_print  = lov_page_print
+};
+
+static void lov_empty_page_fini(const struct lu_env *env,
+                               struct cl_page_slice *slice)
+{
+       LASSERT(slice->cpl_page->cp_child == NULL);
+}
+
+int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_page *page, struct page *vmpage)
+{
+       struct lov_object *loo = cl2lov(obj);
+       struct lov_layout_raid0 *r0 = lov_r0(loo);
+       struct lov_io     *lio = lov_env_io(env);
+       struct cl_page    *subpage;
+       struct cl_object  *subobj;
+       struct lov_io_sub *sub;
+       struct lov_page   *lpg = cl_object_page_slice(obj, page);
+       loff_t       offset;
+       obd_off     suboff;
+       int             stripe;
+       int             rc;
+       ENTRY;
+
+       offset = cl_offset(obj, page->cp_index);
+       stripe = lov_stripe_number(loo->lo_lsm, offset);
+       LASSERT(stripe < r0->lo_nr);
+       rc = lov_stripe_offset(loo->lo_lsm, offset, stripe,
+                                  &suboff);
+       LASSERT(rc == 0);
+
+       lpg->lps_invalid = 1;
+       cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops);
+
+       sub = lov_sub_get(env, lio, stripe);
+       if (IS_ERR(sub))
+               GOTO(out, rc = PTR_ERR(sub));
+
+       subobj = lovsub2cl(r0->lo_sub[stripe]);
+       subpage = cl_page_find_sub(sub->sub_env, subobj,
+                                  cl_index(subobj, suboff), vmpage, page);
+       lov_sub_put(sub);
+       if (IS_ERR(subpage))
+               GOTO(out, rc = PTR_ERR(subpage));
+
+       if (likely(subpage->cp_parent == page)) {
+               lu_ref_add(&subpage->cp_reference, "lov", page);
+               lpg->lps_invalid = 0;
+               rc = 0;
+       } else {
+               CL_PAGE_DEBUG(D_ERROR, env, page, "parent page\n");
+               CL_PAGE_DEBUG(D_ERROR, env, subpage, "child page\n");
+               LASSERT(0);
+       }
+
+       EXIT;
+out:
+       return rc;
+}
+
+
+static const struct cl_page_operations lov_empty_page_ops = {
+       .cpo_fini   = lov_empty_page_fini,
+       .cpo_print  = lov_page_print
+};
+
+int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_page *page, struct page *vmpage)
+{
+       struct lov_page *lpg = cl_object_page_slice(obj, page);
+       void *addr;
+       ENTRY;
+
+       cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops);
+       addr = kmap(vmpage);
+       memset(addr, 0, cl_page_size(obj));
+       kunmap(vmpage);
+       cl_page_export(env, page, 1);
+       RETURN(0);
+}
+
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_pool.c b/drivers/staging/lustre/lustre/lov/lov_pool.c
new file mode 100644 (file)
index 0000000..0f3f96d
--- /dev/null
@@ -0,0 +1,682 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pool.c
+ *
+ * OST pool methods
+ *
+ * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
+ * Author: Alex Lyashkov <Alexey.Lyashkov@Sun.COM>
+ * Author: Nathaniel Rutman <Nathan.Rutman@Sun.COM>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include "lov_internal.h"
+
+#define pool_tgt(_p, _i) \
+               _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]]
+
+static void lov_pool_getref(struct pool_desc *pool)
+{
+       CDEBUG(D_INFO, "pool %p\n", pool);
+       atomic_inc(&pool->pool_refcount);
+}
+
+void lov_pool_putref(struct pool_desc *pool)
+{
+       CDEBUG(D_INFO, "pool %p\n", pool);
+       if (atomic_dec_and_test(&pool->pool_refcount)) {
+               LASSERT(hlist_unhashed(&pool->pool_hash));
+               LASSERT(list_empty(&pool->pool_list));
+               LASSERT(pool->pool_proc_entry == NULL);
+               lov_ost_pool_free(&(pool->pool_rr.lqr_pool));
+               lov_ost_pool_free(&(pool->pool_obds));
+               OBD_FREE_PTR(pool);
+               EXIT;
+       }
+}
+
+void lov_pool_putref_locked(struct pool_desc *pool)
+{
+       CDEBUG(D_INFO, "pool %p\n", pool);
+       LASSERT(atomic_read(&pool->pool_refcount) > 1);
+
+       atomic_dec(&pool->pool_refcount);
+}
+
+/*
+ * hash function using a Rotating Hash algorithm
+ * Knuth, D. The Art of Computer Programming,
+ * Volume 3: Sorting and Searching,
+ * Chapter 6.4.
+ * Addison Wesley, 1973
+ */
+static __u32 pool_hashfn(cfs_hash_t *hash_body, const void *key, unsigned mask)
+{
+       int i;
+       __u32 result;
+       char *poolname;
+
+       result = 0;
+       poolname = (char *)key;
+       for (i = 0; i < LOV_MAXPOOLNAME; i++) {
+               if (poolname[i] == '\0')
+                       break;
+               result = (result << 4)^(result >> 28) ^  poolname[i];
+       }
+       return (result % mask);
+}
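+
+/* the loop above is a 32-bit rotate-left by four XORed with the next name
+ * byte: for the (made up) two character name "ab", result is first
+ * 'a' = 0x61 and then (0x61 << 4) ^ 0x62 = 0x672, which is finally reduced
+ * with "% mask". */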
+
+static void *pool_key(struct hlist_node *hnode)
+{
+       struct pool_desc *pool;
+
+       pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+       return (pool->pool_name);
+}
+
+static int pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode)
+{
+       char *pool_name;
+       struct pool_desc *pool;
+
+       pool_name = (char *)key;
+       pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash);
+       return !strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME);
+}
+
+static void *pool_hashobject(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct pool_desc, pool_hash);
+}
+
+static void pool_hashrefcount_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct pool_desc *pool;
+
+       pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+       lov_pool_getref(pool);
+}
+
+static void pool_hashrefcount_put_locked(cfs_hash_t *hs,
+                                        struct hlist_node *hnode)
+{
+       struct pool_desc *pool;
+
+       pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+       lov_pool_putref_locked(pool);
+}
+
+cfs_hash_ops_t pool_hash_operations = {
+       .hs_hash        = pool_hashfn,
+       .hs_key         = pool_key,
+       .hs_keycmp      = pool_hashkey_keycmp,
+       .hs_object      = pool_hashobject,
+       .hs_get         = pool_hashrefcount_get,
+       .hs_put_locked  = pool_hashrefcount_put_locked,
+};
+
+#ifdef LPROCFS
+/* ifdef needed for liblustre support */
+/*
+ * pool /proc seq_file methods
+ */
+/*
+ * The iterator is used to go through the target pool entries.
+ * idx is the current entry index in the pool's op_array[];
+ * idx >= pos is returned to the seq_file interface.
+ * pos runs from 0 to (pool->pool_obds.op_count - 1).
+ */
+#define POOL_IT_MAGIC 0xB001CEA0
+struct pool_iterator {
+       int magic;
+       struct pool_desc *pool;
+       int idx;        /* from 0 to pool_tgt_size - 1 */
+};
+
+static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct pool_iterator *iter = (struct pool_iterator *)s->private;
+       int prev_idx;
+
+       LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+
+       /* test if end of file */
+       if (*pos >= pool_tgt_count(iter->pool))
+               return NULL;
+
+       /* iterate to find a non-empty entry */
+       prev_idx = iter->idx;
+       down_read(&pool_tgt_rw_sem(iter->pool));
+       iter->idx++;
+       if (iter->idx == pool_tgt_count(iter->pool)) {
+               iter->idx = prev_idx; /* we stay on the last entry */
+               up_read(&pool_tgt_rw_sem(iter->pool));
+               return NULL;
+       }
+       up_read(&pool_tgt_rw_sem(iter->pool));
+       (*pos)++;
+       /* return != NULL to continue */
+       return iter;
+}
+
+static void *pool_proc_start(struct seq_file *s, loff_t *pos)
+{
+       struct pool_desc *pool = (struct pool_desc *)s->private;
+       struct pool_iterator *iter;
+
+       lov_pool_getref(pool);
+       if ((pool_tgt_count(pool) == 0) ||
+           (*pos >= pool_tgt_count(pool))) {
+               /* the iterator is not created, so stop() has no way
+                * to find the pool and drop its reference */
+               lov_pool_putref(pool);
+               return NULL;
+       }
+
+       OBD_ALLOC_PTR(iter);
+       if (!iter)
+               return ERR_PTR(-ENOMEM);
+       iter->magic = POOL_IT_MAGIC;
+       iter->pool = pool;
+       iter->idx = 0;
+
+       /* we use the seq_file private field to remember the iterator so
+        * we can free it at stop() */
+       /* /!\ do not forget to restore s->private to the pool before
+        * freeing the iterator */
+       s->private = iter;
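+       /* seq_file may call start() at a non-zero position (e.g. when a read
+        * is resumed); replay next() to advance the fresh iterator to the
+        * requested *pos */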
+       if (*pos > 0) {
+               loff_t i;
+               void *ptr;
+
+               i = 0;
+               do {
+                    ptr = pool_proc_next(s, &iter, &i);
+               } while ((i < *pos) && (ptr != NULL));
+               return ptr;
+       }
+       return iter;
+}
+
+static void pool_proc_stop(struct seq_file *s, void *v)
+{
+       struct pool_iterator *iter = (struct pool_iterator *)s->private;
+
+       /* in some cases the stop() method is called twice, without
+        * calling the start() method (see seq_read() in fs/seq_file.c),
+        * so only free if s->private is an iterator */
+       if ((iter) && (iter->magic == POOL_IT_MAGIC)) {
+               /* we restore s->private so next call to pool_proc_start()
+                * will work */
+               s->private = iter->pool;
+               lov_pool_putref(iter->pool);
+               OBD_FREE_PTR(iter);
+       }
+       return;
+}
+
+static int pool_proc_show(struct seq_file *s, void *v)
+{
+       struct pool_iterator *iter = (struct pool_iterator *)v;
+       struct lov_tgt_desc *tgt;
+
+       LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+       LASSERT(iter->pool != NULL);
+       LASSERT(iter->idx <= pool_tgt_count(iter->pool));
+
+       down_read(&pool_tgt_rw_sem(iter->pool));
+       tgt = pool_tgt(iter->pool, iter->idx);
+       up_read(&pool_tgt_rw_sem(iter->pool));
+       if (tgt)
+               seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid)));
+
+       return 0;
+}
+
+static struct seq_operations pool_proc_ops = {
+       .start    = pool_proc_start,
+       .next      = pool_proc_next,
+       .stop      = pool_proc_stop,
+       .show      = pool_proc_show,
+};
+
+static int pool_proc_open(struct inode *inode, struct file *file)
+{
+       int rc;
+
+       rc = seq_open(file, &pool_proc_ops);
+       if (!rc) {
+               struct seq_file *s = file->private_data;
+               s->private = PROC_I(inode)->pde->data;
+       }
+       return rc;
+}
+
+static struct file_operations pool_proc_operations = {
+       .open      = pool_proc_open,
+       .read      = seq_read,
+       .llseek  = seq_lseek,
+       .release        = seq_release,
+};
+#endif /* LPROCFS */
+
+void lov_dump_pool(int level, struct pool_desc *pool)
+{
+       int i;
+
+       lov_pool_getref(pool);
+
+       CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n",
+              pool->pool_name, pool->pool_obds.op_count);
+       down_read(&pool_tgt_rw_sem(pool));
+
+       for (i = 0; i < pool_tgt_count(pool) ; i++) {
+               if (!pool_tgt(pool, i) || !(pool_tgt(pool, i))->ltd_exp)
+                       continue;
+               CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n",
+                      pool->pool_name, i,
+                      obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid)));
+       }
+
+       up_read(&pool_tgt_rw_sem(pool));
+       lov_pool_putref(pool);
+}
+
+#define LOV_POOL_INIT_COUNT 2
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
+{
+       ENTRY;
+
+       if (count == 0)
+               count = LOV_POOL_INIT_COUNT;
+       op->op_array = NULL;
+       op->op_count = 0;
+       init_rwsem(&op->op_rw_sem);
+       op->op_size = count;
+       OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0]));
+       if (op->op_array == NULL) {
+               op->op_size = 0;
+               RETURN(-ENOMEM);
+       }
+       EXIT;
+       return 0;
+}
+
+/* Caller must hold op_rw_sem for writing */
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count)
+{
+       __u32 *new;
+       int new_size;
+
+       LASSERT(min_count != 0);
+
+       if (op->op_count < op->op_size)
+               return 0;
+
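+       /* grow the array to at least min_count entries, at minimum doubling
+        * the current size */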
+       new_size = max(min_count, 2 * op->op_size);
+       OBD_ALLOC(new, new_size * sizeof(op->op_array[0]));
+       if (new == NULL)
+               return -ENOMEM;
+
+       /* copy old array to new one */
+       memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0]));
+       OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+       op->op_array = new;
+       op->op_size = new_size;
+       return 0;
+}
+
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count)
+{
+       int rc = 0, i;
+       ENTRY;
+
+       down_write(&op->op_rw_sem);
+
+       rc = lov_ost_pool_extend(op, min_count);
+       if (rc)
+               GOTO(out, rc);
+
+       /* search ost in pool array */
+       for (i = 0; i < op->op_count; i++) {
+               if (op->op_array[i] == idx)
+                       GOTO(out, rc = -EEXIST);
+       }
+       /* OST not found, so add it */
+       op->op_array[op->op_count] = idx;
+       op->op_count++;
+       EXIT;
+out:
+       up_write(&op->op_rw_sem);
+       return rc;
+}
+
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
+{
+       int i;
+       ENTRY;
+
+       down_write(&op->op_rw_sem);
+
+       for (i = 0; i < op->op_count; i++) {
+               if (op->op_array[i] == idx) {
+                       memmove(&op->op_array[i], &op->op_array[i + 1],
+                               (op->op_count - i - 1) * sizeof(op->op_array[0]));
+                       op->op_count--;
+                       up_write(&op->op_rw_sem);
+                       EXIT;
+                       return 0;
+               }
+       }
+
+       up_write(&op->op_rw_sem);
+       RETURN(-EINVAL);
+}
+
+int lov_ost_pool_free(struct ost_pool *op)
+{
+       ENTRY;
+
+       if (op->op_size == 0)
+               RETURN(0);
+
+       down_write(&op->op_rw_sem);
+
+       OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+       op->op_array = NULL;
+       op->op_count = 0;
+       op->op_size = 0;
+
+       up_write(&op->op_rw_sem);
+       RETURN(0);
+}
+
+int lov_pool_new(struct obd_device *obd, char *poolname)
+{
+       struct lov_obd *lov;
+       struct pool_desc *new_pool;
+       int rc;
+       ENTRY;
+
+       lov = &(obd->u.lov);
+
+       if (strlen(poolname) > LOV_MAXPOOLNAME)
+               RETURN(-ENAMETOOLONG);
+
+       OBD_ALLOC_PTR(new_pool);
+       if (new_pool == NULL)
+               RETURN(-ENOMEM);
+
+       strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME);
+       new_pool->pool_name[LOV_MAXPOOLNAME] = '\0';
+       new_pool->pool_lobd = obd;
+       /* the refcount is initialized to 1 because a pool is always in use
+        * from its creation until its deletion
+        */
+       atomic_set(&new_pool->pool_refcount, 1);
+       rc = lov_ost_pool_init(&new_pool->pool_obds, 0);
+       if (rc)
+              GOTO(out_err, rc);
+
+       memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr));
+       rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
+       if (rc)
+               GOTO(out_free_pool_obds, rc);
+
+       INIT_HLIST_NODE(&new_pool->pool_hash);
+
+#ifdef LPROCFS
+       /* we need this ifdef because seq_file is not implemented
+        * for liblustre */
+       /* get ref for /proc file */
+       lov_pool_getref(new_pool);
+       new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry,
+                                                      poolname, NULL, NULL,
+                                                      new_pool,
+                                                      &pool_proc_operations);
+       if (IS_ERR(new_pool->pool_proc_entry)) {
+               CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname);
+               new_pool->pool_proc_entry = NULL;
+               lov_pool_putref(new_pool);
+       }
+       CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry);
+#endif
+
+       spin_lock(&obd->obd_dev_lock);
+       list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
+       lov->lov_pool_count++;
+       spin_unlock(&obd->obd_dev_lock);
+
+       /* make the pool findable via the hash only when it is fully ready */
+       rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname,
+                                &new_pool->pool_hash);
+       if (rc)
+               GOTO(out_err, rc = -EEXIST);
+
+       CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
+              poolname, lov->lov_pool_count);
+
+       RETURN(0);
+
+out_err:
+       spin_lock(&obd->obd_dev_lock);
+       list_del_init(&new_pool->pool_list);
+       lov->lov_pool_count--;
+       spin_unlock(&obd->obd_dev_lock);
+
+       lprocfs_remove(&new_pool->pool_proc_entry);
+
+       lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+out_free_pool_obds:
+       lov_ost_pool_free(&new_pool->pool_obds);
+       OBD_FREE_PTR(new_pool);
+       return rc;
+}
+
+int lov_pool_del(struct obd_device *obd, char *poolname)
+{
+       struct lov_obd *lov;
+       struct pool_desc *pool;
+       ENTRY;
+
+       lov = &(obd->u.lov);
+
+       /* lookup and kill hash reference */
+       pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname);
+       if (pool == NULL)
+               RETURN(-ENOENT);
+
+       if (pool->pool_proc_entry != NULL) {
+               CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry);
+               lprocfs_remove(&pool->pool_proc_entry);
+               lov_pool_putref(pool);
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       list_del_init(&pool->pool_list);
+       lov->lov_pool_count--;
+       spin_unlock(&obd->obd_dev_lock);
+
+       /* release last reference */
+       lov_pool_putref(pool);
+
+       RETURN(0);
+}
+
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+       struct obd_uuid ost_uuid;
+       struct lov_obd *lov;
+       struct pool_desc *pool;
+       unsigned int lov_idx;
+       int rc;
+       ENTRY;
+
+       lov = &(obd->u.lov);
+
+       pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+       if (pool == NULL)
+               RETURN(-ENOENT);
+
+       obd_str2uuid(&ost_uuid, ostname);
+
+
+       /* search ost in lov array */
+       obd_getref(obd);
+       for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+               if (!lov->lov_tgts[lov_idx])
+                       continue;
+               if (obd_uuid_equals(&ost_uuid,
+                                   &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+                       break;
+       }
+       /* test if ost found in lov */
+       if (lov_idx == lov->desc.ld_tgt_count)
+               GOTO(out, rc = -EINVAL);
+
+       rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size);
+       if (rc)
+               GOTO(out, rc);
+
+       pool->pool_rr.lqr_dirty = 1;
+
+       CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n",
+              ostname, poolname, pool_tgt_count(pool));
+
+       EXIT;
+out:
+       obd_putref(obd);
+       lov_pool_putref(pool);
+       return rc;
+}
+
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
+{
+       struct obd_uuid ost_uuid;
+       struct lov_obd *lov;
+       struct pool_desc *pool;
+       unsigned int lov_idx;
+       int rc = 0;
+       ENTRY;
+
+       lov = &(obd->u.lov);
+
+       pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+       if (pool == NULL)
+               RETURN(-ENOENT);
+
+       obd_str2uuid(&ost_uuid, ostname);
+
+       obd_getref(obd);
+       /* search ost in lov array, to get index */
+       for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+               if (!lov->lov_tgts[lov_idx])
+                       continue;
+
+               if (obd_uuid_equals(&ost_uuid,
+                                   &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+                       break;
+       }
+
+       /* test if ost found in lov */
+       if (lov_idx == lov->desc.ld_tgt_count)
+               GOTO(out, rc = -EINVAL);
+
+       lov_ost_pool_remove(&pool->pool_obds, lov_idx);
+
+       pool->pool_rr.lqr_dirty = 1;
+
+       CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname,
+              poolname);
+
+       EXIT;
+out:
+       obd_putref(obd);
+       lov_pool_putref(pool);
+       return rc;
+}
+
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool)
+{
+       int i, rc;
+       ENTRY;
+
+       /* the caller may not have a ref on the pool if it got the pool
+        * without calling lov_find_pool() (e.g. by going through the lov
+        * pool list)
+        */
+       lov_pool_getref(pool);
+
+       down_read(&pool_tgt_rw_sem(pool));
+
+       for (i = 0; i < pool_tgt_count(pool); i++) {
+               if (pool_tgt_array(pool)[i] == idx)
+                       GOTO(out, rc = 0);
+       }
+       rc = -ENOENT;
+       EXIT;
+out:
+       up_read(&pool_tgt_rw_sem(pool));
+
+       lov_pool_putref(pool);
+       return rc;
+}
+
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname)
+{
+       struct pool_desc *pool;
+
+       pool = NULL;
+       if (poolname[0] != '\0') {
+               pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+               if (pool == NULL)
+                       CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n",
+                             poolname);
+               if ((pool != NULL) && (pool_tgt_count(pool) == 0)) {
+                       CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n",
+                              poolname);
+                       /* pool is ignored, so we remove ref on it */
+                       lov_pool_putref(pool);
+                       pool = NULL;
+               }
+       }
+       return pool;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_request.c b/drivers/staging/lustre/lustre/lov/lov_request.c
new file mode 100644 (file)
index 0000000..13f1637
--- /dev/null
@@ -0,0 +1,1551 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_idl.h>
+
+#include "lov_internal.h"
+
+static void lov_init_set(struct lov_request_set *set)
+{
+       set->set_count = 0;
+       atomic_set(&set->set_completes, 0);
+       atomic_set(&set->set_success, 0);
+       atomic_set(&set->set_finish_checked, 0);
+       set->set_cookies = 0;
+       INIT_LIST_HEAD(&set->set_list);
+       atomic_set(&set->set_refcount, 1);
+       init_waitqueue_head(&set->set_waitq);
+       spin_lock_init(&set->set_lock);
+}
+
+void lov_finish_set(struct lov_request_set *set)
+{
+       struct list_head *pos, *n;
+       ENTRY;
+
+       LASSERT(set);
+       list_for_each_safe(pos, n, &set->set_list) {
+               struct lov_request *req = list_entry(pos,
+                                                        struct lov_request,
+                                                        rq_link);
+               list_del_init(&req->rq_link);
+
+               if (req->rq_oi.oi_oa)
+                       OBDO_FREE(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_md)
+                       OBD_FREE_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_osfs)
+                       OBD_FREE(req->rq_oi.oi_osfs,
+                                sizeof(*req->rq_oi.oi_osfs));
+               OBD_FREE(req, sizeof(*req));
+       }
+
+       if (set->set_pga) {
+               int len = set->set_oabufs * sizeof(*set->set_pga);
+               OBD_FREE_LARGE(set->set_pga, len);
+       }
+       if (set->set_lockh)
+               lov_llh_put(set->set_lockh);
+
+       OBD_FREE(set, sizeof(*set));
+       EXIT;
+}
+
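+/* Return 1 once every sub-request in the set has completed.  For an
+ * idempotent check any caller may see 1; otherwise set_finish_checked makes
+ * sure only the first caller that notices completion gets 1. */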
+int lov_set_finished(struct lov_request_set *set, int idempotent)
+{
+       int completes = atomic_read(&set->set_completes);
+
+       CDEBUG(D_INFO, "check set %d/%d\n", completes, set->set_count);
+
+       if (completes == set->set_count) {
+               if (idempotent)
+                       return 1;
+               if (atomic_inc_return(&set->set_finish_checked) == 1)
+                       return 1;
+       }
+       return 0;
+}
+
+void lov_update_set(struct lov_request_set *set,
+                   struct lov_request *req, int rc)
+{
+       req->rq_complete = 1;
+       req->rq_rc = rc;
+
+       atomic_inc(&set->set_completes);
+       if (rc == 0)
+               atomic_inc(&set->set_success);
+
+       wake_up(&set->set_waitq);
+}
+
+int lov_update_common_set(struct lov_request_set *set,
+                         struct lov_request *req, int rc)
+{
+       struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+       ENTRY;
+
+       lov_update_set(set, req, rc);
+
+       /* forgive the error on an inactive OST */
+       if (rc && !(lov->lov_tgts[req->rq_idx] &&
+                   lov->lov_tgts[req->rq_idx]->ltd_active))
+               rc = 0;
+
+       /* FIXME in raid1 regime, should return 0 */
+       RETURN(rc);
+}
+
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
+{
+       list_add_tail(&req->rq_link, &set->set_list);
+       set->set_count++;
+       req->rq_rqset = set;
+}
+
+static int lov_check_set(struct lov_obd *lov, int idx)
+{
+       int rc = 0;
+       mutex_lock(&lov->lov_lock);
+
+       if (lov->lov_tgts[idx] == NULL ||
+           lov->lov_tgts[idx]->ltd_active ||
+           (lov->lov_tgts[idx]->ltd_exp != NULL &&
+            class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried))
+               rc = 1;
+
+       mutex_unlock(&lov->lov_lock);
+       return rc;
+}
+
+/* Check if the OSC connection exists and is active.
+ * If the OSC has not yet had a chance to connect to the OST the first time,
+ * wait once for it to connect instead of returning an error.
+ */
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx)
+{
+       wait_queue_head_t waitq;
+       struct l_wait_info lwi;
+       struct lov_tgt_desc *tgt;
+       int rc = 0;
+
+       mutex_lock(&lov->lov_lock);
+
+       tgt = lov->lov_tgts[ost_idx];
+
+       if (unlikely(tgt == NULL))
+               GOTO(out, rc = 0);
+
+       if (likely(tgt->ltd_active))
+               GOTO(out, rc = 1);
+
+       if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried)
+               GOTO(out, rc = 0);
+
+       mutex_unlock(&lov->lov_lock);
+
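+       /* the OSC has not completed its first connect attempt yet: poll
+        * lov_check_set() once per second, for at most obd_timeout seconds,
+        * until the target becomes active or the connect attempt finishes */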
+       init_waitqueue_head(&waitq);
+       lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout),
+                                  cfs_time_seconds(1), NULL, NULL);
+
+       rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi);
+       if (tgt != NULL && tgt->ltd_active)
+               return 1;
+
+       return 0;
+
+out:
+       mutex_unlock(&lov->lov_lock);
+       return rc;
+}
+
+extern void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+                              struct lov_oinfo *loi, int flags,
+                              struct ost_lvb *lvb, __u32 mode, int rc);
+
+static int lov_update_enqueue_lov(struct obd_export *exp,
+                                 struct lustre_handle *lov_lockhp,
+                                 struct lov_oinfo *loi, int flags, int idx,
+                                 struct ost_id *oi, int rc)
+{
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+
+       if (rc != ELDLM_OK &&
+           !(rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT))) {
+               memset(lov_lockhp, 0, sizeof(*lov_lockhp));
+               if (lov->lov_tgts[idx] && lov->lov_tgts[idx]->ltd_active) {
+                       /* -EUSERS used by OST to report file contention */
+                       if (rc != -EINTR && rc != -EUSERS)
+                               CERROR("%s: enqueue objid "DOSTID" subobj"
+                                      DOSTID" on OST idx %d: rc %d\n",
+                                      exp->exp_obd->obd_name,
+                                      POSTID(oi), POSTID(&loi->loi_oi),
+                                      loi->loi_ost_idx, rc);
+               } else
+                       rc = ELDLM_OK;
+       }
+       return rc;
+}
+
+int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc)
+{
+       struct lov_request_set *set = req->rq_rqset;
+       struct lustre_handle *lov_lockhp;
+       struct obd_info *oi = set->set_oi;
+       struct lov_oinfo *loi;
+       ENTRY;
+
+       LASSERT(oi != NULL);
+
+       lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+       loi = oi->oi_md->lsm_oinfo[req->rq_stripe];
+
+       /* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set
+        * and that copy can be arbitrarily out of date.
+        *
+        * The LOV API is due for a serious rewrite anyway, and this
+        * can be addressed then. */
+
+       lov_stripe_lock(oi->oi_md);
+       osc_update_enqueue(lov_lockhp, loi, oi->oi_flags,
+                          &req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb, mode, rc);
+       if (rc == ELDLM_LOCK_ABORTED && (oi->oi_flags & LDLM_FL_HAS_INTENT))
+               memset(lov_lockhp, 0, sizeof *lov_lockhp);
+       rc = lov_update_enqueue_lov(set->set_exp, lov_lockhp, loi, oi->oi_flags,
+                                   req->rq_idx, &oi->oi_md->lsm_oi, rc);
+       lov_stripe_unlock(oi->oi_md);
+       lov_update_set(set, req, rc);
+       RETURN(rc);
+}
+
+/* The callback for osc_enqueue that updates lov info for every OSC request. */
+static int cb_update_enqueue(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct ldlm_enqueue_info *einfo;
+       struct lov_request *lovreq;
+
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       einfo = lovreq->rq_rqset->set_ei;
+       return lov_update_enqueue_set(lovreq, einfo->ei_mode, rc);
+}
+
+static int enqueue_done(struct lov_request_set *set, __u32 mode)
+{
+       struct lov_request *req;
+       struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+       int completes = atomic_read(&set->set_completes);
+       int rc = 0;
+       ENTRY;
+
+       /* enqueue/match success, just return */
+       if (completes && completes == atomic_read(&set->set_success))
+               RETURN(0);
+
+       /* cancel enqueued/matched locks */
+       list_for_each_entry(req, &set->set_list, rq_link) {
+               struct lustre_handle *lov_lockhp;
+
+               if (!req->rq_complete || req->rq_rc)
+                       continue;
+
+               lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+               LASSERT(lov_lockhp);
+               if (!lustre_handle_is_used(lov_lockhp))
+                       continue;
+
+               rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                               req->rq_oi.oi_md, mode, lov_lockhp);
+               if (rc && lov->lov_tgts[req->rq_idx] &&
+                   lov->lov_tgts[req->rq_idx]->ltd_active)
+                       CERROR("%s: cancelling objid "DOSTID" on OST "
+                              "idx %d error: rc = %d\n",
+                              set->set_exp->exp_obd->obd_name,
+                              POSTID(&req->rq_oi.oi_md->lsm_oi),
+                              req->rq_idx, rc);
+       }
+       if (set->set_lockh)
+               lov_llh_put(set->set_lockh);
+       RETURN(rc);
+}
+
+int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc,
+                        struct ptlrpc_request_set *rqset)
+{
+       int ret = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       /* Do enqueue_done only for sync requests and if any request
+        * succeeded. */
+       if (!rqset) {
+               if (rc)
+                       atomic_set(&set->set_completes, 0);
+               ret = enqueue_done(set, mode);
+       } else if (set->set_lockh)
+               lov_llh_put(set->set_lockh);
+
+       lov_put_reqset(set);
+
+       RETURN(rc ? rc : ret);
+}
+
+static void lov_llh_addref(void *llhp)
+{
+       struct lov_lock_handles *llh = llhp;
+
+       atomic_inc(&llh->llh_refcount);
+       CDEBUG(D_INFO, "GETting llh %p : new refcount %d\n", llh,
+              atomic_read(&llh->llh_refcount));
+}
+
+static struct portals_handle_ops lov_handle_ops = {
+       .hop_addref = lov_llh_addref,
+       .hop_free   = NULL,
+};
+
+static struct lov_lock_handles *lov_llh_new(struct lov_stripe_md *lsm)
+{
+       struct lov_lock_handles *llh;
+
+       OBD_ALLOC(llh, sizeof *llh +
+                 sizeof(*llh->llh_handles) * lsm->lsm_stripe_count);
+       if (llh == NULL)
+               return NULL;
+
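+       /* two references: one is dropped in the lov_fini_*_set() path (either
+        * directly or via enqueue_done()), the other by lov_finish_set() when
+        * the request set itself is freed */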
+       atomic_set(&llh->llh_refcount, 2);
+       llh->llh_stripe_count = lsm->lsm_stripe_count;
+       INIT_LIST_HEAD(&llh->llh_handle.h_link);
+       class_handle_hash(&llh->llh_handle, &lov_handle_ops);
+
+       return llh;
+}
+
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct ldlm_enqueue_info *einfo,
+                        struct lov_request_set **reqset)
+{
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       struct lov_request_set *set;
+       int i, rc = 0;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+       set->set_ei = einfo;
+       set->set_lockh = lov_llh_new(oinfo->oi_md);
+       if (set->set_lockh == NULL)
+               GOTO(out_set, rc = -ENOMEM);
+       oinfo->oi_lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi;
+               struct lov_request *req;
+               obd_off start, end;
+
+               loi = oinfo->oi_md->lsm_oinfo[i];
+               if (!lov_stripe_intersects(oinfo->oi_md, i,
+                                          oinfo->oi_policy.l_extent.start,
+                                          oinfo->oi_policy.l_extent.end,
+                                          &start, &end))
+                       continue;
+
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
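+               /* the sub-request gets a private single-stripe lsm: one
+                * buffer holds the lov_stripe_md, the lsm_oinfo[0] pointer
+                * slot and the lov_oinfo it points to */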
+               req->rq_buflen = sizeof(*req->rq_oi.oi_md) +
+                       sizeof(struct lov_oinfo *) +
+                       sizeof(struct lov_oinfo);
+               OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_md == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               req->rq_oi.oi_md->lsm_oinfo[0] =
+                       ((void *)req->rq_oi.oi_md) + sizeof(*req->rq_oi.oi_md) +
+                       sizeof(struct lov_oinfo *);
+
+               /* Set lov request specific parameters. */
+               req->rq_oi.oi_lockh = set->set_lockh->llh_handles + i;
+               req->rq_oi.oi_cb_up = cb_update_enqueue;
+               req->rq_oi.oi_flags = oinfo->oi_flags;
+
+               LASSERT(req->rq_oi.oi_lockh);
+
+               req->rq_oi.oi_policy.l_extent.gid =
+                       oinfo->oi_policy.l_extent.gid;
+               req->rq_oi.oi_policy.l_extent.start = start;
+               req->rq_oi.oi_policy.l_extent.end = end;
+
+               req->rq_idx = loi->loi_ost_idx;
+               req->rq_stripe = i;
+
+               /* XXX LOV STACKING: submd should be from the subobj */
+               req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+               req->rq_oi.oi_md->lsm_stripe_count = 0;
+               req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms_valid =
+                       loi->loi_kms_valid;
+               req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms = loi->loi_kms;
+               req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb = loi->loi_lvb;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(0);
+out_set:
+       lov_fini_enqueue_set(set, einfo->ei_mode, rc, NULL);
+       RETURN(rc);
+}
+
+int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       rc = enqueue_done(set, mode);
+       if ((set->set_count == atomic_read(&set->set_success)) &&
+           (flags & LDLM_FL_TEST_LOCK))
+               lov_llh_put(set->set_lockh);
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+                      struct lov_stripe_md *lsm, ldlm_policy_data_t *policy,
+                      __u32 mode, struct lustre_handle *lockh,
+                      struct lov_request_set **reqset)
+{
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       struct lov_request_set *set;
+       int i, rc = 0;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+       set->set_oi->oi_md = lsm;
+       set->set_lockh = lov_llh_new(lsm);
+       if (set->set_lockh == NULL)
+               GOTO(out_set, rc = -ENOMEM);
+       lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi;
+               struct lov_request *req;
+               obd_off start, end;
+
+               loi = lsm->lsm_oinfo[i];
+               if (!lov_stripe_intersects(lsm, i, policy->l_extent.start,
+                                          policy->l_extent.end, &start, &end))
+                       continue;
+
+               /* FIXME raid1 should forgive this error */
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       GOTO(out_set, rc = -EIO);
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+               OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_md == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+
+               req->rq_oi.oi_policy.l_extent.start = start;
+               req->rq_oi.oi_policy.l_extent.end = end;
+               req->rq_oi.oi_policy.l_extent.gid = policy->l_extent.gid;
+
+               req->rq_idx = loi->loi_ost_idx;
+               req->rq_stripe = i;
+
+               /* XXX LOV STACKING: submd should be from the subobj */
+               req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+               req->rq_oi.oi_md->lsm_stripe_count = 0;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_match_set(set, mode, 0);
+       RETURN(rc);
+}
+
+int lov_fini_cancel_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+
+       LASSERT(set->set_exp);
+       if (set->set_lockh)
+               lov_llh_put(set->set_lockh);
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+                       struct lov_stripe_md *lsm, __u32 mode,
+                       struct lustre_handle *lockh,
+                       struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       int i, rc = 0;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+       set->set_oi->oi_md = lsm;
+       set->set_lockh = lov_handle2llh(lockh);
+       if (set->set_lockh == NULL) {
+               CERROR("LOV: invalid lov lock handle %p\n", lockh);
+               GOTO(out_set, rc = -EINVAL);
+       }
+       lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_request *req;
+               struct lustre_handle *lov_lockhp;
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+               lov_lockhp = set->set_lockh->llh_handles + i;
+               if (!lustre_handle_is_used(lov_lockhp)) {
+                       CDEBUG(D_INFO, "lov idx %d subobj "DOSTID" no lock\n",
+                              loi->loi_ost_idx, POSTID(&loi->loi_oi));
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+               OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_md == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+
+               req->rq_idx = loi->loi_ost_idx;
+               req->rq_stripe = i;
+
+               /* XXX LOV STACKING: submd should be from the subobj */
+               req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+               req->rq_oi.oi_md->lsm_stripe_count = 0;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_cancel_set(set);
+       RETURN(rc);
+}
+
+static int common_attr_done(struct lov_request_set *set)
+{
+       struct list_head *pos;
+       struct lov_request *req;
+       struct obdo *tmp_oa;
+       int rc = 0, attrset = 0;
+       ENTRY;
+
+       LASSERT(set->set_oi != NULL);
+
+       if (set->set_oi->oi_oa == NULL)
+               RETURN(0);
+
+       if (!atomic_read(&set->set_success))
+               RETURN(-EIO);
+
+       OBDO_ALLOC(tmp_oa);
+       if (tmp_oa == NULL)
+               GOTO(out, rc = -ENOMEM);
+
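+       /* merge the per-stripe attributes of each successful sub-request
+        * into tmp_oa, then copy the merged result back into the caller's
+        * oi_oa below */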
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               if (!req->rq_complete || req->rq_rc)
+                       continue;
+               if (req->rq_oi.oi_oa->o_valid == 0)   /* inactive stripe */
+                       continue;
+               lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa,
+                               req->rq_oi.oi_oa->o_valid,
+                               set->set_oi->oi_md, req->rq_stripe, &attrset);
+       }
+       if (!attrset) {
+               CERROR("No stripes had valid attrs\n");
+               rc = -EIO;
+       }
+       if ((set->set_oi->oi_oa->o_valid & OBD_MD_FLEPOCH) &&
+           (set->set_oi->oi_md->lsm_stripe_count != attrset)) {
+               /* When we take the attributes of some epoch, we require all
+                * the OSTs to be active. */
+               CERROR("Not all the stripes had valid attrs\n");
+               GOTO(out, rc = -EIO);
+       }
+
+       tmp_oa->o_oi = set->set_oi->oi_oa->o_oi;
+       memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa));
+out:
+       if (tmp_oa)
+               OBDO_FREE(tmp_oa);
+       RETURN(rc);
+}
+
+static int brw_done(struct lov_request_set *set)
+{
+       struct lov_stripe_md *lsm = set->set_oi->oi_md;
+       struct lov_oinfo     *loi = NULL;
+       struct list_head *pos;
+       struct lov_request *req;
+       ENTRY;
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               if (!req->rq_complete || req->rq_rc)
+                       continue;
+
+               loi = lsm->lsm_oinfo[req->rq_stripe];
+
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS)
+                       loi->loi_lvb.lvb_blocks = req->rq_oi.oi_oa->o_blocks;
+       }
+
+       RETURN(0);
+}
+
+int lov_fini_brw_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               rc = brw_done(set);
+               /* FIXME update qos data here */
+       }
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+                    obd_count oa_bufs, struct brw_page *pga,
+                    struct obd_trans_info *oti,
+                    struct lov_request_set **reqset)
+{
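+       /* per-stripe bookkeeping: 'count' pages belong to the stripe, the
+        * stripe's slice of set_pga starts at 'index', and 'off' counts how
+        * many of its pages have been placed so far */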
+       struct {
+               obd_count       index;
+               obd_count       count;
+               obd_count       off;
+       } *info = NULL;
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i, shift;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oti = oti;
+       set->set_oi = oinfo;
+       set->set_oabufs = oa_bufs;
+       OBD_ALLOC_LARGE(set->set_pga, oa_bufs * sizeof(*set->set_pga));
+       if (!set->set_pga)
+               GOTO(out, rc = -ENOMEM);
+
+       OBD_ALLOC_LARGE(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
+       if (!info)
+               GOTO(out, rc = -ENOMEM);
+
+       /* calculate the page count for each stripe */
+       for (i = 0; i < oa_bufs; i++) {
+               int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
+               info[stripe].count++;
+       }
+
+       /* allocate and initialize the lov requests */
+       shift = 0;
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = NULL;
+               struct lov_request *req;
+
+               if (info[i].count == 0)
+                       continue;
+
+               loi = oinfo->oi_md->lsm_oinfo[i];
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       GOTO(out, rc = -EIO);
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out, rc = -ENOMEM);
+               }
+
+               if (oinfo->oi_oa) {
+                       memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                              sizeof(*req->rq_oi.oi_oa));
+               }
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_oa->o_stripe_idx = i;
+
+               req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+               OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_md == NULL) {
+                       OBDO_FREE(req->rq_oi.oi_oa);
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out, rc = -ENOMEM);
+               }
+
+               req->rq_idx = loi->loi_ost_idx;
+               req->rq_stripe = i;
+
+               /* XXX LOV STACKING */
+               req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+               req->rq_oabufs = info[i].count;
+               req->rq_pgaidx = shift;
+               shift += req->rq_oabufs;
+
+               /* remember the index for sorting the brw_page array */
+               info[i].index = req->rq_pgaidx;
+
+               req->rq_oi.oi_capa = oinfo->oi_capa;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out, rc = -EIO);
+
+       /* rotate & sort the brw_page array: the pages of each stripe are
+        * copied into a contiguous slice starting at that stripe's index,
+        * and their file offsets are converted to per-stripe offsets */
+       for (i = 0; i < oa_bufs; i++) {
+               int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
+
+               shift = info[stripe].index + info[stripe].off;
+               LASSERT(shift < oa_bufs);
+               set->set_pga[shift] = pga[i];
+               lov_stripe_offset(oinfo->oi_md, pga[i].off, stripe,
+                                 &set->set_pga[shift].off);
+               info[stripe].off++;
+       }
+out:
+       if (info)
+               OBD_FREE_LARGE(info,
+                              sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
+
+       if (rc == 0)
+               *reqset = set;
+       else
+               lov_fini_brw_set(set);
+
+       RETURN(rc);
+}
+
+int lov_fini_getattr_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes))
+               rc = common_attr_done(set);
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+/* The callback for osc_getattr_async that finalizes a request info when a
+ * response is received. */
+static int cb_getattr_update(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi;
+               struct lov_request *req;
+
+               loi = oinfo->oi_md->lsm_oinfo[i];
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH)
+                               /* SOM requires all the OSTs to be active. */
+                               GOTO(out_set, rc = -EIO);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                      sizeof(*req->rq_oi.oi_oa));
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_cb_up = cb_getattr_update;
+               req->rq_oi.oi_capa = oinfo->oi_capa;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_getattr_set(set);
+       RETURN(rc);
+}
+
+int lov_fini_destroy_set(struct lov_request_set *set)
+{
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               /* FIXME update qos data here */
+       }
+
+       lov_put_reqset(set);
+
+       RETURN(0);
+}
+
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct obdo *src_oa, struct lov_stripe_md *lsm,
+                        struct obd_trans_info *oti,
+                        struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+       set->set_oi->oi_md = lsm;
+       set->set_oi->oi_oa = src_oa;
+       set->set_oti = oti;
+       if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE)
+               set->set_cookies = oti->oti_logcookies;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi;
+               struct lov_request *req;
+
+               loi = lsm->lsm_oinfo[i];
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_destroy_set(set);
+       RETURN(rc);
+}
+
+int lov_fini_setattr_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               rc = common_attr_done(set);
+               /* FIXME update qos data here */
+       }
+
+       lov_put_reqset(set);
+       RETURN(rc);
+}
+
+int lov_update_setattr_set(struct lov_request_set *set,
+                          struct lov_request *req, int rc)
+{
+       struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+       struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+       ENTRY;
+
+       lov_update_set(set, req, rc);
+
+       /* forgive the error on an inactive OST */
+       if (rc && !(lov->lov_tgts[req->rq_idx] &&
+                   lov->lov_tgts[req->rq_idx]->ltd_active))
+               rc = 0;
+
+       if (rc == 0) {
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME)
+                       lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime =
+                               req->rq_oi.oi_oa->o_ctime;
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME)
+                       lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime =
+                               req->rq_oi.oi_oa->o_mtime;
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME)
+                       lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime =
+                               req->rq_oi.oi_oa->o_atime;
+       }
+
+       RETURN(rc);
+}
+
+/* The callback for osc_setattr_async that finalizes a request info when a
+ * response is received. */
+static int cb_setattr_update(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct obd_trans_info *oti,
+                        struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oti = oti;
+       set->set_oi = oinfo;
+       if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+               set->set_cookies = oti->oti_logcookies;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+               struct lov_request *req;
+
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                      sizeof(*req->rq_oi.oi_oa));
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_oa->o_stripe_idx = i;
+               req->rq_oi.oi_cb_up = cb_setattr_update;
+               req->rq_oi.oi_capa = oinfo->oi_capa;
+
+               if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) {
+                       int off = lov_stripe_offset(oinfo->oi_md,
+                                                   oinfo->oi_oa->o_size, i,
+                                                   &req->rq_oi.oi_oa->o_size);
+
+                       if (off < 0 && req->rq_oi.oi_oa->o_size)
+                               req->rq_oi.oi_oa->o_size--;
+
+                       CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
+                              i, req->rq_oi.oi_oa->o_size,
+                              oinfo->oi_oa->o_size);
+               }
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_setattr_set(set);
+       RETURN(rc);
+}
+
+int lov_fini_punch_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               rc = -EIO;
+               /* FIXME update qos data here */
+               if (atomic_read(&set->set_success))
+                       rc = common_attr_done(set);
+       }
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+int lov_update_punch_set(struct lov_request_set *set,
+                        struct lov_request *req, int rc)
+{
+       struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+       struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+       ENTRY;
+
+       lov_update_set(set, req, rc);
+
+       /* forgive the error on an inactive OST */
+       if (rc && !lov->lov_tgts[req->rq_idx]->ltd_active)
+               rc = 0;
+
+       if (rc == 0) {
+               lov_stripe_lock(lsm);
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS) {
+                       lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_blocks =
+                               req->rq_oi.oi_oa->o_blocks;
+               }
+
+               lov_stripe_unlock(lsm);
+       }
+
+       RETURN(rc);
+}
+
+/* The callback for osc_punch that finalizes a request info when a response
+ * is received. */
+static int cb_update_punch(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       return lov_update_punch_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+                      struct obd_trans_info *oti,
+                      struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_oi = oinfo;
+       set->set_exp = exp;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+               struct lov_request *req;
+               obd_off rs, re;
+
+               if (!lov_stripe_intersects(oinfo->oi_md, i,
+                                          oinfo->oi_policy.l_extent.start,
+                                          oinfo->oi_policy.l_extent.end,
+                                          &rs, &re))
+                       continue;
+
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       GOTO(out_set, rc = -EIO);
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                      sizeof(*req->rq_oi.oi_oa));
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_oa->o_valid |= OBD_MD_FLGROUP;
+
+               req->rq_oi.oi_oa->o_stripe_idx = i;
+               req->rq_oi.oi_cb_up = cb_update_punch;
+
+               req->rq_oi.oi_policy.l_extent.start = rs;
+               req->rq_oi.oi_policy.l_extent.end = re;
+               req->rq_oi.oi_policy.l_extent.gid = -1;
+
+               req->rq_oi.oi_capa = oinfo->oi_capa;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_punch_set(set);
+       RETURN(rc);
+}
+
+int lov_fini_sync_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               if (!atomic_read(&set->set_success))
+                       rc = -EIO;
+               /* FIXME update qos data here */
+       }
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+/* The callback for osc_sync that finalizes a request info when a
+ * response is received. */
+static int cb_sync_update(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
+                     obd_off start, obd_off end,
+                     struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC_PTR(set);
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+               struct lov_request *req;
+               obd_off rs, re;
+
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               if (!lov_stripe_intersects(oinfo->oi_md, i, start, end, &rs,
+                                          &re))
+                       continue;
+
+               OBD_ALLOC_PTR(req);
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               *req->rq_oi.oi_oa = *oinfo->oi_oa;
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_oa->o_stripe_idx = i;
+
+               req->rq_oi.oi_policy.l_extent.start = rs;
+               req->rq_oi.oi_policy.l_extent.end = re;
+               req->rq_oi.oi_policy.l_extent.gid = -1;
+               req->rq_oi.oi_cb_up = cb_sync_update;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_sync_set(set);
+       RETURN(rc);
+}
+
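+/* Saturating addition: if adding "add" to "tot" would wrap past 64 bits,
+ * clamp the total at LOV_U64_MAX instead of overflowing. */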
+#define LOV_U64_MAX ((__u64)~0ULL)
+#define LOV_SUM_MAX(tot, add)                                     \
+       do {                                                        \
+               if ((tot) + (add) < (tot))                            \
+                       (tot) = LOV_U64_MAX;                        \
+               else                                                \
+                       (tot) += (add);                          \
+       } while(0)
+
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+                    int success)
+{
+       ENTRY;
+
+       if (success) {
+               __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov,
+                                                          LOV_MAGIC, 0);
+               if (osfs->os_files != LOV_U64_MAX)
+                       lov_do_div64(osfs->os_files, expected_stripes);
+               if (osfs->os_ffree != LOV_U64_MAX)
+                       lov_do_div64(osfs->os_ffree, expected_stripes);
+
+               spin_lock(&obd->obd_osfs_lock);
+               memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
+               obd->obd_osfs_age = cfs_time_current_64();
+               spin_unlock(&obd->obd_osfs_lock);
+               RETURN(0);
+       }
+
+       RETURN(-EIO);
+}
+
+int lov_fini_statfs_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+
+       if (atomic_read(&set->set_completes)) {
+               rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs,
+                                    atomic_read(&set->set_success));
+       }
+       lov_put_reqset(set);
+       RETURN(rc);
+}
+
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+                      int success)
+{
+       int shift = 0, quit = 0;
+       __u64 tmp;
+
+       if (success == 0) {
+               memcpy(osfs, lov_sfs, sizeof(*lov_sfs));
+       } else {
+               if (osfs->os_bsize != lov_sfs->os_bsize) {
+                       /* assume all block sizes are always powers of 2 */
+                       /* get the bits difference */
+                       tmp = osfs->os_bsize | lov_sfs->os_bsize;
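+                       /* Both sizes are powers of 2 and differ, so the OR
+                        * above has exactly two bits set; this loop measures
+                        * the distance between them, i.e. how far the block
+                        * counts must be shifted to convert between sizes. */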
+                       for (shift = 0; shift <= 64; ++shift) {
+                               if (tmp & 1) {
+                                       if (quit)
+                                               break;
+                                       else
+                                               quit = 1;
+                                       shift = 0;
+                               }
+                               tmp >>= 1;
+                       }
+               }
+
+               if (osfs->os_bsize < lov_sfs->os_bsize) {
+                       osfs->os_bsize = lov_sfs->os_bsize;
+
+                       osfs->os_bfree  >>= shift;
+                       osfs->os_bavail >>= shift;
+                       osfs->os_blocks >>= shift;
+               } else if (shift != 0) {
+                       lov_sfs->os_bfree  >>= shift;
+                       lov_sfs->os_bavail >>= shift;
+                       lov_sfs->os_blocks >>= shift;
+               }
+               osfs->os_bfree += lov_sfs->os_bfree;
+               osfs->os_bavail += lov_sfs->os_bavail;
+               osfs->os_blocks += lov_sfs->os_blocks;
+               /* XXX not sure about this one - depends on policy.
+                *   - could be minimum if we always stripe on all OBDs
+                *     (but that would be wrong for any other policy,
+                *     if one of the OBDs has no more objects left)
+                *   - could be sum if we stripe whole objects
+                *   - could be average, just to give a nice number
+                *
+                * To give a "reasonable" (if not wholly accurate)
+                * number, we divide the total number of free objects
+                * by expected stripe count (watch out for overflow).
+                */
+               LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files);
+               LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree);
+       }
+}
+
+/* The callback for osc_statfs_async that finalizes a request info when a
+ * response is received. */
+static int cb_statfs_update(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+       struct lov_request_set *set;
+       struct obd_statfs *osfs, *lov_sfs;
+       struct lov_obd *lov;
+       struct lov_tgt_desc *tgt;
+       struct obd_device *lovobd, *tgtobd;
+       int success;
+       ENTRY;
+
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       set = lovreq->rq_rqset;
+       lovobd = set->set_obd;
+       lov = &lovobd->u.lov;
+       osfs = set->set_oi->oi_osfs;
+       lov_sfs = oinfo->oi_osfs;
+       success = atomic_read(&set->set_success);
+       /* XXX: the same is done in lov_update_common_set, however
+          lovset->set_exp is not initialized. */
+       lov_update_set(set, lovreq, rc);
+       if (rc)
+               GOTO(out, rc);
+
+       obd_getref(lovobd);
+       tgt = lov->lov_tgts[lovreq->rq_idx];
+       if (!tgt || !tgt->ltd_active)
+               GOTO(out_update, rc);
+
+       tgtobd = class_exp2obd(tgt->ltd_exp);
+       spin_lock(&tgtobd->obd_osfs_lock);
+       memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
+       if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0)
+               tgtobd->obd_osfs_age = cfs_time_current_64();
+       spin_unlock(&tgtobd->obd_osfs_lock);
+
+out_update:
+       lov_update_statfs(osfs, lov_sfs, success);
+       obd_putref(lovobd);
+
+out:
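+       /* For statfs requests driven by ptlrpcd (OBD_STATFS_PTLRPCD), run
+        * the set interpreter once the last sub-request has been counted. */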
+       if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD &&
+           lov_set_finished(set, 0)) {
+               lov_statfs_interpret(NULL, set, set->set_count !=
+                                    atomic_read(&set->set_success));
+       }
+
+       RETURN(0);
+}
+
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+                       struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_obd = obd;
+       set->set_oi = oinfo;
+
+       /* We only get block data from the OBD */
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               struct lov_request *req;
+
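+               /* Skip unconfigured slots; also skip targets that are not
+                * active when the caller asked not to be delayed
+                * (OBD_STATFS_NODELAY). */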
+               if (lov->lov_tgts[i] == NULL ||
+                   (!lov_check_and_wait_active(lov, i) &&
+                    (oinfo->oi_flags & OBD_STATFS_NODELAY))) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", i);
+                       continue;
+               }
+
+               /* skip targets that have been explicitly disabled by the
+                * administrator */
+               if (!lov->lov_tgts[i]->ltd_exp) {
+                       CDEBUG(D_HA, "lov idx %d administratively disabled\n", i);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs));
+               if (req->rq_oi.oi_osfs == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+
+               req->rq_idx = i;
+               req->rq_oi.oi_cb_up = cb_statfs_update;
+               req->rq_oi.oi_flags = oinfo->oi_flags;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_statfs_set(set);
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/drivers/staging/lustre/lustre/lov/lovsub_dev.c
new file mode 100644 (file)
index 0000000..204ecd0
--- /dev/null
@@ -0,0 +1,211 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub transfer operations.
+ *
+ */
+
+static void lovsub_req_completion(const struct lu_env *env,
+                                 const struct cl_req_slice *slice, int ioret)
+{
+       struct lovsub_req *lsr;
+
+       ENTRY;
+       lsr = cl2lovsub_req(slice);
+       OBD_SLAB_FREE_PTR(lsr, lovsub_req_kmem);
+       EXIT;
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for the lovsub
+ * layer. Lov and lovsub are responsible only for the struct
+ * obdo::o_stripe_idx field, which is filled in here.
+ */
+static void lovsub_req_attr_set(const struct lu_env *env,
+                               const struct cl_req_slice *slice,
+                               const struct cl_object *obj,
+                               struct cl_req_attr *attr, obd_valid flags)
+{
+       struct lovsub_object *subobj;
+
+       ENTRY;
+       subobj = cl2lovsub(obj);
+       /*
+        * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it
+        * unconditionally. It never changes anyway.
+        */
+       attr->cra_oa->o_stripe_idx = subobj->lso_index;
+       EXIT;
+}
+
+static const struct cl_req_operations lovsub_req_ops = {
+       .cro_attr_set   = lovsub_req_attr_set,
+       .cro_completion = lovsub_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov-sub device and device type functions.
+ *
+ */
+
+static int lovsub_device_init(const struct lu_env *env, struct lu_device *d,
+                             const char *name, struct lu_device *next)
+{
+       struct lovsub_device  *lsd = lu2lovsub_dev(d);
+       struct lu_device_type *ldt;
+       int rc;
+
+       ENTRY;
+       next->ld_site = d->ld_site;
+       ldt = next->ld_type;
+       LASSERT(ldt != NULL);
+       rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL);
+       if (rc) {
+               next->ld_site = NULL;
+               RETURN(rc);
+       }
+
+       lu_device_get(next);
+       lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+       lsd->acid_next = lu2cl_dev(next);
+       RETURN(rc);
+}
+
+static struct lu_device *lovsub_device_fini(const struct lu_env *env,
+                                           struct lu_device *d)
+{
+       struct lu_device *next;
+       struct lovsub_device *lsd;
+
+       ENTRY;
+       lsd = lu2lovsub_dev(d);
+       next = cl2lu_dev(lsd->acid_next);
+       lsd->acid_super = NULL;
+       lsd->acid_next = NULL;
+       RETURN(next);
+}
+
+static struct lu_device *lovsub_device_free(const struct lu_env *env,
+                                           struct lu_device *d)
+{
+       struct lovsub_device *lsd  = lu2lovsub_dev(d);
+       struct lu_device     *next = cl2lu_dev(lsd->acid_next);
+
+       cl_device_fini(lu2cl_dev(d));
+       OBD_FREE_PTR(lsd);
+       return next;
+}
+
+static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev,
+                          struct cl_req *req)
+{
+       struct lovsub_req *lsr;
+       int result;
+
+       OBD_SLAB_ALLOC_PTR_GFP(lsr, lovsub_req_kmem, __GFP_IO);
+       if (lsr != NULL) {
+               cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+static const struct lu_device_operations lovsub_lu_ops = {
+       .ldo_object_alloc      = lovsub_object_alloc,
+       .ldo_process_config    = NULL,
+       .ldo_recovery_complete = NULL
+};
+
+static const struct cl_device_operations lovsub_cl_ops = {
+       .cdo_req_init = lovsub_req_init
+};
+
+static struct lu_device *lovsub_device_alloc(const struct lu_env *env,
+                                            struct lu_device_type *t,
+                                            struct lustre_cfg *cfg)
+{
+       struct lu_device     *d;
+       struct lovsub_device *lsd;
+
+       OBD_ALLOC_PTR(lsd);
+       if (lsd != NULL) {
+               int result;
+
+               result = cl_device_init(&lsd->acid_cl, t);
+               if (result == 0) {
+                       d = lovsub2lu_dev(lsd);
+                       d->ld_ops        = &lovsub_lu_ops;
+                       lsd->acid_cl.cd_ops = &lovsub_cl_ops;
+               } else
+                       d = ERR_PTR(result);
+       } else
+               d = ERR_PTR(-ENOMEM);
+       return d;
+}
+
+static const struct lu_device_type_operations lovsub_device_type_ops = {
+       .ldto_device_alloc = lovsub_device_alloc,
+       .ldto_device_free  = lovsub_device_free,
+
+       .ldto_device_init    = lovsub_device_init,
+       .ldto_device_fini    = lovsub_device_fini
+};
+
+#define LUSTRE_LOVSUB_NAME      "lovsub"
+
+struct lu_device_type lovsub_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_LOVSUB_NAME,
+       .ldt_ops      = &lovsub_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_io.c b/drivers/staging/lustre/lustre/lov/lovsub_io.c
new file mode 100644 (file)
index 0000000..783ec68
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub io operations.
+ *
+ */
+
+/* All trivial */
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/drivers/staging/lustre/lustre/lov/lovsub_lock.c
new file mode 100644 (file)
index 0000000..03bab17
--- /dev/null
@@ -0,0 +1,485 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub lock operations.
+ *
+ */
+
+static void lovsub_lock_fini(const struct lu_env *env,
+                            struct cl_lock_slice *slice)
+{
+       struct lovsub_lock   *lsl;
+
+       ENTRY;
+       lsl = cl2lovsub_lock(slice);
+       LASSERT(list_empty(&lsl->lss_parents));
+       OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem);
+       EXIT;
+}
+
+static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov)
+{
+       struct cl_lock *parent;
+
+       ENTRY;
+       parent = lov->lls_cl.cls_lock;
+       cl_lock_get(parent);
+       lu_ref_add(&parent->cll_reference, "lovsub-parent", current);
+       cl_lock_mutex_get(env, parent);
+       EXIT;
+}
+
+static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov)
+{
+       struct cl_lock *parent;
+
+       ENTRY;
+       parent = lov->lls_cl.cls_lock;
+       cl_lock_mutex_put(env, lov->lls_cl.cls_lock);
+       lu_ref_del(&parent->cll_reference, "lovsub-parent", current);
+       cl_lock_put(env, parent);
+       EXIT;
+}
+
+/**
+ * Implements the cl_lock_operations::clo_state() method for the lovsub
+ * layer, which is called whenever the state of a sub-lock changes. Propagates
+ * the state change to the top-locks.
+ */
+static void lovsub_lock_state(const struct lu_env *env,
+                             const struct cl_lock_slice *slice,
+                             enum cl_lock_state state)
+{
+       struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+       struct lov_lock_link *scan;
+
+       LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+       ENTRY;
+
+       list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+               struct lov_lock *lov    = scan->lll_super;
+               struct cl_lock  *parent = lov->lls_cl.cls_lock;
+
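+               /* lss_active is the parent lock currently operating on this
+                * sub-lock; it is skipped here, presumably so the initiator
+                * of the state change is not signalled back to itself. */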
+               if (sub->lss_active != parent) {
+                       lovsub_parent_lock(env, lov);
+                       cl_lock_signal(env, parent);
+                       lovsub_parent_unlock(env, lov);
+               }
+       }
+       EXIT;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_weigh() estimating lock weight by
+ * asking parent lock.
+ */
+static unsigned long lovsub_lock_weigh(const struct lu_env *env,
+                                      const struct cl_lock_slice *slice)
+{
+       struct lovsub_lock *lock = cl2lovsub_lock(slice);
+       struct lov_lock    *lov;
+       unsigned long       dumbbell;
+
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+
+       if (!list_empty(&lock->lss_parents)) {
+               /*
+                * It is not clear whether all parents have to be asked and
+                * their estimations summed, or it is enough to ask one. For
+                * the current usages, one is always enough.
+                */
+               lov = container_of(lock->lss_parents.next,
+                                  struct lov_lock_link, lll_list)->lll_super;
+
+               lovsub_parent_lock(env, lov);
+               dumbbell = cl_lock_weigh(env, lov->lls_cl.cls_lock);
+               lovsub_parent_unlock(env, lov);
+       } else
+               dumbbell = 0;
+
+       RETURN(dumbbell);
+}
+
+/**
+ * Maps start/end offsets within a stripe, to offsets within a file.
+ */
+static void lovsub_lock_descr_map(const struct cl_lock_descr *in,
+                                 struct lov_object *lov,
+                                 int stripe, struct cl_lock_descr *out)
+{
+       pgoff_t size; /* stripe size in pages */
+       pgoff_t skip; /* how many pages in every stripe are occupied by
+                      * "other" stripes */
+       pgoff_t start;
+       pgoff_t end;
+
+       ENTRY;
+       start = in->cld_start;
+       end   = in->cld_end;
+
+       if (lov->lo_lsm->lsm_stripe_count > 1) {
+               size = cl_index(lov2cl(lov), lov->lo_lsm->lsm_stripe_size);
+               skip = (lov->lo_lsm->lsm_stripe_count - 1) * size;
+
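+               /* Map a stripe-local page index N to the file page index
+                * N + (N / size) * skip + stripe * size: every full stripe
+                * unit already covered adds "skip" pages from the other
+                * stripes, and this stripe starts "stripe" units into each
+                * cycle. */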
+               /* XXX overflow check here? */
+               start += start/size * skip + stripe * size;
+
+               if (end != CL_PAGE_EOF) {
+                       end += end/size * skip + stripe * size;
+                       /*
+                        * And check for overflow...
+                        */
+                       if (end < in->cld_end)
+                               end = CL_PAGE_EOF;
+               }
+       }
+       out->cld_start = start;
+       out->cld_end   = end;
+       EXIT;
+}
+
+/**
+ * Adjusts parent lock extent when a sub-lock is attached to a parent. This is
+ * called in two ways:
+ *
+ *     - as part of receive call-back, when server returns granted extent to
+ *       the client, and
+ *
+ *     - when top-lock finds existing sub-lock in the cache.
+ *
+ * Note that the lock mode is not propagated to the parent: i.e., if a
+ * CLM_READ top-lock matches a CLM_WRITE sub-lock, the top-lock is still
+ * CLM_READ.
+ */
+int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov,
+                      struct lovsub_lock *sublock,
+                      const struct cl_lock_descr *d, int idx)
+{
+       struct cl_lock       *parent;
+       struct lovsub_object *subobj;
+       struct cl_lock_descr *pd;
+       struct cl_lock_descr *parent_descr;
+       int                result;
+
+       parent       = lov->lls_cl.cls_lock;
+       parent_descr = &parent->cll_descr;
+       LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode));
+
+       subobj = cl2lovsub(sublock->lss_cl.cls_obj);
+       pd     = &lov_env_info(env)->lti_ldescr;
+
+       pd->cld_obj  = parent_descr->cld_obj;
+       pd->cld_mode = parent_descr->cld_mode;
+       pd->cld_gid  = parent_descr->cld_gid;
+       lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd);
+       lov->lls_sub[idx].sub_got = *d;
+       /*
+        * Notify top-lock about modification, if lock description changes
+        * materially.
+        */
+       if (!cl_lock_ext_match(parent_descr, pd))
+               result = cl_lock_modify(env, parent, pd);
+       else
+               result = 0;
+       return result;
+}
+
+static int lovsub_lock_modify(const struct lu_env *env,
+                             const struct cl_lock_slice *s,
+                             const struct cl_lock_descr *d)
+{
+       struct lovsub_lock   *lock   = cl2lovsub_lock(s);
+       struct lov_lock_link *scan;
+       struct lov_lock      *lov;
+       int                   result = 0;
+
+       ENTRY;
+
+       LASSERT(cl_lock_mode_match(d->cld_mode,
+                                  s->cls_lock->cll_descr.cld_mode));
+       list_for_each_entry(scan, &lock->lss_parents, lll_list) {
+               int rc;
+
+               lov = scan->lll_super;
+               lovsub_parent_lock(env, lov);
+               rc = lov_sublock_modify(env, lov, lock, d, scan->lll_idx);
+               lovsub_parent_unlock(env, lov);
+               result = result ?: rc;
+       }
+       RETURN(result);
+}
+
+static int lovsub_lock_closure(const struct lu_env *env,
+                              const struct cl_lock_slice *slice,
+                              struct cl_lock_closure *closure)
+{
+       struct lovsub_lock   *sub;
+       struct cl_lock       *parent;
+       struct lov_lock_link *scan;
+       int                result;
+
+       LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+       ENTRY;
+
+       sub    = cl2lovsub_lock(slice);
+       result = 0;
+
+       list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+               parent = scan->lll_super->lls_cl.cls_lock;
+               result = cl_lock_closure_build(env, parent, closure);
+               if (result != 0)
+                       break;
+       }
+       RETURN(result);
+}
+
+/**
+ * A helper function for lovsub_lock_delete() that deals with a given parent
+ * top-lock.
+ */
+static int lovsub_lock_delete_one(const struct lu_env *env,
+                                 struct cl_lock *child, struct lov_lock *lov)
+{
+       struct cl_lock *parent;
+       int          result;
+       ENTRY;
+
+       parent = lov->lls_cl.cls_lock;
+       if (parent->cll_error)
+               RETURN(0);
+
+       result = 0;
+       switch (parent->cll_state) {
+       case CLS_ENQUEUED:
+               /* See LU-1355 for the case that a glimpse lock is
+                * interrupted by signal */
+               LASSERT(parent->cll_flags & CLF_CANCELLED);
+               break;
+       case CLS_QUEUING:
+       case CLS_FREEING:
+               cl_lock_signal(env, parent);
+               break;
+       case CLS_INTRANSIT:
+               /*
+                * Here lies a problem: a sub-lock is canceled while top-lock
+                * is being unlocked. Top-lock cannot be moved into CLS_NEW
+                * state, because unlocking has to succeed eventually by
+                * placing lock into CLS_CACHED (or failing it), see
+                * cl_unuse_try(). Nor can top-lock be left in CLS_CACHED
+                * state, because lov maintains an invariant that all
+                * sub-locks exist in CLS_CACHED (this allows cached top-lock
+                * to be reused immediately). Nor can we wait for top-lock
+                * state to change, because this can be synchronous to the
+                * current thread.
+                *
+                * We know for sure that lov_lock_unuse() will be called at
+                * least one more time to finish un-using, so leave a mark on
+                * the top-lock, that will be seen by the next call to
+                * lov_lock_unuse().
+                */
+               if (cl_lock_is_intransit(parent))
+                       lov->lls_cancel_race = 1;
+               break;
+       case CLS_CACHED:
+               /*
+                * if a sub-lock is canceled move its top-lock into CLS_NEW
+                * state to preserve an invariant that a top-lock in
+                * CLS_CACHED is immediately ready for re-use (i.e., has all
+                * sub-locks), and so that next attempt to re-use the top-lock
+                * enqueues missing sub-lock.
+                */
+               cl_lock_state_set(env, parent, CLS_NEW);
+               /* fall through */
+       case CLS_NEW:
+               /*
+                * if last sub-lock is canceled, destroy the top-lock (which
+                * is now `empty') proactively.
+                */
+               if (lov->lls_nr_filled == 0) {
+                       /* ... but unfortunately, this cannot be done easily,
+                        * as cancellation of a top-lock might acquire mutexes
+                        * of its other sub-locks, violating lock ordering,
+                        * see cl_lock_{cancel,delete}() preconditions.
+                        *
+                        * To work around this, the mutex of this sub-lock is
+                        * released, top-lock is destroyed, and sub-lock mutex
+                        * acquired again. The list of parents has to be
+                        * re-scanned from the beginning after this.
+                        *
+                        * Only do this if no mutexes other than on @child and
+                        * @parent are held by the current thread.
+                        *
+                        * TODO: The locking model here is too complex, because
+                        * the lock may be canceled and deleted voluntarily:
+                        *    cl_lock_request
+                        *      -> osc_lock_enqueue_wait
+                        *      -> osc_lock_cancel_wait
+                        *        -> cl_lock_delete
+                        *          -> lovsub_lock_delete
+                        *            -> cl_lock_cancel/delete
+                        *              -> ...
+                        *
+                        * The better choice is to spawn a kernel thread for
+                        * this purpose. -jay
+                        */
+                       if (cl_lock_nr_mutexed(env) == 2) {
+                               cl_lock_mutex_put(env, child);
+                               cl_lock_cancel(env, parent);
+                               cl_lock_delete(env, parent);
+                               result = 1;
+                       }
+               }
+               break;
+       case CLS_HELD:
+               CL_LOCK_DEBUG(D_ERROR, env, parent, "Delete CLS_HELD lock\n");
+       default:
+               CERROR("Impossible state: %d\n", parent->cll_state);
+               LBUG();
+               break;
+       }
+
+       RETURN(result);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked in "bottom-to-top" delete, when lock destruction starts from the
+ * sub-lock (e.g, as a result of ldlm lock LRU policy).
+ */
+static void lovsub_lock_delete(const struct lu_env *env,
+                              const struct cl_lock_slice *slice)
+{
+       struct cl_lock     *child = slice->cls_lock;
+       struct lovsub_lock *sub   = cl2lovsub_lock(slice);
+       int restart;
+
+       LASSERT(cl_lock_is_mutexed(child));
+
+       ENTRY;
+       /*
+        * Destruction of a sub-lock might take multiple iterations, because
+        * when the last sub-lock of a given top-lock is deleted, top-lock is
+        * canceled proactively, and this requires to release sub-lock
+        * mutex. Once sub-lock mutex has been released, list of its parents
+        * has to be re-scanned from the beginning.
+        */
+       do {
+               struct lov_lock      *lov;
+               struct lov_lock_link *scan;
+               struct lov_lock_link *temp;
+               struct lov_lock_sub  *subdata;
+
+               restart = 0;
+               list_for_each_entry_safe(scan, temp,
+                                            &sub->lss_parents, lll_list) {
+                       lov     = scan->lll_super;
+                       subdata = &lov->lls_sub[scan->lll_idx];
+                       lovsub_parent_lock(env, lov);
+                       subdata->sub_got = subdata->sub_descr;
+                       lov_lock_unlink(env, scan, sub);
+                       restart = lovsub_lock_delete_one(env, child, lov);
+                       lovsub_parent_unlock(env, lov);
+
+                       if (restart) {
+                               cl_lock_mutex_get(env, child);
+                               break;
+                       }
+               }
+       } while (restart);
+       EXIT;
+}
+
+static int lovsub_lock_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t p, const struct cl_lock_slice *slice)
+{
+       struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+       struct lov_lock      *lov;
+       struct lov_lock_link *scan;
+
+       list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+               lov = scan->lll_super;
+               (*p)(env, cookie, "[%d %p ", scan->lll_idx, lov);
+               if (lov != NULL)
+                       cl_lock_descr_print(env, cookie, p,
+                                           &lov->lls_cl.cls_lock->cll_descr);
+               (*p)(env, cookie, "] ");
+       }
+       return 0;
+}
+
+static const struct cl_lock_operations lovsub_lock_ops = {
+       .clo_fini    = lovsub_lock_fini,
+       .clo_state   = lovsub_lock_state,
+       .clo_delete  = lovsub_lock_delete,
+       .clo_modify  = lovsub_lock_modify,
+       .clo_closure = lovsub_lock_closure,
+       .clo_weigh   = lovsub_lock_weigh,
+       .clo_print   = lovsub_lock_print
+};
+
+int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj,
+                    struct cl_lock *lock, const struct cl_io *io)
+{
+       struct lovsub_lock *lsk;
+       int result;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, __GFP_IO);
+       if (lsk != NULL) {
+               INIT_LIST_HEAD(&lsk->lss_parents);
+               cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_object.c b/drivers/staging/lustre/lustre/lov/lovsub_object.c
new file mode 100644 (file)
index 0000000..1b83d90
--- /dev/null
@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub object operations.
+ *
+ */
+
+int lovsub_object_init(const struct lu_env *env, struct lu_object *obj,
+                      const struct lu_object_conf *conf)
+{
+       struct lovsub_device  *dev   = lu2lovsub_dev(obj->lo_dev);
+       struct lu_object      *below;
+       struct lu_device      *under;
+
+       int result;
+
+       ENTRY;
+       under = &dev->acid_next->cd_lu_dev;
+       below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+       if (below != NULL) {
+               lu_object_add(obj, below);
+               cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page));
+               result = 0;
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct lovsub_object *los = lu2lovsub(obj);
+       struct lov_object    *lov = los->lso_super;
+       ENTRY;
+
+       /* We can't assume lov was assigned here, because of the shadow
+        * object handling in lu_object_find.
+        */
+       if (lov) {
+               LASSERT(lov->lo_type == LLT_RAID0);
+               LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los);
+               spin_lock(&lov->u.raid0.lo_sub_lock);
+               lov->u.raid0.lo_sub[los->lso_index] = NULL;
+               spin_unlock(&lov->u.raid0.lo_sub_lock);
+       }
+
+       lu_object_fini(obj);
+       lu_object_header_fini(&los->lso_header.coh_lu);
+       OBD_SLAB_FREE_PTR(los, lovsub_object_kmem);
+       EXIT;
+}
+
+static int lovsub_object_print(const struct lu_env *env, void *cookie,
+                              lu_printer_t p, const struct lu_object *obj)
+{
+       struct lovsub_object *los = lu2lovsub(obj);
+
+       return (*p)(env, cookie, "[%d]", los->lso_index);
+}
+
+static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj,
+                          const struct cl_attr *attr, unsigned valid)
+{
+       struct lov_object *lov = cl2lovsub(obj)->lso_super;
+
+       ENTRY;
+       lov_r0(lov)->lo_attr_valid = 0;
+       RETURN(0);
+}
+
+static int lovsub_object_glimpse(const struct lu_env *env,
+                                const struct cl_object *obj,
+                                struct ost_lvb *lvb)
+{
+       struct lovsub_object *los = cl2lovsub(obj);
+
+       ENTRY;
+       RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb));
+}
+
+static const struct cl_object_operations lovsub_ops = {
+       .coo_page_init = lovsub_page_init,
+       .coo_lock_init = lovsub_lock_init,
+       .coo_attr_set  = lovsub_attr_set,
+       .coo_glimpse   = lovsub_object_glimpse
+};
+
+static const struct lu_object_operations lovsub_lu_obj_ops = {
+       .loo_object_init      = lovsub_object_init,
+       .loo_object_delete    = NULL,
+       .loo_object_release   = NULL,
+       .loo_object_free      = lovsub_object_free,
+       .loo_object_print     = lovsub_object_print,
+       .loo_object_invariant = NULL
+};
+
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+                                     const struct lu_object_header *unused,
+                                     struct lu_device *dev)
+{
+       struct lovsub_object *los;
+       struct lu_object     *obj;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, __GFP_IO);
+       if (los != NULL) {
+               struct cl_object_header *hdr;
+
+               obj = lovsub2lu(los);
+               hdr = &los->lso_header;
+               cl_object_header_init(hdr);
+               lu_object_init(obj, &hdr->coh_lu, dev);
+               lu_object_add_top(&hdr->coh_lu, obj);
+               los->lso_cl.co_ops = &lovsub_ops;
+               obj->lo_ops = &lovsub_lu_obj_ops;
+       } else
+               obj = NULL;
+       RETURN(obj);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_page.c b/drivers/staging/lustre/lustre/lov/lovsub_page.c
new file mode 100644 (file)
index 0000000..bc9e683
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub page operations.
+ *
+ */
+
+static void lovsub_page_fini(const struct lu_env *env,
+                            struct cl_page_slice *slice)
+{
+}
+
+static const struct cl_page_operations lovsub_page_ops = {
+       .cpo_fini   = lovsub_page_fini
+};
+
+int lovsub_page_init(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_page *page, struct page *unused)
+{
+       struct lovsub_page *lsb = cl_object_page_slice(obj, page);
+       ENTRY;
+
+       cl_page_slice_add(page, &lsb->lsb_cl, obj, &lovsub_page_ops);
+       RETURN(0);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lproc_lov.c b/drivers/staging/lustre/lustre/lov/lproc_lov.c
new file mode 100644 (file)
index 0000000..732d5c7
--- /dev/null
@@ -0,0 +1,304 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+#include <linux/seq_file.h>
+#include "lov_internal.h"
+
+#ifdef LPROCFS
+static int lov_rd_stripesize(char *page, char **start, off_t off, int count,
+                            int *eof, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       *eof = 1;
+       return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_size);
+}
+
+static int lov_wr_stripesize(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+       __u64 val;
+       int rc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       rc = lprocfs_write_u64_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       lov_fix_desc_stripe_size(&val);
+       desc->ld_default_stripe_size = val;
+       return count;
+}
+
+static int lov_rd_stripeoffset(char *page, char **start, off_t off, int count,
+                              int *eof, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       *eof = 1;
+       return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_offset);
+}
+
+static int lov_wr_stripeoffset(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+       __u64 val;
+       int rc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       rc = lprocfs_write_u64_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       desc->ld_default_stripe_offset = val;
+       return count;
+}
+
+static int lov_rd_stripetype(char *page, char **start, off_t off, int count,
+                            int *eof, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       *eof = 1;
+       return snprintf(page, count, "%u\n", desc->ld_pattern);
+}
+
+static int lov_wr_stripetype(struct file *file, const char *buffer,
+                            unsigned long count, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+       int val, rc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       lov_fix_desc_pattern(&val);
+       desc->ld_pattern = val;
+       return count;
+}
+
+static int lov_rd_stripecount(char *page, char **start, off_t off, int count,
+                             int *eof, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       *eof = 1;
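+       /* The +1/-1 dance below maps the unsigned all-ones default
+        * (conventionally "stripe over all OSTs") to -1 for display while
+        * leaving ordinary counts unchanged. */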
+       return snprintf(page, count, "%d\n",
+                       (__s16)(desc->ld_default_stripe_count + 1) - 1);
+}
+
+static int lov_wr_stripecount(struct file *file, const char *buffer,
+                             unsigned long count, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+       int val, rc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       lov_fix_desc_stripe_count(&val);
+       desc->ld_default_stripe_count = val;
+       return count;
+}
+
+static int lov_rd_numobd(char *page, char **start, off_t off, int count,
+                        int *eof, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       *eof = 1;
+       return snprintf(page, count, "%u\n", desc->ld_tgt_count);
+}
+
+static int lov_rd_activeobd(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       *eof = 1;
+       return snprintf(page, count, "%u\n", desc->ld_active_tgt_count);
+}
+
+static int lov_rd_desc_uuid(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct obd_device *dev = (struct obd_device *)data;
+       struct lov_obd *lov;
+
+       LASSERT(dev != NULL);
+       lov = &dev->u.lov;
+       *eof = 1;
+       return snprintf(page, count, "%s\n", lov->desc.ld_uuid.uuid);
+}
+
+static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct obd_device *dev = p->private;
+       struct lov_obd *lov = &dev->u.lov;
+
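+       /* lov_tgts[] may contain NULL holes (e.g. for targets that were
+        * removed); advance *pos past them to the next configured entry. */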
+       while (*pos < lov->desc.ld_tgt_count) {
+               if (lov->lov_tgts[*pos])
+                       return lov->lov_tgts[*pos];
+               ++*pos;
+       }
+       return NULL;
+}
+
+static void lov_tgt_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct obd_device *dev = p->private;
+       struct lov_obd *lov = &dev->u.lov;
+
+       while (++*pos < lov->desc.ld_tgt_count) {
+               if (lov->lov_tgts[*pos])
+                       return lov->lov_tgts[*pos];
+       }
+       return NULL;
+}
+
+static int lov_tgt_seq_show(struct seq_file *p, void *v)
+{
+       struct lov_tgt_desc *tgt = v;
+       return seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index,
+                         obd_uuid2str(&tgt->ltd_uuid),
+                         tgt->ltd_active ? "" : "IN");
+}
+
+struct seq_operations lov_tgt_sops = {
+       .start = lov_tgt_seq_start,
+       .stop = lov_tgt_seq_stop,
+       .next = lov_tgt_seq_next,
+       .show = lov_tgt_seq_show,
+};
+
+static int lov_target_seq_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry *dp = PDE(inode);
+       struct seq_file *seq;
+       int rc;
+
+       LPROCFS_ENTRY_AND_CHECK(dp);
+       rc = seq_open(file, &lov_tgt_sops);
+       if (rc) {
+               LPROCFS_EXIT();
+               return rc;
+       }
+
+       seq = file->private_data;
+       seq->private = dp->data;
+       return 0;
+}
+
+struct lprocfs_vars lprocfs_lov_obd_vars[] = {
+       { "uuid",        lprocfs_rd_uuid,       0, 0 },
+       { "stripesize",   lov_rd_stripesize,      lov_wr_stripesize, 0 },
+       { "stripeoffset", lov_rd_stripeoffset,    lov_wr_stripeoffset, 0 },
+       { "stripecount",  lov_rd_stripecount,     lov_wr_stripecount, 0 },
+       { "stripetype",   lov_rd_stripetype,      lov_wr_stripetype, 0 },
+       { "numobd",       lov_rd_numobd,          0, 0 },
+       { "activeobd",    lov_rd_activeobd,       0, 0 },
+       { "filestotal",   lprocfs_rd_filestotal,  0, 0 },
+       { "filesfree",    lprocfs_rd_filesfree,   0, 0 },
+       /*{ "filegroups", lprocfs_rd_filegroups,  0, 0 },*/
+       { "blocksize",    lprocfs_rd_blksize,     0, 0 },
+       { "kbytestotal",  lprocfs_rd_kbytestotal, 0, 0 },
+       { "kbytesfree",   lprocfs_rd_kbytesfree,  0, 0 },
+       { "kbytesavail",  lprocfs_rd_kbytesavail, 0, 0 },
+       { "desc_uuid",    lov_rd_desc_uuid,       0, 0 },
+       { 0 }
+};
+
+static struct lprocfs_vars lprocfs_lov_module_vars[] = {
+       { "num_refs",     lprocfs_rd_numrefs,     0, 0 },
+       { 0 }
+};
+
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars  = lprocfs_lov_module_vars;
+       lvars->obd_vars     = lprocfs_lov_obd_vars;
+}
+
+struct file_operations lov_proc_target_fops = {
+       .owner   = THIS_MODULE,
+       .open    = lov_target_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = lprocfs_seq_release,
+};
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/lvfs/Makefile b/drivers/staging/lustre/lustre/lvfs/Makefile
new file mode 100644 (file)
index 0000000..f50b1c5
--- /dev/null
@@ -0,0 +1,6 @@
+obj-$(CONFIG_LUSTRE_FS) += lvfs.o
+
+lvfs-y := lvfs_linux.o fsfilt.o lvfs_lib.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt.c b/drivers/staging/lustre/lustre/lvfs/fsfilt.c
new file mode 100644 (file)
index 0000000..064445c
--- /dev/null
@@ -0,0 +1,138 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+
+LIST_HEAD(fsfilt_types);
+
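+/* Find the registered fsfilt_operations matching @type, or NULL if no
+ * backend of that name has been registered yet. */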
+static struct fsfilt_operations *fsfilt_search_type(const char *type)
+{
+       struct fsfilt_operations *found;
+       struct list_head *p;
+
+       list_for_each(p, &fsfilt_types) {
+               found = list_entry(p, struct fsfilt_operations, fs_list);
+               if (!strcmp(found->fs_type, type)) {
+                       return found;
+               }
+       }
+       return NULL;
+}
+
+int fsfilt_register_ops(struct fsfilt_operations *fs_ops)
+{
+       struct fsfilt_operations *found;
+
+       /* lock fsfilt_types list */
+       if ((found = fsfilt_search_type(fs_ops->fs_type))) {
+               if (found != fs_ops) {
+                       CERROR("different operations for type %s\n",
+                              fs_ops->fs_type);
+                       /* unlock fsfilt_types list */
+                       RETURN(-EEXIST);
+               }
+       } else {
+               try_module_get(THIS_MODULE);
+               list_add(&fs_ops->fs_list, &fsfilt_types);
+       }
+
+       /* unlock fsfilt_types list */
+       return 0;
+}
+EXPORT_SYMBOL(fsfilt_register_ops);
+
+void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops)
+{
+       struct list_head *p;
+
+       /* lock fsfilt_types list */
+       list_for_each(p, &fsfilt_types) {
+               struct fsfilt_operations *found;
+
+               found = list_entry(p, typeof(*found), fs_list);
+               if (found == fs_ops) {
+                       list_del(p);
+                       module_put(THIS_MODULE);
+                       break;
+               }
+       }
+       /* unlock fsfilt_types list */
+}
+EXPORT_SYMBOL(fsfilt_unregister_ops);
+
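+/* Look up the fsfilt backend for @type, loading the "fsfilt_<type>" module
+ * on demand, and take a reference on the owning module.  Callers drop the
+ * reference with fsfilt_put_ops(). */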
+struct fsfilt_operations *fsfilt_get_ops(const char *type)
+{
+       struct fsfilt_operations *fs_ops;
+
+       /* lock fsfilt_types list */
+       if (!(fs_ops = fsfilt_search_type(type))) {
+               char name[32];
+               int rc;
+
+               snprintf(name, sizeof(name) - 1, "fsfilt_%s", type);
+               name[sizeof(name) - 1] = '\0';
+
+               if (!(rc = request_module("%s", name))) {
+                       fs_ops = fsfilt_search_type(type);
+                       CDEBUG(D_INFO, "Loaded module '%s'\n", name);
+                       if (!fs_ops)
+                               rc = -ENOENT;
+               }
+
+               if (rc) {
+                       CERROR("Can't find %s interface\n", name);
+                       RETURN(ERR_PTR(rc < 0 ? rc : -rc));
+                       /* unlock fsfilt_types list */
+               }
+       }
+       try_module_get(fs_ops->fs_owner);
+       /* unlock fsfilt_types list */
+
+       return fs_ops;
+}
+EXPORT_SYMBOL(fsfilt_get_ops);
+
+void fsfilt_put_ops(struct fsfilt_operations *fs_ops)
+{
+       module_put(fs_ops->fs_owner);
+}
+EXPORT_SYMBOL(fsfilt_put_ops);
diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c b/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c
new file mode 100644 (file)
index 0000000..c1e99b3
--- /dev/null
@@ -0,0 +1,761 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/fsfilt_ext3.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <ldiskfs/ldiskfs_config.h>
+#include <ext4/ext4.h>
+#include <ext4/ext4_jbd2.h>
+#include <linux/version.h>
+#include <linux/bitops.h>
+#include <linux/quota.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+#include <obd.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lprocfs_status.h>
+
+#include <ext4/ext4_extents.h>
+
+#ifdef HAVE_EXT_PBLOCK /* Name changed to ext4_ext_pblock for kernel 2.6.35 */
+#define ext3_ext_pblock(ex) ext_pblock((ex))
+#endif
+
+/* for kernels 2.6.18 and later */
+#define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS(sb)
+
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+              ext3_ext_insert_extent(handle, inode, path, newext, flag)
+
+#define ext3_mb_discard_inode_preallocations(inode) \
+                ext3_discard_preallocations(inode)
+
+#define fsfilt_log_start_commit(journal, tid) jbd2_log_start_commit(journal, tid)
+#define fsfilt_log_wait_commit(journal, tid) jbd2_log_wait_commit(journal, tid)
+
+static struct kmem_cache *fcb_cache;
+
+struct fsfilt_cb_data {
+       struct ext4_journal_cb_entry cb_jcb; /* private data - MUST BE FIRST */
+       fsfilt_cb_t cb_func;        /* MDS/OBD completion function */
+       struct obd_device *cb_obd;      /* MDS/OBD completion device */
+       __u64 cb_last_rcvd;          /* MDS/OST last committed operation */
+       void *cb_data;            /* MDS/OST completion function data */
+};
+
+static char *fsfilt_ext3_get_label(struct super_block *sb)
+{
+       return EXT3_SB(sb)->s_es->s_volume_name;
+}
+
+/* kernel has ext4_blocks_for_truncate since linux-3.1.1 */
+# include <ext4/truncate.h>
+
+/*
+ * We don't currently need any additional blocks for rmdir and
+ * unlink transactions because we are storing the OST oa_id inside
+ * the inode (which we will be changing anyways as part of this
+ * transaction).
+ */
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+                              int logs)
+{
+       /* For updates to the last received file */
+       int nblocks = FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
+       journal_t *journal;
+       void *handle;
+
+       if (current->journal_info) {
+               CDEBUG(D_INODE, "increasing refcount on %p\n",
+                      current->journal_info);
+               goto journal_start;
+       }
+
+       switch (op) {
+       case FSFILT_OP_UNLINK:
+               /* delete one file + create/update logs for each stripe */
+               nblocks += EXT3_DELETE_TRANS_BLOCKS(inode->i_sb);
+               nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                           FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
+               break;
+       case FSFILT_OP_CANCEL_UNLINK:
+               LASSERT(logs == 1);
+
+               /* blocks for log header bitmap update OR
+                * blocks for catalog header bitmap update + unlink of logs +
+                * blocks for delete the inode (include blocks truncating). */
+               nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
+                         EXT3_DELETE_TRANS_BLOCKS(inode->i_sb) +
+                         ext4_blocks_for_truncate(inode) + 3;
+               break;
+       default: CERROR("unknown transaction start op %d\n", op);
+               LBUG();
+       }
+
+       LASSERT(current->journal_info == desc_private);
+       journal = EXT3_SB(inode->i_sb)->s_journal;
+       if (nblocks > journal->j_max_transaction_buffers) {
+               CWARN("too many credits %d for op %ux%u using %d instead\n",
+                      nblocks, op, logs, journal->j_max_transaction_buffers);
+               nblocks = journal->j_max_transaction_buffers;
+       }
+
+ journal_start:
+       LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks);
+       handle = ext3_journal_start(inode, nblocks);
+
+       if (!IS_ERR(handle))
+               LASSERT(current->journal_info == handle);
+       else
+               CERROR("error starting handle for op %u (%u credits): rc %ld\n",
+                      op, nblocks, PTR_ERR(handle));
+       return handle;
+}
+
+static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
+{
+       int rc;
+       handle_t *handle = h;
+
+       LASSERT(current->journal_info == handle);
+       if (force_sync)
+               handle->h_sync = 1; /* recovery likes this */
+
+       rc = ext3_journal_stop(handle);
+
+       return rc;
+}
+
+#ifndef EXT3_EXTENTS_FL
+#define EXT3_EXTENTS_FL                 0x00080000 /* Inode uses extents */
+#endif
+
+#ifndef EXT_ASSERT
+#define EXT_ASSERT(cond)  BUG_ON(!(cond))
+#endif
+
+#define EXT_GENERATION(inode)     (EXT4_I(inode)->i_ext_generation)
+#define ext3_ext_base             inode
+#define ext3_ext_base2inode(inode)      (inode)
+#define EXT_DEPTH(inode)               ext_depth(inode)
+#define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \
+                       ext3_ext_walk_space(inode, block, num, cb, cbdata)
+
+struct bpointers {
+       unsigned long *blocks;
+       unsigned long start;
+       int num;
+       int init_num;
+       int create;
+};
+
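+/* Heuristically pick a goal physical block for an allocation at logical
+ * @block: prefer continuing the neighbouring extent, otherwise spread
+ * allocations across the inode's block group by PID colour. */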
+static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
+                              unsigned long block, int *aflags)
+{
+       struct ext3_inode_info *ei = EXT3_I(inode);
+       unsigned long bg_start;
+       unsigned long colour;
+       int depth;
+
+       if (path) {
+               struct ext3_extent *ex;
+               depth = path->p_depth;
+
+               /* try to predict block placement */
+               if ((ex = path[depth].p_ext))
+                       return ext4_ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block));
+
+               /* it looks like the index is empty;
+                * try a goal starting from the index block itself */
+               if (path[depth].p_bh)
+                       return path[depth].p_bh->b_blocknr;
+       }
+
+       /* OK. use inode's group */
+       bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+               le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+       colour = (current->pid % 16) *
+               (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+       return bg_start + colour + block;
+}
+
+#define ll_unmap_underlying_metadata(sb, blocknr) \
+       unmap_underlying_metadata((sb)->s_bdev, blocknr)
+
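+/* Allocate up to *count physical blocks for logical @block with mballoc,
+ * using a goal derived from the extent path. */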
+#ifndef EXT3_MB_HINT_GROUP_ALLOC
+static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
+                               struct ext3_ext_path *path, unsigned long block,
+                               unsigned long *count, int *err)
+{
+       unsigned long pblock, goal;
+       int aflags = 0;
+       struct inode *inode = ext3_ext_base2inode(base);
+
+       goal = ext3_ext_find_goal(inode, path, block, &aflags);
+       aflags |= 2; /* blocks have already been reserved */
+       pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
+       return pblock;
+
+}
+#else
+static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
+                               struct ext3_ext_path *path, unsigned long block,
+                               unsigned long *count, int *err)
+{
+       struct inode *inode = ext3_ext_base2inode(base);
+       struct ext3_allocation_request ar;
+       unsigned long pblock;
+       int aflags;
+
+       /* find neighbour allocated blocks */
+       ar.lleft = block;
+       *err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft);
+       if (*err)
+               return 0;
+       ar.lright = block;
+       *err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright);
+       if (*err)
+               return 0;
+
+       /* allocate new block */
+       ar.goal = ext3_ext_find_goal(inode, path, block, &aflags);
+       ar.inode = inode;
+       ar.logical = block;
+       ar.len = *count;
+       ar.flags = EXT3_MB_HINT_DATA;
+       pblock = ext3_mb_new_blocks(handle, &ar, err);
+       *count = ar.len;
+       return pblock;
+}
+#endif
+
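+/* Extent-walk callback: existing extents are mapped straight into
+ * bp->blocks[]; holes are either reported as zero blocks (bp->create == 0)
+ * or filled by allocating and inserting a new extent under a journal
+ * handle. */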
+static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
+                                 struct ext3_ext_path *path,
+                                 struct ext3_ext_cache *cex,
+#ifdef HAVE_EXT_PREPARE_CB_EXTENT
+                                  struct ext3_extent *ex,
+#endif
+                                 void *cbdata)
+{
+       struct bpointers *bp = cbdata;
+       struct inode *inode = ext3_ext_base2inode(base);
+       struct ext3_extent nex;
+       unsigned long pblock;
+       unsigned long tgen;
+       int err, i;
+       unsigned long count;
+       handle_t *handle;
+
+#ifdef EXT3_EXT_CACHE_EXTENT
+       if (cex->ec_type == EXT3_EXT_CACHE_EXTENT)
+#else
+       if ((cex->ec_len != 0) && (cex->ec_start != 0))
+#endif
+       {
+               err = EXT_CONTINUE;
+               goto map;
+       }
+
+       if (bp->create == 0) {
+               i = 0;
+               if (cex->ec_block < bp->start)
+                       i = bp->start - cex->ec_block;
+               if (i >= cex->ec_len)
+                       CERROR("nothing to do?! i = %d, e_num = %u\n",
+                                       i, cex->ec_len);
+               for (; i < cex->ec_len && bp->num; i++) {
+                       *(bp->blocks) = 0;
+                       bp->blocks++;
+                       bp->num--;
+                       bp->start++;
+               }
+
+               return EXT_CONTINUE;
+       }
+
+       tgen = EXT_GENERATION(base);
+       count = ext3_ext_calc_credits_for_insert(base, path);
+
+       handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
+       if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
+       }
+
+       if (tgen != EXT_GENERATION(base)) {
+               /* the tree has changed, so the path may be invalid by now */
+               ext3_journal_stop(handle);
+               return EXT_REPEAT;
+       }
+
+       /* In the 2.6.32 kernel, ext4_ext_walk_space()'s callback is not
+        * protected by i_data_sem as a whole, so we patch it to store the
+        * generation in the path and verify here that the tree has not
+        * changed */
+       down_write((&EXT4_I(inode)->i_data_sem));
+
+       /* validate the extent: make sure the extent tree has not changed */
+       if (EXT_GENERATION(base) != path[0].p_generation) {
+               /* cex is invalid, try again */
+               up_write(&EXT4_I(inode)->i_data_sem);
+               ext3_journal_stop(handle);
+               return EXT_REPEAT;
+       }
+
+       count = cex->ec_len;
+       pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
+       if (!pblock)
+               goto out;
+       EXT_ASSERT(count <= cex->ec_len);
+
+       /* insert new extent */
+       nex.ee_block = cpu_to_le32(cex->ec_block);
+       ext3_ext_store_pblock(&nex, pblock);
+       nex.ee_len = cpu_to_le16(count);
+       err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
+       if (err) {
+               /* free data blocks we just allocated */
+               /* not a good idea to call discard here directly,
+                * but otherwise we'd need to call it on every free() */
+#ifdef EXT3_MB_HINT_GROUP_ALLOC
+               ext3_mb_discard_inode_preallocations(inode);
+#endif
+#ifdef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD /* Introduced in 2.6.32-rc7 */
+               ext3_free_blocks(handle, inode, NULL, ext4_ext_pblock(&nex),
+                                cpu_to_le16(nex.ee_len), 0);
+#else
+               ext3_free_blocks(handle, inode, ext4_ext_pblock(&nex),
+                                cpu_to_le16(nex.ee_len), 0);
+#endif
+               goto out;
+       }
+
+       /*
+        * By storing the length of the extent we actually inserted,
+        * we ask ext3_ext_walk_space() to continue scanning after
+        * that block.
+        */
+       cex->ec_len = le16_to_cpu(nex.ee_len);
+       cex->ec_start = ext4_ext_pblock(&nex);
+       BUG_ON(le16_to_cpu(nex.ee_len) == 0);
+       BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
+
+out:
+       up_write((&EXT4_I(inode)->i_data_sem));
+       ext3_journal_stop(handle);
+map:
+       if (err >= 0) {
+               /* map blocks */
+               if (bp->num == 0) {
+                       CERROR("hmm. why do we find this extent?\n");
+                       CERROR("initial space: %lu:%u\n",
+                               bp->start, bp->init_num);
+#ifdef EXT3_EXT_CACHE_EXTENT
+                       CERROR("current extent: %u/%u/%llu %d\n",
+                               cex->ec_block, cex->ec_len,
+                               (unsigned long long)cex->ec_start,
+                               cex->ec_type);
+#else
+                       CERROR("current extent: %u/%u/%llu\n",
+                               cex->ec_block, cex->ec_len,
+                               (unsigned long long)cex->ec_start);
+#endif
+               }
+               i = 0;
+               if (cex->ec_block < bp->start)
+                       i = bp->start - cex->ec_block;
+               if (i >= cex->ec_len)
+                       CERROR("nothing to do?! i = %d, e_num = %u\n",
+                                       i, cex->ec_len);
+               for (; i < cex->ec_len && bp->num; i++) {
+                       *(bp->blocks) = cex->ec_start + i;
+#ifdef EXT3_EXT_CACHE_EXTENT
+                       if (cex->ec_type != EXT3_EXT_CACHE_EXTENT)
+#else
+                       if ((cex->ec_len == 0) || (cex->ec_start == 0))
+#endif
+                       {
+                               /* unmap any possible underlying metadata from
+                                * the block device mapping.  bug 6998. */
+                               ll_unmap_underlying_metadata(inode->i_sb,
+                                                            *(bp->blocks));
+                       }
+                       bp->blocks++;
+                       bp->num--;
+                       bp->start++;
+               }
+       }
+       return err;
+}
+
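+/* Map @num logical blocks starting at @block of @inode to physical block
+ * numbers in @blocks[], allocating new extents when @create is set. */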
+int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
+                      unsigned long num, unsigned long *blocks,
+                      int create)
+{
+       struct ext3_ext_base *base = inode;
+       struct bpointers bp;
+       int err;
+
+       CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
+              block, block + num - 1, (unsigned) inode->i_ino);
+
+       bp.blocks = blocks;
+       bp.start = block;
+       bp.init_num = bp.num = num;
+       bp.create = create;
+
+       err = fsfilt_ext3_ext_walk_space(base, block, num,
+                                        ext3_ext_new_extent_cb, &bp);
+       ext3_ext_invalidate_cache(base);
+
+       return err;
+}
+
+int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
+                                   int pages, unsigned long *blocks,
+                                   int create)
+{
+       int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+       int rc = 0, i = 0;
+       struct page *fp = NULL;
+       int clen = 0;
+
+       CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
+               inode->i_ino, pages, (*page)->index);
+
+       /* the pages are already sorted, so we just have to find
+        * contiguous runs and process them properly */
+       while (i < pages) {
+               if (fp == NULL) {
+                       /* start new extent */
+                       fp = *page++;
+                       clen = 1;
+                       i++;
+                       continue;
+               } else if (fp->index + clen == (*page)->index) {
+                       /* continue the extent */
+                       page++;
+                       clen++;
+                       i++;
+                       continue;
+               }
+
+               /* process found extent */
+               rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+                                       clen * blocks_per_page, blocks,
+                                       create);
+               if (rc)
+                       GOTO(cleanup, rc);
+
+               /* look for next extent */
+               fp = NULL;
+               blocks += blocks_per_page * clen;
+       }
+
+       if (fp)
+               rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+                                       clen * blocks_per_page, blocks,
+                                       create);
+cleanup:
+       return rc;
+}
+
+int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
+                                  int pages, unsigned long *blocks,
+                                  int create)
+{
+       int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+       unsigned long *b;
+       int rc = 0, i;
+
+       for (i = 0, b = blocks; i < pages; i++, page++) {
+               rc = ext3_map_inode_page(inode, *page, b, create);
+               if (rc) {
+                       CERROR("ino %lu, blk %lu create %d: rc %d\n",
+                              inode->i_ino, *b, create, rc);
+                       break;
+               }
+
+               b += blocks_per_page;
+       }
+       return rc;
+}
+
+int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
+                               int pages, unsigned long *blocks,
+                               int create, struct mutex *optional_mutex)
+{
+       int rc;
+
+       if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
+               rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
+                                                    blocks, create);
+               return rc;
+       }
+       if (optional_mutex != NULL)
+               mutex_lock(optional_mutex);
+       rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, create);
+       if (optional_mutex != NULL)
+               mutex_unlock(optional_mutex);
+
+       return rc;
+}
+
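+/* Read @size bytes at *@offs from @inode through the buffer cache
+ * (ext3_bread()); the read is clamped at i_size and *@offs is advanced. */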
+int fsfilt_ext3_read(struct inode *inode, void *buf, int size, loff_t *offs)
+{
+       unsigned long block;
+       struct buffer_head *bh;
+       int err, blocksize, csize, boffs, osize = size;
+
+       /* prevent reading after eof */
+       spin_lock(&inode->i_lock);
+       if (i_size_read(inode) < *offs + size) {
+               size = i_size_read(inode) - *offs;
+               spin_unlock(&inode->i_lock);
+               if (size < 0) {
+                       CDEBUG(D_EXT2, "size %llu is too short for read @%llu\n",
+                              i_size_read(inode), *offs);
+                       return -EBADR;
+               } else if (size == 0) {
+                       return 0;
+               }
+       } else {
+               spin_unlock(&inode->i_lock);
+       }
+
+       blocksize = 1 << inode->i_blkbits;
+
+       while (size > 0) {
+               block = *offs >> inode->i_blkbits;
+               boffs = *offs & (blocksize - 1);
+               csize = min(blocksize - boffs, size);
+               bh = ext3_bread(NULL, inode, block, 0, &err);
+               if (!bh) {
+                       CERROR("can't read block: %d\n", err);
+                       return err;
+               }
+
+               memcpy(buf, bh->b_data + boffs, csize);
+               brelse(bh);
+
+               *offs += csize;
+               buf += csize;
+               size -= csize;
+       }
+       return osize;
+}
+EXPORT_SYMBOL(fsfilt_ext3_read);
+
+static int fsfilt_ext3_read_record(struct file * file, void *buf,
+                                  int size, loff_t *offs)
+{
+       int rc;
+       rc = fsfilt_ext3_read(file->f_dentry->d_inode, buf, size, offs);
+       if (rc > 0)
+               rc = 0;
+       return rc;
+}
+
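+/* Write @bufsize bytes at *@offs into @inode under an already started
+ * journal @handle, growing i_size/i_disksize as needed; *@offs is advanced
+ * on success. */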
+int fsfilt_ext3_write_handle(struct inode *inode, void *buf, int bufsize,
+                               loff_t *offs, handle_t *handle)
+{
+       struct buffer_head *bh = NULL;
+       loff_t old_size = i_size_read(inode), offset = *offs;
+       loff_t new_size = i_size_read(inode);
+       unsigned long block;
+       int err = 0, blocksize = 1 << inode->i_blkbits, size, boffs;
+
+       while (bufsize > 0) {
+               if (bh != NULL)
+                       brelse(bh);
+
+               block = offset >> inode->i_blkbits;
+               boffs = offset & (blocksize - 1);
+               size = min(blocksize - boffs, bufsize);
+               bh = ext3_bread(handle, inode, block, 1, &err);
+               if (!bh) {
+                       CERROR("can't read/create block: %d\n", err);
+                       break;
+               }
+
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err) {
+                       CERROR("journal_get_write_access() returned error %d\n",
+                              err);
+                       break;
+               }
+               LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size);
+               memcpy(bh->b_data + boffs, buf, size);
+               err = ext3_journal_dirty_metadata(handle, bh);
+               if (err) {
+                       CERROR("journal_dirty_metadata() returned error %d\n",
+                              err);
+                       break;
+               }
+               if (offset + size > new_size)
+                       new_size = offset + size;
+               offset += size;
+               bufsize -= size;
+               buf += size;
+       }
+       if (bh)
+               brelse(bh);
+
+       /* correct in-core and on-disk sizes */
+       if (new_size > i_size_read(inode)) {
+               spin_lock(&inode->i_lock);
+               if (new_size > i_size_read(inode))
+                       i_size_write(inode, new_size);
+               if (i_size_read(inode) > EXT3_I(inode)->i_disksize)
+                       EXT3_I(inode)->i_disksize = i_size_read(inode);
+               if (i_size_read(inode) > old_size) {
+                       spin_unlock(&inode->i_lock);
+                       mark_inode_dirty(inode);
+               } else {
+                       spin_unlock(&inode->i_lock);
+               }
+       }
+
+       if (err == 0)
+               *offs = offset;
+       return err;
+}
+EXPORT_SYMBOL(fsfilt_ext3_write_handle);
+
+static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
+                                   loff_t *offs, int force_sync)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       handle_t *handle;
+       int err, block_count = 0, blocksize;
+
+       /* Determine how many transaction credits are needed */
+       blocksize = 1 << inode->i_blkbits;
+       block_count = (*offs & (blocksize - 1)) + bufsize;
+       block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
+
+       handle = ext3_journal_start(inode,
+                       block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2);
+       if (IS_ERR(handle)) {
+               CERROR("can't start transaction for %d blocks (%d bytes)\n",
+                      block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2,
+                      bufsize);
+               return PTR_ERR(handle);
+       }
+
+       err = fsfilt_ext3_write_handle(inode, buf, bufsize, offs, handle);
+
+       if (!err && force_sync)
+               handle->h_sync = 1; /* recovery likes this */
+
+       ext3_journal_stop(handle);
+
+       return err;
+}
+
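+/* Sanity-check the backing filesystem at setup time: a journal is required
+ * and the dir_index feature is strongly recommended. */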
+static int fsfilt_ext3_setup(struct super_block *sb)
+{
+       if (!EXT3_HAS_COMPAT_FEATURE(sb,
+                               EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+               CERROR("ext3 mounted without journal\n");
+               return -EINVAL;
+       }
+
+#ifdef S_PDIROPS
+       CWARN("Enabling PDIROPS\n");
+       set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS);
+       sb->s_flags |= S_PDIROPS;
+#endif
+       if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+               CWARN("filesystem doesn't have dir_index feature enabled\n");
+       return 0;
+}
+
+static struct fsfilt_operations fsfilt_ext3_ops = {
+       .fs_type                = "ext3",
+       .fs_owner               = THIS_MODULE,
+       .fs_getlabel            = fsfilt_ext3_get_label,
+       .fs_start               = fsfilt_ext3_start,
+       .fs_commit              = fsfilt_ext3_commit,
+       .fs_map_inode_pages     = fsfilt_ext3_map_inode_pages,
+       .fs_write_record        = fsfilt_ext3_write_record,
+       .fs_read_record         = fsfilt_ext3_read_record,
+       .fs_setup               = fsfilt_ext3_setup,
+};
+
+static int __init fsfilt_ext3_init(void)
+{
+       int rc;
+
+       fcb_cache = kmem_cache_create("fsfilt_ext3_fcb",
+                                     sizeof(struct fsfilt_cb_data), 0, 0, NULL);
+       if (!fcb_cache) {
+               CERROR("error allocating fsfilt journal callback cache\n");
+               GOTO(out, rc = -ENOMEM);
+       }
+
+       rc = fsfilt_register_ops(&fsfilt_ext3_ops);
+
+       if (rc)
+               kmem_cache_destroy(fcb_cache);
+out:
+       return rc;
+}
+
+static void __exit fsfilt_ext3_exit(void)
+{
+       fsfilt_unregister_ops(&fsfilt_ext3_ops);
+       kmem_cache_destroy(fcb_cache);
+}
+
+module_init(fsfilt_ext3_init);
+module_exit(fsfilt_ext3_exit);
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c b/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c
new file mode 100644 (file)
index 0000000..97a8be2
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_lib.c
+ *
+ * Lustre filesystem abstraction routines
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+#include <linux/module.h>
+#include <lustre_lib.h>
+#include <lprocfs_status.h>
+
+#ifdef LPROCFS
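+/* Add @amount to counter @idx of @stats on the local CPU, updating the
+ * count, sum, min/max and sum-of-squares fields according to the counter's
+ * configuration. */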
+void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount)
+{
+       struct lprocfs_counter          *percpu_cntr;
+       struct lprocfs_counter_header   *header;
+       int                             smp_id;
+       unsigned long                   flags = 0;
+
+       if (stats == NULL)
+               return;
+
+       /* With per-client stats, statistics are allocated only for a
+        * single CPU area, so smp_id should always be 0. */
+       smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+       if (smp_id < 0)
+               return;
+
+       header = &stats->ls_cnt_header[idx];
+       percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx);
+       percpu_cntr->lc_count++;
+
+       if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+               /*
+                * lprocfs_counter_add() can be called in interrupt context,
+                * as a memory allocation may invoke the memory shrinker via
+                * ldlm_pool_shrink(), which in turn calls
+                * lprocfs_counter_add().  LU-1727.
+                *
+                * Only obd_memory uses the LPROCFS_STATS_FLAG_IRQ_SAFE
+                * flag, because it needs accurate counting lest the memory
+                * leak check report errors.
+                */
+               if (in_interrupt() &&
+                   (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                       percpu_cntr->lc_sum_irq += amount;
+               else
+                       percpu_cntr->lc_sum += amount;
+
+               if (header->lc_config & LPROCFS_CNTR_STDDEV)
+                       percpu_cntr->lc_sumsquare += (__s64)amount * amount;
+               if (amount < percpu_cntr->lc_min)
+                       percpu_cntr->lc_min = amount;
+               if (amount > percpu_cntr->lc_max)
+                       percpu_cntr->lc_max = amount;
+       }
+       lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_add);
+
+void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount)
+{
+       struct lprocfs_counter          *percpu_cntr;
+       struct lprocfs_counter_header   *header;
+       int                             smp_id;
+       unsigned long                   flags = 0;
+
+       if (stats == NULL)
+               return;
+
+       /* With per-client stats, statistics are allocated only for a
+        * single CPU area, so smp_id should always be 0. */
+       smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+       if (smp_id < 0)
+               return;
+
+       header = &stats->ls_cnt_header[idx];
+       percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx);
+       if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+               /*
+                * Sometimes we free memory from RCU callbacks, which call
+                * lprocfs_counter_sub(), and RCU callbacks may execute in
+                * softirq context - right now that is the only way we can
+                * get here in softirq context, so use a separate counter
+                * for it.  bz20650.
+                *
+                * Only obd_memory uses the LPROCFS_STATS_FLAG_IRQ_SAFE
+                * flag, because it needs accurate counting lest the memory
+                * leak check report errors.
+                */
+               if (in_interrupt() &&
+                   (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                       percpu_cntr->lc_sum_irq -= amount;
+               else
+                       percpu_cntr->lc_sum -= amount;
+       }
+       lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_sub);
+
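+/* Allocate and initialise the per-CPU counter array of @stats for @cpuid on
+ * first use, and keep ls_biggest_alloc_num up to date. */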
+int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid)
+{
+       struct lprocfs_counter  *cntr;
+       unsigned int            percpusize;
+       int                     rc = -ENOMEM;
+       unsigned long           flags = 0;
+       int                     i;
+
+       LASSERT(stats->ls_percpu[cpuid] == NULL);
+       LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0);
+
+       percpusize = lprocfs_stats_counter_size(stats);
+       LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize);
+       if (stats->ls_percpu[cpuid] != NULL) {
+               rc = 0;
+               if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+                               spin_lock_irqsave(&stats->ls_lock, flags);
+                       else
+                               spin_lock(&stats->ls_lock);
+                       if (stats->ls_biggest_alloc_num <= cpuid)
+                               stats->ls_biggest_alloc_num = cpuid + 1;
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+                               spin_unlock_irqrestore(&stats->ls_lock, flags);
+                       } else {
+                               spin_unlock(&stats->ls_lock);
+                       }
+               }
+               /* initialize any counter fields of ls_percpu[cpuid] that
+                * need a non-zero initial value */
+               for (i = 0; i < stats->ls_num; ++i) {
+                       cntr = lprocfs_stats_counter_get(stats, cpuid, i);
+                       cntr->lc_min = LC_MIN_INIT;
+               }
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_stats_alloc_one);
+#endif  /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
new file mode 100644 (file)
index 0000000..1e6f32c
--- /dev/null
@@ -0,0 +1,295 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_linux.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/version.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+#include <obd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/lustre_compat25.h>
+#include <lvfs.h>
+
+#include <obd.h>
+#include <lustre_lib.h>
+
+struct lprocfs_stats *obd_memory = NULL;
+EXPORT_SYMBOL(obd_memory);
+/* refine later and change to a seqlock or similar from libcfs */
+
+/* Debugging check only needed during development */
+#ifdef OBD_CTXT_DEBUG
+# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
+# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
+                                             msg)
+# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
+#else
+# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
+# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
+# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
+#endif
+
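+/* Temporarily replace (or drop) the current task's supplementary groups,
+ * saving the old state in @save for pop_group_info(). */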
+static void push_group_info(struct lvfs_run_ctxt *save,
+                           struct group_info *ginfo)
+{
+       if (!ginfo) {
+               save->ngroups = current_ngroups;
+               current_ngroups = 0;
+       } else {
+               struct cred *cred;
+               task_lock(current);
+               save->group_info = current_cred()->group_info;
+               if ((cred = prepare_creds())) {
+                       cred->group_info = ginfo;
+                       commit_creds(cred);
+               }
+               task_unlock(current);
+       }
+}
+
+static void pop_group_info(struct lvfs_run_ctxt *save,
+                          struct group_info *ginfo)
+{
+       if (!ginfo) {
+               current_ngroups = save->ngroups;
+       } else {
+               struct cred *cred;
+               task_lock(current);
+               if ((cred = prepare_creds())) {
+                       cred->group_info = save->group_info;
+                       commit_creds(cred);
+               }
+               task_unlock(current);
+       }
+}
+
+/* push / pop to root of obd store */
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+              struct lvfs_ucred *uc)
+{
+       /* if there is an underlying dt_device then push_ctxt is not needed */
+       if (new_ctx->dt != NULL)
+               return;
+
+       //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
+       ASSERT_CTXT_MAGIC(new_ctx->magic);
+       OBD_SET_CTXT_MAGIC(save);
+
+       save->fs = get_fs();
+       LASSERT(d_refcount(cfs_fs_pwd(current->fs)));
+       LASSERT(d_refcount(new_ctx->pwd));
+       save->pwd = dget(cfs_fs_pwd(current->fs));
+       save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
+       save->luc.luc_umask = current_umask();
+       save->ngroups = current_cred()->group_info->ngroups;
+
+       LASSERT(save->pwd);
+       LASSERT(save->pwdmnt);
+       LASSERT(new_ctx->pwd);
+       LASSERT(new_ctx->pwdmnt);
+
+       if (uc) {
+               struct cred *cred;
+               save->luc.luc_uid = current_uid();
+               save->luc.luc_gid = current_gid();
+               save->luc.luc_fsuid = current_fsuid();
+               save->luc.luc_fsgid = current_fsgid();
+               save->luc.luc_cap = current_cap();
+
+               if ((cred = prepare_creds())) {
+                       cred->uid = uc->luc_uid;
+                       cred->gid = uc->luc_gid;
+                       cred->fsuid = uc->luc_fsuid;
+                       cred->fsgid = uc->luc_fsgid;
+                       cred->cap_effective = uc->luc_cap;
+                       commit_creds(cred);
+               }
+
+               push_group_info(save,
+                               uc->luc_ginfo ?:
+                               uc->luc_identity ? uc->luc_identity->mi_ginfo :
+                                                  NULL);
+       }
+       current->fs->umask = 0; /* umask already applied on client */
+       set_fs(new_ctx->fs);
+       ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
+}
+EXPORT_SYMBOL(push_ctxt);
+
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+             struct lvfs_ucred *uc)
+{
+       /* if there is an underlying dt_device then pop_ctxt is not needed */
+       if (new_ctx->dt != NULL)
+               return;
+
+       ASSERT_CTXT_MAGIC(saved->magic);
+       ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
+
+       LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
+                cfs_fs_pwd(current->fs), new_ctx->pwd);
+       LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
+                cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
+
+       set_fs(saved->fs);
+       ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
+
+       dput(saved->pwd);
+       mntput(saved->pwdmnt);
+       current->fs->umask = saved->luc.luc_umask;
+       if (uc) {
+               struct cred *cred;
+               if ((cred = prepare_creds())) {
+                       cred->uid = saved->luc.luc_uid;
+                       cred->gid = saved->luc.luc_gid;
+                       cred->fsuid = saved->luc.luc_fsuid;
+                       cred->fsgid = saved->luc.luc_fsgid;
+                       cred->cap_effective = saved->luc.luc_cap;
+                       commit_creds(cred);
+               }
+
+               pop_group_info(saved,
+                              uc->luc_ginfo ?:
+                              uc->luc_identity ? uc->luc_identity->mi_ginfo :
+                                                 NULL);
+       }
+}
+EXPORT_SYMBOL(pop_ctxt);
+
+/* utility to rename a file */
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
+                 char *oldname, char *newname)
+{
+       struct dentry *dchild_old, *dchild_new;
+       int err = 0;
+       ENTRY;
+
+       ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
+       CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
+              (int)strlen(oldname), oldname, (int)strlen(newname), newname);
+
+       dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
+       if (IS_ERR(dchild_old))
+               RETURN(PTR_ERR(dchild_old));
+
+       if (!dchild_old->d_inode)
+               GOTO(put_old, err = -ENOENT);
+
+       dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
+       if (IS_ERR(dchild_new))
+               GOTO(put_old, err = PTR_ERR(dchild_new));
+
+       err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
+                           dir->d_inode, dchild_new, mnt);
+
+       dput(dchild_new);
+put_old:
+       dput(dchild_old);
+       RETURN(err);
+}
+EXPORT_SYMBOL(lustre_rename);
+
+/* Note: dput(dchild) will *not* be called if there is an error */
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
+                            int flags)
+{
+       struct path path = {
+               .dentry = de,
+               .mnt = ctxt->pwdmnt,
+       };
+       return ll_dentry_open(&path, flags, current_cred());
+}
+EXPORT_SYMBOL(l_dentry_open);
+
+#ifdef LPROCFS
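+/* Extract a single field (config, sum, min, max, ...) from a counter,
+ * folding in the interrupt-context sum when the stats are IRQ-safe. */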
+__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+                         struct lprocfs_counter_header *header,
+                         enum lprocfs_stats_flags flags,
+                         enum lprocfs_fields_flags field)
+{
+       __s64 ret = 0;
+
+       if (lc == NULL || header == NULL)
+               RETURN(0);
+
+       switch (field) {
+               case LPROCFS_FIELDS_FLAGS_CONFIG:
+                       ret = header->lc_config;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_SUM:
+                       ret = lc->lc_sum;
+                       if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                               ret += lc->lc_sum_irq;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_MIN:
+                       ret = lc->lc_min;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_MAX:
+                       ret = lc->lc_max;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_AVG:
+                       ret = (lc->lc_max - lc->lc_min) / 2;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
+                       ret = lc->lc_sumsquare;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_COUNT:
+                       ret = lc->lc_count;
+                       break;
+               default:
+                       break;
+       }
+
+       RETURN(ret);
+}
+EXPORT_SYMBOL(lprocfs_read_helper);
+#endif /* LPROCFS */
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/mdc/Makefile b/drivers/staging/lustre/lustre/mdc/Makefile
new file mode 100644 (file)
index 0000000..93bae24
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += mdc.o
+mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c
new file mode 100644 (file)
index 0000000..a6a8a0d
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+#ifdef LPROCFS
+
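+/* Read/write handlers for the max_rpcs_in_flight tunable; writes are bounded
+ * by MDC_MAX_RIF_MAX and the value is protected by cl_loi_list_lock. */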
+static int mdc_rd_max_rpcs_in_flight(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = snprintf(page, count, "%u\n", cli->cl_max_rpcs_in_flight);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static int mdc_wr_max_rpcs_in_flight(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val < 1 || val > MDC_MAX_RIF_MAX)
+               return -ERANGE;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_max_rpcs_in_flight = val;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return count;
+}
+
+/* temporary for testing */
+static int mdc_wr_kuc(struct file *file, const char *buffer,
+                     unsigned long count, void *data)
+{
+       struct obd_device       *obd = data;
+       struct kuc_hdr          *lh;
+       struct hsm_action_list  *hal;
+       struct hsm_action_item  *hai;
+       int                      len;
+       int                      fd, rc;
+       ENTRY;
+
+       rc = lprocfs_write_helper(buffer, count, &fd);
+       if (rc)
+               RETURN(rc);
+
+       if (fd < 0)
+               RETURN(-ERANGE);
+       CWARN("message to fd %d\n", fd);
+
+       len = sizeof(*lh) + sizeof(*hal) + MTI_NAME_MAXLEN +
+               /* for mockup below */ 2 * cfs_size_round(sizeof(*hai));
+
+       OBD_ALLOC(lh, len);
+       if (lh == NULL)
+               RETURN(-ENOMEM);
+
+       lh->kuc_magic = KUC_MAGIC;
+       lh->kuc_transport = KUC_TRANSPORT_HSM;
+       lh->kuc_msgtype = HMT_ACTION_LIST;
+       lh->kuc_msglen = len;
+
+       hal = (struct hsm_action_list *)(lh + 1);
+       hal->hal_version = HAL_VERSION;
+       hal->hal_archive_id = 1;
+       hal->hal_flags = 0;
+       obd_uuid2fsname(hal->hal_fsname, obd->obd_name, MTI_NAME_MAXLEN);
+
+       /* mock up an action list */
+       hal->hal_count = 2;
+       hai = hai_zero(hal);
+       hai->hai_action = HSMA_ARCHIVE;
+       hai->hai_fid.f_oid = 5;
+       hai->hai_len = sizeof(*hai);
+       hai = hai_next(hai);
+       hai->hai_action = HSMA_RESTORE;
+       hai->hai_fid.f_oid = 10;
+       hai->hai_len = sizeof(*hai);
+
+       /* This works for either broadcast or unicast to a single fd */
+       if (fd == 0) {
+               rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+       } else {
+               struct file *fp = fget(fd);
+
+               if (fp != NULL) {
+                       rc = libcfs_kkuc_msg_put(fp, lh);
+                       fput(fp);
+               } else {
+                       rc = -EBADF;
+               }
+       }
+       OBD_FREE(lh, len);
+       if (rc < 0)
+               RETURN(rc);
+       RETURN(count);
+}
+
+static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
+       { "uuid",           lprocfs_rd_uuid,    0, 0 },
+       { "ping",           0, lprocfs_wr_ping,     0, 0, 0222 },
+       { "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
+       { "blocksize",       lprocfs_rd_blksize,     0, 0 },
+       { "kbytestotal",     lprocfs_rd_kbytestotal, 0, 0 },
+       { "kbytesfree",      lprocfs_rd_kbytesfree,  0, 0 },
+       { "kbytesavail",     lprocfs_rd_kbytesavail, 0, 0 },
+       { "filestotal",      lprocfs_rd_filestotal,  0, 0 },
+       { "filesfree",       lprocfs_rd_filesfree,   0, 0 },
+       /*{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },*/
+       { "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 },
+       { "mds_conn_uuid",   lprocfs_rd_conn_uuid,   0, 0 },
+       /*
+        * FIXME: the proc entry below is provided but not used; instead
+        * sbi->sb_md_brw_size is used.  The per-obd variable should be
+        * used once CMD is enabled and dir pages are managed in the MDC
+        * layer.  Remember to enable the proc write function then.
+        */
+       { "max_pages_per_rpc",  lprocfs_obd_rd_max_pages_per_rpc,
+                               /* lprocfs_obd_wr_max_pages_per_rpc */0, 0 },
+       { "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight,
+                               mdc_wr_max_rpcs_in_flight, 0 },
+       { "timeouts",   lprocfs_rd_timeouts,    0, 0 },
+       { "import",       lprocfs_rd_import,      lprocfs_wr_import, 0 },
+       { "state",         lprocfs_rd_state,       0, 0 },
+       { "hsm_nl",       0, mdc_wr_kuc,          0, 0, 0200 },
+       { "pinger_recov",    lprocfs_rd_pinger_recov,
+                            lprocfs_wr_pinger_recov, 0, 0 },
+       { 0 }
+};
+
+static struct lprocfs_vars lprocfs_mdc_module_vars[] = {
+       { "num_refs",   lprocfs_rd_numrefs,     0, 0 },
+       { 0 }
+};
+
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars  = lprocfs_mdc_module_vars;
+       lvars->obd_vars     = lprocfs_mdc_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/drivers/staging/lustre/lustre/mdc/mdc_internal.h
new file mode 100644 (file)
index 0000000..2aeff0e
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MDC_INTERNAL_H
+#define _MDC_INTERNAL_H
+
+#include <lustre_mdc.h>
+#include <lustre_mds.h>
+
+#ifdef LPROCFS
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid,
+                  struct obd_capa *oc, __u64 valid, int ea_size,
+                  __u32 suppgid, int flags);
+void mdc_pack_capa(struct ptlrpc_request *req,
+                  const struct req_msg_field *field, struct obd_capa *oc);
+int mdc_pack_req(struct ptlrpc_request *req, int version, int opc);
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+                       const struct lu_fid *cfid, int flags);
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+                          struct md_op_data *op_data);
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size,
+                     const struct lu_fid *fid, struct obd_capa *oc);
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+                     struct md_op_data *data, int ea_size);
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    void *ea, int ealen, void *ea2, int ea2len);
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    const void *data, int datalen, __u32 mode, __u32 uid,
+                    __u32 gid, cfs_cap_t capability, __u64 rdev);
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                  __u32 mode, __u64 rdev, __u32 flags, const void *data,
+                  int datalen);
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    const char *old, int oldlen, const char *new, int newlen);
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+int mdc_enter_request(struct client_obd *cli);
+void mdc_exit_request(struct client_obd *cli);
+
+/* mdc/mdc_locks.c */
+int mdc_set_lock_data(struct obd_export *exp,
+                     __u64 *lockh, void *data, __u64 *bits);
+
+int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid);
+
+int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+                   ldlm_iterator_t it, void *data);
+
+int mdc_intent_lock(struct obd_export *exp,
+                   struct md_op_data *,
+                   void *lmm, int lmmsize,
+                   struct lookup_intent *, int,
+                   struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags);
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+               struct lookup_intent *it, struct md_op_data *op_data,
+               struct lustre_handle *lockh, void *lmm, int lmmsize,
+               struct ptlrpc_request **req, __u64 extra_lock_flags);
+
+int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid,
+                           struct list_head *cancels, ldlm_mode_t mode,
+                           __u64 bits);
+/* mdc/mdc_request.c */
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+                 struct md_op_data *op_data);
+
+int mdc_open(struct obd_export *exp, obd_id ino, int type, int flags,
+            struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh,
+            struct ptlrpc_request **);
+
+struct obd_client_handle;
+
+int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req,
+                     struct obd_export *dt_exp, struct obd_export *lmv_exp,
+                     struct lustre_md *md);
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md);
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+                            struct obd_client_handle *och,
+                            struct ptlrpc_request *open_req);
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+                              struct obd_client_handle *och);
+void mdc_commit_open(struct ptlrpc_request *req);
+void mdc_replay_open(struct ptlrpc_request *req);
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+              const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+              cfs_cap_t capability, __u64 rdev,
+              struct ptlrpc_request **request);
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+            struct ptlrpc_request **request);
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+              const char *old, int oldlen, const char *new, int newlen,
+              struct ptlrpc_request **request);
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+               void *ea, int ealen, void *ea2, int ea2len,
+               struct ptlrpc_request **request, struct md_open_data **mod);
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+              struct ptlrpc_request **request);
+int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+                     ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                     ldlm_cancel_flags_t flags, void *opaque);
+
+static inline void mdc_set_capa_size(struct ptlrpc_request *req,
+                                    const struct req_msg_field *field,
+                                    struct obd_capa *oc)
+{
+       if (oc == NULL)
+               req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+       else
+               /* it is already calculated as sizeof struct obd_capa */
+               ;
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+                       struct lu_fid *fid, __u64 *bits);
+
+int mdc_intent_getattr_async(struct obd_export *exp,
+                            struct md_enqueue_info *minfo,
+                            struct ldlm_enqueue_info *einfo);
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+                          const struct lu_fid *fid, ldlm_type_t type,
+                          ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                          struct lustre_handle *lockh);
+
+static inline int mdc_prep_elc_req(struct obd_export *exp,
+                                  struct ptlrpc_request *req, int opc,
+                                  struct list_head *cancels, int count)
+{
+       return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels,
+                                count);
+}
+
+#endif
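[Editorial aside, not part of the patch] The empty else branch in mdc_set_capa_size() above is intentional: when a capability is attached, the request format already reserves sizeof(struct obd_capa) for the RMF_CAPA* field, so only the no-capability case needs the field shrunk to zero. Its counterpart mdc_pack_capa() (in mdc_lib.c below) then either asserts the zero size or capa_cpy()s the capability into the buffer once the request has been laid out (e.g. by ldlm_prep_enqueue_req()). A hedged sketch of a caller-side wrapper, with a hypothetical name, just to show where the sizing step belongs:

    /* hypothetical helper, illustration only -- not in the patch;
     * mdc_pack_capa(req, &RMF_CAPA1, oc) is called later, after the
     * request buffers have been laid out, and packs nothing if oc is NULL */
    static inline void mdc_demo_size_capa1(struct ptlrpc_request *req,
                                           struct obd_capa *oc)
    {
            mdc_set_capa_size(req, &RMF_CAPA1, oc);
    }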
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/drivers/staging/lustre/lustre/mdc/mdc_lib.c
new file mode 100644 (file)
index 0000000..05c6968
--- /dev/null
@@ -0,0 +1,564 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include "mdc_internal.h"
+
+
+static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid)
+{
+       LASSERT(b != NULL);
+
+       b->suppgid = suppgid;
+       b->uid = current_uid();
+       b->gid = current_gid();
+       b->fsuid = current_fsuid();
+       b->fsgid = current_fsgid();
+       b->capability = cfs_curproc_cap_pack();
+}
+
+void mdc_pack_capa(struct ptlrpc_request *req, const struct req_msg_field *field,
+                  struct obd_capa *oc)
+{
+       struct req_capsule *pill = &req->rq_pill;
+       struct lustre_capa *c;
+
+       if (oc == NULL) {
+               LASSERT(req_capsule_get_size(pill, field, RCL_CLIENT) == 0);
+               return;
+       }
+
+       c = req_capsule_client_get(pill, field);
+       LASSERT(c != NULL);
+       capa_cpy(c, oc);
+       DEBUG_CAPA(D_SEC, c, "pack");
+}
+
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+                       const struct lu_fid *cfid, int flags)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+
+       if (pfid) {
+               b->fid1 = *pfid;
+               b->valid = OBD_MD_FLID;
+       }
+       if (cfid)
+               b->fid2 = *cfid;
+       b->flags = flags;
+}
+
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+                          struct md_op_data *op_data)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+
+       __mdc_pack_body(b, op_data->op_suppgids[0]);
+       b->fid1 = op_data->op_fid1;
+       b->fid2 = op_data->op_fid2;
+       b->valid |= OBD_MD_FLID;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+}
+
+void mdc_pack_body(struct ptlrpc_request *req,
+                  const struct lu_fid *fid, struct obd_capa *oc,
+                  __u64 valid, int ea_size, __u32 suppgid, int flags)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+       LASSERT(b != NULL);
+       b->valid = valid;
+       b->eadatasize = ea_size;
+       b->flags = flags;
+       __mdc_pack_body(b, suppgid);
+       if (fid) {
+               b->fid1 = *fid;
+               b->valid |= OBD_MD_FLID;
+               mdc_pack_capa(req, &RMF_CAPA1, oc);
+       }
+}
+
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff,
+                     __u32 size, const struct lu_fid *fid, struct obd_capa *oc)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+       b->fid1 = *fid;
+       b->valid |= OBD_MD_FLID;
+       b->size = pgoff;                       /* !! */
+       b->nlink = size;                        /* !! */
+       __mdc_pack_body(b, -1);
+       b->mode = LUDA_FID | LUDA_TYPE;
+
+       mdc_pack_capa(req, &RMF_CAPA1, oc);
+}
+
+/* packing of MDS records */
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    const void *data, int datalen, __u32 mode,
+                    __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev)
+{
+       struct mdt_rec_create   *rec;
+       char                    *tmp;
+       __u64                    flags;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+
+       rec->cr_opcode   = REINT_CREATE;
+       rec->cr_fsuid    = uid;
+       rec->cr_fsgid    = gid;
+       rec->cr_cap      = cap_effective;
+       rec->cr_fid1     = op_data->op_fid1;
+       rec->cr_fid2     = op_data->op_fid2;
+       rec->cr_mode     = mode;
+       rec->cr_rdev     = rdev;
+       rec->cr_time     = op_data->op_mod_time;
+       rec->cr_suppgid1 = op_data->op_suppgids[0];
+       rec->cr_suppgid2 = op_data->op_suppgids[1];
+       flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+       if (op_data->op_bias & MDS_CREATE_VOLATILE)
+               flags |= MDS_OPEN_VOLATILE;
+       set_mrc_cr_flags(rec, flags);
+       rec->cr_bias     = op_data->op_bias;
+       rec->cr_umask    = current_umask();
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+       LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+
+       if (data) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+               memcpy(tmp, data, datalen);
+       }
+}
+
+static __u64 mds_pack_open_flags(__u32 flags, __u32 mode)
+{
+       __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE |
+                                  MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS |
+                                  MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK |
+                                  MDS_OPEN_BY_FID));
+       if (flags & O_CREAT)
+               cr_flags |= MDS_OPEN_CREAT;
+       if (flags & O_EXCL)
+               cr_flags |= MDS_OPEN_EXCL;
+       if (flags & O_TRUNC)
+               cr_flags |= MDS_OPEN_TRUNC;
+       if (flags & O_APPEND)
+               cr_flags |= MDS_OPEN_APPEND;
+       if (flags & O_SYNC)
+               cr_flags |= MDS_OPEN_SYNC;
+       if (flags & O_DIRECTORY)
+               cr_flags |= MDS_OPEN_DIRECTORY;
+#ifdef FMODE_EXEC
+       if (flags & FMODE_EXEC)
+               cr_flags |= MDS_FMODE_EXEC;
+#endif
+       if (flags & O_LOV_DELAY_CREATE)
+               cr_flags |= MDS_OPEN_DELAY_CREATE;
+
+       if ((flags & O_NOACCESS) || (flags & O_NONBLOCK))
+               cr_flags |= MDS_OPEN_NORESTORE;
+
+       return cr_flags;
+}
+
+/* packing of MDS records */
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                  __u32 mode, __u64 rdev, __u32 flags, const void *lmm,
+                  int lmmlen)
+{
+       struct mdt_rec_create *rec;
+       char *tmp;
+       __u64 cr_flags;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+       /* XXX do something about time, uid, gid */
+       rec->cr_opcode   = REINT_OPEN;
+       rec->cr_fsuid   = current_fsuid();
+       rec->cr_fsgid   = current_fsgid();
+       rec->cr_cap      = cfs_curproc_cap_pack();
+       if (op_data != NULL) {
+               rec->cr_fid1 = op_data->op_fid1;
+               rec->cr_fid2 = op_data->op_fid2;
+       }
+       rec->cr_mode     = mode;
+       cr_flags = mds_pack_open_flags(flags, mode);
+       rec->cr_rdev     = rdev;
+       rec->cr_time     = op_data->op_mod_time;
+       rec->cr_suppgid1 = op_data->op_suppgids[0];
+       rec->cr_suppgid2 = op_data->op_suppgids[1];
+       rec->cr_bias     = op_data->op_bias;
+       rec->cr_umask    = current_umask();
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       /* The next buffer is the child capa, which is used for replay;
+        * it will be packed from the data in the reply message. */
+
+       if (op_data->op_name) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+               if (op_data->op_bias & MDS_CREATE_VOLATILE)
+                       cr_flags |= MDS_OPEN_VOLATILE;
+       }
+
+       if (lmm) {
+               cr_flags |= MDS_OPEN_HAS_EA;
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+               memcpy(tmp, lmm, lmmlen);
+       }
+       set_mrc_cr_flags(rec, cr_flags);
+}
+
+static inline __u64 attr_pack(unsigned int ia_valid) {
+       __u64 sa_valid = 0;
+
+       if (ia_valid & ATTR_MODE)
+               sa_valid |= MDS_ATTR_MODE;
+       if (ia_valid & ATTR_UID)
+               sa_valid |= MDS_ATTR_UID;
+       if (ia_valid & ATTR_GID)
+               sa_valid |= MDS_ATTR_GID;
+       if (ia_valid & ATTR_SIZE)
+               sa_valid |= MDS_ATTR_SIZE;
+       if (ia_valid & ATTR_ATIME)
+               sa_valid |= MDS_ATTR_ATIME;
+       if (ia_valid & ATTR_MTIME)
+               sa_valid |= MDS_ATTR_MTIME;
+       if (ia_valid & ATTR_CTIME)
+               sa_valid |= MDS_ATTR_CTIME;
+       if (ia_valid & ATTR_ATIME_SET)
+               sa_valid |= MDS_ATTR_ATIME_SET;
+       if (ia_valid & ATTR_MTIME_SET)
+               sa_valid |= MDS_ATTR_MTIME_SET;
+       if (ia_valid & ATTR_FORCE)
+               sa_valid |= MDS_ATTR_FORCE;
+       if (ia_valid & ATTR_ATTR_FLAG)
+               sa_valid |= MDS_ATTR_ATTR_FLAG;
+       if (ia_valid & ATTR_KILL_SUID)
+               sa_valid |=  MDS_ATTR_KILL_SUID;
+       if (ia_valid & ATTR_KILL_SGID)
+               sa_valid |= MDS_ATTR_KILL_SGID;
+       if (ia_valid & ATTR_CTIME_SET)
+               sa_valid |= MDS_ATTR_CTIME_SET;
+       if (ia_valid & ATTR_FROM_OPEN)
+               sa_valid |= MDS_ATTR_FROM_OPEN;
+       if (ia_valid & ATTR_BLOCKS)
+               sa_valid |= MDS_ATTR_BLOCKS;
+       if (ia_valid & MDS_OPEN_OWNEROVERRIDE)
+               /* NFSD hack (see bug 5781) */
+               sa_valid |= MDS_OPEN_OWNEROVERRIDE;
+       return sa_valid;
+}
+
+static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec,
+                                struct md_op_data *op_data)
+{
+       rec->sa_opcode  = REINT_SETATTR;
+       rec->sa_fsuid   = current_fsuid();
+       rec->sa_fsgid   = current_fsgid();
+       rec->sa_cap     = cfs_curproc_cap_pack();
+       rec->sa_suppgid = -1;
+
+       rec->sa_fid    = op_data->op_fid1;
+       rec->sa_valid  = attr_pack(op_data->op_attr.ia_valid);
+       rec->sa_mode   = op_data->op_attr.ia_mode;
+       rec->sa_uid    = op_data->op_attr.ia_uid;
+       rec->sa_gid    = op_data->op_attr.ia_gid;
+       rec->sa_size   = op_data->op_attr.ia_size;
+       rec->sa_blocks = op_data->op_attr_blocks;
+       rec->sa_atime  = LTIME_S(op_data->op_attr.ia_atime);
+       rec->sa_mtime  = LTIME_S(op_data->op_attr.ia_mtime);
+       rec->sa_ctime  = LTIME_S(op_data->op_attr.ia_ctime);
+       rec->sa_attr_flags = ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+       if ((op_data->op_attr.ia_valid & ATTR_GID) &&
+           current_is_in_group(op_data->op_attr.ia_gid))
+               rec->sa_suppgid = op_data->op_attr.ia_gid;
+       else
+               rec->sa_suppgid = op_data->op_suppgids[0];
+
+       rec->sa_bias = op_data->op_bias;
+}
+
+static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch,
+                            struct md_op_data *op_data)
+{
+       memcpy(&epoch->handle, &op_data->op_handle, sizeof(epoch->handle));
+       epoch->ioepoch = op_data->op_ioepoch;
+       epoch->flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+}
+
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                     void *ea, int ealen, void *ea2, int ea2len)
+{
+       struct mdt_rec_setattr *rec;
+       struct mdt_ioepoch *epoch;
+       struct lov_user_md *lum = NULL;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_setattr));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+       mdc_setattr_pack_rec(rec, op_data);
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+       if (op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) {
+               epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+               mdc_ioepoch_pack(epoch, op_data);
+       }
+
+       if (ealen == 0)
+               return;
+
+       lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+       if (ea == NULL) { /* Remove LOV EA */
+               lum->lmm_magic = LOV_USER_MAGIC_V1;
+               lum->lmm_stripe_size = 0;
+               lum->lmm_stripe_count = 0;
+               lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1);
+       } else {
+               memcpy(lum, ea, ealen);
+       }
+
+       if (ea2len == 0)
+               return;
+
+       memcpy(req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES), ea2,
+              ea2len);
+}
+
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+       struct mdt_rec_unlink *rec;
+       char *tmp;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+       LASSERT(rec != NULL);
+
+       rec->ul_opcode  = op_data->op_cli_flags & CLI_RM_ENTRY ?
+                                       REINT_RMENTRY : REINT_UNLINK;
+       rec->ul_fsuid   = op_data->op_fsuid;
+       rec->ul_fsgid   = op_data->op_fsgid;
+       rec->ul_cap     = op_data->op_cap;
+       rec->ul_mode    = op_data->op_mode;
+       rec->ul_suppgid1 = op_data->op_suppgids[0];
+       rec->ul_suppgid2 = -1;
+       rec->ul_fid1    = op_data->op_fid1;
+       rec->ul_fid2    = op_data->op_fid2;
+       rec->ul_time    = op_data->op_mod_time;
+       rec->ul_bias    = op_data->op_bias;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+       LASSERT(tmp != NULL);
+       LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+}
+
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+       struct mdt_rec_link *rec;
+       char *tmp;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+       LASSERT(rec != NULL);
+
+       rec->lk_opcode   = REINT_LINK;
+       rec->lk_fsuid    = op_data->op_fsuid;    /* current->fsuid */
+       rec->lk_fsgid    = op_data->op_fsgid;    /* current->fsgid */
+       rec->lk_cap      = op_data->op_cap;      /* current->cap_effective */
+       rec->lk_suppgid1 = op_data->op_suppgids[0];
+       rec->lk_suppgid2 = op_data->op_suppgids[1];
+       rec->lk_fid1     = op_data->op_fid1;
+       rec->lk_fid2     = op_data->op_fid2;
+       rec->lk_time     = op_data->op_mod_time;
+       rec->lk_bias     = op_data->op_bias;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+       LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+}
+
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    const char *old, int oldlen, const char *new, int newlen)
+{
+       struct mdt_rec_rename *rec;
+       char *tmp;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+       /* XXX do something about time, uid, gid */
+       rec->rn_opcode   = REINT_RENAME;
+       rec->rn_fsuid    = op_data->op_fsuid;
+       rec->rn_fsgid    = op_data->op_fsgid;
+       rec->rn_cap      = op_data->op_cap;
+       rec->rn_suppgid1 = op_data->op_suppgids[0];
+       rec->rn_suppgid2 = op_data->op_suppgids[1];
+       rec->rn_fid1     = op_data->op_fid1;
+       rec->rn_fid2     = op_data->op_fid2;
+       rec->rn_time     = op_data->op_mod_time;
+       rec->rn_mode     = op_data->op_mode;
+       rec->rn_bias     = op_data->op_bias;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+       LOGL0(old, oldlen, tmp);
+
+       if (new) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT);
+               LOGL0(new, newlen, tmp);
+       }
+}
+
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+                     struct md_op_data *op_data, int ea_size)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+
+       b->valid = valid;
+       if (op_data->op_bias & MDS_CHECK_SPLIT)
+               b->valid |= OBD_MD_FLCKSPLIT;
+       if (op_data->op_bias & MDS_CROSS_REF)
+               b->valid |= OBD_MD_FLCROSSREF;
+       b->eadatasize = ea_size;
+       b->flags = flags;
+       __mdc_pack_body(b, op_data->op_suppgids[0]);
+
+       b->fid1 = op_data->op_fid1;
+       b->fid2 = op_data->op_fid2;
+       b->valid |= OBD_MD_FLID;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+       if (op_data->op_name) {
+               char *tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+
+       }
+}
+
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+       struct mdt_ioepoch *epoch;
+       struct mdt_rec_setattr *rec;
+
+       epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+       mdc_setattr_pack_rec(rec, op_data);
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_ioepoch_pack(epoch, op_data);
+}
+
+static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+       int rc;
+       ENTRY;
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = list_empty(&mcw->mcw_entry);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       RETURN(rc);
+}
+
+/* We record requests in flight in cli->cl_r_in_flight here.
+ * There is only one write RPC possible in mdc anyway. If this changes
+ * in the future, the code may need to be revisited. */
+int mdc_enter_request(struct client_obd *cli)
+{
+       int rc = 0;
+       struct mdc_cache_waiter mcw;
+       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+               list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+               init_waitqueue_head(&mcw.mcw_waitq);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw), &lwi);
+               if (rc) {
+                       client_obd_list_lock(&cli->cl_loi_list_lock);
+                       if (list_empty(&mcw.mcw_entry))
+                               cli->cl_r_in_flight--;
+                       list_del_init(&mcw.mcw_entry);
+                       client_obd_list_unlock(&cli->cl_loi_list_lock);
+               }
+       } else {
+               cli->cl_r_in_flight++;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+       }
+       return rc;
+}
+
+void mdc_exit_request(struct client_obd *cli)
+{
+       struct list_head *l, *tmp;
+       struct mdc_cache_waiter *mcw;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_r_in_flight--;
+       list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+               if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+                       /* No free request slots anymore */
+                       break;
+               }
+
+               mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry);
+               list_del_init(&mcw->mcw_entry);
+               cli->cl_r_in_flight++;
+               wake_up(&mcw->mcw_waitq);
+       }
+       /* Empty waiting list? Decrease reqs in-flight number */
+
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
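[Editorial aside, not part of the patch] mdc_enter_request()/mdc_exit_request() above implement a counting limiter over cli->cl_r_in_flight: a caller either takes one of cl_max_rpcs_in_flight slots immediately or parks itself on cl_cache_waiters, and mdc_exit_request() hands its freed slot directly to the first waiter before waking it. A minimal standalone sketch of the same pattern follows, using plain kernel primitives instead of the client_obd_list_lock()/l_wait_event() wrappers; it leaves out limiter initialisation and the interruptible-wait (signal) handling, and every name in it is hypothetical:

    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/wait.h>

    struct slot_limiter {
            spinlock_t              lock;           /* protects the fields below */
            unsigned int            in_flight;      /* slots currently taken */
            unsigned int            max_in_flight;  /* cl_max_rpcs_in_flight analogue */
            struct list_head        waiters;        /* cl_cache_waiters analogue */
    };

    struct slot_waiter {
            struct list_head        entry;
            wait_queue_head_t       waitq;
    };

    static void limiter_enter(struct slot_limiter *lim)
    {
            struct slot_waiter w;

            spin_lock(&lim->lock);
            if (lim->in_flight < lim->max_in_flight) {
                    lim->in_flight++;               /* fast path: a slot is free */
                    spin_unlock(&lim->lock);
                    return;
            }
            init_waitqueue_head(&w.waitq);
            list_add_tail(&w.entry, &lim->waiters);
            spin_unlock(&lim->lock);

            /* limiter_exit() unlinks us before waking us, so an empty
             * entry means the freed slot has been handed to us */
            wait_event(w.waitq, list_empty_careful(&w.entry));
    }

    static void limiter_exit(struct slot_limiter *lim)
    {
            struct slot_waiter *w;

            spin_lock(&lim->lock);
            lim->in_flight--;
            if (!list_empty(&lim->waiters) &&
                lim->in_flight < lim->max_in_flight) {
                    w = list_first_entry(&lim->waiters,
                                         struct slot_waiter, entry);
                    list_del_init(&w->entry);
                    lim->in_flight++;               /* pass the slot straight on */
                    wake_up(&w->waitq);
            }
            spin_unlock(&lim->lock);
    }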
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/drivers/staging/lustre/lustre/mdc/mdc_locks.c
new file mode 100644 (file)
index 0000000..1cc90b6
--- /dev/null
@@ -0,0 +1,1229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+
+#include <lustre_acl.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+/* fid_res_name_eq() */
+#include <lustre_fid.h>
+#include <lprocfs_status.h>
+#include "mdc_internal.h"
+
+struct mdc_getattr_args {
+       struct obd_export          *ga_exp;
+       struct md_enqueue_info      *ga_minfo;
+       struct ldlm_enqueue_info    *ga_einfo;
+};
+
+int it_disposition(struct lookup_intent *it, int flag)
+{
+       return it->d.lustre.it_disposition & flag;
+}
+EXPORT_SYMBOL(it_disposition);
+
+void it_set_disposition(struct lookup_intent *it, int flag)
+{
+       it->d.lustre.it_disposition |= flag;
+}
+EXPORT_SYMBOL(it_set_disposition);
+
+void it_clear_disposition(struct lookup_intent *it, int flag)
+{
+       it->d.lustre.it_disposition &= ~flag;
+}
+EXPORT_SYMBOL(it_clear_disposition);
+
+int it_open_error(int phase, struct lookup_intent *it)
+{
+       if (it_disposition(it, DISP_OPEN_OPEN)) {
+               if (phase >= DISP_OPEN_OPEN)
+                       return it->d.lustre.it_status;
+               else
+                       return 0;
+       }
+
+       if (it_disposition(it, DISP_OPEN_CREATE)) {
+               if (phase >= DISP_OPEN_CREATE)
+                       return it->d.lustre.it_status;
+               else
+                       return 0;
+       }
+
+       if (it_disposition(it, DISP_LOOKUP_EXECD)) {
+               if (phase >= DISP_LOOKUP_EXECD)
+                       return it->d.lustre.it_status;
+               else
+                       return 0;
+       }
+
+       if (it_disposition(it, DISP_IT_EXECD)) {
+               if (phase >= DISP_IT_EXECD)
+                       return it->d.lustre.it_status;
+               else
+                       return 0;
+       }
+       CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
+              it->d.lustre.it_status);
+       LBUG();
+       return 0;
+}
+EXPORT_SYMBOL(it_open_error);
+
+/* this must be called on a lockh that is known to have a referenced lock */
+int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+                     __u64 *bits)
+{
+       struct ldlm_lock *lock;
+       struct inode *new_inode = data;
+       ENTRY;
+
+       if (bits)
+               *bits = 0;
+
+       if (!*lockh)
+               RETURN(0);
+
+       lock = ldlm_handle2lock((struct lustre_handle *)lockh);
+
+       LASSERT(lock != NULL);
+       lock_res_and_lock(lock);
+       if (lock->l_resource->lr_lvb_inode &&
+           lock->l_resource->lr_lvb_inode != data) {
+               struct inode *old_inode = lock->l_resource->lr_lvb_inode;
+               LASSERTF(old_inode->i_state & I_FREEING,
+                        "Found existing inode %p/%lu/%u state %lu in lock: "
+                        "setting data to %p/%lu/%u\n", old_inode,
+                        old_inode->i_ino, old_inode->i_generation,
+                        old_inode->i_state,
+                        new_inode, new_inode->i_ino, new_inode->i_generation);
+       }
+       lock->l_resource->lr_lvb_inode = new_inode;
+       if (bits)
+               *bits = lock->l_policy_data.l_inodebits.bits;
+
+       unlock_res_and_lock(lock);
+       LDLM_LOCK_PUT(lock);
+
+       RETURN(0);
+}
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+                          const struct lu_fid *fid, ldlm_type_t type,
+                          ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                          struct lustre_handle *lockh)
+{
+       struct ldlm_res_id res_id;
+       ldlm_mode_t rc;
+       ENTRY;
+
+       fid_build_reg_res_name(fid, &res_id);
+       rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags,
+                            &res_id, type, policy, mode, lockh, 0);
+       RETURN(rc);
+}
+
+int mdc_cancel_unused(struct obd_export *exp,
+                     const struct lu_fid *fid,
+                     ldlm_policy_data_t *policy,
+                     ldlm_mode_t mode,
+                     ldlm_cancel_flags_t flags,
+                     void *opaque)
+{
+       struct ldlm_res_id res_id;
+       struct obd_device *obd = class_exp2obd(exp);
+       int rc;
+
+       ENTRY;
+
+       fid_build_reg_res_name(fid, &res_id);
+       rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id,
+                                            policy, mode, flags, opaque);
+       RETURN(rc);
+}
+
+int mdc_null_inode(struct obd_export *exp,
+                  const struct lu_fid *fid)
+{
+       struct ldlm_res_id res_id;
+       struct ldlm_resource *res;
+       struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace;
+       ENTRY;
+
+       LASSERTF(ns != NULL, "no namespace passed\n");
+
+       fid_build_reg_res_name(fid, &res_id);
+
+       res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+       if (res == NULL)
+               RETURN(0);
+
+       lock_res(res);
+       res->lr_lvb_inode = NULL;
+       unlock_res(res);
+
+       ldlm_resource_putref(res);
+       RETURN(0);
+}
+
+/* Find any ldlm lock of the inode in mdc.
+ * Return:  0  if no lock is found
+ *          1  if one is found
+ *        < 0  on error */
+int mdc_find_cbdata(struct obd_export *exp,
+                   const struct lu_fid *fid,
+                   ldlm_iterator_t it, void *data)
+{
+       struct ldlm_res_id res_id;
+       int rc = 0;
+       ENTRY;
+
+       fid_build_reg_res_name((struct lu_fid*)fid, &res_id);
+       rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id,
+                                  it, data);
+       if (rc == LDLM_ITER_STOP)
+               RETURN(1);
+       else if (rc == LDLM_ITER_CONTINUE)
+               RETURN(0);
+       RETURN(rc);
+}
+
+static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc)
+{
+       /* Don't hold error requests for replay. */
+       if (req->rq_replay) {
+               spin_lock(&req->rq_lock);
+               req->rq_replay = 0;
+               spin_unlock(&req->rq_lock);
+       }
+       if (rc && req->rq_transno != 0) {
+               DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc);
+               LBUG();
+       }
+}
+
+/* Save a large LOV EA into the request buffer so that it is available
+ * for replay.  We don't do this in the initial request because the
+ * original request doesn't need this buffer (at most it sends just the
+ * lov_mds_md); it is a waste of RAM/bandwidth to send an empty buffer,
+ * and it may also be difficult to allocate and save a very large
+ * request buffer for each open. (bug 5707)
+ *
+ * OOM here may cause recovery failure if lmm is needed (only for the
+ * original open if the MDS crashed just when this client also OOM'd),
+ * but this is incredibly unlikely, and it is questionable whether the
+ * client could do MDS recovery under OOM anyway... */
+static void mdc_realloc_openmsg(struct ptlrpc_request *req,
+                               struct mdt_body *body)
+{
+       int     rc;
+
+       /* FIXME: remove this explicit offset. */
+       rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4,
+                                       body->eadatasize);
+       if (rc) {
+               CERROR("Can't enlarge segment %d size to %d\n",
+                      DLM_INTENT_REC_OFF + 4, body->eadatasize);
+               body->valid &= ~OBD_MD_FLEASIZE;
+               body->eadatasize = 0;
+       }
+}
+
+static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
+                                                  struct lookup_intent *it,
+                                                  struct md_op_data *op_data,
+                                                  void *lmm, int lmmsize,
+                                                  void *cb_data)
+{
+       struct ptlrpc_request *req;
+       struct obd_device     *obddev = class_exp2obd(exp);
+       struct ldlm_intent    *lit;
+       LIST_HEAD(cancels);
+       int                 count = 0;
+       int                 mode;
+       int                 rc;
+       ENTRY;
+
+       it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG;
+
+       /* XXX: openlock is not cancelled for cross-refs. */
+       /* If inode is known, cancel conflicting OPEN locks. */
+       if (fid_is_sane(&op_data->op_fid2)) {
+               if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
+                       mode = LCK_CW;
+#ifdef FMODE_EXEC
+               else if (it->it_flags & FMODE_EXEC)
+                       mode = LCK_PR;
+#endif
+               else
+                       mode = LCK_CR;
+               count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+                                               &cancels, mode,
+                                               MDS_INODELOCK_OPEN);
+       }
+
+       /* If CREATE, cancel parent's UPDATE lock. */
+       if (it->it_op & IT_CREAT)
+               mode = LCK_EX;
+       else
+               mode = LCK_CR;
+       count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                        &cancels, mode,
+                                        MDS_INODELOCK_UPDATE);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_LDLM_INTENT_OPEN);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       /* parent capability */
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       /* Child capability: reserve the size according to the parent capa;
+        * it will be filled in after we get the reply. */
+       mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+                            max(lmmsize, obddev->u.cli.cl_default_mds_easize));
+
+       rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       spin_lock(&req->rq_lock);
+       req->rq_replay = req->rq_import->imp_replayable;
+       spin_unlock(&req->rq_lock);
+
+       /* pack the intent */
+       lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+       lit->opc = (__u64)it->it_op;
+
+       /* pack the intended request */
+       mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm,
+                     lmmsize);
+
+       /* for remote client, fetch remote perm for current user */
+       if (client_is_remote(exp))
+               req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                                    sizeof(struct mdt_remote_perm));
+       ptlrpc_request_set_replen(req);
+       return req;
+}
+
+static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
+                                                    struct lookup_intent *it,
+                                                    struct md_op_data *op_data)
+{
+       struct ptlrpc_request *req;
+       struct obd_device     *obddev = class_exp2obd(exp);
+       struct ldlm_intent    *lit;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_LDLM_INTENT_UNLINK);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* pack the intent */
+       lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+       lit->opc = (__u64)it->it_op;
+
+       /* pack the intended request */
+       mdc_unlink_pack(req, op_data);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obddev->u.cli.cl_max_mds_easize);
+       req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                            obddev->u.cli.cl_max_mds_cookiesize);
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+
+static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp,
+                                                     struct lookup_intent *it,
+                                                     struct md_op_data *op_data)
+{
+       struct ptlrpc_request *req;
+       struct obd_device     *obddev = class_exp2obd(exp);
+       obd_valid             valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
+                                      OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA |
+                                      OBD_MD_FLMDSCAPA | OBD_MD_MEA |
+                                      (client_is_remote(exp) ?
+                                              OBD_MD_FLRMTPERM : OBD_MD_FLACL);
+       struct ldlm_intent    *lit;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_LDLM_INTENT_GETATTR);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* pack the intent */
+       lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+       lit->opc = (__u64)it->it_op;
+
+       /* pack the intended request */
+       mdc_getattr_pack(req, valid, it->it_flags, op_data,
+                        obddev->u.cli.cl_max_mds_easize);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obddev->u.cli.cl_max_mds_easize);
+       if (client_is_remote(exp))
+               req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                                    sizeof(struct mdt_remote_perm));
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+
+static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp,
+                                                    struct lookup_intent *it,
+                                                    struct md_op_data *unused)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       struct ldlm_intent    *lit;
+       struct layout_intent  *layout;
+       int rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                               &RQF_LDLM_INTENT_LAYOUT);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0);
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* pack the intent */
+       lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+       lit->opc = (__u64)it->it_op;
+
+       /* pack the layout intent request */
+       layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT);
+       /* LAYOUT_INTENT_ACCESS is generic; a specific operation will be
+        * set for replication */
+       layout->li_opc = LAYOUT_INTENT_ACCESS;
+
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                       obd->u.cli.cl_max_mds_easize);
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+
+static struct ptlrpc_request *
+mdc_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+       struct ptlrpc_request *req;
+       int rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+
+static int mdc_finish_enqueue(struct obd_export *exp,
+                             struct ptlrpc_request *req,
+                             struct ldlm_enqueue_info *einfo,
+                             struct lookup_intent *it,
+                             struct lustre_handle *lockh,
+                             int rc)
+{
+       struct req_capsule  *pill = &req->rq_pill;
+       struct ldlm_request *lockreq;
+       struct ldlm_reply   *lockrep;
+       struct lustre_intent_data *intent = &it->d.lustre;
+       struct ldlm_lock    *lock;
+       void            *lvb_data = NULL;
+       int               lvb_len = 0;
+       ENTRY;
+
+       LASSERT(rc >= 0);
+       /* Similarly, if we're going to replay this request, we don't want to
+        * actually get a lock, just perform the intent. */
+       if (req->rq_transno || req->rq_replay) {
+               lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ);
+               lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY);
+       }
+
+       if (rc == ELDLM_LOCK_ABORTED) {
+               einfo->ei_mode = 0;
+               memset(lockh, 0, sizeof(*lockh));
+               rc = 0;
+       } else { /* rc = 0 */
+               lock = ldlm_handle2lock(lockh);
+               LASSERT(lock != NULL);
+
+               /* If the server gave us back a different lock mode, we should
+                * fix up our variables. */
+               if (lock->l_req_mode != einfo->ei_mode) {
+                       ldlm_lock_addref(lockh, lock->l_req_mode);
+                       ldlm_lock_decref(lockh, einfo->ei_mode);
+                       einfo->ei_mode = lock->l_req_mode;
+               }
+               LDLM_LOCK_PUT(lock);
+       }
+
+       lockrep = req_capsule_server_get(pill, &RMF_DLM_REP);
+       LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */
+
+       intent->it_disposition = (int)lockrep->lock_policy_res1;
+       intent->it_status = (int)lockrep->lock_policy_res2;
+       intent->it_lock_mode = einfo->ei_mode;
+       intent->it_lock_handle = lockh->cookie;
+       intent->it_data = req;
+
+       /* Technically speaking rq_transno must already be zero if
+        * it_status is in error, so the check is a bit redundant */
+       if ((!req->rq_transno || intent->it_status < 0) && req->rq_replay)
+               mdc_clear_replay_flag(req, intent->it_status);
+
+       /* If we're doing an IT_OPEN which did not result in an actual
+        * successful open, then we need to remove the bit which saves
+        * this request for unconditional replay.
+        *
+        * It's important that we do this first!  Otherwise we might exit the
+        * function without doing so, and try to replay a failed create
+        * (bug 3440) */
+       if (it->it_op & IT_OPEN && req->rq_replay &&
+           (!it_disposition(it, DISP_OPEN_OPEN) || intent->it_status != 0))
+               mdc_clear_replay_flag(req, intent->it_status);
+
+       DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d",
+                 it->it_op, intent->it_disposition, intent->it_status);
+
+       /* We know what to expect, so we do any byte flipping required here */
+       if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) {
+               struct mdt_body *body;
+
+               body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+               if (body == NULL) {
+                       CERROR("Can't swab mdt_body\n");
+                       RETURN(-EPROTO);
+               }
+
+               if (it_disposition(it, DISP_OPEN_OPEN) &&
+                   !it_open_error(DISP_OPEN_OPEN, it)) {
+                       /*
+                        * If this is a successful OPEN request, we need to set
+                        * replay handler and data early, so that if replay
+                        * happens immediately after swabbing below, new reply
+                        * is swabbed by that handler correctly.
+                        */
+                       mdc_set_open_replay_data(NULL, NULL, req);
+               }
+
+               if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) {
+                       void *eadata;
+
+                       mdc_update_max_ea_from_body(exp, body);
+
+                       /*
+                        * The eadata is opaque; just check that it is there.
+                        * Eventually, obd_unpackmd() will check the contents.
+                        */
+                       eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+                                                             body->eadatasize);
+                       if (eadata == NULL)
+                               RETURN(-EPROTO);
+
+                       /* save lvb data and length in case this is for layout
+                        * lock */
+                       lvb_data = eadata;
+                       lvb_len = body->eadatasize;
+
+                       /*
+                        * We save the reply LOV EA in case we have to replay a
+                        * create for recovery.  If we didn't allocate a large
+                        * enough request buffer above we need to reallocate it
+                        * here to hold the actual LOV EA.
+                        *
+                        * Do not save the LOV EA if the request is not going
+                        * to be replayed (for example, on error).
+                        */
+                       if ((it->it_op & IT_OPEN) && req->rq_replay) {
+                               void *lmm;
+                               if (req_capsule_get_size(pill, &RMF_EADATA,
+                                                        RCL_CLIENT) <
+                                   body->eadatasize)
+                                       mdc_realloc_openmsg(req, body);
+                               else
+                                       req_capsule_shrink(pill, &RMF_EADATA,
+                                                          body->eadatasize,
+                                                          RCL_CLIENT);
+
+                               req_capsule_set_size(pill, &RMF_EADATA,
+                                                    RCL_CLIENT,
+                                                    body->eadatasize);
+
+                               lmm = req_capsule_client_get(pill, &RMF_EADATA);
+                               if (lmm)
+                                       memcpy(lmm, eadata, body->eadatasize);
+                       }
+               }
+
+               if (body->valid & OBD_MD_FLRMTPERM) {
+                       struct mdt_remote_perm *perm;
+
+                       LASSERT(client_is_remote(exp));
+                       perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+                                               lustre_swab_mdt_remote_perm);
+                       if (perm == NULL)
+                               RETURN(-EPROTO);
+               }
+               if (body->valid & OBD_MD_FLMDSCAPA) {
+                       struct lustre_capa *capa, *p;
+
+                       capa = req_capsule_server_get(pill, &RMF_CAPA1);
+                       if (capa == NULL)
+                               RETURN(-EPROTO);
+
+                       if (it->it_op & IT_OPEN) {
+                               /* client fid capa will be checked in replay */
+                               p = req_capsule_client_get(pill, &RMF_CAPA2);
+                               LASSERT(p);
+                               *p = *capa;
+                       }
+               }
+               if (body->valid & OBD_MD_FLOSSCAPA) {
+                       struct lustre_capa *capa;
+
+                       capa = req_capsule_server_get(pill, &RMF_CAPA2);
+                       if (capa == NULL)
+                               RETURN(-EPROTO);
+               }
+       } else if (it->it_op & IT_LAYOUT) {
+               /* maybe the lock was granted right away and layout
+                * is packed into RMF_DLM_LVB of req */
+               lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER);
+               if (lvb_len > 0) {
+                       lvb_data = req_capsule_server_sized_get(pill,
+                                                       &RMF_DLM_LVB, lvb_len);
+                       if (lvb_data == NULL)
+                               RETURN(-EPROTO);
+               }
+       }
+
+       /* fill in stripe data for layout lock */
+       lock = ldlm_handle2lock(lockh);
+       if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL) {
+               void *lmm;
+
+               LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d\n",
+                       ldlm_it2str(it->it_op), lvb_len);
+
+               OBD_ALLOC_LARGE(lmm, lvb_len);
+               if (lmm == NULL) {
+                       LDLM_LOCK_PUT(lock);
+                       RETURN(-ENOMEM);
+               }
+               memcpy(lmm, lvb_data, lvb_len);
+
+               /* install lvb_data */
+               lock_res_and_lock(lock);
+               if (lock->l_lvb_data == NULL) {
+                       lock->l_lvb_data = lmm;
+                       lock->l_lvb_len = lvb_len;
+                       lmm = NULL;
+               }
+               unlock_res_and_lock(lock);
+               if (lmm != NULL)
+                       OBD_FREE_LARGE(lmm, lvb_len);
+       }
+       if (lock != NULL)
+               LDLM_LOCK_PUT(lock);
+
+       RETURN(rc);
+}
+
+/* We always reserve enough space in the reply packet for a stripe MD, because
+ * we don't know in advance the file type. */
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+               struct lookup_intent *it, struct md_op_data *op_data,
+               struct lustre_handle *lockh, void *lmm, int lmmsize,
+               struct ptlrpc_request **reqp, __u64 extra_lock_flags)
+{
+       struct obd_device     *obddev = class_exp2obd(exp);
+       struct ptlrpc_request *req = NULL;
+       __u64             flags, saved_flags = extra_lock_flags;
+       int                 rc;
+       struct ldlm_res_id res_id;
+       static const ldlm_policy_data_t lookup_policy =
+                           { .l_inodebits = { MDS_INODELOCK_LOOKUP } };
+       static const ldlm_policy_data_t update_policy =
+                           { .l_inodebits = { MDS_INODELOCK_UPDATE } };
+       static const ldlm_policy_data_t layout_policy =
+                           { .l_inodebits = { MDS_INODELOCK_LAYOUT } };
+       ldlm_policy_data_t const *policy = &lookup_policy;
+       int                 generation, resends = 0;
+       struct ldlm_reply     *lockrep;
+       enum lvb_type          lvb_type = 0;
+       ENTRY;
+
+       LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n",
+                einfo->ei_type);
+
+       fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+
+       if (it) {
+               saved_flags |= LDLM_FL_HAS_INTENT;
+               if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR))
+                       policy = &update_policy;
+               else if (it->it_op & IT_LAYOUT)
+                       policy = &layout_policy;
+       }
+
+       LASSERT(reqp == NULL);
+
+       generation = obddev->u.cli.cl_import->imp_generation;
+resend:
+       flags = saved_flags;
+       if (!it) {
+               /* The only intent-less case right now is FLOCK; in this case
+                  the flock policy is passed in as lmm, but lmmsize is 0 */
+               LASSERT(lmm && lmmsize == 0);
+               LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n",
+                        einfo->ei_type);
+               policy = (ldlm_policy_data_t *)lmm;
+               res_id.name[3] = LDLM_FLOCK;
+       } else if (it->it_op & IT_OPEN) {
+               req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize,
+                                          einfo->ei_cbdata);
+               policy = &update_policy;
+               einfo->ei_cbdata = NULL;
+               lmm = NULL;
+       } else if (it->it_op & IT_UNLINK) {
+               req = mdc_intent_unlink_pack(exp, it, op_data);
+       } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
+               req = mdc_intent_getattr_pack(exp, it, op_data);
+       } else if (it->it_op & IT_READDIR) {
+               req = mdc_enqueue_pack(exp, 0);
+       } else if (it->it_op & IT_LAYOUT) {
+               if (!imp_connect_lvb_type(class_exp2cliimp(exp)))
+                       RETURN(-EOPNOTSUPP);
+
+               req = mdc_intent_layout_pack(exp, it, op_data);
+               lvb_type = LVB_T_LAYOUT;
+       } else {
+               LBUG();
+               RETURN(-EINVAL);
+       }
+
+       if (IS_ERR(req))
+               RETURN(PTR_ERR(req));
+
+       if (req != NULL && it && it->it_op & IT_CREAT)
+               /* ask ptlrpc not to resend on EINPROGRESS since we have our own
+                * retry logic */
+               req->rq_no_retry_einprogress = 1;
+
+       if (resends) {
+               req->rq_generation_set = 1;
+               req->rq_import_generation = generation;
+               req->rq_sent = cfs_time_current_sec() + resends;
+       }
+
+       /* It is important to obtain the rpc_lock first (if applicable), so
+        * that threads serialised on the rpc_lock do not pollute our
+        * RPCs-in-flight counter. We do not do flock request limiting,
+        * though. */
+       if (it) {
+               mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+               rc = mdc_enter_request(&obddev->u.cli);
+               if (rc != 0) {
+                       mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+                       mdc_clear_replay_flag(req, 0);
+                       ptlrpc_req_finished(req);
+                       RETURN(rc);
+               }
+       }
+
+       rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
+                             0, lvb_type, lockh, 0);
+       if (!it) {
+               /* For flock requests we return immediately and let the caller
+                  deal with the rest, since the metadata processing in the
+                  rest of this function makes no sense for flock requests
+                  anyway */
+               RETURN(rc);
+       }
+
+       mdc_exit_request(&obddev->u.cli);
+       mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+
+       if (rc < 0) {
+               CERROR("ldlm_cli_enqueue: %d\n", rc);
+               mdc_clear_replay_flag(req, rc);
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       }
+
+       lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+       LASSERT(lockrep != NULL);
+
+       /* Retry the create infinitely when we get -EINPROGRESS from
+        * server. This is required by the new quota design. */
+       if (it && it->it_op & IT_CREAT &&
+           (int)lockrep->lock_policy_res2 == -EINPROGRESS) {
+               mdc_clear_replay_flag(req, rc);
+               ptlrpc_req_finished(req);
+               resends++;
+
+               CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n",
+                      obddev->obd_name, resends, it->it_op,
+                      PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+               if (generation == obddev->u.cli.cl_import->imp_generation) {
+                       goto resend;
+               } else {
+                       CDEBUG(D_HA, "resend cross eviction\n");
+                       RETURN(-EIO);
+               }
+       }
+
+       rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+       if (rc < 0) {
+               if (lustre_handle_is_used(lockh)) {
+                       ldlm_lock_decref(lockh, einfo->ei_mode);
+                       memset(lockh, 0, sizeof(*lockh));
+               }
+               ptlrpc_req_finished(req);
+       }
+       RETURN(rc);
+}
+
+static int mdc_finish_intent_lock(struct obd_export *exp,
+                                 struct ptlrpc_request *request,
+                                 struct md_op_data *op_data,
+                                 struct lookup_intent *it,
+                                 struct lustre_handle *lockh)
+{
+       struct lustre_handle old_lock;
+       struct mdt_body *mdt_body;
+       struct ldlm_lock *lock;
+       int rc;
+
+
+       LASSERT(request != NULL);
+       LASSERT(request != LP_POISON);
+       LASSERT(request->rq_repmsg != LP_POISON);
+
+       if (!it_disposition(it, DISP_IT_EXECD)) {
+               /* The server failed before it even started executing the
+                * intent, i.e. because it couldn't unpack the request. */
+               LASSERT(it->d.lustre.it_status != 0);
+               RETURN(it->d.lustre.it_status);
+       }
+       rc = it_open_error(DISP_IT_EXECD, it);
+       if (rc)
+               RETURN(rc);
+
+       mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+       LASSERT(mdt_body != NULL);      /* mdc_enqueue checked */
+
+       /* If we were revalidating a fid/name pair, mark the intent in
+        * case we fail and get called again from lookup */
+       if (fid_is_sane(&op_data->op_fid2) &&
+           it->it_create_mode & M_CHECK_STALE &&
+           it->it_op != IT_GETATTR) {
+               it_set_disposition(it, DISP_ENQ_COMPLETE);
+
+               /* Also: did we find the same inode? */
+               /* The server can return one of two fids:
+                * op_fid2 - newly allocated fid - if the file was created.
+                * op_fid3 - existing fid - if the file was only opened.
+                * op_fid3 is saved in lmv_intent_open */
+               if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) &&
+                   (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) {
+                       CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID
+                              "\n", PFID(&op_data->op_fid2),
+                              PFID(&op_data->op_fid3), PFID(&mdt_body->fid1));
+                       RETURN(-ESTALE);
+               }
+       }
+
+       rc = it_open_error(DISP_LOOKUP_EXECD, it);
+       if (rc)
+               RETURN(rc);
+
+       /* Keep requests around for the multiple phases of the call;
+        * this shows that the DISP_XX flags must guarantee we make it
+        * into the call.
+        */
+       if (!it_disposition(it, DISP_ENQ_CREATE_REF) &&
+           it_disposition(it, DISP_OPEN_CREATE) &&
+           !it_open_error(DISP_OPEN_CREATE, it)) {
+               it_set_disposition(it, DISP_ENQ_CREATE_REF);
+               ptlrpc_request_addref(request); /* balanced in ll_create_node */
+       }
+       if (!it_disposition(it, DISP_ENQ_OPEN_REF) &&
+           it_disposition(it, DISP_OPEN_OPEN) &&
+           !it_open_error(DISP_OPEN_OPEN, it)) {
+               it_set_disposition(it, DISP_ENQ_OPEN_REF);
+               ptlrpc_request_addref(request); /* balanced in ll_file_open */
+               /* BUG 11546 - eviction in the middle of open rpc processing */
+               OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout);
+       }
+
+       if (it->it_op & IT_CREAT) {
+               /* XXX this belongs in ll_create_it */
+       } else if (it->it_op == IT_OPEN) {
+               LASSERT(!it_disposition(it, DISP_OPEN_CREATE));
+       } else {
+               LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT));
+       }
+
+       /* If we already have a matching lock, then cancel the new
+        * one.  We have to set the data here instead of in
+        * mdc_enqueue, because we need to use the child's inode as
+        * the l_ast_data to match, and that's not available until
+        * intent_finish has performed the iget(). */
+       lock = ldlm_handle2lock(lockh);
+       if (lock) {
+               ldlm_policy_data_t policy = lock->l_policy_data;
+               LDLM_DEBUG(lock, "matching against this");
+
+               LASSERTF(fid_res_name_eq(&mdt_body->fid1,
+                                        &lock->l_resource->lr_name),
+                        "Lock res_id: %lu/%lu/%lu, fid: %lu/%lu/%lu.\n",
+                        (unsigned long)lock->l_resource->lr_name.name[0],
+                        (unsigned long)lock->l_resource->lr_name.name[1],
+                        (unsigned long)lock->l_resource->lr_name.name[2],
+                        (unsigned long)fid_seq(&mdt_body->fid1),
+                        (unsigned long)fid_oid(&mdt_body->fid1),
+                        (unsigned long)fid_ver(&mdt_body->fid1));
+               LDLM_LOCK_PUT(lock);
+
+               memcpy(&old_lock, lockh, sizeof(*lockh));
+               if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
+                                   LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) {
+                       ldlm_lock_decref_and_cancel(lockh,
+                                                   it->d.lustre.it_lock_mode);
+                       memcpy(lockh, &old_lock, sizeof(old_lock));
+                       it->d.lustre.it_lock_handle = lockh->cookie;
+               }
+       }
+       CDEBUG(D_DENTRY, "D_IT dentry %.*s intent: %s status %d disp %x rc %d\n",
+              op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op),
+              it->d.lustre.it_status, it->d.lustre.it_disposition, rc);
+       RETURN(rc);
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+                       struct lu_fid *fid, __u64 *bits)
+{
+       /* We could just return 1 immediately, but since we should only
+        * be called in revalidate_it if we already have a lock, let's
+        * verify that. */
+       struct ldlm_res_id res_id;
+       struct lustre_handle lockh;
+       ldlm_policy_data_t policy;
+       ldlm_mode_t mode;
+       ENTRY;
+
+       if (it->d.lustre.it_lock_handle) {
+               lockh.cookie = it->d.lustre.it_lock_handle;
+               mode = ldlm_revalidate_lock_handle(&lockh, bits);
+       } else {
+               fid_build_reg_res_name(fid, &res_id);
+               switch (it->it_op) {
+               case IT_GETATTR:
+                       policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
+                       break;
+               case IT_LAYOUT:
+                       policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT;
+                       break;
+               default:
+                       policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP;
+                       break;
+               }
+               mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
+                                      LDLM_FL_BLOCK_GRANTED, &res_id,
+                                      LDLM_IBITS, &policy,
+                                      LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh, 0);
+       }
+
+       if (mode) {
+               it->d.lustre.it_lock_handle = lockh.cookie;
+               it->d.lustre.it_lock_mode = mode;
+       } else {
+               it->d.lustre.it_lock_handle = 0;
+               it->d.lustre.it_lock_mode = 0;
+       }
+
+       RETURN(!!mode);
+}
+
+/*
+ * This long block is all about fixing up the lock and request state
+ * so that it is correct as of the moment _before_ the operation was
+ * applied; that way, the VFS will think that everything is normal and
+ * call Lustre's regular VFS methods.
+ *
+ * If we're performing a creation, that means that unless the creation
+ * failed with EEXIST, we should fake up a negative dentry.
+ *
+ * For everything else, we want the lookup to succeed.
+ *
+ * One additional note: if CREATE or OPEN succeeded, we add an extra
+ * reference to the request because we need to keep it around until
+ * ll_create/ll_open gets called.
+ *
+ * The server will return to us, in it_disposition, an indication of
+ * exactly what d.lustre.it_status refers to.
+ *
+ * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
+ * otherwise if DISP_OPEN_CREATE is set, then it_status is the
+ * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
+ * DISP_LOOKUP_POS will be set, indicating whether the child lookup
+ * was successful.
+ *
+ * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * child lookup.
+ */
+int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int lookup_flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags)
+{
+       struct lustre_handle lockh;
+       int rc = 0;
+       ENTRY;
+       LASSERT(it);
+
+       CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID
+              ", intent: %s flags %#o\n", op_data->op_namelen,
+              op_data->op_name, PFID(&op_data->op_fid2),
+              PFID(&op_data->op_fid1), ldlm_it2str(it->it_op),
+              it->it_flags);
+
+       lockh.cookie = 0;
+       if (fid_is_sane(&op_data->op_fid2) &&
+           (it->it_op & (IT_LOOKUP | IT_GETATTR))) {
+               /* We could just return 1 immediately, but since we should only
+                * be called in revalidate_it if we already have a lock, let's
+                * verify that. */
+               it->d.lustre.it_lock_handle = 0;
+               rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL);
+               /* Only return failure if it was not GETATTR by cfid
+                  (from inode_revalidate) */
+               if (rc || op_data->op_namelen != 0)
+                       RETURN(rc);
+       }
+
+       /* lookup_it may be called only after revalidate_it has run, because
+        * revalidate_it cannot return errors, only zero.  Returning zero causes
+        * this call to lookup, which *can* return an error.
+        *
+        * We only want to execute the request associated with the intent one
+        * time, however, so don't send the request again.  Instead, skip past
+        * this and use the request from revalidate.  In this case, revalidate
+        * never dropped its reference, so the refcounts are all OK */
+       if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
+               struct ldlm_enqueue_info einfo =
+                       { LDLM_IBITS, it_to_lock_mode(it), cb_blocking,
+                         ldlm_completion_ast, NULL, NULL, NULL };
+
+               /* In case the upper layer did not allocate a fid, do it now. */
+               if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) {
+                       rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+                       if (rc < 0) {
+                               CERROR("Can't alloc new fid, rc %d\n", rc);
+                               RETURN(rc);
+                       }
+               }
+               rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh,
+                                lmm, lmmsize, NULL, extra_lock_flags);
+               if (rc < 0)
+                       RETURN(rc);
+       } else if (!fid_is_sane(&op_data->op_fid2) ||
+                  !(it->it_create_mode & M_CHECK_STALE)) {
+               /* DISP_ENQ_COMPLETE set means there is an extra reference on
+                * the request referenced from this intent, saved for a
+                * subsequent lookup. This path is executed when we proceed to
+                * that lookup, so we clear DISP_ENQ_COMPLETE */
+               it_clear_disposition(it, DISP_ENQ_COMPLETE);
+       }
+       *reqp = it->d.lustre.it_data;
+       rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh);
+       RETURN(rc);
+}
+
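+/* Reply interpreter for the asynchronous getattr enqueue issued by
+ * mdc_intent_getattr_async(): finish the ldlm enqueue, complete the intent
+ * lock processing, and then invoke the callback stored in minfo. */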
+static int mdc_intent_getattr_async_interpret(const struct lu_env *env,
+                                             struct ptlrpc_request *req,
+                                             void *args, int rc)
+{
+       struct mdc_getattr_args  *ga = args;
+       struct obd_export       *exp = ga->ga_exp;
+       struct md_enqueue_info   *minfo = ga->ga_minfo;
+       struct ldlm_enqueue_info *einfo = ga->ga_einfo;
+       struct lookup_intent     *it;
+       struct lustre_handle     *lockh;
+       struct obd_device       *obddev;
+       __u64                flags = LDLM_FL_HAS_INTENT;
+       ENTRY;
+
+       it    = &minfo->mi_it;
+       lockh = &minfo->mi_lockh;
+
+       obddev = class_exp2obd(exp);
+
+       mdc_exit_request(&obddev->u.cli);
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE))
+               rc = -ETIMEDOUT;
+
+       rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode,
+                                  &flags, NULL, 0, lockh, rc);
+       if (rc < 0) {
+               CERROR("ldlm_cli_enqueue_fini: %d\n", rc);
+               mdc_clear_replay_flag(req, rc);
+               GOTO(out, rc);
+       }
+
+       rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh);
+       EXIT;
+
+out:
+       OBD_FREE_PTR(einfo);
+       minfo->mi_cb(req, minfo, rc);
+       return 0;
+}
+
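+/* Issue a getattr intent enqueue without waiting for the reply; used by
+ * statahead. The request is handed off to ptlrpcd and completed in
+ * mdc_intent_getattr_async_interpret(). */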
+int mdc_intent_getattr_async(struct obd_export *exp,
+                            struct md_enqueue_info *minfo,
+                            struct ldlm_enqueue_info *einfo)
+{
+       struct md_op_data       *op_data = &minfo->mi_data;
+       struct lookup_intent    *it = &minfo->mi_it;
+       struct ptlrpc_request   *req;
+       struct mdc_getattr_args *ga;
+       struct obd_device       *obddev = class_exp2obd(exp);
+       struct ldlm_res_id       res_id;
+       /* XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed
+        *      for statahead currently. With CMD in the future, these two bits
+        *      may be managed by different MDSes, so this should be adjusted
+        *      then. */
+       ldlm_policy_data_t       policy = {
+                                       .l_inodebits = { MDS_INODELOCK_LOOKUP |
+                                                        MDS_INODELOCK_UPDATE }
+                                };
+       int                   rc = 0;
+       __u64               flags = LDLM_FL_HAS_INTENT;
+       ENTRY;
+
+       CDEBUG(D_DLMTRACE, "name: %.*s in inode "DFID", intent: %s flags %#o\n",
+              op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+              ldlm_it2str(it->it_op), it->it_flags);
+
+       fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+       req = mdc_intent_getattr_pack(exp, it, op_data);
+       if (!req)
+               RETURN(-ENOMEM);
+
+       rc = mdc_enter_request(&obddev->u.cli);
+       if (rc != 0) {
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       }
+
+       rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL,
+                             0, LVB_T_NONE, &minfo->mi_lockh, 1);
+       if (rc < 0) {
+               mdc_exit_request(&obddev->u.cli);
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       }
+
+       CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args));
+       ga = ptlrpc_req_async_args(req);
+       ga->ga_exp = exp;
+       ga->ga_minfo = minfo;
+       ga->ga_einfo = einfo;
+
+       req->rq_interpret_reply = mdc_intent_getattr_async_interpret;
+       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+       RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/drivers/staging/lustre/lustre/mdc/mdc_reint.c
new file mode 100644 (file)
index 0000000..5e25a07
--- /dev/null
@@ -0,0 +1,489 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/kernel.h>
+
+#include <obd_class.h>
+#include "mdc_internal.h"
+#include <lustre_fid.h>
+
+/* mdc_setattr does its own semaphore handling */
+static int mdc_reint(struct ptlrpc_request *request,
+                    struct mdc_rpc_lock *rpc_lock,
+                    int level)
+{
+       int rc;
+
+       request->rq_send_state = level;
+
+       mdc_get_rpc_lock(rpc_lock, NULL);
+       rc = ptlrpc_queue_wait(request);
+       mdc_put_rpc_lock(rpc_lock, NULL);
+       if (rc)
+               CDEBUG(D_INFO, "error in handling %d\n", rc);
+       else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY))
+               rc = -EPROTO;
+       return rc;
+}
+
+/* Find and locally cancel locks matching inode @bits & @mode in the resource
+ * found by @fid. Found locks are added to the @cancels list. Returns the
+ * number of locks added to the @cancels list. */
+int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid,
+                           struct list_head *cancels, ldlm_mode_t mode,
+                           __u64 bits)
+{
+       struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+       ldlm_policy_data_t policy = {{0}};
+       struct ldlm_res_id res_id;
+       struct ldlm_resource *res;
+       int count;
+       ENTRY;
+
+       /* Return, i.e. cancel nothing, only if ELC is supported (flag in
+        * export) but disabled through procfs (flag in NS).
+        *
+        * This distinguishes it from the case when ELC is not supported at
+        * all, in which we still want to cancel locks in advance and just
+        * cancel them locally, without sending any RPC. */
+       if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+               RETURN(0);
+
+       fid_build_reg_res_name(fid, &res_id);
+       res = ldlm_resource_get(exp->exp_obd->obd_namespace,
+                               NULL, &res_id, 0, 0);
+       if (res == NULL)
+               RETURN(0);
+       LDLM_RESOURCE_ADDREF(res);
+       /* Initialize ibits lock policy. */
+       policy.l_inodebits.bits = bits;
+       count = ldlm_cancel_resource_local(res, cancels, &policy,
+                                          mode, 0, 0, NULL);
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+       RETURN(count);
+}
+
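+/* Pack and send an MDS_REINT setattr request. Conflicting UPDATE (and, for
+ * mode/owner changes, LOOKUP) locks on the inode are cancelled first via
+ * early lock cancellation. */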
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+               void *ea, int ealen, void *ea2, int ea2len,
+               struct ptlrpc_request **request, struct md_open_data **mod)
+{
+       LIST_HEAD(cancels);
+       struct ptlrpc_request *req;
+       struct mdc_rpc_lock *rpc_lock;
+       struct obd_device *obd = exp->exp_obd;
+       int count = 0, rc;
+       __u64 bits;
+       ENTRY;
+
+       LASSERT(op_data != NULL);
+
+       bits = MDS_INODELOCK_UPDATE;
+       if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+               bits |= MDS_INODELOCK_LOOKUP;
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)) &&
+           !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX, bits);
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_SETATTR);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       if ((op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) == 0)
+               req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT,
+                                    0);
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen);
+       req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT,
+                            ea2len);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       rpc_lock = obd->u.cli.cl_rpc_lock;
+
+       if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME))
+               CDEBUG(D_INODE, "setting mtime "CFS_TIME_T
+                      ", ctime "CFS_TIME_T"\n",
+                      LTIME_S(op_data->op_attr.ia_mtime),
+                      LTIME_S(op_data->op_attr.ia_ctime));
+       mdc_setattr_pack(req, op_data, ea, ealen, ea2, ea2len);
+
+       ptlrpc_request_set_replen(req);
+       if (mod && (op_data->op_flags & MF_EPOCH_OPEN) &&
+           req->rq_import->imp_replayable) {
+               LASSERT(*mod == NULL);
+
+               *mod = obd_mod_alloc();
+               if (*mod == NULL) {
+                       DEBUG_REQ(D_ERROR, req,
+                                 "Can't allocate md_open_data");
+               } else {
+                       req->rq_replay = 1;
+                       req->rq_cb_data = *mod;
+                       (*mod)->mod_open_req = req;
+                       req->rq_commit_cb = mdc_commit_open;
+                       /**
+                        * Take an extra reference on \var mod; it protects
+                        * \var mod from being freed on eviction (the commit
+                        * callback is called despite the rq_replay flag).
+                        * It will be put in mdc_done_writing().
+                        */
+                       obd_mod_get(*mod);
+               }
+       }
+
+       rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL);
+
+       /* Save the obtained info in the original RPC for the replay case. */
+       if (rc == 0 && (op_data->op_flags & MF_EPOCH_OPEN)) {
+               struct mdt_ioepoch *epoch;
+               struct mdt_body  *body;
+
+               epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               LASSERT(epoch != NULL);
+               LASSERT(body != NULL);
+               epoch->handle = body->handle;
+               epoch->ioepoch = body->ioepoch;
+               req->rq_replay_cb = mdc_replay_open;
+       /** bug 3633: open may already be committed, so an ESTALE
+        *  answer is not an error */
+       } else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) {
+               rc = 0;
+       } else if (rc == -ERESTARTSYS) {
+               rc = 0;
+       }
+       *request = req;
+       if (rc && req->rq_commit_cb) {
+               /* Drop the extra reference on \var mod in the error case. */
+               obd_mod_put(*mod);
+               req->rq_commit_cb(req);
+       }
+       RETURN(rc);
+}
+
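+/* Send an MDS_REINT create request. A fid is allocated for the new object if
+ * the caller did not provide one, and the create is retried on -EINPROGRESS
+ * as long as the import generation does not change. */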
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+              const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+              cfs_cap_t cap_effective, __u64 rdev,
+              struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int level, rc;
+       int count, resends = 0;
+       struct obd_import *import = exp->exp_obd->u.cli.cl_import;
+       int generation = import->imp_generation;
+       LIST_HEAD(cancels);
+       ENTRY;
+
+       /* In case the upper layer did not allocate a fid, do it now. */
+       if (!fid_is_sane(&op_data->op_fid2)) {
+               /*
+                * mdc_fid_alloc() may return errno 1 when switching to a new
+                * sequence; handle this.
+                */
+               rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+               if (rc < 0) {
+                       CERROR("Can't alloc new fid, rc %d\n", rc);
+                       RETURN(rc);
+               }
+       }
+
+rebuild:
+       count = 0;
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_UPDATE);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_CREATE_RMT_ACL);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+                            data && datalen ? datalen : 0);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       /*
+        * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with
+        * tgt, for symlinks or lov MD data.
+        */
+       mdc_create_pack(req, op_data, data, datalen, mode, uid,
+                       gid, cap_effective, rdev);
+
+       ptlrpc_request_set_replen(req);
+
+       /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry
+        * logic here */
+       req->rq_no_retry_einprogress = 1;
+
+       if (resends) {
+               req->rq_generation_set = 1;
+               req->rq_import_generation = generation;
+               req->rq_sent = cfs_time_current_sec() + resends;
+       }
+       level = LUSTRE_IMP_FULL;
+ resend:
+       rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level);
+
+       /* Resend if we were told to. */
+       if (rc == -ERESTARTSYS) {
+               level = LUSTRE_IMP_RECOVER;
+               goto resend;
+       } else if (rc == -EINPROGRESS) {
+               /* Retry the create indefinitely until it succeeds or fails
+                * with another error code. */
+               ptlrpc_req_finished(req);
+               resends++;
+
+               CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n",
+                      exp->exp_obd->obd_name, resends,
+                      PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+               if (generation == import->imp_generation) {
+                       goto rebuild;
+               } else {
+                       CDEBUG(D_HA, "resend cross eviction\n");
+                       RETURN(-EIO);
+               }
+       } else if (rc == 0) {
+               struct mdt_body *body;
+               struct lustre_capa *capa;
+
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               LASSERT(body);
+               if (body->valid & OBD_MD_FLMDSCAPA) {
+                       capa = req_capsule_server_get(&req->rq_pill,
+                                                     &RMF_CAPA1);
+                       if (capa == NULL)
+                               rc = -EPROTO;
+               }
+       }
+
+       *request = req;
+       RETURN(rc);
+}
+
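+/* Send an MDS_REINT unlink request, cancelling conflicting UPDATE locks on
+ * the parent and all inodebits locks on the child beforehand. */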
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+              struct ptlrpc_request **request)
+{
+       LIST_HEAD(cancels);
+       struct obd_device *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req = *request;
+       int count = 0, rc;
+       ENTRY;
+
+       LASSERT(req == NULL);
+
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)) &&
+           !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_UPDATE);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+           (fid_is_sane(&op_data->op_fid3)) &&
+           !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_FULL);
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_UNLINK);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_unlink_pack(req, op_data);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_easize);
+       req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_cookiesize);
+       ptlrpc_request_set_replen(req);
+
+       *request = req;
+
+       rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+       if (rc == -ERESTARTSYS)
+               rc = 0;
+       RETURN(rc);
+}
+
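+/* Send an MDS_REINT link request, cancelling conflicting UPDATE locks on both
+ * fids involved beforehand. */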
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+            struct ptlrpc_request **request)
+{
+       LIST_HEAD(cancels);
+       struct obd_device *obd = exp->exp_obd;
+       struct ptlrpc_request *req;
+       int count = 0, rc;
+       ENTRY;
+
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+           (fid_is_sane(&op_data->op_fid2)))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_UPDATE);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_UPDATE);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_link_pack(req, op_data);
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+       *request = req;
+       if (rc == -ERESTARTSYS)
+               rc = 0;
+
+       RETURN(rc);
+}
+
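+/* Send an MDS_REINT rename request. Conflicting locks are cancelled first:
+ * UPDATE on the two parent fids, LOOKUP on op_fid3 and all inodebits on
+ * op_fid4 (the existing target, if any). */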
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+              const char *old, int oldlen, const char *new, int newlen,
+              struct ptlrpc_request **request)
+{
+       LIST_HEAD(cancels);
+       struct obd_device *obd = exp->exp_obd;
+       struct ptlrpc_request *req;
+       int count = 0, rc;
+       ENTRY;
+
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_UPDATE);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+           (fid_is_sane(&op_data->op_fid2)))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid2,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_UPDATE);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+           (fid_is_sane(&op_data->op_fid3)))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_LOOKUP);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID4) &&
+            (fid_is_sane(&op_data->op_fid4)))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid4,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_FULL);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_RENAME);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1);
+       req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       if (exp_connect_cancelset(exp) && req)
+               ldlm_cli_cancel_list(&cancels, count, req, 0);
+
+       mdc_rename_pack(req, op_data, old, oldlen, new, newlen);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_easize);
+       req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_cookiesize);
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+       *request = req;
+       if (rc == -ERESTARTSYS)
+               rc = 0;
+
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
new file mode 100644 (file)
index 0000000..88454bf
--- /dev/null
@@ -0,0 +1,2752 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+# include <linux/utsname.h>
+
+#include <lustre_acl.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+#include <lustre_log.h>
+
+#include "mdc_internal.h"
+
+#define REQUEST_MINOR 244
+
+struct mdc_renew_capa_args {
+       struct obd_capa *ra_oc;
+       renew_capa_cb_t  ra_cb;
+};
+
+static int mdc_cleanup(struct obd_device *obd);
+
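+/* Pull a capability out of reply buffer @field and wrap it in a newly
+ * allocated client-site obd_capa, returned through @oc. */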
+int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+                   const struct req_msg_field *field, struct obd_capa **oc)
+{
+       struct lustre_capa *capa;
+       struct obd_capa *c;
+       ENTRY;
+
+       /* swabbed already in mdc_enqueue */
+       capa = req_capsule_server_get(&req->rq_pill, field);
+       if (capa == NULL)
+               RETURN(-EPROTO);
+
+       c = alloc_capa(CAPA_SITE_CLIENT);
+       if (IS_ERR(c)) {
+               CDEBUG(D_INFO, "alloc capa failed!\n");
+               RETURN(PTR_ERR(c));
+       } else {
+               c->c_capa = *capa;
+               *oc = c;
+               RETURN(0);
+       }
+}
+
+static inline int mdc_queue_wait(struct ptlrpc_request *req)
+{
+       struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+       int rc;
+
+       /* mdc_enter_request() ensures that this client has no more
+        * than cl_max_rpcs_in_flight RPCs simultaneously in flight
+        * against an MDT. */
+       rc = mdc_enter_request(cli);
+       if (rc != 0)
+               return rc;
+
+       rc = ptlrpc_queue_wait(req);
+       mdc_exit_request(cli);
+
+       return rc;
+}
+
+/* Helper that implements most of mdc_getstatus and signal_completed_replay. */
+/* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */
+static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid,
+                         struct obd_capa **pc, int level, int msg_flags)
+{
+       struct ptlrpc_request *req;
+       struct mdt_body       *body;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_GETSTATUS,
+                                       LUSTRE_MDS_VERSION, MDS_GETSTATUS);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_pack_body(req, NULL, NULL, 0, 0, -1, 0);
+       lustre_msg_add_flags(req->rq_reqmsg, msg_flags);
+       req->rq_send_state = level;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       if (body->valid & OBD_MD_FLMDSCAPA) {
+               rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       *rootfid = body->fid1;
+       CDEBUG(D_NET,
+              "root fid="DFID", last_committed="LPU64"\n",
+              PFID(rootfid),
+              lustre_msg_get_last_committed(req->rq_repmsg));
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+/* This should be mdc_get_info("rootfid") */
+int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid,
+                 struct obd_capa **pc)
+{
+       return send_getstatus(class_exp2cliimp(exp), rootfid, pc,
+                             LUSTRE_IMP_FULL, 0);
+}
+
+/*
+ * This function is known to always say that it will receive 4 buffers from
+ * the server. Even when acl_size and md_size are zero, the RPC header will
+ * contain 4 fields and the RPC itself will contain zero-size fields. This is
+ * because mdt_getattr*() _always_ returns 4 fields, but if the ACL is not
+ * needed and thus zero, it shrinks that field to zero size. The same applies
+ * to md_size. This is the source of problems when the client waits for a
+ * smaller number of fields. The issue will be fixed later when the client
+ * becomes aware of RPC layouts.  --umka
+ */
+static int mdc_getattr_common(struct obd_export *exp,
+                             struct ptlrpc_request *req)
+{
+       struct req_capsule *pill = &req->rq_pill;
+       struct mdt_body    *body;
+       void           *eadata;
+       int              rc;
+       ENTRY;
+
+       /* Request message already built. */
+       rc = ptlrpc_queue_wait(req);
+       if (rc != 0)
+               RETURN(rc);
+
+       /* sanity check for the reply */
+       body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+
+       CDEBUG(D_NET, "mode: %o\n", body->mode);
+
+       if (body->eadatasize != 0) {
+               mdc_update_max_ea_from_body(exp, body);
+
+               eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+                                                     body->eadatasize);
+               if (eadata == NULL)
+                       RETURN(-EPROTO);
+       }
+
+       if (body->valid & OBD_MD_FLRMTPERM) {
+               struct mdt_remote_perm *perm;
+
+               LASSERT(client_is_remote(exp));
+               perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+                                               lustre_swab_mdt_remote_perm);
+               if (perm == NULL)
+                       RETURN(-EPROTO);
+       }
+
+       if (body->valid & OBD_MD_FLMDSCAPA) {
+               struct lustre_capa *capa;
+               capa = req_capsule_server_get(pill, &RMF_CAPA1);
+               if (capa == NULL)
+                       RETURN(-EPROTO);
+       }
+
+       RETURN(0);
+}
+
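+/* Fetch attributes for the fid in @op_data with a synchronous MDS_GETATTR
+ * RPC; the server-side MD reply buffer is sized from op_data->op_mode. */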
+int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data,
+               struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       /* Single MDS without an LMV case */
+       if (op_data->op_flags & MF_GET_MDT_IDX) {
+               op_data->op_mds = 0;
+               RETURN(0);
+       }
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     op_data->op_valid, op_data->op_mode, -1, 0);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            op_data->op_mode);
+       if (op_data->op_valid & OBD_MD_FLRMTPERM) {
+               LASSERT(client_is_remote(exp));
+               req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                                    sizeof(struct mdt_remote_perm));
+       }
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_getattr_common(exp, req);
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+                    struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_GETATTR_NAME);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     op_data->op_valid, op_data->op_mode,
+                     op_data->op_suppgids[0], 0);
+
+       if (op_data->op_name) {
+               char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               LASSERT(strnlen(op_data->op_name, op_data->op_namelen) ==
+                               op_data->op_namelen);
+               memcpy(name, op_data->op_name, op_data->op_namelen);
+       }
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            op_data->op_mode);
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_getattr_common(exp, req);
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+static int mdc_is_subdir(struct obd_export *exp,
+                        const struct lu_fid *pfid,
+                        const struct lu_fid *cfid,
+                        struct ptlrpc_request **request)
+{
+       struct ptlrpc_request  *req;
+       int                  rc;
+
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MDS_IS_SUBDIR, LUSTRE_MDS_VERSION,
+                                       MDS_IS_SUBDIR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_is_subdir_pack(req, pfid, cfid, 0);
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc && rc != -EREMOTE)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
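+/* Common helper for the xattr methods: packs either an MDS_REINT setxattr
+ * record or an MDS_GETXATTR body, copies the xattr name and input buffer
+ * into the request, and waits for the reply (holding the rpc_lock for
+ * reint operations). */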
+static int mdc_xattr_common(struct obd_export *exp,
+                           const struct req_format *fmt,
+                           const struct lu_fid *fid,
+                           struct obd_capa *oc, int opcode, obd_valid valid,
+                           const char *xattr_name, const char *input,
+                           int input_size, int output_size, int flags,
+                           __u32 suppgid, struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int   xattr_namelen = 0;
+       char *tmp;
+       int   rc;
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, oc);
+       if (xattr_name) {
+               xattr_namelen = strlen(xattr_name) + 1;
+               req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                                    xattr_namelen);
+       }
+       if (input_size) {
+               LASSERT(input);
+               req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+                                    input_size);
+       }
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       if (opcode == MDS_REINT) {
+               struct mdt_rec_setxattr *rec;
+
+               CLASSERT(sizeof(struct mdt_rec_setxattr) ==
+                        sizeof(struct mdt_rec_reint));
+               rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+               rec->sx_opcode = REINT_SETXATTR;
+               /* TODO:
+                *  cfs_curproc_fs{u,g}id() should replace
+                *  current->fs{u,g}id for portability.
+                */
+               rec->sx_fsuid  = current_fsuid();
+               rec->sx_fsgid  = current_fsgid();
+               rec->sx_cap    = cfs_curproc_cap_pack();
+               rec->sx_suppgid1 = suppgid;
+               rec->sx_suppgid2 = -1;
+               rec->sx_fid    = *fid;
+               rec->sx_valid  = valid | OBD_MD_FLCTIME;
+               rec->sx_time   = cfs_time_current_sec();
+               rec->sx_size   = output_size;
+               rec->sx_flags  = flags;
+
+               mdc_pack_capa(req, &RMF_CAPA1, oc);
+       } else {
+               mdc_pack_body(req, fid, oc, valid, output_size, suppgid, flags);
+       }
+
+       if (xattr_name) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               memcpy(tmp, xattr_name, xattr_namelen);
+       }
+       if (input_size) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+               memcpy(tmp, input, input_size);
+       }
+
+       if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER))
+               req_capsule_set_size(&req->rq_pill, &RMF_EADATA,
+                                    RCL_SERVER, output_size);
+       ptlrpc_request_set_replen(req);
+
+       /* make rpc */
+       if (opcode == MDS_REINT)
+               mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+       rc = ptlrpc_queue_wait(req);
+
+       if (opcode == MDS_REINT)
+               mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+                struct obd_capa *oc, obd_valid valid, const char *xattr_name,
+                const char *input, int input_size, int output_size,
+                int flags, __u32 suppgid, struct ptlrpc_request **request)
+{
+       return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR,
+                               fid, oc, MDS_REINT, valid, xattr_name,
+                               input, input_size, output_size, flags,
+                               suppgid, request);
+}
+
+int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+                struct obd_capa *oc, obd_valid valid, const char *xattr_name,
+                const char *input, int input_size, int output_size,
+                int flags, struct ptlrpc_request **request)
+{
+       return mdc_xattr_common(exp, &RQF_MDS_GETXATTR,
+                               fid, oc, MDS_GETXATTR, valid, xattr_name,
+                               input, input_size, output_size, flags,
+                               -1, request);
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
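+/* Convert the ACL xattr returned in the reply into a posix_acl, validate it,
+ * and attach it to @md. Returns 0 when the body carries no ACL. */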
+static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md)
+{
+       struct req_capsule     *pill = &req->rq_pill;
+       struct mdt_body *body = md->body;
+       struct posix_acl       *acl;
+       void               *buf;
+       int                  rc;
+       ENTRY;
+
+       if (!body->aclsize)
+               RETURN(0);
+
+       buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->aclsize);
+
+       if (!buf)
+               RETURN(-EPROTO);
+
+       acl = posix_acl_from_xattr(&init_user_ns, buf, body->aclsize);
+       if (IS_ERR(acl)) {
+               rc = PTR_ERR(acl);
+               CERROR("convert xattr to acl: %d\n", rc);
+               RETURN(rc);
+       }
+
+       rc = posix_acl_valid(acl);
+       if (rc) {
+               CERROR("validate acl: %d\n", rc);
+               posix_acl_release(acl);
+               RETURN(rc);
+       }
+
+       md->posix_acl = acl;
+       RETURN(0);
+}
+#else
+#define mdc_unpack_acl(req, md) 0
+#endif
+
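+/* Unpack a reply into \a md: MDT body, LOV/LMV EA data, remote permissions
+ * or POSIX ACL, and MDS/OSS capabilities. On failure, any state already
+ * unpacked into \a md is released before returning. */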
+int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+                     struct obd_export *dt_exp, struct obd_export *md_exp,
+                     struct lustre_md *md)
+{
+       struct req_capsule *pill = &req->rq_pill;
+       int rc;
+       ENTRY;
+
+       LASSERT(md);
+       memset(md, 0, sizeof(*md));
+
+       md->body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+       LASSERT(md->body != NULL);
+
+       if (md->body->valid & OBD_MD_FLEASIZE) {
+               int lmmsize;
+               struct lov_mds_md *lmm;
+
+               if (!S_ISREG(md->body->mode)) {
+                       CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a "
+                              "regular file, but is not\n");
+                       GOTO(out, rc = -EPROTO);
+               }
+
+               if (md->body->eadatasize == 0) {
+                       CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, "
+                              "but eadatasize 0\n");
+                       GOTO(out, rc = -EPROTO);
+               }
+               lmmsize = md->body->eadatasize;
+               lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize);
+               if (!lmm)
+                       GOTO(out, rc = -EPROTO);
+
+               rc = obd_unpackmd(dt_exp, &md->lsm, lmm, lmmsize);
+               if (rc < 0)
+                       GOTO(out, rc);
+
+               if (rc < sizeof(*md->lsm)) {
+                       CDEBUG(D_INFO, "lsm size too small: "
+                              "rc < sizeof (*md->lsm) (%d < %d)\n",
+                              rc, (int)sizeof(*md->lsm));
+                       GOTO(out, rc = -EPROTO);
+               }
+
+       } else if (md->body->valid & OBD_MD_FLDIREA) {
+               int lmvsize;
+               struct lov_mds_md *lmv;
+
+               if (!S_ISDIR(md->body->mode)) {
+                       CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a "
+                              "directory, but is not\n");
+                       GOTO(out, rc = -EPROTO);
+               }
+
+               if (md->body->eadatasize == 0) {
+                       CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, "
+                              "but eadatasize 0\n");
+                       RETURN(-EPROTO);
+               }
+               if (md->body->valid & OBD_MD_MEA) {
+                       lmvsize = md->body->eadatasize;
+                       lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+                                                          lmvsize);
+                       if (!lmv)
+                               GOTO(out, rc = -EPROTO);
+
+                       rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv,
+                                         lmvsize);
+                       if (rc < 0)
+                               GOTO(out, rc);
+
+                       if (rc < sizeof(*md->mea)) {
+                               CDEBUG(D_INFO, "size too small:  "
+                                      "rc < sizeof(*md->mea) (%d < %d)\n",
+                                       rc, (int)sizeof(*md->mea));
+                               GOTO(out, rc = -EPROTO);
+                       }
+               }
+       }
+       rc = 0;
+
+       if (md->body->valid & OBD_MD_FLRMTPERM) {
+               /* remote permission */
+               LASSERT(client_is_remote(exp));
+               md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+                                               lustre_swab_mdt_remote_perm);
+               if (!md->remote_perm)
+                       GOTO(out, rc = -EPROTO);
+       } else if (md->body->valid & OBD_MD_FLACL) {
+               /* For the ACL, it is possible that FLACL is set but aclsize is
+                * zero; only when aclsize != 0 is there an actual segment for
+                * the ACL in the reply buffer.
+                */
+               if (md->body->aclsize) {
+                       rc = mdc_unpack_acl(req, md);
+                       if (rc)
+                               GOTO(out, rc);
+#ifdef CONFIG_FS_POSIX_ACL
+               } else {
+                       md->posix_acl = NULL;
+#endif
+               }
+       }
+       if (md->body->valid & OBD_MD_FLMDSCAPA) {
+               struct obd_capa *oc = NULL;
+
+               rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc);
+               if (rc)
+                       GOTO(out, rc);
+               md->mds_capa = oc;
+       }
+
+       if (md->body->valid & OBD_MD_FLOSSCAPA) {
+               struct obd_capa *oc = NULL;
+
+               rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc);
+               if (rc)
+                       GOTO(out, rc);
+               md->oss_capa = oc;
+       }
+
+       EXIT;
+out:
+       if (rc) {
+               if (md->oss_capa) {
+                       capa_put(md->oss_capa);
+                       md->oss_capa = NULL;
+               }
+               if (md->mds_capa) {
+                       capa_put(md->mds_capa);
+                       md->mds_capa = NULL;
+               }
+#ifdef CONFIG_FS_POSIX_ACL
+               posix_acl_release(md->posix_acl);
+#endif
+               if (md->lsm)
+                       obd_free_memmd(dt_exp, &md->lsm);
+       }
+       return rc;
+}
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+       ENTRY;
+       RETURN(0);
+}
+
+/**
+ * Handles both OPEN and SETATTR RPCs for OPEN-CLOSE and SETATTR-DONE_WRITING
+ * RPC chains.
+ */
+void mdc_replay_open(struct ptlrpc_request *req)
+{
+       struct md_open_data *mod = req->rq_cb_data;
+       struct ptlrpc_request *close_req;
+       struct obd_client_handle *och;
+       struct lustre_handle old;
+       struct mdt_body *body;
+       ENTRY;
+
+       if (mod == NULL) {
+               DEBUG_REQ(D_ERROR, req,
+                         "Can't properly replay without open data.");
+               EXIT;
+               return;
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+
+       och = mod->mod_och;
+       if (och != NULL) {
+               struct lustre_handle *file_fh;
+
+               LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);
+
+               file_fh = &och->och_fh;
+               CDEBUG(D_HA, "updating handle from "LPX64" to "LPX64"\n",
+                      file_fh->cookie, body->handle.cookie);
+               old = *file_fh;
+               *file_fh = body->handle;
+       }
+       close_req = mod->mod_close_req;
+       if (close_req != NULL) {
+               __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg);
+               struct mdt_ioepoch *epoch;
+
+               LASSERT(opc == MDS_CLOSE || opc == MDS_DONE_WRITING);
+               epoch = req_capsule_client_get(&close_req->rq_pill,
+                                              &RMF_MDT_EPOCH);
+               LASSERT(epoch);
+
+               if (och != NULL)
+                       LASSERT(!memcmp(&old, &epoch->handle, sizeof(old)));
+               DEBUG_REQ(D_HA, close_req, "updating close body with new fh");
+               epoch->handle = body->handle;
+       }
+       EXIT;
+}
+
+void mdc_commit_open(struct ptlrpc_request *req)
+{
+       struct md_open_data *mod = req->rq_cb_data;
+       if (mod == NULL)
+               return;
+
+       /**
+        * No need to touch md_open_data::mod_och here; it holds a reference
+        * on \var mod and the two will drop their references to each other
+        * later. \var mod will be freed once md_open_data::mod_och puts its
+        * reference.
+        */
+
+       /**
+        * Do not let the open request disappear, as it may still be needed
+        * for the close RPC to happen (this can only occur on eviction;
+        * otherwise ptlrpc_request::rq_replay does not let mdc_commit_open()
+        * be called). Just mark this RPC as committed to distinguish the two
+        * cases; see mdc_close() for details. The open request reference will
+        * be put along with freeing \var mod.
+        */
+       ptlrpc_request_addref(req);
+       spin_lock(&req->rq_lock);
+       req->rq_committed = 1;
+       spin_unlock(&req->rq_lock);
+       req->rq_cb_data = NULL;
+       obd_mod_put(mod);
+}
+
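+/* Save the open reply fields needed for replay in the open request and
+ * install the mdc_replay_open/mdc_commit_open callbacks. */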
+int mdc_set_open_replay_data(struct obd_export *exp,
+                            struct obd_client_handle *och,
+                            struct ptlrpc_request *open_req)
+{
+       struct md_open_data   *mod;
+       struct mdt_rec_create *rec;
+       struct mdt_body       *body;
+       struct obd_import     *imp = open_req->rq_import;
+       ENTRY;
+
+       if (!open_req->rq_replay)
+               RETURN(0);
+
+       rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT);
+       body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(rec != NULL);
+       /* Incoming message in my byte order (it's been swabbed). */
+       /* Outgoing messages always in my byte order. */
+       LASSERT(body != NULL);
+
+       /* Only set the open replay data if the import is replayable */
+       if (och && imp->imp_replayable) {
+               mod = obd_mod_alloc();
+               if (mod == NULL) {
+                       DEBUG_REQ(D_ERROR, open_req,
+                                 "Can't allocate md_open_data");
+                       RETURN(0);
+               }
+
+               /**
+                * Take a reference on \var mod, to be dropped in mdc_close().
+                * It protects \var mod from being freed on eviction (the
+                * commit callback is called despite the rq_replay flag).
+                * Take another reference for \var och.
+                */
+               obd_mod_get(mod);
+               obd_mod_get(mod);
+
+               spin_lock(&open_req->rq_lock);
+               och->och_mod = mod;
+               mod->mod_och = och;
+               mod->mod_open_req = open_req;
+               open_req->rq_cb_data = mod;
+               open_req->rq_commit_cb = mdc_commit_open;
+               spin_unlock(&open_req->rq_lock);
+       }
+
+       rec->cr_fid2 = body->fid1;
+       rec->cr_ioepoch = body->ioepoch;
+       rec->cr_old_handle.cookie = body->handle.cookie;
+       open_req->rq_replay_cb = mdc_replay_open;
+       if (!fid_is_sane(&body->fid1)) {
+               DEBUG_REQ(D_ERROR, open_req, "Saving replay request with "
+                         "insane fid");
+               LBUG();
+       }
+
+       DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data");
+       RETURN(0);
+}
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+                              struct obd_client_handle *och)
+{
+       struct md_open_data *mod = och->och_mod;
+       ENTRY;
+
+       /**
+        * It is possible not to have \var mod in case of an eviction between
+        * lookup and ll_file_open().
+        */
+       if (mod == NULL)
+               RETURN(0);
+
+       LASSERT(mod != LP_POISON);
+
+       mod->mod_och = NULL;
+       och->och_mod = NULL;
+       obd_mod_put(mod);
+
+       RETURN(0);
+}
+
+/* Prepares the request for the replay by the given reply */
+static void mdc_close_handle_reply(struct ptlrpc_request *req,
+                                  struct md_op_data *op_data, int rc)
+{
+       struct mdt_body  *repbody;
+       struct mdt_ioepoch *epoch;
+
+       if (req && rc == -EAGAIN) {
+               repbody = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+
+               epoch->flags |= MF_SOM_AU;
+               if (repbody->valid & OBD_MD_FLGETATTRLOCK)
+                       op_data->op_flags |= MF_GETATTR_LOCK;
+       }
+}
+
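+/* Send an MDS_CLOSE RPC; the matching open request (if any) is no longer
+ * preserved for replay once the close has been sent. */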
+int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
+             struct md_open_data *mod, struct ptlrpc_request **request)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_CLOSE);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
+        * portal whose threads are not taking any DLM locks and are therefore
+        * always progressing */
+       req->rq_request_portal = MDS_READPAGE_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       /* Ensure that this close's handle is fixed up during replay. */
+       if (likely(mod != NULL)) {
+               LASSERTF(mod->mod_open_req != NULL &&
+                        mod->mod_open_req->rq_type != LI_POISON,
+                        "POISONED open %p!\n", mod->mod_open_req);
+
+               mod->mod_close_req = req;
+
+               DEBUG_REQ(D_HA, mod->mod_open_req, "matched open");
+               /* We no longer want to preserve this open for replay even
+                * though the open was committed. b=3632, b=3633 */
+               spin_lock(&mod->mod_open_req->rq_lock);
+               mod->mod_open_req->rq_replay = 0;
+               spin_unlock(&mod->mod_open_req->rq_lock);
+       } else {
+               CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
+       }
+
+       mdc_close_pack(req, op_data);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_easize);
+       req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_cookiesize);
+
+       ptlrpc_request_set_replen(req);
+
+       mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+       if (req->rq_repmsg == NULL) {
+               CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req,
+                      req->rq_status);
+               if (rc == 0)
+                       rc = req->rq_status ?: -EIO;
+       } else if (rc == 0 || rc == -EAGAIN) {
+               struct mdt_body *body;
+
+               rc = lustre_msg_get_status(req->rq_repmsg);
+               if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+                       DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err "
+                                 "= %d", rc);
+                       if (rc > 0)
+                               rc = -rc;
+               }
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               if (body == NULL)
+                       rc = -EPROTO;
+       } else if (rc == -ESTALE) {
+               /**
+                * ESTALE is an allowed error after bug 3633 if the open was
+                * committed and the server failed before the close was sent.
+                * Check whether mod exists and return no error in that case.
+                */
+               if (mod) {
+                       DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc);
+                       LASSERT(mod->mod_open_req != NULL);
+                       if (mod->mod_open_req->rq_committed)
+                               rc = 0;
+               }
+       }
+
+       if (mod) {
+               if (rc != 0)
+                       mod->mod_close_req = NULL;
+               /* From now on, mod is accessed through open_req only, so the
+                * close request no longer keeps a reference on mod. */
+               obd_mod_put(mod);
+       }
+       *request = req;
+       mdc_close_handle_reply(req, op_data, rc);
+       RETURN(rc);
+}
+
+int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data,
+                    struct md_open_data *mod)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_DONE_WRITING);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_DONE_WRITING);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       if (mod != NULL) {
+               LASSERTF(mod->mod_open_req != NULL &&
+                        mod->mod_open_req->rq_type != LI_POISON,
+                        "POISONED setattr %p!\n", mod->mod_open_req);
+
+               mod->mod_close_req = req;
+               DEBUG_REQ(D_HA, mod->mod_open_req, "matched setattr");
+               /* We no longer want to preserve this setattr for replay even
+                * though the open was committed. b=3632, b=3633 */
+               spin_lock(&mod->mod_open_req->rq_lock);
+               mod->mod_open_req->rq_replay = 0;
+               spin_unlock(&mod->mod_open_req->rq_lock);
+       }
+
+       mdc_close_pack(req, op_data);
+       ptlrpc_request_set_replen(req);
+
+       mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+       if (rc == -ESTALE) {
+               /**
+                * ESTALE is an allowed error after bug 3633 if the open or
+                * setattr was committed and the server failed before the
+                * close was sent. Check whether mod exists and return no
+                * error in that case.
+                */
+               if (mod) {
+                       LASSERT(mod->mod_open_req != NULL);
+                       if (mod->mod_open_req->rq_committed)
+                               rc = 0;
+               }
+       }
+
+       if (mod) {
+               if (rc != 0)
+                       mod->mod_close_req = NULL;
+               /* From now on, mod is accessed through the setattr request
+                * only, so the DONE_WRITING request no longer keeps a
+                * reference on mod. */
+               obd_mod_put(mod);
+       }
+
+       mdc_close_handle_reply(req, op_data, rc);
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+
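+/* Read directory pages from the MDT over a bulk descriptor, resending the
+ * MDS_READPAGE request after a timeout until the resend limit is reached. */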
+int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data,
+                struct page **pages, struct ptlrpc_request **request)
+{
+       struct ptlrpc_request   *req;
+       struct ptlrpc_bulk_desc *desc;
+       int                   i;
+       wait_queue_head_t             waitq;
+       int                   resends = 0;
+       struct l_wait_info       lwi;
+       int                   rc;
+       ENTRY;
+
+       *request = NULL;
+       init_waitqueue_head(&waitq);
+
+restart_bulk:
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       req->rq_request_portal = MDS_READPAGE_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       desc = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK,
+                                   MDS_BULK_PORTAL);
+       if (desc == NULL) {
+               ptlrpc_request_free(req);
+               RETURN(-ENOMEM);
+       }
+
+       /* NB req now owns desc and will free it when it gets freed */
+       for (i = 0; i < op_data->op_npages; i++)
+               ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+       mdc_readdir_pack(req, op_data->op_offset,
+                        PAGE_CACHE_SIZE * op_data->op_npages,
+                        &op_data->op_fid1, op_data->op_capa1);
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc) {
+               ptlrpc_req_finished(req);
+               if (rc != -ETIMEDOUT)
+                       RETURN(rc);
+
+               resends++;
+               if (!client_should_resend(resends, &exp->exp_obd->u.cli)) {
+                       CERROR("too many resend retries, returning error\n");
+                       RETURN(-EIO);
+               }
+               lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, NULL);
+               l_wait_event(waitq, 0, &lwi);
+
+               goto restart_bulk;
+       }
+
+       rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk,
+                                         req->rq_bulk->bd_nob_transferred);
+       if (rc < 0) {
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       }
+
+       if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) {
+               CERROR("Unexpected # bytes transferred: %d (%ld expected)\n",
+                       req->rq_bulk->bd_nob_transferred,
+                       PAGE_CACHE_SIZE * op_data->op_npages);
+               ptlrpc_req_finished(req);
+               RETURN(-EPROTO);
+       }
+
+       *request = req;
+       RETURN(0);
+}
+
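+/* Query filesystem statistics from the MDT with an MDS_STATFS RPC. */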
+static int mdc_statfs(const struct lu_env *env,
+                     struct obd_export *exp, struct obd_statfs *osfs,
+                     __u64 max_age, __u32 flags)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       struct obd_statfs     *msfs;
+       struct obd_import     *imp = NULL;
+       int                 rc;
+       ENTRY;
+
+       /*
+        * The request might also come from lprocfs, so we need to
+        * synchronize with client_disconnect_export() (bug 15684).
+        */
+       down_read(&obd->u.cli.cl_sem);
+       if (obd->u.cli.cl_import)
+               imp = class_import_get(obd->u.cli.cl_import);
+       up_read(&obd->u.cli.cl_sem);
+       if (!imp)
+               RETURN(-ENODEV);
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS,
+                                       LUSTRE_MDS_VERSION, MDS_STATFS);
+       if (req == NULL)
+               GOTO(output, rc = -ENOMEM);
+
+       ptlrpc_request_set_replen(req);
+
+       if (flags & OBD_STATFS_NODELAY) {
+               /* procfs requests should not wait, to avoid deadlock */
+               req->rq_no_resend = 1;
+               req->rq_no_delay = 1;
+       }
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc) {
+               /* check connection error first */
+               if (imp->imp_connect_error)
+                       rc = imp->imp_connect_error;
+               GOTO(out, rc);
+       }
+
+       msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+       if (msfs == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *osfs = *msfs;
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+output:
+       class_import_put(imp);
+       return rc;
+}
+
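+/* Resolve a FID to a path by sending a KEY_FID2PATH get_info request
+ * through obd_get_info(). */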
+static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf)
+{
+       __u32 keylen, vallen;
+       void *key;
+       int rc;
+
+       if (gf->gf_pathlen > PATH_MAX)
+               RETURN(-ENAMETOOLONG);
+       if (gf->gf_pathlen < 2)
+               RETURN(-EOVERFLOW);
+
+       /* Key is KEY_FID2PATH + getinfo_fid2path description */
+       keylen = cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf);
+       OBD_ALLOC(key, keylen);
+       if (key == NULL)
+               RETURN(-ENOMEM);
+       memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH));
+       memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf));
+
+       CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n",
+              PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno);
+
+       if (!fid_is_sane(&gf->gf_fid))
+               GOTO(out, rc = -EINVAL);
+
+       /* Val is struct getinfo_fid2path result plus path */
+       vallen = sizeof(*gf) + gf->gf_pathlen;
+
+       rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf, NULL);
+       if (rc != 0 && rc != -EREMOTE)
+               GOTO(out, rc);
+
+       if (vallen <= sizeof(*gf))
+               GOTO(out, rc = -EPROTO);
+       else if (vallen > sizeof(*gf) + gf->gf_pathlen)
+               GOTO(out, rc = -EOVERFLOW);
+
+       CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n%s\n",
+              PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path);
+
+out:
+       OBD_FREE(key, keylen);
+       return rc;
+}
+
+static int mdc_ioc_hsm_progress(struct obd_export *exp,
+                               struct hsm_progress_kernel *hpk)
+{
+       struct obd_import               *imp = class_exp2cliimp(exp);
+       struct hsm_progress_kernel      *req_hpk;
+       struct ptlrpc_request           *req;
+       int                              rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS,
+                                       LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+       /* Copy hsm_progress struct */
+       req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS);
+       if (req_hpk == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *req_hpk = *hpk;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives)
+{
+       __u32                   *archive_mask;
+       struct ptlrpc_request   *req;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER,
+                                       LUSTRE_MDS_VERSION,
+                                       MDS_HSM_CT_REGISTER);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+       /* Copy the archive mask */
+       archive_mask = req_capsule_client_get(&req->rq_pill,
+                                             &RMF_MDS_HSM_ARCHIVE);
+       if (archive_mask == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *archive_mask = archives;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_current_action(struct obd_export *exp,
+                                     struct md_op_data *op_data)
+{
+       struct hsm_current_action       *hca = op_data->op_data;
+       struct hsm_current_action       *req_hca;
+       struct ptlrpc_request           *req;
+       int                              rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_HSM_ACTION);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       req_hca = req_capsule_server_get(&req->rq_pill,
+                                        &RMF_MDS_HSM_CURRENT_ACTION);
+       if (req_hca == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *hca = *req_hca;
+
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp)
+{
+       struct ptlrpc_request   *req;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER,
+                                       LUSTRE_MDS_VERSION,
+                                       MDS_HSM_CT_UNREGISTER);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_state_get(struct obd_export *exp,
+                                struct md_op_data *op_data)
+{
+       struct hsm_user_state   *hus = op_data->op_data;
+       struct hsm_user_state   *req_hus;
+       struct ptlrpc_request   *req;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_HSM_STATE_GET);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET);
+       if (rc != 0) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE);
+       if (req_hus == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *hus = *req_hus;
+
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_state_set(struct obd_export *exp,
+                                struct md_op_data *op_data)
+{
+       struct hsm_state_set    *hss = op_data->op_data;
+       struct hsm_state_set    *req_hss;
+       struct ptlrpc_request   *req;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_HSM_STATE_SET);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+       /* Copy states */
+       req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET);
+       if (req_hss == NULL)
+               GOTO(out, rc = -EPROTO);
+       *req_hss = *hss;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_request(struct obd_export *exp,
+                              struct hsm_user_request *hur)
+{
+       struct obd_import       *imp = class_exp2cliimp(exp);
+       struct ptlrpc_request   *req;
+       struct hsm_request      *req_hr;
+       struct hsm_user_item    *req_hui;
+       char                    *req_opaque;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT,
+                            hur->hur_request.hr_itemcount
+                            * sizeof(struct hsm_user_item));
+       req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT,
+                            hur->hur_request.hr_data_len);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+       /* Copy hsm_request struct */
+       req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST);
+       if (req_hr == NULL)
+               GOTO(out, rc = -EPROTO);
+       *req_hr = hur->hur_request;
+
+       /* Copy hsm_user_item structs */
+       req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM);
+       if (req_hui == NULL)
+               GOTO(out, rc = -EPROTO);
+       memcpy(req_hui, hur->hur_user_item,
+              hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item));
+
+       /* Copy opaque field */
+       req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA);
+       if (req_opaque == NULL)
+               GOTO(out, rc = -EPROTO);
+       memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
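+/* Initialize a kernel-userspace communication header for a changelog record
+ * of the given length and flags. */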
+static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags)
+{
+       struct kuc_hdr *lh = (struct kuc_hdr *)buf;
+
+       LASSERT(len <= CR_MAXSIZE);
+
+       lh->kuc_magic = KUC_MAGIC;
+       lh->kuc_transport = KUC_TRANSPORT_CHANGELOG;
+       lh->kuc_flags = flags;
+       lh->kuc_msgtype = CL_RECORD;
+       lh->kuc_msglen = len;
+       return lh;
+}
+
+#define D_CHANGELOG 0
+
+struct changelog_show {
+       __u64           cs_startrec;
+       __u32           cs_flags;
+       struct file     *cs_fp;
+       char            *cs_buf;
+       struct obd_device *cs_obd;
+};
+
+static int changelog_show_cb(const struct lu_env *env, struct llog_handle *llh,
+                            struct llog_rec_hdr *hdr, void *data)
+{
+       struct changelog_show *cs = data;
+       struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr;
+       struct kuc_hdr *lh;
+       int len, rc;
+       ENTRY;
+
+       if ((rec->cr_hdr.lrh_type != CHANGELOG_REC) ||
+           (rec->cr.cr_type >= CL_LAST)) {
+               CERROR("Not a changelog rec %d/%d\n", rec->cr_hdr.lrh_type,
+                      rec->cr.cr_type);
+               RETURN(-EINVAL);
+       }
+
+       if (rec->cr.cr_index < cs->cs_startrec) {
+               /* Skip entries earlier than what we are interested in */
+               CDEBUG(D_CHANGELOG, "rec="LPU64" start="LPU64"\n",
+                      rec->cr.cr_index, cs->cs_startrec);
+               RETURN(0);
+       }
+
+       CDEBUG(D_CHANGELOG, LPU64" %02d%-5s "LPU64" 0x%x t="DFID" p="DFID
+               " %.*s\n", rec->cr.cr_index, rec->cr.cr_type,
+               changelog_type2str(rec->cr.cr_type), rec->cr.cr_time,
+               rec->cr.cr_flags & CLF_FLAGMASK,
+               PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid),
+               rec->cr.cr_namelen, changelog_rec_name(&rec->cr));
+
+       len = sizeof(*lh) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen;
+
+       /* Set up the message */
+       lh = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags);
+       memcpy(lh + 1, &rec->cr, len - sizeof(*lh));
+
+       rc = libcfs_kkuc_msg_put(cs->cs_fp, lh);
+       CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len, rc);
+
+       RETURN(rc);
+}
+
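+/* Kernel thread that walks the replicated changelog catalog and streams each
+ * record (and a final CL_EOF marker) to the userspace pipe via kuc. */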
+static int mdc_changelog_send_thread(void *csdata)
+{
+       struct changelog_show *cs = csdata;
+       struct llog_ctxt *ctxt = NULL;
+       struct llog_handle *llh = NULL;
+       struct kuc_hdr *kuch;
+       int rc;
+
+       CDEBUG(D_CHANGELOG, "changelog to fp=%p start "LPU64"\n",
+              cs->cs_fp, cs->cs_startrec);
+
+       OBD_ALLOC(cs->cs_buf, CR_MAXSIZE);
+       if (cs->cs_buf == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       /* Set up the remote catalog handle */
+       ctxt = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT);
+       if (ctxt == NULL)
+               GOTO(out, rc = -ENOENT);
+       rc = llog_open(NULL, ctxt, &llh, NULL, CHANGELOG_CATALOG,
+                      LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("%s: fail to open changelog catalog: rc = %d\n",
+                      cs->cs_obd->obd_name, rc);
+               GOTO(out, rc);
+       }
+       rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT, NULL);
+       if (rc) {
+               CERROR("llog_init_handle failed %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       rc = llog_cat_process(NULL, llh, changelog_show_cb, cs, 0, 0);
+
+       /* Send EOF no matter what our result */
+       if ((kuch = changelog_kuc_hdr(cs->cs_buf, sizeof(*kuch),
+                                     cs->cs_flags))) {
+               kuch->kuc_msgtype = CL_EOF;
+               libcfs_kkuc_msg_put(cs->cs_fp, kuch);
+       }
+
+out:
+       fput(cs->cs_fp);
+       if (llh)
+               llog_cat_close(NULL, llh);
+       if (ctxt)
+               llog_ctxt_put(ctxt);
+       if (cs->cs_buf)
+               OBD_FREE(cs->cs_buf, CR_MAXSIZE);
+       OBD_FREE_PTR(cs);
+       return rc;
+}
+
+static int mdc_ioc_changelog_send(struct obd_device *obd,
+                                 struct ioc_changelog *icc)
+{
+       struct changelog_show *cs;
+       int rc;
+
+       /* Freed in mdc_changelog_send_thread */
+       OBD_ALLOC_PTR(cs);
+       if (!cs)
+               return -ENOMEM;
+
+       cs->cs_obd = obd;
+       cs->cs_startrec = icc->icc_recno;
+       /* matching fput in mdc_changelog_send_thread */
+       cs->cs_fp = fget(icc->icc_id);
+       cs->cs_flags = icc->icc_flags;
+
+       /*
+        * Start a new thread because we must return to the user application
+        * before writing into our pipe.
+        */
+       rc = PTR_ERR(kthread_run(mdc_changelog_send_thread, cs,
+                                "mdc_clg_send_thread"));
+       if (!IS_ERR_VALUE(rc)) {
+               CDEBUG(D_CHANGELOG, "start changelog thread\n");
+               return 0;
+       }
+
+       CERROR("Failed to start changelog thread: %d\n", rc);
+       OBD_FREE_PTR(cs);
+       return rc;
+}
+
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+                               struct lustre_kernelcomm *lk);
+
+static int mdc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+                         struct obd_quotactl *oqctl)
+{
+       struct client_obd       *cli = &exp->exp_obd->u.cli;
+       struct ptlrpc_request   *req;
+       struct obd_quotactl     *body;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MDS_QUOTACHECK, LUSTRE_MDS_VERSION,
+                                       MDS_QUOTACHECK);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       *body = *oqctl;
+
+       ptlrpc_request_set_replen(req);
+
+       /* The next poll will find -ENODATA, which means the quotacheck is
+        * still in progress */
+       cli->cl_qchk_stat = -ENODATA;
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               cli->cl_qchk_stat = rc;
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int mdc_quota_poll_check(struct obd_export *exp,
+                               struct if_quotacheck *qchk)
+{
+       struct client_obd *cli = &exp->exp_obd->u.cli;
+       int rc;
+       ENTRY;
+
+       qchk->obd_uuid = cli->cl_target_uuid;
+       memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME));
+
+       rc = cli->cl_qchk_stat;
+       /* this client is not the one that started the quotacheck */
+       if (rc == CL_NOT_QUOTACHECKED)
+               rc = -EINTR;
+       RETURN(rc);
+}
+
+static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp,
+                       struct obd_quotactl *oqctl)
+{
+       struct ptlrpc_request   *req;
+       struct obd_quotactl     *oqc;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION,
+                                       MDS_QUOTACTL);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       *oqc = *oqctl;
+
+       ptlrpc_request_set_replen(req);
+       ptlrpc_at_set_req_timeout(req);
+       req->rq_no_resend = 1;
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+       if (req->rq_repmsg &&
+           (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) {
+               *oqctl = *oqc;
+       } else if (!rc) {
+               CERROR("Can't unpack obd_quotactl\n");
+               rc = -EPROTO;
+       }
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+static int mdc_ioc_swap_layouts(struct obd_export *exp,
+                               struct md_op_data *op_data)
+{
+       LIST_HEAD(cancels);
+       struct ptlrpc_request   *req;
+       int                      rc, count;
+       struct mdc_swap_layouts *msl, *payload;
+       ENTRY;
+
+       msl = op_data->op_data;
+
+       /* When the MDT receives the MDS_SWAP_LAYOUTS RPC, the first thing it
+        * does is cancel the two layout locks held by this client.
+        * So the client must cancel its layout locks on the two FIDs along
+        * with the request RPC, to avoid extra RPC round trips.
+        */
+       count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels,
+                                       LCK_CR, MDS_INODELOCK_LAYOUT);
+       count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels,
+                                        LCK_CR, MDS_INODELOCK_LAYOUT);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_SWAP_LAYOUTS);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_swap_layouts_pack(req, op_data);
+
+       payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS);
+       LASSERT(payload);
+
+       *payload = *msl;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+       EXIT;
+
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                        void *karg, void *uarg)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct obd_ioctl_data *data = karg;
+       struct obd_import *imp = obd->u.cli.cl_import;
+       struct llog_ctxt *ctxt;
+       int rc;
+       ENTRY;
+
+       if (!try_module_get(THIS_MODULE)) {
+               CERROR("Can't get module. Is it alive?\n");
+               return -EINVAL;
+       }
+       switch (cmd) {
+       case OBD_IOC_CHANGELOG_SEND:
+               rc = mdc_ioc_changelog_send(obd, karg);
+               GOTO(out, rc);
+       case OBD_IOC_CHANGELOG_CLEAR: {
+               struct ioc_changelog *icc = karg;
+               struct changelog_setinfo cs =
+                       {.cs_recno = icc->icc_recno, .cs_id = icc->icc_id};
+               rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR),
+                                       KEY_CHANGELOG_CLEAR, sizeof(cs), &cs,
+                                       NULL);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_FID2PATH:
+               rc = mdc_ioc_fid2path(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_CT_START:
+               rc = mdc_ioc_hsm_ct_start(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_PROGRESS:
+               rc = mdc_ioc_hsm_progress(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_STATE_GET:
+               rc = mdc_ioc_hsm_state_get(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_STATE_SET:
+               rc = mdc_ioc_hsm_state_set(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_ACTION:
+               rc = mdc_ioc_hsm_current_action(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_REQUEST:
+               rc = mdc_ioc_hsm_request(exp, karg);
+               GOTO(out, rc);
+       case OBD_IOC_CLIENT_RECOVER:
+               rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0);
+               if (rc < 0)
+                       GOTO(out, rc);
+               GOTO(out, rc = 0);
+       case IOC_OSC_SET_ACTIVE:
+               rc = ptlrpc_set_import_active(imp, data->ioc_offset);
+               GOTO(out, rc);
+       case OBD_IOC_PARSE: {
+               ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT);
+               rc = class_config_parse_llog(NULL, ctxt, data->ioc_inlbuf1,
+                                            NULL);
+               llog_ctxt_put(ctxt);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_LLOG_INFO:
+       case OBD_IOC_LLOG_PRINT: {
+               ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+               rc = llog_ioctl(NULL, ctxt, cmd, data);
+               llog_ctxt_put(ctxt);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_POLL_QUOTACHECK:
+               rc = mdc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+               GOTO(out, rc);
+       case OBD_IOC_PING_TARGET:
+               rc = ptlrpc_obd_ping(obd);
+               GOTO(out, rc);
+       /*
+        * Normally the IOC_OBD_STATFS and OBD_IOC_QUOTACTL iocontrols are
+        * handled by LMV instead of MDC. But when the cluster is upgraded
+        * from 1.8 there is no LMV layer, so we might be called here.
+        * Eventually this code should be removed.
+        * bz20731, LU-592.
+        */
+       case IOC_OBD_STATFS: {
+               struct obd_statfs stat_buf = {0};
+
+               if (*((__u32 *) data->ioc_inlbuf2) != 0)
+                       GOTO(out, rc = -ENODEV);
+
+               /* copy UUID */
+               if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd),
+                                    min((int) data->ioc_plen2,
+                                        (int) sizeof(struct obd_uuid))))
+                       GOTO(out, rc = -EFAULT);
+
+               rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               0);
+               if (rc != 0)
+                       GOTO(out, rc);
+
+               if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+                                    min((int) data->ioc_plen1,
+                                        (int) sizeof(stat_buf))))
+                       GOTO(out, rc = -EFAULT);
+
+               GOTO(out, rc = 0);
+       }
+       case OBD_IOC_QUOTACTL: {
+               struct if_quotactl *qctl = karg;
+               struct obd_quotactl *oqctl;
+
+               OBD_ALLOC_PTR(oqctl);
+               if (!oqctl)
+                       RETURN(-ENOMEM);
+
+               QCTL_COPY(oqctl, qctl);
+               rc = obd_quotactl(exp, oqctl);
+               if (rc == 0) {
+                       QCTL_COPY(qctl, oqctl);
+                       qctl->qc_valid = QC_MDTIDX;
+                       qctl->obd_uuid = obd->u.cli.cl_target_uuid;
+               }
+               OBD_FREE_PTR(oqctl);
+               break;
+       }
+       case LL_IOC_GET_CONNECT_FLAGS: {
+               if (copy_to_user(uarg,
+                                    exp_connect_flags_ptr(exp),
+                                    sizeof(__u64)))
+                       GOTO(out, rc = -EFAULT);
+               else
+                       GOTO(out, rc = 0);
+       }
+       case LL_IOC_LOV_SWAP_LAYOUTS: {
+               rc = mdc_ioc_swap_layouts(exp, karg);
+               break;
+       }
+       default:
+               CERROR("mdc_ioctl(): unrecognised ioctl %#x\n", cmd);
+               GOTO(out, rc = -ENOTTY);
+       }
+out:
+       module_put(THIS_MODULE);
+
+       return rc;
+}
+
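+/* Send an MDS_GET_INFO RPC for the given key and copy the value returned by
+ * the server into \a val. */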
+int mdc_get_info_rpc(struct obd_export *exp,
+                    obd_count keylen, void *key,
+                    int vallen, void *val)
+{
+       struct obd_import      *imp = class_exp2cliimp(exp);
+       struct ptlrpc_request  *req;
+       char               *tmp;
+       int                  rc = -EINVAL;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY,
+                            RCL_CLIENT, keylen);
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN,
+                            RCL_CLIENT, sizeof(__u32));
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY);
+       memcpy(tmp, key, keylen);
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN);
+       memcpy(tmp, &vallen, sizeof(__u32));
+
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL,
+                            RCL_SERVER, vallen);
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       /* -EREMOTE means the get_info result is partial, and it needs to
+        * continue on another MDT, see fid2path part in lmv_iocontrol */
+       if (rc == 0 || rc == -EREMOTE) {
+               tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL);
+               memcpy(val, tmp, vallen);
+               if (ptlrpc_rep_need_swab(req)) {
+                       if (KEY_IS(KEY_FID2PATH))
+                               lustre_swab_fid2path(val);
+               }
+       }
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+static void lustre_swab_hai(struct hsm_action_item *h)
+{
+       __swab32s(&h->hai_len);
+       __swab32s(&h->hai_action);
+       lustre_swab_lu_fid(&h->hai_fid);
+       lustre_swab_lu_fid(&h->hai_dfid);
+       __swab64s(&h->hai_cookie);
+       __swab64s(&h->hai_extent.offset);
+       __swab64s(&h->hai_extent.length);
+       __swab64s(&h->hai_gid);
+}
+
+static void lustre_swab_hal(struct hsm_action_list *h)
+{
+       struct hsm_action_item  *hai;
+       int                      i;
+
+       __swab32s(&h->hal_version);
+       __swab32s(&h->hal_count);
+       __swab32s(&h->hal_archive_id);
+       __swab64s(&h->hal_flags);
+       hai = hai_zero(h);
+       for (i = 0; i < h->hal_count; i++) {
+               lustre_swab_hai(hai);
+               hai = hai_next(hai);
+       }
+}
+
+static void lustre_swab_kuch(struct kuc_hdr *l)
+{
+       __swab16s(&l->kuc_magic);
+       /* __u8 l->kuc_transport */
+       __swab16s(&l->kuc_msgtype);
+       __swab16s(&l->kuc_msglen);
+}
+
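+/* Start or stop an HSM copytool: add it to (or remove it from) the kuc HSM
+ * group and register/unregister it with the coordinator on the MDT. */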
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+                               struct lustre_kernelcomm *lk)
+{
+       struct obd_import  *imp = class_exp2cliimp(exp);
+       __u32               archive = lk->lk_data;
+       int                 rc = 0;
+
+       if (lk->lk_group != KUC_GRP_HSM) {
+               CERROR("Bad copytool group %d\n", lk->lk_group);
+               return -EINVAL;
+       }
+
+       CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd,
+              lk->lk_uid, lk->lk_group, lk->lk_flags);
+
+       if (lk->lk_flags & LK_FLG_STOP) {
+               rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
+               /* Unregister with the coordinator */
+               if (rc == 0)
+                       rc = mdc_ioc_hsm_ct_unregister(imp);
+       } else {
+               struct file *fp = fget(lk->lk_wfd);
+
+               rc = libcfs_kkuc_group_add(fp, lk->lk_uid, lk->lk_group,
+                                          lk->lk_data);
+               if (rc && fp)
+                       fput(fp);
+               if (rc == 0)
+                       rc = mdc_ioc_hsm_ct_register(imp, archive);
+       }
+
+       return rc;
+}
+
+/**
+ * Send a message to any listening copytools
+ * @param val KUC message (kuc_hdr + hsm_action_list)
+ * @param len total length of message
+ */
+static int mdc_hsm_copytool_send(int len, void *val)
+{
+       struct kuc_hdr          *lh = (struct kuc_hdr *)val;
+       struct hsm_action_list  *hal = (struct hsm_action_list *)(lh + 1);
+       int                      rc;
+       ENTRY;
+
+       if (len < sizeof(*lh) + sizeof(*hal)) {
+               CERROR("Short HSM message %d < %d\n", len,
+                      (int) (sizeof(*lh) + sizeof(*hal)));
+               RETURN(-EPROTO);
+       }
+       if (lh->kuc_magic == __swab16(KUC_MAGIC)) {
+               lustre_swab_kuch(lh);
+               lustre_swab_hal(hal);
+       } else if (lh->kuc_magic != KUC_MAGIC) {
+               CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC);
+               RETURN(-EPROTO);
+       }
+
+       CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d "
+              "on %s\n",
+              lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype,
+              lh->kuc_msglen, hal->hal_count, hal->hal_fsname);
+
+       /* Broadcast to HSM listeners */
+       rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+
+       RETURN(rc);
+}
+
+/**
+ * callback function passed to kuc for re-registering each HSM copytool
+ * running on MDC, after MDT shutdown/recovery.
+ * @param data archive id served by the copytool
+ * @param cb_arg callback argument (obd_import)
+ */
+static int mdc_hsm_ct_reregister(__u32 data, void *cb_arg)
+{
+       struct obd_import       *imp = (struct obd_import *)cb_arg;
+       __u32                    archive = data;
+       int                      rc;
+
+       CDEBUG(D_HA, "recover copytool registration to MDT (archive=%#x)\n",
+              archive);
+       rc = mdc_ioc_hsm_ct_register(imp, archive);
+
+       /* ignore error if the copytool is already registered */
+       return ((rc != 0) && (rc != -EEXIST)) ? rc : 0;
+}
+
+/**
+ * Re-establish all kuc contexts with MDT
+ * after MDT shutdown/recovery.
+ */
+static int mdc_kuc_reregister(struct obd_import *imp)
+{
+       /* re-register HSM agents */
+       return libcfs_kkuc_group_foreach(KUC_GRP_HSM, mdc_hsm_ct_reregister,
+                                        (void *)imp);
+}
+
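+/* Handle the supported set_info keys (read-only, sptlrpc config, flush ctx,
+ * MDS connection, changelog clear, HSM copytool send) for this export. */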
+int mdc_set_info_async(const struct lu_env *env,
+                      struct obd_export *exp,
+                      obd_count keylen, void *key,
+                      obd_count vallen, void *val,
+                      struct ptlrpc_request_set *set)
+{
+       struct obd_import       *imp = class_exp2cliimp(exp);
+       int                      rc;
+       ENTRY;
+
+       if (KEY_IS(KEY_READ_ONLY)) {
+               if (vallen != sizeof(int))
+                       RETURN(-EINVAL);
+
+               spin_lock(&imp->imp_lock);
+               if (*((int *)val)) {
+                       imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY;
+                       imp->imp_connect_data.ocd_connect_flags |=
+                                                       OBD_CONNECT_RDONLY;
+               } else {
+                       imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY;
+                       imp->imp_connect_data.ocd_connect_flags &=
+                                                       ~OBD_CONNECT_RDONLY;
+               }
+               spin_unlock(&imp->imp_lock);
+
+               rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+                                      keylen, key, vallen, val, set);
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_SPTLRPC_CONF)) {
+               sptlrpc_conf_client_adapt(exp->exp_obd);
+               RETURN(0);
+       }
+       if (KEY_IS(KEY_FLUSH_CTX)) {
+               sptlrpc_import_flush_my_ctx(imp);
+               RETURN(0);
+       }
+       if (KEY_IS(KEY_MDS_CONN)) {
+               /* mds-mds import */
+               spin_lock(&imp->imp_lock);
+               imp->imp_server_timeout = 1;
+               spin_unlock(&imp->imp_lock);
+               imp->imp_client->cli_request_portal = MDS_MDS_PORTAL;
+               CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name);
+               RETURN(0);
+       }
+       if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
+               rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+                                      keylen, key, vallen, val, set);
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) {
+               rc = mdc_hsm_copytool_send(vallen, val);
+               RETURN(rc);
+       }
+
+       CERROR("Unknown key %s\n", (char *)key);
+       RETURN(-EINVAL);
+}
+
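+/*
+ * Handle obd_get_info() keys answered locally by the MDC (maximum EA
+ * size, connect data, target count); any other key is forwarded to the
+ * MDS via mdc_get_info_rpc().
+ */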
+int mdc_get_info(const struct lu_env *env, struct obd_export *exp,
+                __u32 keylen, void *key, __u32 *vallen, void *val,
+                struct lov_stripe_md *lsm)
+{
+       int rc = -EINVAL;
+
+       if (KEY_IS(KEY_MAX_EASIZE)) {
+               int mdsize, *max_easize;
+
+               if (*vallen != sizeof(int))
+                       RETURN(-EINVAL);
+               mdsize = *(int*)val;
+               if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize)
+                       exp->exp_obd->u.cli.cl_max_mds_easize = mdsize;
+               max_easize = val;
+               *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize;
+               RETURN(0);
+       } else if (KEY_IS(KEY_CONN_DATA)) {
+               struct obd_import *imp = class_exp2cliimp(exp);
+               struct obd_connect_data *data = val;
+
+               if (*vallen != sizeof(*data))
+                       RETURN(-EINVAL);
+
+               *data = imp->imp_connect_data;
+               RETURN(0);
+       } else if (KEY_IS(KEY_TGT_COUNT)) {
+               *((int *)val) = 1;
+               RETURN(0);
+       }
+
+       rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val);
+
+       RETURN(rc);
+}
+
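+/*
+ * Pin the object identified by @fid on the MDS and fill in @handle.
+ * The pin request is kept referenced in handle->och_mod until the
+ * matching mdc_unpin() releases it.
+ */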
+static int mdc_pin(struct obd_export *exp, const struct lu_fid *fid,
+                  struct obd_capa *oc, struct obd_client_handle *handle,
+                  int flags)
+{
+       struct ptlrpc_request *req;
+       struct mdt_body       *body;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_PIN);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_PIN);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, fid, oc, 0, 0, -1, flags);
+
+       ptlrpc_request_set_replen(req);
+
+       mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       if (rc) {
+               CERROR("Pin failed: %d\n", rc);
+               GOTO(err_out, rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               GOTO(err_out, rc = -EPROTO);
+
+       handle->och_fh = body->handle;
+       handle->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+
+       handle->och_mod = obd_mod_alloc();
+       if (handle->och_mod == NULL) {
+               DEBUG_REQ(D_ERROR, req, "can't allocate md_open_data");
+               GOTO(err_out, rc = -ENOMEM);
+       }
+       handle->och_mod->mod_open_req = req; /* will be dropped by unpin */
+
+       RETURN(0);
+
+err_out:
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int mdc_unpin(struct obd_export *exp, struct obd_client_handle *handle,
+                    int flag)
+{
+       struct ptlrpc_request *req;
+       struct mdt_body       *body;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_UNPIN,
+                                       LUSTRE_MDS_VERSION, MDS_UNPIN);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+       body->handle = handle->och_fh;
+       body->flags = flag;
+
+       ptlrpc_request_set_replen(req);
+
+       mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+       if (rc != 0)
+               CERROR("Unpin failed: %d\n", rc);
+
+       ptlrpc_req_finished(req);
+       ptlrpc_req_finished(handle->och_mod->mod_open_req);
+
+       obd_mod_put(handle->och_mod);
+       RETURN(rc);
+}
+
+int mdc_sync(struct obd_export *exp, const struct lu_fid *fid,
+            struct obd_capa *oc, struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, fid, oc, 0, 0, -1, 0);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
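+/*
+ * React to import state changes: flush the FID sequence when the import
+ * becomes inactive, clean up the lock namespace on invalidation, and
+ * notify observers / re-register HSM copytools when it becomes active.
+ */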
+static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
+                           enum obd_import_event event)
+{
+       int rc = 0;
+
+       LASSERT(imp->imp_obd == obd);
+
+       switch (event) {
+       case IMP_EVENT_DISCON: {
+#if 0
+               /* XXX Pass event up to OBDs stack. used only for FLD now */
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DISCON, NULL);
+#endif
+               break;
+       }
+       case IMP_EVENT_INACTIVE: {
+               struct client_obd *cli = &obd->u.cli;
+               /*
+                * Flush current sequence to make client obtain new one
+                * from server in case of disconnect/reconnect.
+                */
+               if (cli->cl_seq != NULL)
+                       seq_client_flush(cli->cl_seq);
+
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+               break;
+       }
+       case IMP_EVENT_INVALIDATE: {
+               struct ldlm_namespace *ns = obd->obd_namespace;
+
+               ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+
+               break;
+       }
+       case IMP_EVENT_ACTIVE:
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+               /* re-establish kuc registration after reconnecting */
+               if (rc == 0)
+                       rc = mdc_kuc_reregister(imp);
+               break;
+       case IMP_EVENT_OCD:
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+               break;
+       case IMP_EVENT_DEACTIVATE:
+       case IMP_EVENT_ACTIVATE:
+               break;
+       default:
+               CERROR("Unknown import event %x\n", event);
+               LBUG();
+       }
+       RETURN(rc);
+}
+
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+                 struct md_op_data *op_data)
+{
+       struct client_obd *cli = &exp->exp_obd->u.cli;
+       struct lu_client_seq *seq = cli->cl_seq;
+       ENTRY;
+       RETURN(seq_client_alloc_fid(NULL, seq, fid));
+}
+
+struct obd_uuid *mdc_get_uuid(struct obd_export *exp)
+{
+       struct client_obd *cli = &exp->exp_obd->u.cli;
+       return &cli->cl_target_uuid;
+}
+
+/**
+ * Determine whether the lock can be canceled before replaying it during
+ * recovery.  A non-zero value is returned if the lock can be canceled,
+ * zero otherwise.
+ */
+static int mdc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+       if (lock->l_resource->lr_type != LDLM_IBITS)
+               RETURN(0);
+
+       /* FIXME: if we ever get into a situation where there are too many
+        * open files with open locks on a single node, then we really
+        * should replay these open locks to re-obtain them */
+       if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+               RETURN(0);
+
+       RETURN(1);
+}
+
+static int mdc_resource_inode_free(struct ldlm_resource *res)
+{
+       if (res->lr_lvb_inode)
+               res->lr_lvb_inode = NULL;
+
+       return 0;
+}
+
+struct ldlm_valblock_ops inode_lvbo = {
+       .lvbo_free = mdc_resource_inode_free,
+};
+
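+/*
+ * Set up an MDC device: allocate the RPC and close locks, initialize the
+ * generic client obd and its procfs entries, hook up the lock-cancel
+ * policy used during recovery, and initialize the changelog llog context.
+ */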
+static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+       struct client_obd *cli = &obd->u.cli;
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+       if (!cli->cl_rpc_lock)
+               RETURN(-ENOMEM);
+       mdc_init_rpc_lock(cli->cl_rpc_lock);
+
+       ptlrpcd_addref();
+
+       OBD_ALLOC(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+       if (!cli->cl_close_lock)
+               GOTO(err_rpc_lock, rc = -ENOMEM);
+       mdc_init_rpc_lock(cli->cl_close_lock);
+
+       rc = client_obd_setup(obd, cfg);
+       if (rc)
+               GOTO(err_close_lock, rc);
+       lprocfs_mdc_init_vars(&lvars);
+       lprocfs_obd_setup(obd, lvars.obd_vars);
+       sptlrpc_lprocfs_cliobd_attach(obd);
+       ptlrpc_lprocfs_register_obd(obd);
+
+       ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery);
+
+       obd->obd_namespace->ns_lvbo = &inode_lvbo;
+
+       rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+       if (rc) {
+               mdc_cleanup(obd);
+               CERROR("failed to setup llogging subsystems\n");
+       }
+
+       RETURN(rc);
+
+err_close_lock:
+       OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+err_rpc_lock:
+       OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+/* Initialize the default and maximum LOV EA and cookie sizes.  This allows
+ * us to make MDS RPCs with large enough reply buffers to hold the
+ * maximum-sized (= maximum striped) EA and cookie without having to
+ * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
+static int mdc_init_ea_size(struct obd_export *exp, int easize,
+                    int def_easize, int cookiesize)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct client_obd *cli = &obd->u.cli;
+       ENTRY;
+
+       if (cli->cl_max_mds_easize < easize)
+               cli->cl_max_mds_easize = easize;
+
+       if (cli->cl_default_mds_easize < def_easize)
+               cli->cl_default_mds_easize = def_easize;
+
+       if (cli->cl_max_mds_cookiesize < cookiesize)
+               cli->cl_max_mds_cookiesize = cookiesize;
+
+       RETURN(0);
+}
+
+static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY:
+               break;
+       case OBD_CLEANUP_EXPORTS:
+               /* Failsafe, ok if racy */
+               if (obd->obd_type->typ_refcnt <= 1)
+                       libcfs_kkuc_group_rem(0, KUC_GRP_HSM);
+
+               obd_cleanup_client_import(obd);
+               ptlrpc_lprocfs_unregister_obd(obd);
+               lprocfs_obd_cleanup(obd);
+
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+       }
+       RETURN(rc);
+}
+
+static int mdc_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+
+       OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+       OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+
+       ptlrpcd_decref();
+
+       return client_obd_cleanup(obd);
+}
+
+
+static int mdc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                        struct obd_device *tgt, int *index)
+{
+       struct llog_ctxt        *ctxt;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(olg == &obd->obd_olg);
+
+       rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, tgt,
+                       &llog_client_ops);
+       if (rc)
+               RETURN(rc);
+
+       ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT);
+       llog_initiator_connect(ctxt);
+       llog_ctxt_put(ctxt);
+
+       RETURN(0);
+}
+
+static int mdc_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       RETURN(0);
+}
+
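+/*
+ * Apply a configuration record to this MDC; every record is treated as a
+ * proc parameter under the "mdc" prefix.
+ */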
+static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+       struct lustre_cfg *lcfg = buf;
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc = 0;
+
+       lprocfs_mdc_init_vars(&lvars);
+       switch (lcfg->lcfg_command) {
+       default:
+               rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars,
+                                             lcfg, obd);
+               if (rc > 0)
+                       rc = 0;
+               break;
+       }
+       return(rc);
+}
+
+
+/* get remote permission for current user on fid */
+int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid,
+                       struct obd_capa *oc, __u32 suppgid,
+                       struct ptlrpc_request **request)
+{
+       struct ptlrpc_request  *req;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(client_is_remote(exp));
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, fid, oc, OBD_MD_FLRMTPERM, 0, suppgid, 0);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                            sizeof(struct mdt_remote_perm));
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+static int mdc_interpret_renew_capa(const struct lu_env *env,
+                                   struct ptlrpc_request *req, void *args,
+                                   int status)
+{
+       struct mdc_renew_capa_args *ra = args;
+       struct mdt_body *body = NULL;
+       struct lustre_capa *capa;
+       ENTRY;
+
+       if (status)
+               GOTO(out, capa = ERR_PTR(status));
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               GOTO(out, capa = ERR_PTR(-EFAULT));
+
+       if ((body->valid & OBD_MD_FLOSSCAPA) == 0)
+               GOTO(out, capa = ERR_PTR(-ENOENT));
+
+       capa = req_capsule_server_get(&req->rq_pill, &RMF_CAPA2);
+       if (!capa)
+               GOTO(out, capa = ERR_PTR(-EFAULT));
+       EXIT;
+out:
+       ra->ra_cb(ra->ra_oc, capa);
+       return 0;
+}
+
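+/*
+ * Asynchronously renew a capability: send an MDS_GETATTR request through
+ * ptlrpcd and deliver the new capa (or an error) to the caller via the
+ * callback in mdc_interpret_renew_capa().
+ */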
+static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+                         renew_capa_cb_t cb)
+{
+       struct ptlrpc_request *req;
+       struct mdc_renew_capa_args *ra;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_GETATTR,
+                                       LUSTRE_MDS_VERSION, MDS_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       /* NB: OBD_MD_FLOSSCAPA is set here, but that does not necessarily
+        * mean the capa to renew is an OSS capa.
+        */
+       mdc_pack_body(req, &oc->c_capa.lc_fid, oc, OBD_MD_FLOSSCAPA, 0, -1, 0);
+       ptlrpc_request_set_replen(req);
+
+       CLASSERT(sizeof(*ra) <= sizeof(req->rq_async_args));
+       ra = ptlrpc_req_async_args(req);
+       ra->ra_oc = oc;
+       ra->ra_cb = cb;
+       req->rq_interpret_reply = mdc_interpret_renew_capa;
+       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+       RETURN(0);
+}
+
+static int mdc_connect(const struct lu_env *env,
+                      struct obd_export **exp,
+                      struct obd_device *obd, struct obd_uuid *cluuid,
+                      struct obd_connect_data *data,
+                      void *localdata)
+{
+       struct obd_import *imp = obd->u.cli.cl_import;
+
+       /* mds-mds import features */
+       if (data && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) {
+               spin_lock(&imp->imp_lock);
+               imp->imp_server_timeout = 1;
+               spin_unlock(&imp->imp_lock);
+               imp->imp_client->cli_request_portal = MDS_MDS_PORTAL;
+               CDEBUG(D_OTHER, "%s: Set 'mds' portal and timeout\n",
+                      obd->obd_name);
+       }
+
+       return client_connect_import(env, exp, obd, cluuid, data, NULL);
+}
+
+struct obd_ops mdc_obd_ops = {
+       .o_owner          = THIS_MODULE,
+       .o_setup          = mdc_setup,
+       .o_precleanup     = mdc_precleanup,
+       .o_cleanup        = mdc_cleanup,
+       .o_add_conn       = client_import_add_conn,
+       .o_del_conn       = client_import_del_conn,
+       .o_connect        = mdc_connect,
+       .o_disconnect     = client_disconnect_export,
+       .o_iocontrol      = mdc_iocontrol,
+       .o_set_info_async = mdc_set_info_async,
+       .o_statfs         = mdc_statfs,
+       .o_pin            = mdc_pin,
+       .o_unpin          = mdc_unpin,
+       .o_fid_init       = client_fid_init,
+       .o_fid_fini       = client_fid_fini,
+       .o_fid_alloc      = mdc_fid_alloc,
+       .o_import_event   = mdc_import_event,
+       .o_llog_init      = mdc_llog_init,
+       .o_llog_finish    = mdc_llog_finish,
+       .o_get_info       = mdc_get_info,
+       .o_process_config = mdc_process_config,
+       .o_get_uuid       = mdc_get_uuid,
+       .o_quotactl       = mdc_quotactl,
+       .o_quotacheck     = mdc_quotacheck
+};
+
+struct md_ops mdc_md_ops = {
+       .m_getstatus              = mdc_getstatus,
+       .m_null_inode             = mdc_null_inode,
+       .m_find_cbdata            = mdc_find_cbdata,
+       .m_close                  = mdc_close,
+       .m_create                 = mdc_create,
+       .m_done_writing           = mdc_done_writing,
+       .m_enqueue                = mdc_enqueue,
+       .m_getattr                = mdc_getattr,
+       .m_getattr_name           = mdc_getattr_name,
+       .m_intent_lock            = mdc_intent_lock,
+       .m_link                   = mdc_link,
+       .m_is_subdir              = mdc_is_subdir,
+       .m_rename                 = mdc_rename,
+       .m_setattr                = mdc_setattr,
+       .m_setxattr               = mdc_setxattr,
+       .m_getxattr               = mdc_getxattr,
+       .m_sync                   = mdc_sync,
+       .m_readpage               = mdc_readpage,
+       .m_unlink                 = mdc_unlink,
+       .m_cancel_unused          = mdc_cancel_unused,
+       .m_init_ea_size           = mdc_init_ea_size,
+       .m_set_lock_data          = mdc_set_lock_data,
+       .m_lock_match             = mdc_lock_match,
+       .m_get_lustre_md          = mdc_get_lustre_md,
+       .m_free_lustre_md         = mdc_free_lustre_md,
+       .m_set_open_replay_data   = mdc_set_open_replay_data,
+       .m_clear_open_replay_data = mdc_clear_open_replay_data,
+       .m_renew_capa             = mdc_renew_capa,
+       .m_unpack_capa            = mdc_unpack_capa,
+       .m_get_remote_perm        = mdc_get_remote_perm,
+       .m_intent_getattr_async   = mdc_intent_getattr_async,
+       .m_revalidate_lock        = mdc_revalidate_lock
+};
+
+int __init mdc_init(void)
+{
+       int rc;
+       struct lprocfs_static_vars lvars = { 0 };
+       lprocfs_mdc_init_vars(&lvars);
+
+       rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, lvars.module_vars,
+                                LUSTRE_MDC_NAME, NULL);
+       RETURN(rc);
+}
+
+static void /*__exit*/ mdc_exit(void)
+{
+       class_unregister_type(LUSTRE_MDC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Metadata Client");
+MODULE_LICENSE("GPL");
+
+module_init(mdc_init);
+module_exit(mdc_exit);
diff --git a/drivers/staging/lustre/lustre/mgc/Makefile b/drivers/staging/lustre/lustre/mgc/Makefile
new file mode 100644 (file)
index 0000000..2672463
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += mgc.o
+mgc-y := mgc_request.o lproc_mgc.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/mgc/libmgc.c b/drivers/staging/lustre/lustre/mgc/libmgc.c
new file mode 100644 (file)
index 0000000..442146c
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/libmgc.c
+ *
+ * Lustre Management Client
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+/* Minimal MGC for liblustre: only used to read the config log from the MGS
+   at setup time, no updates. */
+
+#define DEBUG_SUBSYSTEM S_MGC
+
+#include <liblustre.h>
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+
+
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       int rc;
+       ENTRY;
+
+       ptlrpcd_addref();
+
+       rc = client_obd_setup(obd, lcfg);
+       if (rc)
+               GOTO(err_decref, rc);
+
+       /* liblustre only supports the null flavor to the MGS */
+       obd->u.cli.cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_NULL;
+
+       rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+       if (rc) {
+               CERROR("failed to setup llogging subsystems\n");
+               GOTO(err_cleanup, rc);
+       }
+
+       RETURN(rc);
+
+err_cleanup:
+       client_obd_cleanup(obd);
+err_decref:
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY:
+       case OBD_CLEANUP_EXPORTS:
+               obd_cleanup_client_import(obd);
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+       }
+       RETURN(rc);
+}
+
+static int mgc_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int rc;
+       ENTRY;
+
+       LASSERT(cli->cl_mgc_vfsmnt == NULL);
+
+       ptlrpcd_decref();
+
+       rc = client_obd_cleanup(obd);
+       RETURN(rc);
+}
+
+static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                        struct obd_device *tgt, int *index)
+{
+       struct llog_ctxt *ctxt;
+       int rc;
+       ENTRY;
+
+       LASSERT(olg == &obd->obd_olg);
+       rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt,
+                       &llog_client_ops);
+       if (rc < 0)
+               RETURN(rc);
+
+       ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT);
+       llog_initiator_connect(ctxt);
+       llog_ctxt_put(ctxt);
+
+       RETURN(rc);
+}
+
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       RETURN(0);
+}
+
+struct obd_ops mgc_obd_ops = {
+       .o_owner        = THIS_MODULE,
+       .o_setup        = mgc_setup,
+       .o_precleanup   = mgc_precleanup,
+       .o_cleanup      = mgc_cleanup,
+       .o_add_conn     = client_import_add_conn,
+       .o_del_conn     = client_import_del_conn,
+       .o_connect      = client_connect_import,
+       .o_disconnect   = client_disconnect_export,
+       .o_llog_init    = mgc_llog_init,
+       .o_llog_finish  = mgc_llog_finish,
+};
+
+int __init mgc_init(void)
+{
+       return class_register_type(&mgc_obd_ops, NULL,
+                                  NULL, LUSTRE_MGC_NAME, NULL);
+}
diff --git a/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c
new file mode 100644 (file)
index 0000000..041f365
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "mgc_internal.h"
+
+#ifdef LPROCFS
+
+static struct lprocfs_vars lprocfs_mgc_obd_vars[] = {
+       { "uuid",            lprocfs_rd_uuid,          0, 0 },
+       { "ping",            0, lprocfs_wr_ping,       0, 0, 0222 },
+       { "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
+       { "mgs_server_uuid", lprocfs_rd_server_uuid,   0, 0 },
+       { "mgs_conn_uuid",   lprocfs_rd_conn_uuid,     0, 0 },
+       { "import",          lprocfs_rd_import,        0, 0 },
+       { "state",           lprocfs_rd_state,         0, 0 },
+       { "ir_state",        lprocfs_mgc_rd_ir_state,  0, 0 },
+       { 0 }
+};
+
+static struct lprocfs_vars lprocfs_mgc_module_vars[] = {
+       { "num_refs",   lprocfs_rd_numrefs,       0, 0 },
+       { 0 }
+};
+
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars = lprocfs_mgc_module_vars;
+       lvars->obd_vars    = lprocfs_mgc_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/drivers/staging/lustre/lustre/mgc/mgc_internal.h
new file mode 100644 (file)
index 0000000..111db90
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MGC_INTERNAL_H
+#define _MGC_INTERNAL_H
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_export.h>
+
+#ifdef LPROCFS
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars);
+int lprocfs_mgc_rd_ir_state(char *page, char **start, off_t off,
+                           int count, int *eof, void *data);
+#else
+static inline void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+static inline int lprocfs_mgc_rd_ir_state(char *page, char **start,
+       off_t off, int count, int *eof, void *data)
+{
+       return 0;
+}
+#endif  /* LPROCFS */
+
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld);
+
+static inline int cld_is_sptlrpc(struct config_llog_data *cld)
+{
+       return cld->cld_type == CONFIG_T_SPTLRPC;
+}
+
+static inline int cld_is_recover(struct config_llog_data *cld)
+{
+       return cld->cld_type == CONFIG_T_RECOVER;
+}
+
+#endif  /* _MGC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c
new file mode 100644 (file)
index 0000000..74232f4
--- /dev/null
@@ -0,0 +1,1863 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/mgc_request.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MGC
+#define D_MGC D_CONFIG /*|D_WARNING*/
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "mgc_internal.h"
+
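+/*
+ * Build an LDLM resource id from a config log name: the first 8 bytes of
+ * the name are packed into name[0], and the log type selects name[1], so
+ * config/sptlrpc logs and recovery logs map to distinct resources.
+ */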
+static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id,
+                         int type)
+{
+       __u64 resname = 0;
+
+       if (len > 8) {
+               CERROR("name too long: %s\n", name);
+               return -EINVAL;
+       }
+       if (len <= 0) {
+               CERROR("missing name: %s\n", name);
+               return -EINVAL;
+       }
+       memcpy(&resname, name, len);
+
+       /* Always use the same endianness for the resid */
+       memset(res_id, 0, sizeof(*res_id));
+       res_id->name[0] = cpu_to_le64(resname);
+       /* XXX: unfortunately, sptlrpc and config llogs share one lock */
+       switch(type) {
+       case CONFIG_T_CONFIG:
+       case CONFIG_T_SPTLRPC:
+               resname = 0;
+               break;
+       case CONFIG_T_RECOVER:
+               resname = type;
+               break;
+       default:
+               LBUG();
+       }
+       res_id->name[1] = cpu_to_le64(resname);
+       CDEBUG(D_MGC, "log %s to resid "LPX64"/"LPX64" (%.8s)\n", name,
+              res_id->name[0], res_id->name[1], (char *)&res_id->name[0]);
+       return 0;
+}
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type)
+{
+       /* fsname is at most 8 chars long and may contain "-",
+        * e.g. "lustre", "SUN-000" */
+       return mgc_name2resid(fsname, strlen(fsname), res_id, type);
+}
+EXPORT_SYMBOL(mgc_fsname2resid);
+
+int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type)
+{
+       char *name_end;
+       int len;
+
+       /* logname consists of "fsname-nodetype".
+        * e.g. "lustre-MDT0001", "SUN-000-client" */
+       name_end = strrchr(logname, '-');
+       LASSERT(name_end);
+       len = name_end - logname;
+       return mgc_name2resid(logname, len, res_id, type);
+}
+
+/********************** config llog list **********************/
+static LIST_HEAD(config_llog_list);
+static DEFINE_SPINLOCK(config_list_lock);
+
+/* Take a reference to a config log */
+static int config_log_get(struct config_llog_data *cld)
+{
+       ENTRY;
+       atomic_inc(&cld->cld_refcount);
+       CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+              atomic_read(&cld->cld_refcount));
+       RETURN(0);
+}
+
+/* Drop a reference to a config log.  When no longer referenced,
+   we can free the config log data */
+static void config_log_put(struct config_llog_data *cld)
+{
+       ENTRY;
+
+       CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+              atomic_read(&cld->cld_refcount));
+       LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+       /* spinlock to make sure there is no item with 0 refcount in the list */
+       if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) {
+               list_del(&cld->cld_list_chain);
+               spin_unlock(&config_list_lock);
+
+               CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname);
+
+               if (cld->cld_recover)
+                       config_log_put(cld->cld_recover);
+               if (cld->cld_sptlrpc)
+                       config_log_put(cld->cld_sptlrpc);
+               if (cld_is_sptlrpc(cld))
+                       sptlrpc_conf_log_stop(cld->cld_logname);
+
+               class_export_put(cld->cld_mgcexp);
+               OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1);
+       }
+
+       EXIT;
+}
+
+/* Find a config log by name */
+static
+struct config_llog_data *config_log_find(char *logname,
+                                        struct config_llog_instance *cfg)
+{
+       struct config_llog_data *cld;
+       struct config_llog_data *found = NULL;
+       void *             instance;
+       ENTRY;
+
+       LASSERT(logname != NULL);
+
+       instance = cfg ? cfg->cfg_instance : NULL;
+       spin_lock(&config_list_lock);
+       list_for_each_entry(cld, &config_llog_list, cld_list_chain) {
+               /* check if the instance matches */
+               if (instance != cld->cld_cfg.cfg_instance)
+                       continue;
+
+               /* instance may be NULL, should check name */
+               if (strcmp(logname, cld->cld_logname) == 0) {
+                       found = cld;
+                       break;
+               }
+       }
+       if (found) {
+               atomic_inc(&found->cld_refcount);
+               LASSERT(found->cld_stopping == 0 || cld_is_sptlrpc(found) == 0);
+       }
+       spin_unlock(&config_list_lock);
+       RETURN(found);
+}
+
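+/*
+ * Allocate and initialize a config_llog_data for @logname, take a
+ * reference on the MGC export, add the log to the global list and, for
+ * sptlrpc logs, process it immediately.
+ */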
+static
+struct config_llog_data *do_config_log_add(struct obd_device *obd,
+                                          char *logname,
+                                          int type,
+                                          struct config_llog_instance *cfg,
+                                          struct super_block *sb)
+{
+       struct config_llog_data *cld;
+       int                   rc;
+       ENTRY;
+
+       CDEBUG(D_MGC, "do adding config log %s:%p\n", logname,
+              cfg ? cfg->cfg_instance : 0);
+
+       OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1);
+       if (!cld)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       strcpy(cld->cld_logname, logname);
+       if (cfg)
+               cld->cld_cfg = *cfg;
+       else
+               cld->cld_cfg.cfg_callback = class_config_llog_handler;
+       mutex_init(&cld->cld_lock);
+       cld->cld_cfg.cfg_last_idx = 0;
+       cld->cld_cfg.cfg_flags = 0;
+       cld->cld_cfg.cfg_sb = sb;
+       cld->cld_type = type;
+       atomic_set(&cld->cld_refcount, 1);
+
+       /* Keep the mgc around until we are done */
+       cld->cld_mgcexp = class_export_get(obd->obd_self_export);
+
+       if (cld_is_sptlrpc(cld)) {
+               sptlrpc_conf_log_start(logname);
+               cld->cld_cfg.cfg_obdname = obd->obd_name;
+       }
+
+       rc = mgc_logname2resid(logname, &cld->cld_resid, type);
+
+       spin_lock(&config_list_lock);
+       list_add(&cld->cld_list_chain, &config_llog_list);
+       spin_unlock(&config_list_lock);
+
+       if (rc) {
+               config_log_put(cld);
+               RETURN(ERR_PTR(rc));
+       }
+
+       if (cld_is_sptlrpc(cld)) {
+               rc = mgc_process_log(obd, cld);
+               if (rc && rc != -ENOENT)
+                       CERROR("failed processing sptlrpc log: %d\n", rc);
+       }
+
+       RETURN(cld);
+}
+
+static struct config_llog_data *config_recover_log_add(struct obd_device *obd,
+       char *fsname,
+       struct config_llog_instance *cfg,
+       struct super_block *sb)
+{
+       struct config_llog_instance lcfg = *cfg;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct config_llog_data *cld;
+       char logname[32];
+
+       if (IS_OST(lsi))
+               return NULL;
+
+       /* for osp-on-ost, see lustre_start_osp() */
+       if (IS_MDT(lsi) && lcfg.cfg_instance)
+               return NULL;
+
+       /* we have to use a different llog for clients and MDTs for CMD,
+        * where only clients are notified if one of the CMD servers restarts */
+       LASSERT(strlen(fsname) < sizeof(logname) / 2);
+       strcpy(logname, fsname);
+       if (IS_SERVER(lsi)) { /* mdt */
+               LASSERT(lcfg.cfg_instance == NULL);
+               lcfg.cfg_instance = sb;
+               strcat(logname, "-mdtir");
+       } else {
+               LASSERT(lcfg.cfg_instance != NULL);
+               strcat(logname, "-cliir");
+       }
+
+       cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb);
+       return cld;
+}
+
+
+/** Add this log to the list of active logs watched by an MGC.
+ * Active means we're watching for updates.
+ * We have one active log per "mount" - client instance or servername.
+ * Each instance may be at a different point in the log.
+ */
+static int config_log_add(struct obd_device *obd, char *logname,
+                         struct config_llog_instance *cfg,
+                         struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct config_llog_data *cld;
+       struct config_llog_data *sptlrpc_cld;
+       char                 seclogname[32];
+       char                *ptr;
+       ENTRY;
+
+       CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance);
+
+       /*
+        * for each regular log, the corresponding sptlrpc log name is
+        * <fsname>-sptlrpc.  Multiple regular logs may share one sptlrpc log.
+        */
+       ptr = strrchr(logname, '-');
+       if (ptr == NULL || ptr - logname > 8) {
+               CERROR("logname %s is too long\n", logname);
+               RETURN(-EINVAL);
+       }
+
+       memcpy(seclogname, logname, ptr - logname);
+       strcpy(seclogname + (ptr - logname), "-sptlrpc");
+
+       sptlrpc_cld = config_log_find(seclogname, NULL);
+       if (sptlrpc_cld == NULL) {
+               sptlrpc_cld = do_config_log_add(obd, seclogname,
+                                               CONFIG_T_SPTLRPC, NULL, NULL);
+               if (IS_ERR(sptlrpc_cld)) {
+                       CERROR("can't create sptlrpc log: %s\n", seclogname);
+                       RETURN(PTR_ERR(sptlrpc_cld));
+               }
+       }
+
+       cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb);
+       if (IS_ERR(cld)) {
+               CERROR("can't create log: %s\n", logname);
+               config_log_put(sptlrpc_cld);
+               RETURN(PTR_ERR(cld));
+       }
+
+       cld->cld_sptlrpc = sptlrpc_cld;
+
+       LASSERT(lsi->lsi_lmd);
+       if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) {
+               struct config_llog_data *recover_cld;
+               *strrchr(seclogname, '-') = 0;
+               recover_cld = config_recover_log_add(obd, seclogname, cfg, sb);
+               if (IS_ERR(recover_cld)) {
+                       config_log_put(cld);
+                       RETURN(PTR_ERR(recover_cld));
+               }
+               cld->cld_recover = recover_cld;
+       }
+
+       RETURN(0);
+}
+
+DEFINE_MUTEX(llog_process_lock);
+
+/** Stop watching for updates on this log.
+ */
+static int config_log_end(char *logname, struct config_llog_instance *cfg)
+{
+       struct config_llog_data *cld;
+       struct config_llog_data *cld_sptlrpc = NULL;
+       struct config_llog_data *cld_recover = NULL;
+       int rc = 0;
+       ENTRY;
+
+       cld = config_log_find(logname, cfg);
+       if (cld == NULL)
+               RETURN(-ENOENT);
+
+       mutex_lock(&cld->cld_lock);
+       /*
+        * if cld_stopping is set, it means we did not start the log and
+        * thus do not own the start reference.  This can happen after a
+        * previous umount: the cld is still hanging around waiting for the
+        * lock cancel, then we remount again, fail part way through, and
+        * call log_end without ever calling start_log.
+        */
+       if (unlikely(cld->cld_stopping)) {
+               mutex_unlock(&cld->cld_lock);
+               /* drop the ref from the find */
+               config_log_put(cld);
+               RETURN(rc);
+       }
+
+       cld->cld_stopping = 1;
+
+       cld_recover = cld->cld_recover;
+       cld->cld_recover = NULL;
+       mutex_unlock(&cld->cld_lock);
+
+       if (cld_recover) {
+               mutex_lock(&cld_recover->cld_lock);
+               cld_recover->cld_stopping = 1;
+               mutex_unlock(&cld_recover->cld_lock);
+               config_log_put(cld_recover);
+       }
+
+       spin_lock(&config_list_lock);
+       cld_sptlrpc = cld->cld_sptlrpc;
+       cld->cld_sptlrpc = NULL;
+       spin_unlock(&config_list_lock);
+
+       if (cld_sptlrpc)
+               config_log_put(cld_sptlrpc);
+
+       /* drop the ref from the find */
+       config_log_put(cld);
+       /* drop the start ref */
+       config_log_put(cld);
+
+       CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
+              rc);
+       RETURN(rc);
+}
+
+int lprocfs_mgc_rd_ir_state(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct obd_device       *obd = data;
+       struct obd_import       *imp = obd->u.cli.cl_import;
+       struct obd_connect_data *ocd = &imp->imp_connect_data;
+       struct config_llog_data *cld;
+       int rc = 0;
+       ENTRY;
+
+       rc = snprintf(page, count, "imperative_recovery: %s\n",
+                     OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED");
+       rc += snprintf(page + rc, count - rc, "client_state:\n");
+
+       spin_lock(&config_list_lock);
+       list_for_each_entry(cld, &config_llog_list, cld_list_chain) {
+               if (cld->cld_recover == NULL)
+                       continue;
+               rc += snprintf(page + rc, count - rc,
+                              "    - { client: %s, nidtbl_version: %u }\n",
+                              cld->cld_logname,
+                              cld->cld_recover->cld_cfg.cfg_last_idx);
+       }
+       spin_unlock(&config_list_lock);
+
+       RETURN(rc);
+}
+
+/* reenqueue any lost locks */
+#define RQ_RUNNING 0x1
+#define RQ_NOW     0x2
+#define RQ_LATER   0x4
+#define RQ_STOP    0x8
+static int                 rq_state = 0;
+static wait_queue_head_t           rq_waitq;
+static DECLARE_COMPLETION(rq_exit);
+
+static void do_requeue(struct config_llog_data *cld)
+{
+       ENTRY;
+       LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+       /* Do not run mgc_process_log on a disconnected export or an
+          export which is being disconnected. Take the client
+          semaphore to make the check non-racy. */
+       down_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem);
+       if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) {
+               CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname);
+               mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
+       } else {
+               CDEBUG(D_MGC, "disconnecting, won't update log %s\n",
+                      cld->cld_logname);
+       }
+       up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem);
+
+       EXIT;
+}
+
+/* This timeout represents how many seconds the MGC should wait before
+ * re-enqueueing the config and recovery locks to the MGS.  We randomize it
+ * so that clients do not all flood the MGS at once.
+ */
+#define MGC_TIMEOUT_MIN_SECONDS   5
+#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */
+
+static int mgc_requeue_thread(void *data)
+{
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_MGC, "Starting requeue thread\n");
+
+       /* Keep trying failed locks periodically */
+       spin_lock(&config_list_lock);
+       rq_state |= RQ_RUNNING;
+       while (1) {
+               struct l_wait_info lwi;
+               struct config_llog_data *cld, *cld_prev;
+               int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC;
+               int stopped = !!(rq_state & RQ_STOP);
+               int to;
+
+               /* Any new or requeued lost locks will change the state */
+               rq_state &= ~(RQ_NOW | RQ_LATER);
+               spin_unlock(&config_list_lock);
+
+               /* Always wait a few seconds to allow the server that
+                  caused the lock revocation to finish its setup, plus a
+                  random delay so everyone doesn't reconnect at once. */
+               to = MGC_TIMEOUT_MIN_SECONDS * HZ;
+               to += rand * HZ / 100; /* rand is centi-seconds */
+               lwi = LWI_TIMEOUT(to, NULL, NULL);
+               l_wait_event(rq_waitq, rq_state & RQ_STOP, &lwi);
+
+               /*
+                * Iterate over the list.  For each cld, first process the
+                * sptlrpc cld it depends on (if any) and then the cld itself.
+                *
+                * Every item in the list is guaranteed to hold a
+                * reference > 0; and if cld_lostlock is set, at least one
+                * reference is held by the previous enqueue.
+                */
+               cld_prev = NULL;
+
+               spin_lock(&config_list_lock);
+               list_for_each_entry(cld, &config_llog_list,
+                                       cld_list_chain) {
+                       if (!cld->cld_lostlock)
+                               continue;
+
+                       spin_unlock(&config_list_lock);
+
+                       LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+                       /* Whether we enqueued again or not in mgc_process_log,
+                        * we're done with the ref from the old enqueue */
+                       if (cld_prev)
+                               config_log_put(cld_prev);
+                       cld_prev = cld;
+
+                       cld->cld_lostlock = 0;
+                       if (likely(!stopped))
+                               do_requeue(cld);
+
+                       spin_lock(&config_list_lock);
+               }
+               spin_unlock(&config_list_lock);
+               if (cld_prev)
+                       config_log_put(cld_prev);
+
+               /* break only after scanning the list so that we can drop
+                * the refcount of the clds that lost their lock */
+               if (unlikely(stopped)) {
+                       spin_lock(&config_list_lock);
+                       break;
+               }
+
+               /* Wait a bit to see if anyone else needs a requeue */
+               lwi = (struct l_wait_info) { 0 };
+               l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP),
+                            &lwi);
+               spin_lock(&config_list_lock);
+       }
+       /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */
+       rq_state &= ~RQ_RUNNING;
+       spin_unlock(&config_list_lock);
+
+       complete(&rq_exit);
+
+       CDEBUG(D_MGC, "Ending requeue thread\n");
+       RETURN(rc);
+}
+
+/* Add a cld to the list to requeue.  Start the requeue thread if needed.
+   We are responsible for dropping the config log reference from here on out. */
+static void mgc_requeue_add(struct config_llog_data *cld)
+{
+       ENTRY;
+
+       CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n",
+              cld->cld_logname, atomic_read(&cld->cld_refcount),
+              cld->cld_stopping, rq_state);
+       LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+       mutex_lock(&cld->cld_lock);
+       if (cld->cld_stopping || cld->cld_lostlock) {
+               mutex_unlock(&cld->cld_lock);
+               RETURN_EXIT;
+       }
+       /* this refcount will be released in mgc_requeue_thread. */
+       config_log_get(cld);
+       cld->cld_lostlock = 1;
+       mutex_unlock(&cld->cld_lock);
+
+       /* Hold lock for rq_state */
+       spin_lock(&config_list_lock);
+       if (rq_state & RQ_STOP) {
+               spin_unlock(&config_list_lock);
+               cld->cld_lostlock = 0;
+               config_log_put(cld);
+       } else {
+               rq_state |= RQ_NOW;
+               spin_unlock(&config_list_lock);
+               wake_up(&rq_waitq);
+       }
+       EXIT;
+}
+
+/********************** class fns **********************/
+
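+/*
+ * Attach the MGC to a locally mounted server filesystem so that config
+ * logs can be read from the MOUNT_CONFIGS_DIR directory.  cl_mgc_sem is
+ * held until mgc_fs_cleanup() so only one filesystem is set up at a time.
+ */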
+static int mgc_fs_setup(struct obd_device *obd, struct super_block *sb,
+                       struct vfsmount *mnt)
+{
+       struct lvfs_run_ctxt saved;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct client_obd *cli = &obd->u.cli;
+       struct dentry *dentry;
+       char *label;
+       int err = 0;
+       ENTRY;
+
+       LASSERT(lsi);
+       LASSERT(lsi->lsi_srv_mnt == mnt);
+
+       /* The mgc fs exclusion semaphore: only one fs can be set up at a time. */
+       down(&cli->cl_mgc_sem);
+
+       cfs_cleanup_group_info();
+
+       obd->obd_fsops = fsfilt_get_ops(lsi->lsi_fstype);
+       if (IS_ERR(obd->obd_fsops)) {
+               up(&cli->cl_mgc_sem);
+               CERROR("%s: No fstype %s: rc = %ld\n", obd->obd_name,
+                      lsi->lsi_fstype, PTR_ERR(obd->obd_fsops));
+               RETURN(PTR_ERR(obd->obd_fsops));
+       }
+
+       cli->cl_mgc_vfsmnt = mnt;
+       err = fsfilt_setup(obd, mnt->mnt_sb);
+       if (err)
+               GOTO(err_ops, err);
+
+       OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+       obd->obd_lvfs_ctxt.pwdmnt = mnt;
+       obd->obd_lvfs_ctxt.pwd = mnt->mnt_root;
+       obd->obd_lvfs_ctxt.fs = get_ds();
+
+       push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+       dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+                                  strlen(MOUNT_CONFIGS_DIR));
+       pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+       if (IS_ERR(dentry)) {
+               err = PTR_ERR(dentry);
+               CERROR("cannot lookup %s directory: rc = %d\n",
+                      MOUNT_CONFIGS_DIR, err);
+               GOTO(err_ops, err);
+       }
+       cli->cl_mgc_configs_dir = dentry;
+
+       /* We take an obd ref to ensure that we can't get to mgc_cleanup
+          without calling mgc_fs_cleanup first. */
+       class_incref(obd, "mgc_fs", obd);
+
+       label = fsfilt_get_label(obd, mnt->mnt_sb);
+       if (label)
+               CDEBUG(D_MGC, "MGC using disk labelled=%s\n", label);
+
+       /* We keep the cl_mgc_sem until mgc_fs_cleanup */
+       RETURN(0);
+
+err_ops:
+       fsfilt_put_ops(obd->obd_fsops);
+       obd->obd_fsops = NULL;
+       cli->cl_mgc_vfsmnt = NULL;
+       up(&cli->cl_mgc_sem);
+       RETURN(err);
+}
+
+static int mgc_fs_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(cli->cl_mgc_vfsmnt != NULL);
+
+       if (cli->cl_mgc_configs_dir != NULL) {
+               struct lvfs_run_ctxt saved;
+               push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+               l_dput(cli->cl_mgc_configs_dir);
+               cli->cl_mgc_configs_dir = NULL;
+               pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+               class_decref(obd, "mgc_fs", obd);
+       }
+
+       cli->cl_mgc_vfsmnt = NULL;
+       if (obd->obd_fsops)
+               fsfilt_put_ops(obd->obd_fsops);
+
+       up(&cli->cl_mgc_sem);
+
+       RETURN(rc);
+}
+
+static atomic_t mgc_count = ATOMIC_INIT(0);
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY:
+               break;
+       case OBD_CLEANUP_EXPORTS:
+               if (atomic_dec_and_test(&mgc_count)) {
+                       int running;
+                       /* stop requeue thread */
+                       spin_lock(&config_list_lock);
+                       running = rq_state & RQ_RUNNING;
+                       if (running)
+                               rq_state |= RQ_STOP;
+                       spin_unlock(&config_list_lock);
+                       if (running) {
+                               wake_up(&rq_waitq);
+                               wait_for_completion(&rq_exit);
+                       }
+               }
+               obd_cleanup_client_import(obd);
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+       }
+       RETURN(rc);
+}
+
+static int mgc_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int rc;
+       ENTRY;
+
+       LASSERT(cli->cl_mgc_vfsmnt == NULL);
+
+       /* COMPAT_146 - old config logs may have added profiles we don't
+          know about */
+       if (obd->obd_type->typ_refcnt <= 1)
+               /* Only for the last mgc */
+               class_del_profiles();
+
+       lprocfs_obd_cleanup(obd);
+       ptlrpcd_decref();
+
+       rc = client_obd_cleanup(obd);
+       RETURN(rc);
+}
+
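+/*
+ * Set up an MGC device: initialize the generic client obd, the config
+ * llog context and procfs entries, and start the shared requeue thread
+ * when the first MGC instance is created.
+ */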
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars;
+       int rc;
+       ENTRY;
+
+       ptlrpcd_addref();
+
+       rc = client_obd_setup(obd, lcfg);
+       if (rc)
+               GOTO(err_decref, rc);
+
+       rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+       if (rc) {
+               CERROR("failed to setup llogging subsystems\n");
+               GOTO(err_cleanup, rc);
+       }
+
+       lprocfs_mgc_init_vars(&lvars);
+       lprocfs_obd_setup(obd, lvars.obd_vars);
+       sptlrpc_lprocfs_cliobd_attach(obd);
+
+       if (atomic_inc_return(&mgc_count) == 1) {
+               rq_state = 0;
+               init_waitqueue_head(&rq_waitq);
+
+               /* start requeue thread */
+               rc = PTR_ERR(kthread_run(mgc_requeue_thread, NULL,
+                                            "ll_cfg_requeue"));
+               if (IS_ERR_VALUE(rc)) {
+                       CERROR("%s: Cannot start requeue thread (%d), "
+                              "no more log updates!\n",
+                              obd->obd_name, rc);
+                       GOTO(err_cleanup, rc);
+               }
+               /* rc holds the task pointer value, not an error; reset it. */
+               rc = 0;
+       }
+
+       RETURN(rc);
+
+err_cleanup:
+       client_obd_cleanup(obd);
+err_decref:
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+/* based on ll_mdc_blocking_ast */
+static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                           void *data, int flag)
+{
+       struct lustre_handle lockh;
+       struct config_llog_data *cld = (struct config_llog_data *)data;
+       int rc = 0;
+       ENTRY;
+
+       switch (flag) {
+       case LDLM_CB_BLOCKING:
+               /* mgs wants the lock, give it up... */
+               LDLM_DEBUG(lock, "MGC blocking CB");
+               ldlm_lock2handle(lock, &lockh);
+               rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+               break;
+       case LDLM_CB_CANCELING:
+               /* We've given up the lock, prepare ourselves to update. */
+               LDLM_DEBUG(lock, "MGC cancel CB");
+
+               CDEBUG(D_MGC, "Lock res "LPX64" (%.8s)\n",
+                      lock->l_resource->lr_name.name[0],
+                      (char *)&lock->l_resource->lr_name.name[0]);
+
+               if (!cld) {
+                       CDEBUG(D_INFO, "missing data, won't requeue\n");
+                       break;
+               }
+
+               /* held at mgc_process_log(). */
+               LASSERT(atomic_read(&cld->cld_refcount) > 0);
+               /* Are we done with this log? */
+               if (cld->cld_stopping) {
+                       CDEBUG(D_MGC, "log %s: stopping, won't requeue\n",
+                              cld->cld_logname);
+                       config_log_put(cld);
+                       break;
+               }
+               /* Make sure not to re-enqueue when the mgc is stopping
+                  (we get called from client_disconnect_export) */
+               if (!lock->l_conn_export ||
+                   !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) {
+                       CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n",
+                              cld->cld_logname);
+                       config_log_put(cld);
+                       break;
+               }
+
+               /* Re-enqueue now */
+               mgc_requeue_add(cld);
+               config_log_put(cld);
+               break;
+       default:
+               LBUG();
+       }
+
+       RETURN(rc);
+}
+
+/* Not sure where this should go... */
+#define  MGC_ENQUEUE_LIMIT 50
+#define  MGC_TARGET_REG_LIMIT 10
+#define  MGC_SEND_PARAM_LIMIT 10
+
+/* Send parameter to MGS */
+static int mgc_set_mgs_param(struct obd_export *exp,
+                            struct mgs_send_param *msp)
+{
+       struct ptlrpc_request *req;
+       struct mgs_send_param *req_msp, *rep_msp;
+       int rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION,
+                                       MGS_SET_INFO);
+       if (!req)
+               RETURN(-ENOMEM);
+
+       req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM);
+       if (!req_msp) {
+               ptlrpc_req_finished(req);
+               RETURN(-ENOMEM);
+       }
+
+       memcpy(req_msp, msp, sizeof(*req_msp));
+       ptlrpc_request_set_replen(req);
+
+       /* Limit how long we will wait for the request to complete */
+       req->rq_delay_limit = MGC_SEND_PARAM_LIMIT;
+       rc = ptlrpc_queue_wait(req);
+       if (!rc) {
+               rep_msp = req_capsule_server_get(&req->rq_pill, &RMF_MGS_SEND_PARAM);
+               memcpy(msp, rep_msp, sizeof(*rep_msp));
+       }
+
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+/* Take a config lock so we can get cancel notifications */
+static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
+                      __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                      __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb,
+                      void *data, __u32 lvb_len, void *lvb_swabber,
+                      struct lustre_handle *lockh)
+{
+       struct config_llog_data *cld = (struct config_llog_data *)data;
+       struct ldlm_enqueue_info einfo = { type, mode, mgc_blocking_ast,
+                        ldlm_completion_ast, NULL, NULL, NULL };
+       struct ptlrpc_request *req;
+       int short_limit = cld_is_sptlrpc(cld);
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MGC, "Enqueue for %s (res "LPX64")\n", cld->cld_logname,
+              cld->cld_resid.name[0]);
+
+       /* We need a callback for every lockholder, so don't try to
+          ldlm_lock_match (see rev 1.1.2.11.2.47) */
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION,
+                                       LDLM_ENQUEUE);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0);
+       ptlrpc_request_set_replen(req);
+
+       /* check if this is server or client */
+       if (cld->cld_cfg.cfg_sb) {
+               struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb);
+               if (lsi && IS_SERVER(lsi))
+                       short_limit = 1;
+       }
+       /* Limit how long we will wait for the enqueue to complete */
+       req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT;
+       rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags,
+                             NULL, 0, LVB_T_NONE, lockh, 0);
+       /* A failed enqueue should still call the mgc_blocking_ast,
+          where it will be requeued if needed ("grant failed"). */
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int mgc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+                     __u32 mode, struct lustre_handle *lockh)
+{
+       ENTRY;
+
+       ldlm_lock_decref(lockh, mode);
+
+       RETURN(0);
+}
+
+static void mgc_notify_active(struct obd_device *unused)
+{
+       /* wakeup mgc_requeue_thread to requeue mgc lock */
+       spin_lock(&config_list_lock);
+       rq_state |= RQ_NOW;
+       spin_unlock(&config_list_lock);
+       wake_up(&rq_waitq);
+
+       /* TODO: Help the MGS rebuild nidtbl. -jay */
+}
+
+/* Send target_reg message to MGS */
+static int mgc_target_register(struct obd_export *exp,
+                              struct mgs_target_info *mti)
+{
+       struct ptlrpc_request  *req;
+       struct mgs_target_info *req_mti, *rep_mti;
+       int                  rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION,
+                                       MGS_TARGET_REG);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO);
+       if (!req_mti) {
+               ptlrpc_req_finished(req);
+               RETURN(-ENOMEM);
+       }
+
+       memcpy(req_mti, mti, sizeof(*req_mti));
+       ptlrpc_request_set_replen(req);
+       CDEBUG(D_MGC, "register %s\n", mti->mti_svname);
+       /* Limit how long we will wait for the request to complete */
+       req->rq_delay_limit = MGC_TARGET_REG_LIMIT;
+
+       rc = ptlrpc_queue_wait(req);
+       if (!rc) {
+               rep_mti = req_capsule_server_get(&req->rq_pill,
+                                                &RMF_MGS_TARGET_INFO);
+               memcpy(mti, rep_mti, sizeof(*rep_mti));
+               CDEBUG(D_MGC, "register %s got index = %d\n",
+                      mti->mti_svname, mti->mti_stripe_index);
+       }
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
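+/* Handle MGC-specific set_info keys: initial-recovery tuning, target
+ * registration, local fs setup/teardown, MGS parameters and the sptlrpc
+ * flavor used for the MGS connection. */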
+int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                      obd_count keylen, void *key, obd_count vallen,
+                      void *val, struct ptlrpc_request_set *set)
+{
+       int rc = -EINVAL;
+       ENTRY;
+
+       /* Turn off initial_recov after we try all backup servers once */
+       if (KEY_IS(KEY_INIT_RECOV_BACKUP)) {
+               struct obd_import *imp = class_exp2cliimp(exp);
+               int value;
+               if (vallen != sizeof(int))
+                       RETURN(-EINVAL);
+               value = *(int *)val;
+               CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n",
+                      imp->imp_obd->obd_name, value,
+                      imp->imp_deactive, imp->imp_invalid,
+                      imp->imp_replayable, imp->imp_obd->obd_replayable,
+                      ptlrpc_import_state_name(imp->imp_state));
+               /* Resurrect if we previously died */
+               if ((imp->imp_state != LUSTRE_IMP_FULL &&
+                    imp->imp_state != LUSTRE_IMP_NEW) || value > 1)
+                       ptlrpc_reconnect_import(imp);
+               RETURN(0);
+       }
+       /* FIXME move this to mgc_process_config */
+       if (KEY_IS(KEY_REGISTER_TARGET)) {
+               struct mgs_target_info *mti;
+               if (vallen != sizeof(struct mgs_target_info))
+                       RETURN(-EINVAL);
+               mti = (struct mgs_target_info *)val;
+               CDEBUG(D_MGC, "register_target %s %#x\n",
+                      mti->mti_svname, mti->mti_flags);
+               rc = mgc_target_register(exp, mti);
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_SET_FS)) {
+               struct super_block *sb = (struct super_block *)val;
+               struct lustre_sb_info *lsi;
+               if (vallen != sizeof(struct super_block))
+                       RETURN(-EINVAL);
+               lsi = s2lsi(sb);
+               rc = mgc_fs_setup(exp->exp_obd, sb, lsi->lsi_srv_mnt);
+               if (rc) {
+                       CERROR("set_fs got %d\n", rc);
+               }
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_CLEAR_FS)) {
+               if (vallen != 0)
+                       RETURN(-EINVAL);
+               rc = mgc_fs_cleanup(exp->exp_obd);
+               if (rc) {
+                       CERROR("clear_fs got %d\n", rc);
+               }
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_SET_INFO)) {
+               struct mgs_send_param *msp;
+
+               msp = (struct mgs_send_param *)val;
+               rc = mgc_set_mgs_param(exp, msp);
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_MGSSEC)) {
+               struct client_obd     *cli = &exp->exp_obd->u.cli;
+               struct sptlrpc_flavor  flvr;
+
+               /*
+                * An empty string means "use the current flavor"; if none
+                * has been set yet, default to null.
+                *
+                * If a flavor was set previously, the requested flavor must
+                * match the existing one.
+                */
+               if (vallen == 0) {
+                       if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID)
+                               RETURN(0);
+                       val = "null";
+                       vallen = 4;
+               }
+
+               rc = sptlrpc_parse_flavor(val, &flvr);
+               if (rc) {
+                       CERROR("invalid sptlrpc flavor %s to MGS\n",
+                              (char *) val);
+                       RETURN(rc);
+               }
+
+               /* the caller already holds a mutex */
+               if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) {
+                       cli->cl_flvr_mgc = flvr;
+               } else if (memcmp(&cli->cl_flvr_mgc, &flvr,
+                                 sizeof(flvr)) != 0) {
+                       char    str[20];
+
+                       sptlrpc_flavor2name(&cli->cl_flvr_mgc,
+                                           str, sizeof(str));
+                       LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but "
+                                      "currently %s is in use\n",
+                                      (char *) val, str);
+                       rc = -EPERM;
+               }
+               RETURN(rc);
+       }
+
+       RETURN(rc);
+}
+
+static int mgc_get_info(const struct lu_env *env, struct obd_export *exp,
+                       __u32 keylen, void *key, __u32 *vallen, void *val,
+                       struct lov_stripe_md *unused)
+{
+       int rc = -EINVAL;
+
+       if (KEY_IS(KEY_CONN_DATA)) {
+               struct obd_import *imp = class_exp2cliimp(exp);
+               struct obd_connect_data *data = val;
+
+               if (*vallen == sizeof(*data)) {
+                       *data = imp->imp_connect_data;
+                       rc = 0;
+               }
+       }
+
+       return rc;
+}
+
+static int mgc_import_event(struct obd_device *obd,
+                           struct obd_import *imp,
+                           enum obd_import_event event)
+{
+       int rc = 0;
+
+       LASSERT(imp->imp_obd == obd);
+       CDEBUG(D_MGC, "import event %#x\n", event);
+
+       switch (event) {
+       case IMP_EVENT_DISCON:
+               /* MGC imports should not wait for recovery */
+               if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+                       ptlrpc_pinger_ir_down();
+               break;
+       case IMP_EVENT_INACTIVE:
+               break;
+       case IMP_EVENT_INVALIDATE: {
+               struct ldlm_namespace *ns = obd->obd_namespace;
+               ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+               break;
+       }
+       case IMP_EVENT_ACTIVE:
+               CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name);
+               /* Clearing obd_no_recov allows us to continue pinging */
+               obd->obd_no_recov = 0;
+               mgc_notify_active(obd);
+               if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+                       ptlrpc_pinger_ir_up();
+               break;
+       case IMP_EVENT_OCD:
+               break;
+       case IMP_EVENT_DEACTIVATE:
+       case IMP_EVENT_ACTIVATE:
+               break;
+       default:
+               CERROR("Unknown import event %#x\n", event);
+               LBUG();
+       }
+       RETURN(rc);
+}
+
+static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                        struct obd_device *tgt, int *index)
+{
+       struct llog_ctxt *ctxt;
+       int rc;
+       ENTRY;
+
+       LASSERT(olg == &obd->obd_olg);
+
+       rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt,
+                       &llog_client_ops);
+       if (rc)
+               GOTO(out, rc);
+
+       ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT);
+       if (!ctxt)
+               GOTO(out, rc = -ENODEV);
+
+       llog_initiator_connect(ctxt);
+       llog_ctxt_put(ctxt);
+
+       RETURN(0);
+out:
+       ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+       RETURN(rc);
+}
+
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+       RETURN(0);
+}
+
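+/* Page counts for recovery-log bulk reads: the initial read fetches the
+ * whole log (1 MiB worth of pages), later reads only fetch the small
+ * increment since the last read. */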
+enum {
+       CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT),
+       CONFIG_READ_NRPAGES      = 4
+};
+
+static int mgc_apply_recover_logs(struct obd_device *mgc,
+                                 struct config_llog_data *cld,
+                                 __u64 max_version,
+                                 void *data, int datalen, bool mne_swab)
+{
+       struct config_llog_instance *cfg = &cld->cld_cfg;
+       struct lustre_sb_info       *lsi = s2lsi(cfg->cfg_sb);
+       struct mgs_nidtbl_entry *entry;
+       struct lustre_cfg       *lcfg;
+       struct lustre_cfg_bufs   bufs;
+       u64   prev_version = 0;
+       char *inst;
+       char *buf;
+       int   bufsz;
+       int   pos;
+       int   rc  = 0;
+       int   off = 0;
+       ENTRY;
+
+       LASSERT(cfg->cfg_instance != NULL);
+       LASSERT(cfg->cfg_sb == cfg->cfg_instance);
+
+       OBD_ALLOC(inst, PAGE_CACHE_SIZE);
+       if (inst == NULL)
+               RETURN(-ENOMEM);
+
+       if (!IS_SERVER(lsi)) {
+               pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance);
+               if (pos >= PAGE_CACHE_SIZE) {
+                       OBD_FREE(inst, PAGE_CACHE_SIZE);
+                       return -E2BIG;
+               }
+       } else {
+               LASSERT(IS_MDT(lsi));
+               rc = server_name2svname(lsi->lsi_svname, inst, NULL,
+                                       PAGE_CACHE_SIZE);
+               if (rc) {
+                       OBD_FREE(inst, PAGE_CACHE_SIZE);
+                       RETURN(-EINVAL);
+               }
+               pos = strlen(inst);
+       }
+
+       ++pos;
+       buf   = inst + pos;
+       bufsz = PAGE_CACHE_SIZE - pos;
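+       /* the rest of the page, past the instance string and its NUL,
+        * is scratch space for building the obd name and import parameter */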
+
+       while (datalen > 0) {
+               int   entry_len = sizeof(*entry);
+               int   is_ost;
+               struct obd_device *obd;
+               char *obdname;
+               char *cname;
+               char *params;
+               char *uuid;
+
+               rc = -EINVAL;
+               if (datalen < sizeof(*entry))
+                       break;
+
+               entry = (typeof(entry))(data + off);
+
+               /* sanity check */
+               if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */
+                       break;
+               if (entry->mne_nid_count == 0) /* at least one nid entry */
+                       break;
+               if (entry->mne_nid_size != sizeof(lnet_nid_t))
+                       break;
+
+               entry_len += entry->mne_nid_count * entry->mne_nid_size;
+               if (datalen < entry_len) /* must have entry_len at least */
+                       break;
+
+               /* Keep this swab for normal mixed endian handling. LU-1644 */
+               if (mne_swab)
+                       lustre_swab_mgs_nidtbl_entry(entry);
+               if (entry->mne_length > PAGE_CACHE_SIZE) {
+                       CERROR("MNE too large (%u)\n", entry->mne_length);
+                       break;
+               }
+
+               if (entry->mne_length < entry_len)
+                       break;
+
+               off     += entry->mne_length;
+               datalen -= entry->mne_length;
+               if (datalen < 0)
+                       break;
+
+               if (entry->mne_version > max_version) {
+                       CERROR("entry index(%lld) is over max_index(%lld)\n",
+                              entry->mne_version, max_version);
+                       break;
+               }
+
+               if (prev_version >= entry->mne_version) {
+                       CERROR("index unsorted, prev %lld, now %lld\n",
+                              prev_version, entry->mne_version);
+                       break;
+               }
+               prev_version = entry->mne_version;
+
+               /*
+                * Write a string with format "nid::instance" to
+                * lustre/<osc|mdc>/<target>-<osc|mdc>-<instance>/import.
+                */
+
+               is_ost = entry->mne_type == LDD_F_SV_TYPE_OST;
+               memset(buf, 0, bufsz);
+               obdname = buf;
+               pos = 0;
+
+               /* lustre-OST0001-osc-<instance #> */
+               strcpy(obdname, cld->cld_logname);
+               cname = strrchr(obdname, '-');
+               if (cname == NULL) {
+                       CERROR("mgc %s: invalid logname %s\n",
+                              mgc->obd_name, obdname);
+                       break;
+               }
+
+               pos = cname - obdname;
+               obdname[pos] = 0;
+               pos += sprintf(obdname + pos, "-%s%04x",
+                                 is_ost ? "OST" : "MDT", entry->mne_index);
+
+               cname = is_ost ? "osc" : "mdc";
+               pos += sprintf(obdname + pos, "-%s-%s", cname, inst);
+               lustre_cfg_bufs_reset(&bufs, obdname);
+
+               /* find the obd by obdname */
+               obd = class_name2obd(obdname);
+               if (obd == NULL) {
+                       CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n",
+                              mgc->obd_name, obdname);
+                       rc = 0;
+                       /* this is a safe race, when the ost is starting up...*/
+                       continue;
+               }
+
+               /* osc.import = "connection=<Conn UUID>::<target instance>" */
+               ++pos;
+               params = buf + pos;
+               pos += sprintf(params, "%s.import=%s", cname, "connection=");
+               uuid = buf + pos;
+
+               down_read(&obd->u.cli.cl_sem);
+               if (obd->u.cli.cl_import == NULL) {
+                       /* the client has not connected to the OST yet */
+                       up_read(&obd->u.cli.cl_sem);
+                       rc = 0;
+                       continue;
+               }
+
+               /* TODO: iterate all nids to find one */
+               /* find uuid by nid */
+               rc = client_import_find_conn(obd->u.cli.cl_import,
+                                            entry->u.nids[0],
+                                            (struct obd_uuid *)uuid);
+               up_read(&obd->u.cli.cl_sem);
+               if (rc < 0) {
+                       CERROR("mgc: cannot find uuid by nid %s\n",
+                              libcfs_nid2str(entry->u.nids[0]));
+                       break;
+               }
+
+               CDEBUG(D_INFO, "Find uuid %s by nid %s\n",
+                      uuid, libcfs_nid2str(entry->u.nids[0]));
+
+               pos += strlen(uuid);
+               pos += sprintf(buf + pos, "::%u", entry->mne_instance);
+               LASSERT(pos < bufsz);
+
+               lustre_cfg_bufs_set_string(&bufs, 1, params);
+
+               rc = -ENOMEM;
+               lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+               if (lcfg == NULL) {
+                       CERROR("mgc: cannot allocate memory\n");
+                       break;
+               }
+
+               CDEBUG(D_INFO, "ir apply logs "LPD64"/"LPD64" for %s -> %s\n",
+                      prev_version, max_version, obdname, params);
+
+               rc = class_process_config(lcfg);
+               lustre_cfg_free(lcfg);
+               if (rc)
+                       CDEBUG(D_INFO, "process config for %s error %d\n",
+                              obdname, rc);
+
+               /* continue even if one entry fails */
+       }
+
+       OBD_FREE(inst, PAGE_CACHE_SIZE);
+       RETURN(rc);
+}
+
+/**
+ * This function is called when the MGS notifies this client that a target
+ * has restarted. A CONFIG_READ RPC is sent to fetch the recovery logs.
+ */
+static int mgc_process_recover_log(struct obd_device *obd,
+                                  struct config_llog_data *cld)
+{
+       struct ptlrpc_request *req = NULL;
+       struct config_llog_instance *cfg = &cld->cld_cfg;
+       struct mgs_config_body *body;
+       struct mgs_config_res  *res;
+       struct ptlrpc_bulk_desc *desc;
+       struct page **pages;
+       int nrpages;
+       bool eof = true;
+       bool mne_swab = false;
+       int i;
+       int ealen;
+       int rc;
+       ENTRY;
+
+       /* Allocate the buffer for the bulk transfer.
+        * The first time logs are read from this MGS, CONFIG_READ_NRPAGES_INIT
+        * is used since all logs are read at once; afterwards only the
+        * increment is read, which should be small, so CONFIG_READ_NRPAGES
+        * is used.
+        */
+       nrpages = CONFIG_READ_NRPAGES;
+       if (cfg->cfg_last_idx == 0) /* the first time */
+               nrpages = CONFIG_READ_NRPAGES_INIT;
+
+       OBD_ALLOC(pages, sizeof(*pages) * nrpages);
+       if (pages == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       for (i = 0; i < nrpages; i++) {
+               pages[i] = alloc_page(GFP_IOFS);
+               if (pages[i] == NULL)
+                       GOTO(out, rc = -ENOMEM);
+       }
+
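+       /* fetch the log in bulk; we loop back here from "out" below until
+        * the MGS reports end-of-log (eof) */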
+again:
+       LASSERT(cld_is_recover(cld));
+       LASSERT(mutex_is_locked(&cld->cld_lock));
+       req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp),
+                                  &RQF_MGS_CONFIG_READ);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ);
+       if (rc)
+               GOTO(out, rc);
+
+       /* pack request */
+       body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY);
+       LASSERT(body != NULL);
+       LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname));
+       if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name))
+           >= sizeof(body->mcb_name))
+               GOTO(out, rc = -E2BIG);
+       body->mcb_offset = cfg->cfg_last_idx + 1;
+       body->mcb_type   = cld->cld_type;
+       body->mcb_bits   = PAGE_CACHE_SHIFT;
+       body->mcb_units  = nrpages;
+
+       /* allocate bulk transfer descriptor */
+       desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK,
+                                   MGS_BULK_PORTAL);
+       if (desc == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       for (i = 0; i < nrpages; i++)
+               ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES);
+       if (res->mcr_size < res->mcr_offset)
+               GOTO(out, rc = -EINVAL);
+
+       /* always update the index, even if errors occur while handling
+        * the recovery logs */
+       cfg->cfg_last_idx = res->mcr_offset;
+       eof = res->mcr_offset == res->mcr_size;
+
+       CDEBUG(D_INFO, "Latest version "LPD64", more %d.\n",
+              res->mcr_offset, eof == false);
+
+       ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0);
+       if (ealen < 0)
+               GOTO(out, rc = ealen);
+
+       if (ealen > nrpages << PAGE_CACHE_SHIFT)
+               GOTO(out, rc = -EINVAL);
+
+       if (ealen == 0) { /* no logs transferred */
+               if (!eof)
+                       rc = -EINVAL;
+               GOTO(out, rc);
+       }
+
+       mne_swab = !!ptlrpc_rep_need_swab(req);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+       /* This import flag means the server did an extra swab of IR MNE
+        * records (fixed in LU-1252), reverse it here if needed. LU-1644 */
+       if (unlikely(req->rq_import->imp_need_mne_swab))
+               mne_swab = !mne_swab;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+       for (i = 0; i < nrpages && ealen > 0; i++) {
+               int rc2;
+               void *ptr;
+
+               ptr = kmap(pages[i]);
+               rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr,
+                                            min_t(int, ealen, PAGE_CACHE_SIZE),
+                                            mne_swab);
+               kunmap(pages[i]);
+               if (rc2 < 0) {
+                       CWARN("Process recover log %s error %d\n",
+                             cld->cld_logname, rc2);
+                       break;
+               }
+
+               ealen -= PAGE_CACHE_SIZE;
+       }
+
+out:
+       if (req)
+               ptlrpc_req_finished(req);
+
+       if (rc == 0 && !eof)
+               goto again;
+
+       if (pages) {
+               for (i = 0; i < nrpages; i++) {
+                       if (pages[i] == NULL)
+                               break;
+                       __free_page(pages[i]);
+               }
+               OBD_FREE(pages, sizeof(*pages) * nrpages);
+       }
+       return rc;
+}
+
+/* local_only means it cannot get remote llogs */
+static int mgc_process_cfg_log(struct obd_device *mgc,
+                              struct config_llog_data *cld,
+                              int local_only)
+{
+       struct llog_ctxt *ctxt, *lctxt = NULL;
+       struct lvfs_run_ctxt *saved_ctxt;
+       struct lustre_sb_info *lsi = NULL;
+       int rc = 0, must_pop = 0;
+       bool sptlrpc_started = false;
+
+       ENTRY;
+
+       LASSERT(cld);
+       LASSERT(mutex_is_locked(&cld->cld_lock));
+
+       /*
+        * The local copy of the sptlrpc log is controlled elsewhere;
+        * don't try to read it here.
+        */
+       if (cld_is_sptlrpc(cld) && local_only)
+               RETURN(0);
+
+       if (cld->cld_cfg.cfg_sb)
+               lsi = s2lsi(cld->cld_cfg.cfg_sb);
+
+       ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT);
+       if (!ctxt) {
+               CERROR("missing llog context\n");
+               RETURN(-EINVAL);
+       }
+
+       OBD_ALLOC_PTR(saved_ctxt);
+       if (saved_ctxt == NULL)
+               RETURN(-ENOMEM);
+
+       lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT);
+
+       if (local_only) { /* no local log at client side */
+               GOTO(out_pop, rc = -EIO);
+       }
+
+       if (cld_is_sptlrpc(cld)) {
+               sptlrpc_conf_log_update_begin(cld->cld_logname);
+               sptlrpc_started = true;
+       }
+
+       /* logname and instance info should be the same, so use our
+          copy of the instance for the update.  The cfg_last_idx will
+          be updated here. */
+       rc = class_config_parse_llog(NULL, ctxt, cld->cld_logname,
+                                    &cld->cld_cfg);
+       EXIT;
+
+out_pop:
+       llog_ctxt_put(ctxt);
+       if (lctxt)
+               llog_ctxt_put(lctxt);
+       if (must_pop)
+               pop_ctxt(saved_ctxt, &mgc->obd_lvfs_ctxt, NULL);
+
+       OBD_FREE_PTR(saved_ctxt);
+       /*
+        * Update settings on existing OBDs.  This is done inside
+        * llog_process_lock so that no device is attaching/detaching
+        * in parallel.
+        * The logname must be <fsname>-sptlrpc.
+        */
+       if (sptlrpc_started) {
+               LASSERT(cld_is_sptlrpc(cld));
+               sptlrpc_conf_log_update_end(cld->cld_logname);
+               class_notify_sptlrpc_conf(cld->cld_logname,
+                                         strlen(cld->cld_logname) -
+                                         strlen("-sptlrpc"));
+       }
+
+       RETURN(rc);
+}
+
+/** Get a config log from the MGS and process it.
+ * This func is called for both clients and servers.
+ * Copy the log locally before parsing it if appropriate (non-MGS server)
+ */
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld)
+{
+       struct lustre_handle lockh = { 0 };
+       __u64 flags = LDLM_FL_NO_LRU;
+       int rc = 0, rcl;
+       ENTRY;
+
+       LASSERT(cld);
+
+       /* I don't want multiple processes running process_log at once --
+          sounds like badness.  It actually might be fine, as long as
+          we're not trying to update from the same log
+          simultaneously (in which case we should use a per-log sem.) */
+       mutex_lock(&cld->cld_lock);
+       if (cld->cld_stopping) {
+               mutex_unlock(&cld->cld_lock);
+               RETURN(0);
+       }
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
+
+       CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname,
+              cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
+
+       /* Get the cfg lock on the llog */
+       rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL,
+                         LCK_CR, &flags, NULL, NULL, NULL,
+                         cld, 0, NULL, &lockh);
+       if (rcl == 0) {
+               /* Get the cld, it will be released in mgc_blocking_ast. */
+               config_log_get(cld);
+               rc = ldlm_lock_set_data(&lockh, (void *)cld);
+               LASSERT(rc == 0);
+       } else {
+               CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
+
+               /* mark cld_lostlock so that it will requeue
+                * after MGC becomes available. */
+               cld->cld_lostlock = 1;
+               /* Get extra reference, it will be put in requeue thread */
+               config_log_get(cld);
+       }
+
+       if (cld_is_recover(cld)) {
+               rc = 0; /* this is not a fatal error for recover log */
+               if (rcl == 0)
+                       rc = mgc_process_recover_log(mgc, cld);
+       } else {
+               rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
+       }
+
+       CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
+              mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
+
+       mutex_unlock(&cld->cld_lock);
+
+       /* Now drop the lock so MGS can revoke it */
+       if (!rcl) {
+               rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, NULL,
+                                LCK_CR, &lockh);
+               if (rcl)
+                       CERROR("Can't drop cfg lock: %d\n", rcl);
+       }
+
+       RETURN(rc);
+}
+
+/** Called from lustre_process_log.
+ * LCFG_LOG_START gets the config log from the MGS, processes it to start
+ * any services, and adds it to the list of logs to watch (follow).
+ */
+static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+       struct lustre_cfg *lcfg = buf;
+       struct config_llog_instance *cfg = NULL;
+       char *logname;
+       int rc = 0;
+       ENTRY;
+
+       switch (lcfg->lcfg_command) {
+       case LCFG_LOV_ADD_OBD: {
+               /* Overloading this cfg command: register a new target */
+               struct mgs_target_info *mti;
+
+               if (LUSTRE_CFG_BUFLEN(lcfg, 1) !=
+                   sizeof(struct mgs_target_info))
+                       GOTO(out, rc = -EINVAL);
+
+               mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1);
+               CDEBUG(D_MGC, "add_target %s %#x\n",
+                      mti->mti_svname, mti->mti_flags);
+               rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti);
+               break;
+       }
+       case LCFG_LOV_DEL_OBD:
+               /* Unregister has no meaning at the moment. */
+               CERROR("lov_del_obd unimplemented\n");
+               rc = -ENOSYS;
+               break;
+       case LCFG_SPTLRPC_CONF: {
+               rc = sptlrpc_process_config(lcfg);
+               break;
+       }
+       case LCFG_LOG_START: {
+               struct config_llog_data *cld;
+               struct super_block *sb;
+
+               logname = lustre_cfg_string(lcfg, 1);
+               cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2);
+               sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3);
+
+               CDEBUG(D_MGC, "parse_log %s from %d\n", logname,
+                      cfg->cfg_last_idx);
+
+               /* We're only called through here on the initial mount */
+               rc = config_log_add(obd, logname, cfg, sb);
+               if (rc)
+                       break;
+               cld = config_log_find(logname, cfg);
+               if (cld == NULL) {
+                       rc = -ENOENT;
+                       break;
+               }
+
+               /* COMPAT_146 */
+               /* FIXME only set this for old logs!  Right now this forces
+                  us to always skip the "inside markers" check */
+               cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146;
+
+               rc = mgc_process_log(obd, cld);
+               if (rc == 0 && cld->cld_recover != NULL) {
+                       if (OCD_HAS_FLAG(&obd->u.cli.cl_import->
+                                        imp_connect_data, IMP_RECOV)) {
+                               rc = mgc_process_log(obd, cld->cld_recover);
+                       } else {
+                               struct config_llog_data *cir = cld->cld_recover;
+                               cld->cld_recover = NULL;
+                               config_log_put(cir);
+                       }
+                       if (rc)
+                               CERROR("Cannot process recover llog %d\n", rc);
+               }
+               config_log_put(cld);
+
+               break;
+       }
+       case LCFG_LOG_END: {
+               logname = lustre_cfg_string(lcfg, 1);
+
+               if (lcfg->lcfg_bufcount >= 2)
+                       cfg = (struct config_llog_instance *)lustre_cfg_buf(
+                               lcfg, 2);
+               rc = config_log_end(logname, cfg);
+               break;
+       }
+       default: {
+               CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+               GOTO(out, rc = -EINVAL);
+       }
+       }
+out:
+       RETURN(rc);
+}
+
+struct obd_ops mgc_obd_ops = {
+       .o_owner        = THIS_MODULE,
+       .o_setup        = mgc_setup,
+       .o_precleanup   = mgc_precleanup,
+       .o_cleanup      = mgc_cleanup,
+       .o_add_conn     = client_import_add_conn,
+       .o_del_conn     = client_import_del_conn,
+       .o_connect      = client_connect_import,
+       .o_disconnect   = client_disconnect_export,
+       //.o_enqueue      = mgc_enqueue,
+       .o_cancel       = mgc_cancel,
+       //.o_iocontrol    = mgc_iocontrol,
+       .o_set_info_async = mgc_set_info_async,
+       .o_get_info       = mgc_get_info,
+       .o_import_event = mgc_import_event,
+       .o_llog_init    = mgc_llog_init,
+       .o_llog_finish  = mgc_llog_finish,
+       .o_process_config = mgc_process_config,
+};
+
+int __init mgc_init(void)
+{
+       return class_register_type(&mgc_obd_ops, NULL, NULL,
+                                  LUSTRE_MGC_NAME, NULL);
+}
+
+static void /*__exit*/ mgc_exit(void)
+{
+       class_unregister_type(LUSTRE_MGC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Management Client");
+MODULE_LICENSE("GPL");
+
+module_init(mgc_init);
+module_exit(mgc_exit);
diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile
new file mode 100644 (file)
index 0000000..d2763f3
--- /dev/null
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LUSTRE_FS) += obdclass.o llog_test.o
+
+obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \
+             llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \
+             genops.o uuid.o llog_ioctl.o lprocfs_status.o                \
+             lprocfs_jobstats.o lustre_handles.o lustre_peer.o llog_osd.o \
+             local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\
+             mea.o lu_object.o dt_object.o capa.o cl_object.o   \
+             cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o           \
+             md_local_object.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c
new file mode 100644 (file)
index 0000000..c2a6702
--- /dev/null
@@ -0,0 +1,546 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/acl.c
+ *
+ * Lustre Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <lu_object.h>
+#include <lustre_acl.h>
+#include <lustre_eacl.h>
+#include <obd_support.h>
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION
+
+enum {
+       ES_UNK  = 0,    /* unknown stat */
+       ES_UNC  = 1,    /* ACL entry is not changed */
+       ES_MOD  = 2,    /* ACL entry is modified */
+       ES_ADD  = 3,    /* ACL entry is added */
+       ES_DEL  = 4     /* ACL entry is deleted */
+};
+
+static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d,
+                                           ext_acl_xattr_entry *s)
+{
+       d->e_tag        = le16_to_cpu(s->e_tag);
+       d->e_perm       = le16_to_cpu(s->e_perm);
+       d->e_id  = le32_to_cpu(s->e_id);
+       d->e_stat       = le32_to_cpu(s->e_stat);
+}
+
+static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d,
+                                           ext_acl_xattr_entry *s)
+{
+       d->e_tag        = cpu_to_le16(s->e_tag);
+       d->e_perm       = cpu_to_le16(s->e_perm);
+       d->e_id  = cpu_to_le32(s->e_id);
+       d->e_stat       = cpu_to_le32(s->e_stat);
+}
+
+static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d,
+                                             posix_acl_xattr_entry *s)
+{
+       d->e_tag        = le16_to_cpu(s->e_tag);
+       d->e_perm       = le16_to_cpu(s->e_perm);
+       d->e_id  = le32_to_cpu(s->e_id);
+}
+
+static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d,
+                                             posix_acl_xattr_entry *s)
+{
+       d->e_tag        = cpu_to_le16(s->e_tag);
+       d->e_perm       = cpu_to_le16(s->e_perm);
+       d->e_id  = cpu_to_le32(s->e_id);
+}
+
+
+/* if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */
+static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header,
+                                              int old_count, int new_count)
+{
+       int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr);
+       int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr);
+       posix_acl_xattr_header *new;
+
+       if (unlikely(old_count <= new_count))
+               return old_size;
+
+       OBD_ALLOC(new, new_size);
+       if (unlikely(new == NULL))
+               return -ENOMEM;
+
+       memcpy(new, *header, new_size);
+       OBD_FREE(*header, old_size);
+       *header = new;
+       return new_size;
+}
+
+/* if "new_count == 0", then "new = {0, NULL}", NOT NULL. */
+static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header,
+                                            int old_count)
+{
+       int ext_count = le32_to_cpu((*header)->a_count);
+       int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+       int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr);
+       ext_acl_xattr_header *new;
+
+       if (unlikely(old_count <= ext_count))
+               return 0;
+
+       OBD_ALLOC(new, ext_size);
+       if (unlikely(new == NULL))
+               return -ENOMEM;
+
+       memcpy(new, *header, ext_size);
+       OBD_FREE(*header, old_size);
+       *header = new;
+       return 0;
+}
+
+/*
+ * Generate new extended ACL based on the posix ACL.
+ */
+ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size)
+{
+       int count, i, esize;
+       ext_acl_xattr_header *new;
+       ENTRY;
+
+       if (unlikely(size < 0))
+               RETURN(ERR_PTR(-EINVAL));
+       else if (!size)
+               count = 0;
+       else
+               count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+       esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr);
+       OBD_ALLOC(new, esize);
+       if (unlikely(new == NULL))
+               RETURN(ERR_PTR(-ENOMEM));
+
+       new->a_count = cpu_to_le32(count);
+       for (i = 0; i < count; i++) {
+               new->a_entries[i].e_tag  = header->a_entries[i].e_tag;
+               new->a_entries[i].e_perm = header->a_entries[i].e_perm;
+               new->a_entries[i].e_id   = header->a_entries[i].e_id;
+               new->a_entries[i].e_stat = cpu_to_le32(ES_UNK);
+       }
+
+       RETURN(new);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext);
+
+/*
+ * Filter out the "nobody" entries in the posix ACL.
+ */
+int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+                                 posix_acl_xattr_header **out)
+{
+       int count, i, j, rc = 0;
+       __u32 id;
+       posix_acl_xattr_header *new;
+       ENTRY;
+
+       if (unlikely(size < 0))
+               RETURN(-EINVAL);
+       else if (!size)
+               RETURN(0);
+
+       OBD_ALLOC(new, size);
+       if (unlikely(new == NULL))
+               RETURN(-ENOMEM);
+
+       new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+       count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+       for (i = 0, j = 0; i < count; i++) {
+               id = le32_to_cpu(header->a_entries[i].e_id);
+               switch (le16_to_cpu(header->a_entries[i].e_tag)) {
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       if (id != ACL_UNDEFINED_ID)
+                               GOTO(_out, rc = -EIO);
+
+                       memcpy(&new->a_entries[j++], &header->a_entries[i],
+                              sizeof(posix_acl_xattr_entry));
+                       break;
+               case ACL_USER:
+                       if (id != NOBODY_UID)
+                               memcpy(&new->a_entries[j++],
+                                      &header->a_entries[i],
+                                      sizeof(posix_acl_xattr_entry));
+                       break;
+               case ACL_GROUP:
+                       if (id != NOBODY_GID)
+                               memcpy(&new->a_entries[j++],
+                                      &header->a_entries[i],
+                                      sizeof(posix_acl_xattr_entry));
+                       break;
+               default:
+                       GOTO(_out, rc = -EIO);
+               }
+       }
+
+       /* free unused space. */
+       rc = lustre_posix_acl_xattr_reduce_space(&new, count, j);
+       if (rc >= 0) {
+               size = rc;
+               *out = new;
+               rc = 0;
+       }
+       EXIT;
+
+_out:
+       if (rc) {
+               OBD_FREE(new, size);
+               size = rc;
+       }
+       return size;
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_filter);
+
+/*
+ * Release the posix ACL space.
+ */
+void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size)
+{
+       OBD_FREE(header, size);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_free);
+
+/*
+ * Release the extended ACL space.
+ */
+void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header)
+{
+       OBD_FREE(header, CFS_ACL_XATTR_SIZE(le32_to_cpu(header->a_count),
+                                           ext_acl_xattr));
+}
+EXPORT_SYMBOL(lustre_ext_acl_xattr_free);
+
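+/* Find the extended ACL entry matching @entry's tag and id, scanning
+ * circularly from *pos and wrapping around at most once; *pos is advanced
+ * past a match so repeated lookups resume where the last one ended. */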
+static ext_acl_xattr_entry *
+lustre_ext_acl_xattr_search(ext_acl_xattr_header *header,
+                           posix_acl_xattr_entry *entry, int *pos)
+{
+       int once, start, end, i, j, count = le32_to_cpu(header->a_count);
+
+       once = 0;
+       start = *pos;
+       end = count;
+
+again:
+       for (i = start; i < end; i++) {
+               if (header->a_entries[i].e_tag == entry->e_tag &&
+                   header->a_entries[i].e_id == entry->e_id) {
+                       j = i;
+                       if (++i >= count)
+                               i = 0;
+                       *pos = i;
+                       return &header->a_entries[j];
+               }
+       }
+
+       if (!once) {
+               once = 1;
+               start = 0;
+               end = *pos;
+               goto again;
+       }
+
+       return NULL;
+}
+
+/*
+ * Merge the posix ACL and the extended ACL into new posix ACL.
+ */
+int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+                                ext_acl_xattr_header *ext_header,
+                                posix_acl_xattr_header **out)
+{
+       int posix_count, posix_size, i, j;
+       int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0;
+       posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID};
+       posix_acl_xattr_header *new;
+       ext_acl_xattr_entry *ee, ae;
+       ENTRY;
+
+       lustre_posix_acl_cpu_to_le(&pe, &pe);
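+       /* probe the extended ACL for an ACL_MASK entry to decide which
+        * merge path to take */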
+       ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos);
+       if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) {
+               /* there are only base ACL entries at most. */
+               posix_count = 3;
+               posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+               OBD_ALLOC(new, posix_size);
+               if (unlikely(new == NULL))
+                       RETURN(-ENOMEM);
+
+               new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+               for (i = 0, j = 0; i < ext_count; i++) {
+                       lustre_ext_acl_le_to_cpu(&ae,
+                                                &ext_header->a_entries[i]);
+                       switch (ae.e_tag) {
+                       case ACL_USER_OBJ:
+                       case ACL_GROUP_OBJ:
+                       case ACL_OTHER:
+                               if (ae.e_id != ACL_UNDEFINED_ID)
+                                       GOTO(_out, rc = -EIO);
+
+                               if (ae.e_stat != ES_DEL) {
+                                       new->a_entries[j].e_tag =
+                                               ext_header->a_entries[i].e_tag;
+                                       new->a_entries[j].e_perm =
+                                               ext_header->a_entries[i].e_perm;
+                                       new->a_entries[j++].e_id =
+                                               ext_header->a_entries[i].e_id;
+                               }
+                               break;
+                       case ACL_MASK:
+                       case ACL_USER:
+                       case ACL_GROUP:
+                               if (ae.e_stat == ES_DEL)
+                                       break;
+                       default:
+                               GOTO(_out, rc = -EIO);
+                       }
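+                               /* fall through - a non-deleted entry here is
+                                * unexpected when no mask entry survives */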
+               }
+       } else {
+               /* There may be valid ACL_USER or ACL_GROUP entries in the
+                * original server-side ACL; they are regarded as ES_UNC. */
+               int ori_posix_count;
+
+               if (unlikely(size < 0))
+                       RETURN(-EINVAL);
+               else if (!size)
+                       ori_posix_count = 0;
+               else
+                       ori_posix_count =
+                               CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+               posix_count = ori_posix_count + ext_count;
+               posix_size =
+                       CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+               OBD_ALLOC(new, posix_size);
+               if (unlikely(new == NULL))
+                       RETURN(-ENOMEM);
+
+               new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+               /* 1. process the unchanged ACL entries
+                *    in the original server-side ACL. */
+               pos = 0;
+               for (i = 0, j = 0; i < ori_posix_count; i++) {
+                       ee = lustre_ext_acl_xattr_search(ext_header,
+                                       &posix_header->a_entries[i], &pos);
+                       if (ee == NULL)
+                               memcpy(&new->a_entries[j++],
+                                      &posix_header->a_entries[i],
+                                      sizeof(posix_acl_xattr_entry));
+               }
+
+               /* 2. process the non-deleted entries
+                *    from client-side extended ACL. */
+               for (i = 0; i < ext_count; i++) {
+                       if (le32_to_cpu(ext_header->a_entries[i].e_stat) !=
+                           ES_DEL) {
+                               new->a_entries[j].e_tag =
+                                               ext_header->a_entries[i].e_tag;
+                               new->a_entries[j].e_perm =
+                                               ext_header->a_entries[i].e_perm;
+                               new->a_entries[j++].e_id =
+                                               ext_header->a_entries[i].e_id;
+                       }
+               }
+       }
+
+       /* free unused space. */
+       rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j);
+       if (rc >= 0) {
+               posix_size = rc;
+               *out = new;
+               rc = 0;
+       }
+       EXIT;
+
+_out:
+       if (rc) {
+               OBD_FREE(new, posix_size);
+               posix_size = rc;
+       }
+       return posix_size;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2posix);
+
+/*
+ * Merge the posix ACL and the extended ACL into new extended ACL.
+ */
+ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+                          ext_acl_xattr_header *ext_header)
+{
+       int ori_ext_count, posix_count, ext_count, ext_size;
+       int i, j, pos = 0, rc = 0;
+       posix_acl_xattr_entry pae;
+       ext_acl_xattr_header *new;
+       ext_acl_xattr_entry *ee, eae;
+       ENTRY;
+
+       if (unlikely(size < 0))
+               RETURN(ERR_PTR(-EINVAL));
+       else if (!size)
+               posix_count = 0;
+       else
+               posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+       ori_ext_count = le32_to_cpu(ext_header->a_count);
+       ext_count = posix_count + ori_ext_count;
+       ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+
+       OBD_ALLOC(new, ext_size);
+       if (unlikely(new == NULL))
+               RETURN(ERR_PTR(-ENOMEM));
+
+       for (i = 0, j = 0; i < posix_count; i++) {
+               lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]);
+               switch (pae.e_tag) {
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       if (pae.e_id != ACL_UNDEFINED_ID)
+                               GOTO(out, rc = -EIO);
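+                       /* fall through - base entries are copied into the
+                        * extended ACL just like ACL_USER entries */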
+               case ACL_USER:
+                       /* ignore "nobody" entry. */
+                       if (pae.e_id == NOBODY_UID)
+                               break;
+
+                       new->a_entries[j].e_tag =
+                                       posix_header->a_entries[i].e_tag;
+                       new->a_entries[j].e_perm =
+                                       posix_header->a_entries[i].e_perm;
+                       new->a_entries[j].e_id =
+                                       posix_header->a_entries[i].e_id;
+                       ee = lustre_ext_acl_xattr_search(ext_header,
+                                       &posix_header->a_entries[i], &pos);
+                       if (ee) {
+                               if (posix_header->a_entries[i].e_perm !=
+                                                               ee->e_perm)
+                                       /* entry modified. */
+                                       ee->e_stat =
+                                       new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_MOD);
+                               else
+                                       /* entry unchanged. */
+                                       ee->e_stat =
+                                       new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_UNC);
+                       } else {
+                               /* new entry. */
+                               new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_ADD);
+                       }
+                       break;
+               case ACL_GROUP:
+                       /* ignore "nobody" entry. */
+                       if (pae.e_id == NOBODY_GID)
+                               break;
+                       new->a_entries[j].e_tag =
+                                       posix_header->a_entries[i].e_tag;
+                       new->a_entries[j].e_perm =
+                                       posix_header->a_entries[i].e_perm;
+                       new->a_entries[j].e_id =
+                                       posix_header->a_entries[i].e_id;
+                       ee = lustre_ext_acl_xattr_search(ext_header,
+                                       &posix_header->a_entries[i], &pos);
+                       if (ee) {
+                               if (posix_header->a_entries[i].e_perm !=
+                                                               ee->e_perm)
+                                       /* entry modified. */
+                                       ee->e_stat =
+                                       new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_MOD);
+                               else
+                                       /* entry unchanged. */
+                                       ee->e_stat =
+                                       new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_UNC);
+                       } else {
+                               /* new entry. */
+                               new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_ADD);
+                       }
+                       break;
+               default:
+                       GOTO(out, rc = -EIO);
+               }
+       }
+
+       /* process deleted entries. */
+       for (i = 0; i < ori_ext_count; i++) {
+               lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]);
+               if (eae.e_stat == ES_UNK) {
+                       /* ignore "nobody" entry. */
+                       if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) ||
+                           (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID))
+                               continue;
+
+                       new->a_entries[j].e_tag =
+                                               ext_header->a_entries[i].e_tag;
+                       new->a_entries[j].e_perm =
+                                               ext_header->a_entries[i].e_perm;
+                       new->a_entries[j].e_id = ext_header->a_entries[i].e_id;
+                       new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL);
+               }
+       }
+
+       new->a_count = cpu_to_le32(j);
+       /* free unused space. */
+       rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count);
+       EXIT;
+
+out:
+       if (rc) {
+               OBD_FREE(new, ext_size);
+               new = ERR_PTR(rc);
+       }
+       return new;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2ext);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c
new file mode 100644 (file)
index 0000000..3e532f5
--- /dev/null
@@ -0,0 +1,401 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/capa.c
+ *
+ * Lustre Capability Hash Management
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/list.h>
+#include <lustre_capa.h>
+
+#define NR_CAPAHASH 32
+#define CAPA_HASH_SIZE 3000          /* for MDS & OSS */
+
+struct kmem_cache *capa_cachep = NULL;
+
+/* lock for capa hash/capa_list/fo_capa_keys */
+DEFINE_SPINLOCK(capa_lock);
+
+struct list_head capa_list[CAPA_SITE_MAX];
+
+static struct capa_hmac_alg capa_hmac_algs[] = {
+       DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20),
+};
+/* capa count */
+int capa_count[CAPA_SITE_MAX] = { 0, };
+
+EXPORT_SYMBOL(capa_cachep);
+EXPORT_SYMBOL(capa_list);
+EXPORT_SYMBOL(capa_lock);
+EXPORT_SYMBOL(capa_count);
+
+struct hlist_head *init_capa_hash(void)
+{
+       struct hlist_head *hash;
+       int nr_hash, i;
+
+       OBD_ALLOC(hash, PAGE_CACHE_SIZE);
+       if (!hash)
+               return NULL;
+
+       nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head);
+       LASSERT(nr_hash > NR_CAPAHASH);
+
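+       /* only the first NR_CAPAHASH buckets of the page are actually used */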
+       for (i = 0; i < NR_CAPAHASH; i++)
+               INIT_HLIST_HEAD(hash + i);
+       return hash;
+}
+EXPORT_SYMBOL(init_capa_hash);
+
+static inline int capa_on_server(struct obd_capa *ocapa)
+{
+       return ocapa->c_site == CAPA_SITE_SERVER;
+}
+
+static inline void capa_delete(struct obd_capa *ocapa)
+{
+       LASSERT(capa_on_server(ocapa));
+       hlist_del_init(&ocapa->u.tgt.c_hash);
+       list_del_init(&ocapa->c_list);
+       capa_count[ocapa->c_site]--;
+       /* release the reference taken when the capa was allocated */
+       capa_put(ocapa);
+}
+
+void cleanup_capa_hash(struct hlist_head *hash)
+{
+       int i;
+       struct hlist_node *next;
+       struct obd_capa *oc;
+
+       spin_lock(&capa_lock);
+       for (i = 0; i < NR_CAPAHASH; i++) {
+               hlist_for_each_entry_safe(oc, next, hash + i,
+                                             u.tgt.c_hash)
+                       capa_delete(oc);
+       }
+       spin_unlock(&capa_lock);
+
+       OBD_FREE(hash, PAGE_CACHE_SIZE);
+}
+EXPORT_SYMBOL(cleanup_capa_hash);
+
+static inline int capa_hashfn(struct lu_fid *fid)
+{
+       return (fid_oid(fid) ^ fid_ver(fid)) *
+              (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH;
+}
+
+/* The capa renewal time check on the server is earlier than on the client,
+ * to prevent the client from renewing the capa right after obtaining it. */
+static inline int capa_is_to_expire(struct obd_capa *oc)
+{
+       return cfs_time_before(cfs_time_sub(oc->c_expiry,
+                                  cfs_time_seconds(oc->c_capa.lc_timeout)*2/3),
+                              cfs_time_current());
+}
+
+static struct obd_capa *find_capa(struct lustre_capa *capa,
+                                 struct hlist_head *head, int alive)
+{
+       struct obd_capa *ocapa;
+       int len = alive ? offsetof(struct lustre_capa, lc_keyid):sizeof(*capa);
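+       /* when looking for a live capa compare only the fields before
+        * lc_keyid; otherwise require the whole capability to match */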
+
+       hlist_for_each_entry(ocapa, head, u.tgt.c_hash) {
+               if (memcmp(&ocapa->c_capa, capa, len))
+                       continue;
+               /* don't return one that will expire soon in this case */
+               if (alive && capa_is_to_expire(ocapa))
+                       continue;
+
+               LASSERT(capa_on_server(ocapa));
+
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found");
+               return ocapa;
+       }
+
+       return NULL;
+}
+
+#define LRU_CAPA_DELETE_COUNT 12
+static inline void capa_delete_lru(struct list_head *head)
+{
+       struct obd_capa *ocapa;
+       struct list_head *node = head->next;
+       int count = 0;
+
+       /* free LRU_CAPA_DELETE_COUNT unused capa from head */
+       while (count++ < LRU_CAPA_DELETE_COUNT) {
+               ocapa = list_entry(node, struct obd_capa, c_list);
+               node = node->next;
+               if (atomic_read(&ocapa->c_refc))
+                       continue;
+
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru");
+               capa_delete(ocapa);
+       }
+}
+
+/* add or update */
+struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa)
+{
+       struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid);
+       struct obd_capa *ocapa, *old = NULL;
+       struct list_head *list = &capa_list[CAPA_SITE_SERVER];
+
+       ocapa = alloc_capa(CAPA_SITE_SERVER);
+       if (IS_ERR(ocapa))
+               return NULL;
+
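+       /* the new capa was allocated outside the spinlock; if a matching
+        * entry already exists, the new one is dropped via capa_put() below */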
+       spin_lock(&capa_lock);
+       old = find_capa(capa, head, 0);
+       if (!old) {
+               ocapa->c_capa = *capa;
+               set_capa_expiry(ocapa);
+               hlist_add_head(&ocapa->u.tgt.c_hash, head);
+               list_add_tail(&ocapa->c_list, list);
+               capa_get(ocapa);
+               capa_count[CAPA_SITE_SERVER]++;
+               if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE)
+                       capa_delete_lru(list);
+               spin_unlock(&capa_lock);
+               return ocapa;
+       } else {
+               capa_get(old);
+               spin_unlock(&capa_lock);
+               capa_put(ocapa);
+               return old;
+       }
+}
+EXPORT_SYMBOL(capa_add);
+
+struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa,
+                            int alive)
+{
+       struct obd_capa *ocapa;
+
+       spin_lock(&capa_lock);
+       ocapa = find_capa(capa, hash + capa_hashfn(&capa->lc_fid), alive);
+       if (ocapa) {
+               list_move_tail(&ocapa->c_list,
+                                  &capa_list[CAPA_SITE_SERVER]);
+               capa_get(ocapa);
+       }
+       spin_unlock(&capa_lock);
+
+       return ocapa;
+}
+EXPORT_SYMBOL(capa_lookup);
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key)
+{
+       struct ll_crypto_hash *tfm;
+       struct capa_hmac_alg  *alg;
+       int keylen;
+       struct scatterlist sl;
+
+       if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) {
+               CERROR("unknown capability hmac algorithm!\n");
+               return -EFAULT;
+       }
+
+       alg = &capa_hmac_algs[capa_alg(capa)];
+
+       tfm = ll_crypto_alloc_hash(alg->ha_name, 0, 0);
+       if (!tfm) {
+               CERROR("crypto_alloc_tfm failed, check whether your kernel "
+                      "has crypto support!\n");
+               return -ENOMEM;
+       }
+       keylen = alg->ha_keylen;
+
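+       /* the HMAC covers the capability body up to, but not including,
+        * the lc_hmac field itself */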
+       sg_set_page(&sl, virt_to_page(capa),
+                   offsetof(struct lustre_capa, lc_hmac),
+                   (unsigned long)(capa) % PAGE_CACHE_SIZE);
+
+       ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac);
+       ll_crypto_free_hash(tfm);
+
+       return 0;
+}
+EXPORT_SYMBOL(capa_hmac);
+
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+       struct ll_crypto_cipher *tfm;
+       struct scatterlist sd;
+       struct scatterlist ss;
+       struct blkcipher_desc desc;
+       unsigned int min;
+       int rc;
+       char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+       ENTRY;
+
+       /* passing "aes" in a variable instead of a constant string keeps gcc
+        * 4.3.2 happy */
+       tfm = ll_crypto_alloc_blkcipher(alg, 0, 0);
+       if (IS_ERR(tfm)) {
+               CERROR("failed to load transform for aes\n");
+               RETURN(PTR_ERR(tfm));
+       }
+
+       min = ll_crypto_tfm_alg_min_keysize(tfm);
+       if (keylen < min) {
+               CERROR("keylen must be at least %d bits for aes\n", min * 8);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+       if (rc) {
+               CERROR("failed to set key for aes\n");
+               GOTO(out, rc);
+       }
+
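+       /* encrypt exactly one 16-byte AES block from *s into *d */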
+       sg_set_page(&sd, virt_to_page(d), 16,
+                   (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+       sg_set_page(&ss, virt_to_page(s), 16,
+                   (unsigned long)(s) % PAGE_CACHE_SIZE);
+       desc.tfm   = tfm;
+       desc.info  = NULL;
+       desc.flags = 0;
+       rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16);
+       if (rc) {
+               CERROR("failed to encrypt for aes\n");
+               GOTO(out, rc);
+       }
+
+       EXIT;
+
+out:
+       ll_crypto_free_blkcipher(tfm);
+       return rc;
+}
+EXPORT_SYMBOL(capa_encrypt_id);
+
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+       struct ll_crypto_cipher *tfm;
+       struct scatterlist sd;
+       struct scatterlist ss;
+       struct blkcipher_desc desc;
+       unsigned int min;
+       int rc;
+       char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+       ENTRY;
+
+       /* passing "aes" in a variable instead of a constant string keeps gcc
+        * 4.3.2 happy */
+       tfm = ll_crypto_alloc_blkcipher(alg, 0, 0);
+       if (IS_ERR(tfm)) {
+               CERROR("failed to load transform for aes\n");
+               RETURN(PTR_ERR(tfm));
+       }
+
+       min = ll_crypto_tfm_alg_min_keysize(tfm);
+       if (keylen < min) {
+               CERROR("keylen must be at least %d bits for aes\n", min * 8);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+       if (rc) {
+               CERROR("failed to set key for aes\n");
+               GOTO(out, rc);
+       }
+
+       sg_set_page(&sd, virt_to_page(d), 16,
+                   (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+       sg_set_page(&ss, virt_to_page(s), 16,
+                   (unsigned long)(s) % PAGE_CACHE_SIZE);
+
+       desc.tfm   = tfm;
+       desc.info  = NULL;
+       desc.flags = 0;
+       rc = ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16);
+       if (rc) {
+               CERROR("failed to decrypt for aes\n");
+               GOTO(out, rc);
+       }
+
+       EXIT;
+
+out:
+       ll_crypto_free_blkcipher(tfm);
+       return rc;
+}
+EXPORT_SYMBOL(capa_decrypt_id);
+
+void capa_cpy(void *capa, struct obd_capa *ocapa)
+{
+       spin_lock(&ocapa->c_lock);
+       *(struct lustre_capa *)capa = ocapa->c_capa;
+       spin_unlock(&ocapa->c_lock);
+}
+EXPORT_SYMBOL(capa_cpy);
+
+void _debug_capa(struct lustre_capa *c,
+                struct libcfs_debug_msg_data *msgdata,
+                const char *fmt, ... )
+{
+       va_list args;
+       va_start(args, fmt);
+       libcfs_debug_vmsg2(msgdata, fmt, args,
+                          " capability@%p fid "DFID" opc "LPX64" uid "LPU64
+                          " gid "LPU64" flags %u alg %d keyid %u timeout %u "
+                          "expiry %u\n", c, PFID(capa_fid(c)), capa_opc(c),
+                          capa_uid(c), capa_gid(c), capa_flags(c),
+                          capa_alg(c), capa_keyid(c), capa_timeout(c),
+                          capa_expiry(c));
+       va_end(args);
+}
+EXPORT_SYMBOL(_debug_capa);
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
new file mode 100644 (file)
index 0000000..7eb0ad7
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal cl interfaces.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+#ifndef _CL_INTERNAL_H
+#define _CL_INTERNAL_H
+
+#define CLT_PVEC_SIZE (14)
+
+/**
+ * Possible levels of the nesting. Currently this is 2: there are "top"
+ * entities (files, extent locks), and "sub" entities (stripes and stripe
+ * locks). This is used only for debugging counters right now.
+ */
+enum clt_nesting_level {
+       CNL_TOP,
+       CNL_SUB,
+       CNL_NR
+};
+
+/**
+ * Counters used to check correctness of cl_lock interface usage.
+ */
+struct cl_thread_counters {
+       /**
+        * Number of outstanding calls to cl_lock_mutex_get() made by the
+        * current thread. For debugging.
+        */
+       int        ctc_nr_locks_locked;
+       /** List of locked locks. */
+       struct lu_ref ctc_locks_locked;
+       /** Number of outstanding holds on locks. */
+       int        ctc_nr_held;
+       /** Number of outstanding uses on locks. */
+       int        ctc_nr_used;
+       /** Number of held extent locks. */
+       int        ctc_nr_locks_acquired;
+};
+
+/**
+ * Thread local state internal for generic cl-code.
+ */
+struct cl_thread_info {
+       /*
+        * Common fields.
+        */
+       struct cl_io     clt_io;
+       struct cl_2queue     clt_queue;
+
+       /*
+        * Fields used by cl_lock.c
+        */
+       struct cl_lock_descr clt_descr;
+       struct cl_page_list  clt_list;
+       /**
+        * Counters for every level of lock nesting.
+        */
+       struct cl_thread_counters clt_counters[CNL_NR];
+       /** @} debugging */
+
+       /*
+        * Fields used by cl_page.c
+        */
+       struct cl_page      *clt_pvec[CLT_PVEC_SIZE];
+
+       /*
+        * Fields used by cl_io.c
+        */
+       /**
+        * Pointer to the topmost ongoing IO in this thread.
+        */
+       struct cl_io    *clt_current_io;
+       /**
+        * Used for submitting a sync io.
+        */
+       struct cl_sync_io    clt_anchor;
+       /**
+        * Fields used by cl_lock_discard_pages().
+        */
+       pgoff_t       clt_next_index;
+       pgoff_t       clt_fn_index; /* first non-overlapped index */
+};
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env);
+
+#endif /* _CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c
new file mode 100644 (file)
index 0000000..75c9be8
--- /dev/null
@@ -0,0 +1,1753 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client IO.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/*****************************************************************************
+ *
+ * cl_io interface.
+ *
+ */
+
+#define cl_io_for_each(slice, io) \
+       list_for_each_entry((slice), &io->ci_layers, cis_linkage)
+#define cl_io_for_each_reverse(slice, io)               \
+       list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage)
+
+static inline int cl_io_type_is_valid(enum cl_io_type type)
+{
+       return CIT_READ <= type && type < CIT_OP_NR;
+}
+
+static inline int cl_io_is_loopable(const struct cl_io *io)
+{
+       return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC;
+}
+
+/**
+ * Returns true iff there is an IO ongoing in the given environment.
+ */
+int cl_io_is_going(const struct lu_env *env)
+{
+       return cl_env_info(env)->clt_current_io != NULL;
+}
+EXPORT_SYMBOL(cl_io_is_going);
+
+/**
+ * cl_io invariant that holds at all times when exported cl_io_*() functions
+ * are entered and left.
+ */
+static int cl_io_invariant(const struct cl_io *io)
+{
+       struct cl_io *up;
+
+       up = io->ci_parent;
+       return
+               /*
+                * io can own pages only when it is ongoing. Sub-io might
+                * still be in CIS_LOCKED state when top-io is in
+                * CIS_IO_GOING.
+                */
+               ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING ||
+                    (io->ci_state == CIS_LOCKED && up != NULL));
+}
+
+/**
+ * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top.
+ */
+void cl_io_fini(const struct lu_env *env, struct cl_io *io)
+{
+       struct cl_io_slice    *slice;
+       struct cl_thread_info *info;
+
+       LINVRNT(cl_io_type_is_valid(io->ci_type));
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       while (!list_empty(&io->ci_layers)) {
+               slice = container_of(io->ci_layers.prev, struct cl_io_slice,
+                                    cis_linkage);
+               list_del_init(&slice->cis_linkage);
+               if (slice->cis_iop->op[io->ci_type].cio_fini != NULL)
+                       slice->cis_iop->op[io->ci_type].cio_fini(env, slice);
+               /*
+                * Invalidate slice to catch use after free. This assumes that
+                * slices are allocated within session and can be touched
+                * after ->cio_fini() returns.
+                */
+               slice->cis_io = NULL;
+       }
+       io->ci_state = CIS_FINI;
+       info = cl_env_info(env);
+       if (info->clt_current_io == io)
+               info->clt_current_io = NULL;
+
+       /* sanity check for layout change */
+       switch(io->ci_type) {
+       case CIT_READ:
+       case CIT_WRITE:
+               break;
+       case CIT_FAULT:
+       case CIT_FSYNC:
+               LASSERT(!io->ci_need_restart);
+               break;
+       case CIT_SETATTR:
+       case CIT_MISC:
+               /* Check ignore layout change conf */
+               LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout,
+                               !io->ci_need_restart));
+               break;
+       default:
+               LBUG();
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_fini);
+
+static int cl_io_init0(const struct lu_env *env, struct cl_io *io,
+                      enum cl_io_type iot, struct cl_object *obj)
+{
+       struct cl_object *scan;
+       int result;
+
+       LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI);
+       LINVRNT(cl_io_type_is_valid(iot));
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       io->ci_type = iot;
+       INIT_LIST_HEAD(&io->ci_lockset.cls_todo);
+       INIT_LIST_HEAD(&io->ci_lockset.cls_curr);
+       INIT_LIST_HEAD(&io->ci_lockset.cls_done);
+       INIT_LIST_HEAD(&io->ci_layers);
+
+       result = 0;
+       cl_object_for_each(scan, obj) {
+               if (scan->co_ops->coo_io_init != NULL) {
+                       result = scan->co_ops->coo_io_init(env, scan, io);
+                       if (result != 0)
+                               break;
+               }
+       }
+       if (result == 0)
+               io->ci_state = CIS_INIT;
+       RETURN(result);
+}
+
+/**
+ * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom.
+ *
+ * \pre obj != cl_object_top(obj)
+ */
+int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
+                  enum cl_io_type iot, struct cl_object *obj)
+{
+       struct cl_thread_info *info = cl_env_info(env);
+
+       LASSERT(obj != cl_object_top(obj));
+       if (info->clt_current_io == NULL)
+               info->clt_current_io = io;
+       return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_sub_init);
+
+/**
+ * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom.
+ *
+ * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter
+ * what the latter returned.
+ *
+ * \pre obj == cl_object_top(obj)
+ * \pre cl_io_type_is_valid(iot)
+ * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot
+ */
+int cl_io_init(const struct lu_env *env, struct cl_io *io,
+              enum cl_io_type iot, struct cl_object *obj)
+{
+       struct cl_thread_info *info = cl_env_info(env);
+
+       LASSERT(obj == cl_object_top(obj));
+       LASSERT(info->clt_current_io == NULL);
+
+       info->clt_current_io = io;
+       return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_init);
+
+/**
+ * Initialize read or write io.
+ *
+ * \pre iot == CIT_READ || iot == CIT_WRITE
+ */
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+                 enum cl_io_type iot, loff_t pos, size_t count)
+{
+       LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
+       LINVRNT(io->ci_obj != NULL);
+       ENTRY;
+
+       LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
+                        "io range: %u ["LPU64", "LPU64") %u %u\n",
+                        iot, (__u64)pos, (__u64)pos + count,
+                        io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append);
+       io->u.ci_rw.crw_pos    = pos;
+       io->u.ci_rw.crw_count  = count;
+       RETURN(cl_io_init(env, io, iot, io->ci_obj));
+}
+EXPORT_SYMBOL(cl_io_rw_init);
+
+static inline const struct lu_fid *
+cl_lock_descr_fid(const struct cl_lock_descr *descr)
+{
+       return lu_object_fid(&descr->cld_obj->co_lu);
+}
+
+static int cl_lock_descr_sort(const struct cl_lock_descr *d0,
+                             const struct cl_lock_descr *d1)
+{
+       return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?:
+               __diff_normalize(d0->cld_start, d1->cld_start);
+}
+
+static int cl_lock_descr_cmp(const struct cl_lock_descr *d0,
+                            const struct cl_lock_descr *d1)
+{
+       int ret;
+
+       ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+       if (ret)
+               return ret;
+       if (d0->cld_end < d1->cld_start)
+               return -1;
+       if (d0->cld_start > d1->cld_end)
+               return 1;
+       return 0;
+}
+
+static void cl_lock_descr_merge(struct cl_lock_descr *d0,
+                               const struct cl_lock_descr *d1)
+{
+       d0->cld_start = min(d0->cld_start, d1->cld_start);
+       d0->cld_end = max(d0->cld_end, d1->cld_end);
+
+       if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE)
+               d0->cld_mode = CLM_WRITE;
+
+       if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP)
+               d0->cld_mode = CLM_GROUP;
+}
+
+/*
+ * Sort locks in lexicographical order of their (fid, start-offset) pairs.
+ */
+static void cl_io_locks_sort(struct cl_io *io)
+{
+       int done = 0;
+
+       ENTRY;
+       /* hidden treasure: bubble sort for now. */
+       do {
+               struct cl_io_lock_link *curr;
+               struct cl_io_lock_link *prev;
+               struct cl_io_lock_link *temp;
+
+               done = 1;
+               prev = NULL;
+
+               list_for_each_entry_safe(curr, temp,
+                                            &io->ci_lockset.cls_todo,
+                                            cill_linkage) {
+                       if (prev != NULL) {
+                               switch (cl_lock_descr_sort(&prev->cill_descr,
+                                                         &curr->cill_descr)) {
+                               case 0:
+                                       /*
+                                        * IMPOSSIBLE: Identical locks are
+                                        *           already removed at
+                                        *           this point.
+                                        */
+                               default:
+                                       LBUG();
+                               case +1:
+                                       list_move_tail(&curr->cill_linkage,
+                                                          &prev->cill_linkage);
+                                       done = 0;
+                                       continue; /* don't change prev: it's
+                                                  * still "previous" */
+                               case -1: /* already in order */
+                                       break;
+                               }
+                       }
+                       prev = curr;
+               }
+       } while (!done);
+       EXIT;
+}
+
+/**
+ * Check whether \a queue contains locks matching \a need.
+ *
+ * \retval +ve there is a matching lock in the \a queue
+ * \retval   0 there are no matching locks in the \a queue
+ */
+int cl_queue_match(const struct list_head *queue,
+                  const struct cl_lock_descr *need)
+{
+       struct cl_io_lock_link *scan;
+
+       ENTRY;
+       list_for_each_entry(scan, queue, cill_linkage) {
+              if (cl_lock_descr_match(&scan->cill_descr, need))
+                      RETURN(+1);
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(cl_queue_match);
+
+static int cl_queue_merge(const struct list_head *queue,
+                         const struct cl_lock_descr *need)
+{
+       struct cl_io_lock_link *scan;
+
+       ENTRY;
+       list_for_each_entry(scan, queue, cill_linkage) {
+              if (cl_lock_descr_cmp(&scan->cill_descr, need))
+                      continue;
+              cl_lock_descr_merge(&scan->cill_descr, need);
+              CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+                     scan->cill_descr.cld_mode, scan->cill_descr.cld_start,
+                     scan->cill_descr.cld_end);
+              RETURN(+1);
+       }
+       RETURN(0);
+
+}
+
+static int cl_lockset_match(const struct cl_lockset *set,
+                           const struct cl_lock_descr *need)
+{
+       return cl_queue_match(&set->cls_curr, need) ||
+              cl_queue_match(&set->cls_done, need);
+}
+
+static int cl_lockset_merge(const struct cl_lockset *set,
+                           const struct cl_lock_descr *need)
+{
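+       /* try to fold the new descriptor into a not-yet-acquired (todo) lock
+        * first; failing that, check whether an equivalent lock is already
+        * in the current or done queues */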
+       return cl_queue_merge(&set->cls_todo, need) ||
+              cl_lockset_match(set, need);
+}
+
+static int cl_lockset_lock_one(const struct lu_env *env,
+                              struct cl_io *io, struct cl_lockset *set,
+                              struct cl_io_lock_link *link)
+{
+       struct cl_lock *lock;
+       int          result;
+
+       ENTRY;
+
+       lock = cl_lock_request(env, io, &link->cill_descr, "io", io);
+
+       if (!IS_ERR(lock)) {
+               link->cill_lock = lock;
+               list_move(&link->cill_linkage, &set->cls_curr);
+               if (!(link->cill_descr.cld_enq_flags & CEF_ASYNC)) {
+                       result = cl_wait(env, lock);
+                       if (result == 0)
+                               list_move(&link->cill_linkage,
+                                             &set->cls_done);
+               } else
+                       result = 0;
+       } else
+               result = PTR_ERR(lock);
+       RETURN(result);
+}
+
+static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io,
+                             struct cl_io_lock_link *link)
+{
+       struct cl_lock *lock = link->cill_lock;
+
+       ENTRY;
+       list_del_init(&link->cill_linkage);
+       if (lock != NULL) {
+               cl_lock_release(env, lock, "io", io);
+               link->cill_lock = NULL;
+       }
+       if (link->cill_fini != NULL)
+               link->cill_fini(env, link);
+       EXIT;
+}
+
+static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
+                          struct cl_lockset *set)
+{
+       struct cl_io_lock_link *link;
+       struct cl_io_lock_link *temp;
+       struct cl_lock   *lock;
+       int result;
+
+       ENTRY;
+       result = 0;
+       list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
+               if (!cl_lockset_match(set, &link->cill_descr)) {
+                       /* XXX some locking to guarantee that locks aren't
+                        * expanded in between. */
+                       result = cl_lockset_lock_one(env, io, set, link);
+                       if (result != 0)
+                               break;
+               } else
+                       cl_lock_link_fini(env, io, link);
+       }
+       if (result == 0) {
+               list_for_each_entry_safe(link, temp,
+                                            &set->cls_curr, cill_linkage) {
+                       lock = link->cill_lock;
+                       result = cl_wait(env, lock);
+                       if (result == 0)
+                               list_move(&link->cill_linkage,
+                                             &set->cls_done);
+                       else
+                               break;
+               }
+       }
+       RETURN(result);
+}
+
+/**
+ * Takes locks necessary for the current iteration of io.
+ *
+ * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required
+ * by layers for the current iteration. Then sort locks (to avoid dead-locks),
+ * and acquire them.
+ */
+int cl_io_lock(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_IT_STARTED);
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_lock == NULL)
+                       continue;
+               result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan);
+               if (result != 0)
+                       break;
+       }
+       if (result == 0) {
+               cl_io_locks_sort(io);
+               result = cl_lockset_lock(env, io, &io->ci_lockset);
+       }
+       if (result != 0)
+               cl_io_unlock(env, io);
+       else
+               io->ci_state = CIS_LOCKED;
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock);
+
+/**
+ * Releases locks taken by io.
+ */
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io)
+{
+       struct cl_lockset       *set;
+       struct cl_io_lock_link   *link;
+       struct cl_io_lock_link   *temp;
+       const struct cl_io_slice *scan;
+
+       LASSERT(cl_io_is_loopable(io));
+       LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED);
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+       set = &io->ci_lockset;
+
+       list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage)
+               cl_lock_link_fini(env, io, link);
+
+       list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage)
+               cl_lock_link_fini(env, io, link);
+
+       list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) {
+               cl_unuse(env, link->cill_lock);
+               cl_lock_link_fini(env, io, link);
+       }
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL)
+                       scan->cis_iop->op[io->ci_type].cio_unlock(env, scan);
+       }
+       io->ci_state = CIS_UNLOCKED;
+       LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_unlock);
+
+/**
+ * Prepares next iteration of io.
+ *
+ * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give
+ * layers a chance to modify io parameters, e.g., so that lov can restrict io
+ * to a single stripe.
+ */
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+       int result;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED);
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+       result = 0;
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL)
+                       continue;
+               result = scan->cis_iop->op[io->ci_type].cio_iter_init(env,
+                                                                     scan);
+               if (result != 0)
+                       break;
+       }
+       if (result == 0)
+               io->ci_state = CIS_IT_STARTED;
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_iter_init);
+
+/**
+ * Finalizes io iteration.
+ *
+ * Calls cl_io_operations::cio_iter_fini() bottom-to-top.
+ */
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_UNLOCKED);
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL)
+                       scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan);
+       }
+       io->ci_state = CIS_IT_ENDED;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_iter_fini);
+
+/**
+ * Records that read or write io progressed \a nob bytes forward.
+ */
+void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob)
+{
+       const struct cl_io_slice *scan;
+
+       LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+               nob == 0);
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+
+       io->u.ci_rw.crw_pos   += nob;
+       io->u.ci_rw.crw_count -= nob;
+
+       /* layers have to be notified. */
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_advance != NULL)
+                       scan->cis_iop->op[io->ci_type].cio_advance(env, scan,
+                                                                  nob);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_rw_advance);
+
+/**
+ * Adds a lock to a lockset.
+ */
+int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
+                  struct cl_io_lock_link *link)
+{
+       int result;
+
+       ENTRY;
+       if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr))
+               result = +1;
+       else {
+               list_add(&link->cill_linkage, &io->ci_lockset.cls_todo);
+               result = 0;
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock_add);
+
+static void cl_free_io_lock_link(const struct lu_env *env,
+                                struct cl_io_lock_link *link)
+{
+       OBD_FREE_PTR(link);
+}
+
+/**
+ * Allocates new lock link, and uses it to add a lock to a lockset.
+ */
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+                        struct cl_lock_descr *descr)
+{
+       struct cl_io_lock_link *link;
+       int result;
+
+       ENTRY;
+       OBD_ALLOC_PTR(link);
+       if (link != NULL) {
+               link->cill_descr     = *descr;
+               link->cill_fini      = cl_free_io_lock_link;
+               result = cl_io_lock_add(env, io, link);
+               if (result) /* lock match */
+                       link->cill_fini(env, link);
+       } else
+               result = -ENOMEM;
+
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock_alloc_add);
+
+/**
+ * Starts io by calling cl_io_operations::cio_start() top-to-bottom.
+ */
+int cl_io_start(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_LOCKED);
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       io->ci_state = CIS_IO_GOING;
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_start == NULL)
+                       continue;
+               result = scan->cis_iop->op[io->ci_type].cio_start(env, scan);
+               if (result != 0)
+                       break;
+       }
+       if (result >= 0)
+               result = 0;
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_start);
+
+/**
+ * Wait until current io iteration is finished by calling
+ * cl_io_operations::cio_end() bottom-to-top.
+ */
+void cl_io_end(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_IO_GOING);
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_end != NULL)
+                       scan->cis_iop->op[io->ci_type].cio_end(env, scan);
+               /* TODO: error handling. */
+       }
+       io->ci_state = CIS_IO_FINISHED;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_end);
+
+static const struct cl_page_slice *
+cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page)
+{
+       const struct cl_page_slice *slice;
+
+       slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type);
+       LINVRNT(slice != NULL);
+       return slice;
+}
+
+/**
+ * True iff \a page is within \a io range.
+ */
+static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io)
+{
+       int     result = 1;
+       loff_t  start;
+       loff_t  end;
+       pgoff_t idx;
+
+       idx = page->cp_index;
+       switch (io->ci_type) {
+       case CIT_READ:
+       case CIT_WRITE:
+               /*
+                * check that [start, end) and [pos, pos + count) extents
+                * overlap.
+                */
+               if (!cl_io_is_append(io)) {
+                       const struct cl_io_rw_common *crw = &(io->u.ci_rw);
+                       start = cl_offset(page->cp_obj, idx);
+                       end   = cl_offset(page->cp_obj, idx + 1);
+                       result = crw->crw_pos < end &&
+                                start < crw->crw_pos + crw->crw_count;
+               }
+               break;
+       case CIT_FAULT:
+               result = io->u.ci_fault.ft_index == idx;
+               break;
+       default:
+               LBUG();
+       }
+       return result;
+}
+
+/**
+ * Called by read io, when page has to be read from the server.
+ *
+ * \see cl_io_operations::cio_read_page()
+ */
+int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
+                   struct cl_page *page)
+{
+       const struct cl_io_slice *scan;
+       struct cl_2queue         *queue;
+       int                    result = 0;
+
+       LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT);
+       LINVRNT(cl_page_is_owned(page, io));
+       LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+       LINVRNT(cl_page_in_io(page, io));
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       queue = &io->ci_queue;
+
+       cl_2queue_init(queue);
+       /*
+        * ->cio_read_page() methods called in the loop below are supposed to
+        * never block waiting for network (the only subtle point is the
+        * creation of new pages for read-ahead that might result in cache
+        * shrinking, but currently only clean pages are shrunk and this
+        * requires no network io).
+        *
+        * Should this ever start blocking, a retry loop would be needed for
+        * "parallel io" (see CLO_REPEAT loops in cl_lock.c).
+        */
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->cio_read_page != NULL) {
+                       const struct cl_page_slice *slice;
+
+                       slice = cl_io_slice_page(scan, page);
+                       LINVRNT(slice != NULL);
+                       result = scan->cis_iop->cio_read_page(env, scan, slice);
+                       if (result != 0)
+                               break;
+               }
+       }
+       if (result == 0)
+               result = cl_io_submit_rw(env, io, CRT_READ, queue);
+       /*
+        * Unlock unsent pages in case of error.
+        */
+       cl_page_list_disown(env, io, &queue->c2_qin);
+       cl_2queue_fini(env, queue);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_read_page);
+
+/**
+ * Called by write io to prepare page to receive data from user buffer.
+ *
+ * \see cl_io_operations::cio_prepare_write()
+ */
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+                       struct cl_page *page, unsigned from, unsigned to)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(io->ci_type == CIT_WRITE);
+       LINVRNT(cl_page_is_owned(page, io));
+       LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+       LINVRNT(cl_io_invariant(io));
+       LASSERT(cl_page_in_io(page, io));
+       ENTRY;
+
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->cio_prepare_write != NULL) {
+                       const struct cl_page_slice *slice;
+
+                       slice = cl_io_slice_page(scan, page);
+                       result = scan->cis_iop->cio_prepare_write(env, scan,
+                                                                 slice,
+                                                                 from, to);
+                       if (result != 0)
+                               break;
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_prepare_write);
+
+/**
+ * Called by write io after user data were copied into a page.
+ *
+ * \see cl_io_operations::cio_commit_write()
+ */
+int cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
+                      struct cl_page *page, unsigned from, unsigned to)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(io->ci_type == CIT_WRITE);
+       LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+       LINVRNT(cl_io_invariant(io));
+       /*
+        * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov)
+        * already called cl_page_cache_add(), moving page into CPS_CACHED
+        * state. A better (and more general) way of dealing with such a
+        * situation is needed.
+        */
+       LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL);
+       LASSERT(cl_page_in_io(page, io));
+       ENTRY;
+
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->cio_commit_write != NULL) {
+                       const struct cl_page_slice *slice;
+
+                       slice = cl_io_slice_page(scan, page);
+                       result = scan->cis_iop->cio_commit_write(env, scan,
+                                                                slice,
+                                                                from, to);
+                       if (result != 0)
+                               break;
+               }
+       }
+       LINVRNT(result <= 0);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_commit_write);
+
+/**
+ * Submits a list of pages for immediate io.
+ *
+ * After the function returns, the submitted pages are moved to the
+ * queue->c2_qout queue, while queue->c2_qin contains both the pages that
+ * did not need to be submitted and the pages that failed to be submitted.
+ *
+ * \returns 0 if at least one page was submitted, error code otherwise.
+ * \see cl_io_operations::cio_submit()
+ */
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+                   enum cl_req_type crt, struct cl_2queue *queue)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op));
+       ENTRY;
+
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->req_op[crt].cio_submit == NULL)
+                       continue;
+               result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt,
+                                                              queue);
+               if (result != 0)
+                       break;
+       }
+       /*
+        * If ->cio_submit() failed, no pages were sent.
+        */
+       LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages)));
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_submit_rw);
+
+/**
+ * Submit a sync_io and wait until the IO finishes or an error happens.
+ * If \a timeout is zero, wait for the IO unconditionally.
+ */
+int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
+                     enum cl_req_type iot, struct cl_2queue *queue,
+                     long timeout)
+{
+       struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
+       struct cl_page *pg;
+       int rc;
+
+       cl_page_list_for_each(pg, &queue->c2_qin) {
+               LASSERT(pg->cp_sync_io == NULL);
+               pg->cp_sync_io = anchor;
+       }
+
+       cl_sync_io_init(anchor, queue->c2_qin.pl_nr);
+       rc = cl_io_submit_rw(env, io, iot, queue);
+       if (rc == 0) {
+               /*
+                * If some pages weren't sent for any reason (e.g.,
+                * read found up-to-date pages in the cache, or write found
+                * clean pages), count them as completed to avoid infinite
+                * wait.
+                */
+                cl_page_list_for_each(pg, &queue->c2_qin) {
+                       pg->cp_sync_io = NULL;
+                       cl_sync_io_note(anchor, +1);
+                }
+
+                /* wait for the IO to be finished. */
+                rc = cl_sync_io_wait(env, io, &queue->c2_qout,
+                                     anchor, timeout);
+       } else {
+               LASSERT(list_empty(&queue->c2_qout.pl_pages));
+               cl_page_list_for_each(pg, &queue->c2_qin)
+                       pg->cp_sync_io = NULL;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(cl_io_submit_sync);
+
+/**
+ * Cancel an IO which has been submitted by cl_io_submit_rw.
+ */
+int cl_io_cancel(const struct lu_env *env, struct cl_io *io,
+                struct cl_page_list *queue)
+{
+       struct cl_page *page;
+       int result = 0;
+
+       CERROR("Canceling ongoing page transmission\n");
+       cl_page_list_for_each(page, queue) {
+               int rc;
+
+               LINVRNT(cl_page_in_io(page, io));
+               rc = cl_page_cancel(env, page);
+               result = result ?: rc;
+       }
+       return result;
+}
+EXPORT_SYMBOL(cl_io_cancel);
+
+/**
+ * Main io loop.
+ *
+ * Pumps io through iterations calling
+ *
+ *    - cl_io_iter_init()
+ *
+ *    - cl_io_lock()
+ *
+ *    - cl_io_start()
+ *
+ *    - cl_io_end()
+ *
+ *    - cl_io_unlock()
+ *
+ *    - cl_io_iter_fini()
+ *
+ * repeatedly until there is no more io to do.
+ */
+int cl_io_loop(const struct lu_env *env, struct cl_io *io)
+{
+       int result   = 0;
+
+       LINVRNT(cl_io_is_loopable(io));
+       ENTRY;
+
+       do {
+               size_t nob;
+
+               io->ci_continue = 0;
+               result = cl_io_iter_init(env, io);
+               if (result == 0) {
+                       nob    = io->ci_nob;
+                       result = cl_io_lock(env, io);
+                       if (result == 0) {
+                               /*
+                                * Notify layers that locks have been taken,
+                                * and do actual i/o.
+                                *
+                                *   - llite: kms, short read;
+                                *   - llite: generic_file_read();
+                                */
+                               result = cl_io_start(env, io);
+                               /*
+                                * Send any remaining pending
+                                * io, etc.
+                                *
+                                *   - llite: ll_rw_stats_tally.
+                                */
+                               cl_io_end(env, io);
+                               cl_io_unlock(env, io);
+                               cl_io_rw_advance(env, io, io->ci_nob - nob);
+                       }
+               }
+               cl_io_iter_fini(env, io);
+       } while (result == 0 && io->ci_continue);
+       if (result == 0)
+               result = io->ci_result;
+       RETURN(result < 0 ? result : 0);
+}
+EXPORT_SYMBOL(cl_io_loop);
+
+/**
+ * Adds io slice to the cl_io.
+ *
+ * This is called by cl_object_operations::coo_io_init() methods to add a
+ * per-layer state to the io. New state is added at the end of
+ * cl_io::ci_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add()
+ */
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+                    struct cl_object *obj,
+                    const struct cl_io_operations *ops)
+{
+       struct list_head *linkage = &slice->cis_linkage;
+
+       LASSERT((linkage->prev == NULL && linkage->next == NULL) ||
+               list_empty(linkage));
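+       /* accept either a zero-filled linkage or an initialized empty list */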
+       ENTRY;
+
+       list_add_tail(linkage, &io->ci_layers);
+       slice->cis_io  = io;
+       slice->cis_obj = obj;
+       slice->cis_iop = ops;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_slice_add);
+
+
+/**
+ * Initializes page list.
+ */
+void cl_page_list_init(struct cl_page_list *plist)
+{
+       ENTRY;
+       plist->pl_nr = 0;
+       INIT_LIST_HEAD(&plist->pl_pages);
+       plist->pl_owner = current;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_init);
+
+/**
+ * Adds a page to a page list.
+ */
+void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page)
+{
+       ENTRY;
+       /* it would be better to check that page is owned by "current" io, but
+        * it is not passed here. */
+       LASSERT(page->cp_owner != NULL);
+       LINVRNT(plist->pl_owner == current);
+
+       lockdep_off();
+       mutex_lock(&page->cp_mutex);
+       lockdep_on();
+       LASSERT(list_empty(&page->cp_batch));
+       list_add_tail(&page->cp_batch, &plist->pl_pages);
+       ++plist->pl_nr;
+       page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist);
+       cl_page_get(page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_add);
+
+/**
+ * Removes a page from a page list.
+ */
+void cl_page_list_del(const struct lu_env *env,
+                     struct cl_page_list *plist, struct cl_page *page)
+{
+       LASSERT(plist->pl_nr > 0);
+       LINVRNT(plist->pl_owner == current);
+
+       ENTRY;
+       list_del_init(&page->cp_batch);
+       lockdep_off();
+       mutex_unlock(&page->cp_mutex);
+       lockdep_on();
+       --plist->pl_nr;
+       lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist);
+       cl_page_put(env, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_del);
+
+/**
+ * Moves a page from one page list to another.
+ */
+void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
+                      struct cl_page *page)
+{
+       LASSERT(src->pl_nr > 0);
+       LINVRNT(dst->pl_owner == current);
+       LINVRNT(src->pl_owner == current);
+
+       ENTRY;
+       list_move_tail(&page->cp_batch, &dst->pl_pages);
+       --src->pl_nr;
+       ++dst->pl_nr;
+       lu_ref_set_at(&page->cp_reference,
+                     page->cp_queue_ref, "queue", src, dst);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_move);
+
+/**
+ * Splices one cl_page_list into another, just as list_splice() does for
+ * list heads.
+ */
+void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head)
+{
+       struct cl_page *page;
+       struct cl_page *tmp;
+
+       LINVRNT(list->pl_owner == current);
+       LINVRNT(head->pl_owner == current);
+
+       ENTRY;
+       cl_page_list_for_each_safe(page, tmp, list)
+               cl_page_list_move(head, list, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_splice);
+
+void cl_page_disown0(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg);
+
+/**
+ * Disowns pages in a queue.
+ */
+void cl_page_list_disown(const struct lu_env *env,
+                        struct cl_io *io, struct cl_page_list *plist)
+{
+       struct cl_page *page;
+       struct cl_page *temp;
+
+       LINVRNT(plist->pl_owner == current);
+
+       ENTRY;
+       cl_page_list_for_each_safe(page, temp, plist) {
+               LASSERT(plist->pl_nr > 0);
+
+               list_del_init(&page->cp_batch);
+               lockdep_off();
+               mutex_unlock(&page->cp_mutex);
+               lockdep_on();
+               --plist->pl_nr;
+               /*
+                * cl_page_disown0 rather than usual cl_page_disown() is used,
+                * because pages are possibly in CPS_FREEING state already due
+                * to the call to cl_page_list_discard().
+                */
+               /*
+                * XXX cl_page_disown0() will fail if page is not locked.
+                */
+               cl_page_disown0(env, io, page);
+               lu_ref_del(&page->cp_reference, "queue", plist);
+               cl_page_put(env, page);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_disown);
+
+/**
+ * Releases pages from queue.
+ */
+void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist)
+{
+       struct cl_page *page;
+       struct cl_page *temp;
+
+       LINVRNT(plist->pl_owner == current);
+
+       ENTRY;
+       cl_page_list_for_each_safe(page, temp, plist)
+               cl_page_list_del(env, plist, page);
+       LASSERT(plist->pl_nr == 0);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_fini);
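
As an illustration of the cl_page_list helpers above, a hypothetical caller could batch pages like this (sketch only; "pages" and "npages" are assumed inputs, and every page is assumed to be owned by the current io, as cl_page_list_add() requires):

static void example_queue_pages(const struct lu_env *env, struct cl_io *io,
                                struct cl_page **pages, int npages)
{
        struct cl_page_list plist;
        int i;

        cl_page_list_init(&plist);
        for (i = 0; i < npages; i++)
                cl_page_list_add(&plist, pages[i]);

        /* ... submit or otherwise process the queue here ... */

        cl_page_list_disown(env, io, &plist);   /* drop queue references */
        cl_page_list_fini(env, &plist);         /* release anything left */
}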
+
+/**
+ * Owns all pages in a queue.
+ */
+int cl_page_list_own(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page_list *plist)
+{
+       struct cl_page *page;
+       struct cl_page *temp;
+       pgoff_t index = 0;
+       int result;
+
+       LINVRNT(plist->pl_owner == current);
+
+       ENTRY;
+       result = 0;
+       cl_page_list_for_each_safe(page, temp, plist) {
+               LASSERT(index <= page->cp_index);
+               index = page->cp_index;
+               if (cl_page_own(env, io, page) == 0)
+                       result = result ?: page->cp_error;
+               else
+                       cl_page_list_del(env, plist, page);
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_list_own);
+
+/**
+ * Assumes all pages in a queue.
+ */
+void cl_page_list_assume(const struct lu_env *env,
+                        struct cl_io *io, struct cl_page_list *plist)
+{
+       struct cl_page *page;
+
+       LINVRNT(plist->pl_owner == current);
+
+       cl_page_list_for_each(page, plist)
+               cl_page_assume(env, io, page);
+}
+EXPORT_SYMBOL(cl_page_list_assume);
+
+/**
+ * Discards all pages in a queue.
+ */
+void cl_page_list_discard(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page_list *plist)
+{
+       struct cl_page *page;
+
+       LINVRNT(plist->pl_owner == current);
+       ENTRY;
+       cl_page_list_for_each(page, plist)
+               cl_page_discard(env, io, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_discard);
+
+/**
+ * Unmaps all pages in a queue from user virtual memory.
+ */
+int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io,
+                       struct cl_page_list *plist)
+{
+       struct cl_page *page;
+       int result;
+
+       LINVRNT(plist->pl_owner == current);
+       ENTRY;
+       result = 0;
+       cl_page_list_for_each(page, plist) {
+               result = cl_page_unmap(env, io, page);
+               if (result != 0)
+                       break;
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_list_unmap);
+
+/**
+ * Initialize dual page queue.
+ */
+void cl_2queue_init(struct cl_2queue *queue)
+{
+       ENTRY;
+       cl_page_list_init(&queue->c2_qin);
+       cl_page_list_init(&queue->c2_qout);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init);
+
+/**
+ * Add a page to the incoming page list of 2-queue.
+ */
+void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page)
+{
+       ENTRY;
+       cl_page_list_add(&queue->c2_qin, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_add);
+
+/**
+ * Disown pages in both lists of a 2-queue.
+ */
+void cl_2queue_disown(const struct lu_env *env,
+                     struct cl_io *io, struct cl_2queue *queue)
+{
+       ENTRY;
+       cl_page_list_disown(env, io, &queue->c2_qin);
+       cl_page_list_disown(env, io, &queue->c2_qout);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_disown);
+
+/**
+ * Discard (truncate) pages in both lists of a 2-queue.
+ */
+void cl_2queue_discard(const struct lu_env *env,
+                      struct cl_io *io, struct cl_2queue *queue)
+{
+       ENTRY;
+       cl_page_list_discard(env, io, &queue->c2_qin);
+       cl_page_list_discard(env, io, &queue->c2_qout);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_discard);
+
+/**
+ * Assumes ownership of the pages in both lists of a 2-queue.
+ */
+void cl_2queue_assume(const struct lu_env *env,
+                     struct cl_io *io, struct cl_2queue *queue)
+{
+       cl_page_list_assume(env, io, &queue->c2_qin);
+       cl_page_list_assume(env, io, &queue->c2_qout);
+}
+EXPORT_SYMBOL(cl_2queue_assume);
+
+/**
+ * Finalize both page lists of a 2-queue.
+ */
+void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue)
+{
+       ENTRY;
+       cl_page_list_fini(env, &queue->c2_qout);
+       cl_page_list_fini(env, &queue->c2_qin);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_fini);
+
+/**
+ * Initialize a 2-queue to contain \a page in its incoming page list.
+ */
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page)
+{
+       ENTRY;
+       cl_2queue_init(queue);
+       cl_2queue_add(queue, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init_page);
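
A hypothetical single-page sketch using the 2-queue helpers above (the transfer itself, during which layers move completed pages from c2_qin to c2_qout, is elided):

static void example_2queue_page(const struct lu_env *env, struct cl_io *io,
                                struct cl_page *page)
{
        struct cl_2queue queue;

        cl_2queue_init_page(&queue, page);      /* page lands in c2_qin */

        /* ... submit; completed pages end up in c2_qout ... */

        cl_2queue_disown(env, io, &queue);
        cl_2queue_fini(env, &queue);
}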
+
+/**
+ * Returns top-level io.
+ *
+ * \see cl_object_top(), cl_page_top().
+ */
+struct cl_io *cl_io_top(struct cl_io *io)
+{
+       ENTRY;
+       while (io->ci_parent != NULL)
+               io = io->ci_parent;
+       RETURN(io);
+}
+EXPORT_SYMBOL(cl_io_top);
+
+/**
+ * Prints a human-readable representation of \a io via \a printer.
+ * Currently a no-op.
+ */
+void cl_io_print(const struct lu_env *env, void *cookie,
+                lu_printer_t printer, const struct cl_io *io)
+{
+}
+
+/**
+ * Adds request slice to the compound request.
+ *
+ * This is called by cl_device_operations::cdo_req_init() methods to add a
+ * per-layer state to the request. New state is added at the end of
+ * cl_req::crq_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+                     struct cl_device *dev,
+                     const struct cl_req_operations *ops)
+{
+       ENTRY;
+       list_add_tail(&slice->crs_linkage, &req->crq_layers);
+       slice->crs_dev = dev;
+       slice->crs_ops = ops;
+       slice->crs_req = req;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_slice_add);
+
+static void cl_req_free(const struct lu_env *env, struct cl_req *req)
+{
+       unsigned i;
+
+       LASSERT(list_empty(&req->crq_pages));
+       LASSERT(req->crq_nrpages == 0);
+       LINVRNT(list_empty(&req->crq_layers));
+       LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL));
+       ENTRY;
+
+       if (req->crq_o != NULL) {
+               for (i = 0; i < req->crq_nrobjs; ++i) {
+                       struct cl_object *obj = req->crq_o[i].ro_obj;
+                       if (obj != NULL) {
+                               lu_object_ref_del_at(&obj->co_lu,
+                                                    req->crq_o[i].ro_obj_ref,
+                                                    "cl_req", req);
+                               cl_object_put(env, obj);
+                       }
+               }
+               OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]);
+       }
+       OBD_FREE_PTR(req);
+       EXIT;
+}
+
+static int cl_req_init(const struct lu_env *env, struct cl_req *req,
+                      struct cl_page *page)
+{
+       struct cl_device     *dev;
+       struct cl_page_slice *slice;
+       int result;
+
+       ENTRY;
+       result = 0;
+       page = cl_page_top(page);
+       do {
+               list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                       dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev);
+                       if (dev->cd_ops->cdo_req_init != NULL) {
+                               result = dev->cd_ops->cdo_req_init(env,
+                                                                  dev, req);
+                               if (result != 0)
+                                       break;
+                       }
+               }
+               page = page->cp_child;
+       } while (page != NULL && result == 0);
+       RETURN(result);
+}
+
+/**
+ * Invokes per-request transfer completion call-backs
+ * (cl_req_operations::cro_completion()) bottom-to-top.
+ */
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc)
+{
+       struct cl_req_slice *slice;
+
+       ENTRY;
+       /*
+        * for the lack of list_for_each_entry_reverse_safe()...
+        */
+       while (!list_empty(&req->crq_layers)) {
+               slice = list_entry(req->crq_layers.prev,
+                                      struct cl_req_slice, crs_linkage);
+               list_del_init(&slice->crs_linkage);
+               if (slice->crs_ops->cro_completion != NULL)
+                       slice->crs_ops->cro_completion(env, slice, rc);
+       }
+       cl_req_free(env, req);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_completion);
+
+/**
+ * Allocates new transfer request.
+ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+                           enum cl_req_type crt, int nr_objects)
+{
+       struct cl_req *req;
+
+       LINVRNT(nr_objects > 0);
+       ENTRY;
+
+       OBD_ALLOC_PTR(req);
+       if (req != NULL) {
+               int result;
+
+               OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]);
+               if (req->crq_o != NULL) {
+                       req->crq_nrobjs = nr_objects;
+                       req->crq_type = crt;
+                       INIT_LIST_HEAD(&req->crq_pages);
+                       INIT_LIST_HEAD(&req->crq_layers);
+                       result = cl_req_init(env, req, page);
+               } else
+                       result = -ENOMEM;
+               if (result != 0) {
+                       cl_req_completion(env, req, result);
+                       req = ERR_PTR(result);
+               }
+       } else
+               req = ERR_PTR(-ENOMEM);
+       RETURN(req);
+}
+EXPORT_SYMBOL(cl_req_alloc);
+
+/**
+ * Adds a page to a request.
+ */
+void cl_req_page_add(const struct lu_env *env,
+                    struct cl_req *req, struct cl_page *page)
+{
+       struct cl_object  *obj;
+       struct cl_req_obj *rqo;
+       int i;
+
+       ENTRY;
+       page = cl_page_top(page);
+
+       LASSERT(list_empty(&page->cp_flight));
+       LASSERT(page->cp_req == NULL);
+
+       CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n",
+                     req, req->crq_type, req->crq_nrpages);
+
+       list_add_tail(&page->cp_flight, &req->crq_pages);
+       ++req->crq_nrpages;
+       page->cp_req = req;
+       obj = cl_object_top(page->cp_obj);
+       for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) {
+               if (rqo->ro_obj == NULL) {
+                       rqo->ro_obj = obj;
+                       cl_object_get(obj);
+                       rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu,
+                                                           "cl_req", req);
+                       break;
+               }
+       }
+       LASSERT(i < req->crq_nrobjs);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_add);
+
+/**
+ * Removes a page from a request.
+ */
+void cl_req_page_done(const struct lu_env *env, struct cl_page *page)
+{
+       struct cl_req *req = page->cp_req;
+
+       ENTRY;
+       page = cl_page_top(page);
+
+       LASSERT(!list_empty(&page->cp_flight));
+       LASSERT(req->crq_nrpages > 0);
+
+       list_del_init(&page->cp_flight);
+       --req->crq_nrpages;
+       page->cp_req = NULL;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_done);
+
+/**
+ * Notifies layers that request is about to depart by calling
+ * cl_req_operations::cro_prep() top-to-bottom.
+ */
+int cl_req_prep(const struct lu_env *env, struct cl_req *req)
+{
+       int i;
+       int result;
+       const struct cl_req_slice *slice;
+
+       ENTRY;
+       /*
+        * Check that the caller of cl_req_alloc() didn't lie about the number
+        * of objects.
+        */
+       for (i = 0; i < req->crq_nrobjs; ++i)
+               LASSERT(req->crq_o[i].ro_obj != NULL);
+
+       result = 0;
+       list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+               if (slice->crs_ops->cro_prep != NULL) {
+                       result = slice->crs_ops->cro_prep(env, slice);
+                       if (result != 0)
+                               break;
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_req_prep);
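
Putting the cl_req calls above together, a hypothetical single-page, single-object write request could be driven roughly as follows (the actual transfer is elided and "transfer_rc" stands in for its result; CRT_WRITE is the write entry of enum cl_req_type):

static int example_write_req(const struct lu_env *env, struct cl_page *page,
                             int transfer_rc)
{
        struct cl_req *req;
        int rc;

        req = cl_req_alloc(env, page, CRT_WRITE, 1);
        if (IS_ERR(req))
                return PTR_ERR(req);

        cl_req_page_add(env, req, page);
        rc = cl_req_prep(env, req);
        if (rc == 0) {
                /* ... the transfer itself happens here ... */
                rc = transfer_rc;
        }

        cl_req_page_done(env, page);
        cl_req_completion(env, req, rc);        /* also frees the request */
        return rc;
}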
+
+/**
+ * Fills in attributes that are passed to the server together with the
+ * transfer. Only attributes listed in \a flags may be touched. This can be
+ * called multiple times for the same request.
+ */
+void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
+                    struct cl_req_attr *attr, obd_valid flags)
+{
+       const struct cl_req_slice *slice;
+       struct cl_page      *page;
+       int i;
+
+       LASSERT(!list_empty(&req->crq_pages));
+       ENTRY;
+
+       /* Take any page to use as a model. */
+       page = list_entry(req->crq_pages.next, struct cl_page, cp_flight);
+
+       for (i = 0; i < req->crq_nrobjs; ++i) {
+               list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+                       const struct cl_page_slice *scan;
+                       const struct cl_object     *obj;
+
+                       scan = cl_page_at(page,
+                                         slice->crs_dev->cd_lu_dev.ld_type);
+                       LASSERT(scan != NULL);
+                       obj = scan->cpl_obj;
+                       if (slice->crs_ops->cro_attr_set != NULL)
+                               slice->crs_ops->cro_attr_set(env, slice, obj,
+                                                            attr + i, flags);
+               }
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_attr_set);
+
+/* XXX complete(), init_completion(), and wait_for_completion(), until they are
+ * implemented in libcfs. */
+# include <linux/sched.h>
+
+/**
+ * Initialize synchronous io wait anchor, for transfer of \a nrpages pages.
+ */
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages)
+{
+       ENTRY;
+       init_waitqueue_head(&anchor->csi_waitq);
+       atomic_set(&anchor->csi_sync_nr, nrpages);
+       atomic_set(&anchor->csi_barrier, nrpages > 0);
+       anchor->csi_sync_rc = 0;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_init);
+
+/**
+ * Waits until all transfers complete. The transfer completion routine has to
+ * call cl_sync_io_note() for every page.
+ */
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+                   struct cl_page_list *queue, struct cl_sync_io *anchor,
+                   long timeout)
+{
+       struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+                                                 NULL, NULL, NULL);
+       int rc;
+       ENTRY;
+
+       LASSERT(timeout >= 0);
+
+       rc = l_wait_event(anchor->csi_waitq,
+                         atomic_read(&anchor->csi_sync_nr) == 0,
+                         &lwi);
+       if (rc < 0) {
+               CERROR("SYNC IO failed with error: %d, try to cancel "
+                      "%d remaining pages\n",
+                      rc, atomic_read(&anchor->csi_sync_nr));
+
+               (void)cl_io_cancel(env, io, queue);
+
+               lwi = (struct l_wait_info) { 0 };
+               (void)l_wait_event(anchor->csi_waitq,
+                                  atomic_read(&anchor->csi_sync_nr) == 0,
+                                  &lwi);
+       } else {
+               rc = anchor->csi_sync_rc;
+       }
+       LASSERT(atomic_read(&anchor->csi_sync_nr) == 0);
+       cl_page_list_assume(env, io, queue);
+
+       /* wait until cl_sync_io_note() has done wakeup */
+       while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) {
+               cpu_relax();
+       }
+
+       POISON(anchor, 0x5a, sizeof *anchor);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(cl_sync_io_wait);
+
+/**
+ * Indicate that transfer of a single page completed.
+ */
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret)
+{
+       ENTRY;
+       if (anchor->csi_sync_rc == 0 && ioret < 0)
+               anchor->csi_sync_rc = ioret;
+       /*
+        * Synchronous IO is done without releasing the page lock (e.g., as a
+        * part of ->{prepare,commit}_write()). Completion is used to signal
+        * the end of IO.
+        */
+       LASSERT(atomic_read(&anchor->csi_sync_nr) > 0);
+       if (atomic_dec_and_test(&anchor->csi_sync_nr)) {
+               wake_up_all(&anchor->csi_waitq);
+               /* it's safe to nuke or reuse anchor now */
+               atomic_set(&anchor->csi_barrier, 0);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_note);
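
For illustration, the intended usage pattern of the cl_sync_io anchor (hypothetical helper; the submission step and the 600-second timeout are placeholders):

static int example_sync_transfer(const struct lu_env *env, struct cl_io *io,
                                 struct cl_page_list *queue)
{
        struct cl_sync_io anchor;

        cl_sync_io_init(&anchor, queue->pl_nr);

        /* ... submit the pages; every completion handler must call
         *     cl_sync_io_note(&anchor, ioret) exactly once per page ... */

        return cl_sync_io_wait(env, io, queue, &anchor, 600);
}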
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
new file mode 100644 (file)
index 0000000..d34e044
--- /dev/null
@@ -0,0 +1,2304 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Extent Lock.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/** Lock class of cl_lock::cll_guard */
+static struct lock_class_key cl_lock_guard_class;
+static struct kmem_cache *cl_lock_kmem;
+
+static struct lu_kmem_descr cl_lock_caches[] = {
+       {
+               .ckd_cache = &cl_lock_kmem,
+               .ckd_name  = "cl_lock_kmem",
+               .ckd_size  = sizeof (struct cl_lock)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+#define CS_LOCK_INC(o, item)
+#define CS_LOCK_DEC(o, item)
+#define CS_LOCKSTATE_INC(o, state)
+#define CS_LOCKSTATE_DEC(o, state)
+
+/**
+ * Basic lock invariant that is maintained at all times. Caller either has a
+ * reference to \a lock, or somehow assures that \a lock cannot be freed.
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+                                    const struct cl_lock *lock)
+{
+       return  ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) &&
+               atomic_read(&lock->cll_ref) >= lock->cll_holds &&
+               lock->cll_holds >= lock->cll_users &&
+               lock->cll_holds >= 0 &&
+               lock->cll_users >= 0 &&
+               lock->cll_depth >= 0;
+}
+
+/**
+ * Stronger lock invariant, checking that caller has a reference on a lock.
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+                            const struct cl_lock *lock)
+{
+       int result;
+
+       result = atomic_read(&lock->cll_ref) > 0 &&
+               cl_lock_invariant_trusted(env, lock);
+       if (!result && env != NULL)
+               CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+       return result;
+}
+
+/**
+ * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock.
+ */
+static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
+{
+       return cl_object_header(lock->cll_descr.cld_obj)->coh_nesting;
+}
+
+/**
+ * Returns a set of counters for this lock, depending on a lock nesting.
+ */
+static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env,
+                                                  const struct cl_lock *lock)
+{
+       struct cl_thread_info *info;
+       enum clt_nesting_level nesting;
+
+       info = cl_env_info(env);
+       nesting = cl_lock_nesting(lock);
+       LASSERT(nesting < ARRAY_SIZE(info->clt_counters));
+       return &info->clt_counters[nesting];
+}
+
+static void cl_lock_trace0(int level, const struct lu_env *env,
+                          const char *prefix, const struct cl_lock *lock,
+                          const char *func, const int line)
+{
+       struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj);
+       CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)"
+                     "(%p/%d/%d) at %s():%d\n",
+              prefix, lock, atomic_read(&lock->cll_ref),
+              lock->cll_guarder, lock->cll_depth,
+              lock->cll_state, lock->cll_error, lock->cll_holds,
+              lock->cll_users, lock->cll_flags,
+              env, h->coh_nesting, cl_lock_nr_mutexed(env),
+              func, line);
+}
+#define cl_lock_trace(level, env, prefix, lock)                         \
+       cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__)
+
+#define RETIP ((unsigned long)__builtin_return_address(0))
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key cl_lock_key;
+
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{
+       lockdep_set_class_and_name(lock, &cl_lock_key, "EXT");
+}
+
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+                                   struct cl_lock *lock, __u32 enqflags)
+{
+       cl_lock_counters(env, lock)->ctc_nr_locks_acquired++;
+       lock_map_acquire(&lock->dep_map);
+}
+
+static void cl_lock_lockdep_release(const struct lu_env *env,
+                                   struct cl_lock *lock)
+{
+       cl_lock_counters(env, lock)->ctc_nr_locks_acquired--;
+       lock_release(&lock->dep_map, 0, RETIP);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{}
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+                                   struct cl_lock *lock, __u32 enqflags)
+{}
+static void cl_lock_lockdep_release(const struct lu_env *env,
+                                   struct cl_lock *lock)
+{}
+
+#endif /* !CONFIG_LOCKDEP */
+
+/**
+ * Adds lock slice to the compound lock.
+ *
+ * This is called by cl_object_operations::coo_lock_init() methods to add a
+ * per-layer state to the lock. New state is added at the end of
+ * cl_lock::cll_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+                      struct cl_object *obj,
+                      const struct cl_lock_operations *ops)
+{
+       ENTRY;
+       slice->cls_lock = lock;
+       list_add_tail(&slice->cls_linkage, &lock->cll_layers);
+       slice->cls_obj = obj;
+       slice->cls_ops = ops;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_slice_add);
+
+/**
+ * Returns true iff a lock with the mode \a has provides at least the same
+ * guarantees as a lock with the mode \a need.
+ */
+int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need)
+{
+       LINVRNT(need == CLM_READ || need == CLM_WRITE ||
+               need == CLM_PHANTOM || need == CLM_GROUP);
+       LINVRNT(has == CLM_READ || has == CLM_WRITE ||
+               has == CLM_PHANTOM || has == CLM_GROUP);
+       CLASSERT(CLM_PHANTOM < CLM_READ);
+       CLASSERT(CLM_READ < CLM_WRITE);
+       CLASSERT(CLM_WRITE < CLM_GROUP);
+
+       if (has != CLM_GROUP)
+               return need <= has;
+       else
+               return need == has;
+}
+EXPORT_SYMBOL(cl_lock_mode_match);
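
A few concrete evaluations of the ordering encoded above (PHANTOM < READ < WRITE, and GROUP only matches GROUP):

/*
 *      cl_lock_mode_match(CLM_WRITE, CLM_READ)  == 1  (write covers read)
 *      cl_lock_mode_match(CLM_READ,  CLM_WRITE) == 0
 *      cl_lock_mode_match(CLM_GROUP, CLM_READ)  == 0  (group only matches group)
 */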
+
+/**
+ * Returns true iff extent portions of lock descriptions match.
+ */
+int cl_lock_ext_match(const struct cl_lock_descr *has,
+                     const struct cl_lock_descr *need)
+{
+       return
+               has->cld_start <= need->cld_start &&
+               has->cld_end >= need->cld_end &&
+               cl_lock_mode_match(has->cld_mode, need->cld_mode) &&
+               (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid);
+}
+EXPORT_SYMBOL(cl_lock_ext_match);
+
+/**
+ * Returns true iff a lock with the description \a has provides at least the
+ * same guarantees as a lock with the description \a need.
+ */
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+                       const struct cl_lock_descr *need)
+{
+       return
+               cl_object_same(has->cld_obj, need->cld_obj) &&
+               cl_lock_ext_match(has, need);
+}
+EXPORT_SYMBOL(cl_lock_descr_match);
+
+static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_object *obj = lock->cll_descr.cld_obj;
+
+       LINVRNT(!cl_lock_is_mutexed(lock));
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
+       might_sleep();
+       while (!list_empty(&lock->cll_layers)) {
+               struct cl_lock_slice *slice;
+
+               slice = list_entry(lock->cll_layers.next,
+                                      struct cl_lock_slice, cls_linkage);
+               list_del_init(lock->cll_layers.next);
+               slice->cls_ops->clo_fini(env, slice);
+       }
+       CS_LOCK_DEC(obj, total);
+       CS_LOCKSTATE_DEC(obj, lock->cll_state);
+       lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock);
+       cl_object_put(env, obj);
+       lu_ref_fini(&lock->cll_reference);
+       lu_ref_fini(&lock->cll_holders);
+       mutex_destroy(&lock->cll_guard);
+       OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
+       EXIT;
+}
+
+/**
+ * Releases a reference on a lock.
+ *
+ * When last reference is released, lock is returned to the cache, unless it
+ * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed
+ * immediately.
+ *
+ * \see cl_object_put(), cl_page_put()
+ */
+void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_object        *obj;
+
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+       obj = lock->cll_descr.cld_obj;
+       LINVRNT(obj != NULL);
+
+       CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
+              atomic_read(&lock->cll_ref), lock, RETIP);
+
+       if (atomic_dec_and_test(&lock->cll_ref)) {
+               if (lock->cll_state == CLS_FREEING) {
+                       LASSERT(list_empty(&lock->cll_linkage));
+                       cl_lock_free(env, lock);
+               }
+               CS_LOCK_DEC(obj, busy);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_put);
+
+/**
+ * Acquires an additional reference to a lock.
+ *
+ * This can only be called by a caller that already holds a reference to
+ * \a lock.
+ *
+ * \see cl_object_get(), cl_page_get()
+ */
+void cl_lock_get(struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_invariant(NULL, lock));
+       CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
+              atomic_read(&lock->cll_ref), lock, RETIP);
+       atomic_inc(&lock->cll_ref);
+}
+EXPORT_SYMBOL(cl_lock_get);
+
+/**
+ * Acquires a reference to a lock.
+ *
+ * This is much like cl_lock_get(), except that this function can be used to
+ * acquire an initial reference to a cached lock. The caller has to deal
+ * with all possible races. Use with care!
+ *
+ * \see cl_page_get_trust()
+ */
+void cl_lock_get_trust(struct cl_lock *lock)
+{
+       CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
+              atomic_read(&lock->cll_ref), lock, RETIP);
+       if (atomic_inc_return(&lock->cll_ref) == 1)
+               CS_LOCK_INC(lock->cll_descr.cld_obj, busy);
+}
+EXPORT_SYMBOL(cl_lock_get_trust);
+
+/**
+ * Helper function that destroys a lock which wasn't completely initialized.
+ *
+ * Other threads can acquire references to the top-lock through its
+ * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately.
+ */
+static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
+{
+       cl_lock_mutex_get(env, lock);
+       cl_lock_cancel(env, lock);
+       cl_lock_delete(env, lock);
+       cl_lock_mutex_put(env, lock);
+       cl_lock_put(env, lock);
+}
+
+static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
+                                    struct cl_object *obj,
+                                    const struct cl_io *io,
+                                    const struct cl_lock_descr *descr)
+{
+       struct cl_lock    *lock;
+       struct lu_object_header *head;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, __GFP_IO);
+       if (lock != NULL) {
+               atomic_set(&lock->cll_ref, 1);
+               lock->cll_descr = *descr;
+               lock->cll_state = CLS_NEW;
+               cl_object_get(obj);
+               lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu,
+                                                     "cl_lock", lock);
+               INIT_LIST_HEAD(&lock->cll_layers);
+               INIT_LIST_HEAD(&lock->cll_linkage);
+               INIT_LIST_HEAD(&lock->cll_inclosure);
+               lu_ref_init(&lock->cll_reference);
+               lu_ref_init(&lock->cll_holders);
+               mutex_init(&lock->cll_guard);
+               lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
+               init_waitqueue_head(&lock->cll_wq);
+               head = obj->co_lu.lo_header;
+               CS_LOCKSTATE_INC(obj, CLS_NEW);
+               CS_LOCK_INC(obj, total);
+               CS_LOCK_INC(obj, create);
+               cl_lock_lockdep_init(lock);
+               list_for_each_entry(obj, &head->loh_layers,
+                                       co_lu.lo_linkage) {
+                       int err;
+
+                       err = obj->co_ops->coo_lock_init(env, obj, lock, io);
+                       if (err != 0) {
+                               cl_lock_finish(env, lock);
+                               lock = ERR_PTR(err);
+                               break;
+                       }
+               }
+       } else
+               lock = ERR_PTR(-ENOMEM);
+       RETURN(lock);
+}
+
+/**
+ * Transfer the lock into INTRANSIT state and return the original state.
+ *
+ * \pre  state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
+ * \post state: CLS_INTRANSIT
+ * \see CLS_INTRANSIT
+ */
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+                                    struct cl_lock *lock)
+{
+       enum cl_lock_state state = lock->cll_state;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+       LASSERT(state != CLS_INTRANSIT);
+       LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
+                "Malformed lock state %d.\n", state);
+
+       cl_lock_state_set(env, lock, CLS_INTRANSIT);
+       lock->cll_intransit_owner = current;
+       cl_lock_hold_add(env, lock, "intransit", current);
+       return state;
+}
+EXPORT_SYMBOL(cl_lock_intransit);
+
+/**
+ * Exits the INTRANSIT state and restores the lock to its original state.
+ */
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+                      enum cl_lock_state state)
+{
+       LASSERT(cl_lock_is_mutexed(lock));
+       LASSERT(lock->cll_state == CLS_INTRANSIT);
+       LASSERT(state != CLS_INTRANSIT);
+       LASSERT(lock->cll_intransit_owner == current);
+
+       lock->cll_intransit_owner = NULL;
+       cl_lock_state_set(env, lock, state);
+       cl_lock_unhold(env, lock, "intransit", current);
+}
+EXPORT_SYMBOL(cl_lock_extransit);
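
A sketch of how a caller would typically use the INTRANSIT pair above (hypothetical helper; the lock mutex is assumed to be held, as cl_lock_intransit() requires):

static void example_intransit(const struct lu_env *env, struct cl_lock *lock)
{
        enum cl_lock_state saved;

        saved = cl_lock_intransit(env, lock);   /* take lock out of circulation */

        /* ... potentially blocking work; the mutex may be dropped here ... */

        cl_lock_extransit(env, lock, saved);    /* restore the original state */
}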
+
+/**
+ * Checks whether the lock is in the INTRANSIT state, held by another thread.
+ */
+int cl_lock_is_intransit(struct cl_lock *lock)
+{
+       LASSERT(cl_lock_is_mutexed(lock));
+       return lock->cll_state == CLS_INTRANSIT &&
+              lock->cll_intransit_owner != current;
+}
+EXPORT_SYMBOL(cl_lock_is_intransit);
+
+/**
+ * Returns true iff lock is "suitable" for given io. E.g., locks acquired by
+ * truncate and O_APPEND cannot be reused for read/non-append-write, as they
+ * cover multiple stripes and can trigger cascading timeouts.
+ */
+static int cl_lock_fits_into(const struct lu_env *env,
+                            const struct cl_lock *lock,
+                            const struct cl_lock_descr *need,
+                            const struct cl_io *io)
+{
+       const struct cl_lock_slice *slice;
+
+       LINVRNT(cl_lock_invariant_trusted(env, lock));
+       ENTRY;
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_fits_into != NULL &&
+                   !slice->cls_ops->clo_fits_into(env, slice, need, io))
+                       RETURN(0);
+       }
+       RETURN(1);
+}
+
+static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
+                                     struct cl_object *obj,
+                                     const struct cl_io *io,
+                                     const struct cl_lock_descr *need)
+{
+       struct cl_lock    *lock;
+       struct cl_object_header *head;
+
+       ENTRY;
+
+       head = cl_object_header(obj);
+       LINVRNT(spin_is_locked(&head->coh_lock_guard));
+       CS_LOCK_INC(obj, lookup);
+       list_for_each_entry(lock, &head->coh_locks, cll_linkage) {
+               int matched;
+
+               matched = cl_lock_ext_match(&lock->cll_descr, need) &&
+                         lock->cll_state < CLS_FREEING &&
+                         lock->cll_error == 0 &&
+                         !(lock->cll_flags & CLF_CANCELLED) &&
+                         cl_lock_fits_into(env, lock, need, io);
+               CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n",
+                      PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need),
+                      matched);
+               if (matched) {
+                       cl_lock_get_trust(lock);
+                       CS_LOCK_INC(obj, hit);
+                       RETURN(lock);
+               }
+       }
+       RETURN(NULL);
+}
+
+/**
+ * Returns a lock matching description \a need.
+ *
+ * This is the main entry point into the cl_lock caching interface. First, a
+ * cache (implemented as a per-object linked list) is consulted. If lock is
+ * found there, it is returned immediately. Otherwise new lock is allocated
+ * and returned. In any case, additional reference to lock is acquired.
+ *
+ * \see cl_object_find(), cl_page_find()
+ */
+static struct cl_lock *cl_lock_find(const struct lu_env *env,
+                                   const struct cl_io *io,
+                                   const struct cl_lock_descr *need)
+{
+       struct cl_object_header *head;
+       struct cl_object        *obj;
+       struct cl_lock    *lock;
+
+       ENTRY;
+
+       obj  = need->cld_obj;
+       head = cl_object_header(obj);
+
+       spin_lock(&head->coh_lock_guard);
+       lock = cl_lock_lookup(env, obj, io, need);
+       spin_unlock(&head->coh_lock_guard);
+
+       if (lock == NULL) {
+               lock = cl_lock_alloc(env, obj, io, need);
+               if (!IS_ERR(lock)) {
+                       struct cl_lock *ghost;
+
+                       spin_lock(&head->coh_lock_guard);
+                       ghost = cl_lock_lookup(env, obj, io, need);
+                       if (ghost == NULL) {
+                               list_add_tail(&lock->cll_linkage,
+                                                 &head->coh_locks);
+                               spin_unlock(&head->coh_lock_guard);
+                               CS_LOCK_INC(obj, busy);
+                       } else {
+                               spin_unlock(&head->coh_lock_guard);
+                               /*
+                                * Other threads can acquire references to the
+                                * top-lock through its sub-locks. Hence, it
+                                * cannot be cl_lock_free()-ed immediately.
+                                */
+                               cl_lock_finish(env, lock);
+                               lock = ghost;
+                       }
+               }
+       }
+       RETURN(lock);
+}
+
+/**
+ * Returns existing lock matching given description. This is similar to
+ * cl_lock_find() except that no new lock is created, and returned lock is
+ * guaranteed to be in enum cl_lock_state::CLS_HELD state.
+ */
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+                            const struct cl_lock_descr *need,
+                            const char *scope, const void *source)
+{
+       struct cl_object_header *head;
+       struct cl_object        *obj;
+       struct cl_lock    *lock;
+
+       obj  = need->cld_obj;
+       head = cl_object_header(obj);
+
+       do {
+               spin_lock(&head->coh_lock_guard);
+               lock = cl_lock_lookup(env, obj, io, need);
+               spin_unlock(&head->coh_lock_guard);
+               if (lock == NULL)
+                       return NULL;
+
+               cl_lock_mutex_get(env, lock);
+               if (lock->cll_state == CLS_INTRANSIT)
+                       /* Return value deliberately ignored. */
+                       cl_lock_state_wait(env, lock);
+               if (lock->cll_state == CLS_FREEING) {
+                       cl_lock_mutex_put(env, lock);
+                       cl_lock_put(env, lock);
+                       lock = NULL;
+               }
+       } while (lock == NULL);
+
+       cl_lock_hold_add(env, lock, scope, source);
+       cl_lock_user_add(env, lock);
+       if (lock->cll_state == CLS_CACHED)
+               cl_use_try(env, lock, 1);
+       if (lock->cll_state == CLS_HELD) {
+               cl_lock_mutex_put(env, lock);
+               cl_lock_lockdep_acquire(env, lock, 0);
+               cl_lock_put(env, lock);
+       } else {
+               cl_unuse_try(env, lock);
+               cl_lock_unhold(env, lock, scope, source);
+               cl_lock_mutex_put(env, lock);
+               cl_lock_put(env, lock);
+               lock = NULL;
+       }
+
+       return lock;
+}
+EXPORT_SYMBOL(cl_lock_peek);
+
+/**
+ * Returns a slice within a lock, corresponding to the given layer in the
+ * device stack.
+ *
+ * \see cl_page_at()
+ */
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+                                      const struct lu_device_type *dtype)
+{
+       const struct cl_lock_slice *slice;
+
+       LINVRNT(cl_lock_invariant_trusted(NULL, lock));
+       ENTRY;
+
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype)
+                       RETURN(slice);
+       }
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(cl_lock_at);
+
+static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_thread_counters *counters;
+
+       counters = cl_lock_counters(env, lock);
+       lock->cll_depth++;
+       counters->ctc_nr_locks_locked++;
+       lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock);
+       cl_lock_trace(D_TRACE, env, "got mutex", lock);
+}
+
+/**
+ * Locks cl_lock object.
+ *
+ * This is used to manipulate cl_lock fields, and to serialize state
+ * transitions in the lock state machine.
+ *
+ * \post cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_put()
+ */
+void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       if (lock->cll_guarder == current) {
+               LINVRNT(cl_lock_is_mutexed(lock));
+               LINVRNT(lock->cll_depth > 0);
+       } else {
+               struct cl_object_header *hdr;
+               struct cl_thread_info   *info;
+               int i;
+
+               LINVRNT(lock->cll_guarder != current);
+               hdr = cl_object_header(lock->cll_descr.cld_obj);
+               /*
+                * Check that mutexes are taken in bottom-to-top order.
+                */
+               info = cl_env_info(env);
+               for (i = 0; i < hdr->coh_nesting; ++i)
+                       LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+               mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
+               lock->cll_guarder = current;
+               LINVRNT(lock->cll_depth == 0);
+       }
+       cl_lock_mutex_tail(env, lock);
+}
+EXPORT_SYMBOL(cl_lock_mutex_get);
+
+/**
+ * Try-locks cl_lock object.
+ *
+ * \retval 0 \a lock was successfully locked
+ *
+ * \retval -EBUSY \a lock cannot be locked right now
+ *
+ * \post ergo(result == 0, cl_lock_is_mutexed(lock))
+ *
+ * \see cl_lock_mutex_get()
+ */
+int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
+{
+       int result;
+
+       LINVRNT(cl_lock_invariant_trusted(env, lock));
+       ENTRY;
+
+       result = 0;
+       if (lock->cll_guarder == current) {
+               LINVRNT(lock->cll_depth > 0);
+               cl_lock_mutex_tail(env, lock);
+       } else if (mutex_trylock(&lock->cll_guard)) {
+               LINVRNT(lock->cll_depth == 0);
+               lock->cll_guarder = current;
+               cl_lock_mutex_tail(env, lock);
+       } else
+               result = -EBUSY;
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_mutex_try);
+
+/**
+ * Unlocks cl_lock object.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_get()
+ */
+void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_thread_counters *counters;
+
+       LINVRNT(cl_lock_invariant(env, lock));
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(lock->cll_guarder == current);
+       LINVRNT(lock->cll_depth > 0);
+
+       counters = cl_lock_counters(env, lock);
+       LINVRNT(counters->ctc_nr_locks_locked > 0);
+
+       cl_lock_trace(D_TRACE, env, "put mutex", lock);
+       lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock);
+       counters->ctc_nr_locks_locked--;
+       if (--lock->cll_depth == 0) {
+               lock->cll_guarder = NULL;
+               mutex_unlock(&lock->cll_guard);
+       }
+}
+EXPORT_SYMBOL(cl_lock_mutex_put);
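
As the code above shows, cl_lock_mutex_get() is recursive for the owning thread; a hypothetical nested use only has to keep get/put calls balanced:

static void example_nested_mutex(const struct lu_env *env,
                                 struct cl_lock *lock)
{
        cl_lock_mutex_get(env, lock);   /* cll_depth becomes 1 */
        cl_lock_mutex_get(env, lock);   /* same owner, cll_depth becomes 2 */

        /* ... manipulate lock fields, run state machine steps ... */

        cl_lock_mutex_put(env, lock);   /* cll_depth back to 1 */
        cl_lock_mutex_put(env, lock);   /* mutex actually released */
}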
+
+/**
+ * Returns true iff lock's mutex is owned by the current thread.
+ */
+int cl_lock_is_mutexed(struct cl_lock *lock)
+{
+       return lock->cll_guarder == current;
+}
+EXPORT_SYMBOL(cl_lock_is_mutexed);
+
+/**
+ * Returns number of cl_lock mutexes held by the current thread (environment).
+ */
+int cl_lock_nr_mutexed(const struct lu_env *env)
+{
+       struct cl_thread_info *info;
+       int i;
+       int locked;
+
+       /*
+        * NOTE: if summation across all nesting levels (currently 2) proves
+        *       too expensive, a summary counter can be added to
+        *       struct cl_thread_info.
+        */
+       info = cl_env_info(env);
+       for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+               locked += info->clt_counters[i].ctc_nr_locks_locked;
+       return locked;
+}
+EXPORT_SYMBOL(cl_lock_nr_mutexed);
+
+static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+       if (!(lock->cll_flags & CLF_CANCELLED)) {
+               const struct cl_lock_slice *slice;
+
+               lock->cll_flags |= CLF_CANCELLED;
+               list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                               cls_linkage) {
+                       if (slice->cls_ops->clo_cancel != NULL)
+                               slice->cls_ops->clo_cancel(env, slice);
+               }
+       }
+       EXIT;
+}
+
+static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_object_header    *head;
+       const struct cl_lock_slice *slice;
+
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       ENTRY;
+       if (lock->cll_state < CLS_FREEING) {
+               LASSERT(lock->cll_state != CLS_INTRANSIT);
+               cl_lock_state_set(env, lock, CLS_FREEING);
+
+               head = cl_object_header(lock->cll_descr.cld_obj);
+
+               spin_lock(&head->coh_lock_guard);
+               list_del_init(&lock->cll_linkage);
+               spin_unlock(&head->coh_lock_guard);
+
+               /*
+                * From now on, no new references to this lock can be acquired
+                * by cl_lock_lookup().
+                */
+               list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                               cls_linkage) {
+                       if (slice->cls_ops->clo_delete != NULL)
+                               slice->cls_ops->clo_delete(env, slice);
+               }
+               /*
+                * From now on, no new references to this lock can be acquired
+                * by layer-specific means (like a pointer from struct
+                * ldlm_lock in osc, or a pointer from top-lock to sub-lock in
+                * lov).
+                *
+                * Lock will be finally freed in cl_lock_put() when last of
+                * existing references goes away.
+                */
+       }
+       EXIT;
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a
+ * top-lock (nesting == 0) accounts for this modification in the per-thread
+ * debugging counters. Sub-lock holds can be released by a thread different
+ * from one that acquired it.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+                            int delta)
+{
+       struct cl_thread_counters *counters;
+       enum clt_nesting_level     nesting;
+
+       lock->cll_holds += delta;
+       nesting = cl_lock_nesting(lock);
+       if (nesting == CNL_TOP) {
+               counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+               counters->ctc_nr_held += delta;
+               LASSERT(counters->ctc_nr_held >= 0);
+       }
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_users counter for a given lock. See
+ * cl_lock_hold_mod() for the explanation of the debugging code.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+                            int delta)
+{
+       struct cl_thread_counters *counters;
+       enum clt_nesting_level     nesting;
+
+       lock->cll_users += delta;
+       nesting = cl_lock_nesting(lock);
+       if (nesting == CNL_TOP) {
+               counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+               counters->ctc_nr_used += delta;
+               LASSERT(counters->ctc_nr_used >= 0);
+       }
+}
+
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+                         const char *scope, const void *source)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_holds > 0);
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
+       lu_ref_del(&lock->cll_holders, scope, source);
+       cl_lock_hold_mod(env, lock, -1);
+       if (lock->cll_holds == 0) {
+               CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock);
+               if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+                   lock->cll_descr.cld_mode == CLM_GROUP ||
+                   lock->cll_state != CLS_CACHED)
+                       /*
+                        * If lock is still phantom or grouplock when user is
+                        * done with it---destroy the lock.
+                        */
+                       lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+               if (lock->cll_flags & CLF_CANCELPEND) {
+                       lock->cll_flags &= ~CLF_CANCELPEND;
+                       cl_lock_cancel0(env, lock);
+               }
+               if (lock->cll_flags & CLF_DOOMED) {
+                       /* no longer doomed: it's dead... Jim. */
+                       lock->cll_flags &= ~CLF_DOOMED;
+                       cl_lock_delete0(env, lock);
+               }
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_release);
+
+/**
+ * Waits until lock state is changed.
+ *
+ * This function is called with cl_lock mutex locked, atomically releases
+ * mutex and goes to sleep, waiting for a lock state change (signaled by
+ * cl_lock_signal()), and re-acquires the mutex before return.
+ *
+ * This function is used to wait until lock state machine makes some progress
+ * and to emulate synchronous operations on top of asynchronous lock
+ * interface.
+ *
+ * \retval -ERESTARTSYS wait was interrupted
+ *
+ * \retval 0 wait wasn't interrupted
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_signal()
+ */
+int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+       wait_queue_t waiter;
+       sigset_t blocked;
+       int result;
+
+       ENTRY;
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_depth == 1);
+       LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
+
+       cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock);
+       result = lock->cll_error;
+       if (result == 0) {
+               /* To avoid being interrupted by the 'non-fatal' signals
+                * (SIGCHLD, for instance), we block them temporarily.
+                * LU-305 */
+               blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+               init_waitqueue_entry_current(&waiter);
+               add_wait_queue(&lock->cll_wq, &waiter);
+               set_current_state(TASK_INTERRUPTIBLE);
+               cl_lock_mutex_put(env, lock);
+
+               LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+               /* Returning ERESTARTSYS instead of EINTR so syscalls
+                * can be restarted if signals are pending here */
+               result = -ERESTARTSYS;
+               if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) {
+                       waitq_wait(&waiter, TASK_INTERRUPTIBLE);
+                       if (!cfs_signal_pending())
+                               result = 0;
+               }
+
+               cl_lock_mutex_get(env, lock);
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&lock->cll_wq, &waiter);
+
+               /* Restore old blocked signals */
+               cfs_restore_sigs(blocked);
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_state_wait);
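
Callers typically loop on cl_lock_state_wait() until the state they need is reached or the wait fails; a hypothetical sketch (the lock mutex is assumed to be held on entry, per the precondition above):

static int example_wait_for_held(const struct lu_env *env,
                                 struct cl_lock *lock)
{
        int rc = 0;

        /* stop on a signal (-ERESTARTSYS), a lock error, or teardown */
        while (rc == 0 && lock->cll_state != CLS_HELD &&
               lock->cll_state != CLS_FREEING)
                rc = cl_lock_state_wait(env, lock);
        return rc;
}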
+
+static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
+                                enum cl_lock_state state)
+{
+       const struct cl_lock_slice *slice;
+
+       ENTRY;
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage)
+               if (slice->cls_ops->clo_state != NULL)
+                       slice->cls_ops->clo_state(env, slice, state);
+       wake_up_all(&lock->cll_wq);
+       EXIT;
+}
+
+/**
+ * Notifies waiters that lock state changed.
+ *
+ * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all
+ * layers about state change by calling cl_lock_operations::clo_state()
+ * top-to-bottom.
+ */
+void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
+{
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock);
+       cl_lock_state_signal(env, lock, lock->cll_state);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_signal);
+
+/**
+ * Changes lock state.
+ *
+ * This function is invoked to notify layers that the lock state changed,
+ * possibly as a result of an asynchronous event such as call-back reception.
+ *
+ * \post lock->cll_state == state
+ *
+ * \see cl_lock_operations::clo_state()
+ */
+void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
+                      enum cl_lock_state state)
+{
+       ENTRY;
+       LASSERT(lock->cll_state <= state ||
+               (lock->cll_state == CLS_CACHED &&
+                (state == CLS_HELD || /* lock found in cache */
+                 state == CLS_NEW  ||   /* sub-lock canceled */
+                 state == CLS_INTRANSIT)) ||
+               /* lock is in transit state */
+               lock->cll_state == CLS_INTRANSIT);
+
+       if (lock->cll_state != state) {
+               CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state);
+               CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state);
+
+               cl_lock_state_signal(env, lock, state);
+               lock->cll_state = state;
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_state_set);
+
+static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+       int result;
+
+       do {
+               result = 0;
+
+               LINVRNT(cl_lock_is_mutexed(lock));
+               LINVRNT(cl_lock_invariant(env, lock));
+               LASSERT(lock->cll_state == CLS_INTRANSIT);
+
+               result = -ENOSYS;
+               list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                               cls_linkage) {
+                       if (slice->cls_ops->clo_unuse != NULL) {
+                               result = slice->cls_ops->clo_unuse(env, slice);
+                               if (result != 0)
+                                       break;
+                       }
+               }
+               LASSERT(result != -ENOSYS);
+       } while (result == CLO_REPEAT);
+
+       return result;
+}
+
+/**
+ * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
+ * cl_lock_operations::clo_use() top-to-bottom to notify layers.
+ * If @atomic is set, the lock is unused again on failure, so that the whole
+ * use operation stays atomic.
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
+{
+       const struct cl_lock_slice *slice;
+       int result;
+       enum cl_lock_state state;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+       LASSERT(lock->cll_state == CLS_CACHED);
+       if (lock->cll_error)
+               RETURN(lock->cll_error);
+
+       result = -ENOSYS;
+       state = cl_lock_intransit(env, lock);
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_use != NULL) {
+                       result = slice->cls_ops->clo_use(env, slice);
+                       if (result != 0)
+                               break;
+               }
+       }
+       LASSERT(result != -ENOSYS);
+
+       LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+                lock->cll_state);
+
+       if (result == 0) {
+               state = CLS_HELD;
+       } else {
+               if (result == -ESTALE) {
+                       /*
+                        * -ESTALE means a sublock is being cancelled
+                        * at this time; set the lock state to CLS_NEW
+                        * here and ask the caller to repeat.
+                        */
+                       state = CLS_NEW;
+                       result = CLO_REPEAT;
+               }
+
+               /* @atomic means back-off-on-failure. */
+               if (atomic) {
+                       int rc;
+                       rc = cl_unuse_try_internal(env, lock);
+                       /* Vet the results. */
+                       if (rc < 0 && result > 0)
+                               result = rc;
+               }
+
+       }
+       cl_lock_extransit(env, lock, state);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_use_try);
+
+/**
+ * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers
+ * top-to-bottom.
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+                          struct cl_lock *lock,
+                          struct cl_io *io, __u32 flags)
+{
+       int result;
+       const struct cl_lock_slice *slice;
+
+       ENTRY;
+       result = -ENOSYS;
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_enqueue != NULL) {
+                       result = slice->cls_ops->clo_enqueue(env,
+                                                            slice, io, flags);
+                       if (result != 0)
+                               break;
+               }
+       }
+       LASSERT(result != -ENOSYS);
+       RETURN(result);
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until the lock is
+ * either enqueued or an error occurs. It does not block waiting for
+ * network communication to complete.
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                      lock->cll_state == CLS_HELD)
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+                  struct cl_io *io, __u32 flags)
+{
+       int result;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
+       do {
+               LINVRNT(cl_lock_is_mutexed(lock));
+
+               result = lock->cll_error;
+               if (result != 0)
+                       break;
+
+               switch (lock->cll_state) {
+               case CLS_NEW:
+                       cl_lock_state_set(env, lock, CLS_QUEUING);
+                       /* fall-through */
+               case CLS_QUEUING:
+                       /* kick layers. */
+                       result = cl_enqueue_kick(env, lock, io, flags);
+                       /* In the AGL case, cl_lock::cll_state may
+                        * already have become CLS_HELD. */
+                       if (result == 0 && lock->cll_state == CLS_QUEUING)
+                               cl_lock_state_set(env, lock, CLS_ENQUEUED);
+                       break;
+               case CLS_INTRANSIT:
+                       LASSERT(cl_lock_is_intransit(lock));
+                       result = CLO_WAIT;
+                       break;
+               case CLS_CACHED:
+                       /* yank lock from the cache. */
+                       result = cl_use_try(env, lock, 0);
+                       break;
+               case CLS_ENQUEUED:
+               case CLS_HELD:
+                       result = 0;
+                       break;
+               default:
+               case CLS_FREEING:
+                       /*
+                        * impossible, only held locks with increased
+                        * ->cll_holds can be enqueued, and they cannot be
+                        * freed.
+                        */
+                       LBUG();
+               }
+       } while (result == CLO_REPEAT);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+/**
+ * Cancel the conflicting lock found during previous enqueue.
+ *
+ * \retval 0 conflicting lock has been canceled.
+ * \retval -ve error code.
+ */
+int cl_lock_enqueue_wait(const struct lu_env *env,
+                        struct cl_lock *lock,
+                        int keep_mutex)
+{
+       struct cl_lock  *conflict;
+       int           rc = 0;
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+       LASSERT(lock->cll_state == CLS_QUEUING);
+       LASSERT(lock->cll_conflict != NULL);
+
+       conflict = lock->cll_conflict;
+       lock->cll_conflict = NULL;
+
+       cl_lock_mutex_put(env, lock);
+       LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+       cl_lock_mutex_get(env, conflict);
+       cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict);
+       cl_lock_cancel(env, conflict);
+       cl_lock_delete(env, conflict);
+
+       while (conflict->cll_state != CLS_FREEING) {
+               rc = cl_lock_state_wait(env, conflict);
+               if (rc != 0)
+                       break;
+       }
+       cl_lock_mutex_put(env, conflict);
+       lu_ref_del(&conflict->cll_reference, "cancel-wait", lock);
+       cl_lock_put(env, conflict);
+
+       if (keep_mutex)
+               cl_lock_mutex_get(env, lock);
+
+       LASSERT(rc <= 0);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_enqueue_wait);
+
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+                            struct cl_io *io, __u32 enqflags)
+{
+       int result;
+
+       ENTRY;
+
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_holds > 0);
+
+       cl_lock_user_add(env, lock);
+       do {
+               result = cl_enqueue_try(env, lock, io, enqflags);
+               if (result == CLO_WAIT) {
+                       if (lock->cll_conflict != NULL)
+                               result = cl_lock_enqueue_wait(env, lock, 1);
+                       else
+                               result = cl_lock_state_wait(env, lock);
+                       if (result == 0)
+                               continue;
+               }
+               break;
+       } while (1);
+       if (result != 0)
+               cl_unuse_try(env, lock);
+       LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL),
+                    lock->cll_state == CLS_ENQUEUED ||
+                    lock->cll_state == CLS_HELD));
+       RETURN(result);
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * \pre current thread or io owns a hold on lock.
+ *
+ * \post ergo(result == 0, lock->users increased)
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                      lock->cll_state == CLS_HELD)
+ */
+int cl_enqueue(const struct lu_env *env, struct cl_lock *lock,
+              struct cl_io *io, __u32 enqflags)
+{
+       int result;
+
+       ENTRY;
+
+       cl_lock_lockdep_acquire(env, lock, enqflags);
+       cl_lock_mutex_get(env, lock);
+       result = cl_enqueue_locked(env, lock, io, enqflags);
+       cl_lock_mutex_put(env, lock);
+       if (result != 0)
+               cl_lock_lockdep_release(env, lock);
+       LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+                    lock->cll_state == CLS_HELD));
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue);
+
+/**
+ * Tries to unlock a lock.
+ *
+ * This function is called to release the underlying resource:
+ * 1. for a top lock, the resource is the sublocks it holds;
+ * 2. for a sublock, the resource is its reference to the dlmlock.
+ *
+ * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT.
+ *
+ * \see cl_unuse() cl_lock_operations::clo_unuse()
+ * \see cl_lock_state::CLS_CACHED
+ */
+int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
+{
+       int                      result;
+       enum cl_lock_state        state = CLS_NEW;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock);
+
+       if (lock->cll_users > 1) {
+               cl_lock_user_del(env, lock);
+               RETURN(0);
+       }
+
+       /* The lock can hold underlying resources only if it is in the
+        * CLS_HELD or CLS_ENQUEUED state. */
+       if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) {
+               cl_lock_user_del(env, lock);
+               RETURN(0);
+       }
+
+       /*
+        * New lock users (->cll_users) no longer prevent unlocking
+        * from proceeding. From this point, the lock eventually reaches
+        * CLS_CACHED, is reinitialized to CLS_NEW, or, on failure, ends
+        * up in CLS_FREEING.
+        */
+       state = cl_lock_intransit(env, lock);
+
+       result = cl_unuse_try_internal(env, lock);
+       LASSERT(lock->cll_state == CLS_INTRANSIT);
+       LASSERT(result != CLO_WAIT);
+       cl_lock_user_del(env, lock);
+       if (result == 0 || result == -ESTALE) {
+               /*
+                * Return the lock back to the cache. This is the only
+                * place where a lock is moved into the CLS_CACHED state.
+                *
+                * If one of the ->clo_unuse() methods returned -ESTALE, the
+                * lock cannot be placed into the cache and has to be
+                * re-initialized. This happens, e.g., when a sub-lock was
+                * canceled while unlocking was in progress.
+                */
+               if (state == CLS_HELD && result == 0)
+                       state = CLS_CACHED;
+               else
+                       state = CLS_NEW;
+               cl_lock_extransit(env, lock, state);
+
+               /*
+                * Hide the -ESTALE error.
+                * Suppose the lock is a glimpse lock with multiple stripes,
+                * one of its sublocks returned -ENAVAIL, and the other
+                * sublocks are matched write locks. In this case we cannot
+                * set this lock to error, because otherwise some of its
+                * sublocks may not be canceled, and some dirty pages would
+                * never be written to the OSTs. -jay
+                */
+               result = 0;
+       } else {
+               CERROR("result = %d, this is unlikely!\n", result);
+               state = CLS_NEW;
+               cl_lock_extransit(env, lock, state);
+       }
+       RETURN(result ?: lock->cll_error);
+}
+EXPORT_SYMBOL(cl_unuse_try);
+
+static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
+{
+       int result;
+       ENTRY;
+
+       result = cl_unuse_try(env, lock);
+       if (result)
+               CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result);
+
+       EXIT;
+}
+
+/**
+ * Unlocks a lock.
+ */
+void cl_unuse(const struct lu_env *env, struct cl_lock *lock)
+{
+       ENTRY;
+       cl_lock_mutex_get(env, lock);
+       cl_unuse_locked(env, lock);
+       cl_lock_mutex_put(env, lock);
+       cl_lock_lockdep_release(env, lock);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_unuse);
+
+/**
+ * Tries to wait for a lock.
+ *
+ * This function is called repeatedly by cl_wait() until the lock is either
+ * granted or an error occurs. It does not block waiting for network
+ * communication to complete.
+ *
+ * \see cl_wait() cl_lock_operations::clo_wait()
+ * \see cl_lock_state::CLS_HELD
+ */
+int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+       int                      result;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock);
+       do {
+               LINVRNT(cl_lock_is_mutexed(lock));
+               LINVRNT(cl_lock_invariant(env, lock));
+               LASSERTF(lock->cll_state == CLS_QUEUING ||
+                        lock->cll_state == CLS_ENQUEUED ||
+                        lock->cll_state == CLS_HELD ||
+                        lock->cll_state == CLS_INTRANSIT,
+                        "lock state: %d\n", lock->cll_state);
+               LASSERT(lock->cll_users > 0);
+               LASSERT(lock->cll_holds > 0);
+
+               result = lock->cll_error;
+               if (result != 0)
+                       break;
+
+               if (cl_lock_is_intransit(lock)) {
+                       result = CLO_WAIT;
+                       break;
+               }
+
+               if (lock->cll_state == CLS_HELD)
+                       /* nothing to do */
+                       break;
+
+               result = -ENOSYS;
+               list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                       if (slice->cls_ops->clo_wait != NULL) {
+                               result = slice->cls_ops->clo_wait(env, slice);
+                               if (result != 0)
+                                       break;
+                       }
+               }
+               LASSERT(result != -ENOSYS);
+               if (result == 0) {
+                       LASSERT(lock->cll_state != CLS_INTRANSIT);
+                       cl_lock_state_set(env, lock, CLS_HELD);
+               }
+       } while (result == CLO_REPEAT);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait_try);
+
+/**
+ * Waits until enqueued lock is granted.
+ *
+ * \pre current thread or io owns a hold on the lock
+ * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                     lock->cll_state == CLS_HELD)
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ */
+int cl_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+       int result;
+
+       ENTRY;
+       cl_lock_mutex_get(env, lock);
+
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD,
+                "Wrong state %d \n", lock->cll_state);
+       LASSERT(lock->cll_holds > 0);
+
+       do {
+               result = cl_wait_try(env, lock);
+               if (result == CLO_WAIT) {
+                       result = cl_lock_state_wait(env, lock);
+                       if (result == 0)
+                               continue;
+               }
+               break;
+       } while (1);
+       if (result < 0) {
+               cl_unuse_try(env, lock);
+               cl_lock_lockdep_release(env, lock);
+       }
+       cl_lock_trace(D_DLMTRACE, env, "wait lock", lock);
+       cl_lock_mutex_put(env, lock);
+       LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD));
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait);
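+
+/*
+ * A minimal usage sketch of the hold/enqueue/wait/unuse cycle, pieced
+ * together from the pre/post-conditions documented above (illustrative only;
+ * "scope" and caller are placeholder lu_ref tags):
+ *
+ *   lock = cl_lock_hold(env, io, need, "scope", caller);
+ *   if (!IS_ERR(lock)) {
+ *           rc = cl_enqueue(env, lock, io, enqflags);
+ *           if (rc == 0)
+ *                   rc = cl_wait(env, lock);   (CLS_HELD on success)
+ *           if (rc == 0) {
+ *                   ...                        (do work under the lock)
+ *                   cl_unuse(env, lock);       (drop the user)
+ *           }
+ *           cl_lock_release(env, lock, "scope", caller);
+ *   }
+ */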
+
+/**
+ * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock
+ * value.
+ */
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+       unsigned long pound;
+       unsigned long ounce;
+
+       ENTRY;
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       pound = 0;
+       list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_weigh != NULL) {
+                       ounce = slice->cls_ops->clo_weigh(env, slice);
+                       pound += ounce;
+                       if (pound < ounce) /* over-weight^Wflow */
+                               pound = ~0UL;
+               }
+       }
+       RETURN(pound);
+}
+EXPORT_SYMBOL(cl_lock_weigh);
+
+/**
+ * Notifies layers that lock description changed.
+ *
+ * The server can grant the client a lock different from the one requested
+ * (e.g., larger in extent). This method is called when the actually granted
+ * lock description becomes known, to let layers accommodate the changed lock
+ * description.
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+                  const struct cl_lock_descr *desc)
+{
+       const struct cl_lock_slice *slice;
+       struct cl_object           *obj = lock->cll_descr.cld_obj;
+       struct cl_object_header    *hdr = cl_object_header(obj);
+       int result;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
+       /* don't allow object to change */
+       LASSERT(obj == desc->cld_obj);
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_modify != NULL) {
+                       result = slice->cls_ops->clo_modify(env, slice, desc);
+                       if (result != 0)
+                               RETURN(result);
+               }
+       }
+       CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+                     PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+       /*
+        * Just replace description in place. Nothing more is needed for
+        * now. If locks were indexed according to their extent and/or mode,
+        * that index would have to be updated here.
+        */
+       spin_lock(&hdr->coh_lock_guard);
+       lock->cll_descr = *desc;
+       spin_unlock(&hdr->coh_lock_guard);
+       RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Initializes lock closure with a given origin.
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+                         struct cl_lock_closure *closure,
+                         struct cl_lock *origin, int wait)
+{
+       LINVRNT(cl_lock_is_mutexed(origin));
+       LINVRNT(cl_lock_invariant(env, origin));
+
+       INIT_LIST_HEAD(&closure->clc_list);
+       closure->clc_origin = origin;
+       closure->clc_wait   = wait;
+       closure->clc_nr     = 0;
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building a closure consists of adding the initial lock (\a lock) to it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc.
+ *
+ * \see cl_lock_closure
+ */
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+                         struct cl_lock_closure *closure)
+{
+       const struct cl_lock_slice *slice;
+       int result;
+
+       ENTRY;
+       LINVRNT(cl_lock_is_mutexed(closure->clc_origin));
+       LINVRNT(cl_lock_invariant(env, closure->clc_origin));
+
+       result = cl_lock_enclosure(env, lock, closure);
+       if (result == 0) {
+               list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                       if (slice->cls_ops->clo_closure != NULL) {
+                               result = slice->cls_ops->clo_closure(env, slice,
+                                                                    closure);
+                               if (result != 0)
+                                       break;
+                       }
+               }
+       }
+       if (result != 0)
+               cl_lock_disclosure(env, closure);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_closure_build);
+
+/**
+ * Adds a new lock to a closure.
+ *
+ * Try-locks \a lock and, if that succeeds, adds it to the closure (never more
+ * than once). If the try-lock fails, returns CLO_REPEAT, after optionally
+ * waiting until the next try-lock is likely to succeed.
+ */
+int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
+                     struct cl_lock_closure *closure)
+{
+       int result = 0;
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock);
+       if (!cl_lock_mutex_try(env, lock)) {
+               /*
+                * If lock->cll_inclosure is not empty, lock is already in
+                * this closure.
+                */
+               if (list_empty(&lock->cll_inclosure)) {
+                       cl_lock_get_trust(lock);
+                       lu_ref_add(&lock->cll_reference, "closure", closure);
+                       list_add(&lock->cll_inclosure, &closure->clc_list);
+                       closure->clc_nr++;
+               } else
+                       cl_lock_mutex_put(env, lock);
+               result = 0;
+       } else {
+               cl_lock_disclosure(env, closure);
+               if (closure->clc_wait) {
+                       cl_lock_get_trust(lock);
+                       lu_ref_add(&lock->cll_reference, "closure-w", closure);
+                       cl_lock_mutex_put(env, closure->clc_origin);
+
+                       LASSERT(cl_lock_nr_mutexed(env) == 0);
+                       cl_lock_mutex_get(env, lock);
+                       cl_lock_mutex_put(env, lock);
+
+                       cl_lock_mutex_get(env, closure->clc_origin);
+                       lu_ref_del(&lock->cll_reference, "closure-w", closure);
+                       cl_lock_put(env, lock);
+               }
+               result = CLO_REPEAT;
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_enclosure);
+
+/** Releases mutices of enclosed locks. */
+void cl_lock_disclosure(const struct lu_env *env,
+                       struct cl_lock_closure *closure)
+{
+       struct cl_lock *scan;
+       struct cl_lock *temp;
+
+       cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin);
+       list_for_each_entry_safe(scan, temp, &closure->clc_list,
+                                    cll_inclosure) {
+               list_del_init(&scan->cll_inclosure);
+               cl_lock_mutex_put(env, scan);
+               lu_ref_del(&scan->cll_reference, "closure", closure);
+               cl_lock_put(env, scan);
+               closure->clc_nr--;
+       }
+       LASSERT(closure->clc_nr == 0);
+}
+EXPORT_SYMBOL(cl_lock_disclosure);
+
+/** Finalizes a closure. */
+void cl_lock_closure_fini(struct cl_lock_closure *closure)
+{
+       LASSERT(closure->clc_nr == 0);
+       LASSERT(list_empty(&closure->clc_list));
+}
+EXPORT_SYMBOL(cl_lock_closure_fini);
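+
+/*
+ * A plausible closure life-cycle built from the helpers above (illustrative
+ * only):
+ *
+ *   cl_lock_closure_init(env, closure, origin, wait);
+ *   do {
+ *           rc = cl_lock_closure_build(env, lock, closure);
+ *   } while (rc == CLO_REPEAT);         (mutices were dropped; retry)
+ *   if (rc == 0) {
+ *           ...                         (operate on the enclosed locks)
+ *           cl_lock_disclosure(env, closure);
+ *   }
+ *   cl_lock_closure_fini(closure);
+ */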
+
+/**
+ * Destroys this lock. Notifies layers (bottom-to-top) that the lock is being
+ * destroyed, then destroys the lock. If there are holds on the lock,
+ * destruction is postponed until all holds are released. This is called when
+ * a decision is made to destroy the lock in the future, e.g., when a blocking
+ * AST is received on it, or a fatal communication error happens.
+ *
+ * The caller must hold a reference on this lock to prevent a situation where
+ * the deleted lock lingers in memory indefinitely because nobody calls
+ * cl_lock_put() to finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre ergo(cl_lock_nesting(lock) == CNL_TOP,
+ *        cl_lock_nr_mutexed(env) == 1)
+ *      [i.e., if a top-lock is deleted, mutices of no other locks can be
+ *      held, as deletion of sub-locks might require releasing a top-lock
+ *      mutex]
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP,
+                    cl_lock_nr_mutexed(env) == 1));
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
+       if (lock->cll_holds == 0)
+               cl_lock_delete0(env, lock);
+       else
+               lock->cll_flags |= CLF_DOOMED;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Marks the lock as irrecoverably failed, and marks it for destruction. This
+ * happens when, e.g., the server fails to grant us a lock, or a network
+ * time-out happens.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see cl_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       ENTRY;
+       if (lock->cll_error == 0 && error != 0) {
+               cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
+               lock->cll_error = error;
+               cl_lock_signal(env, lock);
+               cl_lock_cancel(env, lock);
+               cl_lock_delete(env, lock);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers (bottom-to-top) that the lock is being
+ * cancelled, then destroys the lock. If there are holds on the lock,
+ * cancellation is postponed until all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ *
+ * \see cl_lock_operations::clo_cancel()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock);
+       if (lock->cll_holds == 0)
+               cl_lock_cancel0(env, lock);
+       else
+               lock->cll_flags |= CLF_CANCELPEND;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_cancel);
+
+/**
+ * Finds an existing lock covering the given index and optionally different
+ * from a given \a except lock.
+ */
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+                                struct cl_object *obj, pgoff_t index,
+                                struct cl_lock *except,
+                                int pending, int canceld)
+{
+       struct cl_object_header *head;
+       struct cl_lock    *scan;
+       struct cl_lock    *lock;
+       struct cl_lock_descr    *need;
+
+       ENTRY;
+
+       head = cl_object_header(obj);
+       need = &cl_env_info(env)->clt_descr;
+       lock = NULL;
+
+       need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
+                                   * not PHANTOM */
+       need->cld_start = need->cld_end = index;
+       need->cld_enq_flags = 0;
+
+       spin_lock(&head->coh_lock_guard);
+       /* It is fine to match any group lock since there could be only one
+        * with a unique gid, and it conflicts with all other lock modes too */
+       list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+               if (scan != except &&
+                   (scan->cll_descr.cld_mode == CLM_GROUP ||
+                   cl_lock_ext_match(&scan->cll_descr, need)) &&
+                   scan->cll_state >= CLS_HELD &&
+                   scan->cll_state < CLS_FREEING &&
+                   /*
+                    * This check is racy as the lock can be canceled right
+                    * after it is done, but this is fine, because the page
+                    * already exists.
+                    */
+                   (canceld || !(scan->cll_flags & CLF_CANCELLED)) &&
+                   (pending || !(scan->cll_flags & CLF_CANCELPEND))) {
+                       /* Don't increase cs_hit here since this
+                        * is just a helper function. */
+                       cl_lock_get_trust(scan);
+                       lock = scan;
+                       break;
+               }
+       }
+       spin_unlock(&head->coh_lock_guard);
+       RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_at_pgoff);
+
+/**
+ * Calculate the page offset at the layer of @lock.
+ * At the time of this writing, @page is the top page and @lock is a sub-lock.
+ */
+static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock)
+{
+       struct lu_device_type *dtype;
+       const struct cl_page_slice *slice;
+
+       dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type;
+       slice = cl_page_at(page, dtype);
+       LASSERT(slice != NULL);
+       return slice->cpl_page->cp_index;
+}
+
+/**
+ * Check if page @page is covered by another lock; if not, discard it.
+ */
+static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
+                               struct cl_page *page, void *cbdata)
+{
+       struct cl_thread_info *info = cl_env_info(env);
+       struct cl_lock *lock = cbdata;
+       pgoff_t index = pgoff_at_lock(page, lock);
+
+       if (index >= info->clt_fn_index) {
+               struct cl_lock *tmp;
+
+               /* refresh non-overlapped index */
+               tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index,
+                                       lock, 1, 0);
+               if (tmp != NULL) {
+                       /* Cache the first-non-overlapped index so as to skip
+                        * all pages within [index, clt_fn_index). This
+                        * is safe because if tmp lock is canceled, it will
+                        * discard these pages. */
+                       info->clt_fn_index = tmp->cll_descr.cld_end + 1;
+                       if (tmp->cll_descr.cld_end == CL_PAGE_EOF)
+                               info->clt_fn_index = CL_PAGE_EOF;
+                       cl_lock_put(env, tmp);
+               } else if (cl_page_own(env, io, page) == 0) {
+                       /* discard the page */
+                       cl_page_unmap(env, io, page);
+                       cl_page_discard(env, io, page);
+                       cl_page_disown(env, io, page);
+               } else {
+                       LASSERT(page->cp_state == CPS_FREEING);
+               }
+       }
+
+       info->clt_next_index = index + 1;
+       return CLP_GANG_OKAY;
+}
+
+static int discard_cb(const struct lu_env *env, struct cl_io *io,
+                     struct cl_page *page, void *cbdata)
+{
+       struct cl_thread_info *info = cl_env_info(env);
+       struct cl_lock *lock   = cbdata;
+
+       LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+       KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+                     !PageWriteback(cl_page_vmpage(env, page))));
+       KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+                     !PageDirty(cl_page_vmpage(env, page))));
+
+       info->clt_next_index = pgoff_at_lock(page, lock) + 1;
+       if (cl_page_own(env, io, page) == 0) {
+               /* discard the page */
+               cl_page_unmap(env, io, page);
+               cl_page_discard(env, io, page);
+               cl_page_disown(env, io, page);
+       } else {
+               LASSERT(page->cp_state == CPS_FREEING);
+       }
+
+       return CLP_GANG_OKAY;
+}
+
+/**
+ * Discard pages protected by the given lock. This function traverses the
+ * radix tree to find all covered pages and discards them. If a page is
+ * covered by another lock, it remains in the cache.
+ *
+ * If an error happens at any step, the process continues anyway (the
+ * reasoning being that lock cancellation cannot be delayed indefinitely).
+ */
+int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_thread_info *info  = cl_env_info(env);
+       struct cl_io      *io    = &info->clt_io;
+       struct cl_lock_descr  *descr = &lock->cll_descr;
+       cl_page_gang_cb_t      cb;
+       int res;
+       int result;
+
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+
+       io->ci_obj = cl_object_top(descr->cld_obj);
+       io->ci_ignore_layout = 1;
+       result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+       if (result != 0)
+               GOTO(out, result);
+
+       cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb;
+       info->clt_fn_index = info->clt_next_index = descr->cld_start;
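+       /*
+        * Each pass of cl_page_gang_lookup() below resumes at clt_next_index
+        * (advanced by the callback) until the whole [cld_start, cld_end]
+        * extent has been scanned, rescheduling whenever the gang lookup
+        * asks for it.
+        */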
+       do {
+               res = cl_page_gang_lookup(env, descr->cld_obj, io,
+                                         info->clt_next_index, descr->cld_end,
+                                         cb, (void *)lock);
+               if (info->clt_next_index > descr->cld_end)
+                       break;
+
+               if (res == CLP_GANG_RESCHED)
+                       cond_resched();
+       } while (res != CLP_GANG_OKAY);
+out:
+       cl_io_fini(env, io);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_discard_pages);
+
+/**
+ * Eliminate all locks for a given object.
+ *
+ * Caller has to guarantee that no lock is in active use.
+ *
+ * \param cancel when this is set, cl_locks_prune() cancels locks before
+ *            destroying them.
+ */
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel)
+{
+       struct cl_object_header *head;
+       struct cl_lock    *lock;
+
+       ENTRY;
+       head = cl_object_header(obj);
+       /*
+        * If locks are destroyed without cancellation, all pages must be
+        * already destroyed (as otherwise they will be left unprotected).
+        */
+       LASSERT(ergo(!cancel,
+                    head->coh_tree.rnode == NULL && head->coh_pages == 0));
+
+       spin_lock(&head->coh_lock_guard);
+       while (!list_empty(&head->coh_locks)) {
+               lock = container_of(head->coh_locks.next,
+                                   struct cl_lock, cll_linkage);
+               cl_lock_get_trust(lock);
+               spin_unlock(&head->coh_lock_guard);
+               lu_ref_add(&lock->cll_reference, "prune", current);
+
+again:
+               cl_lock_mutex_get(env, lock);
+               if (lock->cll_state < CLS_FREEING) {
+                       LASSERT(lock->cll_users <= 1);
+                       if (unlikely(lock->cll_users == 1)) {
+                               struct l_wait_info lwi = { 0 };
+
+                               cl_lock_mutex_put(env, lock);
+                               l_wait_event(lock->cll_wq,
+                                            lock->cll_users == 0,
+                                            &lwi);
+                               goto again;
+                       }
+
+                       if (cancel)
+                               cl_lock_cancel(env, lock);
+                       cl_lock_delete(env, lock);
+               }
+               cl_lock_mutex_put(env, lock);
+               lu_ref_del(&lock->cll_reference, "prune", current);
+               cl_lock_put(env, lock);
+               spin_lock(&head->coh_lock_guard);
+       }
+       spin_unlock(&head->coh_lock_guard);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_locks_prune);
+
+static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env,
+                                         const struct cl_io *io,
+                                         const struct cl_lock_descr *need,
+                                         const char *scope, const void *source)
+{
+       struct cl_lock *lock;
+
+       ENTRY;
+
+       while (1) {
+               lock = cl_lock_find(env, io, need);
+               if (IS_ERR(lock))
+                       break;
+               cl_lock_mutex_get(env, lock);
+               if (lock->cll_state < CLS_FREEING &&
+                   !(lock->cll_flags & CLF_CANCELLED)) {
+                       cl_lock_hold_mod(env, lock, +1);
+                       lu_ref_add(&lock->cll_holders, scope, source);
+                       lu_ref_add(&lock->cll_reference, scope, source);
+                       break;
+               }
+               cl_lock_mutex_put(env, lock);
+               cl_lock_put(env, lock);
+       }
+       RETURN(lock);
+}
+
+/**
+ * Returns a lock matching \a need description with a reference and a hold on
+ * it.
+ *
+ * This is much like cl_lock_find(), except that cl_lock_hold() additionally
+ * guarantees that the lock is not in the CLS_FREEING state on return.
+ */
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+                            const struct cl_lock_descr *need,
+                            const char *scope, const void *source)
+{
+       struct cl_lock *lock;
+
+       ENTRY;
+
+       lock = cl_lock_hold_mutex(env, io, need, scope, source);
+       if (!IS_ERR(lock))
+               cl_lock_mutex_put(env, lock);
+       RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_hold);
+
+/**
+ * Main high-level entry point of the cl_lock interface that finds an existing
+ * lock, or enqueues a new one, matching the given description.
+ */
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+                               const struct cl_lock_descr *need,
+                               const char *scope, const void *source)
+{
+       struct cl_lock       *lock;
+       int                rc;
+       __u32            enqflags = need->cld_enq_flags;
+
+       ENTRY;
+       do {
+               lock = cl_lock_hold_mutex(env, io, need, scope, source);
+               if (IS_ERR(lock))
+                       break;
+
+               rc = cl_enqueue_locked(env, lock, io, enqflags);
+               if (rc == 0) {
+                       if (cl_lock_fits_into(env, lock, need, io)) {
+                               if (!(enqflags & CEF_AGL)) {
+                                       cl_lock_mutex_put(env, lock);
+                                       cl_lock_lockdep_acquire(env, lock,
+                                                               enqflags);
+                                       break;
+                               }
+                               rc = 1;
+                       }
+                       cl_unuse_locked(env, lock);
+               }
+               cl_lock_trace(D_DLMTRACE, env,
+                             rc <= 0 ? "enqueue failed" : "agl succeed", lock);
+               cl_lock_hold_release(env, lock, scope, source);
+               cl_lock_mutex_put(env, lock);
+               lu_ref_del(&lock->cll_reference, scope, source);
+               cl_lock_put(env, lock);
+               if (rc > 0) {
+                       LASSERT(enqflags & CEF_AGL);
+                       lock = NULL;
+               } else if (rc != 0) {
+                       lock = ERR_PTR(rc);
+               }
+       } while (rc == 0);
+       RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_request);
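+
+/*
+ * A minimal sketch of requesting a whole-file read lock through this entry
+ * point (illustrative only; the descriptor fields are the same ones used by
+ * cl_lock_at_pgoff() above, and obj/io/caller stand for whatever object, io
+ * context and lu_ref tag the caller already has):
+ *
+ *   struct cl_lock_descr *need = &cl_env_info(env)->clt_descr;
+ *
+ *   need->cld_obj       = obj;
+ *   need->cld_mode      = CLM_READ;
+ *   need->cld_start     = 0;
+ *   need->cld_end       = CL_PAGE_EOF;
+ *   need->cld_enq_flags = 0;
+ *
+ *   lock = cl_lock_request(env, io, need, "scope", caller);
+ *   ... then cl_wait()/cl_unuse()/cl_lock_release() as sketched earlier.
+ */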
+
+/**
+ * Adds a hold to a known lock.
+ */
+void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock,
+                     const char *scope, const void *source)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_state != CLS_FREEING);
+
+       ENTRY;
+       cl_lock_hold_mod(env, lock, +1);
+       cl_lock_get(lock);
+       lu_ref_add(&lock->cll_holders, scope, source);
+       lu_ref_add(&lock->cll_reference, scope, source);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_add);
+
+/**
+ * Releases a hold and a reference on a lock on which the caller has
+ * acquired a mutex.
+ */
+void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
+                   const char *scope, const void *source)
+{
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+       cl_lock_hold_release(env, lock, scope, source);
+       lu_ref_del(&lock->cll_reference, scope, source);
+       cl_lock_put(env, lock);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_unhold);
+
+/**
+ * Releases a hold and a reference on a lock, obtained by cl_lock_hold().
+ */
+void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
+                    const char *scope, const void *source)
+{
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "release lock", lock);
+       cl_lock_mutex_get(env, lock);
+       cl_lock_hold_release(env, lock, scope, source);
+       cl_lock_mutex_put(env, lock);
+       lu_ref_del(&lock->cll_reference, scope, source);
+       cl_lock_put(env, lock);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_release);
+
+void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       ENTRY;
+       cl_lock_used_mod(env, lock, +1);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_add);
+
+void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_users > 0);
+
+       ENTRY;
+       cl_lock_used_mod(env, lock, -1);
+       if (lock->cll_users == 0)
+               wake_up_all(&lock->cll_wq);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_del);
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode)
+{
+       static const char *names[] = {
+               [CLM_PHANTOM] = "P",
+               [CLM_READ]    = "R",
+               [CLM_WRITE]   = "W",
+               [CLM_GROUP]   = "G"
+       };
+       if (0 <= mode && mode < ARRAY_SIZE(names))
+               return names[mode];
+       else
+               return "U";
+}
+EXPORT_SYMBOL(cl_lock_mode_name);
+
+/**
+ * Prints a human-readable representation of a lock description.
+ */
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+                      lu_printer_t printer,
+                      const struct cl_lock_descr *descr)
+{
+       const struct lu_fid  *fid;
+
+       fid = lu_object_fid(&descr->cld_obj->co_lu);
+       (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid));
+}
+EXPORT_SYMBOL(cl_lock_descr_print);
+
+/**
+ * Prints a human-readable representation of \a lock using \a printer.
+ */
+void cl_lock_print(const struct lu_env *env, void *cookie,
+                  lu_printer_t printer, const struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+       (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ",
+                  lock, atomic_read(&lock->cll_ref),
+                  lock->cll_state, lock->cll_error, lock->cll_holds,
+                  lock->cll_users, lock->cll_flags);
+       cl_lock_descr_print(env, cookie, printer, &lock->cll_descr);
+       (*printer)(env, cookie, " {\n");
+
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               (*printer)(env, cookie, "    %s@%p: ",
+                          slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name,
+                          slice);
+               if (slice->cls_ops->clo_print != NULL)
+                       slice->cls_ops->clo_print(env, cookie, printer, slice);
+               (*printer)(env, cookie, "\n");
+       }
+       (*printer)(env, cookie, "} lock@%p\n", lock);
+}
+EXPORT_SYMBOL(cl_lock_print);
+
+int cl_lock_init(void)
+{
+       return lu_kmem_init(cl_lock_caches);
+}
+
+void cl_lock_fini(void)
+{
+       lu_kmem_fini(cl_lock_caches);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c
new file mode 100644 (file)
index 0000000..faa9ef6
--- /dev/null
@@ -0,0 +1,1155 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Object.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/*
+ * Locking.
+ *
+ *  i_mutex
+ *      PG_locked
+ *       ->coh_page_guard
+ *       ->coh_lock_guard
+ *       ->coh_attr_guard
+ *       ->ls_guard
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+/* class_put_type() */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <linux/libcfs/libcfs_hash.h> /* for cfs_hash stuff */
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static struct kmem_cache *cl_env_kmem;
+
+/** Lock class of cl_object_header::coh_page_guard */
+static struct lock_class_key cl_page_guard_class;
+/** Lock class of cl_object_header::coh_lock_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Lock class of cl_object_header::coh_attr_guard */
+static struct lock_class_key cl_attr_guard_class;
+
+extern __u32 lu_context_tags_default;
+extern __u32 lu_session_tags_default;
+/**
+ * Initialize cl_object_header.
+ */
+int cl_object_header_init(struct cl_object_header *h)
+{
+       int result;
+
+       ENTRY;
+       result = lu_object_header_init(&h->coh_lu);
+       if (result == 0) {
+               spin_lock_init(&h->coh_page_guard);
+               spin_lock_init(&h->coh_lock_guard);
+               spin_lock_init(&h->coh_attr_guard);
+               lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class);
+               lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class);
+               lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class);
+               h->coh_pages = 0;
+               /* XXX hard coded GFP_* mask. */
+               INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC);
+               INIT_LIST_HEAD(&h->coh_locks);
+               h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8);
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_header_init);
+
+/**
+ * Finalize cl_object_header.
+ */
+void cl_object_header_fini(struct cl_object_header *h)
+{
+       LASSERT(list_empty(&h->coh_locks));
+       lu_object_header_fini(&h->coh_lu);
+}
+EXPORT_SYMBOL(cl_object_header_fini);
+
+/**
+ * Returns a cl_object with a given \a fid.
+ *
+ * Returns either a cached or a newly created object. An additional reference
+ * on the returned object is acquired.
+ *
+ * \see lu_object_find(), cl_page_find(), cl_lock_find()
+ */
+struct cl_object *cl_object_find(const struct lu_env *env,
+                                struct cl_device *cd, const struct lu_fid *fid,
+                                const struct cl_object_conf *c)
+{
+       might_sleep();
+       return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu));
+}
+EXPORT_SYMBOL(cl_object_find);
+
+/**
+ * Releases a reference on \a o.
+ *
+ * When the last reference is released, the object is returned to the cache,
+ * unless the lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in
+ * its header.
+ *
+ * \see cl_page_put(), cl_lock_put().
+ */
+void cl_object_put(const struct lu_env *env, struct cl_object *o)
+{
+       lu_object_put(env, &o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_put);
+
+/**
+ * Acquire an additional reference to the object \a o.
+ *
+ * This can only be used to acquire an _additional_ reference, i.e., the
+ * caller already has to possess at least one reference to \a o before
+ * calling this.
+ *
+ * \see cl_page_get(), cl_lock_get().
+ */
+void cl_object_get(struct cl_object *o)
+{
+       lu_object_get(&o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_get);
+
+/**
+ * Returns the top-object for a given \a o.
+ *
+ * \see cl_page_top(), cl_io_top()
+ */
+struct cl_object *cl_object_top(struct cl_object *o)
+{
+       struct cl_object_header *hdr = cl_object_header(o);
+       struct cl_object *top;
+
+       while (hdr->coh_parent != NULL)
+               hdr = hdr->coh_parent;
+
+       top = lu2cl(lu_object_top(&hdr->coh_lu));
+       CDEBUG(D_TRACE, "%p -> %p\n", o, top);
+       return top;
+}
+EXPORT_SYMBOL(cl_object_top);
+
+/**
+ * Returns pointer to the lock protecting data-attributes for the given object
+ * \a o.
+ *
+ * Data-attributes are protected by the cl_object_header::coh_attr_guard
+ * spin-lock in the top-object.
+ *
+ * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get().
+ */
+static spinlock_t *cl_object_attr_guard(struct cl_object *o)
+{
+       return &cl_object_header(cl_object_top(o))->coh_attr_guard;
+}
+
+/**
+ * Locks data-attributes.
+ *
+ * Prevents data-attributes from changing until the lock is released by
+ * cl_object_attr_unlock(). This has to be called before calls to
+ * cl_object_attr_get(), cl_object_attr_set().
+ */
+void cl_object_attr_lock(struct cl_object *o)
+{
+       spin_lock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_lock);
+
+/**
+ * Releases data-attributes lock, acquired by cl_object_attr_lock().
+ */
+void cl_object_attr_unlock(struct cl_object *o)
+{
+       spin_unlock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_unlock);
+
+/**
+ * Returns data-attributes of an object \a obj.
+ *
+ * Every layer is asked (by calling cl_object_operations::coo_attr_get())
+ * top-to-bottom to fill in parts of \a attr that this layer is responsible
+ * for.
+ */
+int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
+                      struct cl_attr *attr)
+{
+       struct lu_object_header *top;
+       int result;
+
+       LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+       ENTRY;
+
+       top = obj->co_lu.lo_header;
+       result = 0;
+       list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+               if (obj->co_ops->coo_attr_get != NULL) {
+                       result = obj->co_ops->coo_attr_get(env, obj, attr);
+                       if (result != 0) {
+                               if (result > 0)
+                                       result = 0;
+                               break;
+                       }
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_get);
+
+/**
+ * Updates data-attributes of an object \a obj.
+ *
+ * Only attributes mentioned in the validity bit-mask \a v are
+ * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom
+ * to top.
+ */
+int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
+                      const struct cl_attr *attr, unsigned v)
+{
+       struct lu_object_header *top;
+       int result;
+
+       LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+       ENTRY;
+
+       top = obj->co_lu.lo_header;
+       result = 0;
+       list_for_each_entry_reverse(obj, &top->loh_layers,
+                                       co_lu.lo_linkage) {
+               if (obj->co_ops->coo_attr_set != NULL) {
+                       result = obj->co_ops->coo_attr_set(env, obj, attr, v);
+                       if (result != 0) {
+                               if (result > 0)
+                                       result = 0;
+                               break;
+                       }
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_set);
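+
+/*
+ * The attribute calls above are only valid under the attribute guard, e.g.
+ * (a minimal sketch; "valid" stands for whatever validness bits the caller
+ * actually changed):
+ *
+ *   cl_object_attr_lock(obj);
+ *   rc = cl_object_attr_get(env, obj, attr);
+ *   if (rc == 0) {
+ *           ... modify the fields of interest in *attr ...
+ *           rc = cl_object_attr_set(env, obj, attr, valid);
+ *   }
+ *   cl_object_attr_unlock(obj);
+ */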
+
+/**
+ * Notifies layers (bottom-to-top) that a glimpse AST was received.
+ *
+ * Layers have to fill \a lvb fields with information that will be shipped
+ * back to the glimpse issuer.
+ *
+ * \see cl_lock_operations::clo_glimpse()
+ */
+int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
+                     struct ost_lvb *lvb)
+{
+       struct lu_object_header *top;
+       int result;
+
+       ENTRY;
+       top = obj->co_lu.lo_header;
+       result = 0;
+       list_for_each_entry_reverse(obj, &top->loh_layers,
+                                       co_lu.lo_linkage) {
+               if (obj->co_ops->coo_glimpse != NULL) {
+                       result = obj->co_ops->coo_glimpse(env, obj, lvb);
+                       if (result != 0)
+                               break;
+               }
+       }
+       LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top),
+                        "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+                        "ctime: "LPU64" blocks: "LPU64"\n",
+                        lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+                        lvb->lvb_ctime, lvb->lvb_blocks);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_glimpse);
+
+/**
+ * Updates a configuration of an object \a obj.
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
+               const struct cl_object_conf *conf)
+{
+       struct lu_object_header *top;
+       int result;
+
+       ENTRY;
+       top = obj->co_lu.lo_header;
+       result = 0;
+       list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+               if (obj->co_ops->coo_conf_set != NULL) {
+                       result = obj->co_ops->coo_conf_set(env, obj, conf);
+                       if (result != 0)
+                               break;
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Helper function removing all object locks and marking the object for
+ * deletion. All object pages must have been deleted at this point.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub-objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+       struct cl_object_header *hdr;
+
+       hdr = cl_object_header(obj);
+       LASSERT(hdr->coh_tree.rnode == NULL);
+       LASSERT(hdr->coh_pages == 0);
+
+       set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+       /*
+        * Destroy all locks. Object destruction (including cl_inode_fini())
+        * cannot cancel the locks, because in the case of a local client,
+        * where client and server share the same thread running
+        * prune_icache(), this can dead-lock with ldlm_cancel_handler()
+        * waiting on __wait_on_freeing_inode().
+        */
+       cl_locks_prune(env, obj, 0);
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+/**
+ * Prunes caches of pages and locks for this object.
+ */
+void cl_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+       ENTRY;
+       cl_pages_prune(env, obj);
+       cl_locks_prune(env, obj, 1);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+/**
+ * Check if the object has locks.
+ */
+int cl_object_has_locks(struct cl_object *obj)
+{
+       struct cl_object_header *head = cl_object_header(obj);
+       int has;
+
+       spin_lock(&head->coh_lock_guard);
+       has = list_empty(&head->coh_locks);
+       spin_unlock(&head->coh_lock_guard);
+
+       return (has == 0);
+}
+EXPORT_SYMBOL(cl_object_has_locks);
+
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+       int i;
+
+       cs->cs_name = name;
+       for (i = 0; i < CS_NR; i++)
+               atomic_set(&cs->cs_stats[i], 0);
+}
+
+int cache_stats_print(const struct cache_stats *cs,
+                     char *page, int count, int h)
+{
+       int nob = 0;
+       int i;
+       /*
+        *   lookup    hit    total  cached create
+        * env: ...... ...... ...... ...... ......
+        */
+       if (h) {
+               const char *names[CS_NR] = CS_NAMES;
+
+               nob += snprintf(page + nob, count - nob, "%6s", " ");
+               for (i = 0; i < CS_NR; i++)
+                       nob += snprintf(page + nob, count - nob,
+                                       "%8s", names[i]);
+               nob += snprintf(page + nob, count - nob, "\n");
+       }
+
+       nob += snprintf(page + nob, count - nob, "%5.5s:", cs->cs_name);
+       for (i = 0; i < CS_NR; i++)
+               nob += snprintf(page + nob, count - nob, "%8u",
+                               atomic_read(&cs->cs_stats[i]));
+       return nob;
+}
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()), and initialize statistical
+ * counters. Also perform global initializations on the first call.
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+       int i;
+       int result;
+
+       result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+       if (result == 0) {
+               cache_stats_init(&s->cs_pages, "pages");
+               cache_stats_init(&s->cs_locks, "locks");
+               for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+                       atomic_set(&s->cs_pages_state[i], 0);
+               for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i)
+                       atomic_set(&s->cs_locks_state[i], 0);
+       }
+       return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site. Dual to cl_site_init().
+ */
+void cl_site_fini(struct cl_site *s)
+{
+       lu_site_fini(&s->cs_lu);
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+static struct cache_stats cl_env_stats = {
+       .cs_name    = "envs",
+       .cs_stats = { ATOMIC_INIT(0), }
+};
+
+/**
+ * Outputs client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, char *page, int count)
+{
+       int nob;
+       int i;
+       static const char *pstate[] = {
+               [CPS_CACHED]  = "c",
+               [CPS_OWNED]   = "o",
+               [CPS_PAGEOUT] = "w",
+               [CPS_PAGEIN]  = "r",
+               [CPS_FREEING] = "f"
+       };
+       static const char *lstate[] = {
+               [CLS_NEW]       = "n",
+               [CLS_QUEUING]   = "q",
+               [CLS_ENQUEUED]  = "e",
+               [CLS_HELD]      = "h",
+               [CLS_INTRANSIT] = "t",
+               [CLS_CACHED]    = "c",
+               [CLS_FREEING]   = "f"
+       };
+/*
+       lookup    hit  total   busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+  env: ...... ...... ...... ...... ......
+ */
+       nob = lu_site_stats_print(&site->cs_lu, page, count);
+       nob += cache_stats_print(&site->cs_pages, page + nob, count - nob, 1);
+       nob += snprintf(page + nob, count - nob, " [");
+       for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i)
+               nob += snprintf(page + nob, count - nob, "%s: %u ",
+                               pstate[i],
+                               atomic_read(&site->cs_pages_state[i]));
+       nob += snprintf(page + nob, count - nob, "]\n");
+       nob += cache_stats_print(&site->cs_locks, page + nob, count - nob, 0);
+       nob += snprintf(page + nob, count - nob, " [");
+       for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i)
+               nob += snprintf(page + nob, count - nob, "%s: %u ",
+                               lstate[i],
+                               atomic_read(&site->cs_locks_state[i]));
+       nob += snprintf(page + nob, count - nob, "]\n");
+       nob += cache_stats_print(&cl_env_stats, page + nob, count - nob, 0);
+       nob += snprintf(page + nob, count - nob, "\n");
+       return nob;
+}
+EXPORT_SYMBOL(cl_site_stats_print);
+
+/*****************************************************************************
+ *
+ * lu_env handling on client.
+ *
+ */
+
+/**
+ * The most efficient way is to store the cl_env pointer in task-specific
+ * structures. On Linux, it is not easy to use task_struct->journal_info,
+ * because Lustre code may call into other file systems, which have their own
+ * assumptions about journal_info. Currently the following task_struct fields
+ * have been identified as usable for this purpose:
+ *  - cl_env: for liblustre.
+ *  - tux_info: only on RedHat kernels.
+ *  - ...
+ * \note As long as we use task_struct to store cl_env, we assume that once
+ * we have called into Lustre, we never call into other parts of the kernel
+ * that use those task_struct fields without explicitly exiting Lustre first.
+ *
+ * If no space in task_struct is available, a hash table is used instead.
+ * bz20044, bz22683.
+ */
+
+struct cl_env {
+       void         *ce_magic;
+       struct lu_env     ce_lu;
+       struct lu_context ce_ses;
+
+       /**
+        * This allows cl_env to be entered into cl_env_hash which implements
+        * the current thread -> client environment lookup.
+        */
+       struct hlist_node  ce_node;
+       /**
+        * Owner for the current cl_env.
+        *
+        * If LL_TASK_CL_ENV is defined, this points to the owning current,
+        * and is kept only for debugging purposes;
+        * otherwise the hash table is used, and this is the key for cfs_hash.
+        * The current thread's pid is stored here.  Note that using the
+        * thread pointer instead would lead to an unbalanced hash, because of
+        * its allocation locality, which also varies across platforms, OSes
+        * and even OS versions.
+        */
+       void         *ce_owner;
+
+       /*
+        * Linkage into global list of all client environments. Used for
+        * garbage collection.
+        */
+       struct list_head        ce_linkage;
+       /*
+        * Reference counter for this environment.
+        */
+       int            ce_ref;
+       /*
+        * Debugging field: address of the caller who made original
+        * allocation.
+        */
+       void         *ce_debug;
+};
+
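+/*
+ * CL_ENV_INC()/CL_ENV_DEC() are defined as no-ops; their counter arguments
+ * name the cl_env_stats entries above that a statistics-enabled build would
+ * presumably update.
+ */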
+#define CL_ENV_INC(counter)
+#define CL_ENV_DEC(counter)
+
+static void cl_env_init0(struct cl_env *cle, void *debug)
+{
+       LASSERT(cle->ce_ref == 0);
+       LASSERT(cle->ce_magic == &cl_env_init0);
+       LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL);
+
+       cle->ce_ref = 1;
+       cle->ce_debug = debug;
+       CL_ENV_INC(busy);
+}
+
+
+/*
+ * The implementation of using hash table to connect cl_env and thread
+ */
+
+static cfs_hash_t *cl_env_hash;
+
+static unsigned cl_env_hops_hash(cfs_hash_t *lh,
+                                const void *key, unsigned mask)
+{
+#if BITS_PER_LONG == 64
+       return cfs_hash_u64_hash((__u64)key, mask);
+#else
+       return cfs_hash_u32_hash((__u32)key, mask);
+#endif
+}
+
+static void *cl_env_hops_obj(struct hlist_node *hn)
+{
+       struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+       LASSERT(cle->ce_magic == &cl_env_init0);
+       return (void *)cle;
+}
+
+static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn)
+{
+       struct cl_env *cle = cl_env_hops_obj(hn);
+
+       LASSERT(cle->ce_owner != NULL);
+       return (key == cle->ce_owner);
+}
+
+static void cl_env_hops_noop(cfs_hash_t *hs, struct hlist_node *hn)
+{
+       struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+       LASSERT(cle->ce_magic == &cl_env_init0);
+}
+
+static cfs_hash_ops_t cl_env_hops = {
+       .hs_hash        = cl_env_hops_hash,
+       .hs_key  = cl_env_hops_obj,
+       .hs_keycmp      = cl_env_hops_keycmp,
+       .hs_object      = cl_env_hops_obj,
+       .hs_get  = cl_env_hops_noop,
+       .hs_put_locked  = cl_env_hops_noop,
+};
+
+static inline struct cl_env *cl_env_fetch(void)
+{
+       struct cl_env *cle;
+
+       cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid);
+       LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0));
+       return cle;
+}
+
+static inline void cl_env_attach(struct cl_env *cle)
+{
+       if (cle) {
+               int rc;
+
+               LASSERT(cle->ce_owner == NULL);
+               cle->ce_owner = (void *) (long) current->pid;
+               rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner,
+                                        &cle->ce_node);
+               LASSERT(rc == 0);
+       }
+}
+
+static inline void cl_env_do_detach(struct cl_env *cle)
+{
+       void *cookie;
+
+       LASSERT(cle->ce_owner == (void *) (long) current->pid);
+       cookie = cfs_hash_del(cl_env_hash, cle->ce_owner,
+                             &cle->ce_node);
+       LASSERT(cookie == cle);
+       cle->ce_owner = NULL;
+}
+
+static int cl_env_store_init(void)
+{
+       cl_env_hash = cfs_hash_create("cl_env",
+                                     HASH_CL_ENV_BITS, HASH_CL_ENV_BITS,
+                                     HASH_CL_ENV_BKT_BITS, 0,
+                                     CFS_HASH_MIN_THETA,
+                                     CFS_HASH_MAX_THETA,
+                                     &cl_env_hops,
+                                     CFS_HASH_RW_BKTLOCK);
+       return cl_env_hash != NULL ? 0 : -ENOMEM;
+}
+
+static void cl_env_store_fini(void)
+{
+       cfs_hash_putref(cl_env_hash);
+}
+
+
+static inline struct cl_env *cl_env_detach(struct cl_env *cle)
+{
+       if (cle == NULL)
+               cle = cl_env_fetch();
+
+       if (cle && cle->ce_owner)
+               cl_env_do_detach(cle);
+
+       return cle;
+}
+
+static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug)
+{
+       struct lu_env *env;
+       struct cl_env *cle;
+
+       OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, __GFP_IO);
+       if (cle != NULL) {
+               int rc;
+
+               INIT_LIST_HEAD(&cle->ce_linkage);
+               cle->ce_magic = &cl_env_init0;
+               env = &cle->ce_lu;
+               rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags);
+               if (rc == 0) {
+                       rc = lu_context_init(&cle->ce_ses,
+                                            LCT_SESSION | ses_tags);
+                       if (rc == 0) {
+                               lu_context_enter(&cle->ce_ses);
+                               env->le_ses = &cle->ce_ses;
+                               cl_env_init0(cle, debug);
+                       } else
+                               lu_env_fini(env);
+               }
+               if (rc != 0) {
+                       OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+                       env = ERR_PTR(rc);
+               } else {
+                       CL_ENV_INC(create);
+                       CL_ENV_INC(total);
+               }
+       } else
+               env = ERR_PTR(-ENOMEM);
+       return env;
+}
+
+static void cl_env_fini(struct cl_env *cle)
+{
+       CL_ENV_DEC(total);
+       lu_context_fini(&cle->ce_lu.le_ctx);
+       lu_context_fini(&cle->ce_ses);
+       OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+}
+
+static inline struct cl_env *cl_env_container(struct lu_env *env)
+{
+       return container_of(env, struct cl_env, ce_lu);
+}
+
+struct lu_env *cl_env_peek(int *refcheck)
+{
+       struct lu_env *env;
+       struct cl_env *cle;
+
+       CL_ENV_INC(lookup);
+
+       /* check that we don't go far from untrusted pointer */
+       CLASSERT(offsetof(struct cl_env, ce_magic) == 0);
+
+       env = NULL;
+       cle = cl_env_fetch();
+       if (cle != NULL) {
+               CL_ENV_INC(hit);
+               env = &cle->ce_lu;
+               *refcheck = ++cle->ce_ref;
+       }
+       CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle);
+       return env;
+}
+EXPORT_SYMBOL(cl_env_peek);
+
+/**
+ * Returns lu_env: if there already is an environment associated with the
+ * current thread, it is returned; otherwise a new environment is allocated.
+ *
+ * \param refcheck pointer to a counter used to detect environment leaks. In
+ * the usual case cl_env_get() and cl_env_put() are called in the same lexical
+ * scope and a pointer to the same integer is passed as \a refcheck. This is
+ * used to detect missed cl_env_put().
+ *
+ * \see cl_env_put()
+ */
+struct lu_env *cl_env_get(int *refcheck)
+{
+       struct lu_env *env;
+
+       env = cl_env_peek(refcheck);
+       if (env == NULL) {
+               env = cl_env_new(lu_context_tags_default,
+                                lu_session_tags_default,
+                                __builtin_return_address(0));
+
+               if (!IS_ERR(env)) {
+                       struct cl_env *cle;
+
+                       cle = cl_env_container(env);
+                       cl_env_attach(cle);
+                       *refcheck = cle->ce_ref;
+                       CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+               }
+       }
+       return env;
+}
+EXPORT_SYMBOL(cl_env_get);
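+
+/*
+ * Illustrative sketch only (hypothetical helper, not part of the original
+ * Lustre code): the typical cl_env_get()/cl_env_put() pairing, with the
+ * same refcheck integer passed to both calls so leaks can be detected.
+ */
+static inline int cl_env_usage_sketch(void)
+{
+       struct lu_env *env;
+       int refcheck;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return PTR_ERR(env);
+       /* ... use env for cl_object/cl_page/cl_io calls ... */
+       cl_env_put(env, &refcheck);
+       return 0;
+}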
+
+/**
+ * Forces an allocation of a fresh environment with given tags.
+ *
+ * \see cl_env_get()
+ */
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags)
+{
+       struct lu_env *env;
+
+       LASSERT(cl_env_peek(refcheck) == NULL);
+       env = cl_env_new(tags, tags, __builtin_return_address(0));
+       if (!IS_ERR(env)) {
+               struct cl_env *cle;
+
+               cle = cl_env_container(env);
+               *refcheck = cle->ce_ref;
+               CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+       }
+       return env;
+}
+EXPORT_SYMBOL(cl_env_alloc);
+
+static void cl_env_exit(struct cl_env *cle)
+{
+       LASSERT(cle->ce_owner == NULL);
+       lu_context_exit(&cle->ce_lu.le_ctx);
+       lu_context_exit(&cle->ce_ses);
+}
+
+/**
+ * Release an environment.
+ *
+ * Decrements the \a env reference counter. When the counter drops to 0,
+ * nothing in this thread is using the environment and it is returned to the
+ * allocation cache, or freed straight away if the cache is already large
+ * enough.
+ */
+void cl_env_put(struct lu_env *env, int *refcheck)
+{
+       struct cl_env *cle;
+
+       cle = cl_env_container(env);
+
+       LASSERT(cle->ce_ref > 0);
+       LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck));
+
+       CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+       if (--cle->ce_ref == 0) {
+               CL_ENV_DEC(busy);
+               cl_env_detach(cle);
+               cle->ce_debug = NULL;
+               cl_env_exit(cle);
+               cl_env_fini(cle);
+       }
+}
+EXPORT_SYMBOL(cl_env_put);
+
+/**
+ * Declares a point of re-entrancy.
+ *
+ * \see cl_env_reexit()
+ */
+void *cl_env_reenter(void)
+{
+       return cl_env_detach(NULL);
+}
+EXPORT_SYMBOL(cl_env_reenter);
+
+/**
+ * Exits re-entrancy.
+ */
+void cl_env_reexit(void *cookie)
+{
+       cl_env_detach(NULL);
+       cl_env_attach(cookie);
+}
+EXPORT_SYMBOL(cl_env_reexit);
+
+/**
+ * Sets up the user-supplied \a env as the current environment. This is to be
+ * used to guarantee that an environment exists even when cl_env_get() fails.
+ * It is up to the user to ensure proper concurrency control.
+ *
+ * \see cl_env_unplant()
+ */
+void cl_env_implant(struct lu_env *env, int *refcheck)
+{
+       struct cl_env *cle = cl_env_container(env);
+
+       LASSERT(cle->ce_ref > 0);
+
+       cl_env_attach(cle);
+       cl_env_get(refcheck);
+       CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+}
+EXPORT_SYMBOL(cl_env_implant);
+
+/**
+ * Detach environment installed earlier by cl_env_implant().
+ */
+void cl_env_unplant(struct lu_env *env, int *refcheck)
+{
+       struct cl_env *cle = cl_env_container(env);
+
+       LASSERT(cle->ce_ref > 1);
+
+       CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+
+       cl_env_detach(cle);
+       cl_env_put(env, refcheck);
+}
+EXPORT_SYMBOL(cl_env_unplant);
+
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
+{
+       struct lu_env *env;
+
+       nest->cen_cookie = NULL;
+       env = cl_env_peek(&nest->cen_refcheck);
+       if (env != NULL) {
+               if (!cl_io_is_going(env))
+                       return env;
+               else {
+                       cl_env_put(env, &nest->cen_refcheck);
+                       nest->cen_cookie = cl_env_reenter();
+               }
+       }
+       env = cl_env_get(&nest->cen_refcheck);
+       if (IS_ERR(env)) {
+               cl_env_reexit(nest->cen_cookie);
+               return env;
+       }
+
+       LASSERT(!cl_io_is_going(env));
+       return env;
+}
+EXPORT_SYMBOL(cl_env_nested_get);
+
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env)
+{
+       cl_env_put(env, &nest->cen_refcheck);
+       cl_env_reexit(nest->cen_cookie);
+}
+EXPORT_SYMBOL(cl_env_nested_put);
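+
+/*
+ * Illustrative sketch only (hypothetical helper, not part of the original
+ * Lustre code): obtaining a nested environment while another IO may already
+ * be using the current thread's environment, then releasing it.
+ */
+static inline void cl_env_nested_usage_sketch(void)
+{
+       struct cl_env_nest nest;
+       struct lu_env *env;
+
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               /* ... use env for the nested operation ... */
+               cl_env_nested_put(&nest, env);
+       }
+}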
+
+/**
+ * Converts struct cl_attr to struct ost_lvb.
+ *
+ * \see cl_lvb2attr
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr)
+{
+       ENTRY;
+       lvb->lvb_size   = attr->cat_size;
+       lvb->lvb_mtime  = attr->cat_mtime;
+       lvb->lvb_atime  = attr->cat_atime;
+       lvb->lvb_ctime  = attr->cat_ctime;
+       lvb->lvb_blocks = attr->cat_blocks;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_attr2lvb);
+
+/**
+ * Converts struct ost_lvb to struct cl_attr.
+ *
+ * \see cl_attr2lvb
+ */
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb)
+{
+       ENTRY;
+       attr->cat_size   = lvb->lvb_size;
+       attr->cat_mtime  = lvb->lvb_mtime;
+       attr->cat_atime  = lvb->lvb_atime;
+       attr->cat_ctime  = lvb->lvb_ctime;
+       attr->cat_blocks = lvb->lvb_blocks;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lvb2attr);
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
+ *
+ */
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+                               struct lu_device_type *ldt,
+                               struct lu_device *next)
+{
+       const char       *typename;
+       struct lu_device *d;
+
+       LASSERT(ldt != NULL);
+
+       typename = ldt->ldt_name;
+       d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL);
+       if (!IS_ERR(d)) {
+               int rc;
+
+               if (site != NULL)
+                       d->ld_site = site;
+               rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next);
+               if (rc == 0) {
+                       lu_device_get(d);
+                       lu_ref_add(&d->ld_reference,
+                                  "lu-stack", &lu_site_init);
+               } else {
+                       ldt->ldt_ops->ldto_device_free(env, d);
+                       CERROR("can't init device '%s', %d\n", typename, rc);
+                       d = ERR_PTR(rc);
+               }
+       } else
+               CERROR("Cannot allocate device: '%s'\n", typename);
+       return lu2cl_dev(d);
+}
+EXPORT_SYMBOL(cl_type_setup);
+
+/**
+ * Finalize device stack by calling lu_stack_fini().
+ */
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl)
+{
+       lu_stack_fini(env, cl2lu_dev(cl));
+}
+EXPORT_SYMBOL(cl_stack_fini);
+
+int  cl_lock_init(void);
+void cl_lock_fini(void);
+
+int  cl_page_init(void);
+void cl_page_fini(void);
+
+static struct lu_context_key cl_key;
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env)
+{
+       return lu_context_key_get(&env->le_ctx, &cl_key);
+}
+
+/* defines cl0_key_{init,fini}() */
+LU_KEY_INIT_FINI(cl0, struct cl_thread_info);
+
+static void *cl_key_init(const struct lu_context *ctx,
+                        struct lu_context_key *key)
+{
+       struct cl_thread_info *info;
+
+       info = cl0_key_init(ctx, key);
+       if (!IS_ERR(info)) {
+               int i;
+
+               for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+                       lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+       }
+       return info;
+}
+
+static void cl_key_fini(const struct lu_context *ctx,
+                       struct lu_context_key *key, void *data)
+{
+       struct cl_thread_info *info;
+       int i;
+
+       info = data;
+       for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+               lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+       cl0_key_fini(ctx, key, data);
+}
+
+static void cl_key_exit(const struct lu_context *ctx,
+                       struct lu_context_key *key, void *data)
+{
+       struct cl_thread_info *info = data;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) {
+               LASSERT(info->clt_counters[i].ctc_nr_held == 0);
+               LASSERT(info->clt_counters[i].ctc_nr_used == 0);
+               LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0);
+               LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+               lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+               lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+       }
+}
+
+static struct lu_context_key cl_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = cl_key_init,
+       .lct_fini = cl_key_fini,
+       .lct_exit = cl_key_exit
+};
+
+static struct lu_kmem_descr cl_object_caches[] = {
+       {
+               .ckd_cache = &cl_env_kmem,
+               .ckd_name  = "cl_env_kmem",
+               .ckd_size  = sizeof (struct cl_env)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+/**
+ * Global initialization of cl-data. Create kmem caches, register
+ * lu_context_key's, etc.
+ *
+ * \see cl_global_fini()
+ */
+int cl_global_init(void)
+{
+       int result;
+
+       result = cl_env_store_init();
+       if (result)
+               return result;
+
+       result = lu_kmem_init(cl_object_caches);
+       if (result)
+               goto out_store;
+
+       LU_CONTEXT_KEY_INIT(&cl_key);
+       result = lu_context_key_register(&cl_key);
+       if (result)
+               goto out_kmem;
+
+       result = cl_lock_init();
+       if (result)
+               goto out_context;
+
+       result = cl_page_init();
+       if (result)
+               goto out_lock;
+
+       return 0;
+out_lock:
+       cl_lock_fini();
+out_context:
+       lu_context_key_degister(&cl_key);
+out_kmem:
+       lu_kmem_fini(cl_object_caches);
+out_store:
+       cl_env_store_fini();
+       return result;
+}
+
+/**
+ * Finalization of global cl-data. Dual to cl_global_init().
+ */
+void cl_global_fini(void)
+{
+       cl_lock_fini();
+       cl_page_fini();
+       lu_context_key_degister(&cl_key);
+       lu_kmem_fini(cl_object_caches);
+       cl_env_store_fini();
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c
new file mode 100644 (file)
index 0000000..bb93359
--- /dev/null
@@ -0,0 +1,1605 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Page.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <linux/list.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+                           int radix);
+
+# define PASSERT(env, page, expr)                                     \
+  do {                                                             \
+         if (unlikely(!(expr))) {                                    \
+                 CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");    \
+                 LASSERT(0);                                      \
+         }                                                          \
+  } while (0)
+
+# define PINVRNT(env, page, exp) \
+       ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+
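+/*
+ * PASSERT() prints page debugging information and asserts if \a expr is
+ * false.  PINVRNT() is compiled out here: its sizeof() expressions only
+ * force the arguments to be syntactically checked without evaluating them.
+ */
+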
+/* Disable page statistic by default due to huge performance penalty. */
+#define CS_PAGE_INC(o, item)
+#define CS_PAGE_DEC(o, item)
+#define CS_PAGESTATE_INC(o, state)
+#define CS_PAGESTATE_DEC(o, state)
+
+/**
+ * Internal version of cl_page_top(); it should only be called if the page is
+ * known not to be freed, say with the page referenced, the radix tree lock
+ * held, or the page owned.
+ */
+static struct cl_page *cl_page_top_trusted(struct cl_page *page)
+{
+       while (page->cp_parent != NULL)
+               page = page->cp_parent;
+       return page;
+}
+
+/**
+ * Internal version of cl_page_get().
+ *
+ * This function can be used to obtain initial reference to previously
+ * unreferenced cached object. It can be called only if concurrent page
+ * reclamation is somehow prevented, e.g., by locking page radix-tree
+ * (cl_object_header::coh_page_guard), or by keeping a lock on a VM page,
+ * associated with \a page.
+ *
+ * Use with care! Not exported.
+ */
+static void cl_page_get_trust(struct cl_page *page)
+{
+       LASSERT(atomic_read(&page->cp_ref) > 0);
+       atomic_inc(&page->cp_ref);
+}
+
+/**
+ * Returns a slice within a page, corresponding to the given layer in the
+ * device stack.
+ *
+ * \see cl_lock_at()
+ */
+static const struct cl_page_slice *
+cl_page_at_trusted(const struct cl_page *page,
+                  const struct lu_device_type *dtype)
+{
+       const struct cl_page_slice *slice;
+       ENTRY;
+
+       page = cl_page_top_trusted((struct cl_page *)page);
+       do {
+               list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                       if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype)
+                               RETURN(slice);
+               }
+               page = page->cp_child;
+       } while (page != NULL);
+       RETURN(NULL);
+}
+
+/**
+ * Returns a page with given index in the given object, or NULL if no page is
+ * found. Acquires a reference on \a page.
+ *
+ * Locking: called under cl_object_header::coh_page_guard spin-lock.
+ */
+struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index)
+{
+       struct cl_page *page;
+
+       LASSERT(spin_is_locked(&hdr->coh_page_guard));
+
+       page = radix_tree_lookup(&hdr->coh_tree, index);
+       if (page != NULL)
+               cl_page_get_trust(page);
+       return page;
+}
+EXPORT_SYMBOL(cl_page_lookup);
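+
+/*
+ * Illustrative sketch only (hypothetical helper, not part of the original
+ * Lustre code): looking up a page by index under coh_page_guard, as
+ * required by cl_page_lookup(); the caller later drops the reference with
+ * cl_page_put().
+ */
+static inline struct cl_page *cl_page_lookup_sketch(struct cl_object *obj,
+                                                    pgoff_t index)
+{
+       struct cl_object_header *hdr = cl_object_header(obj);
+       struct cl_page *page;
+
+       spin_lock(&hdr->coh_page_guard);
+       page = cl_page_lookup(hdr, index); /* takes a reference if found */
+       spin_unlock(&hdr->coh_page_guard);
+       return page;
+}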
+
+/**
+ * Returns a list of pages by a given [start, end] of \a obj.
+ *
+ * Invokes \a cb on each page found. The lookup gives up before hogging the
+ * CPU for too long and returns CLP_GANG_RESCHED in that case; the caller
+ * should then implement retry logic.
+ *
+ * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
+ * crucial in the face of [offset, EOF] locks.
+ *
+ * At least one page is processed unless there is no covered page.
+ */
+int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_io *io, pgoff_t start, pgoff_t end,
+                       cl_page_gang_cb_t cb, void *cbdata)
+{
+       struct cl_object_header *hdr;
+       struct cl_page    *page;
+       struct cl_page   **pvec;
+       const struct cl_page_slice  *slice;
+       const struct lu_device_type *dtype;
+       pgoff_t           idx;
+       unsigned int         nr;
+       unsigned int         i;
+       unsigned int         j;
+       int                   res = CLP_GANG_OKAY;
+       int                   tree_lock = 1;
+       ENTRY;
+
+       idx = start;
+       hdr = cl_object_header(obj);
+       pvec = cl_env_info(env)->clt_pvec;
+       dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type;
+       spin_lock(&hdr->coh_page_guard);
+       while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
+                                           idx, CLT_PVEC_SIZE)) > 0) {
+               int end_of_region = 0;
+               idx = pvec[nr - 1]->cp_index + 1;
+               for (i = 0, j = 0; i < nr; ++i) {
+                       page = pvec[i];
+                       pvec[i] = NULL;
+
+                       LASSERT(page->cp_type == CPT_CACHEABLE);
+                       if (page->cp_index > end) {
+                               end_of_region = 1;
+                               break;
+                       }
+                       if (page->cp_state == CPS_FREEING)
+                               continue;
+
+                       slice = cl_page_at_trusted(page, dtype);
+                       /*
+                        * Pages of an lsm-less file have no underlying
+                        * sub-page for osc, in case of ...
+                        */
+                       PASSERT(env, page, slice != NULL);
+
+                       page = slice->cpl_page;
+                       /*
+                        * Can safely call cl_page_get_trust() under
+                        * radix-tree spin-lock.
+                        *
+                        * XXX not true, because @page is from object another
+                        * than @hdr and protected by different tree lock.
+                        */
+                       cl_page_get_trust(page);
+                       lu_ref_add_atomic(&page->cp_reference,
+                                         "gang_lookup", current);
+                       pvec[j++] = page;
+               }
+
+               /*
+                * Here a delicate locking dance is performed. Current thread
+                * holds a reference to a page, but has to own it before it
+                * can be placed into queue. Owning implies waiting, so
+                * radix-tree lock is to be released. After a wait one has to
+                * check that pages weren't truncated (cl_page_own() returns
+                * error in the latter case).
+                */
+               spin_unlock(&hdr->coh_page_guard);
+               tree_lock = 0;
+
+               for (i = 0; i < j; ++i) {
+                       page = pvec[i];
+                       if (res == CLP_GANG_OKAY)
+                               res = (*cb)(env, io, page, cbdata);
+                       lu_ref_del(&page->cp_reference,
+                                  "gang_lookup", current);
+                       cl_page_put(env, page);
+               }
+               if (nr < CLT_PVEC_SIZE || end_of_region)
+                       break;
+
+               if (res == CLP_GANG_OKAY && need_resched())
+                       res = CLP_GANG_RESCHED;
+               if (res != CLP_GANG_OKAY)
+                       break;
+
+               spin_lock(&hdr->coh_page_guard);
+               tree_lock = 1;
+       }
+       if (tree_lock)
+               spin_unlock(&hdr->coh_page_guard);
+       RETURN(res);
+}
+EXPORT_SYMBOL(cl_page_gang_lookup);
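+
+/*
+ * Illustrative sketch only (hypothetical helper, not part of the original
+ * Lustre code): a caller of cl_page_gang_lookup() retrying after the lookup
+ * gave up the CPU with CLP_GANG_RESCHED.
+ */
+static inline int cl_page_gang_lookup_sketch(const struct lu_env *env,
+                                            struct cl_object *obj,
+                                            struct cl_io *io,
+                                            pgoff_t start, pgoff_t end,
+                                            cl_page_gang_cb_t cb, void *data)
+{
+       int res;
+
+       do {
+               res = cl_page_gang_lookup(env, obj, io, start, end, cb, data);
+               if (res == CLP_GANG_RESCHED)
+                       cond_resched();
+       } while (res == CLP_GANG_RESCHED);
+       return res;
+}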
+
+static void cl_page_free(const struct lu_env *env, struct cl_page *page)
+{
+       struct cl_object *obj  = page->cp_obj;
+       int pagesize = cl_object_header(obj)->coh_page_bufsize;
+
+       PASSERT(env, page, list_empty(&page->cp_batch));
+       PASSERT(env, page, page->cp_owner == NULL);
+       PASSERT(env, page, page->cp_req == NULL);
+       PASSERT(env, page, page->cp_parent == NULL);
+       PASSERT(env, page, page->cp_state == CPS_FREEING);
+
+       ENTRY;
+       might_sleep();
+       while (!list_empty(&page->cp_layers)) {
+               struct cl_page_slice *slice;
+
+               slice = list_entry(page->cp_layers.next,
+                                      struct cl_page_slice, cpl_linkage);
+               list_del_init(page->cp_layers.next);
+               slice->cpl_ops->cpo_fini(env, slice);
+       }
+       CS_PAGE_DEC(obj, total);
+       CS_PAGESTATE_DEC(obj, page->cp_state);
+       lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page);
+       cl_object_put(env, obj);
+       lu_ref_fini(&page->cp_reference);
+       OBD_FREE(page, pagesize);
+       EXIT;
+}
+
+/**
+ * Helper function updating page state. This is the only place in the code
+ * where cl_page::cp_state field is mutated.
+ */
+static inline void cl_page_state_set_trust(struct cl_page *page,
+                                          enum cl_page_state state)
+{
+       /* bypass const. */
+       *(enum cl_page_state *)&page->cp_state = state;
+}
+
+static struct cl_page *cl_page_alloc(const struct lu_env *env,
+               struct cl_object *o, pgoff_t ind, struct page *vmpage,
+               enum cl_page_type type)
+{
+       struct cl_page    *page;
+       struct lu_object_header *head;
+
+       ENTRY;
+       OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize,
+                       __GFP_IO);
+       if (page != NULL) {
+               int result = 0;
+               atomic_set(&page->cp_ref, 1);
+               if (type == CPT_CACHEABLE) /* for radix tree */
+                       atomic_inc(&page->cp_ref);
+               page->cp_obj = o;
+               cl_object_get(o);
+               page->cp_obj_ref = lu_object_ref_add(&o->co_lu, "cl_page",
+                                                    page);
+               page->cp_index = ind;
+               cl_page_state_set_trust(page, CPS_CACHED);
+               page->cp_type = type;
+               INIT_LIST_HEAD(&page->cp_layers);
+               INIT_LIST_HEAD(&page->cp_batch);
+               INIT_LIST_HEAD(&page->cp_flight);
+               mutex_init(&page->cp_mutex);
+               lu_ref_init(&page->cp_reference);
+               head = o->co_lu.lo_header;
+               list_for_each_entry(o, &head->loh_layers,
+                                       co_lu.lo_linkage) {
+                       if (o->co_ops->coo_page_init != NULL) {
+                               result = o->co_ops->coo_page_init(env, o,
+                                                                 page, vmpage);
+                               if (result != 0) {
+                                       cl_page_delete0(env, page, 0);
+                                       cl_page_free(env, page);
+                                       page = ERR_PTR(result);
+                                       break;
+                               }
+                       }
+               }
+               if (result == 0) {
+                       CS_PAGE_INC(o, total);
+                       CS_PAGE_INC(o, create);
+                       CS_PAGESTATE_DEC(o, CPS_CACHED);
+               }
+       } else {
+               page = ERR_PTR(-ENOMEM);
+       }
+       RETURN(page);
+}
+
+/**
+ * Returns a cl_page with index \a idx at the object \a o, and associated with
+ * the VM page \a vmpage.
+ *
+ * This is the main entry point into the cl_page caching interface. First, a
+ * cache (implemented as a per-object radix tree) is consulted. If page is
+ * found there, it is returned immediately. Otherwise new page is allocated
+ * and returned. In any case, additional reference to page is acquired.
+ *
+ * \see cl_object_find(), cl_lock_find()
+ */
+static struct cl_page *cl_page_find0(const struct lu_env *env,
+                                    struct cl_object *o,
+                                    pgoff_t idx, struct page *vmpage,
+                                    enum cl_page_type type,
+                                    struct cl_page *parent)
+{
+       struct cl_page    *page = NULL;
+       struct cl_page    *ghost = NULL;
+       struct cl_object_header *hdr;
+       int err;
+
+       LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+       might_sleep();
+
+       ENTRY;
+
+       hdr = cl_object_header(o);
+       CS_PAGE_INC(o, lookup);
+
+       CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n",
+              idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
+       /* fast path. */
+       if (type == CPT_CACHEABLE) {
+               /* vmpage lock is used to protect the child/parent
+                * relationship */
+               KLASSERT(PageLocked(vmpage));
+               /*
+                * cl_vmpage_page() can be called here without any locks as
+                *
+                *     - "vmpage" is locked (which prevents ->private from
+                *       concurrent updates), and
+                *
+                *     - "o" cannot be destroyed while current thread holds a
+                *       reference on it.
+                */
+               page = cl_vmpage_page(vmpage, o);
+               PINVRNT(env, page,
+                       ergo(page != NULL,
+                            cl_page_vmpage(env, page) == vmpage &&
+                            (void *)radix_tree_lookup(&hdr->coh_tree,
+                                                      idx) == page));
+       }
+
+       if (page != NULL) {
+               CS_PAGE_INC(o, hit);
+               RETURN(page);
+       }
+
+       /* allocate and initialize cl_page */
+       page = cl_page_alloc(env, o, idx, vmpage, type);
+       if (IS_ERR(page))
+               RETURN(page);
+
+       if (type == CPT_TRANSIENT) {
+               if (parent) {
+                       LASSERT(page->cp_parent == NULL);
+                       page->cp_parent = parent;
+                       parent->cp_child = page;
+               }
+               RETURN(page);
+       }
+
+       /*
+        * XXX optimization: use radix_tree_preload() here, and change tree
+        * gfp mask to GFP_KERNEL in cl_object_header_init().
+        */
+       spin_lock(&hdr->coh_page_guard);
+       err = radix_tree_insert(&hdr->coh_tree, idx, page);
+       if (err != 0) {
+               ghost = page;
+               /*
+                * Noted by Jay: a lock on \a vmpage protects cl_page_find()
+                * from this race, but
+                *
+                *     0. it's better to have cl_page interface "locally
+                *     consistent" so that its correctness can be reasoned
+                *     about without appealing to the (obscure world of) VM
+                *     locking.
+                *
+                *     1. handling this race allows ->coh_tree to remain
+                *     consistent even when VM locking is somehow busted,
+                *     which is very useful during diagnosing and debugging.
+                */
+               page = ERR_PTR(err);
+               CL_PAGE_DEBUG(D_ERROR, env, ghost,
+                             "fail to insert into radix tree: %d\n", err);
+       } else {
+               if (parent) {
+                       LASSERT(page->cp_parent == NULL);
+                       page->cp_parent = parent;
+                       parent->cp_child = page;
+               }
+               hdr->coh_pages++;
+       }
+       spin_unlock(&hdr->coh_page_guard);
+
+       if (unlikely(ghost != NULL)) {
+               cl_page_delete0(env, ghost, 0);
+               cl_page_free(env, ghost);
+       }
+       RETURN(page);
+}
+
+struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o,
+                            pgoff_t idx, struct page *vmpage,
+                            enum cl_page_type type)
+{
+       return cl_page_find0(env, o, idx, vmpage, type, NULL);
+}
+EXPORT_SYMBOL(cl_page_find);
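+
+/*
+ * Illustrative sketch only (hypothetical helper, not part of the original
+ * Lustre code): finding (or creating) the cl_page for a VM page that the
+ * caller has already locked, and dropping the reference once done.
+ */
+static inline int cl_page_find_usage_sketch(const struct lu_env *env,
+                                           struct cl_object *obj,
+                                           pgoff_t idx, struct page *vmpage)
+{
+       struct cl_page *page;
+
+       page = cl_page_find(env, obj, idx, vmpage, CPT_CACHEABLE);
+       if (IS_ERR(page))
+               return PTR_ERR(page);
+       /* ... own, submit or inspect the page ... */
+       cl_page_put(env, page);
+       return 0;
+}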
+
+
+struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o,
+                                pgoff_t idx, struct page *vmpage,
+                                struct cl_page *parent)
+{
+       return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent);
+}
+EXPORT_SYMBOL(cl_page_find_sub);
+
+static inline int cl_page_invariant(const struct cl_page *pg)
+{
+       struct cl_object_header *header;
+       struct cl_page    *parent;
+       struct cl_page    *child;
+       struct cl_io        *owner;
+
+       /*
+        * Page invariant is protected by a VM lock.
+        */
+       LINVRNT(cl_page_is_vmlocked(NULL, pg));
+
+       header = cl_object_header(pg->cp_obj);
+       parent = pg->cp_parent;
+       child  = pg->cp_child;
+       owner  = pg->cp_owner;
+
+       return cl_page_in_use(pg) &&
+               ergo(parent != NULL, parent->cp_child == pg) &&
+               ergo(child != NULL, child->cp_parent == pg) &&
+               ergo(child != NULL, pg->cp_obj != child->cp_obj) &&
+               ergo(parent != NULL, pg->cp_obj != parent->cp_obj) &&
+               ergo(owner != NULL && parent != NULL,
+                    parent->cp_owner == pg->cp_owner->ci_parent) &&
+               ergo(owner != NULL && child != NULL,
+                    child->cp_owner->ci_parent == owner) &&
+               /*
+                * Either page is early in initialization (has neither child
+                * nor parent yet), or it is in the object radix tree.
+                */
+               ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE,
+                    (void *)radix_tree_lookup(&header->coh_tree,
+                                              pg->cp_index) == pg ||
+                    (child == NULL && parent == NULL));
+}
+
+static void cl_page_state_set0(const struct lu_env *env,
+                              struct cl_page *page, enum cl_page_state state)
+{
+       enum cl_page_state old;
+
+       /*
+        * Matrix of allowed state transitions [old][new], for sanity
+        * checking.
+        */
+       static const int allowed_transitions[CPS_NR][CPS_NR] = {
+               [CPS_CACHED] = {
+                       [CPS_CACHED]  = 0,
+                       [CPS_OWNED]   = 1, /* io finds existing cached page */
+                       [CPS_PAGEIN]  = 0,
+                       [CPS_PAGEOUT] = 1, /* write-out from the cache */
+                       [CPS_FREEING] = 1, /* eviction on the memory pressure */
+               },
+               [CPS_OWNED] = {
+                       [CPS_CACHED]  = 1, /* release to the cache */
+                       [CPS_OWNED]   = 0,
+                       [CPS_PAGEIN]  = 1, /* start read immediately */
+                       [CPS_PAGEOUT] = 1, /* start write immediately */
+                       [CPS_FREEING] = 1, /* lock invalidation or truncate */
+               },
+               [CPS_PAGEIN] = {
+                       [CPS_CACHED]  = 1, /* io completion */
+                       [CPS_OWNED]   = 0,
+                       [CPS_PAGEIN]  = 0,
+                       [CPS_PAGEOUT] = 0,
+                       [CPS_FREEING] = 0,
+               },
+               [CPS_PAGEOUT] = {
+                       [CPS_CACHED]  = 1, /* io completion */
+                       [CPS_OWNED]   = 0,
+                       [CPS_PAGEIN]  = 0,
+                       [CPS_PAGEOUT] = 0,
+                       [CPS_FREEING] = 0,
+               },
+               [CPS_FREEING] = {
+                       [CPS_CACHED]  = 0,
+                       [CPS_OWNED]   = 0,
+                       [CPS_PAGEIN]  = 0,
+                       [CPS_PAGEOUT] = 0,
+                       [CPS_FREEING] = 0,
+               }
+       };
+
+       ENTRY;
+       old = page->cp_state;
+       PASSERT(env, page, allowed_transitions[old][state]);
+       CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
+       for (; page != NULL; page = page->cp_child) {
+               PASSERT(env, page, page->cp_state == old);
+               PASSERT(env, page,
+                       equi(state == CPS_OWNED, page->cp_owner != NULL));
+
+               CS_PAGESTATE_DEC(page->cp_obj, page->cp_state);
+               CS_PAGESTATE_INC(page->cp_obj, state);
+               cl_page_state_set_trust(page, state);
+       }
+       EXIT;
+}
+
+static void cl_page_state_set(const struct lu_env *env,
+                             struct cl_page *page, enum cl_page_state state)
+{
+       cl_page_state_set0(env, page, state);
+}
+
+/**
+ * Acquires an additional reference to a page.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * page.
+ *
+ * \see cl_object_get(), cl_lock_get().
+ */
+void cl_page_get(struct cl_page *page)
+{
+       ENTRY;
+       cl_page_get_trust(page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_get);
+
+/**
+ * Releases a reference to a page.
+ *
+ * When last reference is released, page is returned to the cache, unless it
+ * is in cl_page_state::CPS_FREEING state, in which case it is immediately
+ * destroyed.
+ *
+ * \see cl_object_put(), cl_lock_put().
+ */
+void cl_page_put(const struct lu_env *env, struct cl_page *page)
+{
+       PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent);
+
+       ENTRY;
+       CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
+                      atomic_read(&page->cp_ref));
+
+       if (atomic_dec_and_test(&page->cp_ref)) {
+               LASSERT(page->cp_state == CPS_FREEING);
+
+               LASSERT(atomic_read(&page->cp_ref) == 0);
+               PASSERT(env, page, page->cp_owner == NULL);
+               PASSERT(env, page, list_empty(&page->cp_batch));
+               /*
+                * Page is no longer reachable by other threads. Tear
+                * it down.
+                */
+               cl_page_free(env, page);
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_put);
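+
+/*
+ * Illustrative sketch only (hypothetical helper, not part of the original
+ * Lustre code): taking an extra reference on a page the caller already
+ * references, e.g. before handing it to another list, and dropping it
+ * afterwards.
+ */
+static inline void cl_page_ref_sketch(const struct lu_env *env,
+                                     struct cl_page *page)
+{
+       cl_page_get(page);      /* caller must already hold a reference */
+       /* ... the page is pinned across this window ... */
+       cl_page_put(env, page);
+}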
+
+/**
+ * Returns a VM page associated with a given cl_page.
+ */
+struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page)
+{
+       const struct cl_page_slice *slice;
+
+       /*
+        * Find uppermost layer with ->cpo_vmpage() method, and return its
+        * result.
+        */
+       page = cl_page_top(page);
+       do {
+               list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                       if (slice->cpl_ops->cpo_vmpage != NULL)
+                               RETURN(slice->cpl_ops->cpo_vmpage(env, slice));
+               }
+               page = page->cp_child;
+       } while (page != NULL);
+       LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */
+}
+EXPORT_SYMBOL(cl_page_vmpage);
+
+/**
+ * Returns a cl_page associated with a VM page, and given cl_object.
+ */
+struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj)
+{
+       struct cl_page *top;
+       struct cl_page *page;
+
+       ENTRY;
+       KLASSERT(PageLocked(vmpage));
+
+       /*
+        * NOTE: absence of races and liveness of data are guaranteed by page
+        *       lock on a "vmpage". That works because object destruction has
+        *       bottom-to-top pass.
+        */
+
+       /*
+        * This loop assumes that ->private points to the top-most page. This
+        * can be rectified easily.
+        */
+       top = (struct cl_page *)vmpage->private;
+       if (top == NULL)
+               RETURN(NULL);
+
+       for (page = top; page != NULL; page = page->cp_child) {
+               if (cl_object_same(page->cp_obj, obj)) {
+                       cl_page_get_trust(page);
+                       break;
+               }
+       }
+       LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE));
+       RETURN(page);
+}
+EXPORT_SYMBOL(cl_vmpage_page);
+
+/**
+ * Returns the top-page for a given page.
+ *
+ * \see cl_object_top(), cl_io_top()
+ */
+struct cl_page *cl_page_top(struct cl_page *page)
+{
+       return cl_page_top_trusted(page);
+}
+EXPORT_SYMBOL(cl_page_top);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+                                      const struct lu_device_type *dtype)
+{
+       return cl_page_at_trusted(page, dtype);
+}
+EXPORT_SYMBOL(cl_page_at);
+
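+/*
+ * CL_PAGE_OP() yields the offset of a method in cl_page_operations.
+ * CL_PAGE_INVOKE()/CL_PAGE_INVOID() walk the page stack from the top-most
+ * page down through ->cp_child and call the slice method found at that
+ * offset for every layer that defines it; CL_PAGE_INVOID_REVERSE() starts
+ * from the bottom-most page and iterates the layers in reverse order,
+ * moving up through ->cp_parent.
+ */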
+#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname)
+
+#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...)             \
+({                                                                   \
+       const struct lu_env     *__env  = (_env);                   \
+       struct cl_page       *__page = (_page);            \
+       const struct cl_page_slice *__scan;                          \
+       int                      __result;                         \
+       ptrdiff_t                  __op   = (_op);                   \
+       int                    (*__method)_proto;                   \
+                                                                       \
+       __result = 0;                                              \
+       __page = cl_page_top(__page);                              \
+       do {                                                        \
+               list_for_each_entry(__scan, &__page->cp_layers,     \
+                                       cpl_linkage) {            \
+                       __method = *(void **)((char *)__scan->cpl_ops + \
+                                             __op);                \
+                       if (__method != NULL) {                  \
+                               __result = (*__method)(__env, __scan,   \
+                                                      ## __VA_ARGS__); \
+                               if (__result != 0)                    \
+                                       break;                    \
+                       }                                              \
+               }                                                      \
+               __page = __page->cp_child;                            \
+       } while (__page != NULL && __result == 0);                    \
+       if (__result > 0)                                              \
+               __result = 0;                                      \
+       __result;                                                      \
+})
+
+#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...)             \
+do {                                                               \
+       const struct lu_env     *__env  = (_env);                   \
+       struct cl_page       *__page = (_page);            \
+       const struct cl_page_slice *__scan;                          \
+       ptrdiff_t                  __op   = (_op);                   \
+       void                  (*__method)_proto;                    \
+                                                                       \
+       __page = cl_page_top(__page);                              \
+       do {                                                        \
+               list_for_each_entry(__scan, &__page->cp_layers,     \
+                                       cpl_linkage) {            \
+                       __method = *(void **)((char *)__scan->cpl_ops + \
+                                             __op);                \
+                       if (__method != NULL)                      \
+                               (*__method)(__env, __scan,            \
+                                           ## __VA_ARGS__);        \
+               }                                                      \
+               __page = __page->cp_child;                            \
+       } while (__page != NULL);                                      \
+} while (0)
+
+#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...)         \
+do {                                                                   \
+       const struct lu_env     *__env  = (_env);                       \
+       struct cl_page       *__page = (_page);                \
+       const struct cl_page_slice *__scan;                              \
+       ptrdiff_t                  __op   = (_op);                       \
+       void                  (*__method)_proto;                        \
+                                                                           \
+       /* get to the bottom page. */                                  \
+       while (__page->cp_child != NULL)                                    \
+               __page = __page->cp_child;                                \
+       do {                                                            \
+               list_for_each_entry_reverse(__scan, &__page->cp_layers, \
+                                               cpl_linkage) {        \
+                       __method = *(void **)((char *)__scan->cpl_ops +     \
+                                             __op);                    \
+                       if (__method != NULL)                          \
+                               (*__method)(__env, __scan,                \
+                                           ## __VA_ARGS__);            \
+               }                                                          \
+               __page = __page->cp_parent;                              \
+       } while (__page != NULL);                                          \
+} while (0)
+
+static int cl_page_invoke(const struct lu_env *env,
+                         struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+       PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+       ENTRY;
+       RETURN(CL_PAGE_INVOKE(env, page, op,
+                             (const struct lu_env *,
+                              const struct cl_page_slice *, struct cl_io *),
+                             io));
+}
+
+static void cl_page_invoid(const struct lu_env *env,
+                          struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+       PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+       ENTRY;
+       CL_PAGE_INVOID(env, page, op,
+                      (const struct lu_env *,
+                       const struct cl_page_slice *, struct cl_io *), io);
+       EXIT;
+}
+
+static void cl_page_owner_clear(struct cl_page *page)
+{
+       ENTRY;
+       for (page = cl_page_top(page); page != NULL; page = page->cp_child) {
+               if (page->cp_owner != NULL) {
+                       LASSERT(page->cp_owner->ci_owned_nr > 0);
+                       page->cp_owner->ci_owned_nr--;
+                       page->cp_owner = NULL;
+                       page->cp_task = NULL;
+               }
+       }
+       EXIT;
+}
+
+static void cl_page_owner_set(struct cl_page *page)
+{
+       ENTRY;
+       for (page = cl_page_top(page); page != NULL; page = page->cp_child) {
+               LASSERT(page->cp_owner != NULL);
+               page->cp_owner->ci_owned_nr++;
+       }
+       EXIT;
+}
+
+void cl_page_disown0(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+       enum cl_page_state state;
+
+       ENTRY;
+       state = pg->cp_state;
+       PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING);
+       PINVRNT(env, pg, cl_page_invariant(pg));
+       cl_page_owner_clear(pg);
+
+       if (state == CPS_OWNED)
+               cl_page_state_set(env, pg, CPS_CACHED);
+       /*
+        * Completion call-backs are executed in the bottom-up order, so that
+        * uppermost layer (llite), responsible for VFS/VM interaction runs
+        * last and can release locks safely.
+        */
+       CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown),
+                              (const struct lu_env *,
+                               const struct cl_page_slice *, struct cl_io *),
+                              io);
+       EXIT;
+}
+
+/**
+ * Returns true iff the page is owned by the given io.
+ */
+int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
+{
+       LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
+       ENTRY;
+       RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == io);
+}
+EXPORT_SYMBOL(cl_page_is_owned);
+
+/**
+ * Try to own a page by IO.
+ *
+ * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it
+ * into cl_page_state::CPS_OWNED state.
+ *
+ * \pre  !cl_page_is_owned(pg, io)
+ * \post result == 0 iff cl_page_is_owned(pg, io)
+ *
+ * \retval 0   success
+ *
+ * \retval -ve failure, e.g., the page was destroyed (and landed in
+ *          cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED),
+ *          or the page was owned by another thread, or is in IO.
+ *
+ * \see cl_page_disown()
+ * \see cl_page_operations::cpo_own()
+ * \see cl_page_own_try()
+ * \see cl_page_own
+ */
+static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
+                       struct cl_page *pg, int nonblock)
+{
+       int result;
+
+       PINVRNT(env, pg, !cl_page_is_owned(pg, io));
+
+       ENTRY;
+       pg = cl_page_top(pg);
+       io = cl_io_top(io);
+
+       if (pg->cp_state == CPS_FREEING) {
+               result = -ENOENT;
+       } else {
+               result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
+                                       (const struct lu_env *,
+                                        const struct cl_page_slice *,
+                                        struct cl_io *, int),
+                                       io, nonblock);
+               if (result == 0) {
+                       PASSERT(env, pg, pg->cp_owner == NULL);
+                       PASSERT(env, pg, pg->cp_req == NULL);
+                       pg->cp_owner = io;
+                       pg->cp_task  = current;
+                       cl_page_owner_set(pg);
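+                       /* Re-check the state: the page may have entered
+                        * CPS_FREEING while the layers were granting
+                        * ownership; if so, relinquish it again. */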
+                       if (pg->cp_state != CPS_FREEING) {
+                               cl_page_state_set(env, pg, CPS_OWNED);
+                       } else {
+                               cl_page_disown0(env, io, pg);
+                               result = -ENOENT;
+                       }
+               }
+       }
+       PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
+       RETURN(result);
+}
+
+/**
+ * Own a page, might be blocked.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+       return cl_page_own0(env, io, pg, 0);
+}
+EXPORT_SYMBOL(cl_page_own);
+
+/**
+ * Nonblock version of cl_page_own().
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own_try(const struct lu_env *env, struct cl_io *io,
+                   struct cl_page *pg)
+{
+       return cl_page_own0(env, io, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_own_try);
+
+
+/**
+ * Assume page ownership.
+ *
+ * Called when page is already locked by the hosting VM.
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_assume()
+ */
+void cl_page_assume(const struct lu_env *env,
+                   struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
+
+       ENTRY;
+       pg = cl_page_top(pg);
+       io = cl_io_top(io);
+
+       cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+       PASSERT(env, pg, pg->cp_owner == NULL);
+       pg->cp_owner = io;
+       pg->cp_task = current;
+       cl_page_owner_set(pg);
+       cl_page_state_set(env, pg, CPS_OWNED);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_assume);
+
+/**
+ * Releases page ownership without unlocking the page.
+ *
+ * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the
+ * underlying VM page (as VM is supposed to do this itself).
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_assume()
+ */
+void cl_page_unassume(const struct lu_env *env,
+                     struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       ENTRY;
+       pg = cl_page_top(pg);
+       io = cl_io_top(io);
+       cl_page_owner_clear(pg);
+       cl_page_state_set(env, pg, CPS_CACHED);
+       CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume),
+                              (const struct lu_env *,
+                               const struct cl_page_slice *, struct cl_io *),
+                              io);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_unassume);
+
+/**
+ * Releases page ownership.
+ *
+ * Moves page into cl_page_state::CPS_CACHED.
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+                   struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+
+       ENTRY;
+       pg = cl_page_top(pg);
+       io = cl_io_top(io);
+       cl_page_disown0(env, io, pg);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when page is to be removed from the object, e.g., as a result of
+ * truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * pages, e.g., in an error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check page invariant.
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+                           int radix)
+{
+       struct cl_page *tmp = pg;
+       ENTRY;
+
+       PASSERT(env, pg, pg == cl_page_top(pg));
+       PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+       /*
+        * Sever all ways to obtain new pointers to @pg.
+        */
+       cl_page_owner_clear(pg);
+
+       /*
+        * Unexport the page first, before freeing it, so that its content
+        * is considered invalid. We have to do this because a CPS_FREEING
+        * cl_page may not be under the protection of a cl_lock.
+        * Afterwards, if this page is found by other threads, it will be
+        * forced to be re-read.
+        */
+       cl_page_export(env, pg, 0);
+       cl_page_state_set0(env, pg, CPS_FREEING);
+
+       CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+                      (const struct lu_env *, const struct cl_page_slice *));
+
+       if (tmp->cp_type == CPT_CACHEABLE) {
+               if (!radix)
+                       /* !radix means that @pg is not yet in the radix tree,
+                        * skip removing it.
+                        */
+                       tmp = pg->cp_child;
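+               /* Remove each remaining sub-page from its object's radix
+                * tree and drop a reference on it. */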
+               for (; tmp != NULL; tmp = tmp->cp_child) {
+                       void                *value;
+                       struct cl_object_header *hdr;
+
+                       hdr = cl_object_header(tmp->cp_obj);
+                       spin_lock(&hdr->coh_page_guard);
+                       value = radix_tree_delete(&hdr->coh_tree,
+                                                 tmp->cp_index);
+                       PASSERT(env, tmp, value == tmp);
+                       PASSERT(env, tmp, hdr->coh_pages > 0);
+                       hdr->coh_pages--;
+                       spin_unlock(&hdr->coh_page_guard);
+                       cl_page_put(env, tmp);
+               }
+       }
+
+       EXIT;
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all venues through which new references to the page can be
+ * obtained:
+ *
+ *     - removes page from the radix trees,
+ *
+ *     - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * \pre  pg == cl_page_top(pg)
+ * \pre  VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_invariant(pg));
+       ENTRY;
+       cl_page_delete0(env, pg, 1);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+                 struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ *
+ * Calls cl_page_operations::cpo_export() through all layers top-to-bottom.
+ * The layer responsible for VM interaction has to mark/clear the page as
+ * up-to-date according to the \a uptodate argument.
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate)
+{
+       PINVRNT(env, pg, cl_page_invariant(pg));
+       CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export),
+                      (const struct lu_env *,
+                       const struct cl_page_slice *, int), uptodate);
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Returns true, iff \a pg is VM locked in a suitable sense by the calling
+ * thread.
+ */
+int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg)
+{
+       int result;
+       const struct cl_page_slice *slice;
+
+       ENTRY;
+       pg = cl_page_top_trusted((struct cl_page *)pg);
+       slice = container_of(pg->cp_layers.next,
+                            const struct cl_page_slice, cpl_linkage);
+       PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL);
+       /*
+        * Call ->cpo_is_vmlocked() directly instead of going through
+        * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by
+        * cl_page_invariant().
+        */
+       result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+       PASSERT(env, pg, result == -EBUSY || result == -ENODATA);
+       RETURN(result == -EBUSY);
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
+
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+       ENTRY;
+       RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN);
+}
+
+static void cl_page_io_start(const struct lu_env *env,
+                            struct cl_page *pg, enum cl_req_type crt)
+{
+       /*
+        * Page is queued for IO, change its state.
+        */
+       ENTRY;
+       cl_page_owner_clear(pg);
+       cl_page_state_set(env, pg, cl_req_type_state(crt));
+       EXIT;
+}
+
+/**
+ * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). The
+ * layer handling interactions with the VM also has to inform the VM that the
+ * page is now under transfer.
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+                struct cl_page *pg, enum cl_req_type crt)
+{
+       int result;
+
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+       PINVRNT(env, pg, crt < CRT_NR);
+
+       /*
+        * XXX this has to be called bottom-to-top, so that llite can set up
+        * PG_writeback without risking other layers deciding to skip this
+        * page.
+        */
+       if (crt >= CRT_NR)
+               return -EINVAL;
+       result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep));
+       if (result == 0)
+               cl_page_io_start(env, pg, crt);
+
+       KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
+                     equi(result == 0,
+                          PageWriteback(cl_page_vmpage(env, pg)))));
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+       return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by transfer sub-system (which is a part of osc) to notify layers
+ * that a transfer, of which this page is a part, has completed.
+ *
+ * Completion call-backs are executed in bottom-up order, so that the
+ * uppermost layer (llite), responsible for the VFS/VM interaction, runs last
+ * and can release locks safely.
+ *
+ * \pre  pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+                       struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+       struct cl_sync_io *anchor = pg->cp_sync_io;
+
+       PASSERT(env, pg, crt < CRT_NR);
+       /* cl_page::cp_req already cleared by the caller (osc_completion()) */
+       PASSERT(env, pg, pg->cp_req == NULL);
+       PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+
+       ENTRY;
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
+       if (crt == CRT_READ && ioret == 0) {
+               PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
+               pg->cp_flags |= CPF_READ_COMPLETED;
+       }
+
+       cl_page_state_set(env, pg, CPS_CACHED);
+       if (crt >= CRT_NR)
+               return;
+       CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion),
+                              (const struct lu_env *,
+                               const struct cl_page_slice *, int), ioret);
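+       /* If this page was part of a synchronous transfer, detach it from
+        * its anchor here; the waiter is notified below, once the page
+        * reference has been dropped. */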
+       if (anchor) {
+               LASSERT(cl_page_is_vmlocked(env, pg));
+               LASSERT(pg->cp_sync_io == anchor);
+               pg->cp_sync_io = NULL;
+       }
+       /*
+        * As page->cp_obj is pinned by a reference from page->cp_req, it is
+        * safe to call cl_page_put() without risking object destruction in a
+        * non-blocking context.
+        */
+       cl_page_put(env, pg);
+
+       if (anchor)
+               cl_sync_io_note(anchor, ioret);
+
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that transfer formation engine decided to yank this page from
+ * the cache and to make it a part of a transfer.
+ *
+ * \pre  pg->cp_state == CPS_CACHED
+ * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+                      enum cl_req_type crt)
+{
+       int result;
+
+       PINVRNT(env, pg, crt < CRT_NR);
+
+       ENTRY;
+       if (crt >= CRT_NR)
+               RETURN(-EINVAL);
+       result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready),
+                               (const struct lu_env *,
+                                const struct cl_page_slice *));
+       if (result == 0) {
+               PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+               cl_page_io_start(env, pg, crt);
+       }
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Notify layers that high level io decided to place this page into a cache
+ * for future transfer.
+ *
+ * The layer implementing transfer engine (osc) has to register this page in
+ * its queues.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_cache_add()
+ */
+int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
+                     struct cl_page *pg, enum cl_req_type crt)
+{
+       const struct cl_page_slice *scan;
+       int result = 0;
+
+       PINVRNT(env, pg, crt < CRT_NR);
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       ENTRY;
+
+       if (crt >= CRT_NR)
+               RETURN(-EINVAL);
+
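+       /* Unlike most operations here, iterate the layers directly so that
+        * layers without a cpo_cache_add() method are simply skipped. */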
+       list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) {
+               if (scan->cpl_ops->io[crt].cpo_cache_add == NULL)
+                       continue;
+
+               result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io);
+               if (result != 0)
+                       break;
+       }
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_cache_add);
+
+/**
+ * Called when a page is being written back at the kernel's request.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_flush()
+ */
+int cl_page_flush(const struct lu_env *env, struct cl_io *io,
+                 struct cl_page *pg)
+{
+       int result;
+
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       ENTRY;
+
+       result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush));
+
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_flush);
+
+/**
+ * Checks whether the page is protected by any extent lock in at least the
+ * required mode.
+ *
+ * \return the same as in cl_page_operations::cpo_is_under_lock() method.
+ * \see cl_page_operations::cpo_is_under_lock()
+ */
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page)
+{
+       int rc;
+
+       PINVRNT(env, page, cl_page_invariant(page));
+
+       ENTRY;
+       rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock),
+                           (const struct lu_env *,
+                            const struct cl_page_slice *, struct cl_io *),
+                           io);
+       PASSERT(env, page, rc != 0);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_is_under_lock);
+
+static int page_prune_cb(const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *page, void *cbdata)
+{
+       cl_page_own(env, io, page);
+       cl_page_unmap(env, io, page);
+       cl_page_discard(env, io, page);
+       cl_page_disown(env, io, page);
+       return CLP_GANG_OKAY;
+}
+
+/**
+ * Purges all cached pages belonging to the object \a obj.
+ */
+int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
+{
+       struct cl_thread_info   *info;
+       struct cl_object        *obj = cl_object_top(clobj);
+       struct cl_io        *io;
+       int                   result;
+
+       ENTRY;
+       info  = cl_env_info(env);
+       io    = &info->clt_io;
+
+       /*
+        * initialize the io. This is ugly since we never do IO in this
+        * function, we just make cl_page_list functions happy. -jay
+        */
+       io->ci_obj = obj;
+       io->ci_ignore_layout = 1;
+       result = cl_io_init(env, io, CIT_MISC, obj);
+       if (result != 0) {
+               cl_io_fini(env, io);
+               RETURN(io->ci_result);
+       }
+
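+       /* Repeat the gang lookup until a full pass over the object's pages
+        * completes without needing to reschedule. */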
+       do {
+               result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF,
+                                            page_prune_cb, NULL);
+               if (result == CLP_GANG_RESCHED)
+                       cond_resched();
+       } while (result != CLP_GANG_OKAY);
+
+       cl_io_fini(env, io);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_pages_prune);
+
+/**
+ * Tells transfer engine that only part of a page is to be transmitted.
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+                 int from, int to)
+{
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to);
+       CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
+                      (const struct lu_env *,
+                       const struct cl_page_slice *, int, int),
+                      from, to);
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints a human-readable representation of \a pg through \a printer.
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t printer, const struct cl_page *pg)
+{
+       (*printer)(env, cookie,
+                  "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n",
+                  pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+                  pg->cp_index, pg->cp_parent, pg->cp_child,
+                  pg->cp_state, pg->cp_error, pg->cp_type,
+                  pg->cp_owner, pg->cp_req, pg->cp_flags);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints a human-readable representation of \a pg through \a printer.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+                  lu_printer_t printer, const struct cl_page *pg)
+{
+       struct cl_page *scan;
+
+       for (scan = cl_page_top((struct cl_page *)pg);
+            scan != NULL; scan = scan->cp_child)
+               cl_page_header_print(env, cookie, printer, scan);
+       CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print),
+                      (const struct lu_env *env,
+                       const struct cl_page_slice *slice,
+                       void *cookie, lu_printer_t p), cookie, printer);
+       (*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+       return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel),
+                             (const struct lu_env *,
+                              const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_cancel);
+
+/**
+ * Converts a page index into a byte offset within object \a obj.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+       /*
+        * XXX for now.
+        */
+       return (loff_t)idx << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+       /*
+        * XXX for now.
+        */
+       return offset >> PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+int cl_page_size(const struct cl_object *obj)
+{
+       return 1 << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+                      struct cl_object *obj,
+                      const struct cl_page_operations *ops)
+{
+       ENTRY;
+       list_add_tail(&slice->cpl_linkage, &page->cp_layers);
+       slice->cpl_obj  = obj;
+       slice->cpl_ops  = ops;
+       slice->cpl_page = page;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
+int  cl_page_init(void)
+{
+       return 0;
+}
+
+void cl_page_fini(void)
+{
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
new file mode 100644 (file)
index 0000000..20d9eaf
--- /dev/null
@@ -0,0 +1,691 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+# include <asm/atomic.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_build_version.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "llog_internal.h"
+
+
+struct obd_device *obd_devs[MAX_OBD_DEVICES];
+EXPORT_SYMBOL(obd_devs);
+struct list_head obd_types;
+DEFINE_RWLOCK(obd_dev_lock);
+
+__u64 obd_max_pages = 0;
+__u64 obd_max_alloc = 0;
+DEFINE_SPINLOCK(obd_updatemax_lock);
+
+/* The following are visible and mutable through /proc/sys/lustre/. */
+unsigned int obd_alloc_fail_rate = 0;
+EXPORT_SYMBOL(obd_alloc_fail_rate);
+unsigned int obd_debug_peer_on_timeout;
+EXPORT_SYMBOL(obd_debug_peer_on_timeout);
+unsigned int obd_dump_on_timeout;
+EXPORT_SYMBOL(obd_dump_on_timeout);
+unsigned int obd_dump_on_eviction;
+EXPORT_SYMBOL(obd_dump_on_eviction);
+unsigned int obd_max_dirty_pages = 256;
+EXPORT_SYMBOL(obd_max_dirty_pages);
+atomic_t obd_unstable_pages;
+EXPORT_SYMBOL(obd_unstable_pages);
+atomic_t obd_dirty_pages;
+EXPORT_SYMBOL(obd_dirty_pages);
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
+EXPORT_SYMBOL(obd_timeout);
+unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
+EXPORT_SYMBOL(ldlm_timeout);
+unsigned int obd_timeout_set;
+EXPORT_SYMBOL(obd_timeout_set);
+unsigned int ldlm_timeout_set;
+EXPORT_SYMBOL(ldlm_timeout_set);
+/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */
+unsigned int at_min = 0;
+EXPORT_SYMBOL(at_min);
+unsigned int at_max = 600;
+EXPORT_SYMBOL(at_max);
+unsigned int at_history = 600;
+EXPORT_SYMBOL(at_history);
+int at_early_margin = 5;
+EXPORT_SYMBOL(at_early_margin);
+int at_extra = 30;
+EXPORT_SYMBOL(at_extra);
+
+atomic_t obd_dirty_transit_pages;
+EXPORT_SYMBOL(obd_dirty_transit_pages);
+
+char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
+EXPORT_SYMBOL(obd_jobid_var);
+
+/* Get the jobid of the current process by reading the environment variable
+ * stored between the "env_start" and "env_end" of the task struct.
+ *
+ * TODO:
+ * It would be better to cache the jobid for later use if there is any
+ * efficient way to do so; the cl_env code could probably be reused for
+ * this purpose.
+ *
+ * If some job scheduler doesn't store jobid in the "env_start/end",
+ * then an upcall could be issued here to get the jobid by utilizing
+ * the userspace tools/api. Then, the jobid must be cached.
+ */
+int lustre_get_jobid(char *jobid)
+{
+       int jobid_len = JOBSTATS_JOBID_SIZE;
+       int rc = 0;
+       ENTRY;
+
+       memset(jobid, 0, JOBSTATS_JOBID_SIZE);
+       /* Jobstats isn't enabled */
+       if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0)
+               RETURN(0);
+
+       /* Use process name + fsuid as jobid */
+       if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
+               snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u",
+                        current_comm(), current_fsuid());
+               RETURN(0);
+       }
+
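+       /* Otherwise obtain the jobid from the job scheduler's environment
+        * variable in the process environment. */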
+       rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len);
+       if (rc) {
+               if (rc == -EOVERFLOW) {
+                       /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
+                        * variable length strings instead of just numbers), it
+                        * might make sense to keep the unique parts for JobID,
+                        * instead of just returning an error.  That means a
+                        * larger temp buffer for cfs_get_environ(), then
+                        * truncating the string at some separator to fit into
+                        * the specified jobid_len.  Fix later if needed. */
+                       static bool printed;
+                       if (unlikely(!printed)) {
+                               LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
+                                                  "for JobID buffer (%d)\n",
+                                                  obd_jobid_var, jobid_len);
+                               printed = true;
+                       }
+               } else {
+                       CDEBUG((rc == -ENOENT || rc == -EINVAL ||
+                               rc == -EDEADLK) ? D_INFO : D_ERROR,
+                              "Get jobid for (%s) failed: rc = %d\n",
+                              obd_jobid_var, rc);
+               }
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_get_jobid);
+
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+                  size_t size, const char *file, int line)
+{
+       if (ptr == NULL ||
+           (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) {
+               CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n",
+                      ptr ? "force " :"", type, name, (__u64)size, file,
+                      line);
+               CERROR(LPU64" total bytes and "LPU64" total pages "
+                      "("LPU64" bytes) allocated by Lustre, "
+                      "%d total bytes by LNET\n",
+                      obd_memory_sum(),
+                      obd_pages_sum() << PAGE_CACHE_SHIFT,
+                      obd_pages_sum(),
+                       atomic_read(&libcfs_kmemory));
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(obd_alloc_fail);
+
+static inline void obd_data2conn(struct lustre_handle *conn,
+                                struct obd_ioctl_data *data)
+{
+       memset(conn, 0, sizeof *conn);
+       conn->cookie = data->ioc_cookie;
+}
+
+static inline void obd_conn2data(struct obd_ioctl_data *data,
+                                struct lustre_handle *conn)
+{
+       data->ioc_cookie = conn->cookie;
+}
+
+int class_resolve_dev_name(__u32 len, const char *name)
+{
+       int rc;
+       int dev;
+
+       ENTRY;
+       if (!len || !name) {
+               CERROR("No name passed!\n");
+               GOTO(out, rc = -EINVAL);
+       }
+       if (name[len - 1] != 0) {
+               CERROR("Name not nul terminated!\n");
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CDEBUG(D_IOCTL, "device name %s\n", name);
+       dev = class_name2dev(name);
+       if (dev == -1) {
+               CDEBUG(D_IOCTL, "No device for name %s!\n", name);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev);
+       rc = dev;
+
+out:
+       RETURN(rc);
+}
+
+int class_handle_ioctl(unsigned int cmd, unsigned long arg)
+{
+       char *buf = NULL;
+       struct obd_ioctl_data *data;
+       struct libcfs_debug_ioctl_data *debug_data;
+       struct obd_device *obd = NULL;
+       int err = 0, len = 0;
+       ENTRY;
+
+       /* only for debugging */
+       if (cmd == LIBCFS_IOC_DEBUG_MASK) {
+               debug_data = (struct libcfs_debug_ioctl_data*)arg;
+               libcfs_subsystem_debug = debug_data->subs;
+               libcfs_debug = debug_data->debug;
+               return 0;
+       }
+
+       CDEBUG(D_IOCTL, "cmd = %x\n", cmd);
+       if (obd_ioctl_getdata(&buf, &len, (void *)arg)) {
+               CERROR("OBD ioctl: data error\n");
+               RETURN(-EINVAL);
+       }
+       data = (struct obd_ioctl_data *)buf;
+
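+       /* The first group of commands does not require a target device;
+        * anything not handled here falls through to the device lookup
+        * below. */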
+       switch (cmd) {
+       case OBD_IOC_PROCESS_CFG: {
+               struct lustre_cfg *lcfg;
+
+               if (!data->ioc_plen1 || !data->ioc_pbuf1) {
+                       CERROR("No config buffer passed!\n");
+                       GOTO(out, err = -EINVAL);
+               }
+               OBD_ALLOC(lcfg, data->ioc_plen1);
+               if (lcfg == NULL)
+                       GOTO(out, err = -ENOMEM);
+               err = copy_from_user(lcfg, data->ioc_pbuf1,
+                                        data->ioc_plen1);
+               if (!err)
+                       err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1);
+               if (!err)
+                       err = class_process_config(lcfg);
+
+               OBD_FREE(lcfg, data->ioc_plen1);
+               GOTO(out, err);
+       }
+
+       case OBD_GET_VERSION:
+               if (!data->ioc_inlbuf1) {
+                       CERROR("No buffer passed in ioctl\n");
+                       GOTO(out, err = -EINVAL);
+               }
+
+               if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) {
+                       CERROR("ioctl buffer too small to hold version\n");
+                       GOTO(out, err = -EINVAL);
+               }
+
+               memcpy(data->ioc_bulk, BUILD_VERSION,
+                      strlen(BUILD_VERSION) + 1);
+
+               err = obd_ioctl_popdata((void *)arg, data, len);
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+
+       case OBD_IOC_NAME2DEV: {
+               /* Resolve a device name.  This does not change the
+                * currently selected device.
+                */
+               int dev;
+
+               dev = class_resolve_dev_name(data->ioc_inllen1,
+                                            data->ioc_inlbuf1);
+               data->ioc_dev = dev;
+               if (dev < 0)
+                       GOTO(out, err = -EINVAL);
+
+               err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+       }
+
+       case OBD_IOC_UUID2DEV: {
+               /* Resolve a device uuid.  This does not change the
+                * currently selected device.
+                */
+               int dev;
+               struct obd_uuid uuid;
+
+               if (!data->ioc_inllen1 || !data->ioc_inlbuf1) {
+                       CERROR("No UUID passed!\n");
+                       GOTO(out, err = -EINVAL);
+               }
+               if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
+                       CERROR("UUID not NUL terminated!\n");
+                       GOTO(out, err = -EINVAL);
+               }
+
+               CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1);
+               obd_str2uuid(&uuid, data->ioc_inlbuf1);
+               dev = class_uuid2dev(&uuid);
+               data->ioc_dev = dev;
+               if (dev == -1) {
+                       CDEBUG(D_IOCTL, "No device for UUID %s!\n",
+                              data->ioc_inlbuf1);
+                       GOTO(out, err = -EINVAL);
+               }
+
+               CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
+                      dev);
+               err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+       }
+
+       case OBD_IOC_CLOSE_UUID: {
+               CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n",
+                      data->ioc_inlbuf1);
+               GOTO(out, err = 0);
+       }
+
+       case OBD_IOC_GETDEVICE: {
+               int     index = data->ioc_count;
+               char    *status, *str;
+
+               if (!data->ioc_inlbuf1) {
+                       CERROR("No buffer passed in ioctl\n");
+                       GOTO(out, err = -EINVAL);
+               }
+               if (data->ioc_inllen1 < 128) {
+                       CERROR("ioctl buffer too small to hold device info\n");
+                       GOTO(out, err = -EINVAL);
+               }
+
+               obd = class_num2obd(index);
+               if (!obd)
+                       GOTO(out, err = -ENOENT);
+
+               if (obd->obd_stopping)
+                       status = "ST";
+               else if (obd->obd_set_up)
+                       status = "UP";
+               else if (obd->obd_attached)
+                       status = "AT";
+               else
+                       status = "--";
+               str = (char *)data->ioc_bulk;
+               snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d",
+                        (int)index, status, obd->obd_type->typ_name,
+                        obd->obd_name, obd->obd_uuid.uuid,
+                        atomic_read(&obd->obd_refcount));
+               err = obd_ioctl_popdata((void *)arg, data, len);
+
+               GOTO(out, err = 0);
+       }
+
+       }
+
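+       /* Remaining commands operate on a specific device: resolve it by
+        * name (ioc_inlbuf4) or by device number. */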
+       if (data->ioc_dev == OBD_DEV_BY_DEVNAME) {
+               if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL)
+                       GOTO(out, err = -EINVAL);
+               if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME)
+                       GOTO(out, err = -EINVAL);
+               obd = class_name2obd(data->ioc_inlbuf4);
+       } else if (data->ioc_dev < class_devno_max()) {
+               obd = class_num2obd(data->ioc_dev);
+       } else {
+               CERROR("OBD ioctl: No device\n");
+               GOTO(out, err = -EINVAL);
+       }
+
+       if (obd == NULL) {
+               CERROR("OBD ioctl : No Device %d\n", data->ioc_dev);
+               GOTO(out, err = -EINVAL);
+       }
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+
+       if (!obd->obd_set_up || obd->obd_stopping) {
+               CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev);
+               GOTO(out, err = -EINVAL);
+       }
+
+       switch(cmd) {
+       case OBD_IOC_NO_TRANSNO: {
+               if (!obd->obd_attached) {
+                       CERROR("Device %d not attached\n", obd->obd_minor);
+                       GOTO(out, err = -ENODEV);
+               }
+               CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
+                      obd->obd_name);
+               obd->obd_no_transno = 1;
+               GOTO(out, err = 0);
+       }
+
+       default: {
+               err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL);
+               if (err)
+                       GOTO(out, err);
+
+               err = obd_ioctl_popdata((void *)arg, data, len);
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+       }
+       }
+
+ out:
+       if (buf)
+               obd_ioctl_freedata(buf, len);
+       RETURN(err);
+} /* class_handle_ioctl */
+
+extern psdev_t obd_psdev;
+
+#define OBD_INIT_CHECK
+int obd_init_checks(void)
+{
+       __u64 u64val, div64val;
+       char buf[64];
+       int len, ret = 0;
+
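+       /* Sanity-check the 64-bit printf format macros (LPU64/LPD64/LPX64)
+        * and 64-bit division behaviour before the module is used. */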
+       CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64);
+
+       CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF);
+
+       u64val = OBD_OBJECT_EOF;
+       CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+       if (u64val != OBD_OBJECT_EOF) {
+               CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               ret = -EINVAL;
+       }
+       len = snprintf(buf, sizeof(buf), LPX64, u64val);
+       if (len != 18) {
+               CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+               ret = -EINVAL;
+       }
+
+       div64val = OBD_OBJECT_EOF;
+       CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+       if (u64val != OBD_OBJECT_EOF) {
+               CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               ret = -EOVERFLOW;
+       }
+       if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
+               CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               return -EOVERFLOW;
+       }
+       if (do_div(div64val, 256) != (u64val & 255)) {
+               CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255);
+               return -EOVERFLOW;
+       }
+       if (u64val >> 8 != div64val) {
+               CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n",
+                      u64val, div64val, u64val >> 8);
+               return -EOVERFLOW;
+       }
+       len = snprintf(buf, sizeof(buf), LPX64, u64val);
+       if (len != 18) {
+               CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+               ret = -EINVAL;
+       }
+       len = snprintf(buf, sizeof(buf), LPU64, u64val);
+       if (len != 20) {
+               CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len);
+               ret = -EINVAL;
+       }
+       len = snprintf(buf, sizeof(buf), LPD64, u64val);
+       if (len != 2) {
+               CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len);
+               ret = -EINVAL;
+       }
+       if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) {
+               CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val,
+                     (__u64)PAGE_CACHE_SIZE);
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+extern spinlock_t obd_types_lock;
+extern int class_procfs_init(void);
+extern int class_procfs_clean(void);
+
+static int __init init_obdclass(void)
+{
+       int i, err;
+       int lustre_register_fs(void);
+
+       for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++)
+               INIT_LIST_HEAD(&capa_list[i]);
+
+       LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n");
+
+       spin_lock_init(&obd_types_lock);
+       obd_zombie_impexp_init();
+#ifdef LPROCFS
+       obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
+                                        LPROCFS_STATS_FLAG_NONE |
+                                        LPROCFS_STATS_FLAG_IRQ_SAFE);
+       if (obd_memory == NULL) {
+               CERROR("kmalloc of 'obd_memory' failed\n");
+               RETURN(-ENOMEM);
+       }
+
+       lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
+                            LPROCFS_CNTR_AVGMINMAX,
+                            "memused", "bytes");
+       lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT,
+                            LPROCFS_CNTR_AVGMINMAX,
+                            "pagesused", "pages");
+#endif
+       err = obd_init_checks();
+       if (err == -EOVERFLOW)
+               return err;
+
+       class_init_uuidlist();
+       err = class_handle_init();
+       if (err)
+               return err;
+
+       INIT_LIST_HEAD(&obd_types);
+
+       err = misc_register(&obd_psdev);
+       if (err) {
+               CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err);
+               return err;
+       }
+
+       /* This struct is already zeroed for us (static global) */
+       for (i = 0; i < class_devno_max(); i++)
+               obd_devs[i] = NULL;
+
+       /* Default the dirty page cache cap to 1/2 of system memory.
+        * For clients with less memory, a larger fraction is needed
+        * for other purposes (mostly for BGL). */
+       if (num_physpages <= 512 << (20 - PAGE_CACHE_SHIFT))
+               obd_max_dirty_pages = num_physpages / 4;
+       else
+               obd_max_dirty_pages = num_physpages / 2;
+
+       err = obd_init_caches();
+       if (err)
+               return err;
+       err = class_procfs_init();
+       if (err)
+               return err;
+
+       err = lu_global_init();
+       if (err)
+               return err;
+
+       err = cl_global_init();
+       if (err != 0)
+               return err;
+
+
+       err = llog_info_init();
+       if (err)
+               return err;
+
+       err = lustre_register_fs();
+
+       return err;
+}
+
+void obd_update_maxusage(void)
+{
+       __u64 max1, max2;
+
+       max1 = obd_pages_sum();
+       max2 = obd_memory_sum();
+
+       spin_lock(&obd_updatemax_lock);
+       if (max1 > obd_max_pages)
+               obd_max_pages = max1;
+       if (max2 > obd_max_alloc)
+               obd_max_alloc = max2;
+       spin_unlock(&obd_updatemax_lock);
+}
+EXPORT_SYMBOL(obd_update_maxusage);
+
+#ifdef LPROCFS
+__u64 obd_memory_max(void)
+{
+       __u64 ret;
+
+       spin_lock(&obd_updatemax_lock);
+       ret = obd_max_alloc;
+       spin_unlock(&obd_updatemax_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL(obd_memory_max);
+
+__u64 obd_pages_max(void)
+{
+       __u64 ret;
+
+       spin_lock(&obd_updatemax_lock);
+       ret = obd_max_pages;
+       spin_unlock(&obd_updatemax_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL(obd_pages_max);
+#endif
+
+/* liblustre doesn't call cleanup_obdclass, apparently.  We carry on in this
+ * ifdef to the end of the file to cover module and versioning goo. */
+static void cleanup_obdclass(void)
+{
+       int i;
+       int lustre_unregister_fs(void);
+       __u64 memory_leaked, pages_leaked;
+       __u64 memory_max, pages_max;
+       ENTRY;
+
+       lustre_unregister_fs();
+
+       misc_deregister(&obd_psdev);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+               if (obd && obd->obd_set_up &&
+                   OBT(obd) && OBP(obd, detach)) {
+                       /* XXX should this call generic detach otherwise? */
+                       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+                       OBP(obd, detach)(obd);
+               }
+       }
+       llog_info_fini();
+       cl_global_fini();
+       lu_global_fini();
+
+       obd_cleanup_caches();
+       obd_sysctl_clean();
+
+       class_procfs_clean();
+
+       class_handle_cleanup();
+       class_exit_uuidlist();
+       obd_zombie_impexp_stop();
+
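+       /* Report peak usage and any memory or pages that were never freed. */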
+       memory_leaked = obd_memory_sum();
+       pages_leaked = obd_pages_sum();
+
+       memory_max = obd_memory_max();
+       pages_max = obd_pages_max();
+
+       lprocfs_free_stats(&obd_memory);
+       CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
+              "obd_memory max: "LPU64", leaked: "LPU64"\n",
+              memory_max, memory_leaked);
+       CDEBUG((pages_leaked) ? D_ERROR : D_INFO,
+              "obd_memory_pages max: "LPU64", leaked: "LPU64"\n",
+              pages_max, pages_leaked);
+
+       EXIT;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
+MODULE_LICENSE("GPL");
+
+cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass);
diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c
new file mode 100644 (file)
index 0000000..15f71bb
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/debug.c
+ *
+ * Helper routines for dumping data structs for debugging.
+ */
+
+#define DEBUG_SUBSYSTEM D_OTHER
+
+
+#include <obd_ost.h>
+#include <obd_support.h>
+#include <lustre_debug.h>
+#include <lustre_net.h>
+
+void dump_lniobuf(struct niobuf_local *nb)
+{
+       CDEBUG(D_RPCTRACE,
+              "niobuf_local: file_offset="LPD64", len=%d, page=%p, rc=%d\n",
+              nb->lnb_file_offset, nb->len, nb->page, nb->rc);
+       CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n",
+                       nb->page ? page_index(nb->page) : -1);
+}
+EXPORT_SYMBOL(dump_lniobuf);
+
+void dump_lsm(int level, struct lov_stripe_md *lsm)
+{
+       CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes "LPX64", magic 0x%08X,"
+              " stripe_size %u, stripe_count %u, refc: %d,"
+              " layout_gen %u, pool ["LOV_POOLNAMEF"]\n", lsm,
+              POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic,
+              lsm->lsm_stripe_size, lsm->lsm_stripe_count,
+              atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen,
+              lsm->lsm_pool_name);
+}
+EXPORT_SYMBOL(dump_lsm);
+
+#define LPDS sizeof(__u64)
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id)
+{
+       LASSERT(addr);
+
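+       /* Stamp little-endian copies of the offset and id at both the start
+        * and the end of the buffer; block_debug_check() verifies both. */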
+       off = cpu_to_le64 (off);
+       id = cpu_to_le64 (id);
+       memcpy(addr, (char *)&off, LPDS);
+       memcpy(addr + LPDS, (char *)&id, LPDS);
+
+       addr += len - LPDS - LPDS;
+       memcpy(addr, (char *)&off, LPDS);
+       memcpy(addr + LPDS, (char *)&id, LPDS);
+
+       return 0;
+}
+EXPORT_SYMBOL(block_debug_setup);
+
+int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id)
+{
+       __u64 ne_off;
+       int err = 0;
+
+       LASSERT(addr);
+
+       ne_off = le64_to_cpu (off);
+       id = le64_to_cpu (id);
+       if (memcmp(addr, (char *)&ne_off, LPDS)) {
+               CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != "
+                      LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
+               err = -EINVAL;
+       }
+       if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+               CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n",
+                      who, id, off, *(__u64 *)(addr + LPDS), id);
+               err = -EINVAL;
+       }
+
+       addr += end - LPDS - LPDS;
+       if (memcmp(addr, (char *)&ne_off, LPDS)) {
+               CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != "
+                      LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
+               err = -EINVAL;
+       }
+       if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+               CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != "
+                      LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id);
+               err = -EINVAL;
+       }
+
+       return err;
+}
+EXPORT_SYMBOL(block_debug_check);
+#undef LPDS
diff --git a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c
new file mode 100644 (file)
index 0000000..4303698
--- /dev/null
@@ -0,0 +1,1055 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/dt_object.c
+ *
+ * Dt Object.
+ * Generic functions from dt_object.h
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd.h>
+#include <dt_object.h>
+#include <linux/list.h>
+/* fid_be_to_cpu() */
+#include <lustre_fid.h>
+
+#include <lustre_quota.h>
+
+/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */
+LU_KEY_INIT(dt_global, struct dt_thread_info);
+LU_KEY_FINI(dt_global, struct dt_thread_info);
+
+struct lu_context_key dt_key = {
+       .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL,
+       .lct_init = dt_global_key_init,
+       .lct_fini = dt_global_key_fini
+};
+EXPORT_SYMBOL(dt_key);
+
+/* no lock is necessary to protect the list, because call-backs
+ * are added during system startup. Please refer to "struct dt_device".
+ */
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+       list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks);
+}
+EXPORT_SYMBOL(dt_txn_callback_add);
+
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+       list_del_init(&cb->dtc_linkage);
+}
+EXPORT_SYMBOL(dt_txn_callback_del);
+
+int dt_txn_hook_start(const struct lu_env *env,
+                     struct dt_device *dev, struct thandle *th)
+{
+       int rc = 0;
+       struct dt_txn_callback *cb;
+
+       if (th->th_local)
+               return 0;
+
+       list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) {
+               if (cb->dtc_txn_start == NULL ||
+                   !(cb->dtc_tag & env->le_ctx.lc_tags))
+                       continue;
+               rc = cb->dtc_txn_start(env, th, cb->dtc_cookie);
+               if (rc < 0)
+                       break;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_start);
+
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn)
+{
+       struct dt_device       *dev = txn->th_dev;
+       struct dt_txn_callback *cb;
+       int                  rc = 0;
+
+       if (txn->th_local)
+               return 0;
+
+       list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) {
+               if (cb->dtc_txn_stop == NULL ||
+                   !(cb->dtc_tag & env->le_ctx.lc_tags))
+                       continue;
+               rc = cb->dtc_txn_stop(env, txn, cb->dtc_cookie);
+               if (rc < 0)
+                       break;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_stop);
+
+void dt_txn_hook_commit(struct thandle *txn)
+{
+       struct dt_txn_callback *cb;
+
+       if (txn->th_local)
+               return;
+
+       list_for_each_entry(cb, &txn->th_dev->dd_txn_callbacks,
+                               dtc_linkage) {
+               if (cb->dtc_txn_commit)
+                       cb->dtc_txn_commit(txn, cb->dtc_cookie);
+       }
+}
+EXPORT_SYMBOL(dt_txn_hook_commit);
+
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t)
+{
+
+       INIT_LIST_HEAD(&dev->dd_txn_callbacks);
+       return lu_device_init(&dev->dd_lu_dev, t);
+}
+EXPORT_SYMBOL(dt_device_init);
+
+void dt_device_fini(struct dt_device *dev)
+{
+       lu_device_fini(&dev->dd_lu_dev);
+}
+EXPORT_SYMBOL(dt_device_fini);
+
+int dt_object_init(struct dt_object *obj,
+                  struct lu_object_header *h, struct lu_device *d)
+
+{
+       return lu_object_init(&obj->do_lu, h, d);
+}
+EXPORT_SYMBOL(dt_object_init);
+
+void dt_object_fini(struct dt_object *obj)
+{
+       lu_object_fini(&obj->do_lu);
+}
+EXPORT_SYMBOL(dt_object_fini);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj)
+{
+       if (obj->do_index_ops == NULL)
+               obj->do_ops->do_index_try(env, obj, &dt_directory_features);
+       return obj->do_index_ops != NULL;
+}
+EXPORT_SYMBOL(dt_try_as_dir);
+
+enum dt_format_type dt_mode_to_dft(__u32 mode)
+{
+       enum dt_format_type result;
+
+       switch (mode & S_IFMT) {
+       case S_IFDIR:
+               result = DFT_DIR;
+               break;
+       case S_IFREG:
+               result = DFT_REGULAR;
+               break;
+       case S_IFLNK:
+               result = DFT_SYM;
+               break;
+       case S_IFCHR:
+       case S_IFBLK:
+       case S_IFIFO:
+       case S_IFSOCK:
+               result = DFT_NODE;
+               break;
+       default:
+               LBUG();
+               break;
+       }
+       return result;
+}
+EXPORT_SYMBOL(dt_mode_to_dft);
+
+/**
+ * Lookup the fid of the object named \a name in directory \a dir.
+ */
+
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+                 const char *name, struct lu_fid *fid)
+{
+       if (dt_try_as_dir(env, dir))
+               return dt_lookup(env, dir, (struct dt_rec *)fid,
+                                (const struct dt_key *)name, BYPASS_CAPA);
+       return -ENOTDIR;
+}
+EXPORT_SYMBOL(dt_lookup_dir);
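+
+/*
+ * Usage sketch for dt_lookup_dir() (illustrative only; the directory
+ * object, device and entry name below are hypothetical):
+ *
+ *      struct lu_fid fid;
+ *      int rc = dt_lookup_dir(env, dir, "child_name", &fid);
+ *
+ *      if (rc == 0)
+ *              child = dt_locate(env, dev, &fid);
+ */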
+
+/* like dt_locate(), but the top-level device is passed in explicitly
+ * as \a top_dev instead of being taken from the lu_site */
+struct dt_object *dt_locate_at(const struct lu_env *env,
+                              struct dt_device *dev, const struct lu_fid *fid,
+                              struct lu_device *top_dev)
+{
+       struct lu_object *lo, *n;
+       ENTRY;
+
+       lo = lu_object_find_at(env, top_dev, fid, NULL);
+       if (IS_ERR(lo))
+               return (void *)lo;
+
+       LASSERT(lo != NULL);
+
+       list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) {
+               if (n->lo_dev == &dev->dd_lu_dev)
+                       return container_of0(n, struct dt_object, do_lu);
+       }
+       return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(dt_locate_at);
+
+/**
+ * Find an object named \a entry in the given \a dfh->dfh_o directory.
+ */
+static int dt_find_entry(const struct lu_env *env, const char *entry, void *data)
+{
+       struct dt_find_hint  *dfh = data;
+       struct dt_device     *dt = dfh->dfh_dt;
+       struct lu_fid   *fid = dfh->dfh_fid;
+       struct dt_object     *obj = dfh->dfh_o;
+       int                result;
+
+       result = dt_lookup_dir(env, obj, entry, fid);
+       lu_object_put(env, &obj->do_lu);
+       if (result == 0) {
+               obj = dt_locate(env, dt, fid);
+               if (IS_ERR(obj))
+                       result = PTR_ERR(obj);
+       }
+       dfh->dfh_o = obj;
+       return result;
+}
+
+/**
+ * Abstract function that parses a path name, feeding each path
+ * component to \a entry_func.
+ */
+int dt_path_parser(const struct lu_env *env,
+                  char *path, dt_entry_func_t entry_func,
+                  void *data)
+{
+       char *e;
+       int rc = 0;
+
+       while (1) {
+               e = strsep(&path, "/");
+               if (e == NULL)
+                       break;
+
+               if (e[0] == 0) {
+                       if (!path || path[0] == '\0')
+                               break;
+                       continue;
+               }
+               rc = entry_func(env, e, data);
+               if (rc)
+                       break;
+       }
+
+       return rc;
+}
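+
+/*
+ * Usage sketch for dt_path_parser() (illustrative only; the path and the
+ * callback are hypothetical). The path buffer must be writable, since
+ * strsep() modifies it:
+ *
+ *      char buf[] = "dir1/dir2/file";
+ *      rc = dt_path_parser(env, buf, my_entry_func, &my_data);
+ *
+ * my_entry_func() is then called once per component ("dir1", "dir2",
+ * "file") and parsing stops at the first non-zero return.
+ */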
+
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+                const char *path, struct lu_fid *fid)
+{
+       struct dt_thread_info *info = dt_info(env);
+       struct dt_find_hint   *dfh = &info->dti_dfh;
+       struct dt_object      *obj;
+       char                  *local = info->dti_buf;
+       int                    result;
+
+       dfh->dfh_dt = dt;
+       dfh->dfh_fid = fid;
+
+       strncpy(local, path, DT_MAX_PATH);
+       local[DT_MAX_PATH - 1] = '\0';
+
+       result = dt->dd_ops->dt_root_get(env, dt, fid);
+       if (result == 0) {
+               obj = dt_locate(env, dt, fid);
+               if (!IS_ERR(obj)) {
+                       dfh->dfh_o = obj;
+                       result = dt_path_parser(env, local, dt_find_entry, dfh);
+                       if (result != 0)
+                               obj = ERR_PTR(result);
+                       else
+                               obj = dfh->dfh_o;
+               }
+       } else {
+               obj = ERR_PTR(result);
+       }
+       return obj;
+}
+EXPORT_SYMBOL(dt_store_resolve);
+
+static struct dt_object *dt_reg_open(const struct lu_env *env,
+                                    struct dt_device *dt,
+                                    struct dt_object *p,
+                                    const char *name,
+                                    struct lu_fid *fid)
+{
+       struct dt_object *o;
+       int result;
+
+       result = dt_lookup_dir(env, p, name, fid);
+       if (result == 0)
+               o = dt_locate(env, dt, fid);
+       else
+               o = ERR_PTR(result);
+
+       return o;
+}
+
+/**
+ * Open dt object named \a filename from \a dirname directory.
+ *      \param  dt      dt device
+ *      \param  fid     on success, object fid is stored in *fid
+ */
+struct dt_object *dt_store_open(const struct lu_env *env,
+                               struct dt_device *dt,
+                               const char *dirname,
+                               const char *filename,
+                               struct lu_fid *fid)
+{
+       struct dt_object *file;
+       struct dt_object *dir;
+
+       dir = dt_store_resolve(env, dt, dirname, fid);
+       if (!IS_ERR(dir)) {
+               file = dt_reg_open(env, dt, dir,
+                                  filename, fid);
+               lu_object_put(env, &dir->do_lu);
+       } else {
+               file = dir;
+       }
+       return file;
+}
+EXPORT_SYMBOL(dt_store_open);
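+
+/*
+ * Usage sketch (illustrative only; directory and file names are
+ * hypothetical). dt_store_open() resolves \a dirname through
+ * dt_store_resolve(), looks up \a filename in it and drops the directory
+ * reference before returning:
+ *
+ *      struct lu_fid fid;
+ *      struct dt_object *o = dt_store_open(env, dt, "somedir",
+ *                                          "somefile", &fid);
+ *      if (!IS_ERR(o)) {
+ *              ...
+ *              lu_object_put(env, &o->do_lu);
+ *      }
+ */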
+
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+                                   struct dt_device *dt,
+                                   const struct lu_fid *fid,
+                                   struct dt_object_format *dof,
+                                   struct lu_attr *at)
+{
+       struct dt_object *dto;
+       struct thandle *th;
+       int rc;
+
+       ENTRY;
+
+       dto = dt_locate(env, dt, fid);
+       if (IS_ERR(dto))
+               RETURN(dto);
+
+       LASSERT(dto != NULL);
+       if (dt_object_exists(dto))
+               RETURN(dto);
+
+       th = dt_trans_create(env, dt);
+       if (IS_ERR(th))
+               GOTO(out, rc = PTR_ERR(th));
+
+       rc = dt_declare_create(env, dto, at, NULL, dof, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       rc = dt_trans_start_local(env, dt, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       dt_write_lock(env, dto, 0);
+       if (dt_object_exists(dto))
+               GOTO(unlock, rc = 0);
+
+       CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid));
+
+       rc = dt_create(env, dto, at, NULL, dof, th);
+       if (rc)
+               GOTO(unlock, rc);
+       LASSERT(dt_object_exists(dto));
+unlock:
+       dt_write_unlock(env, dto);
+trans_stop:
+       dt_trans_stop(env, dt, th);
+out:
+       if (rc) {
+               lu_object_put(env, &dto->do_lu);
+               RETURN(ERR_PTR(rc));
+       }
+       RETURN(dto);
+}
+EXPORT_SYMBOL(dt_find_or_create);
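+
+/*
+ * Usage sketch (illustrative only; the attribute and format values are
+ * hypothetical). If the object does not exist yet it is created in a
+ * local transaction, and a reference is returned either way:
+ *
+ *      struct dt_object *o = dt_find_or_create(env, dt, &fid, &dof, &attr);
+ *
+ *      if (!IS_ERR(o)) {
+ *              ...
+ *              lu_object_put(env, &o->do_lu);
+ *      }
+ */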
+
+/* dt class init function. */
+int dt_global_init(void)
+{
+       int result;
+
+       LU_CONTEXT_KEY_INIT(&dt_key);
+       result = lu_context_key_register(&dt_key);
+       return result;
+}
+
+void dt_global_fini(void)
+{
+       lu_context_key_degister(&dt_key);
+}
+
+/**
+ * Generic read helper. Returns the number of bytes actually read;
+ * unlike dt_record_read(), a partial read is not treated as an error.
+ *
+ * \param env  lustre environment
+ * \param dt   object to be read
+ * \param buf  lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval real size of data read
+ * \retval -ve errno on failure
+ */
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+           struct lu_buf *buf, loff_t *pos)
+{
+       LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+       return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+}
+EXPORT_SYMBOL(dt_read);
+
+/**
+ * Read structures of fixed size from storage.  Unlike dt_read(), using
+ * dt_record_read() will return an error for partial reads.
+ *
+ * \param env  lustre environment
+ * \param dt   object to be read
+ * \param buf  lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval 0 on successfully reading full buffer
+ * \retval -EFAULT on short read
+ * \retval -ve errno on failure
+ */
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+                  struct lu_buf *buf, loff_t *pos)
+{
+       int rc;
+
+       LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+
+       rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+
+       if (rc == buf->lb_len)
+               rc = 0;
+       else if (rc >= 0)
+               rc = -EFAULT;
+       return rc;
+}
+EXPORT_SYMBOL(dt_record_read);
+
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+                   const struct lu_buf *buf, loff_t *pos, struct thandle *th)
+{
+       int rc;
+
+       LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+       LASSERT(th != NULL);
+       LASSERT(dt->do_body_ops);
+       LASSERT(dt->do_body_ops->dbo_write);
+       rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1);
+       if (rc == buf->lb_len)
+               rc = 0;
+       else if (rc >= 0)
+               rc = -EFAULT;
+       return rc;
+}
+EXPORT_SYMBOL(dt_record_write);
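+
+/*
+ * Sketch of a fixed-size record round trip using the helpers above
+ * (illustrative only; the record variable and transaction setup are
+ * hypothetical and elided):
+ *
+ *      struct lu_buf buf = { .lb_buf = &rec, .lb_len = sizeof(rec) };
+ *      loff_t pos = 0;
+ *
+ *      rc = dt_record_write(env, o, &buf, &pos, th);
+ *      ...
+ *      pos = 0;
+ *      rc = dt_record_read(env, o, &buf, &pos);
+ *
+ * Both helpers return 0 only if the full sizeof(rec) bytes were
+ * transferred.
+ */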
+
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+                          struct thandle *th)
+{
+       struct lu_buf vbuf;
+       char *xname = XATTR_NAME_VERSION;
+
+       LASSERT(o);
+       vbuf.lb_buf = NULL;
+       vbuf.lb_len = sizeof(dt_obj_version_t);
+       return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th);
+}
+EXPORT_SYMBOL(dt_declare_version_set);
+
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+                   dt_obj_version_t version, struct thandle *th)
+{
+       struct lu_buf vbuf;
+       char *xname = XATTR_NAME_VERSION;
+       int rc;
+
+       LASSERT(o);
+       vbuf.lb_buf = &version;
+       vbuf.lb_len = sizeof(version);
+
+       rc = dt_xattr_set(env, o, &vbuf, xname, 0, th, BYPASS_CAPA);
+       if (rc < 0)
+               CDEBUG(D_INODE, "Can't set version, rc %d\n", rc);
+       return;
+}
+EXPORT_SYMBOL(dt_version_set);
+
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o)
+{
+       struct lu_buf vbuf;
+       char *xname = XATTR_NAME_VERSION;
+       dt_obj_version_t version;
+       int rc;
+
+       LASSERT(o);
+       vbuf.lb_buf = &version;
+       vbuf.lb_len = sizeof(version);
+       rc = dt_xattr_get(env, o, &vbuf, xname, BYPASS_CAPA);
+       if (rc != sizeof(version)) {
+               CDEBUG(D_INODE, "Can't get version, rc %d\n", rc);
+               version = 0;
+       }
+       return version;
+}
+EXPORT_SYMBOL(dt_version_get);
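+
+/*
+ * The object version is kept in the XATTR_NAME_VERSION extended
+ * attribute; a minimal sketch of the set/get pair (illustrative only,
+ * the value and transaction handle are hypothetical):
+ *
+ *      dt_version_set(env, o, new_version, th);
+ *      ...
+ *      version = dt_version_get(env, o);
+ *
+ * dt_version_get() returns 0 if the attribute cannot be read.
+ */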
+
+/* list of all supported index types */
+
+/* directories */
+const struct dt_index_features dt_directory_features;
+EXPORT_SYMBOL(dt_directory_features);
+
+/* scrub iterator */
+const struct dt_index_features dt_otable_features;
+EXPORT_SYMBOL(dt_otable_features);
+
+/* lfsck */
+const struct dt_index_features dt_lfsck_features = {
+       .dif_flags              = DT_IND_UPDATE,
+       .dif_keysize_min        = sizeof(struct lu_fid),
+       .dif_keysize_max        = sizeof(struct lu_fid),
+       .dif_recsize_min        = sizeof(__u8),
+       .dif_recsize_max        = sizeof(__u8),
+       .dif_ptrsize            = 4
+};
+EXPORT_SYMBOL(dt_lfsck_features);
+
+/* accounting indexes */
+const struct dt_index_features dt_acct_features = {
+       .dif_flags              = DT_IND_UPDATE,
+       .dif_keysize_min        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_keysize_max        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_recsize_min        = sizeof(struct lquota_acct_rec), /* 16 bytes */
+       .dif_recsize_max        = sizeof(struct lquota_acct_rec), /* 16 bytes */
+       .dif_ptrsize            = 4
+};
+EXPORT_SYMBOL(dt_acct_features);
+
+/* global quota files */
+const struct dt_index_features dt_quota_glb_features = {
+       .dif_flags              = DT_IND_UPDATE,
+       /* a different key would have to be used for per-directory quota */
+       .dif_keysize_min        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_keysize_max        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_recsize_min        = sizeof(struct lquota_glb_rec), /* 32 bytes */
+       .dif_recsize_max        = sizeof(struct lquota_glb_rec), /* 32 bytes */
+       .dif_ptrsize            = 4
+};
+EXPORT_SYMBOL(dt_quota_glb_features);
+
+/* slave quota files */
+const struct dt_index_features dt_quota_slv_features = {
+       .dif_flags              = DT_IND_UPDATE,
+       /* a different key would have to be used for per-directory quota */
+       .dif_keysize_min        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_keysize_max        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_recsize_min        = sizeof(struct lquota_slv_rec), /* 8 bytes */
+       .dif_recsize_max        = sizeof(struct lquota_slv_rec), /* 8 bytes */
+       .dif_ptrsize            = 4
+};
+EXPORT_SYMBOL(dt_quota_slv_features);
+
+/* helper function returning the dt_index_features structure that should be
+ * used for a given FID sequence. This is used by the OBD_IDX_READ RPC */
+static inline const struct dt_index_features *dt_index_feat_select(__u64 seq,
+                                                                  __u32 mode)
+{
+       if (seq == FID_SEQ_QUOTA_GLB) {
+               /* global quota index */
+               if (!S_ISREG(mode))
+                       /* global quota index should be a regular file */
+                       return ERR_PTR(-ENOENT);
+               return &dt_quota_glb_features;
+       } else if (seq == FID_SEQ_QUOTA) {
+               /* quota slave index */
+               if (!S_ISREG(mode))
+                       /* slave index should be a regular file */
+                       return ERR_PTR(-ENOENT);
+               return &dt_quota_slv_features;
+       } else if (seq >= FID_SEQ_NORMAL) {
+               /* object is part of the namespace, verify that it is a
+                * directory */
+               if (!S_ISDIR(mode))
+                       /* sorry, we can only deal with directories */
+                       return ERR_PTR(-ENOTDIR);
+               return &dt_directory_features;
+       }
+
+       return ERR_PTR(-EOPNOTSUPP);
+}
+
+/*
+ * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ
+ * RPC
+ *
+ * \param env - is the environment passed by the caller
+ * \param lp  - is a pointer to the lu_page to fill
+ * \param nob - is the maximum number of bytes that should be copied
+ * \param iops - is the index operation vector associated with the index object
+ * \param it   - is a pointer to the current iterator
+ * \param attr - is the index attribute to pass to iops->rec()
+ * \param arg  - is a pointer to the idx_info structure
+ */
+static int dt_index_page_build(const struct lu_env *env, union lu_page *lp,
+                              int nob, const struct dt_it_ops *iops,
+                              struct dt_it *it, __u32 attr, void *arg)
+{
+       struct idx_info         *ii = (struct idx_info *)arg;
+       struct lu_idxpage       *lip = &lp->lp_idx;
+       char                    *entry;
+       int                      rc, size;
+       ENTRY;
+
+       /* no support for variable key & record size for now */
+       LASSERT((ii->ii_flags & II_FL_VARKEY) == 0);
+       LASSERT((ii->ii_flags & II_FL_VARREC) == 0);
+
+       /* initialize the header of the new container */
+       memset(lip, 0, LIP_HDR_SIZE);
+       lip->lip_magic = LIP_MAGIC;
+       nob        -= LIP_HDR_SIZE;
+
+       /* compute size needed to store a key/record pair */
+       size = ii->ii_recsize + ii->ii_keysize;
+       if ((ii->ii_flags & II_FL_NOHASH) == 0)
+               /* add hash if the client wants it */
+               size += sizeof(__u64);
+
+       entry = lip->lip_entries;
+       do {
+               char            *tmp_entry = entry;
+               struct dt_key   *key;
+               __u64            hash;
+
+               /* fetch 64-bit hash value */
+               hash = iops->store(env, it);
+               ii->ii_hash_end = hash;
+
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) {
+                       if (lip->lip_nr != 0)
+                               GOTO(out, rc = 0);
+               }
+
+               if (nob < size) {
+                       if (lip->lip_nr == 0)
+                               GOTO(out, rc = -EINVAL);
+                       GOTO(out, rc = 0);
+               }
+
+               if ((ii->ii_flags & II_FL_NOHASH) == 0) {
+                       /* client wants the 64-bit hash value associated with
+                        * each record */
+                       memcpy(tmp_entry, &hash, sizeof(hash));
+                       tmp_entry += sizeof(hash);
+               }
+
+               /* then the key value */
+               LASSERT(iops->key_size(env, it) == ii->ii_keysize);
+               key = iops->key(env, it);
+               memcpy(tmp_entry, key, ii->ii_keysize);
+               tmp_entry += ii->ii_keysize;
+
+               /* and finally the record */
+               rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr);
+               if (rc != -ESTALE) {
+                       if (rc != 0)
+                               GOTO(out, rc);
+
+                       /* hash/key/record successfully copied! */
+                       lip->lip_nr++;
+                       if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0))
+                               ii->ii_hash_start = hash;
+                       entry = tmp_entry + ii->ii_recsize;
+                       nob -= size;
+               }
+
+               /* move on to the next record */
+               do {
+                       rc = iops->next(env, it);
+               } while (rc == -ESTALE);
+
+       } while (rc == 0);
+
+       GOTO(out, rc);
+out:
+       if (rc >= 0 && lip->lip_nr > 0)
+               /* one more container */
+               ii->ii_count++;
+       if (rc > 0)
+               /* no more entries */
+               ii->ii_hash_end = II_END_OFF;
+       return rc;
+}
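+
+/*
+ * Layout of each entry written into lip_entries above (fixed-size keys
+ * and records only; the leading hash is omitted when the client set
+ * II_FL_NOHASH):
+ *
+ *      [64-bit hash][key (ii_keysize bytes)][record (ii_recsize bytes)]
+ */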
+
+/*
+ * Walk index and fill lu_page containers with key/record pairs
+ *
+ * \param env - is the environment passed by the caller
+ * \param obj - is the index object to parse
+ * \param rdpg - is the lu_rdpg descriptor associated with the transfer
+ * \param filler - is the callback function responsible for filling a lu_page
+ *              with key/record pairs in the format wanted by the caller
+ * \param arg    - is an opaque argument passed to the filler function
+ *
+ * \retval sum (in bytes) of all filled lu_pages
+ * \retval -ve errno on failure
+ */
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+                 const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+                 void *arg)
+{
+       struct dt_it            *it;
+       const struct dt_it_ops  *iops;
+       unsigned int             pageidx, nob, nlupgs = 0;
+       int                      rc;
+       ENTRY;
+
+       LASSERT(rdpg->rp_pages != NULL);
+       LASSERT(obj->do_index_ops != NULL);
+
+       nob = rdpg->rp_count;
+       if (nob <= 0)
+               RETURN(-EFAULT);
+
+       /* Iterate through index and fill containers from @rdpg */
+       iops = &obj->do_index_ops->dio_it;
+       LASSERT(iops != NULL);
+       it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA);
+       if (IS_ERR(it))
+               RETURN(PTR_ERR(it));
+
+       rc = iops->load(env, it, rdpg->rp_hash);
+       if (rc == 0) {
+               /*
+                * Iterator didn't find record with exactly the key requested.
+                *
+                * It is currently either
+                *
+                *     - positioned above record with key less than
+                *     requested---skip it.
+                *     - or not positioned at all (is in IAM_IT_SKEWED
+                *     state)---position it on the next item.
+                */
+               rc = iops->next(env, it);
+       } else if (rc > 0) {
+               rc = 0;
+       }
+
+       /* Fill containers one after the other. There might be multiple
+        * containers per physical page.
+        *
+        * At this point and across for-loop:
+        *  rc == 0 -> ok, proceed.
+        *  rc >  0 -> end of index.
+        *  rc <  0 -> error. */
+       for (pageidx = 0; rc == 0 && nob > 0; pageidx++) {
+               union lu_page   *lp;
+               int              i;
+
+               LASSERT(pageidx < rdpg->rp_npages);
+               lp = kmap(rdpg->rp_pages[pageidx]);
+
+               /* fill lu pages */
+               for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) {
+                       rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE),
+                                   iops, it, rdpg->rp_attrs, arg);
+                       if (rc < 0)
+                               break;
+                       /* one more lu_page */
+                       nlupgs++;
+                       if (rc > 0)
+                               /* end of index */
+                               break;
+               }
+               kunmap(rdpg->rp_pages[pageidx]);
+       }
+
+       iops->put(env, it);
+       iops->fini(env, it);
+
+       if (rc >= 0)
+               rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(dt_index_walk);
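+
+/*
+ * dt_index_walk() is normally invoked with dt_index_page_build() as the
+ * filler, as dt_index_read() does below; a custom filler only has to
+ * follow the same return convention (0 to continue, > 0 at the end of
+ * the index, < 0 on error).
+ */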
+
+/**
+ * Walk key/record pairs of an index and copy them into 4KB containers to be
+ * transferred over the network. This is the common handler for OBD_IDX_READ
+ * RPC processing.
+ *
+ * \param env - is the environment passed by the caller
+ * \param dev - is the dt_device storing the index
+ * \param ii  - is the idx_info structure packed by the client in the
+ *           OBD_IDX_READ request
+ * \param rdpg - is the lu_rdpg descriptor
+ *
+ * \retval on success, return sum (in bytes) of all filled containers
+ * \retval appropriate error otherwise.
+ */
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+                 struct idx_info *ii, const struct lu_rdpg *rdpg)
+{
+       const struct dt_index_features  *feat;
+       struct dt_object                *obj;
+       int                              rc;
+       ENTRY;
+
+       /* rp_count shouldn't be zero and should be a multiple of the container
+        * size */
+       if (rdpg->rp_count <= 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0)
+               RETURN(-EFAULT);
+
+       if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL)
+               /* we don't support directory transfer via OBD_IDX_READ for the
+                * time being */
+               RETURN(-EOPNOTSUPP);
+
+       if (!fid_is_quota(&ii->ii_fid))
+               /* block access to all local files except quota files */
+               RETURN(-EPERM);
+
+       /* lookup index object subject to the transfer */
+       obj = dt_locate(env, dev, &ii->ii_fid);
+       if (IS_ERR(obj))
+               RETURN(PTR_ERR(obj));
+       if (dt_object_exists(obj) == 0)
+               GOTO(out, rc = -ENOENT);
+
+       /* fetch index features associated with index object */
+       feat = dt_index_feat_select(fid_seq(&ii->ii_fid),
+                                   lu_object_attr(&obj->do_lu));
+       if (IS_ERR(feat))
+               GOTO(out, rc = PTR_ERR(feat));
+
+       /* load index feature if not done already */
+       if (obj->do_index_ops == NULL) {
+               rc = obj->do_ops->do_index_try(env, obj, feat);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       /* fill ii_flags with supported index features */
+       ii->ii_flags &= II_FL_NOHASH;
+
+       ii->ii_keysize = feat->dif_keysize_max;
+       if ((feat->dif_flags & DT_IND_VARKEY) != 0) {
+               /* key size is variable */
+               ii->ii_flags |= II_FL_VARKEY;
+               /* we don't support variable key size for the time being */
+               GOTO(out, rc = -EOPNOTSUPP);
+       }
+
+       ii->ii_recsize = feat->dif_recsize_max;
+       if ((feat->dif_flags & DT_IND_VARREC) != 0) {
+               /* record size is variable */
+               ii->ii_flags |= II_FL_VARREC;
+               /* we don't support variable record size for the time being */
+               GOTO(out, rc = -EOPNOTSUPP);
+       }
+
+       if ((feat->dif_flags & DT_IND_NONUNQ) != 0)
+               /* key isn't necessarily unique */
+               ii->ii_flags |= II_FL_NONUNQ;
+
+       dt_read_lock(env, obj, 0);
+       /* fetch object version before walking the index */
+       ii->ii_version = dt_version_get(env, obj);
+
+       /* walk the index and fill lu_idxpages with key/record pairs */
+       rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii);
+       dt_read_unlock(env, obj);
+
+       if (rc == 0) {
+               /* index is empty */
+               LASSERT(ii->ii_count == 0);
+               ii->ii_hash_end = II_END_OFF;
+       }
+
+       GOTO(out, rc);
+out:
+       lu_object_put(env, &obj->do_lu);
+       return rc;
+}
+EXPORT_SYMBOL(dt_index_read);
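+
+/*
+ * Call chain for an OBD_IDX_READ request: the handler calls
+ * dt_index_read(), which validates the FID and flags, then walks the
+ * index through dt_index_walk() with dt_index_page_build() filling each
+ * lu_idxpage container returned to the client.
+ */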
+
+#ifdef LPROCFS
+
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               *eof = 1;
+               rc = snprintf(page, count, "%d\n",
+                               (unsigned) osfs.os_bsize);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_blksize);
+
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+                             int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_blocks;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal);
+
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bfree;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree);
+
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+                             int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bavail;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail);
+
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", osfs.os_files);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filestotal);
+
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", osfs.os_ffree);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filesfree);
+
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c
new file mode 100644 (file)
index 0000000..1cc9b55
--- /dev/null
@@ -0,0 +1,1855 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/genops.c
+ *
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+extern struct list_head obd_types;
+spinlock_t obd_types_lock;
+
+struct kmem_cache *obd_device_cachep;
+struct kmem_cache *obdo_cachep;
+EXPORT_SYMBOL(obdo_cachep);
+struct kmem_cache *import_cachep;
+
+struct list_head      obd_zombie_imports;
+struct list_head      obd_zombie_exports;
+spinlock_t  obd_zombie_impexp_lock;
+static void obd_zombie_impexp_notify(void);
+static void obd_zombie_export_add(struct obd_export *exp);
+static void obd_zombie_import_add(struct obd_import *imp);
+static void print_export_data(struct obd_export *exp,
+                             const char *status, int locks);
+
+int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
+
+/*
+ * support functions: we could use inter-module communication, but this
+ * is more portable to other OS's
+ */
+static struct obd_device *obd_device_alloc(void)
+{
+       struct obd_device *obd;
+
+       OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, __GFP_IO);
+       if (obd != NULL) {
+               obd->obd_magic = OBD_DEVICE_MAGIC;
+       }
+       return obd;
+}
+
+static void obd_device_free(struct obd_device *obd)
+{
+       LASSERT(obd != NULL);
+       LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n",
+                obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+       if (obd->obd_namespace != NULL) {
+               CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n",
+                      obd, obd->obd_namespace, obd->obd_force);
+               LBUG();
+       }
+       lu_ref_fini(&obd->obd_reference);
+       OBD_SLAB_FREE_PTR(obd, obd_device_cachep);
+}
+
+struct obd_type *class_search_type(const char *name)
+{
+       struct list_head *tmp;
+       struct obd_type *type;
+
+       spin_lock(&obd_types_lock);
+       list_for_each(tmp, &obd_types) {
+               type = list_entry(tmp, struct obd_type, typ_chain);
+               if (strcmp(type->typ_name, name) == 0) {
+                       spin_unlock(&obd_types_lock);
+                       return type;
+               }
+       }
+       spin_unlock(&obd_types_lock);
+       return NULL;
+}
+EXPORT_SYMBOL(class_search_type);
+
+struct obd_type *class_get_type(const char *name)
+{
+       struct obd_type *type = class_search_type(name);
+
+       if (!type) {
+               const char *modname = name;
+
+               if (strcmp(modname, "obdfilter") == 0)
+                       modname = "ofd";
+
+               if (strcmp(modname, LUSTRE_LWP_NAME) == 0)
+                       modname = LUSTRE_OSP_NAME;
+
+               if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)))
+                       modname = LUSTRE_MDT_NAME;
+
+               if (!request_module("%s", modname)) {
+                       CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
+                       type = class_search_type(name);
+               } else {
+                       LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n",
+                                          modname);
+               }
+       }
+       if (type) {
+               spin_lock(&type->obd_type_lock);
+               type->typ_refcnt++;
+               try_module_get(type->typ_dt_ops->o_owner);
+               spin_unlock(&type->obd_type_lock);
+       }
+       return type;
+}
+EXPORT_SYMBOL(class_get_type);
+
+void class_put_type(struct obd_type *type)
+{
+       LASSERT(type);
+       spin_lock(&type->obd_type_lock);
+       type->typ_refcnt--;
+       module_put(type->typ_dt_ops->o_owner);
+       spin_unlock(&type->obd_type_lock);
+}
+EXPORT_SYMBOL(class_put_type);
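+
+/*
+ * class_get_type() and class_put_type() must be balanced; a minimal
+ * sketch (the type name is only an example):
+ *
+ *      struct obd_type *type = class_get_type("osc");
+ *
+ *      if (type != NULL) {
+ *              ...
+ *              class_put_type(type);
+ *      }
+ */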
+
+#define CLASS_MAX_NAME 1024
+
+int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
+                       struct lprocfs_vars *vars, const char *name,
+                       struct lu_device_type *ldt)
+{
+       struct obd_type *type;
+       int rc = 0;
+       ENTRY;
+
+       /* sanity check */
+       LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME);
+
+       if (class_search_type(name)) {
+               CDEBUG(D_IOCTL, "Type %s already registered\n", name);
+               RETURN(-EEXIST);
+       }
+
+       rc = -ENOMEM;
+       OBD_ALLOC(type, sizeof(*type));
+       if (type == NULL)
+               RETURN(rc);
+
+       OBD_ALLOC_PTR(type->typ_dt_ops);
+       OBD_ALLOC_PTR(type->typ_md_ops);
+       OBD_ALLOC(type->typ_name, strlen(name) + 1);
+
+       if (type->typ_dt_ops == NULL ||
+           type->typ_md_ops == NULL ||
+           type->typ_name == NULL)
+               GOTO(failed, rc);
+
+       *(type->typ_dt_ops) = *dt_ops;
+       /* md_ops is optional */
+       if (md_ops)
+               *(type->typ_md_ops) = *md_ops;
+       strcpy(type->typ_name, name);
+       spin_lock_init(&type->obd_type_lock);
+
+#ifdef LPROCFS
+       type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
+                                             vars, type);
+       if (IS_ERR(type->typ_procroot)) {
+               rc = PTR_ERR(type->typ_procroot);
+               type->typ_procroot = NULL;
+               GOTO(failed, rc);
+       }
+#endif
+       if (ldt != NULL) {
+               type->typ_lu = ldt;
+               rc = lu_device_type_init(ldt);
+               if (rc != 0)
+                       GOTO(failed, rc);
+       }
+
+       spin_lock(&obd_types_lock);
+       list_add(&type->typ_chain, &obd_types);
+       spin_unlock(&obd_types_lock);
+
+       RETURN(0);
+
+ failed:
+       if (type->typ_name != NULL)
+               OBD_FREE(type->typ_name, strlen(name) + 1);
+       if (type->typ_md_ops != NULL)
+               OBD_FREE_PTR(type->typ_md_ops);
+       if (type->typ_dt_ops != NULL)
+               OBD_FREE_PTR(type->typ_dt_ops);
+       OBD_FREE(type, sizeof(*type));
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_register_type);
+
+int class_unregister_type(const char *name)
+{
+       struct obd_type *type = class_search_type(name);
+       ENTRY;
+
+       if (!type) {
+               CERROR("unknown obd type\n");
+               RETURN(-EINVAL);
+       }
+
+       if (type->typ_refcnt) {
+               CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt);
+               /* This is a bad situation, let's make the best of it */
+               /* Remove ops, but leave the name for debugging */
+               OBD_FREE_PTR(type->typ_dt_ops);
+               OBD_FREE_PTR(type->typ_md_ops);
+               RETURN(-EBUSY);
+       }
+
+       /* we do not use type->typ_procroot because, for compatibility purposes,
+        * other modules can share names (i.e. lod can use the lov entry), so
+        * we can't reference the pointer as it can get invalidated when another
+        * module removes the entry */
+       lprocfs_try_remove_proc_entry(type->typ_name, proc_lustre_root);
+
+       if (type->typ_lu)
+               lu_device_type_fini(type->typ_lu);
+
+       spin_lock(&obd_types_lock);
+       list_del(&type->typ_chain);
+       spin_unlock(&obd_types_lock);
+       OBD_FREE(type->typ_name, strlen(name) + 1);
+       if (type->typ_dt_ops != NULL)
+               OBD_FREE_PTR(type->typ_dt_ops);
+       if (type->typ_md_ops != NULL)
+               OBD_FREE_PTR(type->typ_md_ops);
+       OBD_FREE(type, sizeof(*type));
+       RETURN(0);
+} /* class_unregister_type */
+EXPORT_SYMBOL(class_unregister_type);
+
+/**
+ * Create a new obd device.
+ *
+ * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ *
+ * \param[in] type_name obd device type string.
+ * \param[in] name      obd device name.
+ *
+ * \retval ERR_PTR(-errno) if creation fails, otherwise return the obd
+ *      device pointer created.
+ */
+struct obd_device *class_newdev(const char *type_name, const char *name)
+{
+       struct obd_device *result = NULL;
+       struct obd_device *newdev;
+       struct obd_type *type = NULL;
+       int i;
+       int new_obd_minor = 0;
+       ENTRY;
+
+       if (strlen(name) >= MAX_OBD_NAME) {
+               CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME);
+               RETURN(ERR_PTR(-EINVAL));
+       }
+
+       type = class_get_type(type_name);
+       if (type == NULL) {
+               CERROR("OBD: unknown type: %s\n", type_name);
+               RETURN(ERR_PTR(-ENODEV));
+       }
+
+       newdev = obd_device_alloc();
+       if (newdev == NULL)
+               GOTO(out_type, result = ERR_PTR(-ENOMEM));
+
+       LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
+
+       write_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd && (strcmp(name, obd->obd_name) == 0)) {
+                       CERROR("Device %s already exists at %d, won't add\n",
+                              name, i);
+                       if (result) {
+                               LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
+                                        "%p obd_magic %08x != %08x\n", result,
+                                        result->obd_magic, OBD_DEVICE_MAGIC);
+                               LASSERTF(result->obd_minor == new_obd_minor,
+                                        "%p obd_minor %d != %d\n", result,
+                                        result->obd_minor, new_obd_minor);
+
+                               obd_devs[result->obd_minor] = NULL;
+                               result->obd_name[0] = '\0';
+                       }
+                       result = ERR_PTR(-EEXIST);
+                       break;
+               }
+               if (!result && !obd) {
+                       result = newdev;
+                       result->obd_minor = i;
+                       new_obd_minor = i;
+                       result->obd_type = type;
+                       strncpy(result->obd_name, name,
+                               sizeof(result->obd_name) - 1);
+                       obd_devs[i] = result;
+               }
+       }
+       write_unlock(&obd_dev_lock);
+
+       if (result == NULL && i >= class_devno_max()) {
+               CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
+                      class_devno_max());
+               GOTO(out, result = ERR_PTR(-EOVERFLOW));
+       }
+
+       if (IS_ERR(result))
+               GOTO(out, result);
+
+       CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
+              result->obd_name, result);
+
+       RETURN(result);
+out:
+       obd_device_free(newdev);
+out_type:
+       class_put_type(type);
+       return result;
+}
+
+void class_release_dev(struct obd_device *obd)
+{
+       struct obd_type *obd_type = obd->obd_type;
+
+       LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n",
+                obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+       LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n",
+                obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+       LASSERT(obd_type != NULL);
+
+       CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n",
+              obd->obd_name, obd->obd_minor, obd->obd_type->typ_name);
+
+       write_lock(&obd_dev_lock);
+       obd_devs[obd->obd_minor] = NULL;
+       write_unlock(&obd_dev_lock);
+       obd_device_free(obd);
+
+       class_put_type(obd_type);
+}
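+
+/*
+ * Minimal sketch of pairing class_newdev() with class_release_dev()
+ * (illustrative only; attach/setup and error handling are elided):
+ *
+ *      struct obd_device *obd = class_newdev(type_name, name);
+ *
+ *      if (!IS_ERR(obd)) {
+ *              ...
+ *              class_release_dev(obd);
+ *      }
+ */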
+
+int class_name2dev(const char *name)
+{
+       int i;
+
+       if (!name)
+               return -1;
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd && strcmp(name, obd->obd_name) == 0) {
+                       /* Make sure we finished attaching before we give
+                          out any references */
+                       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+                       if (obd->obd_attached) {
+                               read_unlock(&obd_dev_lock);
+                               return i;
+                       }
+                       break;
+               }
+       }
+       read_unlock(&obd_dev_lock);
+
+       return -1;
+}
+EXPORT_SYMBOL(class_name2dev);
+
+struct obd_device *class_name2obd(const char *name)
+{
+       int dev = class_name2dev(name);
+
+       if (dev < 0 || dev > class_devno_max())
+               return NULL;
+       return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_name2obd);
+
+int class_uuid2dev(struct obd_uuid *uuid)
+{
+       int i;
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) {
+                       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+                       read_unlock(&obd_dev_lock);
+                       return i;
+               }
+       }
+       read_unlock(&obd_dev_lock);
+
+       return -1;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
+{
+       int dev = class_uuid2dev(uuid);
+       if (dev < 0)
+               return NULL;
+       return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_uuid2obd);
+
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if ::obd_devs[\a num] does not contain an obd device
+ *      otherwise return the obd device there.
+ */
+struct obd_device *class_num2obd(int num)
+{
+       struct obd_device *obd = NULL;
+
+       if (num < class_devno_max()) {
+               obd = obd_devs[num];
+               if (obd == NULL)
+                       return NULL;
+
+               LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+                        "%p obd_magic %08x != %08x\n",
+                        obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+               LASSERTF(obd->obd_minor == num,
+                        "%p obd_minor %0d != %0d\n",
+                        obd, obd->obd_minor, num);
+       }
+
+       return obd;
+}
+EXPORT_SYMBOL(class_num2obd);
+
+/**
+ * Get the obd device count. Devices in any
+ * state are counted.
+ * \retval obd device count
+ */
+int get_devices_count(void)
+{
+       int index, max_index = class_devno_max(), dev_count = 0;
+
+       read_lock(&obd_dev_lock);
+       for (index = 0; index <= max_index; index++) {
+               struct obd_device *obd = class_num2obd(index);
+               if (obd != NULL)
+                       dev_count++;
+       }
+       read_unlock(&obd_dev_lock);
+
+       return dev_count;
+}
+EXPORT_SYMBOL(get_devices_count);
+
+void class_obd_list(void)
+{
+       char *status;
+       int i;
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd == NULL)
+                       continue;
+               if (obd->obd_stopping)
+                       status = "ST";
+               else if (obd->obd_set_up)
+                       status = "UP";
+               else if (obd->obd_attached)
+                       status = "AT";
+               else
+                       status = "--";
+               LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+                        i, status, obd->obd_type->typ_name,
+                        obd->obd_name, obd->obd_uuid.uuid,
+                        atomic_read(&obd->obd_refcount));
+       }
+       read_unlock(&obd_dev_lock);
+       return;
+}
+
+/* Search for a client OBD connected to tgt_uuid.  If grp_uuid is
+   specified, then only the client with that uuid is returned,
+   otherwise any client connected to the tgt is returned. */
+struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
+                                         const char * typ_name,
+                                         struct obd_uuid *grp_uuid)
+{
+       int i;
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd == NULL)
+                       continue;
+               if ((strncmp(obd->obd_type->typ_name, typ_name,
+                            strlen(typ_name)) == 0)) {
+                       if (obd_uuid_equals(tgt_uuid,
+                                           &obd->u.cli.cl_target_uuid) &&
+                           ((grp_uuid)? obd_uuid_equals(grp_uuid,
+                                                        &obd->obd_uuid) : 1)) {
+                               read_unlock(&obd_dev_lock);
+                               return obd;
+                       }
+               }
+       }
+       read_unlock(&obd_dev_lock);
+
+       return NULL;
+}
+EXPORT_SYMBOL(class_find_client_obd);
+
+/* Iterate the obd_device list looking for devices that have grp_uuid. Start
+   searching at *next, and if a device is found, the next index to look
+   at is saved in *next. If next is NULL, then the first matching device
+   will always be returned. */
+struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
+{
+       int i;
+
+       if (next == NULL)
+               i = 0;
+       else if (*next >= 0 && *next < class_devno_max())
+               i = *next;
+       else
+               return NULL;
+
+       read_lock(&obd_dev_lock);
+       for (; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd == NULL)
+                       continue;
+               if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) {
+                       if (next != NULL)
+                               *next = i+1;
+                       read_unlock(&obd_dev_lock);
+                       return obd;
+               }
+       }
+       read_unlock(&obd_dev_lock);
+
+       return NULL;
+}
+EXPORT_SYMBOL(class_devices_in_group);
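+
+/*
+ * Sketch of walking all devices in a group with the *next cursor
+ * (illustrative only):
+ *
+ *      int next = 0;
+ *      struct obd_device *obd;
+ *
+ *      while ((obd = class_devices_in_group(&grp_uuid, &next)) != NULL) {
+ *              ...
+ *      }
+ */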
+
+/**
+ * Notify every relevant OBD that the sptlrpc log for \a fsname has changed,
+ * so that it can adjust its sptlrpc settings accordingly.
+ */
+int class_notify_sptlrpc_conf(const char *fsname, int namelen)
+{
+       struct obd_device  *obd;
+       const char       *type;
+       int              i, rc = 0, rc2;
+
+       LASSERT(namelen > 0);
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               obd = class_num2obd(i);
+
+               if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping)
+                       continue;
+
+               /* only notify mdc, osc, mdt, ost */
+               type = obd->obd_type->typ_name;
+               if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+                   strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+                   strcmp(type, LUSTRE_MDT_NAME) != 0 &&
+                   strcmp(type, LUSTRE_OST_NAME) != 0)
+                       continue;
+
+               if (strncmp(obd->obd_name, fsname, namelen))
+                       continue;
+
+               class_incref(obd, __FUNCTION__, obd);
+               read_unlock(&obd_dev_lock);
+               rc2 = obd_set_info_async(NULL, obd->obd_self_export,
+                                        sizeof(KEY_SPTLRPC_CONF),
+                                        KEY_SPTLRPC_CONF, 0, NULL, NULL);
+               rc = rc ? rc : rc2;
+               class_decref(obd, __FUNCTION__, obd);
+               read_lock(&obd_dev_lock);
+       }
+       read_unlock(&obd_dev_lock);
+       return rc;
+}
+EXPORT_SYMBOL(class_notify_sptlrpc_conf);
+
+void obd_cleanup_caches(void)
+{
+       ENTRY;
+       if (obd_device_cachep) {
+               kmem_cache_destroy(obd_device_cachep);
+               obd_device_cachep = NULL;
+       }
+       if (obdo_cachep) {
+               kmem_cache_destroy(obdo_cachep);
+               obdo_cachep = NULL;
+       }
+       if (import_cachep) {
+               kmem_cache_destroy(import_cachep);
+               import_cachep = NULL;
+       }
+       if (capa_cachep) {
+               kmem_cache_destroy(capa_cachep);
+               capa_cachep = NULL;
+       }
+       EXIT;
+}
+
+int obd_init_caches(void)
+{
+       ENTRY;
+
+       LASSERT(obd_device_cachep == NULL);
+       obd_device_cachep = kmem_cache_create("ll_obd_dev_cache",
+                                                sizeof(struct obd_device),
+                                                0, 0, NULL);
+       if (!obd_device_cachep)
+               GOTO(out, -ENOMEM);
+
+       LASSERT(obdo_cachep == NULL);
+       obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo),
+                                          0, 0, NULL);
+       if (!obdo_cachep)
+               GOTO(out, -ENOMEM);
+
+       LASSERT(import_cachep == NULL);
+       import_cachep = kmem_cache_create("ll_import_cache",
+                                            sizeof(struct obd_import),
+                                            0, 0, NULL);
+       if (!import_cachep)
+               GOTO(out, -ENOMEM);
+
+       LASSERT(capa_cachep == NULL);
+       capa_cachep = kmem_cache_create("capa_cache",
+                                          sizeof(struct obd_capa), 0, 0, NULL);
+       if (!capa_cachep)
+               GOTO(out, -ENOMEM);
+
+       RETURN(0);
+ out:
+       obd_cleanup_caches();
+       RETURN(-ENOMEM);
+
+}
+
+/* map connection to client */
+struct obd_export *class_conn2export(struct lustre_handle *conn)
+{
+       struct obd_export *export;
+       ENTRY;
+
+       if (!conn) {
+               CDEBUG(D_CACHE, "looking for null handle\n");
+               RETURN(NULL);
+       }
+
+       if (conn->cookie == -1) {  /* this means assign a new connection */
+               CDEBUG(D_CACHE, "want a new connection\n");
+               RETURN(NULL);
+       }
+
+       CDEBUG(D_INFO, "looking for export cookie "LPX64"\n", conn->cookie);
+       export = class_handle2object(conn->cookie);
+       RETURN(export);
+}
+EXPORT_SYMBOL(class_conn2export);
+
+struct obd_device *class_exp2obd(struct obd_export *exp)
+{
+       if (exp)
+               return exp->exp_obd;
+       return NULL;
+}
+EXPORT_SYMBOL(class_exp2obd);
+
+struct obd_device *class_conn2obd(struct lustre_handle *conn)
+{
+       struct obd_export *export;
+       export = class_conn2export(conn);
+       if (export) {
+               struct obd_device *obd = export->exp_obd;
+               class_export_put(export);
+               return obd;
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(class_conn2obd);
+
+struct obd_import *class_exp2cliimp(struct obd_export *exp)
+{
+       struct obd_device *obd = exp->exp_obd;
+       if (obd == NULL)
+               return NULL;
+       return obd->u.cli.cl_import;
+}
+EXPORT_SYMBOL(class_exp2cliimp);
+
+struct obd_import *class_conn2cliimp(struct lustre_handle *conn)
+{
+       struct obd_device *obd = class_conn2obd(conn);
+       if (obd == NULL)
+               return NULL;
+       return obd->u.cli.cl_import;
+}
+EXPORT_SYMBOL(class_conn2cliimp);
+
+/* Export management functions */
+static void class_export_destroy(struct obd_export *exp)
+{
+       struct obd_device *obd = exp->exp_obd;
+       ENTRY;
+
+       LASSERT_ATOMIC_ZERO(&exp->exp_refcount);
+       LASSERT(obd != NULL);
+
+       CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp,
+              exp->exp_client_uuid.uuid, obd->obd_name);
+
+       /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */
+       if (exp->exp_connection)
+               ptlrpc_put_connection_superhack(exp->exp_connection);
+
+       LASSERT(list_empty(&exp->exp_outstanding_replies));
+       LASSERT(list_empty(&exp->exp_uncommitted_replies));
+       LASSERT(list_empty(&exp->exp_req_replay_queue));
+       LASSERT(list_empty(&exp->exp_hp_rpcs));
+       obd_destroy_export(exp);
+       class_decref(obd, "export", exp);
+
+       OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
+       EXIT;
+}
+
+static void export_handle_addref(void *export)
+{
+       class_export_get(export);
+}
+
+static struct portals_handle_ops export_handle_ops = {
+       .hop_addref = export_handle_addref,
+       .hop_free   = NULL,
+};
+
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+       atomic_inc(&exp->exp_refcount);
+       CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp,
+              atomic_read(&exp->exp_refcount));
+       return exp;
+}
+EXPORT_SYMBOL(class_export_get);
+
+void class_export_put(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON);
+       CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+              atomic_read(&exp->exp_refcount) - 1);
+
+       if (atomic_dec_and_test(&exp->exp_refcount)) {
+               LASSERT(!list_empty(&exp->exp_obd_chain));
+               CDEBUG(D_IOCTL, "final put %p/%s\n",
+                      exp, exp->exp_client_uuid.uuid);
+
+               /* release nid stat reference */
+               lprocfs_exp_cleanup(exp);
+
+               obd_zombie_export_add(exp);
+       }
+}
+EXPORT_SYMBOL(class_export_put);
+
+/* Creates a new export, adds it to the hash table, and returns a
+ * pointer to it. The refcount is 2: one for the hash reference, and
+ * one for the pointer returned by this function. */
+struct obd_export *class_new_export(struct obd_device *obd,
+                                   struct obd_uuid *cluuid)
+{
+       struct obd_export *export;
+       cfs_hash_t *hash = NULL;
+       int rc = 0;
+       ENTRY;
+
+       OBD_ALLOC_PTR(export);
+       if (!export)
+               return ERR_PTR(-ENOMEM);
+
+       export->exp_conn_cnt = 0;
+       export->exp_lock_hash = NULL;
+       export->exp_flock_hash = NULL;
+       atomic_set(&export->exp_refcount, 2);
+       atomic_set(&export->exp_rpc_count, 0);
+       atomic_set(&export->exp_cb_count, 0);
+       atomic_set(&export->exp_locks_count, 0);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       INIT_LIST_HEAD(&export->exp_locks_list);
+       spin_lock_init(&export->exp_locks_list_guard);
+#endif
+       atomic_set(&export->exp_replay_count, 0);
+       export->exp_obd = obd;
+       INIT_LIST_HEAD(&export->exp_outstanding_replies);
+       spin_lock_init(&export->exp_uncommitted_replies_lock);
+       INIT_LIST_HEAD(&export->exp_uncommitted_replies);
+       INIT_LIST_HEAD(&export->exp_req_replay_queue);
+       INIT_LIST_HEAD(&export->exp_handle.h_link);
+       INIT_LIST_HEAD(&export->exp_hp_rpcs);
+       class_handle_hash(&export->exp_handle, &export_handle_ops);
+       export->exp_last_request_time = cfs_time_current_sec();
+       spin_lock_init(&export->exp_lock);
+       spin_lock_init(&export->exp_rpc_lock);
+       INIT_HLIST_NODE(&export->exp_uuid_hash);
+       INIT_HLIST_NODE(&export->exp_nid_hash);
+       spin_lock_init(&export->exp_bl_list_lock);
+       INIT_LIST_HEAD(&export->exp_bl_list);
+
+       export->exp_sp_peer = LUSTRE_SP_ANY;
+       export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID;
+       export->exp_client_uuid = *cluuid;
+       obd_init_export(export);
+
+       spin_lock(&obd->obd_dev_lock);
+       /* shouldn't happen, but might race */
+       if (obd->obd_stopping)
+               GOTO(exit_unlock, rc = -ENODEV);
+
+       hash = cfs_hash_getref(obd->obd_uuid_hash);
+       if (hash == NULL)
+               GOTO(exit_unlock, rc = -ENODEV);
+       spin_unlock(&obd->obd_dev_lock);
+
+       if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
+               rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
+               if (rc != 0) {
+                       LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
+                                     obd->obd_name, cluuid->uuid, rc);
+                       GOTO(exit_err, rc = -EALREADY);
+               }
+       }
+
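+       /* re-check obd_stopping: the lock was dropped while the uuid hash
+        * entry was being added, so the device may have started stopping
+        * in the meantime */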
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_stopping) {
+               cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
+               GOTO(exit_unlock, rc = -ENODEV);
+       }
+
+       class_incref(obd, "export", export);
+       list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
+       list_add_tail(&export->exp_obd_chain_timed,
+                         &export->exp_obd->obd_exports_timed);
+       export->exp_obd->obd_num_exports++;
+       spin_unlock(&obd->obd_dev_lock);
+       cfs_hash_putref(hash);
+       RETURN(export);
+
+exit_unlock:
+       spin_unlock(&obd->obd_dev_lock);
+exit_err:
+       if (hash)
+               cfs_hash_putref(hash);
+       class_handle_unhash(&export->exp_handle);
+       LASSERT(hlist_unhashed(&export->exp_uuid_hash));
+       obd_destroy_export(export);
+       OBD_FREE_PTR(export);
+       return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(class_new_export);
+
+void class_unlink_export(struct obd_export *exp)
+{
+       class_handle_unhash(&exp->exp_handle);
+
+       spin_lock(&exp->exp_obd->obd_dev_lock);
+       /* delete the uuid-export hash item from the hash tables */
+       if (!hlist_unhashed(&exp->exp_uuid_hash))
+               cfs_hash_del(exp->exp_obd->obd_uuid_hash,
+                            &exp->exp_client_uuid,
+                            &exp->exp_uuid_hash);
+
+       list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
+       list_del_init(&exp->exp_obd_chain_timed);
+       exp->exp_obd->obd_num_exports--;
+       spin_unlock(&exp->exp_obd->obd_dev_lock);
+       class_export_put(exp);
+}
+EXPORT_SYMBOL(class_unlink_export);
+
+/* Import management functions */
+void class_import_destroy(struct obd_import *imp)
+{
+       ENTRY;
+
+       CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp,
+               imp->imp_obd->obd_name);
+
+       LASSERT_ATOMIC_ZERO(&imp->imp_refcount);
+
+       ptlrpc_put_connection_superhack(imp->imp_connection);
+
+       while (!list_empty(&imp->imp_conn_list)) {
+               struct obd_import_conn *imp_conn;
+
+               imp_conn = list_entry(imp->imp_conn_list.next,
+                                         struct obd_import_conn, oic_item);
+               list_del_init(&imp_conn->oic_item);
+               ptlrpc_put_connection_superhack(imp_conn->oic_conn);
+               OBD_FREE(imp_conn, sizeof(*imp_conn));
+       }
+
+       LASSERT(imp->imp_sec == NULL);
+       class_decref(imp->imp_obd, "import", imp);
+       OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle);
+       EXIT;
+}
+
+static void import_handle_addref(void *import)
+{
+       class_import_get(import);
+}
+
+static struct portals_handle_ops import_handle_ops = {
+       .hop_addref = import_handle_addref,
+       .hop_free   = NULL,
+};
+
+struct obd_import *class_import_get(struct obd_import *import)
+{
+       atomic_inc(&import->imp_refcount);
+       CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import,
+              atomic_read(&import->imp_refcount),
+              import->imp_obd->obd_name);
+       return import;
+}
+EXPORT_SYMBOL(class_import_get);
+
+void class_import_put(struct obd_import *imp)
+{
+       ENTRY;
+
+       LASSERT(list_empty(&imp->imp_zombie_chain));
+       LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON);
+
+       CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
+              atomic_read(&imp->imp_refcount) - 1,
+              imp->imp_obd->obd_name);
+
+       if (atomic_dec_and_test(&imp->imp_refcount)) {
+               CDEBUG(D_INFO, "final put import %p\n", imp);
+               obd_zombie_import_add(imp);
+       }
+
+       /* catch possible import put race */
+       LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON);
+       EXIT;
+}
+EXPORT_SYMBOL(class_import_put);
+
+static void init_imp_at(struct imp_at *at)
+{
+       int i;
+       at_init(&at->iat_net_latency, 0, 0);
+       for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+               /* max service estimates are tracked on the server side, so
+                * don't use the AT history here, just use the last reported
+                * value (but keep the history for the proc histogram and
+                * worst_ever) */
+               at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
+                       AT_FLG_NOHIST);
+       }
+}
+
+struct obd_import *class_new_import(struct obd_device *obd)
+{
+       struct obd_import *imp;
+
+       OBD_ALLOC(imp, sizeof(*imp));
+       if (imp == NULL)
+               return NULL;
+
+       INIT_LIST_HEAD(&imp->imp_pinger_chain);
+       INIT_LIST_HEAD(&imp->imp_zombie_chain);
+       INIT_LIST_HEAD(&imp->imp_replay_list);
+       INIT_LIST_HEAD(&imp->imp_sending_list);
+       INIT_LIST_HEAD(&imp->imp_delayed_list);
+       spin_lock_init(&imp->imp_lock);
+       imp->imp_last_success_conn = 0;
+       imp->imp_state = LUSTRE_IMP_NEW;
+       imp->imp_obd = class_incref(obd, "import", imp);
+       mutex_init(&imp->imp_sec_mutex);
+       init_waitqueue_head(&imp->imp_recovery_waitq);
+
+       atomic_set(&imp->imp_refcount, 2);
+       atomic_set(&imp->imp_unregistering, 0);
+       atomic_set(&imp->imp_inflight, 0);
+       atomic_set(&imp->imp_replay_inflight, 0);
+       atomic_set(&imp->imp_inval_count, 0);
+       INIT_LIST_HEAD(&imp->imp_conn_list);
+       INIT_LIST_HEAD(&imp->imp_handle.h_link);
+       class_handle_hash(&imp->imp_handle, &import_handle_ops);
+       init_imp_at(&imp->imp_at);
+
+       /* the default magic is V2; it is used in the connect RPC and
+        * then adjusted according to the flags in the request/reply. */
+       imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
+
+       return imp;
+}
+EXPORT_SYMBOL(class_new_import);
+
+void class_destroy_import(struct obd_import *import)
+{
+       LASSERT(import != NULL);
+       LASSERT(import != LP_POISON);
+
+       class_handle_unhash(&import->imp_handle);
+
+       spin_lock(&import->imp_lock);
+       import->imp_generation++;
+       spin_unlock(&import->imp_lock);
+       class_import_put(import);
+}
+EXPORT_SYMBOL(class_destroy_import);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+       spin_lock(&exp->exp_locks_list_guard);
+
+       LASSERT(lock->l_exp_refs_nr >= 0);
+
+       if (lock->l_exp_refs_target != NULL &&
+           lock->l_exp_refs_target != exp) {
+               LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n",
+                             exp, lock, lock->l_exp_refs_target);
+       }
+       if ((lock->l_exp_refs_nr++) == 0) {
+               list_add(&lock->l_exp_refs_link, &exp->exp_locks_list);
+               lock->l_exp_refs_target = exp;
+       }
+       CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+              lock, exp, lock->l_exp_refs_nr);
+       spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_add_lock_ref);
+
+void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+       spin_lock(&exp->exp_locks_list_guard);
+       LASSERT(lock->l_exp_refs_nr > 0);
+       if (lock->l_exp_refs_target != exp) {
+               LCONSOLE_WARN("lock %p, mismatching export pointers: %p, %p\n",
+                             lock, lock->l_exp_refs_target, exp);
+       }
+       if (--lock->l_exp_refs_nr == 0) {
+               list_del_init(&lock->l_exp_refs_link);
+               lock->l_exp_refs_target = NULL;
+       }
+       CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+              lock, exp, lock->l_exp_refs_nr);
+       spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_del_lock_ref);
+#endif
+
+/* A connection defines an export context in which preallocation can
+   be managed. This releases the export pointer reference, and returns
+   the export handle, so the export refcount is 1 when this function
+   returns. */
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+                 struct obd_uuid *cluuid)
+{
+       struct obd_export *export;
+       LASSERT(conn != NULL);
+       LASSERT(obd != NULL);
+       LASSERT(cluuid != NULL);
+       ENTRY;
+
+       export = class_new_export(obd, cluuid);
+       if (IS_ERR(export))
+               RETURN(PTR_ERR(export));
+
+       conn->cookie = export->exp_handle.h_cookie;
+       class_export_put(export);
+
+       CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n",
+              cluuid->uuid, conn->cookie);
+       RETURN(0);
+}
+EXPORT_SYMBOL(class_connect);
+
+/* if export is involved in recovery then clean up related things */
+void class_export_recovery_cleanup(struct obd_export *exp)
+{
+       struct obd_device *obd = exp->exp_obd;
+
+       spin_lock(&obd->obd_recovery_task_lock);
+       if (exp->exp_delayed)
+               obd->obd_delayed_clients--;
+       if (obd->obd_recovering) {
+               if (exp->exp_in_recovery) {
+                       spin_lock(&exp->exp_lock);
+                       exp->exp_in_recovery = 0;
+                       spin_unlock(&exp->exp_lock);
+                       LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+                       atomic_dec(&obd->obd_connected_clients);
+               }
+
+               /* if called during recovery, update the obd_stale_clients
+                * counter; lightweight exports are not counted */
+               if (exp->exp_failed &&
+                   (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0)
+                       exp->exp_obd->obd_stale_clients++;
+       }
+       spin_unlock(&obd->obd_recovery_task_lock);
+       /** Cleanup req replay fields */
+       if (exp->exp_req_replay_needed) {
+               spin_lock(&exp->exp_lock);
+               exp->exp_req_replay_needed = 0;
+               spin_unlock(&exp->exp_lock);
+               LASSERT(atomic_read(&obd->obd_req_replay_clients));
+               atomic_dec(&obd->obd_req_replay_clients);
+       }
+       /** Cleanup lock replay data */
+       if (exp->exp_lock_replay_needed) {
+               spin_lock(&exp->exp_lock);
+               exp->exp_lock_replay_needed = 0;
+               spin_unlock(&exp->exp_lock);
+               LASSERT(atomic_read(&obd->obd_lock_replay_clients));
+               atomic_dec(&obd->obd_lock_replay_clients);
+       }
+}
+
+/* This function removes 1-3 references from the export:
+ * 1 - for the export pointer passed in
+ * and, if a real disconnect is needed:
+ * 2 - for the removal from the hash
+ * 3 - in class_unlink_export()
+ * The export pointer passed to this function may be destroyed. */
+int class_disconnect(struct obd_export *export)
+{
+       int already_disconnected;
+       ENTRY;
+
+       if (export == NULL) {
+               CWARN("attempting to free NULL export %p\n", export);
+               RETURN(-EINVAL);
+       }
+
+       spin_lock(&export->exp_lock);
+       already_disconnected = export->exp_disconnected;
+       export->exp_disconnected = 1;
+       spin_unlock(&export->exp_lock);
+
+       /* class_cleanup(), abort_recovery(), and class_fail_export()
+        * all end up in here, and if any of them race we shouldn't
+        * call extra class_export_puts(). */
+       if (already_disconnected) {
+               LASSERT(hlist_unhashed(&export->exp_nid_hash));
+               GOTO(no_disconn, already_disconnected);
+       }
+
+       CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n",
+              export->exp_handle.h_cookie);
+
+       if (!hlist_unhashed(&export->exp_nid_hash))
+               cfs_hash_del(export->exp_obd->obd_nid_hash,
+                            &export->exp_connection->c_peer.nid,
+                            &export->exp_nid_hash);
+
+       class_export_recovery_cleanup(export);
+       class_unlink_export(export);
+no_disconn:
+       class_export_put(export);
+       RETURN(0);
+}
+EXPORT_SYMBOL(class_disconnect);
+
+/* Return non-zero for a fully connected export */
+int class_connected_export(struct obd_export *exp)
+{
+       if (exp) {
+               int connected;
+               spin_lock(&exp->exp_lock);
+               connected = (exp->exp_conn_cnt > 0);
+               spin_unlock(&exp->exp_lock);
+               return connected;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(class_connected_export);
+
+static void class_disconnect_export_list(struct list_head *list,
+                                        enum obd_option flags)
+{
+       int rc;
+       struct obd_export *exp;
+       ENTRY;
+
+       /* It's possible that an export may disconnect itself, but
+        * nothing else will be added to this list. */
+       while (!list_empty(list)) {
+               exp = list_entry(list->next, struct obd_export,
+                                    exp_obd_chain);
+               /* take a reference so CDEBUG can still be used safely
+                * after obd_disconnect */
+               class_export_get(exp);
+
+               spin_lock(&exp->exp_lock);
+               exp->exp_flags = flags;
+               spin_unlock(&exp->exp_lock);
+
+               if (obd_uuid_equals(&exp->exp_client_uuid,
+                                   &exp->exp_obd->obd_uuid)) {
+                       CDEBUG(D_HA,
+                              "exp %p export uuid == obd uuid, don't discon\n",
+                              exp);
+                       /* Need to delete this now so we don't end up pointing
+                        * to work_list later when this export is cleaned up. */
+                       list_del_init(&exp->exp_obd_chain);
+                       class_export_put(exp);
+                       continue;
+               }
+
+               class_export_get(exp);
+               CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
+                      "last request at "CFS_TIME_T"\n",
+                      exp->exp_obd->obd_name, obd_export_nid2str(exp),
+                      exp, exp->exp_last_request_time);
+               /* release one export reference anyway */
+               rc = obd_disconnect(exp);
+
+               CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+                      obd_export_nid2str(exp), exp, rc);
+               class_export_put(exp);
+       }
+       EXIT;
+}
+
+void class_disconnect_exports(struct obd_device *obd)
+{
+       struct list_head work_list;
+       ENTRY;
+
+       /* Move all of the exports from obd_exports to a work list, en masse. */
+       INIT_LIST_HEAD(&work_list);
+       spin_lock(&obd->obd_dev_lock);
+       list_splice_init(&obd->obd_exports, &work_list);
+       list_splice_init(&obd->obd_delayed_exports, &work_list);
+       spin_unlock(&obd->obd_dev_lock);
+
+       if (!list_empty(&work_list)) {
+               CDEBUG(D_HA, "OBD device %d (%p) has exports, "
+                      "disconnecting them\n", obd->obd_minor, obd);
+               class_disconnect_export_list(&work_list,
+                                            exp_flags_from_obd(obd));
+       } else
+               CDEBUG(D_HA, "OBD device %d (%p) has no exports\n",
+                      obd->obd_minor, obd);
+       EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_exports);
+
+/* Remove exports that have not completed recovery.
+ */
+void class_disconnect_stale_exports(struct obd_device *obd,
+                                   int (*test_export)(struct obd_export *))
+{
+       struct list_head work_list;
+       struct obd_export *exp, *n;
+       int evicted = 0;
+       ENTRY;
+
+       INIT_LIST_HEAD(&work_list);
+       spin_lock(&obd->obd_dev_lock);
+       list_for_each_entry_safe(exp, n, &obd->obd_exports,
+                                    exp_obd_chain) {
+               /* don't count self-export as client */
+               if (obd_uuid_equals(&exp->exp_client_uuid,
+                                   &exp->exp_obd->obd_uuid))
+                       continue;
+
+               /* don't evict clients which have no slot in last_rcvd
+                * (e.g. lightweight connection) */
+               if (exp->exp_target_data.ted_lr_idx == -1)
+                       continue;
+
+               spin_lock(&exp->exp_lock);
+               if (exp->exp_failed || test_export(exp)) {
+                       spin_unlock(&exp->exp_lock);
+                       continue;
+               }
+               exp->exp_failed = 1;
+               spin_unlock(&exp->exp_lock);
+
+               list_move(&exp->exp_obd_chain, &work_list);
+               evicted++;
+               CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n",
+                      obd->obd_name, exp->exp_client_uuid.uuid,
+                      exp->exp_connection == NULL ? "<unknown>" :
+                      libcfs_nid2str(exp->exp_connection->c_peer.nid));
+               print_export_data(exp, "EVICTING", 0);
+       }
+       spin_unlock(&obd->obd_dev_lock);
+
+       if (evicted)
+               LCONSOLE_WARN("%s: disconnecting %d stale clients\n",
+                             obd->obd_name, evicted);
+
+       class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) |
+                                                OBD_OPT_ABORT_RECOV);
+       EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_stale_exports);
+
+void class_fail_export(struct obd_export *exp)
+{
+       int rc, already_failed;
+
+       spin_lock(&exp->exp_lock);
+       already_failed = exp->exp_failed;
+       exp->exp_failed = 1;
+       spin_unlock(&exp->exp_lock);
+
+       if (already_failed) {
+               CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
+                      exp, exp->exp_client_uuid.uuid);
+               return;
+       }
+
+       CDEBUG(D_HA, "disconnecting export %p/%s\n",
+              exp, exp->exp_client_uuid.uuid);
+
+       if (obd_dump_on_timeout)
+               libcfs_debug_dumplog();
+
+       /* take a reference so CDEBUG can still be used safely after
+        * obd_disconnect */
+       class_export_get(exp);
+
+       /* Most callers into obd_disconnect are removing their own reference
+        * (request, for example) in addition to the one from the hash table.
+        * We don't have such a reference here, so make one. */
+       class_export_get(exp);
+       rc = obd_disconnect(exp);
+       if (rc)
+               CERROR("disconnecting export %p failed: %d\n", exp, rc);
+       else
+               CDEBUG(D_HA, "disconnected export %p/%s\n",
+                      exp, exp->exp_client_uuid.uuid);
+       class_export_put(exp);
+}
+EXPORT_SYMBOL(class_fail_export);
+
+char *obd_export_nid2str(struct obd_export *exp)
+{
+       if (exp->exp_connection != NULL)
+               return libcfs_nid2str(exp->exp_connection->c_peer.nid);
+
+       return "(no nid)";
+}
+EXPORT_SYMBOL(obd_export_nid2str);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid)
+{
+       cfs_hash_t *nid_hash;
+       struct obd_export *doomed_exp = NULL;
+       int exports_evicted = 0;
+
+       lnet_nid_t nid_key = libcfs_str2nid((char *)nid);
+
+       spin_lock(&obd->obd_dev_lock);
+       /* umount is already running, so the evict thread should leave
+        * this task to the umount thread */
+       if (obd->obd_stopping) {
+               spin_unlock(&obd->obd_dev_lock);
+               return exports_evicted;
+       }
+       nid_hash = obd->obd_nid_hash;
+       cfs_hash_getref(nid_hash);
+       spin_unlock(&obd->obd_dev_lock);
+
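+       /* more than one export may be hashed under the same NID, so keep
+        * looking up and failing exports until no match remains */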
+       do {
+               doomed_exp = cfs_hash_lookup(nid_hash, &nid_key);
+               if (doomed_exp == NULL)
+                       break;
+
+               LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key,
+                        "nid %s found, wanted nid %s, requested nid %s\n",
+                        obd_export_nid2str(doomed_exp),
+                        libcfs_nid2str(nid_key), nid);
+               LASSERTF(doomed_exp != obd->obd_self_export,
+                        "self-export is hashed by NID?\n");
+               exports_evicted++;
+               LCONSOLE_WARN("%s: evicting %s (at %s) by administrative "
+                             "request\n", obd->obd_name,
+                             obd_uuid2str(&doomed_exp->exp_client_uuid),
+                             obd_export_nid2str(doomed_exp));
+               class_fail_export(doomed_exp);
+               class_export_put(doomed_exp);
+       } while (1);
+
+       cfs_hash_putref(nid_hash);
+
+       if (!exports_evicted)
+               CDEBUG(D_HA, "%s: can't disconnect NID '%s': no exports found\n",
+                      obd->obd_name, nid);
+       return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_nid);
+
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid)
+{
+       cfs_hash_t *uuid_hash;
+       struct obd_export *doomed_exp = NULL;
+       struct obd_uuid doomed_uuid;
+       int exports_evicted = 0;
+
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_stopping) {
+               spin_unlock(&obd->obd_dev_lock);
+               return exports_evicted;
+       }
+       uuid_hash = obd->obd_uuid_hash;
+       cfs_hash_getref(uuid_hash);
+       spin_unlock(&obd->obd_dev_lock);
+
+       obd_str2uuid(&doomed_uuid, uuid);
+       if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) {
+               CERROR("%s: can't evict myself\n", obd->obd_name);
+               cfs_hash_putref(uuid_hash);
+               return exports_evicted;
+       }
+
+       doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid);
+
+       if (doomed_exp == NULL) {
+               CERROR("%s: can't disconnect %s: no exports found\n",
+                      obd->obd_name, uuid);
+       } else {
+               CWARN("%s: evicting %s by administrative request\n",
+                      obd->obd_name, doomed_exp->exp_client_uuid.uuid);
+               class_fail_export(doomed_exp);
+               class_export_put(doomed_exp);
+               exports_evicted++;
+       }
+       cfs_hash_putref(uuid_hash);
+
+       return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_uuid);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void (*class_export_dump_hook)(struct obd_export*) = NULL;
+EXPORT_SYMBOL(class_export_dump_hook);
+#endif
+
+static void print_export_data(struct obd_export *exp, const char *status,
+                             int locks)
+{
+       struct ptlrpc_reply_state *rs;
+       struct ptlrpc_reply_state *first_reply = NULL;
+       int nreplies = 0;
+
+       spin_lock(&exp->exp_lock);
+       list_for_each_entry(rs, &exp->exp_outstanding_replies,
+                               rs_exp_list) {
+               if (nreplies == 0)
+                       first_reply = rs;
+               nreplies++;
+       }
+       spin_unlock(&exp->exp_lock);
+
+       CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n",
+              exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid,
+              obd_export_nid2str(exp), atomic_read(&exp->exp_refcount),
+              atomic_read(&exp->exp_rpc_count),
+              atomic_read(&exp->exp_cb_count),
+              atomic_read(&exp->exp_locks_count),
+              exp->exp_disconnected, exp->exp_delayed, exp->exp_failed,
+              nreplies, first_reply, nreplies > 3 ? "..." : "",
+              exp->exp_last_committed);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       if (locks && class_export_dump_hook != NULL)
+               class_export_dump_hook(exp);
+#endif
+}
+
+void dump_exports(struct obd_device *obd, int locks)
+{
+       struct obd_export *exp;
+
+       spin_lock(&obd->obd_dev_lock);
+       list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain)
+               print_export_data(exp, "ACTIVE", locks);
+       list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain)
+               print_export_data(exp, "UNLINKED", locks);
+       list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain)
+               print_export_data(exp, "DELAYED", locks);
+       spin_unlock(&obd->obd_dev_lock);
+       spin_lock(&obd_zombie_impexp_lock);
+       list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain)
+               print_export_data(exp, "ZOMBIE", locks);
+       spin_unlock(&obd_zombie_impexp_lock);
+}
+EXPORT_SYMBOL(dump_exports);
+
+void obd_exports_barrier(struct obd_device *obd)
+{
+       int waited = 2;
+       LASSERT(list_empty(&obd->obd_exports));
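+       /* poll until the unlinked exports list drains, doubling the sleep
+        * each pass and warning periodically if it takes too long */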
+       spin_lock(&obd->obd_dev_lock);
+       while (!list_empty(&obd->obd_unlinked_exports)) {
+               spin_unlock(&obd->obd_dev_lock);
+               schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+                                                  cfs_time_seconds(waited));
+               if (waited > 5 && IS_PO2(waited)) {
+                       LCONSOLE_WARN("%s is waiting for obd_unlinked_exports "
+                                     "more than %d seconds. "
+                                     "The obd refcount = %d. Is it stuck?\n",
+                                     obd->obd_name, waited,
+                                     atomic_read(&obd->obd_refcount));
+                       dump_exports(obd, 1);
+               }
+               waited *= 2;
+               spin_lock(&obd->obd_dev_lock);
+       }
+       spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(obd_exports_barrier);
+
+/* Total number of zombies to be destroyed */
+static int zombies_count = 0;
+
+/**
+ * kill zombie imports and exports
+ */
+void obd_zombie_impexp_cull(void)
+{
+       struct obd_import *import;
+       struct obd_export *export;
+       ENTRY;
+
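+       /* drain both zombie lists, destroying at most one import and one
+        * export per iteration, until both lists are empty */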
+       do {
+               spin_lock(&obd_zombie_impexp_lock);
+
+               import = NULL;
+               if (!list_empty(&obd_zombie_imports)) {
+                       import = list_entry(obd_zombie_imports.next,
+                                               struct obd_import,
+                                               imp_zombie_chain);
+                       list_del_init(&import->imp_zombie_chain);
+               }
+
+               export = NULL;
+               if (!list_empty(&obd_zombie_exports)) {
+                       export = list_entry(obd_zombie_exports.next,
+                                               struct obd_export,
+                                               exp_obd_chain);
+                       list_del_init(&export->exp_obd_chain);
+               }
+
+               spin_unlock(&obd_zombie_impexp_lock);
+
+               if (import != NULL) {
+                       class_import_destroy(import);
+                       spin_lock(&obd_zombie_impexp_lock);
+                       zombies_count--;
+                       spin_unlock(&obd_zombie_impexp_lock);
+               }
+
+               if (export != NULL) {
+                       class_export_destroy(export);
+                       spin_lock(&obd_zombie_impexp_lock);
+                       zombies_count--;
+                       spin_unlock(&obd_zombie_impexp_lock);
+               }
+
+               cond_resched();
+       } while (import != NULL || export != NULL);
+       EXIT;
+}
+
+static struct completion       obd_zombie_start;
+static struct completion       obd_zombie_stop;
+static unsigned long           obd_zombie_flags;
+static wait_queue_head_t               obd_zombie_waitq;
+static pid_t                   obd_zombie_pid;
+
+enum {
+       OBD_ZOMBIE_STOP         = 0x0001,
+};
+
+/**
+ * Check whether there is work for the zombie import/export destroy thread.
+ */
+static int obd_zombie_impexp_check(void *arg)
+{
+       int rc;
+
+       spin_lock(&obd_zombie_impexp_lock);
+       rc = (zombies_count == 0) &&
+            !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+       spin_unlock(&obd_zombie_impexp_lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Add an export to the obd_zombie thread's list and notify it.
+ */
+static void obd_zombie_export_add(struct obd_export *exp)
+{
+       spin_lock(&exp->exp_obd->obd_dev_lock);
+       LASSERT(!list_empty(&exp->exp_obd_chain));
+       list_del_init(&exp->exp_obd_chain);
+       spin_unlock(&exp->exp_obd->obd_dev_lock);
+       spin_lock(&obd_zombie_impexp_lock);
+       zombies_count++;
+       list_add(&exp->exp_obd_chain, &obd_zombie_exports);
+       spin_unlock(&obd_zombie_impexp_lock);
+
+       obd_zombie_impexp_notify();
+}
+
+/**
+ * Add an import to the obd_zombie thread's list and notify it.
+ */
+static void obd_zombie_import_add(struct obd_import *imp)
+{
+       LASSERT(imp->imp_sec == NULL);
+       LASSERT(imp->imp_rq_pool == NULL);
+       spin_lock(&obd_zombie_impexp_lock);
+       LASSERT(list_empty(&imp->imp_zombie_chain));
+       zombies_count++;
+       list_add(&imp->imp_zombie_chain, &obd_zombie_imports);
+       spin_unlock(&obd_zombie_impexp_lock);
+
+       obd_zombie_impexp_notify();
+}
+
+/**
+ * notify import/export destroy thread about new zombie.
+ */
+static void obd_zombie_impexp_notify(void)
+{
+       /*
+        * Make sure obd_zombie_impexp_thread gets this notification.
+        * Otherwise the wakeup may be consumed only by obd_zombie_barrier,
+        * which swallows the notification and goes back to sleep, and a
+        * hang ensues.
+        */
+       wake_up_all(&obd_zombie_waitq);
+}
+
+/**
+ * check whether obd_zombie is idle
+ */
+static int obd_zombie_is_idle(void)
+{
+       int rc;
+
+       LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags));
+       spin_lock(&obd_zombie_impexp_lock);
+       rc = (zombies_count == 0);
+       spin_unlock(&obd_zombie_impexp_lock);
+       return rc;
+}
+
+/**
+ * wait until the obd_zombie import/export queues become empty
+ */
+void obd_zombie_barrier(void)
+{
+       struct l_wait_info lwi = { 0 };
+
+       if (obd_zombie_pid == current_pid())
+               /* don't wait for myself */
+               return;
+       l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi);
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
+
+/**
+ * Thread that destroys zombie exports/imports.
+ */
+static int obd_zombie_impexp_thread(void *unused)
+{
+       unshare_fs_struct();
+       complete(&obd_zombie_start);
+
+       obd_zombie_pid = current_pid();
+
+       while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) {
+               struct l_wait_info lwi = { 0 };
+
+               l_wait_event(obd_zombie_waitq,
+                            !obd_zombie_impexp_check(NULL), &lwi);
+               obd_zombie_impexp_cull();
+
+               /*
+                * Notify obd_zombie_barrier callers that queues
+                * may be empty.
+                */
+               wake_up(&obd_zombie_waitq);
+       }
+
+       complete(&obd_zombie_stop);
+
+       RETURN(0);
+}
+
+
+/**
+ * start the zombie import/export destroy thread
+ */
+int obd_zombie_impexp_init(void)
+{
+       task_t *task;
+
+       INIT_LIST_HEAD(&obd_zombie_imports);
+       INIT_LIST_HEAD(&obd_zombie_exports);
+       spin_lock_init(&obd_zombie_impexp_lock);
+       init_completion(&obd_zombie_start);
+       init_completion(&obd_zombie_stop);
+       init_waitqueue_head(&obd_zombie_waitq);
+       obd_zombie_pid = 0;
+
+       task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid");
+       if (IS_ERR(task))
+               RETURN(PTR_ERR(task));
+
+       wait_for_completion(&obd_zombie_start);
+       RETURN(0);
+}
+/**
+ * stop the zombie import/export destroy thread
+ */
+void obd_zombie_impexp_stop(void)
+{
+       set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+       obd_zombie_impexp_notify();
+       wait_for_completion(&obd_zombie_stop);
+}
+
+/***** Kernel-userspace comm helpers *******/
+
+/* Get length of entire message, including header */
+int kuc_len(int payload_len)
+{
+       return sizeof(struct kuc_hdr) + payload_len;
+}
+EXPORT_SYMBOL(kuc_len);
+
+/* Get a pointer to kuc header, given a ptr to the payload
+ * @param p Pointer to payload area
+ * @returns Pointer to kuc header
+ */
+struct kuc_hdr *kuc_ptr(void *p)
+{
+       struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1;
+       LASSERT(lh->kuc_magic == KUC_MAGIC);
+       return lh;
+}
+EXPORT_SYMBOL(kuc_ptr);
+
+/* Test if payload is part of kuc message
+ * @param p Pointer to payload area
+ * @returns boolean
+ */
+int kuc_ispayload(void *p)
+{
+       struct kuc_hdr *kh = ((struct kuc_hdr *)p) - 1;
+
+       if (kh->kuc_magic == KUC_MAGIC)
+               return 1;
+       else
+               return 0;
+}
+EXPORT_SYMBOL(kuc_ispayload);
+
+/* Alloc space for a message, and fill in header
+ * @return Pointer to payload area
+ */
+void *kuc_alloc(int payload_len, int transport, int type)
+{
+       struct kuc_hdr *lh;
+       int len = kuc_len(payload_len);
+
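+       /* allocate header and payload in one chunk; the caller receives a
+        * pointer just past the header */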
+       OBD_ALLOC(lh, len);
+       if (lh == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       lh->kuc_magic = KUC_MAGIC;
+       lh->kuc_transport = transport;
+       lh->kuc_msgtype = type;
+       lh->kuc_msglen = len;
+
+       return (void *)(lh + 1);
+}
+EXPORT_SYMBOL(kuc_alloc);
+
+/* Takes pointer to payload area */
+inline void kuc_free(void *p, int payload_len)
+{
+       struct kuc_hdr *lh = kuc_ptr(p);
+       OBD_FREE(lh, kuc_len(payload_len));
+}
+EXPORT_SYMBOL(kuc_free);
diff --git a/drivers/staging/lustre/lustre/obdclass/idmap.c b/drivers/staging/lustre/lustre/obdclass/idmap.c
new file mode 100644 (file)
index 0000000..622f8d1
--- /dev/null
@@ -0,0 +1,474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/idmap.c
+ *
+ * Lustre user identity mapping.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <lustre_idmap.h>
+#include <md_object.h>
+#include <obd_support.h>
+
+#define lustre_get_group_info(group_info) do {      \
+       atomic_inc(&(group_info)->usage);             \
+} while (0)
+
+#define lustre_put_group_info(group_info) do {      \
+       if (atomic_dec_and_test(&(group_info)->usage)) \
+               groups_free(group_info);               \
+} while (0)
+
+/*
+ * groups_search() is copied from linux kernel!
+ * A simple bsearch.
+ */
+static int lustre_groups_search(group_info_t *group_info,
+                               gid_t grp)
+{
+       int left, right;
+
+       if (!group_info)
+               return 0;
+
+       left = 0;
+       right = group_info->ngroups;
+       while (left < right) {
+               int mid = (left + right) / 2;
+               int cmp = grp - CFS_GROUP_AT(group_info, mid);
+
+               if (cmp > 0)
+                       left = mid + 1;
+               else if (cmp < 0)
+                       right = mid;
+               else
+                       return 1;
+       }
+       return 0;
+}
+
+void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist)
+{
+       int i;
+       int count = ginfo->ngroups;
+
+       /* fill group_info from gid array */
+       for (i = 0; i < ginfo->nblocks && count > 0; i++) {
+               int cp_count = min(CFS_NGROUPS_PER_BLOCK, count);
+               int off = i * CFS_NGROUPS_PER_BLOCK;
+               int len = cp_count * sizeof(*glist);
+
+               memcpy(ginfo->blocks[i], glist + off, len);
+               count -= cp_count;
+       }
+}
+EXPORT_SYMBOL(lustre_groups_from_list);
+
+/* groups_sort() is copied from linux kernel! */
+/* a simple shell-metzner sort */
+void lustre_groups_sort(group_info_t *group_info)
+{
+       int base, max, stride;
+       int gidsetsize = group_info->ngroups;
+
+       for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+               ; /* nothing */
+       stride /= 3;
+
+       while (stride) {
+               max = gidsetsize - stride;
+               for (base = 0; base < max; base++) {
+                       int left = base;
+                       int right = left + stride;
+                       gid_t tmp = CFS_GROUP_AT(group_info, right);
+
+                       while (left >= 0 &&
+                              CFS_GROUP_AT(group_info, left) > tmp) {
+                               CFS_GROUP_AT(group_info, right) =
+                                   CFS_GROUP_AT(group_info, left);
+                               right = left;
+                               left -= stride;
+                       }
+                       CFS_GROUP_AT(group_info, right) = tmp;
+               }
+               stride /= 3;
+       }
+}
+EXPORT_SYMBOL(lustre_groups_sort);
+
+int lustre_in_group_p(struct lu_ucred *mu, gid_t grp)
+{
+       int rc = 1;
+
+       if (grp != mu->uc_fsgid) {
+               group_info_t *group_info = NULL;
+
+               if (mu->uc_ginfo || !mu->uc_identity ||
+                   mu->uc_valid == UCRED_OLD)
+                       if (grp == mu->uc_suppgids[0] ||
+                           grp == mu->uc_suppgids[1])
+                               return 1;
+
+               if (mu->uc_ginfo)
+                       group_info = mu->uc_ginfo;
+               else if (mu->uc_identity)
+                       group_info = mu->uc_identity->mi_ginfo;
+
+               if (!group_info)
+                       return 0;
+
+               lustre_get_group_info(group_info);
+               rc = lustre_groups_search(group_info, grp);
+               lustre_put_group_info(group_info);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lustre_in_group_p);
+
+struct lustre_idmap_entry {
+       struct list_head       lie_rmt_uid_hash; /* hashed as lie_rmt_uid; */
+       struct list_head       lie_lcl_uid_hash; /* hashed as lie_lcl_uid; */
+       struct list_head       lie_rmt_gid_hash; /* hashed as lie_rmt_gid; */
+       struct list_head       lie_lcl_gid_hash; /* hashed as lie_lcl_gid; */
+       uid_t       lie_rmt_uid;      /* remote uid */
+       uid_t       lie_lcl_uid;      /* local uid */
+       gid_t       lie_rmt_gid;      /* remote gid */
+       gid_t       lie_lcl_gid;      /* local gid */
+};
+
+static inline __u32 lustre_idmap_hashfunc(__u32 id)
+{
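+       /* simple mask; assumes CFS_IDMAP_HASHSIZE is a power of two */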
+       return id & (CFS_IDMAP_HASHSIZE - 1);
+}
+
+static
+struct lustre_idmap_entry *idmap_entry_alloc(uid_t rmt_uid, uid_t lcl_uid,
+                                            gid_t rmt_gid, gid_t lcl_gid)
+{
+       struct lustre_idmap_entry *e;
+
+       OBD_ALLOC_PTR(e);
+       if (e == NULL)
+               return NULL;
+
+       INIT_LIST_HEAD(&e->lie_rmt_uid_hash);
+       INIT_LIST_HEAD(&e->lie_lcl_uid_hash);
+       INIT_LIST_HEAD(&e->lie_rmt_gid_hash);
+       INIT_LIST_HEAD(&e->lie_lcl_gid_hash);
+       e->lie_rmt_uid = rmt_uid;
+       e->lie_lcl_uid = lcl_uid;
+       e->lie_rmt_gid = rmt_gid;
+       e->lie_lcl_gid = lcl_gid;
+
+       return e;
+}
+
+static void idmap_entry_free(struct lustre_idmap_entry *e)
+{
+       if (!list_empty(&e->lie_rmt_uid_hash))
+               list_del(&e->lie_rmt_uid_hash);
+       if (!list_empty(&e->lie_lcl_uid_hash))
+               list_del(&e->lie_lcl_uid_hash);
+       if (!list_empty(&e->lie_rmt_gid_hash))
+               list_del(&e->lie_rmt_gid_hash);
+       if (!list_empty(&e->lie_lcl_gid_hash))
+               list_del(&e->lie_lcl_gid_hash);
+       OBD_FREE_PTR(e);
+}
+
+/*
+ * return value
+ * NULL: entry not found
+ * ERR_PTR(-EACCES): found a 1(remote):N(local) mapped entry
+ * others: found a normal entry
+ */
+static
+struct lustre_idmap_entry *idmap_search_entry(struct lustre_idmap_table *t,
+                                             uid_t rmt_uid, uid_t lcl_uid,
+                                             gid_t rmt_gid, gid_t lcl_gid)
+{
+       struct list_head *head;
+       struct lustre_idmap_entry *e;
+
+       head = &t->lit_idmaps[RMT_UIDMAP_IDX][lustre_idmap_hashfunc(rmt_uid)];
+       list_for_each_entry(e, head, lie_rmt_uid_hash)
+               if (e->lie_rmt_uid == rmt_uid) {
+                       if (e->lie_lcl_uid == lcl_uid) {
+                               if (e->lie_rmt_gid == rmt_gid &&
+                                   e->lie_lcl_gid == lcl_gid)
+                                       /* all four ids must match */
+                                       return e;
+                       } else {
+                               /* 1:N uid mapping */
+                               CERROR("rmt uid %u is already mapped to %u (new %u)\n",
+                                      e->lie_rmt_uid, e->lie_lcl_uid, lcl_uid);
+                               return ERR_PTR(-EACCES);
+                       }
+               }
+
+       head = &t->lit_idmaps[RMT_GIDMAP_IDX][lustre_idmap_hashfunc(rmt_gid)];
+       list_for_each_entry(e, head, lie_rmt_gid_hash)
+               if (e->lie_rmt_gid == rmt_gid) {
+                       if (e->lie_lcl_gid == lcl_gid) {
+                               if (unlikely(e->lie_rmt_uid == rmt_uid &&
+                                   e->lie_lcl_uid == lcl_uid))
+                                       /* after uid mapping search above,
+                                        * we should never come here */
+                                       LBUG();
+                       } else {
+                               /* 1:N gid mapping */
+                               CERROR("rmt gid %u is already mapped to %u (new %u)\n",
+                                      e->lie_rmt_gid, e->lie_lcl_gid, lcl_gid);
+                               return ERR_PTR(-EACCES);
+                       }
+               }
+
+       return NULL;
+}
+
+static __u32 idmap_lookup_uid(struct list_head *hash, int reverse,
+                             __u32 uid)
+{
+       struct list_head *head = &hash[lustre_idmap_hashfunc(uid)];
+       struct lustre_idmap_entry *e;
+
+       if (!reverse) {
+               list_for_each_entry(e, head, lie_rmt_uid_hash)
+                       if (e->lie_rmt_uid == uid)
+                               return e->lie_lcl_uid;
+       } else {
+               list_for_each_entry(e, head, lie_lcl_uid_hash)
+                       if (e->lie_lcl_uid == uid)
+                               return e->lie_rmt_uid;
+       }
+
+       return CFS_IDMAP_NOTFOUND;
+}
+
+static __u32 idmap_lookup_gid(struct list_head *hash, int reverse, __u32 gid)
+{
+       struct list_head *head = &hash[lustre_idmap_hashfunc(gid)];
+       struct lustre_idmap_entry *e;
+
+       if (!reverse) {
+               list_for_each_entry(e, head, lie_rmt_gid_hash)
+                       if (e->lie_rmt_gid == gid)
+                               return e->lie_lcl_gid;
+       } else {
+               list_for_each_entry(e, head, lie_lcl_gid_hash)
+                       if (e->lie_lcl_gid == gid)
+                               return e->lie_rmt_gid;
+       }
+
+       return CFS_IDMAP_NOTFOUND;
+}
+
+int lustre_idmap_add(struct lustre_idmap_table *t,
+                    uid_t ruid, uid_t luid,
+                    gid_t rgid, gid_t lgid)
+{
+       struct lustre_idmap_entry *e0, *e1;
+
+       LASSERT(t);
+
+       spin_lock(&t->lit_lock);
+       e0 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+       spin_unlock(&t->lit_lock);
+       if (!e0) {
+               e0 = idmap_entry_alloc(ruid, luid, rgid, lgid);
+               if (!e0)
+                       return -ENOMEM;
+
+               spin_lock(&t->lit_lock);
+               e1 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+               if (e1 == NULL) {
+                       list_add_tail(&e0->lie_rmt_uid_hash,
+                                         &t->lit_idmaps[RMT_UIDMAP_IDX]
+                                         [lustre_idmap_hashfunc(ruid)]);
+                       list_add_tail(&e0->lie_lcl_uid_hash,
+                                         &t->lit_idmaps[LCL_UIDMAP_IDX]
+                                         [lustre_idmap_hashfunc(luid)]);
+                       list_add_tail(&e0->lie_rmt_gid_hash,
+                                         &t->lit_idmaps[RMT_GIDMAP_IDX]
+                                         [lustre_idmap_hashfunc(rgid)]);
+                       list_add_tail(&e0->lie_lcl_gid_hash,
+                                         &t->lit_idmaps[LCL_GIDMAP_IDX]
+                                         [lustre_idmap_hashfunc(lgid)]);
+               }
+               spin_unlock(&t->lit_lock);
+               if (e1 != NULL) {
+                       idmap_entry_free(e0);
+                       if (IS_ERR(e1))
+                               return PTR_ERR(e1);
+               }
+       } else if (IS_ERR(e0)) {
+               return PTR_ERR(e0);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(lustre_idmap_add);
+
+int lustre_idmap_del(struct lustre_idmap_table *t,
+                   uid_t ruid, uid_t luid,
+                   gid_t rgid, gid_t lgid)
+{
+       struct lustre_idmap_entry *e;
+       int rc = 0;
+
+       LASSERT(t);
+
+       spin_lock(&t->lit_lock);
+       e = idmap_search_entry(t, ruid, luid, rgid, lgid);
+       if (IS_ERR(e))
+               rc = PTR_ERR(e);
+       else if (e)
+               idmap_entry_free(e);
+       spin_unlock(&t->lit_lock);
+
+       return rc;
+}
+EXPORT_SYMBOL(lustre_idmap_del);
+
+int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+                           struct lustre_idmap_table *t,
+                           int reverse, uid_t uid)
+{
+       struct list_head *hash;
+
+       if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+               if (!reverse) {
+                       if (uid == mu->uc_o_uid)
+                               return mu->uc_uid;
+                       else if (uid == mu->uc_o_fsuid)
+                               return mu->uc_fsuid;
+               } else {
+                       if (uid == mu->uc_uid)
+                               return mu->uc_o_uid;
+                       else if (uid == mu->uc_fsuid)
+                               return mu->uc_o_fsuid;
+               }
+       }
+
+       if (t == NULL)
+               return CFS_IDMAP_NOTFOUND;
+
+       hash = t->lit_idmaps[reverse ? LCL_UIDMAP_IDX : RMT_UIDMAP_IDX];
+
+       spin_lock(&t->lit_lock);
+       uid = idmap_lookup_uid(hash, reverse, uid);
+       spin_unlock(&t->lit_lock);
+
+       return uid;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_uid);
+
+int lustre_idmap_lookup_gid(struct lu_ucred *mu, struct lustre_idmap_table *t,
+                           int reverse, gid_t gid)
+{
+       struct list_head *hash;
+
+       if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+               if (!reverse) {
+                       if (gid == mu->uc_o_gid)
+                               return mu->uc_gid;
+                       else if (gid == mu->uc_o_fsgid)
+                               return mu->uc_fsgid;
+               } else {
+                       if (gid == mu->uc_gid)
+                               return mu->uc_o_gid;
+                       else if (gid == mu->uc_fsgid)
+                               return mu->uc_o_fsgid;
+               }
+       }
+
+       if (t == NULL)
+               return CFS_IDMAP_NOTFOUND;
+
+       hash = t->lit_idmaps[reverse ? LCL_GIDMAP_IDX : RMT_GIDMAP_IDX];
+
+       spin_lock(&t->lit_lock);
+       gid = idmap_lookup_gid(hash, reverse, gid);
+       spin_unlock(&t->lit_lock);
+
+       return gid;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_gid);
+
+struct lustre_idmap_table *lustre_idmap_init(void)
+{
+       struct lustre_idmap_table *t;
+       int i, j;
+
+       OBD_ALLOC_PTR(t);
+       if (unlikely(t == NULL))
+               return ERR_PTR(-ENOMEM);
+
+       spin_lock_init(&t->lit_lock);
+       for (i = 0; i < ARRAY_SIZE(t->lit_idmaps); i++)
+               for (j = 0; j < ARRAY_SIZE(t->lit_idmaps[i]); j++)
+                       INIT_LIST_HEAD(&t->lit_idmaps[i][j]);
+
+       return t;
+}
+EXPORT_SYMBOL(lustre_idmap_init);
+
+void lustre_idmap_fini(struct lustre_idmap_table *t)
+{
+       struct list_head *list;
+       struct lustre_idmap_entry *e;
+       int i;
+       LASSERT(t);
+
+       list = t->lit_idmaps[RMT_UIDMAP_IDX];
+       spin_lock(&t->lit_lock);
+       for (i = 0; i < CFS_IDMAP_HASHSIZE; i++)
+               while (!list_empty(&list[i])) {
+                       e = list_entry(list[i].next,
+                                          struct lustre_idmap_entry,
+                                          lie_rmt_uid_hash);
+                       idmap_entry_free(e);
+               }
+       spin_unlock(&t->lit_lock);
+
+       OBD_FREE_PTR(t);
+}
+EXPORT_SYMBOL(lustre_idmap_fini);
diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c
new file mode 100644 (file)
index 0000000..b5c19ac
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <lustre_linkea.h>
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf)
+{
+       ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_CACHE_SIZE);
+       if (ldata->ld_buf->lb_buf == NULL)
+               return -ENOMEM;
+       ldata->ld_leh = ldata->ld_buf->lb_buf;
+       ldata->ld_leh->leh_magic = LINK_EA_MAGIC;
+       ldata->ld_leh->leh_len = sizeof(struct link_ea_header);
+       ldata->ld_leh->leh_reccount = 0;
+       return 0;
+}
+EXPORT_SYMBOL(linkea_data_new);
+
+int linkea_init(struct linkea_data *ldata)
+{
+       struct link_ea_header *leh;
+
+       LASSERT(ldata->ld_buf != NULL);
+       leh = ldata->ld_buf->lb_buf;
+       if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) {
+               leh->leh_magic = LINK_EA_MAGIC;
+               leh->leh_reccount = __swab32(leh->leh_reccount);
+               leh->leh_len = __swab64(leh->leh_len);
+               /* entries are swabbed by linkea_entry_unpack */
+       }
+       if (leh->leh_magic != LINK_EA_MAGIC)
+               return -EINVAL;
+       if (leh->leh_reccount == 0)
+               return -ENODATA;
+
+       ldata->ld_leh = leh;
+       return 0;
+}
+EXPORT_SYMBOL(linkea_init);
+
+/**
+ * Pack a link_ea_entry.
+ * All elements are stored as chars to avoid alignment issues.
+ * Numbers are always big-endian
+ * \retval record length
+ */
+static int linkea_entry_pack(struct link_ea_entry *lee,
+                            const struct lu_name *lname,
+                            const struct lu_fid *pfid)
+{
+       struct lu_fid   tmpfid;
+       int          reclen;
+
+       fid_cpu_to_be(&tmpfid, pfid);
+       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH))
+               tmpfid.f_ver = ~0;
+       memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid));
+       memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen);
+       reclen = sizeof(struct link_ea_entry) + lname->ln_namelen;
+
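+       /* the record length is stored as two separate bytes (big-endian)
+        * so the entry itself has no alignment requirements */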
+       lee->lee_reclen[0] = (reclen >> 8) & 0xff;
+       lee->lee_reclen[1] = reclen & 0xff;
+       return reclen;
+}
+
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+                        struct lu_name *lname, struct lu_fid *pfid)
+{
+       *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1];
+       memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid));
+       fid_be_to_cpu(pfid, pfid);
+       lname->ln_name = lee->lee_name;
+       lname->ln_namelen = *reclen - sizeof(struct link_ea_entry);
+}
+EXPORT_SYMBOL(linkea_entry_unpack);
+
+/**
+ * Add a record to the end of link ea buf
+ **/
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+                  const struct lu_fid *pfid)
+{
+       LASSERT(ldata->ld_leh != NULL);
+
+       if (lname == NULL || pfid == NULL)
+               return -EINVAL;
+
+       ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry);
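+       /* grow the buffer if the new record does not fit */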
+       if (ldata->ld_leh->leh_len + ldata->ld_reclen >
+           ldata->ld_buf->lb_len) {
+               if (lu_buf_check_and_grow(ldata->ld_buf,
+                                         ldata->ld_leh->leh_len +
+                                         ldata->ld_reclen) < 0)
+                       return -ENOMEM;
+       }
+
+       ldata->ld_leh = ldata->ld_buf->lb_buf;
+       ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len;
+       ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid);
+       ldata->ld_leh->leh_len += ldata->ld_reclen;
+       ldata->ld_leh->leh_reccount++;
+       CDEBUG(D_INODE, "New link_ea name '%.*s' is added\n",
+              lname->ln_namelen, lname->ln_name);
+       return 0;
+}
+EXPORT_SYMBOL(linkea_add_buf);
+
+/** Del the current record from the link ea buf */
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname)
+{
+       LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL);
+
+       ldata->ld_leh->leh_reccount--;
+       ldata->ld_leh->leh_len -= ldata->ld_reclen;
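+       /* slide the remaining entries down over the deleted record */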
+       memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen,
+               (char *)ldata->ld_leh + ldata->ld_leh->leh_len -
+               (char *)ldata->ld_lee);
+       CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n",
+              lname->ln_namelen, lname->ln_name);
+}
+EXPORT_SYMBOL(linkea_del_buf);
+
+/**
+ * Check if such a link exists in linkEA.
+ *
+ * \param ldata link data to search in
+ * \param lname name in the parent's directory entry pointing to this object
+ * \param pfid parent fid the link should point to
+ *
+ * \retval   0 success
+ * \retval -ENOENT link does not exist
+ * \retval -ve on error
+ */
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+                     const struct lu_fid  *pfid)
+{
+       struct lu_name tmpname;
+       struct lu_fid  tmpfid;
+       int count;
+
+       LASSERT(ldata->ld_leh != NULL);
+
+       /* link #0 */
+       ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1);
+
+       for (count = 0; count < ldata->ld_leh->leh_reccount; count++) {
+               linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen,
+                                   &tmpname, &tmpfid);
+               if (tmpname.ln_namelen == lname->ln_namelen &&
+                   lu_fid_eq(&tmpfid, pfid) &&
+                   (strncmp(tmpname.ln_name, lname->ln_name,
+                            tmpname.ln_namelen) == 0))
+                       break;
+               ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee +
+                                                        ldata->ld_reclen);
+       }
+
+       if (count == ldata->ld_leh->leh_reccount) {
+               CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n",
+                      lname->ln_namelen, lname->ln_name);
+               ldata->ld_lee = NULL;
+               return -ENOENT;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(linkea_links_find);
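
linkea_links_find() walks the entries that follow the header: it starts at the first record, unpacks each one to learn its length, compares name and parent FID, and advances by the unpacked reclen until leh_reccount records have been seen. A hedged user-space sketch of that scan over a flat buffer of length-prefixed records (hypothetical layout, not the real linkEA format):

#include <stdio.h>
#include <string.h>

/* Hypothetical header: only a record count, records follow immediately. */
struct demo_header { unsigned short reccount; };

static const char *demo_find(const char *buf, const char *wanted)
{
	const struct demo_header *h = (const struct demo_header *)buf;
	const char *p = buf + sizeof(*h);	/* record #0 follows header */
	unsigned short i;

	for (i = 0; i < h->reccount; i++) {
		unsigned int reclen = ((unsigned char)p[0] << 8) |
				      (unsigned char)p[1];
		const char *name = p + 2;
		size_t namelen = reclen - 2;

		if (namelen == strlen(wanted) &&
		    strncmp(name, wanted, namelen) == 0)
			return p;		/* found the matching entry */
		p += reclen;			/* advance to the next record */
	}
	return NULL;				/* like -ENOENT */
}

int main(void)
{
	/* two records: "alpha" (reclen 7) and "beta" (reclen 6) */
	char buf[64];
	struct demo_header *h = (struct demo_header *)buf;
	char *p = buf + sizeof(*h);

	h->reccount = 2;
	p[0] = 0; p[1] = 7; memcpy(p + 2, "alpha", 5); p += 7;
	p[0] = 0; p[1] = 6; memcpy(p + 2, "beta", 4);

	printf("found beta: %s\n", demo_find(buf, "beta") ? "yes" : "no");
	return 0;
}
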
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
new file mode 100644
index 0000000..16208ba
--- /dev/null
@@ -0,0 +1,430 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-module.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/lp.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <asm/io.h>
+#include <asm/ioctls.h>
+#include <asm/poll.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/seq_file.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lprocfs_status.h>
+#include <lustre_ver.h>
+#include <lustre/lustre_build_version.h>
+
+int proc_version;
+
+/* buffer MUST be at least the size of obd_ioctl_hdr */
+int obd_ioctl_getdata(char **buf, int *len, void *arg)
+{
+       struct obd_ioctl_hdr hdr;
+       struct obd_ioctl_data *data;
+       int err;
+       int offset = 0;
+       ENTRY;
+
+       err = copy_from_user(&hdr, (void *)arg, sizeof(hdr));
+       if ( err )
+               RETURN(err);
+
+       if (hdr.ioc_version != OBD_IOCTL_VERSION) {
+               CERROR("Version mismatch kernel (%x) vs application (%x)\n",
+                      OBD_IOCTL_VERSION, hdr.ioc_version);
+               RETURN(-EINVAL);
+       }
+
+       if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) {
+               CERROR("User buffer len %d exceeds %d max buffer\n",
+                      hdr.ioc_len, OBD_MAX_IOCTL_BUFFER);
+               RETURN(-EINVAL);
+       }
+
+       if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) {
+               CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+
+       /* When many processes call vmalloc on a multi-core system, the
+        * high lock contention hurts performance badly; obdfilter-survey,
+        * which relies on ioctl, is an example. So we'd better avoid
+        * vmalloc on the ioctl path. LU-66 */
+       OBD_ALLOC_LARGE(*buf, hdr.ioc_len);
+       if (*buf == NULL) {
+               CERROR("Cannot allocate control buffer of len %d\n",
+                      hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+       *len = hdr.ioc_len;
+       data = (struct obd_ioctl_data *)*buf;
+
+       err = copy_from_user(*buf, (void *)arg, hdr.ioc_len);
+       if ( err ) {
+               OBD_FREE_LARGE(*buf, hdr.ioc_len);
+               RETURN(err);
+       }
+
+       if (obd_ioctl_is_invalid(data)) {
+               CERROR("ioctl not correctly formatted\n");
+               OBD_FREE_LARGE(*buf, hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+
+       if (data->ioc_inllen1) {
+               data->ioc_inlbuf1 = &data->ioc_bulk[0];
+               offset += cfs_size_round(data->ioc_inllen1);
+       }
+
+       if (data->ioc_inllen2) {
+               data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset;
+               offset += cfs_size_round(data->ioc_inllen2);
+       }
+
+       if (data->ioc_inllen3) {
+               data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset;
+               offset += cfs_size_round(data->ioc_inllen3);
+       }
+
+       if (data->ioc_inllen4) {
+               data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset;
+       }
+
+       EXIT;
+       return 0;
+}
+EXPORT_SYMBOL(obd_ioctl_getdata);
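
obd_ioctl_getdata() follows a two-step ioctl unmarshalling pattern: copy in the small fixed-size obd_ioctl_hdr first, validate its version and the total length it advertises against OBD_MAX_IOCTL_BUFFER, then allocate a buffer of that length, copy the whole payload, and re-validate it before carving the trailing bulk area into the inline buffers. The sketch below is a hedged user-space model of the header-then-payload step only, with hypothetical types and limits standing in for the real obd_ioctl_data:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_VERSION	0x10
#define DEMO_MAX_LEN	8192

struct demo_hdr {
	unsigned int version;
	unsigned int len;		/* total length, header included */
};

/* Hypothetical model of the "header first, then full buffer" unpack. */
static int demo_getdata(char **buf, size_t *len, const void *user,
			size_t user_len)
{
	struct demo_hdr hdr;

	if (user_len < sizeof(hdr))
		return -EINVAL;
	memcpy(&hdr, user, sizeof(hdr));	/* stands in for copy_from_user */

	if (hdr.version != DEMO_VERSION)
		return -EINVAL;			/* version mismatch */
	if (hdr.len < sizeof(hdr) || hdr.len > DEMO_MAX_LEN ||
	    hdr.len > user_len)
		return -EINVAL;			/* advertised length out of range */

	*buf = malloc(hdr.len);
	if (*buf == NULL)
		return -ENOMEM;
	memcpy(*buf, user, hdr.len);		/* copy the whole payload */
	*len = hdr.len;
	return 0;
}

int main(void)
{
	struct {
		struct demo_hdr h;
		char payload[16];
	} msg = {
		.h = { .version = DEMO_VERSION, .len = sizeof(msg) },
		.payload = "hello",
	};
	char *buf = NULL;
	size_t len = 0;
	int rc = demo_getdata(&buf, &len, &msg, sizeof(msg));

	printf("rc=%d len=%zu payload=%s\n", rc, len,
	       rc ? "(none)" : buf + sizeof(struct demo_hdr));
	free(buf);
	return 0;
}
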
+
+int obd_ioctl_popdata(void *arg, void *data, int len)
+{
+       int err;
+
+       err = copy_to_user(arg, data, len);
+       if (err)
+               err = -EFAULT;
+       return err;
+}
+EXPORT_SYMBOL(obd_ioctl_popdata);
+
+/*  opening /dev/obd */
+static int obd_class_open(struct inode * inode, struct file * file)
+{
+       ENTRY;
+
+       try_module_get(THIS_MODULE);
+       RETURN(0);
+}
+
+/*  closing /dev/obd */
+static int obd_class_release(struct inode * inode, struct file * file)
+{
+       ENTRY;
+
+       module_put(THIS_MODULE);
+       RETURN(0);
+}
+
+/* to control /dev/obd */
+static long obd_class_ioctl(struct file *filp, unsigned int cmd,
+                           unsigned long arg)
+{
+       int err = 0;
+       ENTRY;
+
+       /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET))
+               RETURN(err = -EACCES);
+       if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */
+               RETURN(err = -ENOTTY);
+
+       err = class_handle_ioctl(cmd, (unsigned long)arg);
+
+       RETURN(err);
+}
+
+/* declare character device */
+static struct file_operations obd_psdev_fops = {
+       .owner    = THIS_MODULE,
+       .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */
+       .open      = obd_class_open,      /* open */
+       .release        = obd_class_release,   /* release */
+};
+
+/* modules setup */
+psdev_t obd_psdev = {
+       .minor = OBD_DEV_MINOR,
+       .name  = OBD_DEV_NAME,
+       .fops  = &obd_psdev_fops,
+};
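
obd_psdev pairs the file operations above with a minor number and name to describe the /dev/obd control node; given the linux/miscdevice.h include above, psdev_t is presumably a thin wrapper over the kernel's misc-device machinery. As a generic, hedged illustration (not Lustre code), a minimal misc character device with the same open/release/unlocked_ioctl shape can be registered like this:

#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>

static int demo_open(struct inode *inode, struct file *file)
{
	return 0;
}

static int demo_release(struct inode *inode, struct file *file)
{
	return 0;
}

static long demo_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	/* dispatch on cmd here; -ENOTTY for commands we do not recognize */
	return -ENOTTY;
}

static const struct file_operations demo_fops = {
	.owner		= THIS_MODULE,
	.open		= demo_open,
	.release	= demo_release,
	.unlocked_ioctl	= demo_ioctl,
};

static struct miscdevice demo_dev = {
	.minor	= MISC_DYNAMIC_MINOR,	/* let the kernel pick a minor */
	.name	= "demo_ctl",		/* appears as /dev/demo_ctl */
	.fops	= &demo_fops,
};

static int __init demo_init(void)
{
	return misc_register(&demo_dev);
}

static void __exit demo_exit(void)
{
	misc_deregister(&demo_dev);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
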
+
+
+#ifdef LPROCFS
+int obd_proc_read_version(char *page, char **start, off_t off, int count,
+                         int *eof, void *data)
+{
+       *eof = 1;
+       return snprintf(page, count, "lustre: %s\nkernel: %s\nbuild:  %s\n",
+                       LUSTRE_VERSION_STRING, "patchless_client",
+                       BUILD_VERSION);
+}
+
+int obd_proc_read_pinger(char *page, char **start, off_t off, int count,
+                        int *eof, void *data)
+{
+       *eof = 1;
+       return snprintf(page, count, "%s\n", "on");
+}
+
+/**
+ * Check the health of all obd devices
+ *
+ * \param page
+ * \param start
+ * \param off
+ * \param count
+ * \param eof
+ *               standard proc read function parameters; please refer to the
+ *               kernel code fs/proc/generic.c proc_file_read()
+ * \param data [in] unused
+ *
+ * \retval number of characters printed
+ */
+static int obd_proc_read_health(char *page, char **start, off_t off,
+                               int count, int *eof, void *data)
+{
+       int rc = 0, i;
+       *eof = 1;
+
+       if (libcfs_catastrophe)
+               rc += snprintf(page + rc, count - rc, "LBUG\n");
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd;
+
+               obd = class_num2obd(i);
+               if (obd == NULL || !obd->obd_attached || !obd->obd_set_up)
+                       continue;
+
+               LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+               if (obd->obd_stopping)
+                       continue;
+
+               class_incref(obd, __FUNCTION__, current);
+               read_unlock(&obd_dev_lock);
+
+               if (obd_health_check(NULL, obd)) {
+                       rc += snprintf(page + rc, count - rc,
+                                      "device %s reported unhealthy\n",
+                                      obd->obd_name);
+               }
+               class_decref(obd, __FUNCTION__, current);
+               read_lock(&obd_dev_lock);
+       }
+       read_unlock(&obd_dev_lock);
+
+       if (rc == 0)
+               return snprintf(page, count, "healthy\n");
+
+       rc += snprintf(page + rc, count - rc, "NOT HEALTHY\n");
+       return rc;
+}
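
The health check above uses a careful iteration pattern: the device table is scanned under obd_dev_lock (a spinning read lock), but each device is pinned with class_incref() and the lock dropped before calling obd_health_check(), which may block, after which the lock is re-taken to continue the scan. A hedged, generic sketch of that pin-then-drop-the-lock idiom, with a hypothetical device table and refcount in place of the real obd machinery:

#include <linux/atomic.h>
#include <linux/spinlock.h>

#define DEMO_MAX 8

struct demo_dev {
	int		in_use;
	atomic_t	ref;
};

static struct demo_dev demo_devs[DEMO_MAX];
static DEFINE_RWLOCK(demo_lock);

static int demo_check_one(struct demo_dev *d)
{
	/* stand-in for a per-device check that may sleep */
	return 0;
}

static int demo_check_all(void)
{
	int unhealthy = 0, i;

	read_lock(&demo_lock);
	for (i = 0; i < DEMO_MAX; i++) {
		struct demo_dev *d = &demo_devs[i];

		if (!d->in_use)
			continue;

		/* pin the device, then drop the spinning lock around the
		 * check, which is allowed to sleep */
		atomic_inc(&d->ref);
		read_unlock(&demo_lock);

		if (demo_check_one(d))
			unhealthy++;

		atomic_dec(&d->ref);
		read_lock(&demo_lock);
	}
	read_unlock(&demo_lock);
	return unhealthy;
}
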
+
+static int obd_proc_rd_jobid_var(char *page, char **start, off_t off,
+                               int count, int *eof, void *data)
+{
+       return snprintf(page, count, "%s\n", obd_jobid_var);
+}
+
+static int obd_proc_wr_jobid_var(struct file *file, const char *buffer,
+                               unsigned long count, void *data)
+{
+       if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN)
+               return -EINVAL;
+
+       memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+       /* Trim the trailing '\n' if any */
+       memcpy(obd_jobid_var, buffer, count - (buffer[count - 1] == '\n'));
+       return count;
+}
+
+/* Root for /proc/fs/lustre */
+struct proc_dir_entry *proc_lustre_root = NULL;
+EXPORT_SYMBOL(proc_lustre_root);
+
+struct lprocfs_vars lprocfs_base[] = {
+       { "version", obd_proc_read_version, NULL, NULL },
+       { "pinger", obd_proc_read_pinger, NULL, NULL },
+       { "health_check", obd_proc_read_health, NULL, NULL },
+       { "jobid_var", obd_proc_rd_jobid_var,
+                      obd_proc_wr_jobid_var, NULL },
+       { 0 }
+};
+#else
+#define lprocfs_base NULL
+#endif /* LPROCFS */
+
+static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos)
+{
+       if (*pos >= class_devno_max())
+               return NULL;
+
+       return pos;
+}
+
+static void obd_device_list_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       ++*pos;
+       if (*pos >= class_devno_max())
+               return NULL;
+
+       return pos;
+}
+
+static int obd_device_list_seq_show(struct seq_file *p, void *v)
+{
+       loff_t index = *(loff_t *)v;
+       struct obd_device *obd = class_num2obd((int)index);
+       char *status;
+
+       if (obd == NULL)
+               return 0;
+
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+       if (obd->obd_stopping)
+               status = "ST";
+       else if (obd->obd_inactive)
+               status = "IN";
+       else if (obd->obd_set_up)
+               status = "UP";
+       else if (obd->obd_attached)
+               status = "AT";
+       else
+               status = "--";
+
+       return seq_printf(p, "%3d %s %s %s %s %d\n",
+                         (int)index, status, obd->obd_type->typ_name,
+                         obd->obd_name, obd->obd_uuid.uuid,
+                         atomic_read(&obd->obd_refcount));
+}
+
+struct seq_operations obd_device_list_sops = {
+       .start = obd_device_list_seq_start,
+       .stop = obd_device_list_seq_stop,
+       .next = obd_device_list_seq_next,
+       .show = obd_device_list_seq_show,
+};
+
+static int obd_device_list_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry *dp = PDE(inode);
+       struct seq_file *seq;
+       int rc = seq_open(file, &obd_device_list_sops);
+
+       if (rc)
+               return rc;
+
+       seq = file->private_data;
+       seq->private = dp->data;
+
+       return 0;
+}
+
+struct file_operations obd_device_list_fops = {
+       .owner   = THIS_MODULE,
+       .open    = obd_device_list_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
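
The devices file is a textbook seq_file provider: start/next iterate a numeric position, show prints one line per device, and the file_operations simply delegate to seq_read/seq_lseek/seq_release. A hedged, generic example of the same iterator shape over a static array (standard seq_file API; the proc path and names here are hypothetical, unlike the real /proc/fs/lustre/devices registered below through lprocfs_seq_create()):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static const char *demo_names[] = { "alpha", "beta", "gamma" };

static void *demo_seq_start(struct seq_file *p, loff_t *pos)
{
	return *pos < ARRAY_SIZE(demo_names) ? pos : NULL;
}

static void *demo_seq_next(struct seq_file *p, void *v, loff_t *pos)
{
	++*pos;
	return *pos < ARRAY_SIZE(demo_names) ? pos : NULL;
}

static void demo_seq_stop(struct seq_file *p, void *v)
{
}

static int demo_seq_show(struct seq_file *p, void *v)
{
	loff_t index = *(loff_t *)v;

	seq_printf(p, "%3d %s\n", (int)index, demo_names[index]);
	return 0;
}

static const struct seq_operations demo_sops = {
	.start	= demo_seq_start,
	.stop	= demo_seq_stop,
	.next	= demo_seq_next,
	.show	= demo_seq_show,
};

static int demo_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_sops);
}

static const struct file_operations demo_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = demo_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

static int __init demo_seq_init(void)
{
	/* shows up as /proc/demo_devices on kernels of this era */
	return proc_create("demo_devices", 0444, NULL, &demo_seq_fops) ?
	       0 : -ENOMEM;
}

static void __exit demo_seq_exit(void)
{
	remove_proc_entry("demo_devices", NULL);
}

module_init(demo_seq_init);
module_exit(demo_seq_exit);
MODULE_LICENSE("GPL");
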
+
+int class_procfs_init(void)
+{
+       int rc;
+       ENTRY;
+
+       obd_sysctl_init();
+       proc_lustre_root = lprocfs_register("fs/lustre", NULL,
+                                           lprocfs_base, NULL);
+       rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444,
+                               &obd_device_list_fops, NULL);
+       if (rc)
+               CERROR("error adding /proc/fs/lustre/devices file\n");
+       RETURN(0);
+}
+
+int class_procfs_clean(void)
+{
+       ENTRY;
+       if (proc_lustre_root) {
+               lprocfs_remove(&proc_lustre_root);
+       }
+       RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
new file mode 100644
index 0000000..6ee3471
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/fs.h>
+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
+
+/*FIXME: Just copy from obdo_from_inode*/
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid)
+{
+       obd_flag newvalid = 0;
+
+       if (valid & LA_ATIME) {
+               dst->o_atime = la->la_atime;
+               newvalid |= OBD_MD_FLATIME;
+       }
+       if (valid & LA_MTIME) {
+               dst->o_mtime = la->la_mtime;
+               newvalid |= OBD_MD_FLMTIME;
+       }
+       if (valid & LA_CTIME) {
+               dst->o_ctime = la->la_ctime;
+               newvalid |= OBD_MD_FLCTIME;
+       }
+       if (valid & LA_SIZE) {
+               dst->o_size = la->la_size;
+               newvalid |= OBD_MD_FLSIZE;
+       }
+       if (valid & LA_BLOCKS) {  /* allocation of space (x512 bytes) */
+               dst->o_blocks = la->la_blocks;
+               newvalid |= OBD_MD_FLBLOCKS;
+       }
+       if (valid & LA_TYPE) {
+               dst->o_mode = (dst->o_mode & S_IALLUGO) |
+                             (la->la_mode & S_IFMT);
+               newvalid |= OBD_MD_FLTYPE;
+       }
+       if (valid & LA_MODE) {
+               dst->o_mode = (dst->o_mode & S_IFMT) |
+                             (la->la_mode & S_IALLUGO);
+               newvalid |= OBD_MD_FLMODE;
+       }
+       if (valid & LA_UID) {
+               dst->o_uid = la->la_uid;
+               newvalid |= OBD_MD_FLUID;
+       }
+       if (valid & LA_GID) {
+               dst->o_gid = la->la_gid;
+               newvalid |= OBD_MD_FLGID;
+       }
+       dst->o_valid |= newvalid;
+}
+EXPORT_SYMBOL(obdo_from_la);
+
+/*FIXME: Just copy from obdo_from_inode*/
+void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, obd_flag valid)
+{
+       __u64 newvalid = 0;
+
+       valid &= obdo->o_valid;
+
+       if (valid & OBD_MD_FLATIME) {
+               dst->la_atime = obdo->o_atime;
+               newvalid |= LA_ATIME;
+       }
+       if (valid & OBD_MD_FLMTIME) {
+               dst->la_mtime = obdo->o_mtime;
+               newvalid |= LA_MTIME;
+       }
+       if (valid & OBD_MD_FLCTIME) {
+               dst->la_ctime = obdo->o_ctime;
+               newvalid |= LA_CTIME;
+       }
+       if (valid & OBD_MD_FLSIZE) {
+               dst->la_size = obdo->o_size;
+               newvalid |= LA_SIZE;
+       }
+       if (valid & OBD_MD_FLBLOCKS) {
+               dst->la_blocks = obdo->o_blocks;
+               newvalid |= LA_BLOCKS;
+       }
+       if (valid & OBD_MD_FLTYPE) {
+               dst->la_mode = (dst->la_mode & S_IALLUGO) |
+                              (obdo->o_mode & S_IFMT);
+               newvalid |= LA_TYPE;
+       }
+       if (valid & OBD_MD_FLMODE) {
+               dst->la_mode = (dst->la_mode & S_IFMT) |
+                              (obdo->o_mode & S_IALLUGO);
+               newvalid |= LA_MODE;
+       }
+       if (valid & OBD_MD_FLUID) {
+               dst->la_uid = obdo->o_uid;
+               newvalid |= LA_UID;
+       }
+       if (valid & OBD_MD_FLGID) {
+               dst->la_gid = obdo->o_gid;
+               newvalid |= LA_GID;
+       }
+       dst->la_valid = newvalid;
+}
+EXPORT_SYMBOL(la_from_obdo);
+
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+       valid &= src->o_valid;
+
+       if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+               CDEBUG(D_INODE,
+                      "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+                      src->o_valid, LTIME_S(dst->i_mtime),
+                      LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+       if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime))
+               LTIME_S(dst->i_atime) = src->o_atime;
+       if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime))
+               LTIME_S(dst->i_mtime) = src->o_mtime;
+       if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+               LTIME_S(dst->i_ctime) = src->o_ctime;
+       if (valid & OBD_MD_FLSIZE)
+               i_size_write(dst, src->o_size);
+       /* optimum IO size */
+       if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits))
+               dst->i_blkbits = ffs(src->o_blksize) - 1;
+
+       if (dst->i_blkbits < PAGE_CACHE_SHIFT)
+               dst->i_blkbits = PAGE_CACHE_SHIFT;
+
+       /* allocation of space */
+       if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks)
+               /*
+                * XXX shouldn't overflow be checked here like in
+                * obdo_to_inode().
+                */
+               dst->i_blocks = src->o_blocks;
+}
+EXPORT_SYMBOL(obdo_refresh_inode);
+
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+       valid &= src->o_valid;
+
+       LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID |
+                           OBD_MD_FLID | OBD_MD_FLGROUP)),
+                "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid);
+
+       if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+               CDEBUG(D_INODE,
+                      "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+                      src->o_valid, LTIME_S(dst->i_mtime),
+                      LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+       if (valid & OBD_MD_FLATIME)
+               LTIME_S(dst->i_atime) = src->o_atime;
+       if (valid & OBD_MD_FLMTIME)
+               LTIME_S(dst->i_mtime) = src->o_mtime;
+       if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+               LTIME_S(dst->i_ctime) = src->o_ctime;
+       if (valid & OBD_MD_FLSIZE)
+               i_size_write(dst, src->o_size);
+       if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */
+               dst->i_blocks = src->o_blocks;
+               if (dst->i_blocks < src->o_blocks) /* overflow */
+                       dst->i_blocks = -1;
+
+       }
+       if (valid & OBD_MD_FLBLKSZ)
+               dst->i_blkbits = ffs(src->o_blksize)-1;
+       if (valid & OBD_MD_FLMODE)
+               dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+       if (valid & OBD_MD_FLUID)
+               dst->i_uid = src->o_uid;
+       if (valid & OBD_MD_FLGID)
+               dst->i_gid = src->o_gid;
+       if (valid & OBD_MD_FLFLAGS)
+               dst->i_flags = src->o_flags;
+}
+EXPORT_SYMBOL(obdo_to_inode);
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
new file mode 100644
index 0000000..46aad68
--- /dev/null
@@ -0,0 +1,445 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/ctype.h>
+#include <asm/bitops.h>
+#include <asm/uaccess.h>
+#include <linux/utsname.h>
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#ifdef CONFIG_SYSCTL
+ctl_table_header_t *obd_table_header = NULL;
+#endif
+
+
+#define OBD_SYSCTL 300
+
+enum {
+       OBD_TIMEOUT = 3,        /* RPC timeout before recovery/intr */
+       OBD_DUMP_ON_TIMEOUT,    /* dump kernel debug log upon eviction */
+       OBD_MEMUSED,        /* bytes currently OBD_ALLOCated */
+       OBD_PAGESUSED,    /* pages currently OBD_PAGE_ALLOCated */
+       OBD_MAXMEMUSED,  /* maximum bytes OBD_ALLOCated concurrently */
+       OBD_MAXPAGESUSED,       /* maximum pages OBD_PAGE_ALLOCated concurrently */
+       OBD_SYNCFILTER,  /* XXX temporary, as we play with sync osts.. */
+       OBD_LDLM_TIMEOUT,       /* LDLM timeout for ASTs before client eviction */
+       OBD_DUMP_ON_EVICTION,   /* dump kernel debug log upon eviction */
+       OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
+       OBD_ALLOC_FAIL_RATE,    /* memory allocation random failure rate */
+       OBD_MAX_DIRTY_PAGES,    /* maximum dirty pages */
+       OBD_AT_MIN,          /* Adaptive timeouts params */
+       OBD_AT_MAX,
+       OBD_AT_EXTRA,
+       OBD_AT_EARLY_MARGIN,
+       OBD_AT_HISTORY,
+};
+
+
+int LL_PROC_PROTO(proc_set_timeout)
+{
+       int rc;
+
+       rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       if (ldlm_timeout >= obd_timeout)
+               ldlm_timeout = max(obd_timeout / 3, 1U);
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_memory_alloc)
+{
+       char buf[22];
+       int len;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_sum());
+       if (len > *lenp)
+               len = *lenp;
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               return -EFAULT;
+       *lenp = len;
+       *ppos += *lenp;
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_pages_alloc)
+{
+       char buf[22];
+       int len;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_sum());
+       if (len > *lenp)
+               len = *lenp;
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               return -EFAULT;
+       *lenp = len;
+       *ppos += *lenp;
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_mem_max)
+{
+       char buf[22];
+       int len;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_max());
+       if (len > *lenp)
+               len = *lenp;
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               return -EFAULT;
+       *lenp = len;
+       *ppos += *lenp;
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_pages_max)
+{
+       char buf[22];
+       int len;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_max());
+       if (len > *lenp)
+               len = *lenp;
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               return -EFAULT;
+       *lenp = len;
+       *ppos += *lenp;
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_max_dirty_pages_in_mb)
+{
+       int rc = 0;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write) {
+               rc = lprocfs_write_frac_helper(buffer, *lenp,
+                                              (unsigned int*)table->data,
+                                              1 << (20 - PAGE_CACHE_SHIFT));
+               /* Don't allow dirty pages to exceed 90% of system memory,
+                * and set a hard minimum of 4MB. */
+               if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) {
+                       CERROR("Refusing to set max dirty pages to %u, which "
+                              "is more than 90%% of available RAM; setting "
+                              "to %lu\n", obd_max_dirty_pages,
+                              ((num_physpages / 10) * 9));
+                       obd_max_dirty_pages = ((num_physpages / 10) * 9);
+               } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) {
+                       obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT);
+               }
+       } else {
+               char buf[21];
+               int len;
+
+               len = lprocfs_read_frac_helper(buf, sizeof(buf),
+                                              *(unsigned int*)table->data,
+                                              1 << (20 - PAGE_CACHE_SHIFT));
+               if (len > *lenp)
+                       len = *lenp;
+               buf[len] = '\0';
+               if (copy_to_user(buffer, buf, len))
+                       return -EFAULT;
+               *lenp = len;
+       }
+       *ppos += *lenp;
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_alloc_fail_rate)
+{
+       int rc    = 0;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write) {
+               rc = lprocfs_write_frac_helper(buffer, *lenp,
+                                              (unsigned int*)table->data,
+                                              OBD_ALLOC_FAIL_MULT);
+       } else {
+               char buf[21];
+               int  len;
+
+               len = lprocfs_read_frac_helper(buf, 21,
+                                              *(unsigned int*)table->data,
+                                              OBD_ALLOC_FAIL_MULT);
+               if (len > *lenp)
+                       len = *lenp;
+               buf[len] = '\0';
+               if (copy_to_user(buffer, buf, len))
+                       return -EFAULT;
+               *lenp = len;
+       }
+       *ppos += *lenp;
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_at_min)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_max)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_extra)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_early_margin)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_history)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_t obd_table[] = {
+       {
+               INIT_CTL_NAME(OBD_TIMEOUT)
+               .procname = "timeout",
+               .data     = &obd_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_set_timeout
+       },
+       {
+               INIT_CTL_NAME(OBD_DEBUG_PEER_ON_TIMEOUT)
+               .procname = "debug_peer_on_timeout",
+               .data     = &obd_debug_peer_on_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(OBD_DUMP_ON_TIMEOUT)
+               .procname = "dump_on_timeout",
+               .data     = &obd_dump_on_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(OBD_DUMP_ON_EVICTION)
+               .procname = "dump_on_eviction",
+               .data     = &obd_dump_on_eviction,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(OBD_MEMUSED)
+               .procname = "memused",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0444,
+               .proc_handler = &proc_memory_alloc
+       },
+       {
+               INIT_CTL_NAME(OBD_PAGESUSED)
+               .procname = "pagesused",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0444,
+               .proc_handler = &proc_pages_alloc
+       },
+       {
+               INIT_CTL_NAME(OBD_MAXMEMUSED)
+               .procname = "memused_max",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0444,
+               .proc_handler = &proc_mem_max
+       },
+       {
+               INIT_CTL_NAME(OBD_MAXPAGESUSED)
+               .procname = "pagesused_max",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0444,
+               .proc_handler = &proc_pages_max
+       },
+       {
+               INIT_CTL_NAME(OBD_LDLM_TIMEOUT)
+               .procname = "ldlm_timeout",
+               .data     = &ldlm_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_set_timeout
+       },
+       {
+               INIT_CTL_NAME(OBD_ALLOC_FAIL_RATE)
+               .procname = "alloc_fail_rate",
+               .data     = &obd_alloc_fail_rate,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_alloc_fail_rate
+       },
+       {
+               INIT_CTL_NAME(OBD_MAX_DIRTY_PAGES)
+               .procname = "max_dirty_mb",
+               .data     = &obd_max_dirty_pages,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_max_dirty_pages_in_mb
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_MIN)
+               .procname = "at_min",
+               .data     = &at_min,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_min
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_MAX)
+               .procname = "at_max",
+               .data     = &at_max,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_max
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_EXTRA)
+               .procname = "at_extra",
+               .data     = &at_extra,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_extra
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_EARLY_MARGIN)
+               .procname = "at_early_margin",
+               .data     = &at_early_margin,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_early_margin
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_HISTORY)
+               .procname = "at_history",
+               .data     = &at_history,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_history
+       },
+       {       INIT_CTL_NAME(0)    }
+};
+
+static ctl_table_t parent_table[] = {
+       {
+               INIT_CTL_NAME(OBD_SYSCTL)
+               .procname = "lustre",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0555,
+               .child    = obd_table
+       },
+       {       INIT_CTL_NAME(0)   }
+};
+#endif
+
+void obd_sysctl_init (void)
+{
+#ifdef CONFIG_SYSCTL
+       if ( !obd_table_header )
+               obd_table_header = cfs_register_sysctl_table(parent_table, 0);
+#endif
+}
+
+void obd_sysctl_clean (void)
+{
+#ifdef CONFIG_SYSCTL
+       if ( obd_table_header )
+               unregister_sysctl_table(obd_table_header);
+       obd_table_header = NULL;
+#endif
+}
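
obd_sysctl_init() hands the parent_table/obd_table pair above to a libcfs wrapper around the kernel's sysctl registration; on kernels of this era the underlying primitive is register_sysctl_table(), which returns a header token used later to unregister. A hedged, generic sketch of the same registration pattern with a hypothetical table and only stock handlers:

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/sysctl.h>

static int demo_timeout = 100;

static struct ctl_table demo_table[] = {
	{
		.procname	= "timeout",
		.data		= &demo_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table demo_parent[] = {
	{
		.procname	= "demo",	/* /proc/sys/demo/ */
		.mode		= 0555,
		.child		= demo_table,
	},
	{ }
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
	demo_header = register_sysctl_table(demo_parent);
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_sysctl_exit(void)
{
	if (demo_header)
		unregister_sysctl_table(demo_header);
}

module_init(demo_sysctl_init);
module_exit(demo_sysctl_exit);
MODULE_LICENSE("GPL");
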
diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c
new file mode 100644
index 0000000..b1d215e
--- /dev/null
@@ -0,0 +1,966 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alex Zhuravlev <bzzz@whamcloud.com>
+ * Author: Mikhail Pershin <tappro@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/*
+ * Allocate a new log or catalog handle
+ * Used inside llog_open().
+ */
+struct llog_handle *llog_alloc_handle(void)
+{
+       struct llog_handle *loghandle;
+
+       OBD_ALLOC_PTR(loghandle);
+       if (loghandle == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       init_rwsem(&loghandle->lgh_lock);
+       spin_lock_init(&loghandle->lgh_hdr_lock);
+       INIT_LIST_HEAD(&loghandle->u.phd.phd_entry);
+       atomic_set(&loghandle->lgh_refcount, 1);
+
+       return loghandle;
+}
+
+/*
+ * Free the llog handle and its header data, if any. Used in llog_close() only
+ */
+void llog_free_handle(struct llog_handle *loghandle)
+{
+       LASSERT(loghandle != NULL);
+
+       /* failed llog_init_handle */
+       if (!loghandle->lgh_hdr)
+               goto out;
+
+       if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)
+               LASSERT(list_empty(&loghandle->u.phd.phd_entry));
+       else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+               LASSERT(list_empty(&loghandle->u.chd.chd_head));
+       LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE);
+       OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE);
+out:
+       OBD_FREE_PTR(loghandle);
+}
+
+void llog_handle_get(struct llog_handle *loghandle)
+{
+       atomic_inc(&loghandle->lgh_refcount);
+}
+
+void llog_handle_put(struct llog_handle *loghandle)
+{
+       LASSERT(atomic_read(&loghandle->lgh_refcount) > 0);
+       if (atomic_dec_and_test(&loghandle->lgh_refcount))
+               llog_free_handle(loghandle);
+}
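
llog_alloc_handle(), llog_handle_get() and llog_handle_put() give the handle a simple reference-counted lifetime: it is created with one reference and freed by whichever caller drops the last one. The same idiom with the stock atomic helpers, as a hedged sketch on a hypothetical structure:

#include <linux/atomic.h>
#include <linux/slab.h>

struct demo_handle {
	atomic_t	refcount;
	/* ... payload ... */
};

static struct demo_handle *demo_alloc(void)
{
	struct demo_handle *h = kzalloc(sizeof(*h), GFP_NOFS);

	if (h)
		atomic_set(&h->refcount, 1);	/* creator holds one reference */
	return h;
}

static void demo_get(struct demo_handle *h)
{
	atomic_inc(&h->refcount);
}

static void demo_put(struct demo_handle *h)
{
	/* free only when the last reference is dropped */
	if (atomic_dec_and_test(&h->refcount))
		kfree(h);
}
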
+
+/* returns negative on error; 0 if success; 1 if success & log destroyed */
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+                   int index)
+{
+       struct llog_log_hdr *llh = loghandle->lgh_hdr;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n",
+              index, POSTID(&loghandle->lgh_id.lgl_oi));
+
+       if (index == 0) {
+               CERROR("Can't cancel index 0 which is header\n");
+               RETURN(-EINVAL);
+       }
+
+       spin_lock(&loghandle->lgh_hdr_lock);
+       if (!ext2_clear_bit(index, llh->llh_bitmap)) {
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index);
+               RETURN(-ENOENT);
+       }
+
+       llh->llh_count--;
+
+       if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+           (llh->llh_count == 1) &&
+           (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) {
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               rc = llog_destroy(env, loghandle);
+               if (rc < 0) {
+                       CERROR("%s: can't destroy empty llog #"DOSTID
+                              "#%08x: rc = %d\n",
+                              loghandle->lgh_ctxt->loc_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, rc);
+                       GOTO(out_err, rc);
+               }
+               RETURN(1);
+       }
+       spin_unlock(&loghandle->lgh_hdr_lock);
+
+       rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0);
+       if (rc < 0) {
+               CERROR("%s: fail to write header for llog #"DOSTID
+                      "#%08x: rc = %d\n",
+                      loghandle->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&loghandle->lgh_id.lgl_oi),
+                      loghandle->lgh_id.lgl_ogen, rc);
+               GOTO(out_err, rc);
+       }
+       RETURN(0);
+out_err:
+       spin_lock(&loghandle->lgh_hdr_lock);
+       ext2_set_bit(index, llh->llh_bitmap);
+       llh->llh_count++;
+       spin_unlock(&loghandle->lgh_hdr_lock);
+       return rc;
+}
+EXPORT_SYMBOL(llog_cancel_rec);
+
+static int llog_read_header(const struct lu_env *env,
+                           struct llog_handle *handle,
+                           struct obd_uuid *uuid)
+{
+       struct llog_operations *lop;
+       int rc;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+
+       if (lop->lop_read_header == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_read_header(env, handle);
+       if (rc == LLOG_EEMPTY) {
+               struct llog_log_hdr *llh = handle->lgh_hdr;
+
+               handle->lgh_last_idx = 0; /* header is record with index 0 */
+               llh->llh_count = 1;      /* for the header record */
+               llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC;
+               llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE;
+               llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0;
+               llh->llh_timestamp = cfs_time_current_sec();
+               if (uuid)
+                       memcpy(&llh->llh_tgtuuid, uuid,
+                              sizeof(llh->llh_tgtuuid));
+               llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap);
+               ext2_set_bit(0, llh->llh_bitmap);
+               rc = 0;
+       }
+       return rc;
+}
+
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+                    int flags, struct obd_uuid *uuid)
+{
+       struct llog_log_hdr     *llh;
+       int                      rc;
+
+       ENTRY;
+       LASSERT(handle->lgh_hdr == NULL);
+
+       OBD_ALLOC_PTR(llh);
+       if (llh == NULL)
+               RETURN(-ENOMEM);
+       handle->lgh_hdr = llh;
+       /* first assign flags to use llog_client_ops */
+       llh->llh_flags = flags;
+       rc = llog_read_header(env, handle, uuid);
+       if (rc == 0) {
+               if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN &&
+                             flags & LLOG_F_IS_CAT) ||
+                            (llh->llh_flags & LLOG_F_IS_CAT &&
+                             flags & LLOG_F_IS_PLAIN))) {
+                       CERROR("%s: llog type is %s but initializing %s\n",
+                              handle->lgh_ctxt->loc_obd->obd_name,
+                              llh->llh_flags & LLOG_F_IS_CAT ?
+                              "catalog" : "plain",
+                              flags & LLOG_F_IS_CAT ? "catalog" : "plain");
+                       GOTO(out, rc = -EINVAL);
+               } else if (llh->llh_flags &
+                          (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) {
+                       /*
+                        * it is possible to open llog without specifying llog
+                        * type so it is taken from llh_flags
+                        */
+                       flags = llh->llh_flags;
+               } else {
+                       /* for some reason the llh_flags has no type set */
+                       CERROR("llog type is not specified!\n");
+                       GOTO(out, rc = -EINVAL);
+               }
+               if (unlikely(uuid &&
+                            !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) {
+                       CERROR("%s: llog uuid mismatch: %s/%s\n",
+                              handle->lgh_ctxt->loc_obd->obd_name,
+                              (char *)uuid->uuid,
+                              (char *)llh->llh_tgtuuid.uuid);
+                       GOTO(out, rc = -EEXIST);
+               }
+       }
+       if (flags & LLOG_F_IS_CAT) {
+               LASSERT(list_empty(&handle->u.chd.chd_head));
+               INIT_LIST_HEAD(&handle->u.chd.chd_head);
+               llh->llh_size = sizeof(struct llog_logid_rec);
+       } else if (!(flags & LLOG_F_IS_PLAIN)) {
+               CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n",
+                      handle->lgh_ctxt->loc_obd->obd_name,
+                      flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN);
+               rc = -EINVAL;
+       }
+out:
+       if (rc) {
+               OBD_FREE_PTR(llh);
+               handle->lgh_hdr = NULL;
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_init_handle);
+
+int llog_copy_handler(const struct lu_env *env,
+                     struct llog_handle *llh,
+                     struct llog_rec_hdr *rec,
+                     void *data)
+{
+       struct llog_rec_hdr local_rec = *rec;
+       struct llog_handle *local_llh = (struct llog_handle *)data;
+       char *cfg_buf = (char*) (rec + 1);
+       struct lustre_cfg *lcfg;
+       int rc = 0;
+       ENTRY;
+
+       /* Append all records */
+       local_rec.lrh_len -= sizeof(*rec) + sizeof(struct llog_rec_tail);
+       rc = llog_write(env, local_llh, &local_rec, NULL, 0,
+                       (void *)cfg_buf, -1);
+
+       lcfg = (struct lustre_cfg *)cfg_buf;
+       CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n",
+              rec->lrh_index, rc, rec->lrh_len, lcfg->lcfg_command,
+              lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1));
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_copy_handler);
+
+static int llog_process_thread(void *arg)
+{
+       struct llog_process_info        *lpi = arg;
+       struct llog_handle              *loghandle = lpi->lpi_loghandle;
+       struct llog_log_hdr             *llh = loghandle->lgh_hdr;
+       struct llog_process_cat_data    *cd  = lpi->lpi_catdata;
+       char                            *buf;
+       __u64                            cur_offset = LLOG_CHUNK_SIZE;
+       __u64                            last_offset;
+       int                              rc = 0, index = 1, last_index;
+       int                              saved_index = 0;
+       int                              last_called_index = 0;
+
+       ENTRY;
+
+       LASSERT(llh);
+
+       OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+       if (!buf) {
+               lpi->lpi_rc = -ENOMEM;
+               RETURN(0);
+       }
+
+       if (cd != NULL) {
+               last_called_index = cd->lpcd_first_idx;
+               index = cd->lpcd_first_idx + 1;
+       }
+       if (cd != NULL && cd->lpcd_last_idx)
+               last_index = cd->lpcd_last_idx;
+       else
+               last_index = LLOG_BITMAP_BYTES * 8 - 1;
+
+       while (rc == 0) {
+               struct llog_rec_hdr *rec;
+
+               /* skip records not set in bitmap */
+               while (index <= last_index &&
+                      !ext2_test_bit(index, llh->llh_bitmap))
+                       ++index;
+
+               LASSERT(index <= last_index + 1);
+               if (index == last_index + 1)
+                       break;
+repeat:
+               CDEBUG(D_OTHER, "index: %d last_index %d\n",
+                      index, last_index);
+
+               /* get the buf with our target record; avoid old garbage */
+               memset(buf, 0, LLOG_CHUNK_SIZE);
+               last_offset = cur_offset;
+               rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index,
+                                    index, &cur_offset, buf, LLOG_CHUNK_SIZE);
+               if (rc)
+                       GOTO(out, rc);
+
+               /* NB: when rec->lrh_len is accessed it is already swabbed
+                * since it is used at the "end" of the loop and the rec
+                * swabbing is done at the beginning of the loop. */
+               for (rec = (struct llog_rec_hdr *)buf;
+                    (char *)rec < buf + LLOG_CHUNK_SIZE;
+                    rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){
+
+                       CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n",
+                              rec, rec->lrh_type);
+
+                       if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                               lustre_swab_llog_rec(rec);
+
+                       CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n",
+                              rec->lrh_type, rec->lrh_index);
+
+                       if (rec->lrh_index == 0) {
+                               /* probably another rec just got added? */
+                               if (index <= loghandle->lgh_last_idx)
+                                       GOTO(repeat, rc = 0);
+                               GOTO(out, rc = 0); /* no more records */
+                       }
+                       if (rec->lrh_len == 0 ||
+                           rec->lrh_len > LLOG_CHUNK_SIZE) {
+                               CWARN("invalid length %d in llog record for "
+                                     "index %d/%d\n", rec->lrh_len,
+                                     rec->lrh_index, index);
+                               GOTO(out, rc = -EINVAL);
+                       }
+
+                       if (rec->lrh_index < index) {
+                               CDEBUG(D_OTHER, "skipping lrh_index %d\n",
+                                      rec->lrh_index);
+                               continue;
+                       }
+
+                       CDEBUG(D_OTHER,
+                              "lrh_index: %d lrh_len: %d (%d remains)\n",
+                              rec->lrh_index, rec->lrh_len,
+                              (int)(buf + LLOG_CHUNK_SIZE - (char *)rec));
+
+                       loghandle->lgh_cur_idx = rec->lrh_index;
+                       loghandle->lgh_cur_offset = (char *)rec - (char *)buf +
+                                                   last_offset;
+
+                       /* if set, process the callback on this record */
+                       if (ext2_test_bit(index, llh->llh_bitmap)) {
+                               rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec,
+                                                lpi->lpi_cbdata);
+                               last_called_index = index;
+                               if (rc == LLOG_PROC_BREAK) {
+                                       GOTO(out, rc);
+                               } else if (rc == LLOG_DEL_RECORD) {
+                                       llog_cancel_rec(lpi->lpi_env,
+                                                       loghandle,
+                                                       rec->lrh_index);
+                                       rc = 0;
+                               }
+                               if (rc)
+                                       GOTO(out, rc);
+                       } else {
+                               CDEBUG(D_OTHER, "Skipped index %d\n", index);
+                       }
+
+                       /* next record, still in buffer? */
+                       ++index;
+                       if (index > last_index)
+                               GOTO(out, rc = 0);
+               }
+       }
+
+out:
+       if (cd != NULL)
+               cd->lpcd_last_idx = last_called_index;
+
+       OBD_FREE(buf, LLOG_CHUNK_SIZE);
+       lpi->lpi_rc = rc;
+       return 0;
+}
+
+static int llog_process_thread_daemonize(void *arg)
+{
+       struct llog_process_info        *lpi = arg;
+       struct lu_env                    env;
+       int                              rc;
+
+       unshare_fs_struct();
+
+       /* client env has no keys, tags is just 0 */
+       rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+       if (rc)
+               goto out;
+       lpi->lpi_env = &env;
+
+       rc = llog_process_thread(arg);
+
+       lu_env_fini(&env);
+out:
+       complete(&lpi->lpi_completion);
+       return rc;
+}
+
+int llog_process_or_fork(const struct lu_env *env,
+                        struct llog_handle *loghandle,
+                        llog_cb_t cb, void *data, void *catdata, bool fork)
+{
+       struct llog_process_info *lpi;
+       int                   rc;
+
+       ENTRY;
+
+       OBD_ALLOC_PTR(lpi);
+       if (lpi == NULL) {
+               CERROR("cannot alloc pointer\n");
+               RETURN(-ENOMEM);
+       }
+       lpi->lpi_loghandle = loghandle;
+       lpi->lpi_cb     = cb;
+       lpi->lpi_cbdata    = data;
+       lpi->lpi_catdata   = catdata;
+
+       if (fork) {
+               /* The new thread can't use parent env,
+                * init the new one in llog_process_thread_daemonize. */
+               lpi->lpi_env = NULL;
+               init_completion(&lpi->lpi_completion);
+               rc = PTR_ERR(kthread_run(llog_process_thread_daemonize, lpi,
+                                            "llog_process_thread"));
+               if (IS_ERR_VALUE(rc)) {
+                       CERROR("%s: cannot start thread: rc = %d\n",
+                              loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+                       OBD_FREE_PTR(lpi);
+                       RETURN(rc);
+               }
+               wait_for_completion(&lpi->lpi_completion);
+       } else {
+               lpi->lpi_env = env;
+               llog_process_thread(lpi);
+       }
+       rc = lpi->lpi_rc;
+       OBD_FREE_PTR(lpi);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_process_or_fork);
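
llog_process_or_fork() uses a common kernel idiom for optionally pushing work into a helper thread: the arguments are packaged into a heap-allocated llog_process_info, the worker is started with kthread_run(), and the caller blocks on a completion that the worker signals when finished. A hedged, generic reduction of that idiom (standard kthread/completion API, hypothetical work function):

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/slab.h>

struct demo_work {
	int			input;
	int			result;
	struct completion	done;
};

static int demo_worker(void *arg)
{
	struct demo_work *w = arg;

	w->result = w->input * 2;		/* stand-in for the real work */
	complete(&w->done);			/* wake up the waiter */
	return 0;
}

static int demo_run_in_thread(int input, int *result)
{
	struct demo_work *w;
	struct task_struct *task;

	w = kzalloc(sizeof(*w), GFP_KERNEL);
	if (!w)
		return -ENOMEM;
	w->input = input;
	init_completion(&w->done);

	task = kthread_run(demo_worker, w, "demo_worker");
	if (IS_ERR(task)) {
		int rc = PTR_ERR(task);

		kfree(w);
		return rc;
	}

	wait_for_completion(&w->done);		/* block until the thread finishes */
	*result = w->result;
	kfree(w);
	return 0;
}
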
+
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+                llog_cb_t cb, void *data, void *catdata)
+{
+       return llog_process_or_fork(env, loghandle, cb, data, catdata, true);
+}
+EXPORT_SYMBOL(llog_process);
+
+inline int llog_get_size(struct llog_handle *loghandle)
+{
+       if (loghandle && loghandle->lgh_hdr)
+               return loghandle->lgh_hdr->llh_count;
+       return 0;
+}
+EXPORT_SYMBOL(llog_get_size);
+
+int llog_reverse_process(const struct lu_env *env,
+                        struct llog_handle *loghandle, llog_cb_t cb,
+                        void *data, void *catdata)
+{
+       struct llog_log_hdr *llh = loghandle->lgh_hdr;
+       struct llog_process_cat_data *cd = catdata;
+       void *buf;
+       int rc = 0, first_index = 1, index, idx;
+       ENTRY;
+
+       OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+       if (!buf)
+               RETURN(-ENOMEM);
+
+       if (cd != NULL)
+               first_index = cd->lpcd_first_idx + 1;
+       if (cd != NULL && cd->lpcd_last_idx)
+               index = cd->lpcd_last_idx;
+       else
+               index = LLOG_BITMAP_BYTES * 8 - 1;
+
+       while (rc == 0) {
+               struct llog_rec_hdr *rec;
+               struct llog_rec_tail *tail;
+
+               /* skip records not set in bitmap */
+               while (index >= first_index &&
+                      !ext2_test_bit(index, llh->llh_bitmap))
+                       --index;
+
+               LASSERT(index >= first_index - 1);
+               if (index == first_index - 1)
+                       break;
+
+               /* get the buf with our target record; avoid old garbage */
+               memset(buf, 0, LLOG_CHUNK_SIZE);
+               rc = llog_prev_block(env, loghandle, index, buf,
+                                    LLOG_CHUNK_SIZE);
+               if (rc)
+                       GOTO(out, rc);
+
+               rec = buf;
+               idx = rec->lrh_index;
+               CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx);
+               while (idx < index) {
+                       rec = (void *)rec + rec->lrh_len;
+                       if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                               lustre_swab_llog_rec(rec);
+                       idx++;
+               }
+               LASSERT(idx == index);
+               tail = (void *)rec + rec->lrh_len - sizeof(*tail);
+
+               /* process records in buffer, starting where we found one */
+               while ((void *)tail > buf) {
+                       if (tail->lrt_index == 0)
+                               GOTO(out, rc = 0); /* no more records */
+
+                       /* if set, process the callback on this record */
+                       if (ext2_test_bit(index, llh->llh_bitmap)) {
+                               rec = (void *)tail - tail->lrt_len +
+                                     sizeof(*tail);
+
+                               rc = cb(env, loghandle, rec, data);
+                               if (rc == LLOG_PROC_BREAK) {
+                                       GOTO(out, rc);
+                               } else if (rc == LLOG_DEL_RECORD) {
+                                       llog_cancel_rec(env, loghandle,
+                                                       tail->lrt_index);
+                                       rc = 0;
+                               }
+                               if (rc)
+                                       GOTO(out, rc);
+                       }
+
+                       /* previous record, still in buffer? */
+                       --index;
+                       if (index < first_index)
+                               GOTO(out, rc = 0);
+                       tail = (void *)tail - tail->lrt_len;
+               }
+       }
+
+out:
+       if (buf)
+               OBD_FREE(buf, LLOG_CHUNK_SIZE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_reverse_process);
+
+/**
+ * New llog API
+ *
+ * API functions:
+ *      llog_open - open an llog, which may not exist yet
+ *      llog_exist - check whether the llog exists
+ *      llog_close - close an opened llog, pairs with open, frees llog_handle
+ *      llog_declare_create - declare llog creation
+ *      llog_create - create a new llog on disk, needs a transaction handle
+ *      llog_declare_write_rec - declare an llog record write
+ *      llog_write_rec - write an llog record to disk, needs a transaction handle
+ *      llog_declare_add - declare addition of an llog catalog record
+ *      llog_add - add an llog record to a catalog, needs a transaction handle
+ */
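
These entry points follow the declare/execute convention used throughout this file: each modification is declared against a transaction handle before the transaction is started and performed only afterwards. A rough sketch of that ordering for creating a log and writing one record, mirroring the llog_open_create() and llog_write() helpers defined later in this file (env, dt, loghandle and rec are assumed to be set up by the caller):

        /* Sketch only: create an llog and write one record in a transaction. */
        th = dt_trans_create(env, dt);
        if (IS_ERR(th))
                return PTR_ERR(th);

        rc = llog_declare_create(env, loghandle, th);
        if (rc == 0)
                rc = llog_declare_write_rec(env, loghandle, rec, -1, th);
        if (rc == 0)
                rc = dt_trans_start_local(env, dt, th);
        if (rc == 0) {
                rc = llog_create(env, loghandle, th);
                if (rc == 0)
                        rc = llog_write_rec(env, loghandle, rec, NULL, 0,
                                            NULL, -1, th);
        }
        dt_trans_stop(env, dt, th);
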
+int llog_exist(struct llog_handle *loghandle)
+{
+       struct llog_operations  *lop;
+       int                      rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_exist == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_exist(loghandle);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_exist);
+
+int llog_declare_create(const struct lu_env *env,
+                       struct llog_handle *loghandle, struct thandle *th)
+{
+       struct llog_operations  *lop;
+       int                      raised, rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_declare_create == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lop->lop_declare_create(env, loghandle, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_create);
+
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+               struct thandle *th)
+{
+       struct llog_operations  *lop;
+       int                      raised, rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_create == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lop->lop_create(env, handle, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_create);
+
+int llog_declare_write_rec(const struct lu_env *env,
+                          struct llog_handle *handle,
+                          struct llog_rec_hdr *rec, int idx,
+                          struct thandle *th)
+{
+       struct llog_operations  *lop;
+       int                      raised, rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+       LASSERT(lop);
+       if (lop->lop_declare_write_rec == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lop->lop_declare_write_rec(env, handle, rec, idx, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_write_rec);
+
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+                  struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+                  int numcookies, void *buf, int idx, struct thandle *th)
+{
+       struct llog_operations  *lop;
+       int                      raised, rc, buflen;
+
+       ENTRY;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+
+       LASSERT(lop);
+       if (lop->lop_write_rec == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       if (buf)
+               buflen = rec->lrh_len + sizeof(struct llog_rec_hdr) +
+                        sizeof(struct llog_rec_tail);
+       else
+               buflen = rec->lrh_len;
+       LASSERT(cfs_size_round(buflen) == buflen);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies,
+                               buf, idx, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write_rec);
+
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+            struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+            void *buf, struct thandle *th)
+{
+       int raised, rc;
+
+       ENTRY;
+
+       if (lgh->lgh_logops->lop_add == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_add);
+
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+                    struct llog_rec_hdr *rec, struct thandle *th)
+{
+       int raised, rc;
+
+       ENTRY;
+
+       if (lgh->lgh_logops->lop_declare_add == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_add);
+
+/**
+ * Helper function to open an llog, or create it if it doesn't exist.
+ * It hides all transaction handling from the caller.
+ */
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+                    struct llog_handle **res, struct llog_logid *logid,
+                    char *name)
+{
+       struct thandle  *th;
+       int              rc;
+
+       ENTRY;
+
+       rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW);
+       if (rc)
+               RETURN(rc);
+
+       if (llog_exist(*res))
+               RETURN(0);
+
+       if ((*res)->lgh_obj != NULL) {
+               struct dt_device *d;
+
+               d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev);
+
+               th = dt_trans_create(env, d);
+               if (IS_ERR(th))
+                       GOTO(out, rc = PTR_ERR(th));
+
+               rc = llog_declare_create(env, *res, th);
+               if (rc == 0) {
+                       rc = dt_trans_start_local(env, d, th);
+                       if (rc == 0)
+                               rc = llog_create(env, *res, th);
+               }
+               dt_trans_stop(env, d, th);
+       } else {
+               /* lvfs compat code */
+               LASSERT((*res)->lgh_file == NULL);
+               rc = llog_create(env, *res, NULL);
+       }
+out:
+       if (rc)
+               llog_close(env, *res);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open_create);
+
+/**
+ * Helper function to delete an existing llog.
+ */
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+              struct llog_logid *logid, char *name)
+{
+       struct llog_handle      *handle;
+       int                      rc = 0, rc2;
+
+       ENTRY;
+
+       /* nothing to erase */
+       if (name == NULL && logid == NULL)
+               RETURN(0);
+
+       rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS);
+       if (rc < 0)
+               RETURN(rc);
+
+       rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL);
+       if (rc == 0)
+               rc = llog_destroy(env, handle);
+
+       rc2 = llog_close(env, handle);
+       if (rc == 0)
+               rc = rc2;
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_erase);
+
+/*
+ * Helper function to write a record to an llog.
+ * It hides all transaction handling from the caller.
+ * Valid only for a local llog.
+ */
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+              struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+              int cookiecount, void *buf, int idx)
+{
+       int rc;
+
+       ENTRY;
+
+       LASSERT(loghandle);
+       LASSERT(loghandle->lgh_ctxt);
+
+       if (loghandle->lgh_obj != NULL) {
+               struct dt_device        *dt;
+               struct thandle          *th;
+
+               dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev);
+
+               th = dt_trans_create(env, dt);
+               if (IS_ERR(th))
+                       RETURN(PTR_ERR(th));
+
+               rc = llog_declare_write_rec(env, loghandle, rec, idx, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_trans_start_local(env, dt, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               down_write(&loghandle->lgh_lock);
+               rc = llog_write_rec(env, loghandle, rec, reccookie,
+                                   cookiecount, buf, idx, th);
+               up_write(&loghandle->lgh_lock);
+out_trans:
+               dt_trans_stop(env, dt, th);
+       } else { /* lvfs compatibility */
+               down_write(&loghandle->lgh_lock);
+               rc = llog_write_rec(env, loghandle, rec, reccookie,
+                                   cookiecount, buf, idx, NULL);
+               up_write(&loghandle->lgh_lock);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write);
+
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+             struct llog_handle **lgh, struct llog_logid *logid,
+             char *name, enum llog_open_param open_param)
+{
+       int      raised;
+       int      rc;
+
+       ENTRY;
+
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_logops);
+
+       if (ctxt->loc_logops->lop_open == NULL) {
+               *lgh = NULL;
+               RETURN(-EOPNOTSUPP);
+       }
+
+       *lgh = llog_alloc_handle();
+       if (*lgh == NULL)
+               RETURN(-ENOMEM);
+       (*lgh)->lgh_ctxt = ctxt;
+       (*lgh)->lgh_logops = ctxt->loc_logops;
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       if (rc) {
+               llog_free_handle(*lgh);
+               *lgh = NULL;
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open);
+
+int llog_close(const struct lu_env *env, struct llog_handle *loghandle)
+{
+       struct llog_operations  *lop;
+       int                      rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               GOTO(out, rc);
+       if (lop->lop_close == NULL)
+               GOTO(out, rc = -EOPNOTSUPP);
+       rc = lop->lop_close(env, loghandle);
+out:
+       llog_handle_put(loghandle);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_close);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
new file mode 100644 (file)
index 0000000..cf00b2f
--- /dev/null
@@ -0,0 +1,833 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_cat.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+
+#include "llog_internal.h"
+
+/* Create a new log handle and add it to the open list.
+ * This log handle will be closed when all of the records in it are removed.
+ *
+ * Assumes the caller has already pushed us into the kernel context and
+ * handles the locking.
+ */
+static int llog_cat_new_log(const struct lu_env *env,
+                           struct llog_handle *cathandle,
+                           struct llog_handle *loghandle,
+                           struct thandle *th)
+{
+       struct llog_log_hdr *llh;
+       struct llog_logid_rec rec = { { 0 }, };
+       int rc, index, bitmap_size;
+       ENTRY;
+
+       llh = cathandle->lgh_hdr;
+       bitmap_size = LLOG_BITMAP_SIZE(llh);
+
+       index = (cathandle->lgh_last_idx + 1) % bitmap_size;
+
+       /* maximum number of available slots in catlog is bitmap_size - 2 */
+       if (llh->llh_cat_idx == index) {
+               CERROR("no free catalog slots for log...\n");
+               RETURN(-ENOSPC);
+       }
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED))
+               RETURN(-ENOSPC);
+
+       rc = llog_create(env, loghandle, th);
+       /* if llog is already created, no need to initialize it */
+       if (rc == -EEXIST) {
+               RETURN(0);
+       } else if (rc != 0) {
+               CERROR("%s: can't create new plain llog in catalog: rc = %d\n",
+                      loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+               RETURN(rc);
+       }
+
+       rc = llog_init_handle(env, loghandle,
+                             LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+                             &cathandle->lgh_hdr->llh_tgtuuid);
+       if (rc)
+               GOTO(out_destroy, rc);
+
+       if (index == 0)
+               index = 1;
+
+       spin_lock(&loghandle->lgh_hdr_lock);
+       llh->llh_count++;
+       if (ext2_set_bit(index, llh->llh_bitmap)) {
+               CERROR("argh, index %u already set in log bitmap?\n",
+                      index);
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               LBUG(); /* should never happen */
+       }
+       spin_unlock(&loghandle->lgh_hdr_lock);
+
+       cathandle->lgh_last_idx = index;
+       llh->llh_tail.lrt_index = index;
+
+       CDEBUG(D_RPCTRACE, "new recovery log "DOSTID":%x for index %u of catalog"
+              DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+              loghandle->lgh_id.lgl_ogen, index,
+              POSTID(&cathandle->lgh_id.lgl_oi));
+       /* build the record for this log in the catalog */
+       rec.lid_hdr.lrh_len = sizeof(rec);
+       rec.lid_hdr.lrh_index = index;
+       rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+       rec.lid_id = loghandle->lgh_id;
+       rec.lid_tail.lrt_len = sizeof(rec);
+       rec.lid_tail.lrt_index = index;
+
+       /* update the catalog: header and record */
+       rc = llog_write_rec(env, cathandle, &rec.lid_hdr,
+                           &loghandle->u.phd.phd_cookie, 1, NULL, index, th);
+       if (rc < 0)
+               GOTO(out_destroy, rc);
+
+       loghandle->lgh_hdr->llh_cat_idx = index;
+       RETURN(0);
+out_destroy:
+       llog_destroy(env, loghandle);
+       RETURN(rc);
+}
+
+/* Open an existing log handle and add it to the open list.
+ * This log handle will be closed when all of the records in it are removed.
+ *
+ * Assumes the caller has already pushed us into the kernel context and
+ * handles the locking.
+ * We return a lock on the handle to ensure nobody yanks it from us.
+ *
+ * This takes an extra reference on the llog_handle via llog_handle_get() and
+ * requires the caller to drop that reference with llog_handle_put().
+ */
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+                      struct llog_handle **res, struct llog_logid *logid)
+{
+       struct llog_handle      *loghandle;
+       int                      rc = 0;
+
+       ENTRY;
+
+       if (cathandle == NULL)
+               RETURN(-EBADF);
+
+       down_write(&cathandle->lgh_lock);
+       list_for_each_entry(loghandle, &cathandle->u.chd.chd_head,
+                               u.phd.phd_entry) {
+               struct llog_logid *cgl = &loghandle->lgh_id;
+
+               if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) &&
+                   ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) {
+                       if (cgl->lgl_ogen != logid->lgl_ogen) {
+                               CERROR("%s: log "DOSTID" generation %x != %x\n",
+                                      loghandle->lgh_ctxt->loc_obd->obd_name,
+                                      POSTID(&logid->lgl_oi), cgl->lgl_ogen,
+                                      logid->lgl_ogen);
+                               continue;
+                       }
+                       loghandle->u.phd.phd_cat_handle = cathandle;
+                       up_write(&cathandle->lgh_lock);
+                       GOTO(out, rc = 0);
+               }
+       }
+       up_write(&cathandle->lgh_lock);
+
+       rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL,
+                      LLOG_OPEN_EXISTS);
+       if (rc < 0) {
+               CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n",
+                      cathandle->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+               RETURN(rc);
+       }
+
+       rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL);
+       if (rc < 0) {
+               llog_close(env, loghandle);
+               loghandle = NULL;
+               RETURN(rc);
+       }
+
+       down_write(&cathandle->lgh_lock);
+       list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head);
+       up_write(&cathandle->lgh_lock);
+
+       loghandle->u.phd.phd_cat_handle = cathandle;
+       loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id;
+       loghandle->u.phd.phd_cookie.lgc_index =
+                               loghandle->lgh_hdr->llh_cat_idx;
+       EXIT;
+out:
+       llog_handle_get(loghandle);
+       *res = loghandle;
+       return 0;
+}
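
llog_cat_id2handle() returns the plain-log handle with an extra reference taken, so every caller must drop it with llog_handle_put() when done, exactly as llog_cat_cancel_records() and the process callbacks below do. A minimal sketch of the pairing, assuming env, cathandle and logid come from the caller:

        struct llog_handle *loghandle;
        int rc;

        rc = llog_cat_id2handle(env, cathandle, &loghandle, logid);
        if (rc == 0) {
                /* use loghandle, e.g. process or cancel records in it */
                llog_handle_put(loghandle); /* drop id2handle's reference */
        }
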
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle)
+{
+       struct llog_handle      *loghandle, *n;
+       int                      rc;
+
+       ENTRY;
+
+       list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head,
+                                    u.phd.phd_entry) {
+               struct llog_log_hdr     *llh = loghandle->lgh_hdr;
+               int                      index;
+
+               /* unlink open-not-created llogs */
+               list_del_init(&loghandle->u.phd.phd_entry);
+               llh = loghandle->lgh_hdr;
+               if (loghandle->lgh_obj != NULL && llh != NULL &&
+                   (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+                   (llh->llh_count == 1)) {
+                       rc = llog_destroy(env, loghandle);
+                       if (rc)
+                               CERROR("%s: failure destroying log during "
+                                      "cleanup: rc = %d\n",
+                                      loghandle->lgh_ctxt->loc_obd->obd_name,
+                                      rc);
+
+                       index = loghandle->u.phd.phd_cookie.lgc_index;
+                       llog_cat_cleanup(env, cathandle, NULL, index);
+               }
+               llog_close(env, loghandle);
+       }
+       /* if handle was stored in ctxt, remove it too */
+       if (cathandle->lgh_ctxt->loc_handle == cathandle)
+               cathandle->lgh_ctxt->loc_handle = NULL;
+       rc = llog_close(env, cathandle);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_close);
+
+/**
+ * lockdep markers for nested struct llog_handle::lgh_lock locking.
+ */
+enum {
+       LLOGH_CAT,
+       LLOGH_LOG
+};
+
+/** Return the currently active log handle.  If the current log handle doesn't
+ * have enough space left for the current record, start a new one.
+ *
+ * If reclen is 0, we only want to know what the currently active log is,
+ * otherwise we get a lock on this log so nobody can steal our space.
+ *
+ * Assumes the caller has already pushed us into the kernel context and
+ * handles the locking.
+ *
+ * NOTE: loghandle is write-locked upon successful return
+ */
+static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle,
+                                               struct thandle *th)
+{
+       struct llog_handle *loghandle = NULL;
+       ENTRY;
+
+       down_read_nested(&cathandle->lgh_lock, LLOGH_CAT);
+       loghandle = cathandle->u.chd.chd_current_log;
+       if (loghandle) {
+               struct llog_log_hdr *llh;
+
+               down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+               llh = loghandle->lgh_hdr;
+               if (llh == NULL ||
+                   loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+                       up_read(&cathandle->lgh_lock);
+                       RETURN(loghandle);
+               } else {
+                       up_write(&loghandle->lgh_lock);
+               }
+       }
+       up_read(&cathandle->lgh_lock);
+
+       /* time to use next log */
+
+       /* first, we have to make sure the state hasn't changed */
+       down_write_nested(&cathandle->lgh_lock, LLOGH_CAT);
+       loghandle = cathandle->u.chd.chd_current_log;
+       if (loghandle) {
+               struct llog_log_hdr *llh;
+
+               down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+               llh = loghandle->lgh_hdr;
+               LASSERT(llh);
+               if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+                       up_write(&cathandle->lgh_lock);
+                       RETURN(loghandle);
+               } else {
+                       up_write(&loghandle->lgh_lock);
+               }
+       }
+
+       CDEBUG(D_INODE, "use next log\n");
+
+       loghandle = cathandle->u.chd.chd_next_log;
+       cathandle->u.chd.chd_current_log = loghandle;
+       cathandle->u.chd.chd_next_log = NULL;
+       down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+       up_write(&cathandle->lgh_lock);
+       LASSERT(loghandle);
+       RETURN(loghandle);
+}
+
+/* Add a single record to the recovery log(s) using a catalog.
+ * Returns the same values as llog_write_rec().
+ *
+ * Assumes the caller has already pushed us into the kernel context.
+ */
+ */
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+                    struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+                    void *buf, struct thandle *th)
+{
+       struct llog_handle *loghandle;
+       int rc;
+       ENTRY;
+
+       LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE);
+       loghandle = llog_cat_current_log(cathandle, th);
+       LASSERT(!IS_ERR(loghandle));
+
+       /* loghandle is already locked by llog_cat_current_log() for us */
+       if (!llog_exist(loghandle)) {
+               rc = llog_cat_new_log(env, cathandle, loghandle, th);
+               if (rc < 0) {
+                       up_write(&loghandle->lgh_lock);
+                       RETURN(rc);
+               }
+       }
+       /* now let's try to add the record */
+       rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th);
+       if (rc < 0)
+               CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR,
+                            "llog_write_rec %d: lh=%p\n", rc, loghandle);
+       up_write(&loghandle->lgh_lock);
+       if (rc == -ENOSPC) {
+               /* try to use next log */
+               loghandle = llog_cat_current_log(cathandle, th);
+               LASSERT(!IS_ERR(loghandle));
+               /* new llog can be created concurrently */
+               if (!llog_exist(loghandle)) {
+                       rc = llog_cat_new_log(env, cathandle, loghandle, th);
+                       if (rc < 0) {
+                               up_write(&loghandle->lgh_lock);
+                               RETURN(rc);
+                       }
+               }
+               /* now let's try to add the record */
+               rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf,
+                                   -1, th);
+               if (rc < 0)
+                       CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle);
+               up_write(&loghandle->lgh_lock);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add_rec);
+
+int llog_cat_declare_add_rec(const struct lu_env *env,
+                            struct llog_handle *cathandle,
+                            struct llog_rec_hdr *rec, struct thandle *th)
+{
+       struct llog_handle      *loghandle, *next;
+       int                      rc = 0;
+
+       ENTRY;
+
+       if (cathandle->u.chd.chd_current_log == NULL) {
+               /* declare new plain llog */
+               down_write(&cathandle->lgh_lock);
+               if (cathandle->u.chd.chd_current_log == NULL) {
+                       rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+                                      NULL, NULL, LLOG_OPEN_NEW);
+                       if (rc == 0) {
+                               cathandle->u.chd.chd_current_log = loghandle;
+                               list_add_tail(&loghandle->u.phd.phd_entry,
+                                                 &cathandle->u.chd.chd_head);
+                       }
+               }
+               up_write(&cathandle->lgh_lock);
+       } else if (cathandle->u.chd.chd_next_log == NULL) {
+               /* declare next plain llog */
+               down_write(&cathandle->lgh_lock);
+               if (cathandle->u.chd.chd_next_log == NULL) {
+                       rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+                                      NULL, NULL, LLOG_OPEN_NEW);
+                       if (rc == 0) {
+                               cathandle->u.chd.chd_next_log = loghandle;
+                               list_add_tail(&loghandle->u.phd.phd_entry,
+                                                 &cathandle->u.chd.chd_head);
+                       }
+               }
+               up_write(&cathandle->lgh_lock);
+       }
+       if (rc)
+               GOTO(out, rc);
+
+       if (!llog_exist(cathandle->u.chd.chd_current_log)) {
+               rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
+                                        th);
+               if (rc)
+                       GOTO(out, rc);
+               llog_declare_write_rec(env, cathandle, NULL, -1, th);
+       }
+       /* declare records in the llogs */
+       rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
+                                   rec, -1, th);
+       if (rc)
+               GOTO(out, rc);
+
+       next = cathandle->u.chd.chd_next_log;
+       if (next) {
+               if (!llog_exist(next)) {
+                       rc = llog_declare_create(env, next, th);
+                       llog_declare_write_rec(env, cathandle, NULL, -1, th);
+               }
+               llog_declare_write_rec(env, next, rec, -1, th);
+       }
+out:
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_declare_add_rec);
+
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+                struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+                void *buf)
+{
+       struct llog_ctxt        *ctxt;
+       struct dt_device        *dt;
+       struct thandle          *th = NULL;
+       int                      rc;
+
+       ctxt = cathandle->lgh_ctxt;
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+
+       if (cathandle->lgh_obj != NULL) {
+               dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+               LASSERT(dt);
+
+               th = dt_trans_create(env, dt);
+               if (IS_ERR(th))
+                       RETURN(PTR_ERR(th));
+
+               rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_trans_start_local(env, dt, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+               rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th);
+out_trans:
+               dt_trans_stop(env, dt, th);
+       } else { /* lvfs compat code */
+               LASSERT(cathandle->lgh_file != NULL);
+               rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+               if (rc == 0)
+                       rc = llog_cat_add_rec(env, cathandle, rec, reccookie,
+                                             buf, th);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add);
+
+/* For each cookie in the cookie array, we clear the log in-use bit and either:
+ * - if the log is empty, mark it free in the catalog header and delete it
+ * - if the log is not empty, just write out the log header
+ *
+ * The cookies may be in different log files, so we need to look up a new
+ * log handle each time.
+ *
+ * Assumes the caller has already pushed us into the kernel context.
+ */
+int llog_cat_cancel_records(const struct lu_env *env,
+                           struct llog_handle *cathandle, int count,
+                           struct llog_cookie *cookies)
+{
+       int i, index, rc = 0, failed = 0;
+
+       ENTRY;
+
+       for (i = 0; i < count; i++, cookies++) {
+               struct llog_handle      *loghandle;
+               struct llog_logid       *lgl = &cookies->lgc_lgl;
+               int                      lrc;
+
+               rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
+               if (rc) {
+                       CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+                              cathandle->lgh_ctxt->loc_obd->obd_name,
+                              POSTID(&lgl->lgl_oi), rc);
+                       failed++;
+                       continue;
+               }
+
+               lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index);
+               if (lrc == 1) {   /* log has been destroyed */
+                       index = loghandle->u.phd.phd_cookie.lgc_index;
+                       rc = llog_cat_cleanup(env, cathandle, loghandle,
+                                             index);
+               } else if (lrc == -ENOENT) {
+                       if (rc == 0) /* ENOENT shouldn't override other errors */
+                               rc = lrc;
+               } else if (lrc < 0) {
+                       failed++;
+                       rc = lrc;
+               }
+               llog_handle_put(loghandle);
+       }
+       if (rc)
+               CERROR("%s: failed to cancel %d of %d llog records: rc = %d\n",
+                      cathandle->lgh_ctxt->loc_obd->obd_name, failed, count,
+                      rc);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_cancel_records);
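
The cookies handed to llog_cat_cancel_records() are the ones filled in when the corresponding records were written; each carries the plain log id in lgc_lgl and the record index in lgc_index, which is how the loop above finds the right plain log. A minimal sketch cancelling one previously saved cookie (illustrative only):

        /* 'cookie' was saved when the record was added. */
        rc = llog_cat_cancel_records(env, cathandle, 1, &cookie);
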
+
+int llog_cat_process_cb(const struct lu_env *env, struct llog_handle *cat_llh,
+                       struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_process_data *d = data;
+       struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+       struct llog_handle *llh;
+       int rc;
+
+       ENTRY;
+       if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+               CERROR("invalid record in catalog\n");
+               RETURN(-EINVAL);
+       }
+       CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+              DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+              rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi));
+
+       rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+       if (rc) {
+               CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+                      cat_llh->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&lir->lid_id.lgl_oi), rc);
+               RETURN(rc);
+       }
+
+       if (rec->lrh_index < d->lpd_startcat)
+               /* Skip processing of the logs until startcat */
+               RETURN(0);
+
+       if (d->lpd_startidx > 0) {
+               struct llog_process_cat_data cd;
+
+               cd.lpcd_first_idx = d->lpd_startidx;
+               cd.lpcd_last_idx = 0;
+               rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+                                         &cd, false);
+               /* Continue processing the next log from idx 0 */
+               d->lpd_startidx = 0;
+       } else {
+               rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+                                         NULL, false);
+       }
+       llog_handle_put(llh);
+
+       RETURN(rc);
+}
+
+int llog_cat_process_or_fork(const struct lu_env *env,
+                            struct llog_handle *cat_llh,
+                            llog_cb_t cb, void *data, int startcat,
+                            int startidx, bool fork)
+{
+       struct llog_process_data d;
+       struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+       int rc;
+       ENTRY;
+
+       LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+       d.lpd_data = data;
+       d.lpd_cb = cb;
+       d.lpd_startcat = startcat;
+       d.lpd_startidx = startidx;
+
+       if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+               struct llog_process_cat_data cd;
+
+               CWARN("catlog "DOSTID" crosses index zero\n",
+                     POSTID(&cat_llh->lgh_id.lgl_oi));
+
+               cd.lpcd_first_idx = llh->llh_cat_idx;
+               cd.lpcd_last_idx = 0;
+               rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+                                         &d, &cd, fork);
+               if (rc != 0)
+                       RETURN(rc);
+
+               cd.lpcd_first_idx = 0;
+               cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+               rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+                                         &d, &cd, fork);
+       } else {
+               rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+                                         &d, NULL, fork);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_process_or_fork);
+
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+                    llog_cb_t cb, void *data, int startcat, int startidx)
+{
+       return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat,
+                                       startidx, false);
+}
+EXPORT_SYMBOL(llog_cat_process);
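
llog_cat_process() walks a catalog from a given resume point: startcat is the catalog index of the first plain log to process and startidx is the first record index inside that first log; passing 0 for both walks everything, as llog_cat_process_cb() above shows. Resuming from a previously recorded position might look like this (saved_cat_idx and saved_rec_idx are hypothetical):

        rc = llog_cat_process(env, cat_llh, my_cb, my_data,
                              saved_cat_idx, saved_rec_idx);
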
+
+static int llog_cat_reverse_process_cb(const struct lu_env *env,
+                                      struct llog_handle *cat_llh,
+                                      struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_process_data *d = data;
+       struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+       struct llog_handle *llh;
+       int rc;
+
+       if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
+               CERROR("invalid record in catalog\n");
+               RETURN(-EINVAL);
+       }
+       CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+              DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+              le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi));
+
+       rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+       if (rc) {
+               CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+                      cat_llh->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&lir->lid_id.lgl_oi), rc);
+               RETURN(rc);
+       }
+
+       rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL);
+       llog_handle_put(llh);
+       RETURN(rc);
+}
+
+int llog_cat_reverse_process(const struct lu_env *env,
+                            struct llog_handle *cat_llh,
+                            llog_cb_t cb, void *data)
+{
+       struct llog_process_data d;
+       struct llog_process_cat_data cd;
+       struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+       int rc;
+       ENTRY;
+
+       LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+       d.lpd_data = data;
+       d.lpd_cb = cb;
+
+       if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+               CWARN("catalog "DOSTID" crosses index zero\n",
+                     POSTID(&cat_llh->lgh_id.lgl_oi));
+
+               cd.lpcd_first_idx = 0;
+               cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+               rc = llog_reverse_process(env, cat_llh,
+                                         llog_cat_reverse_process_cb,
+                                         &d, &cd);
+               if (rc != 0)
+                       RETURN(rc);
+
+               cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx);
+               cd.lpcd_last_idx = 0;
+               rc = llog_reverse_process(env, cat_llh,
+                                         llog_cat_reverse_process_cb,
+                                         &d, &cd);
+       } else {
+               rc = llog_reverse_process(env, cat_llh,
+                                         llog_cat_reverse_process_cb,
+                                         &d, NULL);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_reverse_process);
+
+int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
+{
+       struct llog_log_hdr *llh = cathandle->lgh_hdr;
+       int i, bitmap_size, idx;
+       ENTRY;
+
+       bitmap_size = LLOG_BITMAP_SIZE(llh);
+       if (llh->llh_cat_idx == (index - 1)) {
+               idx = llh->llh_cat_idx + 1;
+               llh->llh_cat_idx = idx;
+               if (idx == cathandle->lgh_last_idx)
+                       goto out;
+               for (i = (index + 1) % bitmap_size;
+                    i != cathandle->lgh_last_idx;
+                    i = (i + 1) % bitmap_size) {
+                       if (!ext2_test_bit(i, llh->llh_bitmap)) {
+                               idx = llh->llh_cat_idx + 1;
+                               llh->llh_cat_idx = idx;
+                       } else if (i == 0) {
+                               llh->llh_cat_idx = 0;
+                       } else {
+                               break;
+                       }
+               }
+out:
+               CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n",
+                      POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx);
+       }
+
+       RETURN(0);
+}
+
+/* Clean up traces of deleted plain llogs from the catalog */
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+                    struct llog_handle *loghandle, int index)
+{
+       int rc;
+
+       LASSERT(index);
+       if (loghandle != NULL) {
+               /* remove destroyed llog from catalog list and
+                * chd_current_log variable */
+               down_write(&cathandle->lgh_lock);
+               if (cathandle->u.chd.chd_current_log == loghandle)
+                       cathandle->u.chd.chd_current_log = NULL;
+               list_del_init(&loghandle->u.phd.phd_entry);
+               up_write(&cathandle->lgh_lock);
+               LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index);
+               /* llog was opened and kept in a list, close it now */
+               llog_close(env, loghandle);
+       }
+       /* remove plain llog entry from catalog by index */
+       llog_cat_set_first_idx(cathandle, index);
+       rc = llog_cancel_rec(env, cathandle, index);
+       if (rc == 0)
+               CDEBUG(D_HA, "cancel plain log at index"
+                      " %u of catalog "DOSTID"\n",
+                      index, POSTID(&cathandle->lgh_id.lgl_oi));
+       return rc;
+}
+
+int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle,
+                 struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_logid_rec   *lir = (struct llog_logid_rec *)rec;
+       struct llog_handle      *loghandle;
+       struct llog_log_hdr     *llh;
+       int                      rc;
+
+       ENTRY;
+
+       if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+               CERROR("invalid record in catalog\n");
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+              DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+              rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi));
+
+       rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id);
+       if (rc) {
+               CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+                      cathandle->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&lir->lid_id.lgl_oi), rc);
+               if (rc == -ENOENT || rc == -ESTALE) {
+                       /* remove index from catalog */
+                       llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index);
+               }
+               RETURN(rc);
+       }
+
+       llh = loghandle->lgh_hdr;
+       if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+           (llh->llh_count == 1)) {
+               rc = llog_destroy(env, loghandle);
+               if (rc)
+                       CERROR("%s: failed to destroy empty log: rc = %d\n",
+                              loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+
+               llog_cat_cleanup(env, cathandle, loghandle,
+                                loghandle->u.phd.phd_cookie.lgc_index);
+       }
+       llog_handle_put(loghandle);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(cat_cancel_cb);
+
+/* Helper to initialize a catalog llog and process it once, cleaning up
+ * empty plain llogs along the way. */
+int llog_cat_init_and_process(const struct lu_env *env,
+                             struct llog_handle *llh)
+{
+       int rc;
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false);
+       if (rc)
+               CERROR("%s: llog_process() with cat_cancel_cb failed: rc = "
+                      "%d\n", llh->lgh_ctxt->loc_obd->obd_name, rc);
+       RETURN(0);
+}
+EXPORT_SYMBOL(llog_cat_init_and_process);
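
A catalog handle obtained with llog_open() is typically initialized and swept in a single call to llog_cat_init_and_process(), which sets the LLOG_F_IS_CAT flag on the header and runs cat_cancel_cb() over the catalog to prune plain logs that are already empty. A minimal sketch, assuming ctxt and cat_logid come from the caller's context:

        struct llog_handle *cathandle;
        int rc;

        rc = llog_open(env, ctxt, &cathandle, &cat_logid, NULL,
                       LLOG_OPEN_EXISTS);
        if (rc)
                return rc;
        rc = llog_cat_init_and_process(env, cathandle);
        if (rc)
                llog_cat_close(env, cathandle);
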
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
new file mode 100644 (file)
index 0000000..539e1d4
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LLOG_INTERNAL_H__
+#define __LLOG_INTERNAL_H__
+
+#include <lustre_log.h>
+
+struct llog_process_info {
+       struct llog_handle *lpi_loghandle;
+       llog_cb_t          lpi_cb;
+       void           *lpi_cbdata;
+       void           *lpi_catdata;
+       int              lpi_rc;
+       struct completion       lpi_completion;
+       const struct lu_env     *lpi_env;
+};
+
+struct llog_thread_info {
+       struct lu_attr                   lgi_attr;
+       struct lu_fid                    lgi_fid;
+       struct dt_object_format          lgi_dof;
+       struct lu_buf                    lgi_buf;
+       loff_t                           lgi_off;
+       struct llog_rec_hdr              lgi_lrh;
+       struct llog_rec_tail             lgi_tail;
+};
+
+extern struct lu_context_key llog_thread_key;
+
+static inline struct llog_thread_info *llog_info(const struct lu_env *env)
+{
+       struct llog_thread_info *lgi;
+
+       lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key);
+       LASSERT(lgi);
+       return lgi;
+}
+
+static inline void
+lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen)
+{
+       ostid_set_seq_llog(&logid->lgl_oi);
+       ostid_set_id(&logid->lgl_oi, ino);
+       logid->lgl_ogen = gen;
+}
+
+int llog_info_init(void);
+void llog_info_fini(void);
+
+void llog_handle_get(struct llog_handle *loghandle);
+void llog_handle_put(struct llog_handle *loghandle);
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+                      struct llog_handle **res, struct llog_logid *logid);
+int class_config_dump_handler(const struct lu_env *env,
+                             struct llog_handle *handle,
+                             struct llog_rec_hdr *rec, void *data);
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size);
+int llog_process_or_fork(const struct lu_env *env,
+                        struct llog_handle *loghandle,
+                        llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+                    struct llog_handle *loghandle, int index);
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
new file mode 100644 (file)
index 0000000..0732874
--- /dev/null
@@ -0,0 +1,427 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+static int str2logid(struct llog_logid *logid, char *str, int len)
+{
+       char *start, *end, *endp;
+       __u64 id, seq;
+
+       ENTRY;
+       start = str;
+       if (*start != '#')
+               RETURN(-EINVAL);
+
+       start++;
+       if (start - str >= len - 1)
+               RETURN(-EINVAL);
+       end = strchr(start, '#');
+       if (end == NULL || end == start)
+               RETURN(-EINVAL);
+
+       *end = '\0';
+       id = simple_strtoull(start, &endp, 0);
+       if (endp != end)
+               RETURN(-EINVAL);
+
+       start = ++end;
+       if (start - str >= len - 1)
+               RETURN(-EINVAL);
+       end = strchr(start, '#');
+       if (end == NULL || end == start)
+               RETURN(-EINVAL);
+
+       *end = '\0';
+       seq = simple_strtoull(start, &endp, 0);
+       if (endp != end)
+               RETURN(-EINVAL);
+
+       ostid_set_seq(&logid->lgl_oi, seq);
+       ostid_set_id(&logid->lgl_oi, id);
+
+       start = ++end;
+       if (start - str >= len - 1)
+               RETURN(-EINVAL);
+       logid->lgl_ogen = simple_strtoul(start, &endp, 16);
+       if (*endp != '\0')
+               RETURN(-EINVAL);
+
+       RETURN(0);
+}
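
str2logid() expects a textual log id of the form '#<id>#<seq>#<generation>': the id and sequence accept any base understood by simple_strtoull(), and the generation is parsed as hex. An illustrative call with made-up values (the buffer must be writable, because parsing NUL-terminates the fields in place):

        char idstr[] = "#0x12345#0x1#0000000a";  /* #<id>#<seq>#<gen:hex> */
        struct llog_logid logid;
        int rc = str2logid(&logid, idstr, sizeof(idstr));
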
+
+static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle,
+                        struct llog_rec_hdr *rec, void *data)
+{
+       struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+       static int l, remains, from, to;
+       static char *out;
+       char *endp;
+       int cur_index, rc = 0;
+
+       ENTRY;
+
+       if (ioc_data && ioc_data->ioc_inllen1 > 0) {
+               l = 0;
+               remains = ioc_data->ioc_inllen4 +
+                       cfs_size_round(ioc_data->ioc_inllen1) +
+                       cfs_size_round(ioc_data->ioc_inllen2) +
+                       cfs_size_round(ioc_data->ioc_inllen3);
+               from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+               if (*endp != '\0')
+                       RETURN(-EINVAL);
+               to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+               if (*endp != '\0')
+                       RETURN(-EINVAL);
+               ioc_data->ioc_inllen1 = 0;
+               out = ioc_data->ioc_bulk;
+       }
+
+       cur_index = rec->lrh_index;
+       if (cur_index < from)
+               RETURN(0);
+       if (to > 0 && cur_index > to)
+               RETURN(-LLOG_EEMPTY);
+
+       if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+               struct llog_logid_rec   *lir = (struct llog_logid_rec *)rec;
+               struct llog_handle      *loghandle;
+
+               if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+                       l = snprintf(out, remains, "[index]: %05d  [type]: "
+                                    "%02x  [len]: %04d failed\n",
+                                    cur_index, rec->lrh_type,
+                                    rec->lrh_len);
+               }
+               if (handle->lgh_ctxt == NULL)
+                       RETURN(-EOPNOTSUPP);
+               rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id);
+               if (rc) {
+                       CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+                              POSTID(&lir->lid_id.lgl_oi),
+                              lir->lid_id.lgl_ogen);
+                       RETURN(rc);
+               }
+               rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL);
+               llog_handle_put(loghandle);
+       } else {
+               bool ok;
+
+               switch (rec->lrh_type) {
+               case OST_SZ_REC:
+               case MDS_UNLINK_REC:
+               case MDS_UNLINK64_REC:
+               case MDS_SETATTR64_REC:
+               case OBD_CFG_REC:
+               case LLOG_GEN_REC:
+               case LLOG_HDR_MAGIC:
+                       ok = true;
+                       break;
+               default:
+                       ok = false;
+               }
+
+               l = snprintf(out, remains, "[index]: %05d  [type]: "
+                            "%02x  [len]: %04d %s\n",
+                            cur_index, rec->lrh_type, rec->lrh_len,
+                            ok ? "ok" : "failed");
+               out += l;
+               remains -= l;
+               if (remains <= 0) {
+                       CERROR("%s: no space to print log records\n",
+                              handle->lgh_ctxt->loc_obd->obd_name);
+                       RETURN(-LLOG_EEMPTY);
+               }
+       }
+       RETURN(rc);
+}
+
+static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle,
+                        struct llog_rec_hdr *rec, void *data)
+{
+       struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+       static int l, remains, from, to;
+       static char *out;
+       char *endp;
+       int cur_index;
+
+       ENTRY;
+       if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) {
+               l = 0;
+               remains = ioc_data->ioc_inllen4 +
+                       cfs_size_round(ioc_data->ioc_inllen1) +
+                       cfs_size_round(ioc_data->ioc_inllen2) +
+                       cfs_size_round(ioc_data->ioc_inllen3);
+               from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+               if (*endp != '\0')
+                       RETURN(-EINVAL);
+               to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+               if (*endp != '\0')
+                       RETURN(-EINVAL);
+               out = ioc_data->ioc_bulk;
+               ioc_data->ioc_inllen1 = 0;
+       }
+
+       cur_index = rec->lrh_index;
+       if (cur_index < from)
+               RETURN(0);
+       if (to > 0 && cur_index > to)
+               RETURN(-LLOG_EEMPTY);
+
+       if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+               struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+
+               if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+                       CERROR("invalid record in catalog\n");
+                       RETURN(-EINVAL);
+               }
+
+               l = snprintf(out, remains,
+                            "[index]: %05d  [logid]: #"DOSTID"#%08x\n",
+                            cur_index, POSTID(&lir->lid_id.lgl_oi),
+                            lir->lid_id.lgl_ogen);
+       } else if (rec->lrh_type == OBD_CFG_REC) {
+               int rc;
+
+               rc = class_config_parse_rec(rec, out, remains);
+               if (rc < 0)
+                       RETURN(rc);
+               l = rc;
+       } else {
+               l = snprintf(out, remains,
+                            "[index]: %05d  [type]: %02x  [len]: %04d\n",
+                            cur_index, rec->lrh_type, rec->lrh_len);
+       }
+       out += l;
+       remains -= l;
+       if (remains <= 0) {
+               CERROR("not enough space to print log records\n");
+               RETURN(-LLOG_EEMPTY);
+       }
+
+       RETURN(0);
+}
+
+static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat,
+                          struct llog_logid *logid)
+{
+       struct llog_handle      *log;
+       int                      rc;
+
+       ENTRY;
+
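+       /* look the plain log up in the catalog, destroy it, then drop its
+        * entry from the catalog */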
+       rc = llog_cat_id2handle(env, cat, &log, logid);
+       if (rc) {
+               CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+                      POSTID(&logid->lgl_oi), logid->lgl_ogen);
+               RETURN(-ENOENT);
+       }
+
+       rc = llog_destroy(env, log);
+       if (rc) {
+               CDEBUG(D_IOCTL, "cannot destroy log\n");
+               GOTO(out, rc);
+       }
+       llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index);
+out:
+       llog_handle_put(log);
+       RETURN(rc);
+}
+
+static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle,
+                         struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_logid_rec   *lir = (struct llog_logid_rec *)rec;
+       int                      rc;
+
+       ENTRY;
+       if (rec->lrh_type != LLOG_LOGID_MAGIC)
+               RETURN(-EINVAL);
+       rc = llog_remove_log(env, handle, &lir->lid_id);
+
+       RETURN(rc);
+}
+
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+              struct obd_ioctl_data *data)
+{
+       struct llog_logid        logid;
+       int                      rc = 0;
+       struct llog_handle      *handle = NULL;
+
+       ENTRY;
+
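+       /* inlbuf1 names the log to operate on: "#<logid>" opens it by
+        * logid, "$<name>" opens it by name; anything else is invalid */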
+       if (*data->ioc_inlbuf1 == '#') {
+               rc = str2logid(&logid, data->ioc_inlbuf1, data->ioc_inllen1);
+               if (rc)
+                       RETURN(rc);
+               rc = llog_open(env, ctxt, &handle, &logid, NULL,
+                              LLOG_OPEN_EXISTS);
+               if (rc)
+                       RETURN(rc);
+       } else if (*data->ioc_inlbuf1 == '$') {
+               char *name = data->ioc_inlbuf1 + 1;
+
+               rc = llog_open(env, ctxt, &handle, NULL, name,
+                              LLOG_OPEN_EXISTS);
+               if (rc)
+                       RETURN(rc);
+       } else {
+               RETURN(-EINVAL);
+       }
+
+       rc = llog_init_handle(env, handle, 0, NULL);
+       if (rc)
+               GOTO(out_close, rc = -ENOENT);
+
+       switch (cmd) {
+       case OBD_IOC_LLOG_INFO: {
+               int      l;
+               int      remains = data->ioc_inllen2 +
+                                  cfs_size_round(data->ioc_inllen1);
+               char    *out = data->ioc_bulk;
+
+               l = snprintf(out, remains,
+                            "logid:        #"DOSTID"#%08x\n"
+                            "flags:        %x (%s)\n"
+                            "records count:    %d\n"
+                            "last index:       %d\n",
+                            POSTID(&handle->lgh_id.lgl_oi),
+                            handle->lgh_id.lgl_ogen,
+                            handle->lgh_hdr->llh_flags,
+                            handle->lgh_hdr->llh_flags &
+                            LLOG_F_IS_CAT ? "cat" : "plain",
+                            handle->lgh_hdr->llh_count,
+                            handle->lgh_last_idx);
+               out += l;
+               remains -= l;
+               if (remains <= 0) {
+                       CERROR("%s: not enough space for log header info\n",
+                              ctxt->loc_obd->obd_name);
+                       rc = -ENOSPC;
+               }
+               break;
+       }
+       case OBD_IOC_LLOG_CHECK:
+               LASSERT(data->ioc_inllen1 > 0);
+               rc = llog_process(env, handle, llog_check_cb, data, NULL);
+               if (rc == -LLOG_EEMPTY)
+                       rc = 0;
+               else if (rc)
+                       GOTO(out_close, rc);
+               break;
+       case OBD_IOC_LLOG_PRINT:
+               LASSERT(data->ioc_inllen1 > 0);
+               rc = llog_process(env, handle, llog_print_cb, data, NULL);
+               if (rc == -LLOG_EEMPTY)
+                       rc = 0;
+               else if (rc)
+                       GOTO(out_close, rc);
+               break;
+       case OBD_IOC_LLOG_CANCEL: {
+               struct llog_cookie cookie;
+               struct llog_logid plain;
+               char *endp;
+
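+               /* a plain log cancels the record by index alone; a catalog
+                * also needs the plain log's logid in inlbuf2 */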
+               cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0);
+               if (*endp != '\0')
+                       GOTO(out_close, rc = -EINVAL);
+
+               if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+                       rc = llog_cancel_rec(NULL, handle, cookie.lgc_index);
+                       GOTO(out_close, rc);
+               } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+                       GOTO(out_close, rc = -EINVAL);
+               }
+
+               if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */
+                       GOTO(out_close, rc = -ENOTTY);
+
+               rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2);
+               if (rc)
+                       GOTO(out_close, rc);
+               cookie.lgc_lgl = plain;
+               rc = llog_cat_cancel_records(env, handle, 1, &cookie);
+               if (rc)
+                       GOTO(out_close, rc);
+               break;
+       }
+       case OBD_IOC_LLOG_REMOVE: {
+               struct llog_logid plain;
+
+               if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+                       rc = llog_destroy(env, handle);
+                       GOTO(out_close, rc);
+               } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+                       GOTO(out_close, rc = -EINVAL);
+               }
+
+               if (data->ioc_inlbuf2 != NULL) {
+                       /* remove the indicated log from the catalog */
+                       rc = str2logid(&plain, data->ioc_inlbuf2,
+                                      data->ioc_inllen2);
+                       if (rc)
+                               GOTO(out_close, rc);
+                       rc = llog_remove_log(env, handle, &plain);
+               } else {
+                       /* remove all logs from the catalog */
+                       rc = llog_process(env, handle, llog_delete_cb, NULL,
+                                         NULL);
+                       if (rc)
+                               GOTO(out_close, rc);
+               }
+               break;
+       }
+       default:
+               CERROR("%s: Unknown ioctl cmd %#x\n",
+                      ctxt->loc_obd->obd_name, cmd);
+               GOTO(out_close, rc = -ENOTTY);
+       }
+
+out_close:
+       if (handle->lgh_hdr &&
+           handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+               llog_cat_close(env, handle);
+       else
+               llog_close(env, handle);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_ioctl);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
new file mode 100644 (file)
index 0000000..7e12dc6
--- /dev/null
@@ -0,0 +1,862 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_lvfs.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <obd_ost.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "llog_internal.h"
+
+#if defined(LLOG_LVFS)
+
+static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file,
+                               int len, int index)
+{
+       struct llog_rec_hdr rec = { 0 };
+       struct llog_rec_tail tail;
+       int rc;
+       ENTRY;
+
+       LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+       tail.lrt_len = rec.lrh_len = len;
+       tail.lrt_index = rec.lrh_index = index;
+       rec.lrh_type = LLOG_PAD_MAGIC;
+
+       rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing padding record: rc %d\n", rc);
+               goto out;
+       }
+
+       file->f_pos += len - sizeof(rec) - sizeof(tail);
+       rc = fsfilt_write_record(obd, file, &tail, sizeof(tail),
+                                &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing padding record: rc %d\n", rc);
+               goto out;
+       }
+
+ out:
+       RETURN(rc);
+}
+
+static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
+                               struct llog_rec_hdr *rec, void *buf, loff_t off)
+{
+       int rc;
+       struct llog_rec_tail end;
+       loff_t saved_off = file->f_pos;
+       int buflen = rec->lrh_len;
+
+       ENTRY;
+
+       file->f_pos = off;
+
+       if (buflen == 0)
+               CWARN("0-length record\n");
+
+       if (!buf) {
+               rc = fsfilt_write_record(obd, file, rec, buflen,
+                                        &file->f_pos, 0);
+               if (rc) {
+                       CERROR("error writing log record: rc %d\n", rc);
+                       goto out;
+               }
+               GOTO(out, rc = 0);
+       }
+
+       /* the buf case */
+       rec->lrh_len = sizeof(*rec) + buflen + sizeof(end);
+       rc = fsfilt_write_record(obd, file, rec, sizeof(*rec), &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing log hdr: rc %d\n", rc);
+               goto out;
+       }
+
+       rc = fsfilt_write_record(obd, file, buf, buflen, &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing log buffer: rc %d\n", rc);
+               goto out;
+       }
+
+       end.lrt_len = rec->lrh_len;
+       end.lrt_index = rec->lrh_index;
+       rc = fsfilt_write_record(obd, file, &end, sizeof(end), &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing log tail: rc %d\n", rc);
+               goto out;
+       }
+
+       rc = 0;
+ out:
+       if (saved_off > file->f_pos)
+               file->f_pos = saved_off;
+       LASSERT(rc <= 0);
+       RETURN(rc);
+}
+
+static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
+                               void *buf, int size, loff_t off)
+{
+       loff_t offset = off;
+       int rc;
+       ENTRY;
+
+       rc = fsfilt_read_record(obd, file, buf, size, &offset);
+       if (rc) {
+               CERROR("error reading log record: rc %d\n", rc);
+               RETURN(rc);
+       }
+       RETURN(0);
+}
+
+static int llog_lvfs_read_header(const struct lu_env *env,
+                                struct llog_handle *handle)
+{
+       struct obd_device *obd;
+       int rc;
+       ENTRY;
+
+       LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+       obd = handle->lgh_ctxt->loc_exp->exp_obd;
+
+       if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) {
+               CDEBUG(D_HA, "not reading header from 0-byte log\n");
+               RETURN(LLOG_EEMPTY);
+       }
+
+       rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr,
+                                LLOG_CHUNK_SIZE, 0);
+       if (rc) {
+               CERROR("error reading log header from %.*s\n",
+                      handle->lgh_file->f_dentry->d_name.len,
+                      handle->lgh_file->f_dentry->d_name.name);
+       } else {
+               struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr;
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+                       lustre_swab_llog_hdr(handle->lgh_hdr);
+
+               if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+                       CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
+                              handle->lgh_file->f_dentry->d_name.len,
+                              handle->lgh_file->f_dentry->d_name.name,
+                              llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+                       rc = -EIO;
+               } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+                       CERROR("incorrectly sized log %.*s header: %#x "
+                              "(expected %#x)\n",
+                              handle->lgh_file->f_dentry->d_name.len,
+                              handle->lgh_file->f_dentry->d_name.name,
+                              llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+                       CERROR("you may need to re-run lconf --write_conf.\n");
+                       rc = -EIO;
+               }
+       }
+
+       handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+       handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode);
+
+       RETURN(rc);
+}
+
+/* returns negative on error; 0 on success when reccookie == 0; 1 otherwise */
+/* appends if idx == -1, otherwise overwrites record idx. */
+static int llog_lvfs_write_rec(const struct lu_env *env,
+                              struct llog_handle *loghandle,
+                              struct llog_rec_hdr *rec,
+                              struct llog_cookie *reccookie, int cookiecount,
+                              void *buf, int idx, struct thandle *th)
+{
+       struct llog_log_hdr *llh;
+       int reclen = rec->lrh_len, index, rc;
+       struct llog_rec_tail *lrt;
+       struct obd_device *obd;
+       struct file *file;
+       size_t left;
+       ENTRY;
+
+       llh = loghandle->lgh_hdr;
+       file = loghandle->lgh_file;
+       obd = loghandle->lgh_ctxt->loc_exp->exp_obd;
+
+       /* record length should not be bigger than LLOG_CHUNK_SIZE */
+       if (buf)
+               rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+                     sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+       else
+               rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+       if (rc)
+               RETURN(rc);
+
+       if (buf)
+               /* write_blob adds header and tail to lrh_len. */
+               reclen = sizeof(*rec) + rec->lrh_len +
+                        sizeof(struct llog_rec_tail);
+
+       if (idx != -1) {
+               loff_t saved_offset;
+
+               /* no header: only allowed to insert record 1 */
+               if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) {
+                       CERROR("idx != 1 in empty log\n");
+                       LBUG();
+               }
+
+               if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+                       RETURN(-EINVAL);
+
+               if (!ext2_test_bit(idx, llh->llh_bitmap))
+                       CERROR("Modify unset record %u\n", idx);
+               if (idx != rec->lrh_index)
+                       CERROR("Index mismatch %d %u\n", idx, rec->lrh_index);
+
+               rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+               /* we are done if we only write the header or on error */
+               if (rc || idx == 0)
+                       RETURN(rc);
+
+               if (buf) {
+                       /* We assume that caller has set lgh_cur_* */
+                       saved_offset = loghandle->lgh_cur_offset;
+                       CDEBUG(D_OTHER,
+                              "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+                              "offset %llu\n",
+                              POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_index,
+                              loghandle->lgh_cur_idx, rec->lrh_len,
+                              (long long)(saved_offset - sizeof(*llh)));
+                       if (rec->lrh_index != loghandle->lgh_cur_idx) {
+                               CERROR("modify idx mismatch %u/%d\n",
+                                      idx, loghandle->lgh_cur_idx);
+                               RETURN(-EFAULT);
+                       }
+               } else {
+                       /* Assumes constant lrh_len */
+                       saved_offset = sizeof(*llh) + (idx - 1) * reclen;
+               }
+
+               rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset);
+               if (rc == 0 && reccookie) {
+                       reccookie->lgc_lgl = loghandle->lgh_id;
+                       reccookie->lgc_index = idx;
+                       rc = 1;
+               }
+               RETURN(rc);
+       }
+
+       /* Make sure that records don't cross a chunk boundary, so we can
+        * process them page-at-a-time if needed.  If it will cross a chunk
+        * boundary, write in a fake (but referenced) entry to pad the chunk.
+        *
+        * We know that llog_current_log() will return a loghandle that is
+        * big enough to hold reclen, so all we care about is padding here.
+        */
+       left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1));
+
+       /* NOTE: padding is a record, but no bit is set */
+       if (left != 0 && left != reclen &&
+           left < (reclen + LLOG_MIN_REC_SIZE)) {
+               index = loghandle->lgh_last_idx + 1;
+               rc = llog_lvfs_pad(obd, file, left, index);
+               if (rc)
+                       RETURN(rc);
+               loghandle->lgh_last_idx++; /* for pad rec */
+       }
+       /* if it's the last idx in log file, then return -ENOSPC */
+       if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+               RETURN(-ENOSPC);
+       loghandle->lgh_last_idx++;
+       index = loghandle->lgh_last_idx;
+       LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       rec->lrh_index = index;
+       if (buf == NULL) {
+               lrt = (struct llog_rec_tail *)
+                       ((char *)rec + rec->lrh_len - sizeof(*lrt));
+               lrt->lrt_len = rec->lrh_len;
+               lrt->lrt_index = rec->lrh_index;
+       }
+       /* The caller should make sure only one process accesses lgh_last_idx,
+        * otherwise it might hit the assertion below. */
+       LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       spin_lock(&loghandle->lgh_hdr_lock);
+       if (ext2_set_bit(index, llh->llh_bitmap)) {
+               CERROR("argh, index %u already set in log bitmap?\n", index);
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               LBUG(); /* should never happen */
+       }
+       llh->llh_count++;
+       spin_unlock(&loghandle->lgh_hdr_lock);
+       llh->llh_tail.lrt_index = index;
+
+       rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u \n",
+              POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+       if (rc == 0 && reccookie) {
+               reccookie->lgc_lgl = loghandle->lgh_id;
+               reccookie->lgc_index = index;
+               if ((rec->lrh_type == MDS_UNLINK_REC) ||
+                   (rec->lrh_type == MDS_SETATTR64_REC))
+                       reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+               else if (rec->lrh_type == OST_SZ_REC)
+                       reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+               else
+                       reccookie->lgc_subsys = -1;
+               rc = 1;
+       }
+       if (rc == 0 && rec->lrh_type == LLOG_GEN_REC)
+               rc = 1;
+
+       RETURN(rc);
+}
+
+/* We can skip reading at least as many log blocks as the number of
+ * minimum sized log records we are skipping.  If it turns out
+ * that we are not far enough along the log (because the
+ * actual records are larger than minimum size) we just skip
+ * some more records. */
+
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+       if (goal <= curr)
+               return;
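+       /* advance by the minimum size of the records being skipped,
+        * then round the offset down to a chunk boundary */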
+       *off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) &
+               ~(LLOG_CHUNK_SIZE - 1);
+}
+
+
+/* sets:
+ *  - cur_offset to the furthest point read in the log file
+ *  - cur_idx to the log index preceding cur_offset
+ * returns -EIO/-EINVAL on error
+ */
+static int llog_lvfs_next_block(const struct lu_env *env,
+                               struct llog_handle *loghandle, int *cur_idx,
+                               int next_idx, __u64 *cur_offset, void *buf,
+                               int len)
+{
+       int rc;
+       ENTRY;
+
+       if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+              next_idx, *cur_idx, *cur_offset);
+
+       while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+               struct llog_rec_hdr *rec, *last_rec;
+               struct llog_rec_tail *tail;
+               loff_t ppos;
+               int llen;
+
+               llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+               /* read up to next LLOG_CHUNK_SIZE block */
+               ppos = *cur_offset;
+               llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+               rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+                                       loghandle->lgh_file, buf, llen,
+                                       cur_offset);
+               if (rc < 0) {
+                       CERROR("Can't read llog block at log id "DOSTID
+                              "/%u offset "LPU64"\n",
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen,
+                              *cur_offset);
+                       RETURN(rc);
+               }
+
+               /* put number of bytes read into rc to make code simpler */
+               rc = *cur_offset - ppos;
+               if (rc < len) {
+                       /* signal the end of the valid buffer to llog_process */
+                       memset(buf + rc, 0, len - rc);
+               }
+
+               if (rc == 0) /* end of file, nothing to do */
+                       RETURN(0);
+
+               if (rc < sizeof(*tail)) {
+                       CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+                              LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, *cur_offset);
+                       RETURN(-EINVAL);
+               }
+
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)(buf + rc -
+                                               sizeof(struct llog_rec_tail));
+
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)(buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+               *cur_idx = tail->lrt_index;
+
+               /* this shouldn't happen */
+               if (tail->lrt_index == 0) {
+                       CERROR("Invalid llog tail at log id "DOSTID"/%u offset "
+                              LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, *cur_offset);
+                       RETURN(-EINVAL);
+               }
+               if (tail->lrt_index < next_idx)
+                       continue;
+
+               /* sanity check that the start of the new buffer is no farther
+                * than the record that we wanted.  This shouldn't happen. */
+               if (rec->lrh_index > next_idx) {
+                       CERROR("missed desired record? %u > %u\n",
+                              rec->lrh_index, next_idx);
+                       RETURN(-ENOENT);
+               }
+               RETURN(0);
+       }
+       RETURN(-EIO);
+}
+
+static int llog_lvfs_prev_block(const struct lu_env *env,
+                               struct llog_handle *loghandle,
+                               int prev_idx, void *buf, int len)
+{
+       __u64 cur_offset;
+       int rc;
+       ENTRY;
+
+       if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+       cur_offset = LLOG_CHUNK_SIZE;
+       llog_skip_over(&cur_offset, 0, prev_idx);
+
+       while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+               struct llog_rec_hdr *rec, *last_rec;
+               struct llog_rec_tail *tail;
+               loff_t ppos = cur_offset;
+
+               rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+                                       loghandle->lgh_file, buf, len,
+                                       &cur_offset);
+               if (rc < 0) {
+                       CERROR("Can't read llog block at log id "DOSTID
+                              "/%u offset "LPU64"\n",
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen,
+                              cur_offset);
+                       RETURN(rc);
+               }
+
+               /* put number of bytes read into rc to make code simpler */
+               rc = cur_offset - ppos;
+
+               if (rc == 0) /* end of file, nothing to do */
+                       RETURN(0);
+
+               if (rc < sizeof(*tail)) {
+                       CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+                              LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, cur_offset);
+                       RETURN(-EINVAL);
+               }
+
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)(buf + rc -
+                                               sizeof(struct llog_rec_tail));
+
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)(buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+               /* this shouldn't happen */
+               if (tail->lrt_index == 0) {
+                       CERROR("Invalid llog tail at log id "DOSTID"/%u offset"
+                              LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, cur_offset);
+                       RETURN(-EINVAL);
+               }
+               if (tail->lrt_index < prev_idx)
+                       continue;
+
+               /* sanity check that the start of the new buffer is no farther
+                * than the record that we wanted.  This shouldn't happen. */
+               if (rec->lrh_index > prev_idx) {
+                       CERROR("missed desired record? %u > %u\n",
+                              rec->lrh_index, prev_idx);
+                       RETURN(-ENOENT);
+               }
+               RETURN(0);
+       }
+       RETURN(-EIO);
+}
+
+static struct file *llog_filp_open(char *dir, char *name, int flags, int mode)
+{
+       char *logname;
+       struct file *filp;
+       int len;
+
+       OBD_ALLOC(logname, PATH_MAX);
+       if (logname == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       len = snprintf(logname, PATH_MAX, "%s/%s", dir, name);
+       if (len >= PATH_MAX - 1) {
+               filp = ERR_PTR(-ENAMETOOLONG);
+       } else {
+               filp = l_filp_open(logname, flags, mode);
+               if (IS_ERR(filp) && PTR_ERR(filp) != -ENOENT)
+                       CERROR("logfile creation %s: %ld\n", logname,
+                              PTR_ERR(filp));
+       }
+       OBD_FREE(logname, PATH_MAX);
+       return filp;
+}
+
+static int llog_lvfs_open(const struct lu_env *env, struct llog_handle *handle,
+                         struct llog_logid *logid, char *name,
+                         enum llog_open_param open_param)
+{
+       struct llog_ctxt        *ctxt = handle->lgh_ctxt;
+       struct l_dentry         *dchild = NULL;
+       struct obd_device       *obd;
+       int                      rc = 0;
+
+       ENTRY;
+
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+       LASSERT(ctxt->loc_exp->exp_obd);
+       obd = ctxt->loc_exp->exp_obd;
+
+       LASSERT(handle);
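+       /* open by logid, by name under CONFIGS, or defer creation to
+        * llog_lvfs_create() when neither is given (LLOG_OPEN_NEW) */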
+       if (logid != NULL) {
+               dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &logid->lgl_oi,
+                                            logid->lgl_ogen);
+               if (IS_ERR(dchild)) {
+                       rc = PTR_ERR(dchild);
+                       CERROR("%s: error looking up logfile #"DOSTID "#%08x:"
+                              " rc = %d\n", ctxt->loc_obd->obd_name,
+                              POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               if (dchild->d_inode == NULL) {
+                       l_dput(dchild);
+                       rc = -ENOENT;
+                       CERROR("%s: nonexistent llog #"DOSTID"#%08x:"
+                              "rc = %d\n", ctxt->loc_obd->obd_name,
+                              POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
+                                                O_RDWR | O_LARGEFILE);
+               l_dput(dchild);
+               if (IS_ERR(handle->lgh_file)) {
+                       rc = PTR_ERR(handle->lgh_file);
+                       handle->lgh_file = NULL;
+                       CERROR("%s: error opening llog #"DOSTID"#%08x:"
+                              "rc = %d\n", ctxt->loc_obd->obd_name,
+                              POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               handle->lgh_id = *logid;
+       } else if (name) {
+               handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name,
+                                                 O_RDWR | O_LARGEFILE, 0644);
+               if (IS_ERR(handle->lgh_file)) {
+                       rc = PTR_ERR(handle->lgh_file);
+                       handle->lgh_file = NULL;
+                       if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+                               OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+                               if (handle->lgh_name)
+                                       strcpy(handle->lgh_name, name);
+                               else
+                                       GOTO(out, rc = -ENOMEM);
+                               rc = 0;
+                       } else {
+                               GOTO(out, rc);
+                       }
+               } else {
+                       lustre_build_llog_lvfs_oid(&handle->lgh_id,
+                           handle->lgh_file->f_dentry->d_inode->i_ino,
+                           handle->lgh_file->f_dentry->d_inode->i_generation);
+               }
+       } else {
+               LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param);
+               handle->lgh_file = NULL;
+       }
+
+       /* an existing llog was expected but does not exist */
+       if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL)
+               GOTO(out_name, rc = -ENOENT);
+
+       RETURN(0);
+out_name:
+       if (handle->lgh_name != NULL)
+               OBD_FREE(handle->lgh_name, strlen(name) + 1);
+out:
+       RETURN(rc);
+}
+
+static int llog_lvfs_exist(struct llog_handle *handle)
+{
+       return (handle->lgh_file != NULL);
+}
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_lvfs_create(const struct lu_env *env,
+                           struct llog_handle *handle,
+                           struct thandle *th)
+{
+       struct llog_ctxt        *ctxt = handle->lgh_ctxt;
+       struct obd_device       *obd;
+       struct l_dentry         *dchild = NULL;
+       struct file             *file;
+       struct obdo             *oa = NULL;
+       int                      rc = 0;
+       int                      open_flags = O_RDWR | O_CREAT | O_LARGEFILE;
+
+       ENTRY;
+
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+       obd = ctxt->loc_exp->exp_obd;
+       LASSERT(handle->lgh_file == NULL);
+
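+       /* a named log becomes a file under CONFIGS; otherwise create an
+        * anonymous object via obd_create() and open it through
+        * obd_lvfs_fid2dentry() */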
+       if (handle->lgh_name) {
+               file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name,
+                                     open_flags, 0644);
+               if (IS_ERR(file))
+                       RETURN(PTR_ERR(file));
+
+               lustre_build_llog_lvfs_oid(&handle->lgh_id,
+                               file->f_dentry->d_inode->i_ino,
+                               file->f_dentry->d_inode->i_generation);
+               handle->lgh_file = file;
+       } else {
+               OBDO_ALLOC(oa);
+               if (oa == NULL)
+                       RETURN(-ENOMEM);
+
+               ostid_set_seq_llog(&oa->o_oi);
+               oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
+
+               rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL);
+               if (rc)
+                       GOTO(out, rc);
+
+               /* FIXME: rationalize the misuse of o_generation in
+                *      this API along with mds_obd_{create,destroy}.
+                *      Hopefully it is only an internal API issue. */
+#define o_generation o_parent_oid
+               dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &oa->o_oi,
+                                            oa->o_generation);
+               if (IS_ERR(dchild))
+                       GOTO(out, rc = PTR_ERR(dchild));
+
+               file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags);
+               l_dput(dchild);
+               if (IS_ERR(file))
+                       GOTO(out, rc = PTR_ERR(file));
+               handle->lgh_id.lgl_oi = oa->o_oi;
+               handle->lgh_id.lgl_ogen = oa->o_generation;
+               handle->lgh_file = file;
+out:
+               OBDO_FREE(oa);
+       }
+       RETURN(rc);
+}
+
+static int llog_lvfs_close(const struct lu_env *env,
+                          struct llog_handle *handle)
+{
+       int rc;
+
+       ENTRY;
+
+       if (handle->lgh_file == NULL)
+               RETURN(0);
+       rc = filp_close(handle->lgh_file, 0);
+       if (rc)
+               CERROR("%s: error closing llog #"DOSTID"#%08x: "
+                      "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&handle->lgh_id.lgl_oi),
+                      handle->lgh_id.lgl_ogen, rc);
+       handle->lgh_file = NULL;
+       if (handle->lgh_name) {
+               OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+               handle->lgh_name = NULL;
+       }
+       RETURN(rc);
+}
+
+static int llog_lvfs_destroy(const struct lu_env *env,
+                            struct llog_handle *handle)
+{
+       struct dentry *fdentry;
+       struct obdo *oa;
+       struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
+       char *dir;
+       void *th;
+       struct inode *inode;
+       int rc, rc1;
+       ENTRY;
+
+       dir = MOUNT_CONFIGS_DIR;
+
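+       /* a log living in the CONFIGS directory is unlinked through the
+        * VFS; any other log is an obd object destroyed via obd_destroy() */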
+       LASSERT(handle->lgh_file);
+       fdentry = handle->lgh_file->f_dentry;
+       inode = fdentry->d_parent->d_inode;
+       if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) {
+               struct lvfs_run_ctxt saved;
+               struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt);
+
+               push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+               dget(fdentry);
+               rc = llog_lvfs_close(env, handle);
+               if (rc == 0) {
+                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+                       rc = ll_vfs_unlink(inode, fdentry, mnt);
+                       mutex_unlock(&inode->i_mutex);
+               }
+               mntput(mnt);
+
+               dput(fdentry);
+               pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+               RETURN(rc);
+       }
+
+       OBDO_ALLOC(oa);
+       if (oa == NULL)
+               RETURN(-ENOMEM);
+
+       oa->o_oi = handle->lgh_id.lgl_oi;
+       oa->o_generation = handle->lgh_id.lgl_ogen;
+#undef o_generation
+       oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER;
+
+       rc = llog_lvfs_close(env, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1);
+       if (IS_ERR(th)) {
+               CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th));
+               GOTO(out, rc = PTR_ERR(th));
+       }
+
+       rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa,
+                        NULL, NULL, NULL, NULL);
+
+       rc1 = fsfilt_commit(obd, inode, th, 0);
+       if (rc == 0 && rc1 != 0)
+               rc = rc1;
+ out:
+       OBDO_FREE(oa);
+       RETURN(rc);
+}
+
+static int llog_lvfs_declare_create(const struct lu_env *env,
+                                   struct llog_handle *res,
+                                   struct thandle *th)
+{
+       return 0;
+}
+
+static int llog_lvfs_declare_write_rec(const struct lu_env *env,
+                                      struct llog_handle *loghandle,
+                                      struct llog_rec_hdr *rec,
+                                      int idx, struct thandle *th)
+{
+       return 0;
+}
+
+struct llog_operations llog_lvfs_ops = {
+       .lop_write_rec          = llog_lvfs_write_rec,
+       .lop_next_block         = llog_lvfs_next_block,
+       .lop_prev_block         = llog_lvfs_prev_block,
+       .lop_read_header        = llog_lvfs_read_header,
+       .lop_create             = llog_lvfs_create,
+       .lop_destroy            = llog_lvfs_destroy,
+       .lop_close              = llog_lvfs_close,
+       .lop_open               = llog_lvfs_open,
+       .lop_exist              = llog_lvfs_exist,
+       .lop_declare_create     = llog_lvfs_declare_create,
+       .lop_declare_write_rec  = llog_lvfs_declare_write_rec,
+};
+EXPORT_SYMBOL(llog_lvfs_ops);
+#else /* !LLOG_LVFS */
+struct llog_operations llog_lvfs_ops = {};
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
new file mode 100644 (file)
index 0000000..7e22907
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/* helper functions for calling the llog obd methods */
+static struct llog_ctxt *llog_new_ctxt(struct obd_device *obd)
+{
+       struct llog_ctxt *ctxt;
+
+       OBD_ALLOC_PTR(ctxt);
+       if (!ctxt)
+               return NULL;
+
+       ctxt->loc_obd = obd;
+       atomic_set(&ctxt->loc_refcount, 1);
+
+       return ctxt;
+}
+
+static void llog_ctxt_destroy(struct llog_ctxt *ctxt)
+{
+       if (ctxt->loc_exp) {
+               class_export_put(ctxt->loc_exp);
+               ctxt->loc_exp = NULL;
+       }
+       if (ctxt->loc_imp) {
+               class_import_put(ctxt->loc_imp);
+               ctxt->loc_imp = NULL;
+       }
+       OBD_FREE_PTR(ctxt);
+}
+
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+       struct obd_llog_group *olg = ctxt->loc_olg;
+       struct obd_device *obd;
+       int rc = 0;
+
+       spin_lock(&olg->olg_lock);
+       if (!atomic_dec_and_test(&ctxt->loc_refcount)) {
+               spin_unlock(&olg->olg_lock);
+               return rc;
+       }
+       olg->olg_ctxts[ctxt->loc_idx] = NULL;
+       spin_unlock(&olg->olg_lock);
+
+       obd = ctxt->loc_obd;
+       spin_lock(&obd->obd_dev_lock);
+       /* sync with llog ctxt user thread */
+       spin_unlock(&obd->obd_dev_lock);
+
+       /* obd->obd_starting is needed to handle cleanup in the error
+        * case while the obd is still starting up. */
+       LASSERTF(obd->obd_starting == 1 ||
+                obd->obd_stopping == 1 || obd->obd_set_up == 0,
+                "wrong obd state: %d/%d/%d\n", !!obd->obd_starting,
+                !!obd->obd_stopping, !!obd->obd_set_up);
+
+       /* cleanup the llog ctxt here */
+       if (CTXTP(ctxt, cleanup))
+               rc = CTXTP(ctxt, cleanup)(env, ctxt);
+
+       llog_ctxt_destroy(ctxt);
+       wake_up(&olg->olg_waitq);
+       return rc;
+}
+EXPORT_SYMBOL(__llog_ctxt_put);
+
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+       struct obd_llog_group *olg;
+       int rc, idx;
+       ENTRY;
+
+       LASSERT(ctxt != NULL);
+       LASSERT(ctxt != LP_POISON);
+
+       olg = ctxt->loc_olg;
+       LASSERT(olg != NULL);
+       LASSERT(olg != LP_POISON);
+
+       idx = ctxt->loc_idx;
+
+       /*
+        * Balance the ctxt get when calling llog_cleanup()
+        */
+       LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON);
+       LASSERT(atomic_read(&ctxt->loc_refcount) > 1);
+       llog_ctxt_put(ctxt);
+
+       /*
+        * Try to free the ctxt.
+        */
+       rc = __llog_ctxt_put(env, ctxt);
+       if (rc)
+               CERROR("Error %d while cleaning up ctxt %p\n",
+                      rc, ctxt);
+
+       l_wait_event(olg->olg_waitq,
+                    llog_group_ctxt_null(olg, idx), &lwi);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cleanup);
+
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+              struct obd_llog_group *olg, int index,
+              struct obd_device *disk_obd, struct llog_operations *op)
+{
+       struct llog_ctxt *ctxt;
+       int rc = 0;
+       ENTRY;
+
+       if (index < 0 || index >= LLOG_MAX_CTXTS)
+               RETURN(-EINVAL);
+
+       LASSERT(olg != NULL);
+
+       ctxt = llog_new_ctxt(obd);
+       if (!ctxt)
+               RETURN(-ENOMEM);
+
+       ctxt->loc_obd = obd;
+       ctxt->loc_olg = olg;
+       ctxt->loc_idx = index;
+       ctxt->loc_logops = op;
+       mutex_init(&ctxt->loc_mutex);
+       ctxt->loc_exp = class_export_get(disk_obd->obd_self_export);
+       ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED;
+
+       rc = llog_group_set_ctxt(olg, ctxt, index);
+       if (rc) {
+               llog_ctxt_destroy(ctxt);
+               if (rc == -EEXIST) {
+                       ctxt = llog_group_get_ctxt(olg, index);
+                       if (ctxt) {
+                               /*
+                                * mds_lov_update_desc() might call this multiple
+                                * times, so if the llog is already set up then
+                                * don't do it again.
+                                */
+                               CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n",
+                                      obd->obd_name, index);
+                               LASSERT(ctxt->loc_olg == olg);
+                               LASSERT(ctxt->loc_obd == obd);
+                               LASSERT(ctxt->loc_exp == disk_obd->obd_self_export);
+                               LASSERT(ctxt->loc_logops == op);
+                               llog_ctxt_put(ctxt);
+                       }
+                       rc = 0;
+               }
+               RETURN(rc);
+       }
+
+       if (op->lop_setup) {
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP))
+                       rc = -EOPNOTSUPP;
+               else
+                       rc = op->lop_setup(env, obd, olg, index, disk_obd);
+       }
+
+       if (rc) {
+               CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n",
+                      obd->obd_name, index, op->lop_setup, rc);
+               llog_group_clear_ctxt(olg, index);
+               llog_ctxt_destroy(ctxt);
+       } else {
+               CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n",
+                      obd->obd_name, index);
+               ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED;
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_setup);
+
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (!ctxt)
+               RETURN(0);
+
+       if (CTXTP(ctxt, sync))
+               rc = CTXTP(ctxt, sync)(ctxt, exp, flags);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_sync);
+
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+                struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+                struct llog_cookie *logcookies, int numcookies)
+{
+       int raised, rc;
+       ENTRY;
+
+       if (!ctxt) {
+               CERROR("No ctxt\n");
+               RETURN(-ENODEV);
+       }
+
+       if (ctxt->loc_flags & LLOG_CTXT_FLAG_UNINITIALIZED)
+               RETURN(-ENXIO);
+
+       CTXT_CHECK_OP(ctxt, obd_add, -EOPNOTSUPP);
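+       /* raise CFS_CAP_SYS_RESOURCE for the duration of the add and
+        * restore the previous capability state afterwards */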
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = CTXTP(ctxt, obd_add)(env, ctxt, rec, lsm, logcookies,
+                                 numcookies);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_obd_add);
+
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+               struct lov_stripe_md *lsm, int count,
+               struct llog_cookie *cookies, int flags)
+{
+       int rc;
+       ENTRY;
+
+       if (!ctxt) {
+               CERROR("No ctxt\n");
+               RETURN(-ENODEV);
+       }
+
+       CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP);
+       rc = CTXTP(ctxt, cancel)(env, ctxt, lsm, count, cookies, flags);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cancel);
+
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                 struct obd_device *disk_obd, int *index)
+{
+       int rc;
+       ENTRY;
+       OBD_CHECK_DT_OP(obd, llog_init, 0);
+       OBD_COUNTER_INCREMENT(obd, llog_init);
+
+       rc = OBP(obd, llog_init)(obd, olg, disk_obd, index);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(obd_llog_init);
+
+int obd_llog_finish(struct obd_device *obd, int count)
+{
+       int rc;
+       ENTRY;
+       OBD_CHECK_DT_OP(obd, llog_finish, 0);
+       OBD_COUNTER_INCREMENT(obd, llog_finish);
+
+       rc = OBP(obd, llog_finish)(obd, count);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(obd_llog_finish);
+
+/* context key constructor/destructor: llog_key_init, llog_key_fini */
+LU_KEY_INIT_FINI(llog, struct llog_thread_info);
+/* context key: llog_thread_key */
+LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL);
+LU_KEY_INIT_GENERIC(llog);
+EXPORT_SYMBOL(llog_thread_key);
+
+int llog_info_init(void)
+{
+       llog_key_init_generic(&llog_thread_key, NULL);
+       lu_context_key_register(&llog_thread_key);
+       return 0;
+}
+
+void llog_info_fini(void)
+{
+       lu_context_key_degister(&llog_thread_key);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_osd.c b/drivers/staging/lustre/lustre/obdclass/llog_osd.c
new file mode 100644 (file)
index 0000000..6dbd21a
--- /dev/null
@@ -0,0 +1,1323 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_osd.c - low level llog routines on top of OSD API
+ *
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#ifndef EXPORT_SYMTAB
+#define EXPORT_SYMTAB
+#endif
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <dt_object.h>
+
+#include "llog_internal.h"
+#include "local_storage.h"
+
+/*
+ * - multi-chunks or big-declaration approach
+ * - use unique sequence instead of llog sb tracking unique ids
+ * - re-use existing environment
+ * - named llog support (can be used only for testing at present)
+ * - llog_origin_connect() work with OSD API
+ */
+
+static int llog_osd_declare_new_object(const struct lu_env *env,
+                                      struct local_oid_storage *los,
+                                      struct dt_object *o,
+                                      struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+
+       lgi->lgi_attr.la_valid = LA_MODE;
+       lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+       lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+       return local_object_declare_create(env, los, o, &lgi->lgi_attr,
+                                          &lgi->lgi_dof, th);
+}
+
+static int llog_osd_create_new_object(const struct lu_env *env,
+                                     struct local_oid_storage *los,
+                                     struct dt_object *o,
+                                     struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+
+       lgi->lgi_attr.la_valid = LA_MODE;
+       lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+       lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+       return local_object_create(env, los, o, &lgi->lgi_attr,
+                                  &lgi->lgi_dof, th);
+}
+
+static int llog_osd_pad(const struct lu_env *env, struct dt_object *o,
+                       loff_t *off, int len, int index, struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(th);
+       LASSERT(off);
+       LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+       lgi->lgi_tail.lrt_len = lgi->lgi_lrh.lrh_len = len;
+       lgi->lgi_tail.lrt_index = lgi->lgi_lrh.lrh_index = index;
+       lgi->lgi_lrh.lrh_type = LLOG_PAD_MAGIC;
+
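+       /* a pad record is just a header at the start of the gap and a
+        * matching tail at its end; the bytes in between are left as-is */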
+       lgi->lgi_buf.lb_buf = &lgi->lgi_lrh;
+       lgi->lgi_buf.lb_len = sizeof(lgi->lgi_lrh);
+       dt_write_lock(env, o, 0);
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc) {
+               CERROR("%s: error writing padding record: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+               GOTO(out, rc);
+       }
+
+       lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+       lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+       *off += len - sizeof(lgi->lgi_lrh) - sizeof(lgi->lgi_tail);
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc)
+               CERROR("%s: error writing padding record: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+out:
+       dt_write_unlock(env, o);
+       RETURN(rc);
+}
+
+static int llog_osd_write_blob(const struct lu_env *env, struct dt_object *o,
+                              struct llog_rec_hdr *rec, void *buf,
+                              loff_t *off, struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       int                      buflen = rec->lrh_len;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(env);
+       LASSERT(o);
+
+       if (buflen == 0)
+               CWARN("0-length record\n");
+
+       CDEBUG(D_OTHER, "write blob with type %x, buf %p/%u at off %llu\n",
+              rec->lrh_type, buf, buflen, *off);
+
+       lgi->lgi_attr.la_valid = LA_SIZE;
+       lgi->lgi_attr.la_size = *off;
+
+       if (!buf) {
+               lgi->lgi_buf.lb_len = buflen;
+               lgi->lgi_buf.lb_buf = rec;
+               rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+               if (rc)
+                       CERROR("%s: error writing log record: rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name, rc);
+               GOTO(out, rc);
+       }
+
+       /* the buf case */
+       /* protect the following 3 writes from concurrent read */
+       dt_write_lock(env, o, 0);
+       rec->lrh_len = sizeof(*rec) + buflen + sizeof(lgi->lgi_tail);
+       lgi->lgi_buf.lb_len = sizeof(*rec);
+       lgi->lgi_buf.lb_buf = rec;
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc) {
+               CERROR("%s: error writing log hdr: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+               GOTO(out_unlock, rc);
+       }
+
+       lgi->lgi_buf.lb_len = buflen;
+       lgi->lgi_buf.lb_buf = buf;
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc) {
+               CERROR("%s: error writing log buffer: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,  rc);
+               GOTO(out_unlock, rc);
+       }
+
+       lgi->lgi_tail.lrt_len = rec->lrh_len;
+       lgi->lgi_tail.lrt_index = rec->lrh_index;
+       lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+       lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc)
+               CERROR("%s: error writing log tail: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+
+out_unlock:
+       dt_write_unlock(env, o);
+
+out:
+       /* cleanup the content written above */
+       if (rc) {
+               dt_punch(env, o, lgi->lgi_attr.la_size, OBD_OBJECT_EOF, th,
+                        BYPASS_CAPA);
+               dt_attr_set(env, o, &lgi->lgi_attr, th, BYPASS_CAPA);
+       }
+
+       RETURN(rc);
+}
+
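+/*
+ * Read the llog header (the first LLOG_CHUNK_SIZE bytes) of the log
+ * object into handle->lgh_hdr, swab it if needed, validate its magic
+ * and size, and record the last used index in lgh_last_idx.  Returns
+ * LLOG_EEMPTY for a zero-length log.
+ */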
+static int llog_osd_read_header(const struct lu_env *env,
+                               struct llog_handle *handle)
+{
+       struct llog_rec_hdr     *llh_hdr;
+       struct dt_object        *o;
+       struct llog_thread_info *lgi;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+       o = handle->lgh_obj;
+       LASSERT(o);
+
+       lgi = llog_info(env);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+       if (rc)
+               RETURN(rc);
+
+       LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+
+       if (lgi->lgi_attr.la_size == 0) {
+               CDEBUG(D_HA, "not reading header from 0-byte log\n");
+               RETURN(LLOG_EEMPTY);
+       }
+
+       lgi->lgi_off = 0;
+       lgi->lgi_buf.lb_buf = handle->lgh_hdr;
+       lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE;
+
+       rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+       if (rc) {
+               CERROR("%s: error reading log header from "DFID": rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      PFID(lu_object_fid(&o->do_lu)), rc);
+               RETURN(rc);
+       }
+
+       llh_hdr = &handle->lgh_hdr->llh_hdr;
+       if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+               lustre_swab_llog_hdr(handle->lgh_hdr);
+
+       if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+               CERROR("%s: bad log %s "DFID" header magic: %#x "
+                      "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name,
+                      handle->lgh_name ? handle->lgh_name : "",
+                      PFID(lu_object_fid(&o->do_lu)),
+                      llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+               RETURN(-EIO);
+       } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+               CERROR("%s: incorrectly sized log %s "DFID" header: "
+                      "%#x (expected %#x)\n"
+                      "you may need to re-run lconf --write_conf.\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      handle->lgh_name ? handle->lgh_name : "",
+                      PFID(lu_object_fid(&o->do_lu)),
+                      llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+               RETURN(-EIO);
+       }
+
+       handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+       RETURN(0);
+}
+
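+/*
+ * Declare the transaction credits needed by llog_osd_write_rec(): one
+ * write of the llog header and, unless only the header is updated, a
+ * possible punch of the file tail plus a write of up to 32KB at the
+ * current end of the log.
+ */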
+static int llog_osd_declare_write_rec(const struct lu_env *env,
+                                     struct llog_handle *loghandle,
+                                     struct llog_rec_hdr *rec,
+                                     int idx, struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(env);
+       LASSERT(th);
+       LASSERT(loghandle);
+
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+
+       /* the header is updated on every write */
+       rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0,
+                                    th);
+       if (rc || idx == 0) /* if error or just header */
+               RETURN(rc);
+
+       if (dt_object_exists(o)) {
+               rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+               lgi->lgi_off = lgi->lgi_attr.la_size;
+               LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE));
+               if (rc)
+                       RETURN(rc);
+
+               rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th);
+               if (rc)
+                       RETURN(rc);
+       } else {
+               lgi->lgi_off = 0;
+       }
+
+       /* XXX: implement declared window or multi-chunks approach */
+       rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th);
+
+       RETURN(rc);
+}
+
+/* Returns negative on error; 0 on success when reccookie == NULL; 1 otherwise. */
+/* Appends if idx == -1, otherwise overwrites record idx. */
+static int llog_osd_write_rec(const struct lu_env *env,
+                             struct llog_handle *loghandle,
+                             struct llog_rec_hdr *rec,
+                             struct llog_cookie *reccookie, int cookiecount,
+                             void *buf, int idx, struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct llog_log_hdr     *llh;
+       int                      reclen = rec->lrh_len;
+       int                      index, rc, old_tail_idx;
+       struct llog_rec_tail    *lrt;
+       struct dt_object        *o;
+       size_t                   left;
+
+       ENTRY;
+
+       LASSERT(env);
+       llh = loghandle->lgh_hdr;
+       LASSERT(llh);
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+       LASSERT(th);
+
+       CDEBUG(D_OTHER, "new record %x to "DFID"\n",
+              rec->lrh_type, PFID(lu_object_fid(&o->do_lu)));
+
+       /* the record length must not be bigger than LLOG_CHUNK_SIZE */
+       if (buf)
+               rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+                     sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+       else
+               rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+       if (rc)
+               RETURN(rc);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+       if (rc)
+               RETURN(rc);
+
+       if (buf)
+               /* write_blob adds header and tail to lrh_len. */
+               reclen = sizeof(*rec) + rec->lrh_len +
+                        sizeof(struct llog_rec_tail);
+
+       if (idx != -1) {
+               /* no header: only allowed to insert record 1 */
+               if (idx != 1 && lgi->lgi_attr.la_size == 0)
+                       LBUG();
+
+               if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+                       RETURN(-EINVAL);
+
+               if (!ext2_test_bit(idx, llh->llh_bitmap))
+                       CERROR("%s: modify unset record %u\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name, idx);
+               if (idx != rec->lrh_index)
+                       CERROR("%s: index mismatch %d %u\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name, idx,
+                              rec->lrh_index);
+
+               lgi->lgi_off = 0;
+               rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+                                        &lgi->lgi_off, th);
+               /* we are done if we only write the header or on error */
+               if (rc || idx == 0)
+                       RETURN(rc);
+
+               if (buf) {
+                       /* We assume that caller has set lgh_cur_* */
+                       lgi->lgi_off = loghandle->lgh_cur_offset;
+                       CDEBUG(D_OTHER,
+                              "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+                              "offset %llu\n",
+                              POSTID(&loghandle->lgh_id.lgl_oi), idx,
+                              rec->lrh_index,
+                              loghandle->lgh_cur_idx, rec->lrh_len,
+                              (long long)(lgi->lgi_off - sizeof(*llh)));
+                       if (rec->lrh_index != loghandle->lgh_cur_idx) {
+                               CERROR("%s: modify idx mismatch %u/%d\n",
+                                      o->do_lu.lo_dev->ld_obd->obd_name, idx,
+                                      loghandle->lgh_cur_idx);
+                               RETURN(-EFAULT);
+                       }
+               } else {
+                       /* Assumes constant lrh_len */
+                       lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen;
+               }
+
+               rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+               if (rc == 0 && reccookie) {
+                       reccookie->lgc_lgl = loghandle->lgh_id;
+                       reccookie->lgc_index = idx;
+                       rc = 1;
+               }
+               RETURN(rc);
+       }
+
+       /* Make sure that records don't cross a chunk boundary, so we can
+        * process them page-at-a-time if needed.  If it will cross a chunk
+        * boundary, write in a fake (but referenced) entry to pad the chunk.
+        *
+        * We know that llog_current_log() will return a loghandle that is
+        * big enough to hold reclen, so all we care about is padding here.
+        */
+       LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+       lgi->lgi_off = lgi->lgi_attr.la_size;
+       left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1));
+       /* NOTE: padding is a record, but no bit is set */
+       if (left != 0 && left != reclen &&
+           left < (reclen + LLOG_MIN_REC_SIZE)) {
+               index = loghandle->lgh_last_idx + 1;
+               rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th);
+               if (rc)
+                       RETURN(rc);
+               loghandle->lgh_last_idx++; /*for pad rec*/
+       }
+       /* if it's the last idx in log file, then return -ENOSPC */
+       if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+               RETURN(-ENOSPC);
+
+       loghandle->lgh_last_idx++;
+       index = loghandle->lgh_last_idx;
+       LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       rec->lrh_index = index;
+       if (buf == NULL) {
+               lrt = (struct llog_rec_tail *)((char *)rec + rec->lrh_len -
+                                              sizeof(*lrt));
+               lrt->lrt_len = rec->lrh_len;
+               lrt->lrt_index = rec->lrh_index;
+       }
+       /* The caller should make sure only one process accesses lgh_last_idx,
+        * otherwise the assertion below may be hit. */
+       LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       spin_lock(&loghandle->lgh_hdr_lock);
+       if (ext2_set_bit(index, llh->llh_bitmap)) {
+               CERROR("%s: index %u already set in log bitmap\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, index);
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               LBUG(); /* should never happen */
+       }
+       llh->llh_count++;
+       spin_unlock(&loghandle->lgh_hdr_lock);
+       old_tail_idx = llh->llh_tail.lrt_index;
+       llh->llh_tail.lrt_index = index;
+
+       lgi->lgi_off = 0;
+       rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, &lgi->lgi_off,
+                                th);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+       if (rc)
+               GOTO(out, rc);
+
+       LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+       lgi->lgi_off = lgi->lgi_attr.la_size;
+
+       rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+
+out:
+       /* cleanup llog for error case */
+       if (rc) {
+               spin_lock(&loghandle->lgh_hdr_lock);
+               ext2_clear_bit(index, llh->llh_bitmap);
+               llh->llh_count--;
+               spin_unlock(&loghandle->lgh_hdr_lock);
+
+               /* restore the header */
+               loghandle->lgh_last_idx--;
+               llh->llh_tail.lrt_index = old_tail_idx;
+               lgi->lgi_off = 0;
+               llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+                                   &lgi->lgi_off, th);
+       }
+
+       CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n",
+              POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+       if (rc == 0 && reccookie) {
+               reccookie->lgc_lgl = loghandle->lgh_id;
+               reccookie->lgc_index = index;
+               if ((rec->lrh_type == MDS_UNLINK_REC) ||
+                   (rec->lrh_type == MDS_SETATTR64_REC))
+                       reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+               else if (rec->lrh_type == OST_SZ_REC)
+                       reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+               else
+                       reccookie->lgc_subsys = -1;
+               rc = 1;
+       }
+       RETURN(rc);
+}
+
+/* We can skip reading at least as many log blocks as the number of
+ * minimum sized log records we are skipping.  If it turns out
+ * that we are not far enough along the log (because the
+ * actual records are larger than minimum size) we just skip
+ * some more records.
+ */
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+       if (goal <= curr)
+               return;
+       *off = (*off + (goal - curr - 1) * LLOG_MIN_REC_SIZE) &
+               ~(LLOG_CHUNK_SIZE - 1);
+}
+
+/* sets:
+ *  - cur_offset to the furthest point read in the log file
+ *  - cur_idx to the log index preceding cur_offset
+ * returns -EIO/-EINVAL on error
+ */
+static int llog_osd_next_block(const struct lu_env *env,
+                              struct llog_handle *loghandle, int *cur_idx,
+                              int next_idx, __u64 *cur_offset, void *buf,
+                              int len)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o;
+       struct dt_device        *dt;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(env);
+       LASSERT(lgi);
+
+       if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+              next_idx, *cur_idx, *cur_offset);
+
+       LASSERT(loghandle);
+       LASSERT(loghandle->lgh_ctxt);
+
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+       LASSERT(dt_object_exists(o));
+       dt = lu2dt_dev(o->do_lu.lo_dev);
+       LASSERT(dt);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+       if (rc)
+               GOTO(out, rc);
+
+       while (*cur_offset < lgi->lgi_attr.la_size) {
+               struct llog_rec_hdr     *rec, *last_rec;
+               struct llog_rec_tail    *tail;
+
+               llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+               /* read up to next LLOG_CHUNK_SIZE block */
+               lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE -
+                                     (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+               lgi->lgi_buf.lb_buf = buf;
+
+               /* Note: no read lock is needed around the la_size read done
+                * above by dt_attr_get(). Only two cases matter: either
+                * la_size == cur_offset, in which case the entire read is
+                * skipped, or la_size > cur_offset, in which case the loop is
+                * entered and this thread blocks at dt_read_lock() until the
+                * write completes. Once the write has completed, dt_read()
+                * is done with the full length and returns the full data.
+                */
+               dt_read_lock(env, o, 0);
+               rc = dt_read(env, o, &lgi->lgi_buf, cur_offset);
+               dt_read_unlock(env, o);
+               if (rc < 0) {
+                       CERROR("%s: can't read llog block from log "DFID
+                              " offset "LPU64": rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              PFID(lu_object_fid(&o->do_lu)), *cur_offset,
+                              rc);
+                       GOTO(out, rc);
+               }
+
+               if (rc < len) {
+                       /* signal the end of the valid buffer to
+                        * llog_process */
+                       memset(buf + rc, 0, len - rc);
+               }
+
+               if (rc == 0) /* end of file, nothing to do */
+                       GOTO(out, rc);
+
+               if (rc < sizeof(*tail)) {
+                       CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+                              "offset "LPU64"\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, *cur_offset);
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)((char *)buf + rc -
+                                               sizeof(struct llog_rec_tail));
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+               *cur_idx = tail->lrt_index;
+
+               /* this shouldn't happen */
+               if (tail->lrt_index == 0) {
+                       CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+                              "offset "LPU64"\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, *cur_offset);
+                       GOTO(out, rc = -EINVAL);
+               }
+               if (tail->lrt_index < next_idx)
+                       continue;
+
+               /* sanity check that the start of the new buffer is no farther
+                * than the record that we wanted.  This shouldn't happen. */
+               if (rec->lrh_index > next_idx) {
+                       CERROR("%s: missed desired record? %u > %u\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              rec->lrh_index, next_idx);
+                       GOTO(out, rc = -ENOENT);
+               }
+               GOTO(out, rc = 0);
+       }
+       GOTO(out, rc = -EIO);
+out:
+       return rc;
+}
+
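+/*
+ * Read the block of the log that contains record @prev_idx into @buf.
+ * The search starts from an offset estimated by llog_skip_over() and
+ * scans forward block by block, as llog_osd_next_block() does.
+ */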
+static int llog_osd_prev_block(const struct lu_env *env,
+                              struct llog_handle *loghandle,
+                              int prev_idx, void *buf, int len)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o;
+       struct dt_device        *dt;
+       loff_t                   cur_offset;
+       int                      rc;
+
+       ENTRY;
+
+       if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+       LASSERT(loghandle);
+       LASSERT(loghandle->lgh_ctxt);
+
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+       LASSERT(dt_object_exists(o));
+       dt = lu2dt_dev(o->do_lu.lo_dev);
+       LASSERT(dt);
+
+       cur_offset = LLOG_CHUNK_SIZE;
+       llog_skip_over(&cur_offset, 0, prev_idx);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+       if (rc)
+               GOTO(out, rc);
+
+       while (cur_offset < lgi->lgi_attr.la_size) {
+               struct llog_rec_hdr     *rec, *last_rec;
+               struct llog_rec_tail    *tail;
+
+               lgi->lgi_buf.lb_len = len;
+               lgi->lgi_buf.lb_buf = buf;
+               /* It is OK to have locking around dt_read() only, see
+                * comment in llog_osd_next_block for details
+                */
+               dt_read_lock(env, o, 0);
+               rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset);
+               dt_read_unlock(env, o);
+               if (rc < 0) {
+                       CERROR("%s: can't read llog block from log "DFID
+                              " offset "LPU64": rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              PFID(lu_object_fid(&o->do_lu)), cur_offset, rc);
+                       GOTO(out, rc);
+               }
+
+               if (rc == 0) /* end of file, nothing to do */
+                       GOTO(out, rc);
+
+               if (rc < sizeof(*tail)) {
+                       CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+                              "offset "LPU64"\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, cur_offset);
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)((char *)buf + rc -
+                                               sizeof(struct llog_rec_tail));
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+               /* this shouldn't happen */
+               if (tail->lrt_index == 0) {
+                       CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+                              "offset "LPU64"\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, cur_offset);
+                       GOTO(out, rc = -EINVAL);
+               }
+               if (tail->lrt_index < prev_idx)
+                       continue;
+
+               /* sanity check that the start of the new buffer is no farther
+                * than the record that we wanted.  This shouldn't happen. */
+               if (rec->lrh_index > prev_idx) {
+                       CERROR("%s: missed desired record? %u > %u\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              rec->lrh_index, prev_idx);
+                       GOTO(out, rc = -ENOENT);
+               }
+               GOTO(out, rc = 0);
+       }
+       GOTO(out, rc = -EIO);
+out:
+       return rc;
+}
+
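+/*
+ * Return the directory object used for named llogs, with a reference
+ * held by the caller: either the directory set in ctxt->loc_dir or the
+ * root object of the underlying dt device.
+ */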
+struct dt_object *llog_osd_dir_get(const struct lu_env *env,
+                                  struct llog_ctxt *ctxt)
+{
+       struct dt_device        *dt;
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dir;
+       int                      rc;
+
+       dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+       if (ctxt->loc_dir == NULL) {
+               rc = dt_root_get(env, dt, &dti->dti_fid);
+               if (rc)
+                       return ERR_PTR(rc);
+               dir = dt_locate(env, dt, &dti->dti_fid);
+       } else {
+               lu_object_get(&ctxt->loc_dir->do_lu);
+               dir = ctxt->loc_dir;
+       }
+
+       return dir;
+}
+
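+/*
+ * Open (but do not create) a llog object.  The object is located either
+ * directly by @logid, by looking @name up in the llog directory, or,
+ * for LLOG_OPEN_NEW, by generating a new local FID.  The object and its
+ * local OID storage are stored in the handle; the on-disk object itself
+ * is created later by llog_osd_create().
+ */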
+static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle,
+                        struct llog_logid *logid, char *name,
+                        enum llog_open_param open_param)
+{
+       struct llog_thread_info         *lgi = llog_info(env);
+       struct llog_ctxt                *ctxt = handle->lgh_ctxt;
+       struct dt_object                *o;
+       struct dt_device                *dt;
+       struct ls_device                *ls;
+       struct local_oid_storage        *los;
+       int                              rc = 0;
+
+       ENTRY;
+
+       LASSERT(env);
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+       LASSERT(ctxt->loc_exp->exp_obd);
+       dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+       LASSERT(dt);
+
+       ls = ls_device_get(dt);
+       if (IS_ERR(ls))
+               RETURN(PTR_ERR(ls));
+
+       mutex_lock(&ls->ls_los_mutex);
+       los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG);
+       mutex_unlock(&ls->ls_los_mutex);
+       LASSERT(los);
+       ls_device_put(env, ls);
+
+       LASSERT(handle);
+
+       if (logid != NULL) {
+               logid_to_fid(logid, &lgi->lgi_fid);
+       } else if (name) {
+               struct dt_object *llog_dir;
+
+               llog_dir = llog_osd_dir_get(env, ctxt);
+               if (IS_ERR(llog_dir))
+                       GOTO(out, rc = PTR_ERR(llog_dir));
+               dt_read_lock(env, llog_dir, 0);
+               rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid);
+               dt_read_unlock(env, llog_dir);
+               lu_object_put(env, &llog_dir->do_lu);
+               if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+                       /* generate fid for new llog */
+                       rc = local_object_fid_generate(env, los,
+                                                      &lgi->lgi_fid);
+               }
+               if (rc < 0)
+                       GOTO(out, rc);
+               OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+               if (handle->lgh_name)
+                       strcpy(handle->lgh_name, name);
+               else
+                       GOTO(out, rc = -ENOMEM);
+       } else {
+               LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param);
+               /* generate fid for new llog */
+               rc = local_object_fid_generate(env, los, &lgi->lgi_fid);
+               if (rc < 0)
+                       GOTO(out, rc);
+       }
+
+       o = ls_locate(env, ls, &lgi->lgi_fid);
+       if (IS_ERR(o))
+               GOTO(out_name, rc = PTR_ERR(o));
+
+       /* the llog is not expected to be new, yet the object does not exist */
+       if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o))
+               GOTO(out_put, rc = -ENOENT);
+
+       fid_to_logid(&lgi->lgi_fid, &handle->lgh_id);
+       handle->lgh_obj = o;
+       handle->private_data = los;
+       LASSERT(handle->lgh_ctxt);
+
+       RETURN(rc);
+
+out_put:
+       lu_object_put(env, &o->do_lu);
+out_name:
+       if (handle->lgh_name != NULL)
+               OBD_FREE(handle->lgh_name, strlen(name) + 1);
+out:
+       dt_los_put(los);
+       RETURN(rc);
+}
+
+static int llog_osd_exist(struct llog_handle *handle)
+{
+       LASSERT(handle->lgh_obj);
+       return (dt_object_exists(handle->lgh_obj) &&
+               !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header));
+}
+
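+/*
+ * Declare the credits needed to create the llog object: the object
+ * creation itself, the initial header write, and, for a named llog, the
+ * insertion of the name into the llog directory.
+ */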
+static int llog_osd_declare_create(const struct lu_env *env,
+                                  struct llog_handle *res, struct thandle *th)
+{
+       struct llog_thread_info         *lgi = llog_info(env);
+       struct local_oid_storage        *los;
+       struct dt_object                *o;
+       int                              rc;
+
+       ENTRY;
+
+       LASSERT(res->lgh_obj);
+       LASSERT(th);
+
+       /* the object may already have been created by another thread */
+       o = res->lgh_obj;
+       if (dt_object_exists(o))
+               RETURN(0);
+
+       los = res->private_data;
+       LASSERT(los);
+
+       rc = llog_osd_declare_new_object(env, los, o, th);
+       if (rc)
+               RETURN(rc);
+
+       rc = dt_declare_record_write(env, o, LLOG_CHUNK_SIZE, 0, th);
+       if (rc)
+               RETURN(rc);
+
+       if (res->lgh_name) {
+               struct dt_object *llog_dir;
+
+               llog_dir = llog_osd_dir_get(env, res->lgh_ctxt);
+               if (IS_ERR(llog_dir))
+                       RETURN(PTR_ERR(llog_dir));
+               logid_to_fid(&res->lgh_id, &lgi->lgi_fid);
+               rc = dt_declare_insert(env, llog_dir,
+                                      (struct dt_rec *)&lgi->lgi_fid,
+                                      (struct dt_key *)res->lgh_name, th);
+               lu_object_put(env, &llog_dir->do_lu);
+               if (rc)
+                       CERROR("%s: can't declare named llog %s: rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              res->lgh_name, rc);
+       }
+       RETURN(rc);
+}
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_osd_create(const struct lu_env *env, struct llog_handle *res,
+                          struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct local_oid_storage *los;
+       struct dt_object        *o;
+       int                   rc = 0;
+
+       ENTRY;
+
+       LASSERT(env);
+       o = res->lgh_obj;
+       LASSERT(o);
+
+       /* the llog may already have been created */
+       if (dt_object_exists(o))
+               RETURN(-EEXIST);
+
+       los = res->private_data;
+       LASSERT(los);
+
+       dt_write_lock(env, o, 0);
+       if (!dt_object_exists(o))
+               rc = llog_osd_create_new_object(env, los, o, th);
+       else
+               rc = -EEXIST;
+
+       dt_write_unlock(env, o);
+       if (rc)
+               RETURN(rc);
+
+       if (res->lgh_name) {
+               struct dt_object *llog_dir;
+
+               llog_dir = llog_osd_dir_get(env, res->lgh_ctxt);
+               if (IS_ERR(llog_dir))
+                       RETURN(PTR_ERR(llog_dir));
+
+               logid_to_fid(&res->lgh_id, &lgi->lgi_fid);
+               dt_read_lock(env, llog_dir, 0);
+               rc = dt_insert(env, llog_dir,
+                              (struct dt_rec *)&lgi->lgi_fid,
+                              (struct dt_key *)res->lgh_name,
+                              th, BYPASS_CAPA, 1);
+               dt_read_unlock(env, llog_dir);
+               lu_object_put(env, &llog_dir->do_lu);
+               if (rc)
+                       CERROR("%s: can't create named llog %s: rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              res->lgh_name, rc);
+       }
+       RETURN(rc);
+}
+
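+/* Release the llog object, its local OID storage and the copied name. */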
+static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle)
+{
+       struct local_oid_storage        *los;
+       int                              rc = 0;
+
+       ENTRY;
+
+       LASSERT(handle->lgh_obj);
+
+       lu_object_put(env, &handle->lgh_obj->do_lu);
+
+       los = handle->private_data;
+       LASSERT(los);
+       dt_los_put(los);
+
+       if (handle->lgh_name)
+               OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+
+       RETURN(rc);
+}
+
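+/*
+ * Destroy the llog object in a local transaction: remove the name from
+ * the llog directory (if any), drop the object reference and destroy
+ * the object itself.
+ */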
+static int llog_osd_destroy(const struct lu_env *env,
+                           struct llog_handle *loghandle)
+{
+       struct llog_ctxt        *ctxt;
+       struct dt_object        *o, *llog_dir = NULL;
+       struct dt_device        *d;
+       struct thandle          *th;
+       char                    *name = NULL;
+       int                      rc;
+
+       ENTRY;
+
+       ctxt = loghandle->lgh_ctxt;
+       LASSERT(ctxt);
+
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+
+       d = lu2dt_dev(o->do_lu.lo_dev);
+       LASSERT(d);
+       LASSERT(d == ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt);
+
+       th = dt_trans_create(env, d);
+       if (IS_ERR(th))
+               RETURN(PTR_ERR(th));
+
+       if (loghandle->lgh_name) {
+               llog_dir = llog_osd_dir_get(env, ctxt);
+               if (IS_ERR(llog_dir))
+                       GOTO(out_trans, rc = PTR_ERR(llog_dir));
+
+               name = loghandle->lgh_name;
+               rc = dt_declare_delete(env, llog_dir,
+                                      (struct dt_key *)name, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+       }
+
+       dt_declare_ref_del(env, o, th);
+
+       rc = dt_declare_destroy(env, o, th);
+       if (rc)
+               GOTO(out_trans, rc);
+
+       rc = dt_trans_start_local(env, d, th);
+       if (rc)
+               GOTO(out_trans, rc);
+
+       dt_write_lock(env, o, 0);
+       if (dt_object_exists(o)) {
+               if (name) {
+                       dt_read_lock(env, llog_dir, 0);
+                       rc = dt_delete(env, llog_dir,
+                                      (struct dt_key *) name,
+                                      th, BYPASS_CAPA);
+                       dt_read_unlock(env, llog_dir);
+                       if (rc) {
+                               CERROR("%s: can't remove llog %s: rc = %d\n",
+                                      o->do_lu.lo_dev->ld_obd->obd_name,
+                                      name, rc);
+                               GOTO(out_unlock, rc);
+                       }
+               }
+               dt_ref_del(env, o, th);
+               rc = dt_destroy(env, o, th);
+               if (rc)
+                       GOTO(out_unlock, rc);
+       }
+out_unlock:
+       dt_write_unlock(env, o);
+out_trans:
+       dt_trans_stop(env, d, th);
+       if (llog_dir != NULL)
+               lu_object_put(env, &llog_dir->do_lu);
+       RETURN(rc);
+}
+
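+/*
+ * Per-context setup: initialize the two local OID storages
+ * (FID_SEQ_LLOG for anonymous llogs and FID_SEQ_LLOG_NAME for named
+ * ones) that are later used to generate FIDs for new llog objects.
+ */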
+static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd,
+                         struct obd_llog_group *olg, int ctxt_idx,
+                         struct obd_device *disk_obd)
+{
+       struct local_oid_storage        *los;
+       struct llog_thread_info         *lgi = llog_info(env);
+       struct llog_ctxt                *ctxt;
+       int                              rc = 0;
+
+       ENTRY;
+
+       LASSERT(obd);
+       LASSERT(olg->olg_ctxts[ctxt_idx]);
+
+       ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]);
+       LASSERT(ctxt);
+
+       /* initialize the data used to generate new FIDs;
+        * in practice we need a sequence */
+       lgi->lgi_fid.f_seq = FID_SEQ_LLOG;
+       lgi->lgi_fid.f_oid = 1;
+       lgi->lgi_fid.f_ver = 0;
+       rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+                                   &lgi->lgi_fid, &los);
+       if (rc < 0)
+               return rc;
+
+       lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME;
+       lgi->lgi_fid.f_oid = 1;
+       lgi->lgi_fid.f_ver = 0;
+       rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+                                   &lgi->lgi_fid, &los);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+
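+/* Counterpart of llog_osd_setup(): release both local OID storages. */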
+static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+       struct dt_device                *dt;
+       struct ls_device                *ls;
+       struct local_oid_storage        *los, *nlos;
+
+       LASSERT(ctxt->loc_exp->exp_obd);
+       dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+       ls = ls_device_get(dt);
+       if (IS_ERR(ls))
+               RETURN(PTR_ERR(ls));
+
+       mutex_lock(&ls->ls_los_mutex);
+       los = dt_los_find(ls, FID_SEQ_LLOG);
+       nlos = dt_los_find(ls, FID_SEQ_LLOG_NAME);
+       mutex_unlock(&ls->ls_los_mutex);
+       if (los != NULL) {
+               dt_los_put(los);
+               local_oid_storage_fini(env, los);
+       }
+       if (nlos != NULL) {
+               dt_los_put(nlos);
+               local_oid_storage_fini(env, nlos);
+       }
+       ls_device_put(env, ls);
+       return 0;
+}
+
+struct llog_operations llog_osd_ops = {
+       .lop_next_block         = llog_osd_next_block,
+       .lop_prev_block         = llog_osd_prev_block,
+       .lop_read_header        = llog_osd_read_header,
+       .lop_destroy            = llog_osd_destroy,
+       .lop_setup              = llog_osd_setup,
+       .lop_cleanup            = llog_osd_cleanup,
+       .lop_open               = llog_osd_open,
+       .lop_exist              = llog_osd_exist,
+       .lop_declare_create     = llog_osd_declare_create,
+       .lop_create             = llog_osd_create,
+       .lop_declare_write_rec  = llog_osd_declare_write_rec,
+       .lop_write_rec          = llog_osd_write_rec,
+       .lop_close              = llog_osd_close,
+};
+EXPORT_SYMBOL(llog_osd_ops);
+
+/* reads the catalog list from the CATALOGS file, creating it if missing */
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+                         int idx, int count, struct llog_catid *idarray)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o = NULL;
+       struct thandle          *th;
+       int                      rc, size;
+
+       ENTRY;
+
+       LASSERT(d);
+
+       size = sizeof(*idarray) * count;
+       lgi->lgi_off = idx *  sizeof(*idarray);
+
+       lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+       o = dt_locate(env, d, &lgi->lgi_fid);
+       if (IS_ERR(o))
+               RETURN(PTR_ERR(o));
+
+       if (!dt_object_exists(o)) {
+               th = dt_trans_create(env, d);
+               if (IS_ERR(th))
+                       GOTO(out, rc = PTR_ERR(th));
+
+               lgi->lgi_attr.la_valid = LA_MODE;
+               lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+               lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+               rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL,
+                                      &lgi->lgi_dof, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_trans_start_local(env, d, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               dt_write_lock(env, o, 0);
+               if (!dt_object_exists(o))
+                       rc = dt_create(env, o, &lgi->lgi_attr, NULL,
+                                      &lgi->lgi_dof, th);
+               dt_write_unlock(env, o);
+out_trans:
+               dt_trans_stop(env, d, th);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+       if (rc)
+               GOTO(out, rc);
+
+       if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+               CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      lgi->lgi_attr.la_mode);
+               GOTO(out, rc = -ENOENT);
+       }
+
+       CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
+              (int)lgi->lgi_attr.la_size, size);
+
+       /* return just number of llogs */
+       if (idarray == NULL) {
+               rc = lgi->lgi_attr.la_size / sizeof(*idarray);
+               GOTO(out, rc);
+       }
+
+       /* read for new ost index or for empty file */
+       memset(idarray, 0, size);
+       if (lgi->lgi_attr.la_size <= lgi->lgi_off)
+               GOTO(out, rc = 0);
+       if (lgi->lgi_attr.la_size < lgi->lgi_off + size)
+               size = lgi->lgi_attr.la_size - lgi->lgi_off;
+
+       lgi->lgi_buf.lb_buf = idarray;
+       lgi->lgi_buf.lb_len = size;
+       rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+       if (rc) {
+               CERROR("%s: error reading CATALOGS: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,  rc);
+               GOTO(out, rc);
+       }
+
+       EXIT;
+out:
+       lu_object_put(env, &o->do_lu);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_get_cat_list);
+
+/* writes the catalog list back to the CATALOGS file */
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+                         int idx, int count, struct llog_catid *idarray)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o = NULL;
+       struct thandle          *th;
+       int                      rc, size;
+
+       if (!count)
+               RETURN(0);
+
+       LASSERT(d);
+
+       size = sizeof(*idarray) * count;
+       lgi->lgi_off = idx * sizeof(*idarray);
+
+       lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+       o = dt_locate(env, d, &lgi->lgi_fid);
+       if (IS_ERR(o))
+               RETURN(PTR_ERR(o));
+
+       if (!dt_object_exists(o))
+               GOTO(out, rc = -ENOENT);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+       if (rc)
+               GOTO(out, rc);
+
+       if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+               CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      lgi->lgi_attr.la_mode);
+               GOTO(out, rc = -ENOENT);
+       }
+
+       th = dt_trans_create(env, d);
+       if (IS_ERR(th))
+               GOTO(out, rc = PTR_ERR(th));
+
+       rc = dt_declare_record_write(env, o, size, lgi->lgi_off, th);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = dt_trans_start_local(env, d, th);
+       if (rc)
+               GOTO(out_trans, rc);
+
+       lgi->lgi_buf.lb_buf = idarray;
+       lgi->lgi_buf.lb_len = size;
+       rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
+       if (rc)
+               CDEBUG(D_INODE, "error writeing CATALOGS: rc = %d\n", rc);
+out_trans:
+       dt_trans_stop(env, d, th);
+out:
+       lu_object_put(env, &o->do_lu);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_put_cat_list);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
new file mode 100644 (file)
index 0000000..ea70b99
--- /dev/null
@@ -0,0 +1,402 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_swab.c
+ *
+ * Swabbing of llog datatypes (from disk or over the wire).
+ *
+ * Author: jacob berkman  <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <lustre_log.h>
+
+static void print_llogd_body(struct llogd_body *d)
+{
+       CDEBUG(D_OTHER, "llogd body: %p\n", d);
+       CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n",
+              POSTID(&d->lgd_logid.lgl_oi));
+       CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen);
+       CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx);
+       CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags);
+       CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index);
+       CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index);
+       CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len);
+       CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset);
+}
+
+void lustre_swab_lu_fid(struct lu_fid *fid)
+{
+       __swab64s (&fid->f_seq);
+       __swab32s (&fid->f_oid);
+       __swab32s (&fid->f_ver);
+}
+EXPORT_SYMBOL(lustre_swab_lu_fid);
+
+void lustre_swab_ost_id(struct ost_id *oid)
+{
+       if (fid_seq_is_mdt0(oid->oi.oi_seq)) {
+               __swab64s(&oid->oi.oi_id);
+               __swab64s(&oid->oi.oi_seq);
+       } else {
+               lustre_swab_lu_fid(&oid->oi_fid);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_ost_id);
+
+void lustre_swab_llogd_body (struct llogd_body *d)
+{
+       ENTRY;
+       print_llogd_body(d);
+       lustre_swab_ost_id(&d->lgd_logid.lgl_oi);
+       __swab32s (&d->lgd_logid.lgl_ogen);
+       __swab32s (&d->lgd_ctxt_idx);
+       __swab32s (&d->lgd_llh_flags);
+       __swab32s (&d->lgd_index);
+       __swab32s (&d->lgd_saved_index);
+       __swab32s (&d->lgd_len);
+       __swab64s (&d->lgd_cur_offset);
+       print_llogd_body(d);
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llogd_body);
+
+void lustre_swab_llogd_conn_body (struct llogd_conn_body *d)
+{
+       __swab64s (&d->lgdc_gen.mnt_cnt);
+       __swab64s (&d->lgdc_gen.conn_cnt);
+       lustre_swab_ost_id(&d->lgdc_logid.lgl_oi);
+       __swab32s (&d->lgdc_logid.lgl_ogen);
+       __swab32s (&d->lgdc_ctxt_idx);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
+
+void lustre_swab_ll_fid(struct ll_fid *fid)
+{
+       __swab64s (&fid->id);
+       __swab32s (&fid->generation);
+       __swab32s (&fid->f_type);
+}
+EXPORT_SYMBOL(lustre_swab_ll_fid);
+
+void lustre_swab_lu_seq_range(struct lu_seq_range *range)
+{
+       __swab64s (&range->lsr_start);
+       __swab64s (&range->lsr_end);
+       __swab32s (&range->lsr_index);
+       __swab32s (&range->lsr_flags);
+}
+EXPORT_SYMBOL(lustre_swab_lu_seq_range);
+
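+/*
+ * Swab a single llog record: the common llog_rec_hdr first, then the
+ * type-specific body, and finally the record tail (when the record
+ * type has one).
+ */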
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec)
+{
+       struct llog_rec_tail *tail = NULL;
+
+       __swab32s(&rec->lrh_len);
+       __swab32s(&rec->lrh_index);
+       __swab32s(&rec->lrh_type);
+       __swab32s(&rec->lrh_id);
+
+       switch (rec->lrh_type) {
+       case OST_SZ_REC:
+       {
+               struct llog_size_change_rec *lsc =
+                       (struct llog_size_change_rec *)rec;
+
+               lustre_swab_ll_fid(&lsc->lsc_fid);
+               __swab32s(&lsc->lsc_ioepoch);
+               tail = &lsc->lsc_tail;
+               break;
+       }
+       case MDS_UNLINK_REC:
+       {
+               struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+
+               __swab64s(&lur->lur_oid);
+               __swab32s(&lur->lur_oseq);
+               __swab32s(&lur->lur_count);
+               tail = &lur->lur_tail;
+               break;
+       }
+       case MDS_UNLINK64_REC:
+       {
+               struct llog_unlink64_rec *lur =
+                       (struct llog_unlink64_rec *)rec;
+
+               lustre_swab_lu_fid(&lur->lur_fid);
+               __swab32s(&lur->lur_count);
+               tail = &lur->lur_tail;
+               break;
+       }
+       case CHANGELOG_REC:
+       {
+               struct llog_changelog_rec *cr = (struct llog_changelog_rec*)rec;
+
+               __swab16s(&cr->cr.cr_namelen);
+               __swab16s(&cr->cr.cr_flags);
+               __swab32s(&cr->cr.cr_type);
+               __swab64s(&cr->cr.cr_index);
+               __swab64s(&cr->cr.cr_prev);
+               __swab64s(&cr->cr.cr_time);
+               lustre_swab_lu_fid(&cr->cr.cr_tfid);
+               lustre_swab_lu_fid(&cr->cr.cr_pfid);
+               if (CHANGELOG_REC_EXTENDED(&cr->cr)) {
+                       struct llog_changelog_ext_rec *ext =
+                               (struct llog_changelog_ext_rec *)rec;
+
+                       lustre_swab_lu_fid(&ext->cr.cr_sfid);
+                       lustre_swab_lu_fid(&ext->cr.cr_spfid);
+                       tail = &ext->cr_tail;
+               } else {
+                       tail = &cr->cr_tail;
+               }
+               break;
+       }
+       case CHANGELOG_USER_REC:
+       {
+               struct llog_changelog_user_rec *cur =
+                       (struct llog_changelog_user_rec*)rec;
+
+               __swab32s(&cur->cur_id);
+               __swab64s(&cur->cur_endrec);
+               tail = &cur->cur_tail;
+               break;
+       }
+
+       case MDS_SETATTR64_REC:
+       {
+               struct llog_setattr64_rec *lsr =
+                       (struct llog_setattr64_rec *)rec;
+
+               lustre_swab_ost_id(&lsr->lsr_oi);
+               __swab32s(&lsr->lsr_uid);
+               __swab32s(&lsr->lsr_uid_h);
+               __swab32s(&lsr->lsr_gid);
+               __swab32s(&lsr->lsr_gid_h);
+               tail = &lsr->lsr_tail;
+               break;
+       }
+       case OBD_CFG_REC:
+               /* these are swabbed as they are consumed */
+               break;
+       case LLOG_HDR_MAGIC:
+       {
+               struct llog_log_hdr *llh = (struct llog_log_hdr *)rec;
+
+               __swab64s(&llh->llh_timestamp);
+               __swab32s(&llh->llh_count);
+               __swab32s(&llh->llh_bitmap_offset);
+               __swab32s(&llh->llh_flags);
+               __swab32s(&llh->llh_size);
+               __swab32s(&llh->llh_cat_idx);
+               tail = &llh->llh_tail;
+               break;
+       }
+       case LLOG_LOGID_MAGIC:
+       {
+               struct llog_logid_rec *lid = (struct llog_logid_rec *)rec;
+
+               lustre_swab_ost_id(&lid->lid_id.lgl_oi);
+               __swab32s(&lid->lid_id.lgl_ogen);
+               tail = &lid->lid_tail;
+               break;
+       }
+       case LLOG_GEN_REC:
+       {
+               struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec;
+
+               __swab64s(&lgr->lgr_gen.mnt_cnt);
+               __swab64s(&lgr->lgr_gen.conn_cnt);
+               tail = &lgr->lgr_tail;
+               break;
+       }
+       case LLOG_PAD_MAGIC:
+               break;
+       default:
+               CERROR("Unknown llog rec type %#x swabbing rec %p\n",
+                      rec->lrh_type, rec);
+       }
+
+       if (tail) {
+               __swab32s(&tail->lrt_len);
+               __swab32s(&tail->lrt_index);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_llog_rec);
+
+static void print_llog_hdr(struct llog_log_hdr *h)
+{
+       CDEBUG(D_OTHER, "llog header: %p\n", h);
+       CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index);
+       CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len);
+       CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type);
+       CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp);
+       CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count);
+       CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset);
+       CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags);
+       CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size);
+       CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx);
+       CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index);
+       CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len);
+}
+
+void lustre_swab_llog_hdr (struct llog_log_hdr *h)
+{
+       ENTRY;
+       print_llog_hdr(h);
+
+       lustre_swab_llog_rec(&h->llh_hdr);
+
+       print_llog_hdr(h);
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llog_hdr);
+
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+       int i;
+       ENTRY;
+
+       if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */
+               return;
+       CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg);
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command);
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid));
+
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+       if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+               for (i = 0; i < lcfg->lcfg_bufcount; i++)
+                       CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n",
+                              i, lcfg->lcfg_buflens[i]);
+       EXIT;
+}
+
+void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
+{
+       int i;
+       ENTRY;
+
+       __swab32s(&lcfg->lcfg_version);
+
+       if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) {
+               CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n",
+                      lcfg->lcfg_version, LUSTRE_CFG_VERSION);
+               EXIT;
+               return;
+       }
+
+       __swab32s(&lcfg->lcfg_command);
+       __swab32s(&lcfg->lcfg_num);
+       __swab32s(&lcfg->lcfg_flags);
+       __swab64s(&lcfg->lcfg_nid);
+       __swab32s(&lcfg->lcfg_bufcount);
+       for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++)
+               __swab32s(&lcfg->lcfg_buflens[i]);
+
+       print_lustre_cfg(lcfg);
+       EXIT;
+       return;
+}
+EXPORT_SYMBOL(lustre_swab_lustre_cfg);
+
+/* used only for compatibility with old on-disk cfg_marker data */
+struct cfg_marker32 {
+       __u32   cm_step;
+       __u32   cm_flags;
+       __u32   cm_vers;
+       __u32   padding;
+       __u32   cm_createtime;
+       __u32   cm_canceltime;
+       char    cm_tgtname[MTI_NAME_MAXLEN];
+       char    cm_comment[MTI_NAME_MAXLEN];
+};
+
+#define MTI_NAMELEN32    (MTI_NAME_MAXLEN - \
+       (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32)))
+
+void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size)
+{
+       struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker;
+       ENTRY;
+
+       if (swab) {
+               __swab32s(&marker->cm_step);
+               __swab32s(&marker->cm_flags);
+               __swab32s(&marker->cm_vers);
+       }
+       if (size == sizeof(*cm32)) {
+               __u32 createtime, canceltime;
+               /* There was a problem with the original declaration of
+                * cfg_marker on 32-bit systems because it used time_t as
+                * a wire protocol structure, and didn't verify this in
+                * wirecheck.  We now have to convert the offsets of the
+                * later fields in order to work on 32- and 64-bit systems.
+                *
+                * Fortunately, the cm_comment field has no functional use
+                * so can be sacrificed when converting the timestamp size.
+                *
+                * Overwrite fields from the end first, so they are not
+                * clobbered, and use memmove() instead of memcpy() because
+                * the source and target buffers overlap.  bug 16771 */
+               createtime = cm32->cm_createtime;
+               canceltime = cm32->cm_canceltime;
+               memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32);
+               marker->cm_comment[MTI_NAMELEN32 - 1] = '\0';
+               memmove(marker->cm_tgtname, cm32->cm_tgtname,
+                       sizeof(marker->cm_tgtname));
+               if (swab) {
+                       __swab32s(&createtime);
+                       __swab32s(&canceltime);
+               }
+               marker->cm_createtime = createtime;
+               marker->cm_canceltime = canceltime;
+               CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) "
+                      "for target %s, converting\n",
+                      marker->cm_tgtname);
+       } else if (swab) {
+               __swab64s(&marker->cm_createtime);
+               __swab64s(&marker->cm_canceltime);
+       }
+
+       EXIT;
+       return;
+}
+EXPORT_SYMBOL(lustre_swab_cfg_marker);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_test.c b/drivers/staging/lustre/lustre/obdclass/llog_test.c
new file mode 100644 (file)
index 0000000..d397f78
--- /dev/null
@@ -0,0 +1,1087 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_test.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lustre_log.h>
+
+/* This is slightly more than the number of records that can fit into a
+ * single llog file, because the llog_log_header takes up some of the
+ * space in the first block that cannot be used for the bitmap. */
+#define LLOG_TEST_RECNUM  (LLOG_CHUNK_SIZE * 8)
+
+static int llog_test_rand;
+static struct obd_uuid uuid = { .uuid = "test_uuid" };
+static struct llog_logid cat_logid;
+
+struct llog_mini_rec {
+       struct llog_rec_hdr     lmr_hdr;
+       struct llog_rec_tail    lmr_tail;
+} __attribute__((packed));
+
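+/* Sanity-check a llog handle: the number of set bits in the header bitmap
+ * and llh_count must equal num_recs, and lgh_last_idx must not be smaller
+ * than the highest set bit. */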
+static int verify_handle(char *test, struct llog_handle *llh, int num_recs)
+{
+       int i;
+       int last_idx = 0;
+       int active_recs = 0;
+
+       for (i = 0; i < LLOG_BITMAP_BYTES * 8; i++) {
+               if (ext2_test_bit(i, llh->lgh_hdr->llh_bitmap)) {
+                       last_idx = i;
+                       active_recs++;
+               }
+       }
+
+       if (active_recs != num_recs) {
+               CERROR("%s: expected %d active recs after write, found %d\n",
+                      test, num_recs, active_recs);
+               RETURN(-ERANGE);
+       }
+
+       if (llh->lgh_hdr->llh_count != num_recs) {
+               CERROR("%s: handle->count is %d, expected %d after write\n",
+                      test, llh->lgh_hdr->llh_count, num_recs);
+               RETURN(-ERANGE);
+       }
+
+       if (llh->lgh_last_idx < last_idx) {
+               CERROR("%s: handle->last_idx is %d, expected %d after write\n",
+                      test, llh->lgh_last_idx, last_idx);
+               RETURN(-ERANGE);
+       }
+
+       RETURN(0);
+}
+
+/* Test named-log create/open, close */
+static int llog_test_1(const struct lu_env *env,
+                      struct obd_device *obd, char *name)
+{
+       struct llog_handle      *llh;
+       struct llog_ctxt        *ctxt;
+       int rc;
+       int rc2;
+
+       ENTRY;
+
+       CWARN("1a: create a log with name: %s\n", name);
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       rc = llog_open_create(env, ctxt, &llh, NULL, name);
+       if (rc) {
+               CERROR("1a: llog_create with name %s failed: %d\n", name, rc);
+               GOTO(out, rc);
+       }
+       rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid);
+       if (rc) {
+               CERROR("1a: can't init llog handle: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+
+       rc = verify_handle("1", llh, 1);
+
+       CWARN("1b: close newly-created log\n");
+out_close:
+       rc2 = llog_close(env, llh);
+       if (rc2) {
+               CERROR("1b: close log %s failed: %d\n", name, rc2);
+               if (rc == 0)
+                       rc = rc2;
+       }
+out:
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+
+/* Test named-log reopen; returns opened log on success */
+static int llog_test_2(const struct lu_env *env, struct obd_device *obd,
+                      char *name, struct llog_handle **llh)
+{
+       struct llog_ctxt        *ctxt;
+       struct llog_handle      *loghandle;
+       struct llog_logid        logid;
+       int                      rc;
+
+       ENTRY;
+
+       CWARN("2a: re-open a log with name: %s\n", name);
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("2a: re-open log with name %s failed: %d\n", name, rc);
+               GOTO(out_put, rc);
+       }
+
+       rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid);
+       if (rc) {
+               CERROR("2a: can't init llog handle: %d\n", rc);
+               GOTO(out_close_llh, rc);
+       }
+
+       rc = verify_handle("2", *llh, 1);
+       if (rc)
+               GOTO(out_close_llh, rc);
+
+       /* XXX: there is a known issue with test 2b: the MGS is not able to
+        * create an anonymous llog, so exit now to allow the following tests
+        * to run.  This is fixed in the upcoming llog-over-OSD code. */
+       GOTO(out_put, rc);
+
+       CWARN("2b: create a log without specified NAME & LOGID\n");
+       rc = llog_open_create(env, ctxt, &loghandle, NULL, NULL);
+       if (rc) {
+               CERROR("2b: create log failed\n");
+               GOTO(out_close_llh, rc);
+       }
+       rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+       if (rc) {
+               CERROR("2b: can't init llog handle: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+
+       logid = loghandle->lgh_id;
+       llog_close(env, loghandle);
+
+       CWARN("2c: re-open the log by LOGID\n");
+       rc = llog_open(env, ctxt, &loghandle, &logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("2c: re-open log by LOGID failed\n");
+               GOTO(out_close_llh, rc);
+       }
+
+       rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+       if (rc) {
+               CERROR("2c: can't init llog handle: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+
+       CWARN("2b: destroy this log\n");
+       rc = llog_destroy(env, loghandle);
+       if (rc)
+               CERROR("2d: destroy log failed\n");
+out_close:
+       llog_close(env, loghandle);
+out_close_llh:
+       if (rc)
+               llog_close(env, *llh);
+out_put:
+       llog_ctxt_put(ctxt);
+
+       RETURN(rc);
+}
+
+/* Test record writing, single and in bulk */
+static int llog_test_3(const struct lu_env *env, struct obd_device *obd,
+                      struct llog_handle *llh)
+{
+       struct llog_gen_rec      lgr;
+       int                      rc, i;
+       int                      num_recs = 1; /* 1 for the header */
+
+       ENTRY;
+
+       lgr.lgr_hdr.lrh_len = lgr.lgr_tail.lrt_len = sizeof(lgr);
+       lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+       CWARN("3a: write one create_rec\n");
+       rc = llog_write(env, llh,  &lgr.lgr_hdr, NULL, 0, NULL, -1);
+       num_recs++;
+       if (rc < 0) {
+               CERROR("3a: write one log record failed: %d\n", rc);
+               RETURN(rc);
+       }
+
+       rc = verify_handle("3a", llh, num_recs);
+       if (rc)
+               RETURN(rc);
+
+       CWARN("3b: write 10 cfg log records with 8 bytes bufs\n");
+       for (i = 0; i < 10; i++) {
+               struct llog_rec_hdr     hdr;
+               char                    buf[8];
+
+               hdr.lrh_len = 8;
+               hdr.lrh_type = OBD_CFG_REC;
+               memset(buf, 0, sizeof(buf));
+               rc = llog_write(env, llh, &hdr, NULL, 0, buf, -1);
+               if (rc < 0) {
+                       CERROR("3b: write 10 records failed at #%d: %d\n",
+                              i + 1, rc);
+                       RETURN(rc);
+               }
+               num_recs++;
+       }
+
+       rc = verify_handle("3b", llh, num_recs);
+       if (rc)
+               RETURN(rc);
+
+       CWARN("3c: write 1000 more log records\n");
+       for (i = 0; i < 1000; i++) {
+               rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1);
+               if (rc < 0) {
+                       CERROR("3c: write 1000 records failed at #%d: %d\n",
+                              i + 1, rc);
+                       RETURN(rc);
+               }
+               num_recs++;
+       }
+
+       rc = verify_handle("3c", llh, num_recs);
+       if (rc)
+               RETURN(rc);
+
+       CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n");
+       for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) {
+               struct llog_rec_hdr     hdr;
+               char                    buf_even[24];
+               char                    buf_odd[32];
+
+               memset(buf_odd, 0, sizeof(buf_odd));
+               memset(buf_even, 0, sizeof(buf_even));
+               if ((i % 2) == 0) {
+                       hdr.lrh_len = 24;
+                       hdr.lrh_type = OBD_CFG_REC;
+                       rc = llog_write(env, llh, &hdr, NULL, 0, buf_even, -1);
+               } else {
+                       hdr.lrh_len = 32;
+                       hdr.lrh_type = OBD_CFG_REC;
+                       rc = llog_write(env, llh, &hdr, NULL, 0, buf_odd, -1);
+               }
+               if (rc == -ENOSPC) {
+                       break;
+               } else if (rc < 0) {
+                       CERROR("3d: write recs failed at #%d: %d\n",
+                              i + 1, rc);
+                       RETURN(rc);
+               }
+               num_recs++;
+       }
+       if (rc != -ENOSPC) {
+               CWARN("3d: write record more than BITMAP size!\n");
+               RETURN(-EINVAL);
+       }
+       CWARN("3d: wrote %d more records before end of llog is reached\n",
+             num_recs);
+
+       rc = verify_handle("3d", llh, num_recs);
+
+       RETURN(rc);
+}
+
+/* Test catalogue additions */
+static int llog_test_4(const struct lu_env *env, struct obd_device *obd)
+{
+       struct llog_handle      *cath;
+       char                     name[10];
+       int                      rc, rc2, i, buflen;
+       struct llog_mini_rec     lmr;
+       struct llog_cookie       cookie;
+       struct llog_ctxt        *ctxt;
+       int                      num_recs = 0;
+       char                    *buf;
+       struct llog_rec_hdr      rec;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+       lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+       sprintf(name, "%x", llog_test_rand + 1);
+       CWARN("4a: create a catalog log with name: %s\n", name);
+       rc = llog_open_create(env, ctxt, &cath, NULL, name);
+       if (rc) {
+               CERROR("4a: llog_create with name %s failed: %d\n", name, rc);
+               GOTO(ctxt_release, rc);
+       }
+       rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid);
+       if (rc) {
+               CERROR("4a: can't init llog handle: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       num_recs++;
+       cat_logid = cath->lgh_id;
+
+       CWARN("4b: write 1 record into the catalog\n");
+       rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie, NULL);
+       if (rc != 1) {
+               CERROR("4b: write 1 catalog record failed at: %d\n", rc);
+               GOTO(out, rc);
+       }
+       num_recs++;
+       rc = verify_handle("4b", cath, 2);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs);
+       if (rc)
+               GOTO(out, rc);
+
+       CWARN("4c: cancel 1 log record\n");
+       rc = llog_cat_cancel_records(env, cath, 1, &cookie);
+       if (rc) {
+               CERROR("4c: cancel 1 catalog based record failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+       num_recs--;
+
+       rc = verify_handle("4c", cath->u.chd.chd_current_log, num_recs);
+       if (rc)
+               GOTO(out, rc);
+
+       CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM);
+       for (i = 0; i < LLOG_TEST_RECNUM; i++) {
+               rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL, NULL);
+               if (rc) {
+                       CERROR("4d: write %d records failed at #%d: %d\n",
+                              LLOG_TEST_RECNUM, i + 1, rc);
+                       GOTO(out, rc);
+               }
+               num_recs++;
+       }
+
+       /* make sure new plain llog appears */
+       rc = verify_handle("4d", cath, 3);
+       if (rc)
+               GOTO(out, rc);
+
+       CWARN("4e: add 5 large records, one record per block\n");
+       buflen = LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+                sizeof(struct llog_rec_tail);
+       OBD_ALLOC(buf, buflen);
+       if (buf == NULL)
+               GOTO(out, rc = -ENOMEM);
+       for (i = 0; i < 5; i++) {
+               rec.lrh_len = buflen;
+               rec.lrh_type = OBD_CFG_REC;
+               rc = llog_cat_add(env, cath, &rec, NULL, buf);
+               if (rc) {
+                       CERROR("4e: write 5 records failed at #%d: %d\n",
+                              i + 1, rc);
+                       GOTO(out_free, rc);
+               }
+               num_recs++;
+       }
+out_free:
+       OBD_FREE(buf, buflen);
+out:
+       CWARN("4f: put newly-created catalog\n");
+       rc2 = llog_cat_close(env, cath);
+       if (rc2) {
+               CERROR("4: close log %s failed: %d\n", name, rc2);
+               if (rc == 0)
+                       rc = rc2;
+       }
+ctxt_release:
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+
+static int cat_counter;
+
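+/* Callback for catalog processing: verify that each record is a LOGID
+ * record and count it. */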
+static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh,
+                       struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_logid_rec   *lir = (struct llog_logid_rec *)rec;
+       struct lu_fid            fid = {0};
+
+       if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+               CERROR("invalid record in catalog\n");
+               RETURN(-EINVAL);
+       }
+
+       logid_to_fid(&lir->lid_id, &fid);
+
+       CWARN("seeing record at index %d - "DFID" in log "DFID"\n",
+             rec->lrh_index, PFID(&fid),
+             PFID(lu_object_fid(&llh->lgh_obj->do_lu)));
+
+       cat_counter++;
+
+       RETURN(0);
+}
+
+static int plain_counter;
+
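+/* Callback used while processing plain llogs: counts each record seen. */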
+static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh,
+                         struct llog_rec_hdr *rec, void *data)
+{
+       struct lu_fid fid = {0};
+
+       if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
+               CERROR("log is not plain\n");
+               RETURN(-EINVAL);
+       }
+
+       logid_to_fid(&llh->lgh_id, &fid);
+
+       CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n",
+              rec->lrh_index, PFID(&fid));
+
+       plain_counter++;
+
+       RETURN(0);
+}
+
+static int cancel_count;
+
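+/* Callback that cancels each plain llog record through its catalog; stops
+ * with -LLOG_EEMPTY once LLOG_TEST_RECNUM records have been cancelled. */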
+static int llog_cancel_rec_cb(const struct lu_env *env,
+                             struct llog_handle *llh,
+                             struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_cookie cookie;
+
+       if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
+               CERROR("log is not plain\n");
+               RETURN(-EINVAL);
+       }
+
+       cookie.lgc_lgl = llh->lgh_id;
+       cookie.lgc_index = rec->lrh_index;
+
+       llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie);
+       cancel_count++;
+       if (cancel_count == LLOG_TEST_RECNUM)
+               RETURN(-LLOG_EEMPTY);
+       RETURN(0);
+}
+
+/* Test log and catalogue processing */
+static int llog_test_5(const struct lu_env *env, struct obd_device *obd)
+{
+       struct llog_handle      *llh = NULL;
+       int                      rc, rc2;
+       struct llog_mini_rec     lmr;
+       struct llog_ctxt        *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+       lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+       CWARN("5a: re-open catalog by id\n");
+       rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("5a: llog_create with logid failed: %d\n", rc);
+               GOTO(out_put, rc);
+       }
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid);
+       if (rc) {
+               CERROR("5a: can't init llog handle: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       CWARN("5b: print the catalog entries.. we expect 2\n");
+       cat_counter = 0;
+       rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+       if (rc) {
+               CERROR("5b: process with cat_print_cb failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+       if (cat_counter != 2) {
+               CERROR("5b: %d entries in catalog\n", cat_counter);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM);
+       cancel_count = 0;
+       rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0);
+       if (rc != -LLOG_EEMPTY) {
+               CERROR("5c: process with cat_cancel_cb failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       CWARN("5c: print the catalog entries.. we expect 1\n");
+       cat_counter = 0;
+       rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+       if (rc) {
+               CERROR("5c: process with cat_print_cb failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+       if (cat_counter != 1) {
+               CERROR("5c: %d entries in catalog\n", cat_counter);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CWARN("5d: add 1 record to the log with many canceled empty pages\n");
+       rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL, NULL);
+       if (rc) {
+               CERROR("5d: add record to the log with many canceled empty "
+                      "pages failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("5e: print plain log entries.. expect 6\n");
+       plain_counter = 0;
+       rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0);
+       if (rc) {
+               CERROR("5e: process with plain_print_cb failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+       if (plain_counter != 6) {
+               CERROR("5e: found %d records\n", plain_counter);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CWARN("5f: print plain log entries reversely.. expect 6\n");
+       plain_counter = 0;
+       rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar");
+       if (rc) {
+               CERROR("5f: reversely process with plain_print_cb failed:"
+                      "%d\n", rc);
+               GOTO(out, rc);
+       }
+       if (plain_counter != 6) {
+               CERROR("5f: found %d records\n", plain_counter);
+               GOTO(out, rc = -EINVAL);
+       }
+
+out:
+       CWARN("5g: close re-opened catalog\n");
+       rc2 = llog_cat_close(env, llh);
+       if (rc2) {
+               CERROR("5g: close log %s failed: %d\n", name, rc2);
+               if (rc == 0)
+                       rc = rc2;
+       }
+out_put:
+       llog_ctxt_put(ctxt);
+
+       RETURN(rc);
+}
+
+/* Test client api; open log by name and process */
+static int llog_test_6(const struct lu_env *env, struct obd_device *obd,
+                      char *name)
+{
+       struct obd_device       *mgc_obd;
+       struct llog_ctxt        *ctxt;
+       struct obd_uuid         *mgs_uuid;
+       struct obd_export       *exp;
+       struct obd_uuid          uuid = { "LLOG_TEST6_UUID" };
+       struct llog_handle      *llh = NULL;
+       struct llog_ctxt        *nctxt;
+       int                      rc, rc2;
+
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+       mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid;
+
+       CWARN("6a: re-open log %s using client API\n", name);
+       mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL);
+       if (mgc_obd == NULL) {
+               CERROR("6a: no MGC devices connected to %s found.\n",
+                      mgs_uuid->uuid);
+               GOTO(ctxt_release, rc = -ENOENT);
+       }
+
+       rc = obd_connect(NULL, &exp, mgc_obd, &uuid,
+                        NULL /* obd_connect_data */, NULL);
+       if (rc != -EALREADY) {
+               CERROR("6a: connect on connected MGC (%s) failed to return"
+                      " -EALREADY", mgc_obd->obd_name);
+               if (rc == 0)
+                       obd_disconnect(exp);
+               GOTO(ctxt_release, rc = -EINVAL);
+       }
+
+       nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT);
+       rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("6a: llog_open failed %d\n", rc);
+               GOTO(nctxt_put, rc);
+       }
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc) {
+               CERROR("6a: llog_init_handle failed %d\n", rc);
+               GOTO(parse_out, rc);
+       }
+
+       plain_counter = 1; /* llog header is first record */
+       CWARN("6b: process log %s using client API\n", name);
+       rc = llog_process(env, llh, plain_print_cb, NULL, NULL);
+       if (rc)
+               CERROR("6b: llog_process failed %d\n", rc);
+       CWARN("6b: processed %d records\n", plain_counter);
+
+       rc = verify_handle("6b", llh, plain_counter);
+       if (rc)
+               GOTO(parse_out, rc);
+
+       plain_counter = 1; /* llog header is first record */
+       CWARN("6c: process log %s reversely using client API\n", name);
+       rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL);
+       if (rc)
+               CERROR("6c: llog_reverse_process failed %d\n", rc);
+       CWARN("6c: processed %d records\n", plain_counter);
+
+       rc = verify_handle("6c", llh, plain_counter);
+       if (rc)
+               GOTO(parse_out, rc);
+
+parse_out:
+       rc2 = llog_close(env, llh);
+       if (rc2) {
+               CERROR("6: llog_close failed: rc = %d\n", rc2);
+               if (rc == 0)
+                       rc = rc2;
+       }
+nctxt_put:
+       llog_ctxt_put(nctxt);
+ctxt_release:
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+
+static union {
+       struct llog_rec_hdr             lrh;   /* common header */
+       struct llog_logid_rec           llr;   /* LLOG_LOGID_MAGIC */
+       struct llog_unlink64_rec        lur;   /* MDS_UNLINK64_REC */
+       struct llog_setattr64_rec       lsr64; /* MDS_SETATTR64_REC */
+       struct llog_size_change_rec     lscr;  /* OST_SZ_REC */
+       struct llog_changelog_rec       lcr;   /* CHANGELOG_REC */
+       struct llog_changelog_user_rec  lcur;  /* CHANGELOG_USER_REC */
+       struct llog_gen_rec             lgr;   /* LLOG_GEN_REC */
+} llog_records;
+
+static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh,
+                          struct llog_rec_hdr *rec, void *data)
+{
+       struct lu_fid fid = {0};
+
+       logid_to_fid(&llh->lgh_id, &fid);
+
+       CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n",
+              rec->lrh_type, rec->lrh_index, PFID(&fid));
+
+       plain_counter++;
+       return 0;
+}
+
+static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh,
+                           struct llog_rec_hdr *rec, void *data)
+{
+       plain_counter++;
+       /* test LLOG_DEL_RECORD is working */
+       return LLOG_DEL_RECORD;
+}
+
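+/* Fill a plain llog with records of the type prepared in llog_records until
+ * -ENOSPC, process them forward, then cancel them all in reverse order and
+ * verify the llog gets zapped once it is empty. */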
+static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+       struct llog_handle      *llh;
+       int                      rc = 0, i, process_count;
+       int                      num_recs = 0;
+
+       ENTRY;
+
+       rc = llog_open_create(env, ctxt, &llh, NULL, NULL);
+       if (rc) {
+               CERROR("7_sub: create log failed\n");
+               RETURN(rc);
+       }
+
+       rc = llog_init_handle(env, llh,
+                             LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+                             &uuid);
+       if (rc) {
+               CERROR("7_sub: can't init llog handle: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+       for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr); i++) {
+               rc = llog_write(env, llh, &llog_records.lrh, NULL, 0,
+                               NULL, -1);
+               if (rc == -ENOSPC) {
+                       break;
+               } else if (rc < 0) {
+                       CERROR("7_sub: write recs failed at #%d: %d\n",
+                              i + 1, rc);
+                       GOTO(out_close, rc);
+               }
+               num_recs++;
+       }
+       if (rc != -ENOSPC) {
+               CWARN("7_sub: write record more than BITMAP size!\n");
+               GOTO(out_close, rc = -EINVAL);
+       }
+
+       rc = verify_handle("7_sub", llh, num_recs + 1);
+       if (rc) {
+               CERROR("7_sub: verify handle failed: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+       if (num_recs < LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1)
+               CWARN("7_sub: records are not aligned, written %d from %u\n",
+                     num_recs, LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1);
+
+       plain_counter = 0;
+       rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL);
+       if (rc) {
+               CERROR("7_sub: llog process failed: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+       process_count = plain_counter;
+       if (process_count != num_recs) {
+               CERROR("7_sub: processed %d records from %d total\n",
+                      process_count, num_recs);
+               GOTO(out_close, rc = -EINVAL);
+       }
+
+       plain_counter = 0;
+       rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL);
+       if (rc) {
+               CERROR("7_sub: reverse llog process failed: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+       if (process_count != plain_counter) {
+               CERROR("7_sub: Reverse/direct processing found different"
+                      "number of records: %d/%d\n",
+                      plain_counter, process_count);
+               GOTO(out_close, rc = -EINVAL);
+       }
+       if (llog_exist(llh)) {
+               CERROR("7_sub: llog exists but should be zapped\n");
+               GOTO(out_close, rc = -EEXIST);
+       }
+
+       rc = verify_handle("7_sub", llh, 1);
+out_close:
+       if (rc)
+               llog_destroy(env, llh);
+       llog_close(env, llh);
+       RETURN(rc);
+}
+
+/* Test all llog records writing and processing */
+static int llog_test_7(const struct lu_env *env, struct obd_device *obd)
+{
+       struct llog_ctxt        *ctxt;
+       int                      rc;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+
+       CWARN("7a: test llog_logid_rec\n");
+       llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr);
+       llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr);
+       llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7a: llog_logid_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7b: test llog_unlink64_rec\n");
+       llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur);
+       llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur);
+       llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7b: llog_unlink_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7c: test llog_setattr64_rec\n");
+       llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64);
+       llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64);
+       llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7c: llog_setattr64_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7d: test llog_size_change_rec\n");
+       llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr);
+       llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr);
+       llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7d: llog_size_change_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7e: test llog_changelog_rec\n");
+       llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr);
+       llog_records.lcr.cr_tail.lrt_len = sizeof(llog_records.lcr);
+       llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7e: llog_changelog_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7f: test llog_changelog_user_rec\n");
+       llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur);
+       llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur);
+       llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7f: llog_changelog_user_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7g: test llog_gen_rec\n");
+       llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr);
+       llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr);
+       llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7g: llog_size_change_rec test failed\n");
+               GOTO(out, rc);
+       }
+out:
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+
+/* -------------------------------------------------------------------------
+ * Tests above, boring obd functions below
+ * ------------------------------------------------------------------------- */
+static int llog_run_tests(const struct lu_env *env, struct obd_device *obd)
+{
+       struct llog_handle      *llh = NULL;
+       struct llog_ctxt        *ctxt;
+       int                      rc, err;
+       char                     name[10];
+
+       ENTRY;
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       sprintf(name, "%x", llog_test_rand);
+
+       rc = llog_test_1(env, obd, name);
+       if (rc)
+               GOTO(cleanup_ctxt, rc);
+
+       rc = llog_test_2(env, obd, name, &llh);
+       if (rc)
+               GOTO(cleanup_ctxt, rc);
+
+       rc = llog_test_3(env, obd, llh);
+       if (rc)
+               GOTO(cleanup, rc);
+
+       rc = llog_test_4(env, obd);
+       if (rc)
+               GOTO(cleanup, rc);
+
+       rc = llog_test_5(env, obd);
+       if (rc)
+               GOTO(cleanup, rc);
+
+       rc = llog_test_6(env, obd, name);
+       if (rc)
+               GOTO(cleanup, rc);
+
+       rc = llog_test_7(env, obd);
+       if (rc)
+               GOTO(cleanup, rc);
+
+cleanup:
+       err = llog_destroy(env, llh);
+       if (err)
+               CERROR("cleanup: llog_destroy failed: %d\n", err);
+       llog_close(env, llh);
+       if (rc == 0)
+               rc = err;
+cleanup_ctxt:
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+
+#ifdef LPROCFS
+static struct lprocfs_vars lprocfs_llog_test_obd_vars[] = { {0} };
+static struct lprocfs_vars lprocfs_llog_test_module_vars[] = { {0} };
+static void lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars  = lprocfs_llog_test_module_vars;
+       lvars->obd_vars     = lprocfs_llog_test_obd_vars;
+}
+#endif
+
+static int llog_test_cleanup(struct obd_device *obd)
+{
+       struct obd_device       *tgt;
+       struct lu_env            env;
+       int                      rc;
+
+       ENTRY;
+
+       rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+       if (rc)
+               RETURN(rc);
+
+       tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd;
+       rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT));
+       if (rc)
+               CERROR("failed to llog_test_llog_finish: %d\n", rc);
+       lu_env_fini(&env);
+       RETURN(rc);
+}
+
+static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct obd_device       *tgt;
+       struct llog_ctxt        *ctxt;
+       struct dt_object        *o;
+       struct lu_env            env;
+       struct lu_context        test_session;
+       int                      rc;
+
+       ENTRY;
+
+       if (lcfg->lcfg_bufcount < 2) {
+               CERROR("requires a TARGET OBD name\n");
+               RETURN(-EINVAL);
+       }
+
+       if (lcfg->lcfg_buflens[1] < 1) {
+               CERROR("requires a TARGET OBD name\n");
+               RETURN(-EINVAL);
+       }
+
+       /* disk obd */
+       tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+       if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+               CERROR("target device not attached or not set up (%s)\n",
+                      lustre_cfg_string(lcfg, 1));
+               RETURN(-EINVAL);
+       }
+
+       rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+       if (rc)
+               RETURN(rc);
+
+       rc = lu_context_init(&test_session, LCT_SESSION);
+       if (rc)
+               GOTO(cleanup_env, rc);
+       test_session.lc_thread = (struct ptlrpc_thread *)current;
+       lu_context_enter(&test_session);
+       env.le_ses = &test_session;
+
+       CWARN("Setup llog-test device over %s device\n",
+             lustre_cfg_string(lcfg, 1));
+
+       OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+       obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev);
+
+       rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt,
+                       &llog_osd_ops);
+       if (rc)
+               GOTO(cleanup_session, rc);
+
+       /* use MGS llog dir for tests */
+       ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT);
+       LASSERT(ctxt);
+       o = ctxt->loc_dir;
+       llog_ctxt_put(ctxt);
+
+       ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+       ctxt->loc_dir = o;
+       llog_ctxt_put(ctxt);
+
+       llog_test_rand = cfs_rand();
+
+       rc = llog_run_tests(&env, tgt);
+       if (rc)
+               llog_test_cleanup(obd);
+cleanup_session:
+       lu_context_exit(&test_session);
+       lu_context_fini(&test_session);
+cleanup_env:
+       lu_env_fini(&env);
+       RETURN(rc);
+}
+
+static struct obd_ops llog_obd_ops = {
+       .o_owner       = THIS_MODULE,
+       .o_setup       = llog_test_setup,
+       .o_cleanup     = llog_test_cleanup,
+};
+
+static int __init llog_test_init(void)
+{
+       struct lprocfs_static_vars lvars;
+
+       lprocfs_llog_test_init_vars(&lvars);
+       return class_register_type(&llog_obd_ops, NULL,
+                                  lvars.module_vars, "llog_test", NULL);
+}
+
+static void __exit llog_test_exit(void)
+{
+       class_unregister_type("llog_test");
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("llog test module");
+MODULE_LICENSE("GPL");
+
+module_init(llog_test_init);
+module_exit(llog_test_exit);
diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.c b/drivers/staging/lustre/lustre/obdclass/local_storage.c
new file mode 100644 (file)
index 0000000..b11ca67
--- /dev/null
@@ -0,0 +1,855 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.c
+ *
+ * Local storage for files/objects with FID generation.  Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "local_storage.h"
+
+/* all initialized local storages on this node are linked on this list */
+static LIST_HEAD(ls_list_head);
+static DEFINE_MUTEX(ls_list_mutex);
+
+static int ls_object_init(const struct lu_env *env, struct lu_object *o,
+                         const struct lu_object_conf *unused)
+{
+       struct ls_device        *ls;
+       struct lu_object        *below;
+       struct lu_device        *under;
+
+       ENTRY;
+
+       ls = container_of0(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev);
+       under = &ls->ls_osd->dd_lu_dev;
+       below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under);
+       if (below == NULL)
+               RETURN(-ENOMEM);
+
+       lu_object_add(o, below);
+
+       RETURN(0);
+}
+
+static void ls_object_free(const struct lu_env *env, struct lu_object *o)
+{
+       struct ls_object        *obj = lu2ls_obj(o);
+       struct lu_object_header *h = o->lo_header;
+
+       dt_object_fini(&obj->ls_obj);
+       lu_object_header_fini(h);
+       OBD_FREE_PTR(obj);
+}
+
+struct lu_object_operations ls_lu_obj_ops = {
+       .loo_object_init  = ls_object_init,
+       .loo_object_free  = ls_object_free,
+};
+
+struct lu_object *ls_object_alloc(const struct lu_env *env,
+                                 const struct lu_object_header *_h,
+                                 struct lu_device *d)
+{
+       struct lu_object_header *h;
+       struct ls_object        *o;
+       struct lu_object        *l;
+
+       LASSERT(_h == NULL);
+
+       OBD_ALLOC_PTR(o);
+       if (o != NULL) {
+               l = &o->ls_obj.do_lu;
+               h = &o->ls_header;
+
+               lu_object_header_init(h);
+               dt_object_init(&o->ls_obj, h, d);
+               lu_object_add_top(h, l);
+
+               l->lo_ops = &ls_lu_obj_ops;
+
+               return l;
+       } else {
+               return NULL;
+       }
+}
+
+static struct lu_device_operations ls_lu_dev_ops = {
+       .ldo_object_alloc =     ls_object_alloc
+};
+
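+/* Look up an already initialized ls_device for @dev and take a reference;
+ * the caller must hold ls_list_mutex. */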
+static struct ls_device *__ls_find_dev(struct dt_device *dev)
+{
+       struct ls_device *ls, *ret = NULL;
+
+       list_for_each_entry(ls, &ls_list_head, ls_linkage) {
+               if (ls->ls_osd == dev) {
+                       atomic_inc(&ls->ls_refcount);
+                       ret = ls;
+                       break;
+               }
+       }
+       return ret;
+}
+
+struct ls_device *ls_find_dev(struct dt_device *dev)
+{
+       struct ls_device *ls;
+
+       mutex_lock(&ls_list_mutex);
+       ls = __ls_find_dev(dev);
+       mutex_unlock(&ls_list_mutex);
+
+       return ls;
+}
+
+static struct lu_device_type_operations ls_device_type_ops = {
+       .ldto_start = NULL,
+       .ldto_stop  = NULL,
+};
+
+static struct lu_device_type ls_lu_type = {
+       .ldt_name = "local_storage",
+       .ldt_ops  = &ls_device_type_ops,
+};
+
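+/* Get the ls_device wrapper on top of @dev, creating it and adding it to
+ * the global list if it does not exist yet. */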
+struct ls_device *ls_device_get(struct dt_device *dev)
+{
+       struct ls_device *ls;
+
+       ENTRY;
+
+       mutex_lock(&ls_list_mutex);
+       ls = __ls_find_dev(dev);
+       if (ls)
+               GOTO(out_ls, ls);
+
+       /* not found, then create */
+       OBD_ALLOC_PTR(ls);
+       if (ls == NULL)
+               GOTO(out_ls, ls = ERR_PTR(-ENOMEM));
+
+       atomic_set(&ls->ls_refcount, 1);
+       INIT_LIST_HEAD(&ls->ls_los_list);
+       mutex_init(&ls->ls_los_mutex);
+
+       ls->ls_osd = dev;
+
+       LASSERT(dev->dd_lu_dev.ld_site);
+       lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type);
+       ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops;
+       ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site;
+
+       /* finally add ls to the list */
+       list_add(&ls->ls_linkage, &ls_list_head);
+out_ls:
+       mutex_unlock(&ls_list_mutex);
+       RETURN(ls);
+}
+
+void ls_device_put(const struct lu_env *env, struct ls_device *ls)
+{
+       LASSERT(env);
+       if (!atomic_dec_and_test(&ls->ls_refcount))
+               return;
+
+       mutex_lock(&ls_list_mutex);
+       if (atomic_read(&ls->ls_refcount) == 0) {
+               LASSERT(list_empty(&ls->ls_los_list));
+               list_del(&ls->ls_linkage);
+               lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0);
+               lu_device_fini(&ls->ls_top_dev.dd_lu_dev);
+               OBD_FREE_PTR(ls);
+       }
+       mutex_unlock(&ls_list_mutex);
+}
+
+/**
+ * local file fid generation
+ */
+int local_object_fid_generate(const struct lu_env *env,
+                             struct local_oid_storage *los,
+                             struct lu_fid *fid)
+{
+       LASSERT(los->los_dev);
+       LASSERT(los->los_obj);
+
+       /* take next OID */
+
+       /* to make it unique across reboots we store the latest
+        * generated fid atomically with object creation;
+        * see local_object_create() */
+
+       mutex_lock(&los->los_id_lock);
+       fid->f_seq = los->los_seq;
+       fid->f_oid = los->los_last_oid++;
+       fid->f_ver = 0;
+       mutex_unlock(&los->los_id_lock);
+
+       return 0;
+}
+
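+/* Declare object creation together with the update of the FID-generation
+ * file (when @los is given) and the initial LMA xattr. */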
+int local_object_declare_create(const struct lu_env *env,
+                               struct local_oid_storage *los,
+                               struct dt_object *o, struct lu_attr *attr,
+                               struct dt_object_format *dof,
+                               struct thandle *th)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       int                      rc;
+
+       ENTRY;
+
+       /* update fid generation file */
+       if (los != NULL) {
+               LASSERT(dt_object_exists(los->los_obj));
+               rc = dt_declare_record_write(env, los->los_obj,
+                                            sizeof(struct los_ondisk), 0, th);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       rc = dt_declare_create(env, o, attr, NULL, dof, th);
+       if (rc)
+               RETURN(rc);
+
+       dti->dti_lb.lb_buf = NULL;
+       dti->dti_lb.lb_len = sizeof(dti->dti_lma);
+       rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th);
+
+       RETURN(rc);
+}
+
+int local_object_create(const struct lu_env *env,
+                       struct local_oid_storage *los,
+                       struct dt_object *o, struct lu_attr *attr,
+                       struct dt_object_format *dof, struct thandle *th)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct los_ondisk        losd;
+       int                      rc;
+
+       ENTRY;
+
+       rc = dt_create(env, o, attr, NULL, dof, th);
+       if (rc)
+               RETURN(rc);
+
+       if (los == NULL)
+               RETURN(rc);
+
+       LASSERT(los->los_obj);
+       LASSERT(dt_object_exists(los->los_obj));
+
+       /* many threads can update this concurrently, so serialize
+        * them here to avoid the race where one thread
+        * takes the value first but writes it last */
+       mutex_lock(&los->los_id_lock);
+
+       /* update local oid number on disk so that
+        * we know the last one used after reboot */
+       losd.lso_magic = cpu_to_le32(LOS_MAGIC);
+       losd.lso_next_oid = cpu_to_le32(los->los_last_oid);
+
+       dti->dti_off = 0;
+       dti->dti_lb.lb_buf = &losd;
+       dti->dti_lb.lb_len = sizeof(losd);
+       rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off,
+                            th);
+       mutex_unlock(&los->los_id_lock);
+
+       RETURN(rc);
+}
+
+/*
+ * Create local named object (file, directory or index) in parent directory.
+ */
+struct dt_object *__local_file_create(const struct lu_env *env,
+                                     const struct lu_fid *fid,
+                                     struct local_oid_storage *los,
+                                     struct ls_device *ls,
+                                     struct dt_object *parent,
+                                     const char *name, struct lu_attr *attr,
+                                     struct dt_object_format *dof)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       struct thandle          *th;
+       int                      rc;
+
+       dto = ls_locate(env, ls, fid);
+       if (unlikely(IS_ERR(dto)))
+               RETURN(dto);
+
+       LASSERT(dto != NULL);
+       if (dt_object_exists(dto))
+               GOTO(out, rc = -EEXIST);
+
+       th = dt_trans_create(env, ls->ls_osd);
+       if (IS_ERR(th))
+               GOTO(out, rc = PTR_ERR(th));
+
+       rc = local_object_declare_create(env, los, dto, attr, dof, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       if (dti->dti_dof.dof_type == DFT_DIR) {
+               dt_declare_ref_add(env, dto, th);
+               dt_declare_ref_add(env, parent, th);
+       }
+
+       rc = dt_declare_insert(env, parent, (void *)fid, (void *)name, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       rc = dt_trans_start_local(env, ls->ls_osd, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       dt_write_lock(env, dto, 0);
+       if (dt_object_exists(dto))
+               GOTO(unlock, rc = 0);
+
+       CDEBUG(D_OTHER, "create new object "DFID"\n",
+              PFID(lu_object_fid(&dto->do_lu)));
+       rc = local_object_create(env, los, dto, attr, dof, th);
+       if (rc)
+               GOTO(unlock, rc);
+       LASSERT(dt_object_exists(dto));
+
+       if (dti->dti_dof.dof_type == DFT_DIR) {
+               if (!dt_try_as_dir(env, dto))
+                       GOTO(destroy, rc = -ENOTDIR);
+               /* Add "." and ".." for newly created dir */
+               rc = dt_insert(env, dto, (void *)fid, (void *)".", th,
+                              BYPASS_CAPA, 1);
+               if (rc)
+                       GOTO(destroy, rc);
+               dt_ref_add(env, dto, th);
+               rc = dt_insert(env, dto, (void *)lu_object_fid(&parent->do_lu),
+                              (void *)"..", th, BYPASS_CAPA, 1);
+               if (rc)
+                       GOTO(destroy, rc);
+       }
+
+       dt_write_lock(env, parent, 0);
+       rc = dt_insert(env, parent, (const struct dt_rec *)fid,
+                      (const struct dt_key *)name, th, BYPASS_CAPA, 1);
+       if (dti->dti_dof.dof_type == DFT_DIR)
+               dt_ref_add(env, parent, th);
+       dt_write_unlock(env, parent);
+       if (rc)
+               GOTO(destroy, rc);
+destroy:
+       if (rc)
+               dt_destroy(env, dto, th);
+unlock:
+       dt_write_unlock(env, dto);
+trans_stop:
+       dt_trans_stop(env, ls->ls_osd, th);
+out:
+       if (rc) {
+               lu_object_put_nocache(env, &dto->do_lu);
+               dto = ERR_PTR(rc);
+       }
+       RETURN(dto);
+}
+
+/*
+ * Look up and create (if it does not exist) a local named file or directory in
+ * parent directory.
+ */
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+                                           struct local_oid_storage *los,
+                                           struct dt_object *parent,
+                                           const char *name, __u32 mode)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       int                      rc;
+
+       LASSERT(parent);
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == 0)
+               /* name is found, get the object */
+               dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid);
+       else if (rc != -ENOENT)
+               dto = ERR_PTR(rc);
+       else {
+               rc = local_object_fid_generate(env, los, &dti->dti_fid);
+               if (rc < 0) {
+                       dto = ERR_PTR(rc);
+               } else {
+                       /* create the object */
+                       dti->dti_attr.la_valid  = LA_MODE;
+                       dti->dti_attr.la_mode   = mode;
+                       dti->dti_dof.dof_type   = dt_mode_to_dft(mode & S_IFMT);
+                       dto = __local_file_create(env, &dti->dti_fid, los,
+                                                 dt2ls_dev(los->los_dev),
+                                                 parent, name, &dti->dti_attr,
+                                                 &dti->dti_dof);
+               }
+       }
+       return dto;
+}
+EXPORT_SYMBOL(local_file_find_or_create);
+
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+                                                    struct dt_device *dt,
+                                                    const struct lu_fid *fid,
+                                                    struct dt_object *parent,
+                                                    const char *name,
+                                                    __u32 mode)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       int                      rc;
+
+       LASSERT(parent);
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == 0) {
+               dto = dt_locate(env, dt, &dti->dti_fid);
+       } else if (rc != -ENOENT) {
+               dto = ERR_PTR(rc);
+       } else {
+               struct ls_device *ls;
+
+               ls = ls_device_get(dt);
+               if (IS_ERR(ls)) {
+                       dto = ERR_CAST(ls);
+               } else {
+                       /* create the object */
+                       dti->dti_attr.la_valid  = LA_MODE;
+                       dti->dti_attr.la_mode   = mode;
+                       dti->dti_dof.dof_type   = dt_mode_to_dft(mode & S_IFMT);
+                       dto = __local_file_create(env, fid, NULL, ls, parent,
+                                                 name, &dti->dti_attr,
+                                                 &dti->dti_dof);
+                       /* ls_device_put() will finalize the ls device, so we
+                        * have to re-open the object in the other device stack */
+                       if (!IS_ERR(dto)) {
+                               dti->dti_fid = dto->do_lu.lo_header->loh_fid;
+                               lu_object_put_nocache(env, &dto->do_lu);
+                               dto = dt_locate(env, dt, &dti->dti_fid);
+                       }
+                       ls_device_put(env, ls);
+               }
+       }
+       return dto;
+}
+EXPORT_SYMBOL(local_file_find_or_create_with_fid);
+
+/*
+ * Look up and create (if it does not exist) a local named index file in parent
+ * directory.
+ */
+struct dt_object *local_index_find_or_create(const struct lu_env *env,
+                                            struct local_oid_storage *los,
+                                            struct dt_object *parent,
+                                            const char *name, __u32 mode,
+                                            const struct dt_index_features *ft)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       int                      rc;
+
+       LASSERT(parent);
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == 0) {
+               /* name is found, get the object */
+               dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid);
+       } else if (rc != -ENOENT) {
+               dto = ERR_PTR(rc);
+       } else {
+               rc = local_object_fid_generate(env, los, &dti->dti_fid);
+               if (rc < 0) {
+                       dto = ERR_PTR(rc);
+               } else {
+                       /* create the object */
+                       dti->dti_attr.la_valid          = LA_MODE;
+                       dti->dti_attr.la_mode           = mode;
+                       dti->dti_dof.dof_type           = DFT_INDEX;
+                       dti->dti_dof.u.dof_idx.di_feat  = ft;
+                       dto = __local_file_create(env, &dti->dti_fid, los,
+                                                 dt2ls_dev(los->los_dev),
+                                                 parent, name, &dti->dti_attr,
+                                                 &dti->dti_dof);
+               }
+       }
+       return dto;
+
+}
+EXPORT_SYMBOL(local_index_find_or_create);
+
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+                                   struct dt_device *dt,
+                                   const struct lu_fid *fid,
+                                   struct dt_object *parent,
+                                   const char *name, __u32 mode,
+                                   const struct dt_index_features *ft)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       int                      rc;
+
+       LASSERT(parent);
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == 0) {
+               /* name is found, get the object */
+               if (!lu_fid_eq(fid, &dti->dti_fid))
+                       dto = ERR_PTR(-EINVAL);
+               else
+                       dto = dt_locate(env, dt, fid);
+       } else if (rc != -ENOENT) {
+               dto = ERR_PTR(rc);
+       } else {
+               struct ls_device *ls;
+
+               ls = ls_device_get(dt);
+               if (IS_ERR(ls)) {
+                       dto = ERR_CAST(ls);
+               } else {
+                       /* create the object */
+                       dti->dti_attr.la_valid          = LA_MODE;
+                       dti->dti_attr.la_mode           = mode;
+                       dti->dti_dof.dof_type           = DFT_INDEX;
+                       dti->dti_dof.u.dof_idx.di_feat  = ft;
+                       dto = __local_file_create(env, fid, NULL, ls, parent,
+                                                 name, &dti->dti_attr,
+                                                 &dti->dti_dof);
+                       /* ls_device_put() will finalize the ls device, so we
+                        * have to re-open the object in the other device stack */
+                       if (!IS_ERR(dto)) {
+                               dti->dti_fid = dto->do_lu.lo_header->loh_fid;
+                               lu_object_put_nocache(env, &dto->do_lu);
+                               dto = dt_locate(env, dt, &dti->dti_fid);
+                       }
+                       ls_device_put(env, ls);
+               }
+       }
+       return dto;
+}
+EXPORT_SYMBOL(local_index_find_or_create_with_fid);
+
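+/* Declare the operations needed to unlink a local object: remove the name
+ * from the parent index, drop the object's reference and destroy it. */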
+static int local_object_declare_unlink(const struct lu_env *env,
+                                      struct dt_device *dt,
+                                      struct dt_object *p,
+                                      struct dt_object *c, const char *name,
+                                      struct thandle *th)
+{
+       int rc;
+
+       rc = dt_declare_delete(env, p, (const struct dt_key *)name, th);
+       if (rc < 0)
+               return rc;
+
+       rc = dt_declare_ref_del(env, c, th);
+       if (rc < 0)
+               return rc;
+
+       return dt_declare_destroy(env, c, th);
+}
+
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+                       struct dt_object *parent, const char *name)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       struct thandle          *th;
+       int                      rc;
+
+       ENTRY;
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == -ENOENT)
+               RETURN(0);
+       else if (rc < 0)
+               RETURN(rc);
+
+       dto = dt_locate(env, dt, &dti->dti_fid);
+       if (unlikely(IS_ERR(dto)))
+               RETURN(PTR_ERR(dto));
+
+       th = dt_trans_create(env, dt);
+       if (IS_ERR(th))
+               GOTO(out, rc = PTR_ERR(th));
+
+       rc = local_object_declare_unlink(env, dt, parent, dto, name, th);
+       if (rc < 0)
+               GOTO(stop, rc);
+
+       rc = dt_trans_start_local(env, dt, th);
+       if (rc < 0)
+               GOTO(stop, rc);
+
+       dt_write_lock(env, dto, 0);
+       rc = dt_delete(env, parent, (struct dt_key *)name, th, BYPASS_CAPA);
+       if (rc < 0)
+               GOTO(unlock, rc);
+
+       rc = dt_ref_del(env, dto, th);
+       if (rc < 0) {
+               rc = dt_insert(env, parent,
+                              (const struct dt_rec *)&dti->dti_fid,
+                              (const struct dt_key *)name, th, BYPASS_CAPA, 1);
+               GOTO(unlock, rc);
+       }
+
+       rc = dt_destroy(env, dto, th);
+unlock:
+       dt_write_unlock(env, dto);
+stop:
+       dt_trans_stop(env, dt, th);
+out:
+       lu_object_put_nocache(env, &dto->do_lu);
+       return rc;
+}
+EXPORT_SYMBOL(local_object_unlink);
+
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq)
+{
+       struct local_oid_storage *los, *ret = NULL;
+
+       list_for_each_entry(los, &ls->ls_los_list, los_list) {
+               if (los->los_seq == seq) {
+                       atomic_inc(&los->los_refcount);
+                       ret = los;
+                       break;
+               }
+       }
+       return ret;
+}
+
+void dt_los_put(struct local_oid_storage *los)
+{
+       if (atomic_dec_and_test(&los->los_refcount))
+               /* should never happen, only local_oid_storage_fini should
+                * drop refcount to zero */
+               LBUG();
+       return;
+}
+
+/**
+ * Initialize local OID storage for the required sequence.
+ * This may be needed by services that use local files and require
+ * dynamic OID allocation for them.
+ *
+ * For each sequence we have an object identified by 'first_fid' that
+ * holds the counter for OIDs of locally created files with that
+ * sequence.
+ *
+ * It is currently used by the llog subsystem and by the MGS for NID tables.
+ *
+ * The function takes first_fid to create the counter object.
+ * All dynamic FIDs will be generated with the same sequence and
+ * incremented OIDs.
+ *
+ * The returned local_oid_storage is the in-memory representation of
+ * the OID storage.
+ */
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+                          const struct lu_fid *first_fid,
+                          struct local_oid_storage **los)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct ls_device        *ls;
+       struct los_ondisk        losd;
+       struct dt_object        *root = NULL;
+       struct dt_object        *o = NULL;
+       struct thandle          *th;
+       int                      rc;
+
+       ENTRY;
+
+       ls = ls_device_get(dev);
+       if (IS_ERR(ls))
+               RETURN(PTR_ERR(ls));
+
+       mutex_lock(&ls->ls_los_mutex);
+       *los = dt_los_find(ls, fid_seq(first_fid));
+       if (*los != NULL)
+               GOTO(out, rc = 0);
+
+       /* not found, then create */
+       OBD_ALLOC_PTR(*los);
+       if (*los == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       atomic_set(&(*los)->los_refcount, 1);
+       mutex_init(&(*los)->los_id_lock);
+       (*los)->los_dev = &ls->ls_top_dev;
+       atomic_inc(&ls->ls_refcount);
+       list_add(&(*los)->los_list, &ls->ls_los_list);
+
+       rc = dt_root_get(env, dev, &dti->dti_fid);
+       if (rc)
+               GOTO(out_los, rc);
+
+       root = ls_locate(env, ls, &dti->dti_fid);
+       if (IS_ERR(root))
+               GOTO(out_los, rc = PTR_ERR(root));
+
+       /* initialize the data that allows generating new FIDs;
+        * essentially we need a per-sequence counter */
+       snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-%Lx-lastid",
+                fid_seq(first_fid));
+       rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid);
+       if (rc == -ENOENT)
+               dti->dti_fid = *first_fid;
+       else if (rc < 0)
+               GOTO(out_los, rc);
+
+       o = ls_locate(env, ls, &dti->dti_fid);
+       if (IS_ERR(o))
+               GOTO(out_los, rc = PTR_ERR(o));
+       LASSERT(fid_seq(&dti->dti_fid) == fid_seq(first_fid));
+       if (!dt_object_exists(o)) {
+               LASSERT(rc == -ENOENT);
+
+               th = dt_trans_create(env, dev);
+               if (IS_ERR(th))
+                       GOTO(out_los, rc = PTR_ERR(th));
+
+               dti->dti_attr.la_valid = LA_MODE | LA_TYPE;
+               dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+               dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+               rc = dt_declare_create(env, o, &dti->dti_attr, NULL,
+                                      &dti->dti_dof, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_declare_insert(env, root,
+                                      (const struct dt_rec *)&dti->dti_fid,
+                                      (const struct dt_key *)dti->dti_buf,
+                                      th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_declare_record_write(env, o, sizeof(losd), 0, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_trans_start_local(env, dev, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               dt_write_lock(env, root, 0);
+               dt_write_lock(env, o, 0);
+               if (dt_object_exists(o))
+                       GOTO(out_lock, rc = 0);
+
+               rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof,
+                              th);
+               if (rc)
+                       GOTO(out_lock, rc);
+
+               losd.lso_magic = cpu_to_le32(LOS_MAGIC);
+               losd.lso_next_oid = cpu_to_le32(fid_oid(first_fid) + 1);
+
+               dti->dti_off = 0;
+               dti->dti_lb.lb_buf = &losd;
+               dti->dti_lb.lb_len = sizeof(losd);
+               rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th);
+               if (rc)
+                       GOTO(out_lock, rc);
+               rc = dt_insert(env, root,
+                              (const struct dt_rec *)&dti->dti_fid,
+                              (const struct dt_key *)dti->dti_buf,
+                              th, BYPASS_CAPA, 1);
+               if (rc)
+                       GOTO(out_lock, rc);
+out_lock:
+               dt_write_unlock(env, o);
+               dt_write_unlock(env, root);
+out_trans:
+               dt_trans_stop(env, dev, th);
+       } else {
+               dti->dti_off = 0;
+               dti->dti_lb.lb_buf = &losd;
+               dti->dti_lb.lb_len = sizeof(losd);
+               dt_read_lock(env, o, 0);
+               rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+               dt_read_unlock(env, o);
+               if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) {
+                       CERROR("local storage file "DFID" is corrupted\n",
+                              PFID(first_fid));
+                       rc = -EINVAL;
+               }
+       }
+out_los:
+       if (root != NULL && !IS_ERR(root))
+               lu_object_put_nocache(env, &root->do_lu);
+
+       if (rc != 0) {
+               list_del(&(*los)->los_list);
+               atomic_dec(&ls->ls_refcount);
+               OBD_FREE_PTR(*los);
+               *los = NULL;
+               if (o != NULL && !IS_ERR(o))
+                       lu_object_put_nocache(env, &o->do_lu);
+       } else {
+               (*los)->los_seq = fid_seq(first_fid);
+               (*los)->los_last_oid = le32_to_cpu(losd.lso_next_oid);
+               (*los)->los_obj = o;
+       }
+out:
+       mutex_unlock(&ls->ls_los_mutex);
+       ls_device_put(env, ls);
+       return rc;
+}
+EXPORT_SYMBOL(local_oid_storage_init);
+
+void local_oid_storage_fini(const struct lu_env *env,
+                           struct local_oid_storage *los)
+{
+       struct ls_device *ls;
+
+       if (!atomic_dec_and_test(&los->los_refcount))
+               return;
+
+       LASSERT(env);
+       LASSERT(los->los_dev);
+       ls = dt2ls_dev(los->los_dev);
+
+       mutex_lock(&ls->ls_los_mutex);
+       if (atomic_read(&los->los_refcount) == 0) {
+               if (los->los_obj)
+                       lu_object_put_nocache(env, &los->los_obj->do_lu);
+               list_del(&los->los_list);
+               OBD_FREE_PTR(los);
+       }
+       mutex_unlock(&ls->ls_los_mutex);
+       ls_device_put(env, ls);
+}
+EXPORT_SYMBOL(local_oid_storage_fini);
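
A minimal usage sketch of the OID-storage API added above (not part of the patch): the sequence value, the helper names and the lu_fid field assignments are illustrative assumptions, not taken from this commit.

/* Hypothetical caller, for illustration only: pair local_oid_storage_init()
 * with local_oid_storage_fini() around the lifetime of a service that
 * creates local files with dynamically allocated OIDs. */
static int example_local_storage_setup(const struct lu_env *env,
				       struct dt_device *dev,
				       struct local_oid_storage **los)
{
	struct lu_fid first_fid;

	/* made-up sequence; the counter object itself gets OID 1 */
	first_fid.f_seq = 0x200000123ULL;
	first_fid.f_oid = 1;
	first_fid.f_ver = 0;

	/* creates (or reads back) the "seq-%Lx-lastid" counter object */
	return local_oid_storage_init(env, dev, &first_fid, los);
}

static void example_local_storage_cleanup(const struct lu_env *env,
					  struct local_oid_storage *los)
{
	/* drops the reference taken in local_oid_storage_init() */
	local_oid_storage_fini(env, los);
}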
diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.h b/drivers/staging/lustre/lustre/obdclass/local_storage.h
new file mode 100644 (file)
index 0000000..7c5c0bc
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.h
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#include <dt_object.h>
+#include <obd.h>
+#include <lustre_fid.h>
+#include <lustre_disk.h>
+
+struct ls_device {
+       struct dt_device         ls_top_dev;
+       /* all initialized ls_devices on this node linked by this */
+       struct list_head                 ls_linkage;
+       /* how many handles reference this local storage */
+       atomic_t                 ls_refcount;
+       /* underlying OSD device */
+       struct dt_device        *ls_osd;
+       /* list of all local OID storages */
+       struct list_head                 ls_los_list;
+       struct mutex             ls_los_mutex;
+};
+
+static inline struct ls_device *dt2ls_dev(struct dt_device *d)
+{
+       return container_of0(d, struct ls_device, ls_top_dev);
+}
+
+struct ls_object {
+       struct lu_object_header  ls_header;
+       struct dt_object         ls_obj;
+};
+
+static inline struct ls_object *lu2ls_obj(struct lu_object *o)
+{
+       return container_of0(o, struct ls_object, ls_obj.do_lu);
+}
+
+static inline struct dt_object *ls_locate(const struct lu_env *env,
+                                         struct ls_device *ls,
+                                         const struct lu_fid *fid)
+{
+       return dt_locate_at(env, ls->ls_osd, fid, &ls->ls_top_dev.dd_lu_dev);
+}
+
+struct ls_device *ls_device_get(struct dt_device *dev);
+void ls_device_put(const struct lu_env *env, struct ls_device *ls);
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq);
+void dt_los_put(struct local_oid_storage *los);
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c
new file mode 100644 (file)
index 0000000..7afc2ad
--- /dev/null
@@ -0,0 +1,575 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Niu Yawei <niu@whamcloud.com>
+ */
+/*
+ * lustre/obdclass/lprocfs_jobstats.c
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+#if defined(LPROCFS)
+
+/*
+ * JobID formats & JobID environment variable names for supported
+ * job schedulers:
+ *
+ * SLURM:
+ *   JobID format:  32 bit integer.
+ *   JobID env var: SLURM_JOB_ID.
+ * SGE:
+ *   JobID format:  Decimal integer up to 99999.
+ *   JobID env var: JOB_ID.
+ * LSF:
+ *   JobID format:  6-digit integer by default (up to 999999), can be
+ *               increased to 10 digits (up to 2147483646).
+ *   JobID env var: LSB_JOBID.
+ * Loadleveler:
+ *   JobID format:  String of machine_name.cluster_id.process_id, for
+ *               example: fr2n02.32.0
+ *   JobID env var: LOADL_STEP_ID.
+ * PBS:
+ *   JobID format:  String of sequence_number[.server_name][@server].
+ *   JobID env var: PBS_JOBID.
+ * Maui/MOAB:
+ *   JobID format:  Same as PBS.
+ *   JobID env var: Same as PBS.
+ */
+
+struct job_stat {
+       struct hlist_node      js_hash;
+       struct list_head            js_list;
+       atomic_t          js_refcount;
+       char              js_jobid[JOBSTATS_JOBID_SIZE];
+       time_t          js_timestamp; /* seconds */
+       struct lprocfs_stats *js_stats;
+       struct obd_job_stats *js_jobstats;
+};
+
+static unsigned job_stat_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_djb2_hash(key, strlen(key), mask);
+}
+
+static void *job_stat_key(struct hlist_node *hnode)
+{
+       struct job_stat *job;
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       return job->js_jobid;
+}
+
+static int job_stat_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct job_stat *job;
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       return (strlen(job->js_jobid) == strlen(key)) &&
+              !strncmp(job->js_jobid, key, strlen(key));
+}
+
+static void *job_stat_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct job_stat, js_hash);
+}
+
+static void job_stat_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct job_stat *job;
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       atomic_inc(&job->js_refcount);
+}
+
+static void job_free(struct job_stat *job)
+{
+       LASSERT(atomic_read(&job->js_refcount) == 0);
+       LASSERT(job->js_jobstats);
+
+       write_lock(&job->js_jobstats->ojs_lock);
+       list_del_init(&job->js_list);
+       write_unlock(&job->js_jobstats->ojs_lock);
+
+       lprocfs_free_stats(&job->js_stats);
+       OBD_FREE_PTR(job);
+}
+
+static void job_putref(struct job_stat *job)
+{
+       LASSERT(atomic_read(&job->js_refcount) > 0);
+       if (atomic_dec_and_test(&job->js_refcount))
+               job_free(job);
+}
+
+static void job_stat_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct job_stat *job;
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       job_putref(job);
+}
+
+static void job_stat_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       CERROR("Should not have any items!\n");
+}
+
+static cfs_hash_ops_t job_stats_hash_ops = {
+       .hs_hash       = job_stat_hash,
+       .hs_key = job_stat_key,
+       .hs_keycmp     = job_stat_keycmp,
+       .hs_object     = job_stat_object,
+       .hs_get = job_stat_get,
+       .hs_put_locked = job_stat_put_locked,
+       .hs_exit       = job_stat_exit,
+};
+
+static int job_iter_callback(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                            struct hlist_node *hnode, void *data)
+{
+       time_t oldest = *((time_t *)data);
+       struct job_stat *job;
+
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       if (!oldest || job->js_timestamp < oldest)
+               cfs_hash_bd_del_locked(hs, bd, hnode);
+
+       return 0;
+}
+
+static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force)
+{
+       time_t oldest, now;
+
+       if (stats->ojs_cleanup_interval == 0)
+               return;
+
+       now = cfs_time_current_sec();
+       if (!force && now < stats->ojs_last_cleanup +
+                           stats->ojs_cleanup_interval)
+               return;
+
+       oldest = now - stats->ojs_cleanup_interval;
+       cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+                              &oldest);
+       stats->ojs_last_cleanup = cfs_time_current_sec();
+}
+
+static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs)
+{
+       struct job_stat *job;
+
+       LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn);
+
+       OBD_ALLOC_PTR(job);
+       if (job == NULL)
+               return NULL;
+
+       job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0);
+       if (job->js_stats == NULL) {
+               OBD_FREE_PTR(job);
+               return NULL;
+       }
+
+       jobs->ojs_cntr_init_fn(job->js_stats);
+
+       memcpy(job->js_jobid, jobid, JOBSTATS_JOBID_SIZE);
+       job->js_timestamp = cfs_time_current_sec();
+       job->js_jobstats = jobs;
+       INIT_HLIST_NODE(&job->js_hash);
+       INIT_LIST_HEAD(&job->js_list);
+       atomic_set(&job->js_refcount, 1);
+
+       return job;
+}
+
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+                         int event, long amount)
+{
+       struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+       struct job_stat *job, *job2;
+       ENTRY;
+
+       LASSERT(stats && stats->ojs_hash);
+
+       lprocfs_job_cleanup(stats, false);
+
+       if (!jobid || !strlen(jobid))
+               RETURN(-EINVAL);
+
+       if (strlen(jobid) >= JOBSTATS_JOBID_SIZE) {
+               CERROR("Invalid jobid size (%lu), expect(%d)\n",
+                      (unsigned long)strlen(jobid) + 1, JOBSTATS_JOBID_SIZE);
+               RETURN(-EINVAL);
+       }
+
+       job = cfs_hash_lookup(stats->ojs_hash, jobid);
+       if (job)
+               goto found;
+
+       job = job_alloc(jobid, stats);
+       if (job == NULL)
+               RETURN(-ENOMEM);
+
+       job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid,
+                                      &job->js_hash);
+       if (job2 != job) {
+               job_putref(job);
+               job = job2;
+               /* We cannot LASSERT(!list_empty(&job->js_list)) here,
+                * since we just lost the race for inserting "job" into the
+                * ojs_list, and some other thread is doing it _right_now_.
+                * Instead, be content the other thread is doing this, since
+                * "job2" was initialized in job_alloc() already. LU-2163 */
+       } else {
+               LASSERT(list_empty(&job->js_list));
+               write_lock(&stats->ojs_lock);
+               list_add_tail(&job->js_list, &stats->ojs_list);
+               write_unlock(&stats->ojs_lock);
+       }
+
+found:
+       LASSERT(stats == job->js_jobstats);
+       LASSERT(stats->ojs_cntr_num > event);
+       job->js_timestamp = cfs_time_current_sec();
+       lprocfs_counter_add(job->js_stats, event, amount);
+
+       job_putref(job);
+       RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_log);
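
A hedged sketch of how a server-side request handler might feed the accounting hook above (illustrative only; the event index, the debug mask and the way the jobid reaches the handler are assumptions, not part of this commit).

/* Hypothetical caller, for illustration only.  "event" must be one of the
 * counter indexes set up by the cntr_init_callback that was passed to
 * lprocfs_job_stats_init(); 0 is just a placeholder here. */
static void example_account_request(struct obd_device *obd,
				    char *jobid, long nbytes)
{
	const int example_event = 0;	/* placeholder counter index */

	if (lprocfs_job_stats_log(obd, jobid, example_event, nbytes) < 0)
		CDEBUG(D_OTHER, "jobstats accounting skipped for %s\n",
		       obd->obd_name);
}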
+
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{
+       struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+       time_t oldest = 0;
+
+       if (stats->ojs_hash == NULL)
+               return;
+       cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, &oldest);
+       cfs_hash_putref(stats->ojs_hash);
+       stats->ojs_hash = NULL;
+       LASSERT(list_empty(&stats->ojs_list));
+}
+EXPORT_SYMBOL(lprocfs_job_stats_fini);
+
+static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct obd_job_stats *stats = p->private;
+       loff_t off = *pos;
+       struct job_stat *job;
+
+       read_lock(&stats->ojs_lock);
+       if (off == 0)
+               return SEQ_START_TOKEN;
+       off--;
+       list_for_each_entry(job, &stats->ojs_list, js_list) {
+               if (!off--)
+                       return job;
+       }
+       return NULL;
+}
+
+static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v)
+{
+       struct obd_job_stats *stats = p->private;
+
+       read_unlock(&stats->ojs_lock);
+}
+
+static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct obd_job_stats *stats = p->private;
+       struct job_stat *job;
+       struct list_head *next;
+
+       ++*pos;
+       if (v == SEQ_START_TOKEN) {
+               next = stats->ojs_list.next;
+       } else {
+               job = (struct job_stat *)v;
+               next = job->js_list.next;
+       }
+
+       return next == &stats->ojs_list ? NULL :
+               list_entry(next, struct job_stat, js_list);
+}
+
+/*
+ * Example of output on MDT:
+ *
+ * job_stats:
+ * - job_id:   test_id.222.25844
+ *   snapshot_time: 1322494486
+ *   open:       { samples:           3, unit: reqs }
+ *   close:     { samples:            3, unit: reqs }
+ *   mknod:     { samples:            0, unit: reqs }
+ *   link:       { samples:           0, unit: reqs }
+ *   unlink:   { samples:             0, unit: reqs }
+ *   mkdir:     { samples:            0, unit: reqs }
+ *   rmdir:     { samples:            0, unit: reqs }
+ *   rename:   { samples:             1, unit: reqs }
+ *   getattr:       { samples:        7, unit: reqs }
+ *   setattr:       { samples:        0, unit: reqs }
+ *   getxattr:      { samples:        0, unit: reqs }
+ *   setxattr:      { samples:        0, unit: reqs }
+ *   statfs:   { samples:             0, unit: reqs }
+ *   sync:       { samples:           0, unit: reqs }
+ *
+ * Example of output on OST:
+ *
+ * job_stats:
+ * - job_id     4854
+ *   snapshot_time: 1322494602
+ *   read:       { samples:  0, unit: bytes, min:  0, max:  0, sum:  0 }
+ *   write:     { samples:  1, unit: bytes, min: 10, max: 10, sum: 10 }
+ *   setattr:       { samples:  0, unit: reqs }
+ *   punch:     { samples:  0, unit: reqs }
+ *   sync:       { samples:  0, unit: reqs }
+ */
+
+static const char spaces[] = "             ";
+
+static inline int width(const char *str, int len)
+{
+       return len - min((int)strlen(str), 15);
+}
+
+static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v)
+{
+       struct job_stat                 *job = v;
+       struct lprocfs_stats            *s;
+       struct lprocfs_counter          ret;
+       struct lprocfs_counter          *cntr;
+       struct lprocfs_counter_header   *cntr_header;
+       int                             i;
+
+       if (v == SEQ_START_TOKEN) {
+               seq_printf(p, "job_stats:\n");
+               return 0;
+       }
+
+       seq_printf(p, "- %-16s %s\n", "job_id:", job->js_jobid);
+       seq_printf(p, "  %-16s %ld\n", "snapshot_time:", job->js_timestamp);
+
+       s = job->js_stats;
+       for (i = 0; i < s->ls_num; i++) {
+               cntr = lprocfs_stats_counter_get(s, 0, i);
+               cntr_header = &s->ls_cnt_header[i];
+               lprocfs_stats_collect(s, i, &ret);
+
+               seq_printf(p, "  %s:%.*s { samples: %11"LPF64"u",
+                          cntr_header->lc_name,
+                          width(cntr_header->lc_name, 15), spaces,
+                          ret.lc_count);
+               if (cntr_header->lc_units[0] != '\0')
+                       seq_printf(p, ", unit: %5s", cntr_header->lc_units);
+
+               if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+                       seq_printf(p, ", min:%8"LPF64"u, max:%8"LPF64"u,"
+                                  " sum:%16"LPF64"u",
+                                  ret.lc_count ? ret.lc_min : 0,
+                                  ret.lc_count ? ret.lc_max : 0,
+                                  ret.lc_count ? ret.lc_sum : 0);
+               }
+               if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) {
+                       seq_printf(p, ", sumsq: %18"LPF64"u",
+                                  ret.lc_count ? ret.lc_sumsquare : 0);
+               }
+
+               seq_printf(p, " }\n");
+
+       }
+       return 0;
+}
+
+struct seq_operations lprocfs_jobstats_seq_sops = {
+       .start = lprocfs_jobstats_seq_start,
+       .stop  = lprocfs_jobstats_seq_stop,
+       .next  = lprocfs_jobstats_seq_next,
+       .show  = lprocfs_jobstats_seq_show,
+};
+
+static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry *dp = PDE(inode);
+       struct seq_file *seq;
+       int rc;
+
+       if (LPROCFS_ENTRY_AND_CHECK(dp))
+               return -ENOENT;
+
+       rc = seq_open(file, &lprocfs_jobstats_seq_sops);
+       if (rc) {
+               LPROCFS_EXIT();
+               return rc;
+       }
+       seq = file->private_data;
+       seq->private = dp->data;
+       return 0;
+}
+
+static ssize_t lprocfs_jobstats_seq_write(struct file *file, const char *buf,
+                                         size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct obd_job_stats *stats = seq->private;
+       char jobid[JOBSTATS_JOBID_SIZE];
+       int all = 0;
+       struct job_stat *job;
+
+       if (!memcmp(buf, "clear", strlen("clear"))) {
+               all = 1;
+       } else if (len < JOBSTATS_JOBID_SIZE) {
+               memset(jobid, 0, JOBSTATS_JOBID_SIZE);
+               /* Trim '\n' if any */
+               if (buf[len - 1] == '\n')
+                       memcpy(jobid, buf, len - 1);
+               else
+                       memcpy(jobid, buf, len);
+       } else {
+               return -EINVAL;
+       }
+
+       LASSERT(stats->ojs_hash);
+       if (all) {
+               time_t oldest = 0;
+               cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+                                      &oldest);
+               return len;
+       }
+
+       if (!strlen(jobid))
+               return -EINVAL;
+
+       job = cfs_hash_lookup(stats->ojs_hash, jobid);
+       if (!job)
+               return -EINVAL;
+
+       cfs_hash_del_key(stats->ojs_hash, jobid);
+
+       job_putref(job);
+       return len;
+}
+
+struct file_operations lprocfs_jobstats_seq_fops = {
+       .owner   = THIS_MODULE,
+       .open    = lprocfs_jobstats_seq_open,
+       .read    = seq_read,
+       .write   = lprocfs_jobstats_seq_write,
+       .llseek  = seq_lseek,
+       .release = lprocfs_seq_release,
+};
+
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+                          cntr_init_callback init_fn)
+{
+       struct proc_dir_entry *entry;
+       struct obd_job_stats *stats;
+       ENTRY;
+
+       LASSERT(obd->obd_proc_entry != NULL);
+       LASSERT(obd->obd_type->typ_name);
+
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) {
+               CERROR("Invalid obd device type.\n");
+               RETURN(-EINVAL);
+       }
+       stats = &obd->u.obt.obt_jobstats;
+
+       LASSERT(stats->ojs_hash == NULL);
+       stats->ojs_hash = cfs_hash_create("JOB_STATS",
+                                         HASH_JOB_STATS_CUR_BITS,
+                                         HASH_JOB_STATS_MAX_BITS,
+                                         HASH_JOB_STATS_BKT_BITS, 0,
+                                         CFS_HASH_MIN_THETA,
+                                         CFS_HASH_MAX_THETA,
+                                         &job_stats_hash_ops,
+                                         CFS_HASH_DEFAULT);
+       if (stats->ojs_hash == NULL)
+               RETURN(-ENOMEM);
+
+       INIT_LIST_HEAD(&stats->ojs_list);
+       rwlock_init(&stats->ojs_lock);
+       stats->ojs_cntr_num = cntr_num;
+       stats->ojs_cntr_init_fn = init_fn;
+       stats->ojs_cleanup_interval = 600; /* 10 mins by default */
+       stats->ojs_last_cleanup = cfs_time_current_sec();
+
+       LPROCFS_WRITE_ENTRY();
+       entry = create_proc_entry("job_stats", 0644, obd->obd_proc_entry);
+       LPROCFS_WRITE_EXIT();
+       if (entry) {
+               entry->proc_fops = &lprocfs_jobstats_seq_fops;
+               entry->data = stats;
+               RETURN(0);
+       } else {
+               lprocfs_job_stats_fini(obd);
+               RETURN(-ENOMEM);
+       }
+}
+EXPORT_SYMBOL(lprocfs_job_stats_init);
+
+int lprocfs_rd_job_interval(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_job_stats *stats;
+
+       LASSERT(obd != NULL);
+       stats = &obd->u.obt.obt_jobstats;
+       *eof = 1;
+       return snprintf(page, count, "%d\n", stats->ojs_cleanup_interval);
+}
+EXPORT_SYMBOL(lprocfs_rd_job_interval);
+
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_job_stats *stats;
+       int val, rc;
+
+       LASSERT(obd != NULL);
+       stats = &obd->u.obt.obt_jobstats;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       stats->ojs_cleanup_interval = val;
+       lprocfs_job_cleanup(stats, true);
+
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_job_interval);
+
+#endif /* LPROCFS*/
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
new file mode 100644 (file)
index 0000000..96e568f
--- /dev/null
@@ -0,0 +1,2599 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lprocfs_status.c
+ *
+ * Author: Hariharan Thantry <thantry@users.sourceforge.net>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+#if defined(LPROCFS)
+
+static int lprocfs_no_percpu_stats = 0;
+CFS_MODULE_PARM(lprocfs_no_percpu_stats, "i", int, 0644,
+               "Do not alloc percpu data for lprocfs stats");
+
+#define MAX_STRING_SIZE 128
+
+/* for bug 10866, global variable */
+DECLARE_RWSEM(_lprocfs_lock);
+EXPORT_SYMBOL(_lprocfs_lock);
+
+int lprocfs_single_release(struct inode *inode, struct file *file)
+{
+       LPROCFS_EXIT();
+       return single_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_single_release);
+
+int lprocfs_seq_release(struct inode *inode, struct file *file)
+{
+       LPROCFS_EXIT();
+       return seq_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_seq_release);
+
+static struct proc_dir_entry *__lprocfs_srch(struct proc_dir_entry *head,
+                                            const char *name)
+{
+       struct proc_dir_entry *temp;
+
+       if (head == NULL)
+               return NULL;
+
+       temp = head->subdir;
+       while (temp != NULL) {
+               if (strcmp(temp->name, name) == 0) {
+                       return temp;
+               }
+
+               temp = temp->next;
+       }
+       return NULL;
+}
+
+struct proc_dir_entry *lprocfs_srch(struct proc_dir_entry *head,
+                                   const char *name)
+{
+       struct proc_dir_entry *temp;
+
+       LPROCFS_SRCH_ENTRY();
+       temp = __lprocfs_srch(head, name);
+       LPROCFS_SRCH_EXIT();
+       return temp;
+}
+EXPORT_SYMBOL(lprocfs_srch);
+
+/* lprocfs API calls */
+
+/* Function that emulates snprintf but also has the side effect of advancing
+   the page pointer for the next write into the buffer, incrementing the total
+   length written to the buffer, and decrementing the size left in the
+   buffer. */
+static int lprocfs_obd_snprintf(char **page, int end, int *len,
+                               const char *format, ...)
+{
+       va_list list;
+       int n;
+
+       if (*len >= end)
+               return 0;
+
+       va_start(list, format);
+       n = vsnprintf(*page, end - *len, format, list);
+       va_end(list);
+
+       *page += n; *len += n;
+       return n;
+}
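
Because the helper advances *page and accumulates *len as the comment above describes, callers can simply chain invocations. A hedged illustration follows; the handler name and the printed fields are made up and not part of the patch.

/* Hypothetical read handler, for illustration only: each call moves "page"
 * forward and grows "len", so the final "len" is the total output length. */
static int example_obd_rd_status(char *page, int count)
{
	int len = 0;

	lprocfs_obd_snprintf(&page, count, &len, "status: ok\n");
	lprocfs_obd_snprintf(&page, count, &len, "requests: %d\n", 42);
	return len;
}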
+
+proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                        char *name,
+                                        read_proc_t *read_proc,
+                                        write_proc_t *write_proc,
+                                        void *data,
+                                        struct file_operations *fops)
+{
+       proc_dir_entry_t *proc;
+       mode_t mode = 0;
+
+       if (root == NULL || name == NULL)
+               return ERR_PTR(-EINVAL);
+       if (read_proc)
+               mode = 0444;
+       if (write_proc)
+               mode |= 0200;
+       if (fops)
+               mode = 0644;
+       LPROCFS_WRITE_ENTRY();
+       proc = create_proc_entry(name, mode, root);
+       if (!proc) {
+               CERROR("LprocFS: No memory to create /proc entry %s\n", name);
+               LPROCFS_WRITE_EXIT();
+               return ERR_PTR(-ENOMEM);
+       }
+       proc->read_proc = read_proc;
+       proc->write_proc = write_proc;
+       proc->data = data;
+       if (fops)
+               proc->proc_fops = fops;
+       LPROCFS_WRITE_EXIT();
+       return proc;
+}
+EXPORT_SYMBOL(lprocfs_add_simple);
+
+struct proc_dir_entry *lprocfs_add_symlink(const char *name,
+                       struct proc_dir_entry *parent, const char *format, ...)
+{
+       struct proc_dir_entry *entry;
+       char *dest;
+       va_list ap;
+
+       if (parent == NULL || format == NULL)
+               return NULL;
+
+       OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1);
+       if (dest == NULL)
+               return NULL;
+
+       va_start(ap, format);
+       vsnprintf(dest, MAX_STRING_SIZE, format, ap);
+       va_end(ap);
+
+       entry = proc_symlink(name, parent, dest);
+       if (entry == NULL)
+               CERROR("LprocFS: Could not create symbolic link from %s to %s\n",
+                       name, dest);
+
+       OBD_FREE(dest, MAX_STRING_SIZE + 1);
+       return entry;
+}
+EXPORT_SYMBOL(lprocfs_add_symlink);
+
+static ssize_t lprocfs_fops_read(struct file *f, char __user *buf,
+                                size_t size, loff_t *ppos)
+{
+       struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
+       char *page, *start = NULL;
+       int rc = 0, eof = 1, count;
+
+       if (*ppos >= PAGE_CACHE_SIZE)
+               return 0;
+
+       page = (char *)__get_free_page(GFP_KERNEL);
+       if (page == NULL)
+               return -ENOMEM;
+
+       if (LPROCFS_ENTRY_AND_CHECK(dp)) {
+               rc = -ENOENT;
+               goto out;
+       }
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_LPROC_REMOVE, 10);
+       if (dp->read_proc)
+               rc = dp->read_proc(page, &start, *ppos, PAGE_CACHE_SIZE,
+                                  &eof, dp->data);
+       LPROCFS_EXIT();
+       if (rc <= 0)
+               goto out;
+
+       /* for lustre proc read, the read count must be less than PAGE_SIZE */
+       LASSERT(eof == 1);
+
+       if (start == NULL) {
+               rc -= *ppos;
+               if (rc < 0)
+                       rc = 0;
+               if (rc == 0)
+                       goto out;
+               start = page + *ppos;
+       } else if (start < page) {
+               start = page;
+       }
+
+       count = (rc < size) ? rc : size;
+       if (copy_to_user(buf, start, count)) {
+               rc = -EFAULT;
+               goto out;
+       }
+       *ppos += count;
+
+out:
+       free_page((unsigned long)page);
+       return rc;
+}
+
+static ssize_t lprocfs_fops_write(struct file *f, const char __user *buf,
+                                 size_t size, loff_t *ppos)
+{
+       struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
+       int rc = -EIO;
+
+       if (LPROCFS_ENTRY_AND_CHECK(dp))
+               return -ENOENT;
+       if (dp->write_proc)
+               rc = dp->write_proc(f, buf, size, dp->data);
+       LPROCFS_EXIT();
+       return rc;
+}
+
+static struct file_operations lprocfs_generic_fops = {
+       .owner = THIS_MODULE,
+       .read = lprocfs_fops_read,
+       .write = lprocfs_fops_write,
+};
+
+int lprocfs_evict_client_open(struct inode *inode, struct file *f)
+{
+       struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
+       struct obd_device *obd = dp->data;
+
+       atomic_inc(&obd->obd_evict_inprogress);
+
+       return 0;
+}
+
+int lprocfs_evict_client_release(struct inode *inode, struct file *f)
+{
+       struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
+       struct obd_device *obd = dp->data;
+
+       atomic_dec(&obd->obd_evict_inprogress);
+       wake_up(&obd->obd_evict_inprogress_waitq);
+
+       return 0;
+}
+
+struct file_operations lprocfs_evict_client_fops = {
+       .owner = THIS_MODULE,
+       .read = lprocfs_fops_read,
+       .write = lprocfs_fops_write,
+       .open = lprocfs_evict_client_open,
+       .release = lprocfs_evict_client_release,
+};
+EXPORT_SYMBOL(lprocfs_evict_client_fops);
+
+/**
+ * Add /proc entries.
+ *
+ * \param root [in]  The parent proc entry on which new entry will be added.
+ * \param list [in]  Array of proc entries to be added.
+ * \param data [in]  The argument to be passed when entries read/write routines
+ *                are called through /proc file.
+ *
+ * \retval 0   on success
+ *      < 0 on error
+ */
+int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
+                    void *data)
+{
+       int rc = 0;
+
+       if (root == NULL || list == NULL)
+               return -EINVAL;
+
+       LPROCFS_WRITE_ENTRY();
+       while (list->name != NULL) {
+               struct proc_dir_entry *cur_root, *proc;
+               char *pathcopy, *cur, *next, pathbuf[64];
+               int pathsize = strlen(list->name) + 1;
+
+               proc = NULL;
+               cur_root = root;
+
+               /* need copy of path for strsep */
+               if (strlen(list->name) > sizeof(pathbuf) - 1) {
+                       OBD_ALLOC(pathcopy, pathsize);
+                       if (pathcopy == NULL)
+                               GOTO(out, rc = -ENOMEM);
+               } else {
+                       pathcopy = pathbuf;
+               }
+
+               next = pathcopy;
+               strcpy(pathcopy, list->name);
+
+               while (cur_root != NULL && (cur = strsep(&next, "/"))) {
+                       if (*cur == '\0') /* skip double/trailing "/" */
+                               continue;
+
+                       proc = __lprocfs_srch(cur_root, cur);
+                       CDEBUG(D_OTHER, "cur_root=%s, cur=%s, next=%s, (%s)\n",
+                              cur_root->name, cur, next,
+                              (proc ? "exists" : "new"));
+                       if (next != NULL) {
+                               cur_root = (proc ? proc :
+                                           proc_mkdir(cur, cur_root));
+                       } else if (proc == NULL) {
+                               mode_t mode = 0;
+                               if (list->proc_mode != 0000) {
+                                       mode = list->proc_mode;
+                               } else {
+                                       if (list->read_fptr)
+                                               mode = 0444;
+                                       if (list->write_fptr)
+                                               mode |= 0200;
+                               }
+                               proc = create_proc_entry(cur, mode, cur_root);
+                       }
+               }
+
+               if (pathcopy != pathbuf)
+                       OBD_FREE(pathcopy, pathsize);
+
+               if (cur_root == NULL || proc == NULL) {
+                       CERROR("LprocFS: No memory to create /proc entry %s\n",
+                              list->name);
+                       GOTO(out, rc = -ENOMEM);
+               }
+
+               if (list->fops)
+                       proc->proc_fops = list->fops;
+               else
+                       proc->proc_fops = &lprocfs_generic_fops;
+               proc->read_proc = list->read_fptr;
+               proc->write_proc = list->write_fptr;
+               proc->data = (list->data ? list->data : data);
+               list++;
+       }
+out:
+       LPROCFS_WRITE_EXIT();
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_add_vars);
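
A hedged sketch of the table this function consumes (illustrative only): the entry names and the registration helper are assumptions; designated initializers are used because only the field names referenced above (name, read_fptr, write_fptr, fops, data, proc_mode) are known from this code, not their order in struct lprocfs_vars.

/* Hypothetical variable table, for illustration only.  The NULL name ends
 * the while (list->name != NULL) loop in lprocfs_add_vars(). */
static struct lprocfs_vars example_obd_vars[] = {
	{ .name = "uuid",      .read_fptr = lprocfs_rd_uuid },
	{ .name = "blocksize", .read_fptr = lprocfs_rd_blksize },
	{ .name = NULL }
};

static int example_register(struct proc_dir_entry *parent,
			    struct obd_device *obd)
{
	struct proc_dir_entry *dir;

	/* "obd" becomes the default ->data passed to the read handlers */
	dir = lprocfs_register("example", parent, example_obd_vars, obd);
	if (dir == NULL)
		return -ENOMEM;
	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
}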
+
+void lprocfs_remove_nolock(struct proc_dir_entry **rooth)
+{
+       struct proc_dir_entry *root = *rooth;
+       struct proc_dir_entry *temp = root;
+       struct proc_dir_entry *rm_entry;
+       struct proc_dir_entry *parent;
+
+       if (!root)
+               return;
+       *rooth = NULL;
+
+       parent = root->parent;
+       LASSERT(parent != NULL);
+
+       while (1) {
+               while (temp->subdir != NULL)
+                       temp = temp->subdir;
+
+               rm_entry = temp;
+               temp = temp->parent;
+
+               /* Memory corruption once caused this to fail, and
+                  without this LASSERT we would loop here forever. */
+               LASSERTF(strlen(rm_entry->name) == rm_entry->namelen,
+                        "0x%p  %s/%s len %d\n", rm_entry, temp->name,
+                        rm_entry->name, (int)strlen(rm_entry->name));
+
+               remove_proc_entry(rm_entry->name, temp);
+               if (temp == parent)
+                       break;
+       }
+}
+
+void lprocfs_remove(struct proc_dir_entry **rooth)
+{
+       LPROCFS_WRITE_ENTRY(); /* search vs remove race */
+       lprocfs_remove_nolock(rooth);
+       LPROCFS_WRITE_EXIT();
+}
+EXPORT_SYMBOL(lprocfs_remove);
+
+void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{
+       LASSERT(parent != NULL);
+       remove_proc_entry(name, parent);
+}
+EXPORT_SYMBOL(lprocfs_remove_proc_entry);
+
+void lprocfs_try_remove_proc_entry(const char *name,
+                                  struct proc_dir_entry *parent)
+{
+       struct proc_dir_entry    *t = NULL;
+       struct proc_dir_entry   **p;
+       int                       len, busy = 0;
+
+       LASSERT(parent != NULL);
+       len = strlen(name);
+
+       LPROCFS_WRITE_ENTRY();
+
+       /* lookup target name */
+       for (p = &parent->subdir; *p; p = &(*p)->next) {
+               if ((*p)->namelen != len)
+                       continue;
+               if (memcmp(name, (*p)->name, len))
+                       continue;
+               t = *p;
+               break;
+       }
+
+       if (t) {
+               /* verify it's empty: do not count "num_refs" */
+               for (p = &t->subdir; *p; p = &(*p)->next) {
+                       if ((*p)->namelen != strlen("num_refs")) {
+                               busy = 1;
+                               break;
+                       }
+                       if (memcmp("num_refs", (*p)->name,
+                                  strlen("num_refs"))) {
+                               busy = 1;
+                               break;
+                       }
+               }
+       }
+
+       if (busy == 0)
+               lprocfs_remove_nolock(&t);
+
+       LPROCFS_WRITE_EXIT();
+
+       return;
+}
+EXPORT_SYMBOL(lprocfs_try_remove_proc_entry);
+
+struct proc_dir_entry *lprocfs_register(const char *name,
+                                       struct proc_dir_entry *parent,
+                                       struct lprocfs_vars *list, void *data)
+{
+       struct proc_dir_entry *newchild;
+
+       newchild = lprocfs_srch(parent, name);
+       if (newchild != NULL) {
+               CERROR("Lproc: Attempting to register %s more than once\n",
+                      name);
+               return ERR_PTR(-EALREADY);
+       }
+
+       newchild = proc_mkdir(name, parent);
+       if (newchild != NULL && list != NULL) {
+               int rc = lprocfs_add_vars(newchild, list, data);
+               if (rc) {
+                       lprocfs_remove(&newchild);
+                       return ERR_PTR(rc);
+               }
+       }
+       return newchild;
+}
+EXPORT_SYMBOL(lprocfs_register);
+
+/* Generic callbacks */
+int lprocfs_rd_uint(char *page, char **start, off_t off,
+                   int count, int *eof, void *data)
+{
+       unsigned int *temp = data;
+       return snprintf(page, count, "%u\n", *temp);
+}
+EXPORT_SYMBOL(lprocfs_rd_uint);
+
+int lprocfs_wr_uint(struct file *file, const char *buffer,
+                   unsigned long count, void *data)
+{
+       unsigned *p = data;
+       char dummy[MAX_STRING_SIZE + 1], *end;
+       unsigned long tmp;
+
+       dummy[MAX_STRING_SIZE] = '\0';
+       if (copy_from_user(dummy, buffer, MAX_STRING_SIZE))
+               return -EFAULT;
+
+       tmp = simple_strtoul(dummy, &end, 0);
+       if (dummy == end)
+               return -EINVAL;
+
+       *p = (unsigned int)tmp;
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_uint);
+
+int lprocfs_rd_u64(char *page, char **start, off_t off,
+                  int count, int *eof, void *data)
+{
+       LASSERT(data != NULL);
+       *eof = 1;
+       return snprintf(page, count, LPU64"\n", *(__u64 *)data);
+}
+EXPORT_SYMBOL(lprocfs_rd_u64);
+
+int lprocfs_rd_atomic(char *page, char **start, off_t off,
+                  int count, int *eof, void *data)
+{
+       atomic_t *atom = data;
+       LASSERT(atom != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%d\n", atomic_read(atom));
+}
+EXPORT_SYMBOL(lprocfs_rd_atomic);
+
+int lprocfs_wr_atomic(struct file *file, const char *buffer,
+                     unsigned long count, void *data)
+{
+       atomic_t *atm = data;
+       int val = 0;
+       int rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val <= 0)
+               return -ERANGE;
+
+       atomic_set(atm, val);
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_atomic);
+
+int lprocfs_rd_uuid(char *page, char **start, off_t off, int count,
+                   int *eof, void *data)
+{
+       struct obd_device *obd = data;
+
+       LASSERT(obd != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%s\n", obd->obd_uuid.uuid);
+}
+EXPORT_SYMBOL(lprocfs_rd_uuid);
+
+int lprocfs_rd_name(char *page, char **start, off_t off, int count,
+                   int *eof, void *data)
+{
+       struct obd_device *dev = data;
+
+       LASSERT(dev != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%s\n", dev->obd_name);
+}
+EXPORT_SYMBOL(lprocfs_rd_name);
+
+int lprocfs_rd_blksize(char *page, char **start, off_t off, int count,
+                      int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               *eof = 1;
+               rc = snprintf(page, count, "%u\n", osfs.os_bsize);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_blksize);
+
+int lprocfs_rd_kbytestotal(char *page, char **start, off_t off, int count,
+                          int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_blocks;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
+
+int lprocfs_rd_kbytesfree(char *page, char **start, off_t off, int count,
+                         int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bfree;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
+
+int lprocfs_rd_kbytesavail(char *page, char **start, off_t off, int count,
+                          int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bavail;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesavail);
+
+int lprocfs_rd_filestotal(char *page, char **start, off_t off, int count,
+                         int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", osfs.os_files);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_filestotal);
+
+int lprocfs_rd_filesfree(char *page, char **start, off_t off, int count,
+                        int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", osfs.os_ffree);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_filesfree);
+
+int lprocfs_rd_server_uuid(char *page, char **start, off_t off, int count,
+                          int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_import *imp;
+       char *imp_state_name = NULL;
+       int rc = 0;
+
+       LASSERT(obd != NULL);
+       LPROCFS_CLIMP_CHECK(obd);
+       imp = obd->u.cli.cl_import;
+       imp_state_name = ptlrpc_import_state_name(imp->imp_state);
+       *eof = 1;
+       rc = snprintf(page, count, "%s\t%s%s\n",
+                     obd2cli_tgt(obd), imp_state_name,
+                     imp->imp_deactive ? "\tDEACTIVATED" : "");
+
+       LPROCFS_CLIMP_EXIT(obd);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_server_uuid);
+
+int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
+                        int *eof,  void *data)
+{
+       struct obd_device *obd = data;
+       struct ptlrpc_connection *conn;
+       int rc = 0;
+
+       LASSERT(obd != NULL);
+
+       LPROCFS_CLIMP_CHECK(obd);
+       conn = obd->u.cli.cl_import->imp_connection;
+       *eof = 1;
+       if (conn && obd->u.cli.cl_import) {
+               rc = snprintf(page, count, "%s\n",
+                             conn->c_remote_uuid.uuid);
+       } else {
+               rc = snprintf(page, count, "%s\n", "<none>");
+       }
+
+       LPROCFS_CLIMP_EXIT(obd);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
+
+/** add up per-cpu counters */
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+                          struct lprocfs_counter *cnt)
+{
+       unsigned int                    num_entry;
+       struct lprocfs_counter          *percpu_cntr;
+       struct lprocfs_counter_header   *cntr_header;
+       int                             i;
+       unsigned long                   flags = 0;
+
+       memset(cnt, 0, sizeof(*cnt));
+
+       if (stats == NULL) {
+               /* set count to 1 to avoid divide-by-zero errs in callers */
+               cnt->lc_count = 1;
+               return;
+       }
+
+       cnt->lc_min = LC_MIN_INIT;
+
+       num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+       for (i = 0; i < num_entry; i++) {
+               if (stats->ls_percpu[i] == NULL)
+                       continue;
+               cntr_header = &stats->ls_cnt_header[idx];
+               percpu_cntr = lprocfs_stats_counter_get(stats, i, idx);
+
+               cnt->lc_count += percpu_cntr->lc_count;
+               cnt->lc_sum += percpu_cntr->lc_sum;
+               if (percpu_cntr->lc_min < cnt->lc_min)
+                       cnt->lc_min = percpu_cntr->lc_min;
+               if (percpu_cntr->lc_max > cnt->lc_max)
+                       cnt->lc_max = percpu_cntr->lc_max;
+               cnt->lc_sumsquare += percpu_cntr->lc_sumsquare;
+       }
+
+       lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_stats_collect);
+
+/**
+ * Append a space separated list of current set flags to str.
+ */
+#define flag2str(flag) \
+       if (imp->imp_##flag && max - len > 0) \
+            len += snprintf(str + len, max - len, "%s" #flag, len ? ", " : "");
+static int obd_import_flags2str(struct obd_import *imp, char *str, int max)
+{
+       int len = 0;
+
+       if (imp->imp_obd->obd_no_recov)
+               len += snprintf(str, max - len, "no_recov");
+
+       flag2str(invalid);
+       flag2str(deactive);
+       flag2str(replayable);
+       flag2str(pingable);
+       return len;
+}
+#undef flag2str
+
+static const char *obd_connect_names[] = {
+       "read_only",
+       "lov_index",
+       "unused",
+       "write_grant",
+       "server_lock",
+       "version",
+       "request_portal",
+       "acl",
+       "xattr",
+       "create_on_write",
+       "truncate_lock",
+       "initial_transno",
+       "inode_bit_locks",
+       "join_file(obsolete)",
+       "getattr_by_fid",
+       "no_oh_for_devices",
+       "remote_client",
+       "remote_client_by_force",
+       "max_byte_per_rpc",
+       "64bit_qdata",
+       "mds_capability",
+       "oss_capability",
+       "early_lock_cancel",
+       "som",
+       "adaptive_timeouts",
+       "lru_resize",
+       "mds_mds_connection",
+       "real_conn",
+       "change_qunit_size",
+       "alt_checksum_algorithm",
+       "fid_is_enabled",
+       "version_recovery",
+       "pools",
+       "grant_shrink",
+       "skip_orphan",
+       "large_ea",
+       "full20",
+       "layout_lock",
+       "64bithash",
+       "object_max_bytes",
+       "imp_recov",
+       "jobstats",
+       "umask",
+       "einprogress",
+       "grant_param",
+       "flock_owner",
+       "lvb_type",
+       "nanoseconds_times",
+       "lightweight_conn",
+       "short_io",
+       "pingless",
+       "unknown",
+       NULL
+};
+
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep)
+{
+       __u64 mask = 1;
+       int i, ret = 0;
+
+       for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+               if (flags & mask)
+                       ret += snprintf(page + ret, count - ret, "%s%s",
+                                       ret ? sep : "", obd_connect_names[i]);
+       }
+       if (flags & ~(mask - 1))
+               ret += snprintf(page + ret, count - ret,
+                               "%sunknown flags "LPX64,
+                               ret ? sep : "", flags & ~(mask - 1));
+       return ret;
+}
+EXPORT_SYMBOL(obd_connect_flags2str);
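+/* Illustrative (hypothetical) example: a flags word with only the
+ * "read_only" and "write_grant" bits set, printed with sep = ", ",
+ * comes out as "read_only, write_grant"; bits beyond the known names
+ * are appended in hex as "unknown flags ...". */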
+
+int lprocfs_rd_import(char *page, char **start, off_t off, int count,
+                     int *eof, void *data)
+{
+       struct lprocfs_counter          ret;
+       struct lprocfs_counter_header   *header;
+       struct obd_device               *obd    = (struct obd_device *)data;
+       struct obd_import               *imp;
+       struct obd_import_conn          *conn;
+       int                             i;
+       int                             j;
+       int                             k;
+       int                             rw      = 0;
+
+       LASSERT(obd != NULL);
+       LPROCFS_CLIMP_CHECK(obd);
+       imp = obd->u.cli.cl_import;
+       *eof = 1;
+
+       i = snprintf(page, count,
+                    "import:\n"
+                    "    name: %s\n"
+                    "    target: %s\n"
+                    "    state: %s\n"
+                    "    instance: %u\n"
+                    "    connect_flags: [",
+                    obd->obd_name,
+                    obd2cli_tgt(obd),
+                    ptlrpc_import_state_name(imp->imp_state),
+                    imp->imp_connect_data.ocd_instance);
+       i += obd_connect_flags2str(page + i, count - i,
+                                  imp->imp_connect_data.ocd_connect_flags,
+                                  ", ");
+       i += snprintf(page + i, count - i,
+                     "]\n"
+                     "    import_flags: [");
+       i += obd_import_flags2str(imp, page + i, count - i);
+
+       i += snprintf(page + i, count - i,
+                     "]\n"
+                     "    connection:\n"
+                     "       failover_nids: [");
+       spin_lock(&imp->imp_lock);
+       j = 0;
+       list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+               i += snprintf(page + i, count - i, "%s%s", j ? ", " : "",
+                             libcfs_nid2str(conn->oic_conn->c_peer.nid));
+               j++;
+       }
+       i += snprintf(page + i, count - i,
+                     "]\n"
+                     "       current_connection: %s\n"
+                     "       connection_attempts: %u\n"
+                     "       generation: %u\n"
+                     "       in-progress_invalidations: %u\n",
+                     imp->imp_connection == NULL ? "<none>" :
+                             libcfs_nid2str(imp->imp_connection->c_peer.nid),
+                     imp->imp_conn_cnt,
+                     imp->imp_generation,
+                     atomic_read(&imp->imp_inval_count));
+       spin_unlock(&imp->imp_lock);
+
+       if (obd->obd_svc_stats == NULL)
+               goto out_climp;
+
+       header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR];
+       lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret);
+       if (ret.lc_count != 0) {
+               /* first argument to do_div MUST be __u64 */
+               __u64 sum = ret.lc_sum;
+               do_div(sum, ret.lc_count);
+               ret.lc_sum = sum;
+       } else
+               ret.lc_sum = 0;
+       i += snprintf(page + i, count - i,
+                     "    rpcs:\n"
+                     "       inflight: %u\n"
+                     "       unregistering: %u\n"
+                     "       timeouts: %u\n"
+                     "       avg_waittime: "LPU64" %s\n",
+                     atomic_read(&imp->imp_inflight),
+                     atomic_read(&imp->imp_unregistering),
+                     atomic_read(&imp->imp_timeouts),
+                     ret.lc_sum, header->lc_units);
+
+       k = 0;
+       for (j = 0; j < IMP_AT_MAX_PORTALS; j++) {
+               if (imp->imp_at.iat_portal[j] == 0)
+                       break;
+               k = max_t(unsigned int, k,
+                         at_get(&imp->imp_at.iat_service_estimate[j]));
+       }
+       i += snprintf(page + i, count - i,
+                     "    service_estimates:\n"
+                     "       services: %u sec\n"
+                     "       network: %u sec\n",
+                     k,
+                     at_get(&imp->imp_at.iat_net_latency));
+
+       i += snprintf(page + i, count - i,
+                     "    transactions:\n"
+                     "       last_replay: "LPU64"\n"
+                     "       peer_committed: "LPU64"\n"
+                     "       last_checked: "LPU64"\n",
+                     imp->imp_last_replay_transno,
+                     imp->imp_peer_committed_transno,
+                     imp->imp_last_transno_checked);
+
+       /* avg data rates */
+       for (rw = 0; rw <= 1; rw++) {
+               lprocfs_stats_collect(obd->obd_svc_stats,
+                                     PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw,
+                                     &ret);
+               if (ret.lc_sum > 0 && ret.lc_count > 0) {
+                       /* first argument to do_div MUST be __u64 */
+                       __u64 sum = ret.lc_sum;
+                       do_div(sum, ret.lc_count);
+                       ret.lc_sum = sum;
+                       i += snprintf(page + i, count - i,
+                                     "    %s_data_averages:\n"
+                                     "       bytes_per_rpc: "LPU64"\n",
+                                     rw ? "write" : "read",
+                                     ret.lc_sum);
+               }
+               k = (int)ret.lc_sum;
+               j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES;
+               header = &obd->obd_svc_stats->ls_cnt_header[j];
+               lprocfs_stats_collect(obd->obd_svc_stats, j, &ret);
+               if (ret.lc_sum > 0 && ret.lc_count != 0) {
+                       /* first argument to do_div MUST be __u64 */
+                       __u64 sum = ret.lc_sum;
+                       do_div(sum, ret.lc_count);
+                       ret.lc_sum = sum;
+                       i += snprintf(page + i, count - i,
+                                     "       %s_per_rpc: "LPU64"\n",
+                                     header->lc_units, ret.lc_sum);
+                       j = (int)ret.lc_sum;
+                       if (j > 0)
+                               i += snprintf(page + i, count - i,
+                                             "       MB_per_sec: %u.%.02u\n",
+                                             k / j, (100 * k / j) % 100);
+               }
+       }
+
+out_climp:
+       LPROCFS_CLIMP_EXIT(obd);
+       return i;
+}
+EXPORT_SYMBOL(lprocfs_rd_import);
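+/* lprocfs_rd_import() emits a small YAML-like block, roughly of the form
+ * (values are placeholders):
+ *
+ *     import:
+ *         name: <obd name>
+ *         target: <target>
+ *         state: <import state>
+ *         instance: <n>
+ *         connect_flags: [ ... ]
+ *         import_flags: [ ... ]
+ *         connection:
+ *            failover_nids: [ ... ]
+ *            current_connection: <nid>
+ *            ...
+ *
+ * followed by rpc, service estimate, transaction and average data rate
+ * sections when obd_svc_stats is available. */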
+
+int lprocfs_rd_state(char *page, char **start, off_t off, int count,
+                     int *eof, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_import *imp;
+       int i, j, k;
+
+       LASSERT(obd != NULL);
+       LPROCFS_CLIMP_CHECK(obd);
+       imp = obd->u.cli.cl_import;
+       *eof = 1;
+
+       i = snprintf(page, count, "current_state: %s\n",
+                    ptlrpc_import_state_name(imp->imp_state));
+       i += snprintf(page + i, count - i,
+                     "state_history:\n");
+       k = imp->imp_state_hist_idx;
+       for (j = 0; j < IMP_STATE_HIST_LEN; j++) {
+               struct import_state_hist *ish =
+                       &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN];
+               if (ish->ish_state == 0)
+                       continue;
+               i += snprintf(page + i, count - i, " - ["CFS_TIME_T", %s]\n",
+                             ish->ish_time,
+                             ptlrpc_import_state_name(ish->ish_state));
+       }
+
+       LPROCFS_CLIMP_EXIT(obd);
+       return i;
+}
+EXPORT_SYMBOL(lprocfs_rd_state);
+
+int lprocfs_at_hist_helper(char *page, int count, int rc,
+                          struct adaptive_timeout *at)
+{
+       int i;
+       for (i = 0; i < AT_BINS; i++)
+               rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]);
+       rc += snprintf(page + rc, count - rc, "\n");
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
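+/* lprocfs_at_hist_helper() appends the AT_BINS adaptive-timeout histogram
+ * buckets as one space-separated row, e.g. (hypothetically, with AT_BINS
+ * equal to 4) "  3   0   1   0 \n". */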
+
+/* See also ptlrpc_lprocfs_rd_timeouts */
+int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
+                       int *eof, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_import *imp;
+       unsigned int cur, worst;
+       time_t now, worstt;
+       struct dhms ts;
+       int i, rc = 0;
+
+       LASSERT(obd != NULL);
+       LPROCFS_CLIMP_CHECK(obd);
+       imp = obd->u.cli.cl_import;
+       *eof = 1;
+
+       now = cfs_time_current_sec();
+
+       /* Some network health info for kicks */
+       s2dhms(&ts, now - imp->imp_last_reply_time);
+       rc += snprintf(page + rc, count - rc,
+                      "%-10s : %ld, "DHMS_FMT" ago\n",
+                      "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+       cur = at_get(&imp->imp_at.iat_net_latency);
+       worst = imp->imp_at.iat_net_latency.at_worst_ever;
+       worstt = imp->imp_at.iat_net_latency.at_worst_time;
+       s2dhms(&ts, now - worstt);
+       rc += snprintf(page + rc, count - rc,
+                      "%-10s : cur %3u  worst %3u (at %ld, "DHMS_FMT" ago) ",
+                      "network", cur, worst, worstt, DHMS_VARS(&ts));
+       rc = lprocfs_at_hist_helper(page, count, rc,
+                                   &imp->imp_at.iat_net_latency);
+
+       for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+               if (imp->imp_at.iat_portal[i] == 0)
+                       break;
+               cur = at_get(&imp->imp_at.iat_service_estimate[i]);
+               worst = imp->imp_at.iat_service_estimate[i].at_worst_ever;
+               worstt = imp->imp_at.iat_service_estimate[i].at_worst_time;
+               s2dhms(&ts, now - worstt);
+               rc += snprintf(page + rc, count - rc,
+                              "portal %-2d  : cur %3u  worst %3u (at %ld, "
+                              DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
+                              cur, worst, worstt, DHMS_VARS(&ts));
+               rc = lprocfs_at_hist_helper(page, count, rc,
+                                         &imp->imp_at.iat_service_estimate[i]);
+       }
+
+       LPROCFS_CLIMP_EXIT(obd);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
+
+int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       __u64 flags;
+       int ret = 0;
+
+       LPROCFS_CLIMP_CHECK(obd);
+       flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags;
+       ret = snprintf(page, count, "flags="LPX64"\n", flags);
+       ret += obd_connect_flags2str(page + ret, count - ret, flags, "\n");
+       ret += snprintf(page + ret, count - ret, "\n");
+       LPROCFS_CLIMP_EXIT(obd);
+       return ret;
+}
+EXPORT_SYMBOL(lprocfs_rd_connect_flags);
+
+int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count,
+                          int *eof,  void *data)
+{
+       struct obd_device *obd = data;
+
+       LASSERT(obd != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%u\n", obd->obd_num_exports);
+}
+EXPORT_SYMBOL(lprocfs_rd_num_exports);
+
+int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count,
+                      int *eof, void *data)
+{
+       struct obd_type *class = (struct obd_type *)data;
+
+       LASSERT(class != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%d\n", class->typ_refcnt);
+}
+EXPORT_SYMBOL(lprocfs_rd_numrefs);
+
+int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list)
+{
+       int rc = 0;
+
+       LASSERT(obd != NULL);
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+       LASSERT(obd->obd_type->typ_procroot != NULL);
+
+       obd->obd_proc_entry = lprocfs_register(obd->obd_name,
+                                              obd->obd_type->typ_procroot,
+                                              list, obd);
+       if (IS_ERR(obd->obd_proc_entry)) {
+               rc = PTR_ERR(obd->obd_proc_entry);
+               CERROR("error %d setting up lprocfs for %s\n", rc, obd->obd_name);
+               obd->obd_proc_entry = NULL;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_setup);
+
+int lprocfs_obd_cleanup(struct obd_device *obd)
+{
+       if (!obd)
+               return -EINVAL;
+       if (obd->obd_proc_exports_entry) {
+               /* Should be no exports left */
+               LASSERT(obd->obd_proc_exports_entry->subdir == NULL);
+               lprocfs_remove(&obd->obd_proc_exports_entry);
+               obd->obd_proc_exports_entry = NULL;
+       }
+       if (obd->obd_proc_entry) {
+               lprocfs_remove(&obd->obd_proc_entry);
+               obd->obd_proc_entry = NULL;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_obd_cleanup);
+
+static void lprocfs_free_client_stats(struct nid_stat *client_stat)
+{
+       CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat,
+              client_stat->nid_proc, client_stat->nid_stats);
+
+       LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0,
+                "nid %s:count %d\n", libcfs_nid2str(client_stat->nid),
+                atomic_read(&client_stat->nid_exp_ref_count));
+
+       if (client_stat->nid_proc)
+               lprocfs_remove(&client_stat->nid_proc);
+
+       if (client_stat->nid_stats)
+               lprocfs_free_stats(&client_stat->nid_stats);
+
+       if (client_stat->nid_ldlm_stats)
+               lprocfs_free_stats(&client_stat->nid_ldlm_stats);
+
+       OBD_FREE_PTR(client_stat);
+}
+
+void lprocfs_free_per_client_stats(struct obd_device *obd)
+{
+       cfs_hash_t *hash = obd->obd_nid_stats_hash;
+       struct nid_stat *stat;
+       ENTRY;
+
+       /* we need an extra list because hash_exit is called too early */
+       /* no locking is needed because all clients have died */
+       while (!list_empty(&obd->obd_nid_stats)) {
+               stat = list_entry(obd->obd_nid_stats.next,
+                                     struct nid_stat, nid_list);
+               list_del_init(&stat->nid_list);
+               cfs_hash_del(hash, &stat->nid, &stat->nid_hash);
+               lprocfs_free_client_stats(stat);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(lprocfs_free_per_client_stats);
+
+struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
+                                         enum lprocfs_stats_flags flags)
+{
+       struct lprocfs_stats    *stats;
+       unsigned int            num_entry;
+       unsigned int            percpusize = 0;
+       int                     i;
+
+       if (num == 0)
+               return NULL;
+
+       if (lprocfs_no_percpu_stats != 0)
+               flags |= LPROCFS_STATS_FLAG_NOPERCPU;
+
+       if (flags & LPROCFS_STATS_FLAG_NOPERCPU)
+               num_entry = 1;
+       else
+               num_entry = num_possible_cpus();
+
+       /* alloc percpu pointers for all possible cpu slots */
+       LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+       if (stats == NULL)
+               return NULL;
+
+       stats->ls_num = num;
+       stats->ls_flags = flags;
+       spin_lock_init(&stats->ls_lock);
+
+       /* alloc num of counter headers */
+       LIBCFS_ALLOC(stats->ls_cnt_header,
+                    stats->ls_num * sizeof(struct lprocfs_counter_header));
+       if (stats->ls_cnt_header == NULL)
+               goto fail;
+
+       if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) {
+               /* contains only one set of counters */
+               percpusize = lprocfs_stats_counter_size(stats);
+               LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize);
+               if (stats->ls_percpu[0] == NULL)
+                       goto fail;
+               stats->ls_biggest_alloc_num = 1;
+       } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) {
+               /* alloc all percpu data; currently only obd_memory uses this */
+               for (i = 0; i < num_entry; ++i)
+                       if (lprocfs_stats_alloc_one(stats, i) < 0)
+                               goto fail;
+       }
+
+       return stats;
+
+fail:
+       lprocfs_free_stats(&stats);
+       return NULL;
+}
+EXPORT_SYMBOL(lprocfs_alloc_stats);
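+/* A minimal usage sketch (all names below are hypothetical): allocate the
+ * stats block, initialize each counter header, then publish the /proc file:
+ *
+ *     stats = lprocfs_alloc_stats(2, 0);
+ *     lprocfs_counter_init(stats, 0, 0, "read_calls", "reqs");
+ *     lprocfs_counter_init(stats, 1, LPROCFS_CNTR_AVGMINMAX,
+ *                          "read_bytes", "bytes");
+ *     rc = lprocfs_register_stats(root, "stats", stats);
+ *
+ * lprocfs_free_stats() below tears everything down again. */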
+
+void lprocfs_free_stats(struct lprocfs_stats **statsh)
+{
+       struct lprocfs_stats *stats = *statsh;
+       unsigned int num_entry;
+       unsigned int percpusize;
+       unsigned int i;
+
+       if (stats == NULL || stats->ls_num == 0)
+               return;
+       *statsh = NULL;
+
+       if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)
+               num_entry = 1;
+       else
+               num_entry = num_possible_cpus();
+
+       percpusize = lprocfs_stats_counter_size(stats);
+       for (i = 0; i < num_entry; i++)
+               if (stats->ls_percpu[i] != NULL)
+                       LIBCFS_FREE(stats->ls_percpu[i], percpusize);
+       if (stats->ls_cnt_header != NULL)
+               LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num *
+                                       sizeof(struct lprocfs_counter_header));
+       LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+}
+EXPORT_SYMBOL(lprocfs_free_stats);
+
+void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{
+       struct lprocfs_counter          *percpu_cntr;
+       struct lprocfs_counter_header   *header;
+       int                             i;
+       int                             j;
+       unsigned int                    num_entry;
+       unsigned long                   flags = 0;
+
+       num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+       for (i = 0; i < num_entry; i++) {
+               if (stats->ls_percpu[i] == NULL)
+                       continue;
+               for (j = 0; j < stats->ls_num; j++) {
+                       header = &stats->ls_cnt_header[j];
+                       percpu_cntr = lprocfs_stats_counter_get(stats, i, j);
+                       percpu_cntr->lc_count           = 0;
+                       percpu_cntr->lc_min             = LC_MIN_INIT;
+                       percpu_cntr->lc_max             = 0;
+                       percpu_cntr->lc_sumsquare       = 0;
+                       percpu_cntr->lc_sum             = 0;
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+                               percpu_cntr->lc_sum_irq = 0;
+               }
+       }
+
+       lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_clear_stats);
+
+static ssize_t lprocfs_stats_seq_write(struct file *file, const char *buf,
+                                      size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct lprocfs_stats *stats = seq->private;
+
+       lprocfs_clear_stats(stats);
+
+       return len;
+}
+
+static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct lprocfs_stats *stats = p->private;
+       /* return the *pos-th counter from the first CPU's slot */
+       return (*pos >= stats->ls_num) ? NULL :
+               lprocfs_stats_counter_get(stats, 0, *pos);
+}
+
+static void lprocfs_stats_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct lprocfs_stats *stats = p->private;
+       ++*pos;
+       return (*pos >= stats->ls_num) ? NULL :
+               lprocfs_stats_counter_get(stats, 0, *pos);
+}
+
+/* seq file export of one lprocfs counter */
+static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
+{
+       struct lprocfs_stats            *stats  = p->private;
+       struct lprocfs_counter          *cntr   = v;
+       struct lprocfs_counter          ret;
+       struct lprocfs_counter_header   *header;
+       int                             entry_size;
+       int                             idx;
+       int                             rc      = 0;
+
+       if (cntr == &(stats->ls_percpu[0])->lp_cntr[0]) {
+               struct timeval now;
+               do_gettimeofday(&now);
+               rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n",
+                               "snapshot_time", now.tv_sec, now.tv_usec);
+               if (rc < 0)
+                       return rc;
+       }
+       entry_size = sizeof(*cntr);
+       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+               entry_size += sizeof(__s64);
+       idx = ((void *)cntr - (void *)&(stats->ls_percpu[0])->lp_cntr[0]) /
+               entry_size;
+
+       header = &stats->ls_cnt_header[idx];
+       lprocfs_stats_collect(stats, idx, &ret);
+
+       if (ret.lc_count == 0)
+               goto out;
+
+       rc = seq_printf(p, "%-25s "LPD64" samples [%s]", header->lc_name,
+                       ret.lc_count, header->lc_units);
+
+       if (rc < 0)
+               goto out;
+
+       if ((header->lc_config & LPROCFS_CNTR_AVGMINMAX) &&
+           (ret.lc_count > 0)) {
+               rc = seq_printf(p, " "LPD64" "LPD64" "LPD64,
+                               ret.lc_min, ret.lc_max, ret.lc_sum);
+               if (rc < 0)
+                       goto out;
+               if (header->lc_config & LPROCFS_CNTR_STDDEV)
+                       rc = seq_printf(p, " "LPD64, ret.lc_sumsquare);
+               if (rc < 0)
+                       goto out;
+       }
+       rc = seq_printf(p, "\n");
+ out:
+       return (rc < 0) ? rc : 0;
+}
+
+struct seq_operations lprocfs_stats_seq_sops = {
+       .start = lprocfs_stats_seq_start,
+       .stop  = lprocfs_stats_seq_stop,
+       .next  = lprocfs_stats_seq_next,
+       .show  = lprocfs_stats_seq_show,
+};
+
+static int lprocfs_stats_seq_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry *dp = PDE(inode);
+       struct seq_file *seq;
+       int rc;
+
+       if (LPROCFS_ENTRY_AND_CHECK(dp))
+               return -ENOENT;
+
+       rc = seq_open(file, &lprocfs_stats_seq_sops);
+       if (rc) {
+               LPROCFS_EXIT();
+               return rc;
+       }
+       seq = file->private_data;
+       seq->private = dp->data;
+       return 0;
+}
+
+struct file_operations lprocfs_stats_seq_fops = {
+       .owner   = THIS_MODULE,
+       .open    = lprocfs_stats_seq_open,
+       .read    = seq_read,
+       .write   = lprocfs_stats_seq_write,
+       .llseek  = seq_lseek,
+       .release = lprocfs_seq_release,
+};
+
+int lprocfs_register_stats(struct proc_dir_entry *root, const char *name,
+                          struct lprocfs_stats *stats)
+{
+       struct proc_dir_entry *entry;
+       LASSERT(root != NULL);
+
+       LPROCFS_WRITE_ENTRY();
+       entry = create_proc_entry(name, 0644, root);
+       if (entry) {
+               entry->proc_fops = &lprocfs_stats_seq_fops;
+               entry->data = stats;
+       }
+
+       LPROCFS_WRITE_EXIT();
+
+       if (entry == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_register_stats);
+
+void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+                         unsigned conf, const char *name, const char *units)
+{
+       struct lprocfs_counter_header   *header;
+       struct lprocfs_counter          *percpu_cntr;
+       unsigned long                   flags = 0;
+       unsigned int                    i;
+       unsigned int                    num_cpu;
+
+       LASSERT(stats != NULL);
+
+       header = &stats->ls_cnt_header[index];
+       LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n",
+                index, name, units);
+
+       header->lc_config = conf;
+       header->lc_name   = name;
+       header->lc_units  = units;
+
+       num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+       for (i = 0; i < num_cpu; ++i) {
+               if (stats->ls_percpu[i] == NULL)
+                       continue;
+               percpu_cntr = lprocfs_stats_counter_get(stats, i, index);
+               percpu_cntr->lc_count           = 0;
+               percpu_cntr->lc_min             = LC_MIN_INIT;
+               percpu_cntr->lc_max             = 0;
+               percpu_cntr->lc_sumsquare       = 0;
+               percpu_cntr->lc_sum             = 0;
+               if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                       percpu_cntr->lc_sum_irq = 0;
+       }
+       lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_init);
+
+#define LPROCFS_OBD_OP_INIT(base, stats, op)                          \
+do {                                                                  \
+       unsigned int coffset = base + OBD_COUNTER_OFFSET(op);         \
+       LASSERT(coffset < stats->ls_num);                                 \
+       lprocfs_counter_init(stats, coffset, 0, #op, "reqs");         \
+} while (0)
+
+void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref);
+}
+EXPORT_SYMBOL(lprocfs_init_ops_stats);
+
+int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
+{
+       struct lprocfs_stats *stats;
+       unsigned int num_stats;
+       int rc, i;
+
+       LASSERT(obd->obd_stats == NULL);
+       LASSERT(obd->obd_proc_entry != NULL);
+       LASSERT(obd->obd_cntr_base == 0);
+
+       num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
+               num_private_stats - 1 /* o_owner */;
+       stats = lprocfs_alloc_stats(num_stats, 0);
+       if (stats == NULL)
+               return -ENOMEM;
+
+       lprocfs_init_ops_stats(num_private_stats, stats);
+
+       for (i = num_private_stats; i < num_stats; i++) {
+               /* If this LBUGs, it is likely that an obd
+                * operation was added to struct obd_ops in
+                * <obd.h>, and that the corresponding line item
+                * LPROCFS_OBD_OP_INIT(.., .., opname)
+                * is missing from the list above. */
+               LASSERTF(stats->ls_cnt_header[i].lc_name != NULL,
+                        "Missing obd_stat initializer obd_op "
+                        "operation at offset %d.\n", i - num_private_stats);
+       }
+       rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats);
+       if (rc < 0) {
+               lprocfs_free_stats(&stats);
+       } else {
+               obd->obd_stats  = stats;
+               obd->obd_cntr_base = num_private_stats;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_obd_stats);
+
+void lprocfs_free_obd_stats(struct obd_device *obd)
+{
+       if (obd->obd_stats)
+               lprocfs_free_stats(&obd->obd_stats);
+}
+EXPORT_SYMBOL(lprocfs_free_obd_stats);
+
+#define LPROCFS_MD_OP_INIT(base, stats, op)                         \
+do {                                                               \
+       unsigned int coffset = base + MD_COUNTER_OFFSET(op);        \
+       LASSERT(coffset < stats->ls_num);                              \
+       lprocfs_counter_init(stats, coffset, 0, #op, "reqs");      \
+} while (0)
+
+void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, close);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, create);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, link);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, rename);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, sync);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock);
+}
+EXPORT_SYMBOL(lprocfs_init_mps_stats);
+
+int lprocfs_alloc_md_stats(struct obd_device *obd,
+                          unsigned num_private_stats)
+{
+       struct lprocfs_stats *stats;
+       unsigned int num_stats;
+       int rc, i;
+
+       LASSERT(obd->md_stats == NULL);
+       LASSERT(obd->obd_proc_entry != NULL);
+       LASSERT(obd->md_cntr_base == 0);
+
+       num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) +
+                   num_private_stats;
+       stats = lprocfs_alloc_stats(num_stats, 0);
+       if (stats == NULL)
+               return -ENOMEM;
+
+       lprocfs_init_mps_stats(num_private_stats, stats);
+
+       for (i = num_private_stats; i < num_stats; i++) {
+               if (stats->ls_cnt_header[i].lc_name == NULL) {
+                       CERROR("Missing md_stat initializer md_op "
+                              "operation at offset %d. Aborting.\n",
+                              i - num_private_stats);
+                       LBUG();
+               }
+       }
+       rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats);
+       if (rc < 0) {
+               lprocfs_free_stats(&stats);
+       } else {
+               obd->md_stats  = stats;
+               obd->md_cntr_base = num_private_stats;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_md_stats);
+
+void lprocfs_free_md_stats(struct obd_device *obd)
+{
+       struct lprocfs_stats *stats = obd->md_stats;
+
+       if (stats != NULL) {
+               obd->md_stats = NULL;
+               obd->md_cntr_base = 0;
+               lprocfs_free_stats(&stats);
+       }
+}
+EXPORT_SYMBOL(lprocfs_free_md_stats);
+
+void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_ENQUEUE - LDLM_FIRST_OPC,
+                            0, "ldlm_enqueue", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_CONVERT - LDLM_FIRST_OPC,
+                            0, "ldlm_convert", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_CANCEL - LDLM_FIRST_OPC,
+                            0, "ldlm_cancel", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_BL_CALLBACK - LDLM_FIRST_OPC,
+                            0, "ldlm_bl_callback", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_CP_CALLBACK - LDLM_FIRST_OPC,
+                            0, "ldlm_cp_callback", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_GL_CALLBACK - LDLM_FIRST_OPC,
+                            0, "ldlm_gl_callback", "reqs");
+}
+EXPORT_SYMBOL(lprocfs_init_ldlm_stats);
+
+int lprocfs_exp_rd_nid(char *page, char **start, off_t off, int count,
+                        int *eof,  void *data)
+{
+       struct obd_export *exp = data;
+       LASSERT(exp != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%s\n", obd_export_nid2str(exp));
+}
+
+struct exp_uuid_cb_data {
+       char               *page;
+       int                  count;
+       int                 *eof;
+       int                 *len;
+};
+
+static void
+lprocfs_exp_rd_cb_data_init(struct exp_uuid_cb_data *cb_data, char *page,
+                           int count, int *eof, int *len)
+{
+       cb_data->page = page;
+       cb_data->count = count;
+       cb_data->eof = eof;
+       cb_data->len = len;
+}
+
+int lprocfs_exp_print_uuid(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                          struct hlist_node *hnode, void *cb_data)
+
+{
+       struct obd_export *exp = cfs_hash_object(hs, hnode);
+       struct exp_uuid_cb_data *data = (struct exp_uuid_cb_data *)cb_data;
+
+       if (exp->exp_nid_stats)
+               *data->len += snprintf((data->page + *data->len),
+                                      data->count, "%s\n",
+                                      obd_uuid2str(&exp->exp_client_uuid));
+       return 0;
+}
+
+int lprocfs_exp_rd_uuid(char *page, char **start, off_t off, int count,
+                       int *eof,  void *data)
+{
+       struct nid_stat *stats = (struct nid_stat *)data;
+       struct exp_uuid_cb_data cb_data;
+       struct obd_device *obd = stats->nid_obd;
+       int len = 0;
+
+       *eof = 1;
+       page[0] = '\0';
+       lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len);
+       cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+                             lprocfs_exp_print_uuid, &cb_data);
+       return (*cb_data.len);
+}
+
+int lprocfs_exp_print_hash(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                          struct hlist_node *hnode, void *cb_data)
+
+{
+       struct exp_uuid_cb_data *data = cb_data;
+       struct obd_export       *exp = cfs_hash_object(hs, hnode);
+
+       if (exp->exp_lock_hash != NULL) {
+               if (!*data->len) {
+                       *data->len += cfs_hash_debug_header(data->page,
+                                                           data->count);
+               }
+               *data->len += cfs_hash_debug_str(hs, data->page + *data->len,
+                                                data->count);
+       }
+
+       return 0;
+}
+
+int lprocfs_exp_rd_hash(char *page, char **start, off_t off, int count,
+                       int *eof,  void *data)
+{
+       struct nid_stat *stats = (struct nid_stat *)data;
+       struct exp_uuid_cb_data cb_data;
+       struct obd_device *obd = stats->nid_obd;
+       int len = 0;
+
+       *eof = 1;
+       page[0] = '\0';
+       lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len);
+
+       cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+                             lprocfs_exp_print_hash, &cb_data);
+       return (*cb_data.len);
+}
+
+int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
+                                       int count, int *eof,  void *data)
+{
+       *eof = 1;
+       return snprintf(page, count, "%s\n",
+                       "Write into this file to clear all nid stats and "
+                       "stale nid entries");
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_read);
+
+static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
+{
+       struct nid_stat *stat = obj;
+       ENTRY;
+
+       CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count));
+       if (atomic_read(&stat->nid_exp_ref_count) == 1) {
+               /* object has only hash references. */
+               spin_lock(&stat->nid_obd->obd_nid_lock);
+               list_move(&stat->nid_list, data);
+               spin_unlock(&stat->nid_obd->obd_nid_lock);
+               RETURN(1);
+       }
+       /* we still hold a reference to the object - only clear its data */
+       if (stat->nid_stats)
+               lprocfs_clear_stats(stat->nid_stats);
+
+       RETURN(0);
+}
+
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct nid_stat *client_stat;
+       LIST_HEAD(free_list);
+
+       cfs_hash_cond_del(obd->obd_nid_stats_hash,
+                         lprocfs_nid_stats_clear_write_cb, &free_list);
+
+       while (!list_empty(&free_list)) {
+               client_stat = list_entry(free_list.next, struct nid_stat,
+                                            nid_list);
+               list_del_init(&client_stat->nid_list);
+               lprocfs_free_client_stats(client_stat);
+       }
+
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
+
+int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
+{
+       struct nid_stat *new_stat, *old_stat;
+       struct obd_device *obd = NULL;
+       proc_dir_entry_t *entry;
+       char *buffer = NULL;
+       int rc = 0;
+       ENTRY;
+
+       *newnid = 0;
+
+       if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry ||
+           !exp->exp_obd->obd_nid_stats_hash)
+               RETURN(-EINVAL);
+
+       /* Do not test against zero because, as Eric says:
+        * you may only test a nid against another nid, or LNET_NID_ANY.
+        * Anything else is nonsense. */
+       if (!nid || *nid == LNET_NID_ANY)
+               RETURN(0);
+
+       obd = exp->exp_obd;
+
+       CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash);
+
+       OBD_ALLOC_PTR(new_stat);
+       if (new_stat == NULL)
+               RETURN(-ENOMEM);
+
+       new_stat->nid     = *nid;
+       new_stat->nid_obd = exp->exp_obd;
+       /* set the default refcount to 1 to balance obd_disconnect */
+       atomic_set(&new_stat->nid_exp_ref_count, 1);
+
+       old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash,
+                                          nid, &new_stat->nid_hash);
+       CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
+              old_stat, libcfs_nid2str(*nid),
+              atomic_read(&new_stat->nid_exp_ref_count));
+
+       /* We need to release old stats because lprocfs_exp_cleanup() hasn't
+        * been and will never be called. */
+       if (exp->exp_nid_stats) {
+               nidstat_putref(exp->exp_nid_stats);
+               exp->exp_nid_stats = NULL;
+       }
+
+       /* Return -EALREADY here so that we know that the /proc
+        * entry already has been created */
+       if (old_stat != new_stat) {
+               exp->exp_nid_stats = old_stat;
+               GOTO(destroy_new, rc = -EALREADY);
+       }
+       /* not found - create */
+       OBD_ALLOC(buffer, LNET_NIDSTR_SIZE);
+       if (buffer == NULL)
+               GOTO(destroy_new, rc = -ENOMEM);
+
+       memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE);
+       new_stat->nid_proc = lprocfs_register(buffer,
+                                             obd->obd_proc_exports_entry,
+                                             NULL, NULL);
+       OBD_FREE(buffer, LNET_NIDSTR_SIZE);
+
+       if (new_stat->nid_proc == NULL) {
+               CERROR("Error making export directory for nid %s\n",
+                      libcfs_nid2str(*nid));
+               GOTO(destroy_new_ns, rc = -ENOMEM);
+       }
+
+       entry = lprocfs_add_simple(new_stat->nid_proc, "uuid",
+                                  lprocfs_exp_rd_uuid, NULL, new_stat, NULL);
+       if (IS_ERR(entry)) {
+               CWARN("Error adding the NID stats file\n");
+               rc = PTR_ERR(entry);
+               GOTO(destroy_new_ns, rc);
+       }
+
+       entry = lprocfs_add_simple(new_stat->nid_proc, "hash",
+                                  lprocfs_exp_rd_hash, NULL, new_stat, NULL);
+       if (IS_ERR(entry)) {
+               CWARN("Error adding the hash file\n");
+               rc = PTR_ERR(entry);
+               GOTO(destroy_new_ns, rc);
+       }
+
+       exp->exp_nid_stats = new_stat;
+       *newnid = 1;
+       /* protect against concurrent additions to the list; no locking
+        * is needed on destroy */
+       spin_lock(&obd->obd_nid_lock);
+       list_add(&new_stat->nid_list, &obd->obd_nid_stats);
+       spin_unlock(&obd->obd_nid_lock);
+
+       RETURN(rc);
+
+destroy_new_ns:
+       if (new_stat->nid_proc != NULL)
+               lprocfs_remove(&new_stat->nid_proc);
+       cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
+
+destroy_new:
+       nidstat_putref(new_stat);
+       OBD_FREE_PTR(new_stat);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lprocfs_exp_setup);
+
+int lprocfs_exp_cleanup(struct obd_export *exp)
+{
+       struct nid_stat *stat = exp->exp_nid_stats;
+
+       if (!stat || !exp->exp_obd)
+               RETURN(0);
+
+       nidstat_putref(exp->exp_nid_stats);
+       exp->exp_nid_stats = NULL;
+
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_exp_cleanup);
+
+int lprocfs_write_helper(const char *buffer, unsigned long count,
+                        int *val)
+{
+       return lprocfs_write_frac_helper(buffer, count, val, 1);
+}
+EXPORT_SYMBOL(lprocfs_write_helper);
+
+int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+                             int *val, int mult)
+{
+       char kernbuf[20], *end, *pbuf;
+
+       if (count > (sizeof(kernbuf) - 1))
+               return -EINVAL;
+
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+
+       kernbuf[count] = '\0';
+       pbuf = kernbuf;
+       if (*pbuf == '-') {
+               mult = -mult;
+               pbuf++;
+       }
+
+       *val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+       if (pbuf == end)
+               return -EINVAL;
+
+       if (end != NULL && *end == '.') {
+               int temp_val, pow = 1;
+               int i;
+
+               pbuf = end + 1;
+               if (strlen(pbuf) > 5)
+                       pbuf[5] = '\0'; /* only allow 5 fractional digits */
+
+               temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+
+               if (pbuf < end) {
+                       for (i = 0; i < (end - pbuf); i++)
+                               pow *= 10;
+
+                       *val += temp_val / pow;
+               }
+       }
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_helper);
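+/* For example (hypothetical values): writing the string "2.5" with mult
+ * 1000 stores 2500 in *val, and "-2.5" stores -2500; at most 5 fractional
+ * digits are honoured. */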
+
+int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val,
+                            int mult)
+{
+       long decimal_val, frac_val;
+       int prtn;
+
+       if (count < 10)
+               return -EINVAL;
+
+       decimal_val = val / mult;
+       prtn = snprintf(buffer, count, "%ld", decimal_val);
+       frac_val = val % mult;
+
+       if (prtn < (count - 4) && frac_val > 0) {
+               long temp_frac;
+               int i, temp_mult = 1, frac_bits = 0;
+
+               temp_frac = frac_val * 10;
+               buffer[prtn++] = '.';
+               while (frac_bits < 2 && (temp_frac / mult) < 1) {
+                       /* only 2 fractional digits are reserved */
+                       buffer[prtn++] = '0';
+                       temp_frac *= 10;
+                       frac_bits++;
+               }
+               /*
+                * Cases to consider:
+                *      1. #echo x.00 > /proc/xxx       output result : x
+                *      2. #echo x.0x > /proc/xxx       output result : x.0x
+                *      3. #echo x.x0 > /proc/xxx       output result : x.x
+                *      4. #echo x.xx > /proc/xxx       output result : x.xx
+                *      Only 2 fractional digits are kept.
+                */
+               for (i = 0; i < (5 - prtn); i++)
+                       temp_mult *= 10;
+
+               frac_bits = min((int)count - prtn, 3 - frac_bits);
+               prtn += snprintf(buffer + prtn, frac_bits, "%ld",
+                                frac_val * temp_mult / mult);
+
+               prtn--;
+               while (buffer[prtn] < '1' || buffer[prtn] > '9') {
+                       prtn--;
+                       if (buffer[prtn] == '.') {
+                               prtn--;
+                               break;
+                       }
+               }
+               prtn++;
+       }
+       buffer[prtn++] = '\n';
+       return prtn;
+}
+EXPORT_SYMBOL(lprocfs_read_frac_helper);
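+/* The reverse direction: for example, val = 2500 with mult = 1000 is
+ * printed as "2.5\n" and val = 2000 simply as "2\n"; roughly two
+ * fractional digits are emitted, with trailing zeros trimmed. */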
+
+int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val)
+{
+       return lprocfs_write_frac_u64_helper(buffer, count, val, 1);
+}
+EXPORT_SYMBOL(lprocfs_write_u64_helper);
+
+int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
+                             __u64 *val, int mult)
+{
+       char kernbuf[22], *end, *pbuf;
+       __u64 whole, frac = 0, units;
+       unsigned frac_d = 1;
+
+       if (count > (sizeof(kernbuf) - 1))
+               return -EINVAL;
+
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+
+       kernbuf[count] = '\0';
+       pbuf = kernbuf;
+       if (*pbuf == '-') {
+               mult = -mult;
+               pbuf++;
+       }
+
+       whole = simple_strtoull(pbuf, &end, 10);
+       if (pbuf == end)
+               return -EINVAL;
+
+       if (end != NULL && *end == '.') {
+               int i;
+               pbuf = end + 1;
+
+               /* need to limit frac_d to a __u32 */
+               if (strlen(pbuf) > 10)
+                       pbuf[10] = '\0';
+
+               frac = simple_strtoull(pbuf, &end, 10);
+               /* count decimal places */
+               for (i = 0; i < (end - pbuf); i++)
+                       frac_d *= 10;
+       }
+
+       units = 1;
+       switch (*end) {
+       case 'p': case 'P':
+               units <<= 10;
+               /* fall through */
+       case 't': case 'T':
+               units <<= 10;
+               /* fall through */
+       case 'g': case 'G':
+               units <<= 10;
+               /* fall through */
+       case 'm': case 'M':
+               units <<= 10;
+               /* fall through */
+       case 'k': case 'K':
+               units <<= 10;
+       }
+       /* Specified units override the multiplier */
+       if (units)
+               mult = mult < 0 ? -units : units;
+
+       frac *= mult;
+       do_div(frac, frac_d);
+       *val = whole * mult + frac;
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
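+/* A worked example (hypothetical input): the string "1.5k" parses to 1536
+ * (1.5 * 2^10) and "2m" to 2097152 (2 * 2^20); a recognized unit suffix
+ * overrides the mult argument. */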
+
+static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len)
+{
+       size_t l2;
+
+       l2 = strlen(s2);
+       if (!l2)
+               return (char *)s1;
+       while (len >= l2) {
+               len--;
+               if (!memcmp(s1, s2, l2))
+                       return (char *)s1;
+               s1++;
+       }
+       return NULL;
+}
+
+/**
+ * Find the string \a name in the input \a buffer, and return a pointer to the
+ * value immediately following \a name, reducing \a count appropriately.
+ * If \a name is not found the original \a buffer is returned.
+ */
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+                               unsigned long *count)
+{
+       char *val;
+       size_t buflen = *count;
+
+       /* there is no strnstr() in rhel5 and ubuntu kernels */
+       val = lprocfs_strnstr(buffer, name, buflen);
+       if (val == NULL)
+               return (char *)buffer;
+
+       val += strlen(name);                         /* skip prefix */
+       while (val < buffer + buflen && isspace(*val)) /* skip separator */
+               val++;
+
+       *count = 0;
+       while (val < buffer + buflen && isalnum(*val)) {
+               ++*count;
+               ++val;
+       }
+
+       return val - *count;
+}
+EXPORT_SYMBOL(lprocfs_find_named_value);
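+/* For instance (hypothetical buffer): searching "limit 4096\n" for the
+ * name "limit" returns a pointer to "4096" and sets *count to 4; if the
+ * name is absent, the original buffer is returned and *count is left
+ * untouched. */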
+
+int lprocfs_seq_create(proc_dir_entry_t *parent,
+                      const char *name,
+                      mode_t mode,
+                      const struct file_operations *seq_fops,
+                      void *data)
+{
+       struct proc_dir_entry *entry;
+       ENTRY;
+
+       /* Disallow secretly (un)writable entries. */
+       LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0));
+
+       LPROCFS_WRITE_ENTRY();
+       entry = create_proc_entry(name, mode, parent);
+       if (entry) {
+               entry->proc_fops = seq_fops;
+               entry->data = data;
+       }
+       LPROCFS_WRITE_EXIT();
+
+       if (entry == NULL)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_seq_create);
+
+int lprocfs_obd_seq_create(struct obd_device *dev,
+                          const char *name,
+                          mode_t mode,
+                          const struct file_operations *seq_fops,
+                          void *data)
+{
+       return (lprocfs_seq_create(dev->obd_proc_entry, name,
+                                  mode, seq_fops, data));
+}
+EXPORT_SYMBOL(lprocfs_obd_seq_create);
+
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{
+       if (value >= OBD_HIST_MAX)
+               value = OBD_HIST_MAX - 1;
+
+       spin_lock(&oh->oh_lock);
+       oh->oh_buckets[value]++;
+       spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally);
+
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{
+       unsigned int val;
+
+       for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++)
+               ;
+
+       lprocfs_oh_tally(oh, val);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_log2);
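+/* lprocfs_oh_tally_log2() buckets by the ceiling of log2: for example, a
+ * value of 4096 lands in bucket 12 (2^12 == 4096) and 4097 in bucket 13. */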
+
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{
+       unsigned long ret = 0;
+       int i;
+
+       for (i = 0; i < OBD_HIST_MAX; i++)
+               ret +=  oh->oh_buckets[i];
+       return ret;
+}
+EXPORT_SYMBOL(lprocfs_oh_sum);
+
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{
+       spin_lock(&oh->oh_lock);
+       memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets));
+       spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_clear);
+
+int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
+                       int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       int c = 0;
+
+       if (obd == NULL)
+               return 0;
+
+       c += cfs_hash_debug_header(page, count);
+       c += cfs_hash_debug_str(obd->obd_uuid_hash, page + c, count - c);
+       c += cfs_hash_debug_str(obd->obd_nid_hash, page + c, count - c);
+       c += cfs_hash_debug_str(obd->obd_nid_stats_hash, page + c, count - c);
+
+       return c;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_hash);
+
+int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       int len = 0, size;
+
+       LASSERT(obd != NULL);
+       LASSERT(count >= 0);
+
+       /*
+        * Set start of user data returned to page + off since the user may
+        * have requested to read much smaller than what we need to read.
+        */
+       *start = page + off;
+
+       /*
+        * We know we are allocated a page here.  Also we know that this
+        * function will not need to write more than a page so we can
+        * truncate at PAGE_CACHE_SIZE.
+        */
+       size = min(count + (int)off + 1, (int)PAGE_CACHE_SIZE);
+
+       /* Initialize the page */
+       memset(page, 0, size);
+
+       if (lprocfs_obd_snprintf(&page, size, &len, "status: ") <= 0)
+               goto out;
+       if (obd->obd_max_recoverable_clients == 0) {
+               if (lprocfs_obd_snprintf(&page, size, &len, "INACTIVE\n") <= 0)
+                       goto out;
+
+               goto fclose;
+       }
+
+       /* sampled unlocked, but really... */
+       if (obd->obd_recovering == 0) {
+               if (lprocfs_obd_snprintf(&page, size, &len, "COMPLETE\n") <= 0)
+                       goto out;
+               if (lprocfs_obd_snprintf(&page, size, &len,
+                                        "recovery_start: %lu\n",
+                                        obd->obd_recovery_start) <= 0)
+                       goto out;
+               if (lprocfs_obd_snprintf(&page, size, &len,
+                                        "recovery_duration: %lu\n",
+                                        obd->obd_recovery_end -
+                                        obd->obd_recovery_start) <= 0)
+                       goto out;
+               /* Number of clients that have completed recovery */
+               if (lprocfs_obd_snprintf(&page, size, &len,
+                                        "completed_clients: %d/%d\n",
+                                        obd->obd_max_recoverable_clients -
+                                        obd->obd_stale_clients,
+                                        obd->obd_max_recoverable_clients) <= 0)
+                       goto out;
+               if (lprocfs_obd_snprintf(&page, size, &len,
+                                        "replayed_requests: %d\n",
+                                        obd->obd_replayed_requests) <= 0)
+                       goto out;
+               if (lprocfs_obd_snprintf(&page, size, &len,
+                                        "last_transno: "LPD64"\n",
+                                        obd->obd_next_recovery_transno - 1) <= 0)
+                       goto out;
+               if (lprocfs_obd_snprintf(&page, size, &len, "VBR: %s\n",
+                                        obd->obd_version_recov ?
+                                        "ENABLED" : "DISABLED") <= 0)
+                       goto out;
+               if (lprocfs_obd_snprintf(&page, size, &len, "IR: %s\n",
+                                        obd->obd_no_ir ?
+                                        "DISABLED" : "ENABLED") <= 0)
+                       goto out;
+               goto fclose;
+       }
+
+       if (lprocfs_obd_snprintf(&page, size, &len, "RECOVERING\n") <= 0)
+               goto out;
+       if (lprocfs_obd_snprintf(&page, size, &len, "recovery_start: %lu\n",
+                                obd->obd_recovery_start) <= 0)
+               goto out;
+       if (lprocfs_obd_snprintf(&page, size, &len, "time_remaining: %lu\n",
+                                cfs_time_current_sec() >=
+                                obd->obd_recovery_start +
+                                obd->obd_recovery_timeout ? 0 :
+                                obd->obd_recovery_start +
+                                obd->obd_recovery_timeout -
+                                cfs_time_current_sec()) <= 0)
+               goto out;
+       if (lprocfs_obd_snprintf(&page, size, &len, "connected_clients: %d/%d\n",
+                                atomic_read(&obd->obd_connected_clients),
+                                obd->obd_max_recoverable_clients) <= 0)
+               goto out;
+       /* Number of clients that have completed recovery */
+       if (lprocfs_obd_snprintf(&page, size, &len, "req_replay_clients: %d\n",
+                                atomic_read(&obd->obd_req_replay_clients))
+               <= 0)
+               goto out;
+       if (lprocfs_obd_snprintf(&page, size, &len, "lock_replay_clients: %d\n",
+                                atomic_read(&obd->obd_lock_replay_clients))
+               <= 0)
+               goto out;
+       if (lprocfs_obd_snprintf(&page, size, &len, "completed_clients: %d\n",
+                                atomic_read(&obd->obd_connected_clients) -
+                                atomic_read(&obd->obd_lock_replay_clients))
+               <= 0)
+               goto out;
+       if (lprocfs_obd_snprintf(&page, size, &len, "evicted_clients: %d\n",
+                                obd->obd_stale_clients) <= 0)
+               goto out;
+       if (lprocfs_obd_snprintf(&page, size, &len, "replayed_requests: %d\n",
+                                obd->obd_replayed_requests) <= 0)
+               goto out;
+       if (lprocfs_obd_snprintf(&page, size, &len, "queued_requests: %d\n",
+                                obd->obd_requests_queued_for_recovery) <= 0)
+               goto out;
+
+       if (lprocfs_obd_snprintf(&page, size, &len, "next_transno: "LPD64"\n",
+                                obd->obd_next_recovery_transno) <= 0)
+               goto out;
+
+fclose:
+       *eof = 1;
+out:
+       return min(count, len - (int)off);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_status);
+
+int lprocfs_obd_rd_ir_factor(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       LASSERT(obd != NULL);
+
+       return snprintf(page, count, "%d\n",
+                       obd->obd_recovery_ir_factor);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_ir_factor);
+
+int lprocfs_obd_wr_ir_factor(struct file *file, const char *buffer,
+                            unsigned long count, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       int val, rc;
+       LASSERT(obd != NULL);
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val < OBD_IR_FACTOR_MIN || val > OBD_IR_FACTOR_MAX)
+               return -EINVAL;
+
+       obd->obd_recovery_ir_factor = val;
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_ir_factor);
+
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       LASSERT(obd != NULL);
+
+       return snprintf(page, count, "%d\n",
+                       obd->obd_recovery_timeout);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_soft);
+
+int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer,
+                                     unsigned long count, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       int val, rc;
+       LASSERT(obd != NULL);
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       obd->obd_recovery_timeout = val;
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_soft);
+
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       LASSERT(obd != NULL);
+
+       return snprintf(page, count, "%u\n", obd->obd_recovery_time_hard);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_hard);
+
+int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer,
+                                     unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       int val, rc;
+       LASSERT(obd != NULL);
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       obd->obd_recovery_time_hard = val;
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_hard);
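+
+/*
+ * Summary note (descriptive only): the *_rd_* handlers above follow the
+ * legacy procfs read handler convention -- format into the caller-supplied
+ * page with snprintf() and return the number of bytes written -- while the
+ * *_wr_* handlers parse the user buffer with lprocfs_write_helper() and, on
+ * success, return the full byte count so the write is consumed in one pass.
+ */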
+
+int lprocfs_obd_rd_max_pages_per_rpc(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = snprintf(page, count, "%d\n", cli->cl_max_pages_per_rpc);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc);
+
+int lprocfs_target_rd_instance(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_device_target *target = &obd->u.obt;
+
+       LASSERT(obd != NULL);
+       LASSERT(target->obt_magic == OBT_MAGIC);
+       *eof = 1;
+       return snprintf(page, count, "%u\n", obd->u.obt.obt_instance);
+}
+EXPORT_SYMBOL(lprocfs_target_rd_instance);
+#endif /* LPROCFS*/
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
new file mode 100644 (file)
index 0000000..6c0de3f
--- /dev/null
@@ -0,0 +1,2209 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_object.c
+ *
+ * Lustre Object.
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+
+# include <linux/module.h>
+
+/* hash_long() */
+#include <linux/libcfs/libcfs_hash.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+#include <lu_ref.h>
+#include <linux/list.h>
+
+static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+
+/**
+ * Decrease the reference counter on an object. If the last reference is
+ * released, return the object to the cache, unless lu_object_is_dying(o)
+ * holds. In the latter case, free the object immediately.
+ */
+void lu_object_put(const struct lu_env *env, struct lu_object *o)
+{
+       struct lu_site_bkt_data *bkt;
+       struct lu_object_header *top;
+       struct lu_site    *site;
+       struct lu_object        *orig;
+       cfs_hash_bd_t       bd;
+       const struct lu_fid     *fid;
+
+       top  = o->lo_header;
+       site = o->lo_dev->ld_site;
+       orig = o;
+
+       /*
+        * Till we have fids-on-OST fully implemented, anonymous objects
+        * are possible in OSP. Such an object isn't listed in the site,
+        * so we should not remove it from the site.
+        */
+       fid = lu_object_fid(o);
+       if (fid_is_zero(fid)) {
+               LASSERT(top->loh_hash.next == NULL
+                       && top->loh_hash.pprev == NULL);
+               LASSERT(list_empty(&top->loh_lru));
+               if (!atomic_dec_and_test(&top->loh_ref))
+                       return;
+               list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+                       if (o->lo_ops->loo_object_release != NULL)
+                               o->lo_ops->loo_object_release(env, o);
+               }
+               lu_object_free(env, orig);
+               return;
+       }
+
+       cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
+       bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+
+       if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) {
+               if (lu_object_is_dying(top)) {
+
+                       /*
+                        * somebody may be waiting for this, currently only
+                        * used for cl_object, see cl_object_put_last().
+                        */
+                       wake_up_all(&bkt->lsb_marche_funebre);
+               }
+               return;
+       }
+
+       LASSERT(bkt->lsb_busy > 0);
+       bkt->lsb_busy--;
+       /*
+        * When last reference is released, iterate over object
+        * layers, and notify them that object is no longer busy.
+        */
+       list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+               if (o->lo_ops->loo_object_release != NULL)
+                       o->lo_ops->loo_object_release(env, o);
+       }
+
+       if (!lu_object_is_dying(top)) {
+               LASSERT(list_empty(&top->loh_lru));
+               list_add_tail(&top->loh_lru, &bkt->lsb_lru);
+               cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+               return;
+       }
+
+       /*
+        * If the object is dying (will not be cached), remove it
+        * from hash table and LRU.
+        *
+        * This is done with hash table and LRU lists locked. As the only
+        * way to acquire first reference to previously unreferenced
+        * object is through hash-table lookup (lu_object_find()),
+        * or LRU scanning (lu_site_purge()), that are done under hash-table
+        * and LRU lock, no race with concurrent object lookup is possible
+        * and we can safely destroy object below.
+        */
+       if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags))
+               cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash);
+       cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+       /*
+        * Object was already removed from hash and lru above, can
+        * kill it.
+        */
+       lu_object_free(env, orig);
+}
+EXPORT_SYMBOL(lu_object_put);
+
+/**
+ * Put an object and don't keep it in the cache. This is a temporary solution
+ * for multi-site objects whose layering is not constant.
+ */
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o)
+{
+       set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags);
+       return lu_object_put(env, o);
+}
+EXPORT_SYMBOL(lu_object_put_nocache);
+
+/**
+ * Kill the object and take it out of LRU cache.
+ * Currently used by client code for layout change.
+ */
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
+{
+       struct lu_object_header *top;
+
+       top = o->lo_header;
+       set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags);
+       if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) {
+               cfs_hash_t *obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+               cfs_hash_bd_t bd;
+
+               cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1);
+               list_del_init(&top->loh_lru);
+               cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
+               cfs_hash_bd_unlock(obj_hash, &bd, 1);
+       }
+}
+EXPORT_SYMBOL(lu_object_unhash);
+
+/**
+ * Allocate new object.
+ *
+ * This follows object creation protocol, described in the comment within
+ * struct lu_device_operations definition.
+ */
+static struct lu_object *lu_object_alloc(const struct lu_env *env,
+                                        struct lu_device *dev,
+                                        const struct lu_fid *f,
+                                        const struct lu_object_conf *conf)
+{
+       struct lu_object *scan;
+       struct lu_object *top;
+       struct list_head *layers;
+       int clean;
+       int result;
+       ENTRY;
+
+       /*
+        * Create top-level object slice. This will also create
+        * lu_object_header.
+        */
+       top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
+       if (top == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+       if (IS_ERR(top))
+               RETURN(top);
+       /*
+        * This is the only place where object fid is assigned. It's constant
+        * after this point.
+        */
+       top->lo_header->loh_fid = *f;
+       layers = &top->lo_header->loh_layers;
+       do {
+               /*
+                * Call ->loo_object_init() repeatedly, until no more new
+                * object slices are created.
+                */
+               clean = 1;
+               list_for_each_entry(scan, layers, lo_linkage) {
+                       if (scan->lo_flags & LU_OBJECT_ALLOCATED)
+                               continue;
+                       clean = 0;
+                       scan->lo_header = top->lo_header;
+                       result = scan->lo_ops->loo_object_init(env, scan, conf);
+                       if (result != 0) {
+                               lu_object_free(env, top);
+                               RETURN(ERR_PTR(result));
+                       }
+                       scan->lo_flags |= LU_OBJECT_ALLOCATED;
+               }
+       } while (!clean);
+
+       list_for_each_entry_reverse(scan, layers, lo_linkage) {
+               if (scan->lo_ops->loo_object_start != NULL) {
+                       result = scan->lo_ops->loo_object_start(env, scan);
+                       if (result != 0) {
+                               lu_object_free(env, top);
+                               RETURN(ERR_PTR(result));
+                       }
+               }
+       }
+
+       lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED);
+       RETURN(top);
+}
+
+/**
+ * Free an object.
+ */
+static void lu_object_free(const struct lu_env *env, struct lu_object *o)
+{
+       struct lu_site_bkt_data *bkt;
+       struct lu_site    *site;
+       struct lu_object        *scan;
+       struct list_head              *layers;
+       struct list_head               splice;
+
+       site   = o->lo_dev->ld_site;
+       layers = &o->lo_header->loh_layers;
+       bkt    = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid);
+       /*
+        * First call ->loo_object_delete() method to release all resources.
+        */
+       list_for_each_entry_reverse(scan, layers, lo_linkage) {
+               if (scan->lo_ops->loo_object_delete != NULL)
+                       scan->lo_ops->loo_object_delete(env, scan);
+       }
+
+       /*
+        * Then, splice object layers into stand-alone list, and call
+        * ->loo_object_free() on all layers to free memory. Splice is
+        * necessary, because lu_object_header is freed together with the
+        * top-level slice.
+        */
+       INIT_LIST_HEAD(&splice);
+       list_splice_init(layers, &splice);
+       while (!list_empty(&splice)) {
+               /*
+                * Free layers in bottom-to-top order, so that object header
+                * lives as long as possible and ->loo_object_free() methods
+                * can look at its contents.
+                */
+               o = container_of0(splice.prev, struct lu_object, lo_linkage);
+               list_del_init(&o->lo_linkage);
+               LASSERT(o->lo_ops->loo_object_free != NULL);
+               o->lo_ops->loo_object_free(env, o);
+       }
+
+       if (waitqueue_active(&bkt->lsb_marche_funebre))
+               wake_up_all(&bkt->lsb_marche_funebre);
+}
+
+/**
+ * Free \a nr objects from the cold end of the site LRU list.
+ */
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
+{
+       struct lu_object_header *h;
+       struct lu_object_header *temp;
+       struct lu_site_bkt_data *bkt;
+       cfs_hash_bd_t       bd;
+       cfs_hash_bd_t       bd2;
+       struct list_head               dispose;
+       int                   did_sth;
+       int                   start;
+       int                   count;
+       int                   bnr;
+       int                   i;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU))
+               RETURN(0);
+
+       INIT_LIST_HEAD(&dispose);
+       /*
+        * Under LRU list lock, scan LRU list and move unreferenced objects to
+        * the dispose list, removing them from LRU and hash table.
+        */
+       start = s->ls_purge_start;
+       bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1;
+ again:
+       did_sth = 0;
+       cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+               if (i < start)
+                       continue;
+               count = bnr;
+               cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1);
+               bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+
+               list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) {
+                       LASSERT(atomic_read(&h->loh_ref) == 0);
+
+                       cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2);
+                       LASSERT(bd.bd_bucket == bd2.bd_bucket);
+
+                       cfs_hash_bd_del_locked(s->ls_obj_hash,
+                                              &bd2, &h->loh_hash);
+                       list_move(&h->loh_lru, &dispose);
+                       if (did_sth == 0)
+                               did_sth = 1;
+
+                       if (nr != ~0 && --nr == 0)
+                               break;
+
+                       if (count > 0 && --count == 0)
+                               break;
+
+               }
+               cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1);
+               cond_resched();
+               /*
+                * Free everything on the dispose list. This is safe against
+                * races due to the reasons described in lu_object_put().
+                */
+               while (!list_empty(&dispose)) {
+                       h = container_of0(dispose.next,
+                                         struct lu_object_header, loh_lru);
+                       list_del_init(&h->loh_lru);
+                       lu_object_free(env, lu_object_top(h));
+                       lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED);
+               }
+
+               if (nr == 0)
+                       break;
+       }
+
+       if (nr != 0 && did_sth && start != 0) {
+               start = 0; /* restart from the first bucket */
+               goto again;
+       }
+       /* race on s->ls_purge_start, but nobody cares */
+       s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash);
+
+       return nr;
+}
+EXPORT_SYMBOL(lu_site_purge);
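+
+/*
+ * Usage note: nr == ~0 acts as "purge everything"; see lu_stack_fini() later
+ * in this file, which calls lu_site_purge(env, site, ~0) both before and
+ * after finalizing the devices in the stack.
+ */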
+
+/*
+ * Object printing.
+ *
+ * Code below has to jump through certain loops to output object description
+ * into libcfs_debug_msg-based log. The problem is that lu_object_print()
+ * composes object description from strings that are parts of _lines_ of
+ * output (i.e., strings that are not terminated by newline). This doesn't fit
+ * very well into libcfs_debug_msg() interface that assumes that each message
+ * supplied to it is a self-contained output line.
+ *
+ * To work around this, strings are collected in a temporary buffer
+ * (implemented as a value of lu_global_key), until a terminating newline
+ * character is detected.
+ */
+
+enum {
+       /**
+        * Maximal line size.
+        *
+        * XXX overflow is not handled correctly.
+        */
+       LU_CDEBUG_LINE = 512
+};
+
+struct lu_cdebug_data {
+       /**
+        * Temporary buffer.
+        */
+       char lck_area[LU_CDEBUG_LINE];
+};
+
+/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */
+LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data);
+
+/**
+ * Key, holding temporary buffer. This key is registered very early by
+ * lu_global_init().
+ */
+struct lu_context_key lu_global_key = {
+       .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD |
+                   LCT_MG_THREAD | LCT_CL_THREAD,
+       .lct_init = lu_global_key_init,
+       .lct_fini = lu_global_key_fini
+};
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+                     void *cookie, const char *format, ...)
+{
+       struct libcfs_debug_msg_data *msgdata = cookie;
+       struct lu_cdebug_data   *key;
+       int used;
+       int complete;
+       va_list args;
+
+       va_start(args, format);
+
+       key = lu_context_key_get(&env->le_ctx, &lu_global_key);
+       LASSERT(key != NULL);
+
+       used = strlen(key->lck_area);
+       complete = format[strlen(format) - 1] == '\n';
+       /*
+        * Append new chunk to the buffer.
+        */
+       vsnprintf(key->lck_area + used,
+                 ARRAY_SIZE(key->lck_area) - used, format, args);
+       if (complete) {
+               if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys))
+                       libcfs_debug_msg(msgdata, "%s", key->lck_area);
+               key->lck_area[0] = 0;
+       }
+       va_end(args);
+       return 0;
+}
+EXPORT_SYMBOL(lu_cdebug_printer);
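+
+/*
+ * Illustrative example (hypothetical calls): lu_cdebug_printer() buffers
+ * chunks until a chunk ends with '\n', so
+ *
+ *     lu_cdebug_printer(env, msgdata, "fid "DFID, PFID(fid));
+ *     lu_cdebug_printer(env, msgdata, " refs: %d\n", refs);
+ *
+ * emits a single line through libcfs_debug_msg() rather than two partial
+ * ones.
+ */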
+
+/**
+ * Print object header.
+ */
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t printer,
+                           const struct lu_object_header *hdr)
+{
+       (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]",
+                  hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref),
+                  PFID(&hdr->loh_fid),
+                  hlist_unhashed(&hdr->loh_hash) ? "" : " hash",
+                  list_empty((struct list_head *)&hdr->loh_lru) ? "" : " lru",
+                  hdr->loh_attr & LOHA_EXISTS ? " exist" : "");
+}
+EXPORT_SYMBOL(lu_object_header_print);
+
+/**
+ * Print human readable representation of the \a o to the \a printer.
+ */
+void lu_object_print(const struct lu_env *env, void *cookie,
+                    lu_printer_t printer, const struct lu_object *o)
+{
+       static const char ruler[] = "........................................";
+       struct lu_object_header *top;
+       int depth;
+
+       top = o->lo_header;
+       lu_object_header_print(env, cookie, printer, top);
+       (*printer)(env, cookie, "{ \n");
+       list_for_each_entry(o, &top->loh_layers, lo_linkage) {
+               depth = o->lo_depth + 4;
+
+               /*
+                * print `.' \a depth times followed by type name and address
+                */
+               (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler,
+                          o->lo_dev->ld_type->ldt_name, o);
+               if (o->lo_ops->loo_object_print != NULL)
+                       o->lo_ops->loo_object_print(env, cookie, printer, o);
+               (*printer)(env, cookie, "\n");
+       }
+       (*printer)(env, cookie, "} header@%p\n", top);
+}
+EXPORT_SYMBOL(lu_object_print);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o)
+{
+       struct lu_object_header *top;
+
+       top = o->lo_header;
+       list_for_each_entry(o, &top->loh_layers, lo_linkage) {
+               if (o->lo_ops->loo_object_invariant != NULL &&
+                   !o->lo_ops->loo_object_invariant(o))
+                       return 0;
+       }
+       return 1;
+}
+EXPORT_SYMBOL(lu_object_invariant);
+
+static struct lu_object *htable_lookup(struct lu_site *s,
+                                      cfs_hash_bd_t *bd,
+                                      const struct lu_fid *f,
+                                      wait_queue_t *waiter,
+                                      __u64 *version)
+{
+       struct lu_site_bkt_data *bkt;
+       struct lu_object_header *h;
+       struct hlist_node       *hnode;
+       __u64  ver = cfs_hash_bd_version_get(bd);
+
+       if (*version == ver)
+               return NULL;
+
+       *version = ver;
+       bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd);
+       /*
+        * cfs_hash_bd_peek_locked() is a somewhat "internal" function
+        * of cfs_hash; it doesn't add a refcount on the object.
+        */
+       hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f);
+       if (hnode == NULL) {
+               lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS);
+               return NULL;
+       }
+
+       h = container_of0(hnode, struct lu_object_header, loh_hash);
+       if (likely(!lu_object_is_dying(h))) {
+               cfs_hash_get(s->ls_obj_hash, hnode);
+               lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
+               list_del_init(&h->loh_lru);
+               return lu_object_top(h);
+       }
+
+       /*
+        * Lookup found an object being destroyed; this object cannot be
+        * returned (to assure that references to dying objects are eventually
+        * drained), and moreover, the lookup has to wait until the object is
+        * freed.
+        */
+
+       init_waitqueue_entry_current(waiter);
+       add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+       return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * Search cache for an object with the fid \a f. If such object is found,
+ * return it. Otherwise, create new object, insert it into cache and return
+ * it. In any case, additional reference is acquired on the returned object.
+ */
+struct lu_object *lu_object_find(const struct lu_env *env,
+                                struct lu_device *dev, const struct lu_fid *f,
+                                const struct lu_object_conf *conf)
+{
+       return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf);
+}
+EXPORT_SYMBOL(lu_object_find);
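+
+/*
+ * Minimal usage sketch (hypothetical caller, for illustration only):
+ *
+ *     struct lu_object *o;
+ *
+ *     o = lu_object_find(env, dev, &fid, NULL);
+ *     if (IS_ERR(o))
+ *             return PTR_ERR(o);
+ *     ...use the object...
+ *     lu_object_put(env, o);
+ *
+ * where lu_object_put() releases the reference taken by the lookup.
+ */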
+
+static struct lu_object *lu_object_new(const struct lu_env *env,
+                                      struct lu_device *dev,
+                                      const struct lu_fid *f,
+                                      const struct lu_object_conf *conf)
+{
+       struct lu_object        *o;
+       cfs_hash_t            *hs;
+       cfs_hash_bd_t       bd;
+       struct lu_site_bkt_data *bkt;
+
+       o = lu_object_alloc(env, dev, f, conf);
+       if (unlikely(IS_ERR(o)))
+               return o;
+
+       hs = dev->ld_site->ls_obj_hash;
+       cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+       bkt = cfs_hash_bd_extra_get(hs, &bd);
+       cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+       bkt->lsb_busy++;
+       cfs_hash_bd_unlock(hs, &bd, 1);
+       return o;
+}
+
+/**
+ * Core logic of lu_object_find*() functions.
+ */
+static struct lu_object *lu_object_find_try(const struct lu_env *env,
+                                           struct lu_device *dev,
+                                           const struct lu_fid *f,
+                                           const struct lu_object_conf *conf,
+                                           wait_queue_t *waiter)
+{
+       struct lu_object      *o;
+       struct lu_object      *shadow;
+       struct lu_site  *s;
+       cfs_hash_t          *hs;
+       cfs_hash_bd_t     bd;
+       __u64             version = 0;
+
+       /*
+        * This uses standard index maintenance protocol:
+        *
+        *     - search index under lock, and return object if found;
+        *     - otherwise, unlock index, allocate new object;
+        *     - lock index and search again;
+        *     - if nothing is found (usual case), insert newly created
+        *       object into index;
+        *     - otherwise (race: other thread inserted object), free
+        *       object just allocated.
+        *     - unlock index;
+        *     - return object.
+        *
+        * For the "LOC_F_NEW" case, we are sure the object is newly created.
+        * It is unnecessary to perform lookup-alloc-lookup-insert; instead,
+        * just alloc and insert directly.
+        *
+        * If dying object is found during index search, add @waiter to the
+        * site wait-queue and return ERR_PTR(-EAGAIN).
+        */
+       if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+               return lu_object_new(env, dev, f, conf);
+
+       s  = dev->ld_site;
+       hs = s->ls_obj_hash;
+       cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+       o = htable_lookup(s, &bd, f, waiter, &version);
+       cfs_hash_bd_unlock(hs, &bd, 1);
+       if (o != NULL)
+               return o;
+
+       /*
+        * Allocate new object. This may result in rather complicated
+        * operations, including fld queries, inode loading, etc.
+        */
+       o = lu_object_alloc(env, dev, f, conf);
+       if (unlikely(IS_ERR(o)))
+               return o;
+
+       LASSERT(lu_fid_eq(lu_object_fid(o), f));
+
+       cfs_hash_bd_lock(hs, &bd, 1);
+
+       shadow = htable_lookup(s, &bd, f, waiter, &version);
+       if (likely(shadow == NULL)) {
+               struct lu_site_bkt_data *bkt;
+
+               bkt = cfs_hash_bd_extra_get(hs, &bd);
+               cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+               bkt->lsb_busy++;
+               cfs_hash_bd_unlock(hs, &bd, 1);
+               return o;
+       }
+
+       lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE);
+       cfs_hash_bd_unlock(hs, &bd, 1);
+       lu_object_free(env, o);
+       return shadow;
+}
+
+/**
+ * Much like lu_object_find(), but top level device of object is specifically
+ * \a dev rather than top level device of the site. This interface allows
+ * objects of different "stacking" to be created within the same site.
+ */
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+                                   struct lu_device *dev,
+                                   const struct lu_fid *f,
+                                   const struct lu_object_conf *conf)
+{
+       struct lu_site_bkt_data *bkt;
+       struct lu_object        *obj;
+       wait_queue_t       wait;
+
+       while (1) {
+               obj = lu_object_find_try(env, dev, f, conf, &wait);
+               if (obj != ERR_PTR(-EAGAIN))
+                       return obj;
+               /*
+                * lu_object_find_try() already added waiter into the
+                * wait queue.
+                */
+               waitq_wait(&wait, TASK_UNINTERRUPTIBLE);
+               bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f);
+               remove_wait_queue(&bkt->lsb_marche_funebre, &wait);
+       }
+}
+EXPORT_SYMBOL(lu_object_find_at);
+
+/**
+ * Find object with given fid, and return its slice belonging to given device.
+ */
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+                                      struct lu_device *dev,
+                                      const struct lu_fid *f,
+                                      const struct lu_object_conf *conf)
+{
+       struct lu_object *top;
+       struct lu_object *obj;
+
+       top = lu_object_find(env, dev, f, conf);
+       if (!IS_ERR(top)) {
+               obj = lu_object_locate(top->lo_header, dev->ld_type);
+               if (obj == NULL)
+                       lu_object_put(env, top);
+       } else
+               obj = top;
+       return obj;
+}
+EXPORT_SYMBOL(lu_object_find_slice);
+
+/**
+ * Global list of all device types.
+ */
+static LIST_HEAD(lu_device_types);
+
+int lu_device_type_init(struct lu_device_type *ldt)
+{
+       int result = 0;
+
+       INIT_LIST_HEAD(&ldt->ldt_linkage);
+       if (ldt->ldt_ops->ldto_init)
+               result = ldt->ldt_ops->ldto_init(ldt);
+       if (result == 0)
+               list_add(&ldt->ldt_linkage, &lu_device_types);
+       return result;
+}
+EXPORT_SYMBOL(lu_device_type_init);
+
+void lu_device_type_fini(struct lu_device_type *ldt)
+{
+       list_del_init(&ldt->ldt_linkage);
+       if (ldt->ldt_ops->ldto_fini)
+               ldt->ldt_ops->ldto_fini(ldt);
+}
+EXPORT_SYMBOL(lu_device_type_fini);
+
+void lu_types_stop(void)
+{
+       struct lu_device_type *ldt;
+
+       list_for_each_entry(ldt, &lu_device_types, ldt_linkage) {
+               if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop)
+                       ldt->ldt_ops->ldto_stop(ldt);
+       }
+}
+EXPORT_SYMBOL(lu_types_stop);
+
+/**
+ * Global list of all sites on this node
+ */
+static LIST_HEAD(lu_sites);
+static DEFINE_MUTEX(lu_sites_guard);
+
+/**
+ * Global environment used by site shrinker.
+ */
+static struct lu_env lu_shrink_env;
+
+struct lu_site_print_arg {
+       struct lu_env   *lsp_env;
+       void        *lsp_cookie;
+       lu_printer_t     lsp_printer;
+};
+
+static int
+lu_site_obj_print(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                 struct hlist_node *hnode, void *data)
+{
+       struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data;
+       struct lu_object_header  *h;
+
+       h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+       if (!list_empty(&h->loh_layers)) {
+               const struct lu_object *o;
+
+               o = lu_object_top(h);
+               lu_object_print(arg->lsp_env, arg->lsp_cookie,
+                               arg->lsp_printer, o);
+       } else {
+               lu_object_header_print(arg->lsp_env, arg->lsp_cookie,
+                                      arg->lsp_printer, h);
+       }
+       return 0;
+}
+
+/**
+ * Print all objects in \a s.
+ */
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+                  lu_printer_t printer)
+{
+       struct lu_site_print_arg arg = {
+               .lsp_env     = (struct lu_env *)env,
+               .lsp_cookie  = cookie,
+               .lsp_printer = printer,
+       };
+
+       cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg);
+}
+EXPORT_SYMBOL(lu_site_print);
+
+enum {
+       LU_CACHE_PERCENT_MAX     = 50,
+       LU_CACHE_PERCENT_DEFAULT = 20
+};
+
+static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644,
+               "Percentage of memory to be used as lu_object cache");
+
+/**
+ * Return desired hash table order.
+ */
+static int lu_htable_order(void)
+{
+       unsigned long cache_size;
+       int bits;
+
+       /*
+        * Calculate hash table size, assuming that we want reasonable
+        * performance when 20% of total memory is occupied by cache of
+        * lu_objects.
+        *
+        * Size of lu_object is (arbitrary) taken as 1K (together with inode).
+        */
+       cache_size = num_physpages;
+
+#if BITS_PER_LONG == 32
+       /* limit hashtable size for lowmem systems to low RAM */
+       if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT))
+               cache_size = (1 << (30 - PAGE_CACHE_SHIFT)) * 3 / 4;
+#endif
+
+       /* clear off unreasonable cache setting. */
+       if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) {
+               CWARN("obdclass: invalid lu_cache_percent: %u, it must be in"
+                     " the range of (0, %u]. Will use default value: %u.\n",
+                     lu_cache_percent, LU_CACHE_PERCENT_MAX,
+                     LU_CACHE_PERCENT_DEFAULT);
+
+               lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+       }
+       cache_size = cache_size / 100 * lu_cache_percent *
+               (PAGE_CACHE_SIZE / 1024);
+
+       for (bits = 1; (1 << bits) < cache_size; ++bits) {
+               ;
+       }
+       return bits;
+}
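+
+/*
+ * Worked example (illustrative, assuming a 64-bit system with 4 GiB of RAM,
+ * 4 KiB pages and the default lu_cache_percent of 20): num_physpages = 2^20,
+ * so cache_size = 2^20 / 100 * 20 * (4096 / 1024) ~= 838800, and the smallest
+ * bits with (1 << bits) >= cache_size is 20, which lu_site_init() below then
+ * clamps into [LU_SITE_BITS_MIN, LU_SITE_BITS_MAX].
+ */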
+
+static unsigned lu_obj_hop_hash(cfs_hash_t *hs,
+                               const void *key, unsigned mask)
+{
+       struct lu_fid  *fid = (struct lu_fid *)key;
+       __u32      hash;
+
+       hash = fid_flatten32(fid);
+       hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+       hash = cfs_hash_long(hash, hs->hs_bkt_bits);
+
+       /* give me another random factor */
+       hash -= cfs_hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3);
+
+       hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+       hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1);
+
+       return hash & mask;
+}
+
+static void *lu_obj_hop_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct lu_object_header, loh_hash);
+}
+
+static void *lu_obj_hop_key(struct hlist_node *hnode)
+{
+       struct lu_object_header *h;
+
+       h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+       return &h->loh_fid;
+}
+
+static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct lu_object_header *h;
+
+       h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+       return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key);
+}
+
+static void lu_obj_hop_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct lu_object_header *h;
+
+       h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+       if (atomic_add_return(1, &h->loh_ref) == 1) {
+               struct lu_site_bkt_data *bkt;
+               cfs_hash_bd_t       bd;
+
+               cfs_hash_bd_get(hs, &h->loh_fid, &bd);
+               bkt = cfs_hash_bd_extra_get(hs, &bd);
+               bkt->lsb_busy++;
+       }
+}
+
+static void lu_obj_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       LBUG(); /* we should never call this */
+}
+
+cfs_hash_ops_t lu_site_hash_ops = {
+       .hs_hash        = lu_obj_hop_hash,
+       .hs_key  = lu_obj_hop_key,
+       .hs_keycmp      = lu_obj_hop_keycmp,
+       .hs_object      = lu_obj_hop_object,
+       .hs_get  = lu_obj_hop_get,
+       .hs_put_locked  = lu_obj_hop_put_locked,
+};
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d)
+{
+       spin_lock(&s->ls_ld_lock);
+       if (list_empty(&d->ld_linkage))
+               list_add(&d->ld_linkage, &s->ls_ld_linkage);
+       spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_add_linkage);
+
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d)
+{
+       spin_lock(&s->ls_ld_lock);
+       list_del_init(&d->ld_linkage);
+       spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_del_linkage);
+
+/**
+ * Initialize site \a s, with \a top as the top level device.
+ */
+#define LU_SITE_BITS_MIN    12
+#define LU_SITE_BITS_MAX    24
+/**
+ * total 256 buckets, we don't want too many buckets because:
+ * - consume too much memory
+ * - avoid unbalanced LRU list
+ */
+#define LU_SITE_BKT_BITS    8
+
+int lu_site_init(struct lu_site *s, struct lu_device *top)
+{
+       struct lu_site_bkt_data *bkt;
+       cfs_hash_bd_t bd;
+       char name[16];
+       int bits;
+       int i;
+       ENTRY;
+
+       memset(s, 0, sizeof *s);
+       bits = lu_htable_order();
+       snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name);
+       for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX);
+            bits >= LU_SITE_BITS_MIN; bits--) {
+               s->ls_obj_hash = cfs_hash_create(name, bits, bits,
+                                                bits - LU_SITE_BKT_BITS,
+                                                sizeof(*bkt), 0, 0,
+                                                &lu_site_hash_ops,
+                                                CFS_HASH_SPIN_BKTLOCK |
+                                                CFS_HASH_NO_ITEMREF |
+                                                CFS_HASH_DEPTH |
+                                                CFS_HASH_ASSERT_EMPTY);
+               if (s->ls_obj_hash != NULL)
+                       break;
+       }
+
+       if (s->ls_obj_hash == NULL) {
+               CERROR("failed to create lu_site hash with bits: %d\n", bits);
+               return -ENOMEM;
+       }
+
+       cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+               bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+               INIT_LIST_HEAD(&bkt->lsb_lru);
+               init_waitqueue_head(&bkt->lsb_marche_funebre);
+       }
+
+       s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0);
+       if (s->ls_stats == NULL) {
+               cfs_hash_putref(s->ls_obj_hash);
+               s->ls_obj_hash = NULL;
+               return -ENOMEM;
+       }
+
+       lprocfs_counter_init(s->ls_stats, LU_SS_CREATED,
+                            0, "created", "created");
+       lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT,
+                            0, "cache_hit", "cache_hit");
+       lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS,
+                            0, "cache_miss", "cache_miss");
+       lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE,
+                            0, "cache_race", "cache_race");
+       lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE,
+                            0, "cache_death_race", "cache_death_race");
+       lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
+                            0, "lru_purged", "lru_purged");
+
+       INIT_LIST_HEAD(&s->ls_linkage);
+       s->ls_top_dev = top;
+       top->ld_site = s;
+       lu_device_get(top);
+       lu_ref_add(&top->ld_reference, "site-top", s);
+
+       INIT_LIST_HEAD(&s->ls_ld_linkage);
+       spin_lock_init(&s->ls_ld_lock);
+
+       lu_dev_add_linkage(s, top);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lu_site_init);
+
+/**
+ * Finalize \a s and release its resources.
+ */
+void lu_site_fini(struct lu_site *s)
+{
+       mutex_lock(&lu_sites_guard);
+       list_del_init(&s->ls_linkage);
+       mutex_unlock(&lu_sites_guard);
+
+       if (s->ls_obj_hash != NULL) {
+               cfs_hash_putref(s->ls_obj_hash);
+               s->ls_obj_hash = NULL;
+       }
+
+       if (s->ls_top_dev != NULL) {
+               s->ls_top_dev->ld_site = NULL;
+               lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s);
+               lu_device_put(s->ls_top_dev);
+               s->ls_top_dev = NULL;
+       }
+
+       if (s->ls_stats != NULL)
+               lprocfs_free_stats(&s->ls_stats);
+}
+EXPORT_SYMBOL(lu_site_fini);
+
+/**
+ * Called when initialization of stack for this site is completed.
+ */
+int lu_site_init_finish(struct lu_site *s)
+{
+       int result;
+       mutex_lock(&lu_sites_guard);
+       result = lu_context_refill(&lu_shrink_env.le_ctx);
+       if (result == 0)
+               list_add(&s->ls_linkage, &lu_sites);
+       mutex_unlock(&lu_sites_guard);
+       return result;
+}
+EXPORT_SYMBOL(lu_site_init_finish);
+
+/**
+ * Acquire additional reference on device \a d
+ */
+void lu_device_get(struct lu_device *d)
+{
+       atomic_inc(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_get);
+
+/**
+ * Release reference on device \a d.
+ */
+void lu_device_put(struct lu_device *d)
+{
+       LASSERT(atomic_read(&d->ld_ref) > 0);
+       atomic_dec(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_put);
+
+/**
+ * Initialize device \a d of type \a t.
+ */
+int lu_device_init(struct lu_device *d, struct lu_device_type *t)
+{
+       if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL)
+               t->ldt_ops->ldto_start(t);
+       memset(d, 0, sizeof *d);
+       atomic_set(&d->ld_ref, 0);
+       d->ld_type = t;
+       lu_ref_init(&d->ld_reference);
+       INIT_LIST_HEAD(&d->ld_linkage);
+       return 0;
+}
+EXPORT_SYMBOL(lu_device_init);
+
+/**
+ * Finalize device \a d.
+ */
+void lu_device_fini(struct lu_device *d)
+{
+       struct lu_device_type *t;
+
+       t = d->ld_type;
+       if (d->ld_obd != NULL) {
+               d->ld_obd->obd_lu_dev = NULL;
+               d->ld_obd = NULL;
+       }
+
+       lu_ref_fini(&d->ld_reference);
+       LASSERTF(atomic_read(&d->ld_ref) == 0,
+                "Refcount is %u\n", atomic_read(&d->ld_ref));
+       LASSERT(t->ldt_device_nr > 0);
+       if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL)
+               t->ldt_ops->ldto_stop(t);
+}
+EXPORT_SYMBOL(lu_device_fini);
+
+/**
+ * Initialize object \a o that is part of compound object \a h and was created
+ * by device \a d.
+ */
+int lu_object_init(struct lu_object *o,
+                  struct lu_object_header *h, struct lu_device *d)
+{
+       memset(o, 0, sizeof *o);
+       o->lo_header = h;
+       o->lo_dev    = d;
+       lu_device_get(d);
+       o->lo_dev_ref = lu_ref_add(&d->ld_reference, "lu_object", o);
+       INIT_LIST_HEAD(&o->lo_linkage);
+       return 0;
+}
+EXPORT_SYMBOL(lu_object_init);
+
+/**
+ * Finalize object and release its resources.
+ */
+void lu_object_fini(struct lu_object *o)
+{
+       struct lu_device *dev = o->lo_dev;
+
+       LASSERT(list_empty(&o->lo_linkage));
+
+       if (dev != NULL) {
+               lu_ref_del_at(&dev->ld_reference,
+                             o->lo_dev_ref , "lu_object", o);
+               lu_device_put(dev);
+               o->lo_dev = NULL;
+       }
+}
+EXPORT_SYMBOL(lu_object_fini);
+
+/**
+ * Add object \a o as first layer of compound object \a h
+ *
+ * This is typically called by the ->ldo_object_alloc() method of top-level
+ * device.
+ */
+void lu_object_add_top(struct lu_object_header *h, struct lu_object *o)
+{
+       list_move(&o->lo_linkage, &h->loh_layers);
+}
+EXPORT_SYMBOL(lu_object_add_top);
+
+/**
+ * Add object \a o as a layer of compound object, going after \a before.
+ *
+ * This is typically called by the ->ldo_object_alloc() method of \a
+ * before->lo_dev.
+ */
+void lu_object_add(struct lu_object *before, struct lu_object *o)
+{
+       list_move(&o->lo_linkage, &before->lo_linkage);
+}
+EXPORT_SYMBOL(lu_object_add);
+
+/**
+ * Initialize compound object.
+ */
+int lu_object_header_init(struct lu_object_header *h)
+{
+       memset(h, 0, sizeof *h);
+       atomic_set(&h->loh_ref, 1);
+       INIT_HLIST_NODE(&h->loh_hash);
+       INIT_LIST_HEAD(&h->loh_lru);
+       INIT_LIST_HEAD(&h->loh_layers);
+       lu_ref_init(&h->loh_reference);
+       return 0;
+}
+EXPORT_SYMBOL(lu_object_header_init);
+
+/**
+ * Finalize compound object.
+ */
+void lu_object_header_fini(struct lu_object_header *h)
+{
+       LASSERT(list_empty(&h->loh_layers));
+       LASSERT(list_empty(&h->loh_lru));
+       LASSERT(hlist_unhashed(&h->loh_hash));
+       lu_ref_fini(&h->loh_reference);
+}
+EXPORT_SYMBOL(lu_object_header_fini);
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+                                  const struct lu_device_type *dtype)
+{
+       struct lu_object *o;
+
+       list_for_each_entry(o, &h->loh_layers, lo_linkage) {
+               if (o->lo_dev->ld_type == dtype)
+                       return o;
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(lu_object_locate);
+
+
+/**
+ * Finalize and free devices in the device stack.
+ *
+ * Finalize device stack by purging object cache, and calling
+ * lu_device_type_operations::ldto_device_fini() and
+ * lu_device_type_operations::ldto_device_free() on all devices in the stack.
+ */
+void lu_stack_fini(const struct lu_env *env, struct lu_device *top)
+{
+       struct lu_site   *site = top->ld_site;
+       struct lu_device *scan;
+       struct lu_device *next;
+
+       lu_site_purge(env, site, ~0);
+       for (scan = top; scan != NULL; scan = next) {
+               next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan);
+               lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init);
+               lu_device_put(scan);
+       }
+
+       /* purge again. */
+       lu_site_purge(env, site, ~0);
+
+       for (scan = top; scan != NULL; scan = next) {
+               const struct lu_device_type *ldt = scan->ld_type;
+               struct obd_type      *type;
+
+               next = ldt->ldt_ops->ldto_device_free(env, scan);
+               type = ldt->ldt_obd_type;
+               if (type != NULL) {
+                       type->typ_refcnt--;
+                       class_put_type(type);
+               }
+       }
+}
+EXPORT_SYMBOL(lu_stack_fini);
+
+enum {
+       /**
+        * Maximal number of tld slots.
+        */
+       LU_CONTEXT_KEY_NR = 40
+};
+
+static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };
+
+static DEFINE_SPINLOCK(lu_keys_guard);
+
+/**
+ * Global counter incremented whenever key is registered, unregistered,
+ * revived or quiesced. This is used to avoid unnecessary calls to
+ * lu_context_refill(). No locking is provided, as initialization and shutdown
+ * are supposed to be externally serialized.
+ */
+static unsigned key_set_version = 0;
+
+/**
+ * Register new key.
+ */
+int lu_context_key_register(struct lu_context_key *key)
+{
+       int result;
+       int i;
+
+       LASSERT(key->lct_init != NULL);
+       LASSERT(key->lct_fini != NULL);
+       LASSERT(key->lct_tags != 0);
+       LASSERT(key->lct_owner != NULL);
+
+       result = -ENFILE;
+       spin_lock(&lu_keys_guard);
+       for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+               if (lu_keys[i] == NULL) {
+                       key->lct_index = i;
+                       atomic_set(&key->lct_used, 1);
+                       lu_keys[i] = key;
+                       lu_ref_init(&key->lct_reference);
+                       result = 0;
+                       ++key_set_version;
+                       break;
+               }
+       }
+       spin_unlock(&lu_keys_guard);
+       return result;
+}
+EXPORT_SYMBOL(lu_context_key_register);
+
+static void key_fini(struct lu_context *ctx, int index)
+{
+       if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) {
+               struct lu_context_key *key;
+
+               key = lu_keys[index];
+               LASSERT(key != NULL);
+               LASSERT(key->lct_fini != NULL);
+               LASSERT(atomic_read(&key->lct_used) > 1);
+
+               key->lct_fini(ctx, key, ctx->lc_value[index]);
+               lu_ref_del(&key->lct_reference, "ctx", ctx);
+               atomic_dec(&key->lct_used);
+
+               LASSERT(key->lct_owner != NULL);
+               if ((ctx->lc_tags & LCT_NOREF) == 0) {
+                       LINVRNT(module_refcount(key->lct_owner) > 0);
+                       module_put(key->lct_owner);
+               }
+               ctx->lc_value[index] = NULL;
+       }
+}
+
+/**
+ * Deregister key.
+ */
+void lu_context_key_degister(struct lu_context_key *key)
+{
+       LASSERT(atomic_read(&key->lct_used) >= 1);
+       LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+
+       lu_context_key_quiesce(key);
+
+       ++key_set_version;
+       spin_lock(&lu_keys_guard);
+       key_fini(&lu_shrink_env.le_ctx, key->lct_index);
+       if (lu_keys[key->lct_index]) {
+               lu_keys[key->lct_index] = NULL;
+               lu_ref_fini(&key->lct_reference);
+       }
+       spin_unlock(&lu_keys_guard);
+
+       LASSERTF(atomic_read(&key->lct_used) == 1,
+                "key has instances: %d\n",
+                atomic_read(&key->lct_used));
+}
+EXPORT_SYMBOL(lu_context_key_degister);
+
+/**
+ * Register a number of keys. This has to be called after all keys have been
+ * initialized by a call to LU_CONTEXT_KEY_INIT().
+ */
+int lu_context_key_register_many(struct lu_context_key *k, ...)
+{
+       struct lu_context_key *key = k;
+       va_list args;
+       int result;
+
+       va_start(args, k);
+       do {
+               result = lu_context_key_register(key);
+               if (result)
+                       break;
+               key = va_arg(args, struct lu_context_key *);
+       } while (key != NULL);
+       va_end(args);
+
+       if (result != 0) {
+               va_start(args, k);
+               while (k != key) {
+                       lu_context_key_degister(k);
+                       k = va_arg(args, struct lu_context_key *);
+               }
+               va_end(args);
+       }
+
+       return result;
+}
+EXPORT_SYMBOL(lu_context_key_register_many);
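+
+/*
+ * Illustrative call (the key names are hypothetical): the argument list is
+ * NULL terminated, so registering two keys looks like
+ *
+ *     rc = lu_context_key_register_many(&foo_thread_key,
+ *                                       &bar_session_key, NULL);
+ *
+ * and the matching teardown is
+ *
+ *     lu_context_key_degister_many(&foo_thread_key, &bar_session_key, NULL);
+ */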
+
+/**
+ * De-register a number of keys. This is a dual to
+ * lu_context_key_register_many().
+ */
+void lu_context_key_degister_many(struct lu_context_key *k, ...)
+{
+       va_list args;
+
+       va_start(args, k);
+       do {
+               lu_context_key_degister(k);
+               k = va_arg(args, struct lu_context_key*);
+       } while (k != NULL);
+       va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_degister_many);
+
+/**
+ * Revive a number of keys.
+ */
+void lu_context_key_revive_many(struct lu_context_key *k, ...)
+{
+       va_list args;
+
+       va_start(args, k);
+       do {
+               lu_context_key_revive(k);
+               k = va_arg(args, struct lu_context_key *);
+       } while (k != NULL);
+       va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_revive_many);
+
+/**
+ * Quiesce a number of keys.
+ */
+void lu_context_key_quiesce_many(struct lu_context_key *k, ...)
+{
+       va_list args;
+
+       va_start(args, k);
+       do {
+               lu_context_key_quiesce(k);
+               k = va_arg(args, struct lu_context_key *);
+       } while (k != NULL);
+       va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_quiesce_many);
+
+/**
+ * Return value associated with key \a key in context \a ctx.
+ */
+void *lu_context_key_get(const struct lu_context *ctx,
+                        const struct lu_context_key *key)
+{
+       LINVRNT(ctx->lc_state == LCS_ENTERED);
+       LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+       LASSERT(lu_keys[key->lct_index] == key);
+       return ctx->lc_value[key->lct_index];
+}
+EXPORT_SYMBOL(lu_context_key_get);
+
+/**
+ * List of remembered contexts. XXX document me.
+ */
+static LIST_HEAD(lu_context_remembered);
+
+/**
+ * Destroy \a key in all remembered contexts. This is used to destroy key
+ * values in "shared" contexts (like service threads), when a module owning
+ * the key is about to be unloaded.
+ */
+void lu_context_key_quiesce(struct lu_context_key *key)
+{
+       struct lu_context *ctx;
+
+       if (!(key->lct_tags & LCT_QUIESCENT)) {
+               /*
+                * XXX layering violation.
+                */
+               key->lct_tags |= LCT_QUIESCENT;
+               /*
+                * XXX memory barrier has to go here.
+                */
+               spin_lock(&lu_keys_guard);
+               list_for_each_entry(ctx, &lu_context_remembered,
+                                       lc_remember)
+                       key_fini(ctx, key->lct_index);
+               spin_unlock(&lu_keys_guard);
+               ++key_set_version;
+       }
+}
+EXPORT_SYMBOL(lu_context_key_quiesce);
+
+void lu_context_key_revive(struct lu_context_key *key)
+{
+       key->lct_tags &= ~LCT_QUIESCENT;
+       ++key_set_version;
+}
+EXPORT_SYMBOL(lu_context_key_revive);
+
+static void keys_fini(struct lu_context *ctx)
+{
+       int     i;
+
+       if (ctx->lc_value == NULL)
+               return;
+
+       for (i = 0; i < ARRAY_SIZE(lu_keys); ++i)
+               key_fini(ctx, i);
+
+       OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+       ctx->lc_value = NULL;
+}
+
+static int keys_fill(struct lu_context *ctx)
+{
+       int i;
+
+       LINVRNT(ctx->lc_value != NULL);
+       for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+               struct lu_context_key *key;
+
+               key = lu_keys[i];
+               if (ctx->lc_value[i] == NULL && key != NULL &&
+                   (key->lct_tags & ctx->lc_tags) &&
+                   /*
+                    * Don't create values for a LCT_QUIESCENT key, as this
+                    * will pin module owning a key.
+                    */
+                   !(key->lct_tags & LCT_QUIESCENT)) {
+                       void *value;
+
+                       LINVRNT(key->lct_init != NULL);
+                       LINVRNT(key->lct_index == i);
+
+                       value = key->lct_init(ctx, key);
+                       if (unlikely(IS_ERR(value)))
+                               return PTR_ERR(value);
+
+                       LASSERT(key->lct_owner != NULL);
+                       if (!(ctx->lc_tags & LCT_NOREF))
+                               try_module_get(key->lct_owner);
+                       lu_ref_add_atomic(&key->lct_reference, "ctx", ctx);
+                       atomic_inc(&key->lct_used);
+                       /*
+                        * This is the only place in the code, where an
+                        * element of ctx->lc_value[] array is set to non-NULL
+                        * value.
+                        */
+                       ctx->lc_value[i] = value;
+                       if (key->lct_exit != NULL)
+                               ctx->lc_tags |= LCT_HAS_EXIT;
+               }
+               ctx->lc_version = key_set_version;
+       }
+       return 0;
+}
+
+static int keys_init(struct lu_context *ctx)
+{
+       OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+       if (likely(ctx->lc_value != NULL))
+               return keys_fill(ctx);
+
+       return -ENOMEM;
+}
+
+/**
+ * Initialize context data-structure. Create values for all keys.
+ */
+int lu_context_init(struct lu_context *ctx, __u32 tags)
+{
+       int     rc;
+
+       memset(ctx, 0, sizeof *ctx);
+       ctx->lc_state = LCS_INITIALIZED;
+       ctx->lc_tags = tags;
+       if (tags & LCT_REMEMBER) {
+               spin_lock(&lu_keys_guard);
+               list_add(&ctx->lc_remember, &lu_context_remembered);
+               spin_unlock(&lu_keys_guard);
+       } else {
+               INIT_LIST_HEAD(&ctx->lc_remember);
+       }
+
+       rc = keys_init(ctx);
+       if (rc != 0)
+               lu_context_fini(ctx);
+
+       return rc;
+}
+EXPORT_SYMBOL(lu_context_init);
+
+/**
+ * Finalize context data-structure. Destroy key values.
+ */
+void lu_context_fini(struct lu_context *ctx)
+{
+       LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+       ctx->lc_state = LCS_FINALIZED;
+
+       if ((ctx->lc_tags & LCT_REMEMBER) == 0) {
+               LASSERT(list_empty(&ctx->lc_remember));
+               keys_fini(ctx);
+
+       } else { /* could race with key degister */
+               spin_lock(&lu_keys_guard);
+               keys_fini(ctx);
+               list_del_init(&ctx->lc_remember);
+               spin_unlock(&lu_keys_guard);
+       }
+}
+EXPORT_SYMBOL(lu_context_fini);
+
+/**
+ * Called before entering context.
+ */
+void lu_context_enter(struct lu_context *ctx)
+{
+       LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+       ctx->lc_state = LCS_ENTERED;
+}
+EXPORT_SYMBOL(lu_context_enter);
+
+/**
+ * Called after exiting from \a ctx.
+ */
+void lu_context_exit(struct lu_context *ctx)
+{
+       int i;
+
+       LINVRNT(ctx->lc_state == LCS_ENTERED);
+       ctx->lc_state = LCS_LEFT;
+       if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) {
+               for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+                       if (ctx->lc_value[i] != NULL) {
+                               struct lu_context_key *key;
+
+                               key = lu_keys[i];
+                               LASSERT(key != NULL);
+                               if (key->lct_exit != NULL)
+                                       key->lct_exit(ctx,
+                                                     key, ctx->lc_value[i]);
+                       }
+               }
+       }
+}
+EXPORT_SYMBOL(lu_context_exit);
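+
+/*
+ * A minimal sketch of the lu_context life cycle (the tags, the key and the
+ * error handling are illustrative only):
+ *
+ *     struct lu_context ctx;
+ *
+ *     rc = lu_context_init(&ctx, LCT_REMEMBER | LCT_NOREF);
+ *     if (rc == 0) {
+ *             lu_context_enter(&ctx);
+ *             ... use lu_context_key_get(&ctx, &foo_key) ...
+ *             lu_context_exit(&ctx);
+ *             lu_context_fini(&ctx);
+ *     }
+ */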
+
+/**
+ * Allocate for the context the values of all missing keys that were
+ * registered after the context was created. key_set_version only changes
+ * in the rare cases when modules are loaded or removed.
+ */
+int lu_context_refill(struct lu_context *ctx)
+{
+       return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx);
+}
+EXPORT_SYMBOL(lu_context_refill);
+
+/**
+ * lu_ctx_tags/lu_ses_tags will be updated if there are new types of
+ * obd being added. Currently, this is only used on the client side,
+ * specifically for the echo device client. For other stacks (like ptlrpc
+ * threads), contexts are predefined when the lu_device type is registered,
+ * during the module probe phase.
+ */
+__u32 lu_context_tags_default = 0;
+__u32 lu_session_tags_default = 0;
+
+void lu_context_tags_update(__u32 tags)
+{
+       spin_lock(&lu_keys_guard);
+       lu_context_tags_default |= tags;
+       key_set_version++;
+       spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_update);
+
+void lu_context_tags_clear(__u32 tags)
+{
+       spin_lock(&lu_keys_guard);
+       lu_context_tags_default &= ~tags;
+       key_set_version++;
+       spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_clear);
+
+void lu_session_tags_update(__u32 tags)
+{
+       spin_lock(&lu_keys_guard);
+       lu_session_tags_default |= tags;
+       key_set_version++;
+       spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_update);
+
+void lu_session_tags_clear(__u32 tags)
+{
+       spin_lock(&lu_keys_guard);
+       lu_session_tags_default &= ~tags;
+       key_set_version++;
+       spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_clear);
+
+int lu_env_init(struct lu_env *env, __u32 tags)
+{
+       int result;
+
+       env->le_ses = NULL;
+       result = lu_context_init(&env->le_ctx, tags);
+       if (likely(result == 0))
+               lu_context_enter(&env->le_ctx);
+       return result;
+}
+EXPORT_SYMBOL(lu_env_init);
+
+void lu_env_fini(struct lu_env *env)
+{
+       lu_context_exit(&env->le_ctx);
+       lu_context_fini(&env->le_ctx);
+       env->le_ses = NULL;
+}
+EXPORT_SYMBOL(lu_env_fini);
+
+int lu_env_refill(struct lu_env *env)
+{
+       int result;
+
+       result = lu_context_refill(&env->le_ctx);
+       if (result == 0 && env->le_ses != NULL)
+               result = lu_context_refill(env->le_ses);
+       return result;
+}
+EXPORT_SYMBOL(lu_env_refill);
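+
+/*
+ * Sketch of typical lu_env usage (the tag and error handling are
+ * illustrative). lu_env_init() already enters le_ctx, and lu_env_fini()
+ * exits and finalizes it, so no explicit lu_context_enter()/lu_context_exit()
+ * calls are needed for the environment context:
+ *
+ *     struct lu_env env;
+ *
+ *     rc = lu_env_init(&env, LCT_MD_THREAD);
+ *     if (rc == 0) {
+ *             ... perform operations under &env ...
+ *             lu_env_fini(&env);
+ *     }
+ */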
+
+/**
+ * Currently, this API is only used by the echo client.
+ * Because the echo client and the normal Lustre client share
+ * the same cl_env cache, the echo client needs to refresh
+ * the env context after it gets one from the cache, especially
+ * when the normal client and the echo client co-exist on the same client.
+ */
+int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags,
+                         __u32 stags)
+{
+       int    result;
+
+       if ((env->le_ctx.lc_tags & ctags) != ctags) {
+               env->le_ctx.lc_version = 0;
+               env->le_ctx.lc_tags |= ctags;
+       }
+
+       if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) {
+               env->le_ses->lc_version = 0;
+               env->le_ses->lc_tags |= stags;
+       }
+
+       result = lu_env_refill(env);
+
+       return result;
+}
+EXPORT_SYMBOL(lu_env_refill_by_tags);
+
+static struct shrinker *lu_site_shrinker = NULL;
+
+typedef struct lu_site_stats {
+       unsigned        lss_populated;
+       unsigned        lss_max_search;
+       unsigned        lss_total;
+       unsigned        lss_busy;
+} lu_site_stats_t;
+
+static void lu_site_stats_get(cfs_hash_t *hs,
+                             lu_site_stats_t *stats, int populated)
+{
+       cfs_hash_bd_t bd;
+       int        i;
+
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd);
+               struct hlist_head       *hhead;
+
+               cfs_hash_bd_lock(hs, &bd, 1);
+               stats->lss_busy  += bkt->lsb_busy;
+               stats->lss_total += cfs_hash_bd_count_get(&bd);
+               stats->lss_max_search = max((int)stats->lss_max_search,
+                                           cfs_hash_bd_depmax_get(&bd));
+               if (!populated) {
+                       cfs_hash_bd_unlock(hs, &bd, 1);
+                       continue;
+               }
+
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+                       if (!hlist_empty(hhead))
+                               stats->lss_populated++;
+               }
+               cfs_hash_bd_unlock(hs, &bd, 1);
+       }
+}
+
+
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. This occurs between one of ZFS's
+ * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
+ * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
+ * while thread B will take the ht_lock and sleep on the lu_sites_guard
+ * lock. Obviously neither thread will wake and drop its respective hold
+ * on its lock.
+ *
+ * To prevent this from happening we must ensure the lu_sites_guard lock is
+ * not taken while down this code path. ZFS reliably does not set the
+ * __GFP_FS bit in its code paths, so this can be used to determine if it
+ * is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
+static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+       lu_site_stats_t stats;
+       struct lu_site *s;
+       struct lu_site *tmp;
+       int cached = 0;
+       int remain = shrink_param(sc, nr_to_scan);
+       LIST_HEAD(splice);
+
+       if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+               if (remain != 0)
+                       return -1;
+               else
+                       /* We must not take the lu_sites_guard lock when
+                        * __GFP_FS is *not* set because of the deadlock
+                        * possibility detailed above. Additionally,
+                        * since we cannot determine the number of
+                        * objects in the cache without taking this
+                        * lock, we're in a particularly tough spot. As
+                        * a result, we'll just lie and say our cache is
+                        * empty. This _should_ be ok, as we can't
+                        * reclaim objects when __GFP_FS is *not* set
+                        * anyways.
+                        */
+                       return 0;
+       }
+
+       CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
+       mutex_lock(&lu_sites_guard);
+       list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
+               if (shrink_param(sc, nr_to_scan) != 0) {
+                       remain = lu_site_purge(&lu_shrink_env, s, remain);
+                       /*
+                        * Move just shrunk site to the tail of site list to
+                        * assure shrinking fairness.
+                        */
+                       list_move_tail(&s->ls_linkage, &splice);
+               }
+
+               memset(&stats, 0, sizeof(stats));
+               lu_site_stats_get(s->ls_obj_hash, &stats, 0);
+               cached += stats.lss_total - stats.lss_busy;
+               if (shrink_param(sc, nr_to_scan) && remain <= 0)
+                       break;
+       }
+       list_splice(&splice, lu_sites.prev);
+       mutex_unlock(&lu_sites_guard);
+
+       cached = (cached / 100) * sysctl_vfs_cache_pressure;
+       if (shrink_param(sc, nr_to_scan) == 0)
+               CDEBUG(D_INODE, "%d objects cached\n", cached);
+       return cached;
+}
+
+/*
+ * Debugging stuff.
+ */
+
+/**
+ * Environment to be used in debugger, contains all tags.
+ */
+struct lu_env lu_debugging_env;
+
+/**
+ * Debugging printer function using printk().
+ */
+int lu_printk_printer(const struct lu_env *env,
+                     void *unused, const char *format, ...)
+{
+       va_list args;
+
+       va_start(args, format);
+       vprintk(format, args);
+       va_end(args);
+       return 0;
+}
+
+int lu_debugging_setup(void)
+{
+       return lu_env_init(&lu_debugging_env, ~0);
+}
+
+void lu_context_keys_dump(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+               struct lu_context_key *key;
+
+               key = lu_keys[i];
+               if (key != NULL) {
+                       CERROR("[%d]: %p %x (%p,%p,%p) %d %d \"%s\"@%p\n",
+                              i, key, key->lct_tags,
+                              key->lct_init, key->lct_fini, key->lct_exit,
+                              key->lct_index, atomic_read(&key->lct_used),
+                              key->lct_owner ? key->lct_owner->name : "",
+                              key->lct_owner);
+                       lu_ref_print(&key->lct_reference);
+               }
+       }
+}
+EXPORT_SYMBOL(lu_context_keys_dump);
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void)
+{
+       int result;
+
+       CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys);
+
+       result = lu_ref_global_init();
+       if (result != 0)
+               return result;
+
+       LU_CONTEXT_KEY_INIT(&lu_global_key);
+       result = lu_context_key_register(&lu_global_key);
+       if (result != 0)
+               return result;
+
+       /*
+        * At this level, we don't know what tags are needed, so allocate them
+        * conservatively. This should not be too bad, because this
+        * environment is global.
+        */
+       mutex_lock(&lu_sites_guard);
+       result = lu_env_init(&lu_shrink_env, LCT_SHRINKER);
+       mutex_unlock(&lu_sites_guard);
+       if (result != 0)
+               return result;
+
+       /*
+        * seeks estimation: 3 seeks to read a record from oi, one to read
+ * inode, one for ea. Unfortunately setting such a high value results in
+ * the lu_object/inode cache consuming all the memory.
+        */
+       lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, lu_cache_shrink);
+       if (lu_site_shrinker == NULL)
+               return -ENOMEM;
+
+       return result;
+}
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void)
+{
+       if (lu_site_shrinker != NULL) {
+               remove_shrinker(lu_site_shrinker);
+               lu_site_shrinker = NULL;
+       }
+
+       lu_context_key_degister(&lu_global_key);
+
+       /*
+        * Tear shrinker environment down _after_ de-registering
+        * lu_global_key, because the latter has a value in the former.
+        */
+       mutex_lock(&lu_sites_guard);
+       lu_env_fini(&lu_shrink_env);
+       mutex_unlock(&lu_sites_guard);
+
+       lu_ref_global_fini();
+}
+
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
+{
+#ifdef LPROCFS
+       struct lprocfs_counter ret;
+
+       lprocfs_stats_collect(stats, idx, &ret);
+       return (__u32)ret.lc_count;
+#else
+       return 0;
+#endif
+}
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * lprocfs_rd_*()-style functions.
+ */
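+/*
+ * The counters below are printed in this order: lss_busy/lss_total,
+ * lss_populated/CFS_HASH_NHLIST, lss_max_search, then the created,
+ * cache_hit, cache_miss, cache_race, cache_death_race and lru_purged
+ * statistics.
+ */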
+int lu_site_stats_print(const struct lu_site *s, char *page, int count)
+{
+       lu_site_stats_t stats;
+
+       memset(&stats, 0, sizeof(stats));
+       lu_site_stats_get(s->ls_obj_hash, &stats, 1);
+
+       return snprintf(page, count, "%d/%d %d/%d %d %d %d %d %d %d %d\n",
+                       stats.lss_busy,
+                       stats.lss_total,
+                       stats.lss_populated,
+                       CFS_HASH_NHLIST(s->ls_obj_hash),
+                       stats.lss_max_search,
+                       ls_stats_read(s->ls_stats, LU_SS_CREATED),
+                       ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT),
+                       ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS),
+                       ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE),
+                       ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE),
+                       ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED));
+}
+EXPORT_SYMBOL(lu_site_stats_print);
+
+/**
+ * Helper function to initialize a number of kmem slab caches at once.
+ */
+int lu_kmem_init(struct lu_kmem_descr *caches)
+{
+       int result;
+       struct lu_kmem_descr *iter = caches;
+
+       for (result = 0; iter->ckd_cache != NULL; ++iter) {
+               *iter->ckd_cache = kmem_cache_create(iter->ckd_name,
+                                                       iter->ckd_size,
+                                                       0, 0, NULL);
+               if (*iter->ckd_cache == NULL) {
+                       result = -ENOMEM;
+                       /* free all previously allocated caches */
+                       lu_kmem_fini(caches);
+                       break;
+               }
+       }
+       return result;
+}
+EXPORT_SYMBOL(lu_kmem_init);
+
+/**
+ * Helper function to finalize a number of kmem slab caches at once. Dual to
+ * lu_kmem_init().
+ */
+void lu_kmem_fini(struct lu_kmem_descr *caches)
+{
+       for (; caches->ckd_cache != NULL; ++caches) {
+               if (*caches->ckd_cache != NULL) {
+                       kmem_cache_destroy(*caches->ckd_cache);
+                       *caches->ckd_cache = NULL;
+               }
+       }
+}
+EXPORT_SYMBOL(lu_kmem_fini);
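+
+/*
+ * Sketch of a cache descriptor table as consumed by lu_kmem_init() and
+ * lu_kmem_fini(); the foo_* names are hypothetical:
+ *
+ *     static struct kmem_cache *foo_object_kmem;
+ *
+ *     static struct lu_kmem_descr foo_caches[] = {
+ *             {
+ *                     .ckd_cache = &foo_object_kmem,
+ *                     .ckd_name  = "foo_object_kmem",
+ *                     .ckd_size  = sizeof(struct foo_object)
+ *             },
+ *             {
+ *                     .ckd_cache = NULL
+ *             }
+ *     };
+ *
+ *     rc = lu_kmem_init(foo_caches);
+ */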
+
+/**
+ * Temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids
+ */
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+                         const struct lu_fid *fid)
+{
+       struct lu_site          *s = o->lo_dev->ld_site;
+       struct lu_fid           *old = &o->lo_header->loh_fid;
+       struct lu_site_bkt_data *bkt;
+       struct lu_object        *shadow;
+       wait_queue_t             waiter;
+       cfs_hash_t              *hs;
+       cfs_hash_bd_t            bd;
+       __u64                    version = 0;
+
+       LASSERT(fid_is_zero(old));
+
+       hs = s->ls_obj_hash;
+       cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1);
+       shadow = htable_lookup(s, &bd, fid, &waiter, &version);
+       /* supposed to be unique */
+       LASSERT(shadow == NULL);
+       *old = *fid;
+       bkt = cfs_hash_bd_extra_get(hs, &bd);
+       cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+       bkt->lsb_busy++;
+       cfs_hash_bd_unlock(hs, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocate an object with a zero (non-assigned) fid.
+ * XXX: temporary solution to be able to assign fid in ->do_create()
+ *      till we have fully-functional OST fids
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+                                struct lu_device *dev,
+                                const struct lu_object_conf *conf)
+{
+       struct lu_fid     fid;
+       struct lu_object *o;
+
+       fid_zero(&fid);
+       o = lu_object_alloc(env, dev, &fid, conf);
+
+       return o;
+}
+EXPORT_SYMBOL(lu_object_anon);
+
+struct lu_buf LU_BUF_NULL = {
+       .lb_buf = NULL,
+       .lb_len = 0
+};
+EXPORT_SYMBOL(LU_BUF_NULL);
+
+void lu_buf_free(struct lu_buf *buf)
+{
+       LASSERT(buf);
+       if (buf->lb_buf) {
+               LASSERT(buf->lb_len > 0);
+               OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+               buf->lb_buf = NULL;
+               buf->lb_len = 0;
+       }
+}
+EXPORT_SYMBOL(lu_buf_free);
+
+void lu_buf_alloc(struct lu_buf *buf, int size)
+{
+       LASSERT(buf);
+       LASSERT(buf->lb_buf == NULL);
+       LASSERT(buf->lb_len == 0);
+       OBD_ALLOC_LARGE(buf->lb_buf, size);
+       if (likely(buf->lb_buf))
+               buf->lb_len = size;
+}
+EXPORT_SYMBOL(lu_buf_alloc);
+
+void lu_buf_realloc(struct lu_buf *buf, int size)
+{
+       lu_buf_free(buf);
+       lu_buf_alloc(buf, size);
+}
+EXPORT_SYMBOL(lu_buf_realloc);
+
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len)
+{
+       if (buf->lb_buf == NULL && buf->lb_len == 0)
+               lu_buf_alloc(buf, len);
+
+       if ((len > buf->lb_len) && (buf->lb_buf != NULL))
+               lu_buf_realloc(buf, len);
+
+       return buf;
+}
+EXPORT_SYMBOL(lu_buf_check_and_alloc);
+
+/**
+ * Increase the size of \a buf.
+ * Old data in the buffer is preserved.
+ * On error the old buffer remains unchanged.
+ * \retval 0 or -ENOMEM
+ */
+int lu_buf_check_and_grow(struct lu_buf *buf, int len)
+{
+       char *ptr;
+
+       if (len <= buf->lb_len)
+               return 0;
+
+       OBD_ALLOC_LARGE(ptr, len);
+       if (ptr == NULL)
+               return -ENOMEM;
+
+       /* Copy the old data over and free the old buffer */
+       if (buf->lb_buf != NULL) {
+               memcpy(ptr, buf->lb_buf, buf->lb_len);
+               OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+       }
+
+       buf->lb_buf = ptr;
+       buf->lb_len = len;
+       return 0;
+}
+EXPORT_SYMBOL(lu_buf_check_and_grow);
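+
+/*
+ * Sketch of lu_buf usage (the sizes are illustrative): a buffer starts out
+ * as LU_BUF_NULL, is grown on demand and is eventually freed:
+ *
+ *     struct lu_buf buf = LU_BUF_NULL;
+ *
+ *     lu_buf_alloc(&buf, 4096);
+ *     if (buf.lb_buf != NULL) {
+ *             ... fill at most buf.lb_len bytes of buf.lb_buf ...
+ *             rc = lu_buf_check_and_grow(&buf, 8192);
+ *     }
+ *     lu_buf_free(&buf);
+ */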
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
new file mode 100644 (file)
index 0000000..23a76f1
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ref.c
+ *
+ * Lustre reference.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lu_ref.h>
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ucred.c b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c
new file mode 100644 (file)
index 0000000..229db6c
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ucred.c
+ *
+ * Lustre user credentials context infrastructure.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <lu_object.h>
+#include <md_object.h>
+
+/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */
+LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred);
+
+static struct lu_context_key lu_ucred_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = lu_ucred_key_init,
+       .lct_fini = lu_ucred_key_fini
+};
+
+/**
+ * Get ucred key if session exists and ucred key is allocated on it.
+ * Return NULL otherwise.
+ */
+struct lu_ucred *lu_ucred(const struct lu_env *env)
+{
+       if (!env->le_ses)
+               return NULL;
+       return lu_context_key_get(env->le_ses, &lu_ucred_key);
+}
+EXPORT_SYMBOL(lu_ucred);
+
+/**
+ * Get ucred key and check if it is properly initialized.
+ * Return NULL otherwise.
+ */
+struct lu_ucred *lu_ucred_check(const struct lu_env *env)
+{
+       struct lu_ucred *uc = lu_ucred(env);
+       if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW)
+               return NULL;
+       return uc;
+}
+EXPORT_SYMBOL(lu_ucred_check);
+
+/**
+ * Get ucred key, which must exist and must be properly initialized.
+ * Assert otherwise.
+ */
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env)
+{
+       struct lu_ucred *uc = lu_ucred_check(env);
+       LASSERT(uc != NULL);
+       return uc;
+}
+EXPORT_SYMBOL(lu_ucred_assert);
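+
+/*
+ * The three accessors above only differ in strictness: lu_ucred() returns
+ * NULL when no session or no credentials are attached, lu_ucred_check()
+ * additionally returns NULL for credentials that are neither UCRED_OLD nor
+ * UCRED_NEW, and lu_ucred_assert() LASSERTs instead of returning NULL. A
+ * caller that can run without credentials would typically do:
+ *
+ *     struct lu_ucred *uc = lu_ucred_check(env);
+ *
+ *     if (uc != NULL)
+ *             ... use the credentials ...
+ */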
+
+int lu_ucred_global_init(void)
+{
+       LU_CONTEXT_KEY_INIT(&lu_ucred_key);
+       return lu_context_key_register(&lu_ucred_key);
+}
+
+void lu_ucred_global_fini(void)
+{
+       lu_context_key_degister(&lu_ucred_key);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
new file mode 100644 (file)
index 0000000..69d6499
--- /dev/null
@@ -0,0 +1,263 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lustre_handles.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lustre_handles.h>
+#include <lustre_lib.h>
+
+
+static __u64 handle_base;
+#define HANDLE_INCR 7
+static spinlock_t handle_base_lock;
+
+static struct handle_bucket {
+       spinlock_t      lock;
+       struct list_head        head;
+} *handle_hash;
+
+#define HANDLE_HASH_SIZE (1 << 16)
+#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1)
+
+/*
+ * Generate a unique 64bit cookie (hash) for a handle and insert it into
+ * the global (per-node) hash table.
+ */
+void class_handle_hash(struct portals_handle *h,
+                      struct portals_handle_ops *ops)
+{
+       struct handle_bucket *bucket;
+       ENTRY;
+
+       LASSERT(h != NULL);
+       LASSERT(list_empty(&h->h_link));
+
+       /*
+        * This is a fast but simplistic cookie generation algorithm; it will
+        * need a re-do at some point in the future for security.
+        */
+       spin_lock(&handle_base_lock);
+       handle_base += HANDLE_INCR;
+
+       if (unlikely(handle_base == 0)) {
+               /*
+                * Cookie of zero is "dangerous", because in many places it's
+                * assumed that 0 means "unassigned" handle, not bound to any
+                * object.
+                */
+               CWARN("The universe has been exhausted: cookie wrap-around.\n");
+               handle_base += HANDLE_INCR;
+       }
+       h->h_cookie = handle_base;
+       spin_unlock(&handle_base_lock);
+
+       h->h_ops = ops;
+       spin_lock_init(&h->h_lock);
+
+       bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+       spin_lock(&bucket->lock);
+       list_add_rcu(&h->h_link, &bucket->head);
+       h->h_in = 1;
+       spin_unlock(&bucket->lock);
+
+       CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n",
+              h, h->h_cookie);
+       EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash);
+
+static void class_handle_unhash_nolock(struct portals_handle *h)
+{
+       if (list_empty(&h->h_link)) {
+               CERROR("removing an already-removed handle ("LPX64")\n",
+                      h->h_cookie);
+               return;
+       }
+
+       CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n",
+              h, h->h_cookie);
+
+       spin_lock(&h->h_lock);
+       if (h->h_in == 0) {
+               spin_unlock(&h->h_lock);
+               return;
+       }
+       h->h_in = 0;
+       spin_unlock(&h->h_lock);
+       list_del_rcu(&h->h_link);
+}
+
+void class_handle_unhash(struct portals_handle *h)
+{
+       struct handle_bucket *bucket;
+       bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+       spin_lock(&bucket->lock);
+       class_handle_unhash_nolock(h);
+       spin_unlock(&bucket->lock);
+}
+EXPORT_SYMBOL(class_handle_unhash);
+
+void class_handle_hash_back(struct portals_handle *h)
+{
+       struct handle_bucket *bucket;
+       ENTRY;
+
+       bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+       spin_lock(&bucket->lock);
+       list_add_rcu(&h->h_link, &bucket->head);
+       h->h_in = 1;
+       spin_unlock(&bucket->lock);
+
+       EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash_back);
+
+void *class_handle2object(__u64 cookie)
+{
+       struct handle_bucket *bucket;
+       struct portals_handle *h;
+       void *retval = NULL;
+       ENTRY;
+
+       LASSERT(handle_hash != NULL);
+
+       /* Be careful when you want to change this code. See the
+        * rcu_read_lock() definition at the top of this file. - jxiong */
+       bucket = handle_hash + (cookie & HANDLE_HASH_MASK);
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(h, &bucket->head, h_link) {
+               if (h->h_cookie != cookie)
+                       continue;
+
+               spin_lock(&h->h_lock);
+               if (likely(h->h_in != 0)) {
+                       h->h_ops->hop_addref(h);
+                       retval = h;
+               }
+               spin_unlock(&h->h_lock);
+               break;
+       }
+       rcu_read_unlock();
+
+       RETURN(retval);
+}
+EXPORT_SYMBOL(class_handle2object);
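+
+/*
+ * Sketch of the handle life cycle (foo_handle_ops is hypothetical): the
+ * owner hashes a struct portals_handle to obtain a unique cookie, and the
+ * cookie can later be mapped back with class_handle2object(), which takes
+ * a reference through the handle's hop_addref() method:
+ *
+ *     INIT_LIST_HEAD(&h->h_link);
+ *     class_handle_hash(h, &foo_handle_ops);
+ *     cookie = h->h_cookie;
+ *     ...
+ *     h = class_handle2object(cookie);
+ *     ...
+ *     class_handle_unhash(h);
+ */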
+
+void class_handle_free_cb(cfs_rcu_head_t *rcu)
+{
+       struct portals_handle *h = RCU2HANDLE(rcu);
+       void *ptr = (void *)(unsigned long)h->h_cookie;
+
+       if (h->h_ops->hop_free != NULL)
+               h->h_ops->hop_free(ptr, h->h_size);
+       else
+               OBD_FREE(ptr, h->h_size);
+}
+EXPORT_SYMBOL(class_handle_free_cb);
+
+int class_handle_init(void)
+{
+       struct handle_bucket *bucket;
+       struct timeval tv;
+       int seed[2];
+
+       LASSERT(handle_hash == NULL);
+
+       OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE);
+       if (handle_hash == NULL)
+               return -ENOMEM;
+
+       spin_lock_init(&handle_base_lock);
+       for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash;
+            bucket--) {
+               INIT_LIST_HEAD(&bucket->head);
+               spin_lock_init(&bucket->lock);
+       }
+
+       /** bug 21430: add randomness to the initial base */
+       cfs_get_random_bytes(seed, sizeof(seed));
+       do_gettimeofday(&tv);
+       cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+       cfs_get_random_bytes(&handle_base, sizeof(handle_base));
+       LASSERT(handle_base != 0ULL);
+
+       return 0;
+}
+
+static int cleanup_all_handles(void)
+{
+       int rc;
+       int i;
+
+       for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) {
+               struct portals_handle *h;
+
+               spin_lock(&handle_hash[i].lock);
+               list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) {
+                       CERROR("force clean handle "LPX64" addr %p ops %p\n",
+                              h->h_cookie, h, h->h_ops);
+
+                       class_handle_unhash_nolock(h);
+                       rc++;
+               }
+               spin_unlock(&handle_hash[i].lock);
+       }
+
+       return rc;
+}
+
+void class_handle_cleanup(void)
+{
+       int count;
+       LASSERT(handle_hash != NULL);
+
+       count = cleanup_all_handles();
+
+       OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+       handle_hash = NULL;
+
+       if (count != 0)
+               CERROR("handle_count at cleanup: %d\n", count);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
new file mode 100644 (file)
index 0000000..2fa2589
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+
+#define NIDS_MAX       32
+
+struct uuid_nid_data {
+       struct list_head       un_list;
+       struct obd_uuid  un_uuid;
+       int           un_nid_count;
+       lnet_nid_t       un_nids[NIDS_MAX];
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head        g_uuid_list;
+static spinlock_t      g_uuid_lock;
+
+void class_init_uuidlist(void)
+{
+       INIT_LIST_HEAD(&g_uuid_list);
+       spin_lock_init(&g_uuid_lock);
+}
+
+void class_exit_uuidlist(void)
+{
+       /* delete all */
+       class_del_uuid(NULL);
+}
+
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index)
+{
+       struct uuid_nid_data *data;
+       struct obd_uuid tmp;
+       int rc = -ENOENT;
+
+       obd_str2uuid(&tmp, uuid);
+       spin_lock(&g_uuid_lock);
+       list_for_each_entry(data, &g_uuid_list, un_list) {
+               if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+                       if (index >= data->un_nid_count)
+                               break;
+
+                       rc = 0;
+                       *peer_nid = data->un_nids[index];
+                       break;
+               }
+       }
+       spin_unlock(&g_uuid_lock);
+       return rc;
+}
+EXPORT_SYMBOL(lustre_uuid_to_peer);
+
+/* Add a nid to a niduuid.  Multiple nids can be added to a single uuid;
+   LNET will choose the best one. */
+int class_add_uuid(const char *uuid, __u64 nid)
+{
+       struct uuid_nid_data *data, *entry;
+       int found = 0;
+
+       LASSERT(nid != 0);  /* valid newconfig NID is never zero */
+
+       if (strlen(uuid) > UUID_MAX - 1)
+               return -EOVERFLOW;
+
+       OBD_ALLOC_PTR(data);
+       if (data == NULL)
+               return -ENOMEM;
+
+       obd_str2uuid(&data->un_uuid, uuid);
+       data->un_nids[0] = nid;
+       data->un_nid_count = 1;
+
+       spin_lock(&g_uuid_lock);
+       list_for_each_entry(entry, &g_uuid_list, un_list) {
+               if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) {
+                       int i;
+
+                       found = 1;
+                       for (i = 0; i < entry->un_nid_count; i++)
+                               if (nid == entry->un_nids[i])
+                                       break;
+
+                       if (i == entry->un_nid_count) {
+                               LASSERT(entry->un_nid_count < NIDS_MAX);
+                               entry->un_nids[entry->un_nid_count++] = nid;
+                       }
+                       break;
+               }
+       }
+       if (!found)
+               list_add(&data->un_list, &g_uuid_list);
+       spin_unlock(&g_uuid_lock);
+
+       if (found) {
+               CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
+                      libcfs_nid2str(nid), entry->un_nid_count);
+               OBD_FREE(data, sizeof(*data));
+       } else {
+               CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid));
+       }
+       return 0;
+}
+EXPORT_SYMBOL(class_add_uuid);
+
+/* Delete the nids for one uuid if specified, otherwise delete all */
+int class_del_uuid(const char *uuid)
+{
+       LIST_HEAD(deathrow);
+       struct uuid_nid_data *data;
+
+       spin_lock(&g_uuid_lock);
+       if (uuid != NULL) {
+               struct obd_uuid tmp;
+
+               obd_str2uuid(&tmp, uuid);
+               list_for_each_entry(data, &g_uuid_list, un_list) {
+                       if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+                               list_move(&data->un_list, &deathrow);
+                               break;
+                       }
+               }
+       } else
+               list_splice_init(&g_uuid_list, &deathrow);
+       spin_unlock(&g_uuid_lock);
+
+       if (uuid != NULL && list_empty(&deathrow)) {
+               CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid);
+               return -EINVAL;
+       }
+
+       while (!list_empty(&deathrow)) {
+               data = list_entry(deathrow.next, struct uuid_nid_data,
+                                     un_list);
+               list_del(&data->un_list);
+
+               CDEBUG(D_INFO, "del uuid %s %s/%d\n",
+                      obd_uuid2str(&data->un_uuid),
+                      libcfs_nid2str(data->un_nids[0]),
+                      data->un_nid_count);
+
+               OBD_FREE(data, sizeof(*data));
+       }
+
+       return 0;
+}
+
+/* check if @nid exists in nid list of @uuid */
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid)
+{
+       struct uuid_nid_data *entry;
+       int found = 0;
+       ENTRY;
+
+       CDEBUG(D_INFO, "check if uuid %s has %s.\n",
+              obd_uuid2str(uuid), libcfs_nid2str(nid));
+
+       spin_lock(&g_uuid_lock);
+       list_for_each_entry(entry, &g_uuid_list, un_list) {
+               int i;
+
+               if (!obd_uuid_equals(&entry->un_uuid, uuid))
+                       continue;
+
+               /* found the uuid, check if it has @nid */
+               for (i = 0; i < entry->un_nid_count; i++) {
+                       if (entry->un_nids[i] == nid) {
+                               found = 1;
+                               break;
+                       }
+               }
+               break;
+       }
+       spin_unlock(&g_uuid_lock);
+       RETURN(found);
+}
+EXPORT_SYMBOL(class_check_uuid);
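+
+/*
+ * Sketch of how the uuid/nid list is used (the uuid string is illustrative):
+ * a peer uuid is bound to one or more nids and later resolved back, nid by
+ * nid, through the index argument:
+ *
+ *     rc = class_add_uuid("peer1_UUID", nid);
+ *     ...
+ *     lnet_nid_t peer_nid;
+ *     int i = 0;
+ *
+ *     while (lustre_uuid_to_peer("peer1_UUID", &peer_nid, i++) == 0)
+ *             ... try peer_nid ...
+ *     ...
+ *     class_del_uuid("peer1_UUID");
+ */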
diff --git a/drivers/staging/lustre/lustre/obdclass/md_attrs.c b/drivers/staging/lustre/lustre/obdclass/md_attrs.c
new file mode 100644 (file)
index 0000000..b71344a
--- /dev/null
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Johann Lombardi <johann.lombardi@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <md_object.h>
+
+/**
+ * Initialize new \a lma. Only fid is stored.
+ *
+ * \param lma - is the new LMA structure to be initialized
+ * \param fid - is the FID of the object this LMA belongs to
+ * \param incompat - features that MDS must understand to access object
+ */
+void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid,
+                    __u32 incompat)
+{
+       lma->lma_compat   = 0;
+       lma->lma_incompat = incompat;
+       lma->lma_self_fid = *fid;
+
+       /* If a field is added in struct lustre_mdt_attrs, zero it explicitly
+        * and change the test below. */
+       LASSERT(sizeof(*lma) ==
+               (offsetof(struct lustre_mdt_attrs, lma_self_fid) +
+                sizeof(lma->lma_self_fid)));
+}
+EXPORT_SYMBOL(lustre_lma_init);
+
+/**
+ * Swab, if needed, LMA structure which is stored on-disk in little-endian order.
+ *
+ * \param lma - is a pointer to the LMA structure to be swabbed.
+ */
+void lustre_lma_swab(struct lustre_mdt_attrs *lma)
+{
+       /* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+       if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+               __swab32s(&lma->lma_compat);
+               __swab32s(&lma->lma_incompat);
+               lustre_swab_lu_fid(&lma->lma_self_fid);
+       }
+}
+EXPORT_SYMBOL(lustre_lma_swab);
+
+/**
+ * Swab, if needed, SOM structure which is stored on-disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the SOM structure to be swabbed.
+ */
+void lustre_som_swab(struct som_attrs *attrs)
+{
+       /* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+       if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+               __swab32s(&attrs->som_compat);
+               __swab32s(&attrs->som_incompat);
+               __swab64s(&attrs->som_ioepoch);
+               __swab64s(&attrs->som_size);
+               __swab64s(&attrs->som_blocks);
+               __swab64s(&attrs->som_mountid);
+       }
+}
+EXPORT_SYMBOL(lustre_som_swab);
+
+/*
+ * Swab and extract SOM attributes from on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk SOM extended attribute.
+ * \param rc  - is the size of the SOM xattr stored in \a buf, or an error code
+ * \param msd - is the md_som_data structure into which SOM attributes are extracted.
+ */
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd)
+{
+       struct som_attrs *attrs = (struct som_attrs *)buf;
+       ENTRY;
+
+       if (rc == 0 || rc == -ENODATA)
+               /* no SOM attributes */
+               RETURN(-ENODATA);
+
+       if (rc < 0)
+               /* error hit while fetching xattr */
+               RETURN(rc);
+
+       /* check SOM compatibility */
+       if (attrs->som_incompat & ~cpu_to_le32(SOM_INCOMPAT_SUPP))
+               RETURN(-ENODATA);
+
+       /* unpack SOM attributes */
+       lustre_som_swab(attrs);
+
+       /* fill in-memory msd structure */
+       msd->msd_compat   = attrs->som_compat;
+       msd->msd_incompat = attrs->som_incompat;
+       msd->msd_ioepoch  = attrs->som_ioepoch;
+       msd->msd_size     = attrs->som_size;
+       msd->msd_blocks   = attrs->som_blocks;
+       msd->msd_mountid  = attrs->som_mountid;
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2som);
+
+/**
+ * Swab, if needed, HSM structure which is stored on-disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the HSM structure to be swabbed.
+ */
+void lustre_hsm_swab(struct hsm_attrs *attrs)
+{
+       /* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+       if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+               __swab32s(&attrs->hsm_compat);
+               __swab32s(&attrs->hsm_flags);
+               __swab64s(&attrs->hsm_arch_id);
+               __swab64s(&attrs->hsm_arch_ver);
+       }
+}
+EXPORT_SYMBOL(lustre_hsm_swab);
+
+/*
+ * Swab and extract HSM attributes from on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk HSM extended attribute.
+ * \param rc  - is the size of the HSM xattr stored in \a buf, or an error code
+ * \param mh  - is the md_hsm structure into which HSM attributes are extracted.
+ */
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh)
+{
+       struct hsm_attrs *attrs = (struct hsm_attrs *)buf;
+       ENTRY;
+
+       if (rc == 0 || rc == -ENODATA)
+               /* no HSM attributes */
+               RETURN(-ENODATA);
+
+       if (rc < 0)
+               /* error hit while fetching xattr */
+               RETURN(rc);
+
+       /* unpack HSM attributes */
+       lustre_hsm_swab(attrs);
+
+       /* fill md_hsm structure */
+       mh->mh_compat   = attrs->hsm_compat;
+       mh->mh_flags    = attrs->hsm_flags;
+       mh->mh_arch_id  = attrs->hsm_arch_id;
+       mh->mh_arch_ver = attrs->hsm_arch_ver;
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2hsm);
+
+/*
+ * Pack HSM attributes.
+ *
+ * \param buf - is the output buffer where to pack the on-disk HSM xattr.
+ * \param mh  - is the md_hsm structure to pack.
+ */
+void lustre_hsm2buf(void *buf, struct md_hsm *mh)
+{
+       struct hsm_attrs *attrs = (struct hsm_attrs *)buf;
+       ENTRY;
+
+       /* copy HSM attributes */
+       attrs->hsm_compat   = mh->mh_compat;
+       attrs->hsm_flags    = mh->mh_flags;
+       attrs->hsm_arch_id  = mh->mh_arch_id;
+       attrs->hsm_arch_ver = mh->mh_arch_ver;
+
+       /* pack xattr */
+       lustre_hsm_swab(attrs);
+}
+EXPORT_SYMBOL(lustre_hsm2buf);
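+
+/*
+ * Round-trip sketch for the HSM xattr helpers (the buffer handling is
+ * illustrative): lustre_hsm2buf() packs an in-memory struct md_hsm into the
+ * on-disk little-endian format, and lustre_buf2hsm() unpacks it again,
+ * taking the xattr-read return code as its second argument:
+ *
+ *     char buf[sizeof(struct hsm_attrs)];
+ *     struct md_hsm mh, mh2;
+ *
+ *     ... fill mh ...
+ *     lustre_hsm2buf(buf, &mh);
+ *     ... store/fetch the xattr, obtaining rc bytes in buf ...
+ *     rc = lustre_buf2hsm(buf, rc, &mh2);
+ */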
diff --git a/drivers/staging/lustre/lustre/obdclass/md_local_object.c b/drivers/staging/lustre/lustre/obdclass/md_local_object.c
new file mode 100644 (file)
index 0000000..ac5f44f
--- /dev/null
@@ -0,0 +1,459 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/md_local_object.c
+ *
+ * Lustre Local Object create APIs
+ * 'create on first mount' facility. Files registered with the llo module will
+ * be created on first mount.
+ *
+ * Author: Pravin Shelar  <pravin.shelar@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+#include <linux/list.h>
+#include <md_object.h>
+
+
+/** List head to hold list of objects to be created. */
+static struct list_head llo_lobj_list;
+
+/** Lock to protect list manipulations */
+static struct mutex    llo_lock;
+
+/**
+ * Structure used to maintain state of path parsing.
+ * \see llo_find_entry, llo_store_resolve
+ */
+struct llo_find_hint {
+       struct lu_fid   *lfh_cfid;
+       struct md_device     *lfh_md;
+       struct md_object     *lfh_pobj;
+};
+
+/**
+ * Thread Local storage for this module.
+ */
+struct llo_thread_info {
+       /** buffer to resolve path */
+       char                lti_buf[DT_MAX_PATH];
+       /** used for path resolve */
+       struct lu_fid      lti_fid;
+       /** used to pass child object fid */
+       struct lu_fid      lti_cfid;
+       struct llo_find_hint    lti_lfh;
+       struct md_op_spec       lti_spc;
+       struct md_attr    lti_ma;
+       struct lu_name    lti_lname;
+};
+
+LU_KEY_INIT(llod_global, struct llo_thread_info);
+LU_KEY_FINI(llod_global, struct llo_thread_info);
+
+static struct lu_context_key llod_key = {
+       .lct_tags = LCT_MD_THREAD,
+       .lct_init = llod_global_key_init,
+       .lct_fini = llod_global_key_fini
+};
+
+static inline struct llo_thread_info *llo_env_info(const struct lu_env *env)
+{
+       return lu_context_key_get(&env->le_ctx, &llod_key);
+}
+
+/**
+ * Search md object for given fid.
+ */
+static struct md_object *llo_locate(const struct lu_env *env,
+                                   struct md_device *md,
+                                   const struct lu_fid *fid)
+{
+       struct lu_object *obj;
+       struct md_object *mdo;
+
+       obj = lu_object_find(env, &md->md_lu_dev, fid, NULL);
+       if (!IS_ERR(obj)) {
+               obj = lu_object_locate(obj->lo_header, md->md_lu_dev.ld_type);
+               LASSERT(obj != NULL);
+               mdo = (struct md_object *) obj;
+       } else
+               mdo = (struct md_object *)obj;
+       return mdo;
+}
+
+/**
+ * Lookup FID for object named \a name in directory \a pobj.
+ */
+static int llo_lookup(const struct lu_env  *env,
+                     struct md_object *pobj,
+                     const char *name,
+                     struct lu_fid *fid)
+{
+       struct llo_thread_info *info = llo_env_info(env);
+       struct lu_name    *lname = &info->lti_lname;
+       struct md_op_spec       *spec = &info->lti_spc;
+
+       spec->sp_feat = NULL;
+       spec->sp_cr_flags = 0;
+       spec->sp_cr_lookup = 0;
+       spec->sp_cr_mode = 0;
+
+       lname->ln_name = name;
+       lname->ln_namelen = strlen(name);
+
+       return mdo_lookup(env, pobj, lname, fid, spec);
+}
+
+/**
+ * Function to look up a path component; it is passed to the parsing
+ * function. \see llo_store_resolve
+ *
+ * \retval      rc returns error code for lookup or locate operation
+ *
+ * pointer to object is returned in data (lfh->lfh_pobj)
+ */
+static int llo_find_entry(const struct lu_env  *env,
+                         const char *name, void *data)
+{
+       struct llo_find_hint    *lfh = data;
+       struct md_device        *md = lfh->lfh_md;
+       struct lu_fid      *fid = lfh->lfh_cfid;
+       struct md_object        *obj = lfh->lfh_pobj;
+       int                  result;
+
+       /* lookup fid for object */
+       result = llo_lookup(env, obj, name, fid);
+       lu_object_put(env, &obj->mo_lu);
+
+       if (result == 0) {
+               /* get md object for fid that we got in lookup */
+               obj = llo_locate(env, md, fid);
+               if (IS_ERR(obj))
+                       result = PTR_ERR(obj);
+       }
+
+       lfh->lfh_pobj = obj;
+       return result;
+}
+
+static struct md_object *llo_reg_open(const struct lu_env *env,
+                                     struct md_device *md,
+                                     struct md_object *p,
+                                     const char *name,
+                                     struct lu_fid *fid)
+{
+       struct md_object *o;
+       int result;
+
+       result = llo_lookup(env, p, name, fid);
+       if (result == 0)
+               o = llo_locate(env, md, fid);
+       else
+               o = ERR_PTR(result);
+
+       return o;
+}
+
+/**
+ * Resolve the given \a path. On success the function returns the
+ * md object for the last directory in the path, and \a fid is set to
+ * its fid.
+ */
+struct md_object *llo_store_resolve(const struct lu_env *env,
+                                   struct md_device *md,
+                                   struct dt_device *dt,
+                                   const char *path,
+                                   struct lu_fid *fid)
+{
+       struct llo_thread_info *info = llo_env_info(env);
+       struct llo_find_hint *lfh = &info->lti_lfh;
+       char *local = info->lti_buf;
+       struct md_object        *obj;
+       int result;
+
+       strncpy(local, path, DT_MAX_PATH);
+       local[DT_MAX_PATH - 1] = '\0';
+
+       lfh->lfh_md = md;
+       lfh->lfh_cfid = fid;
+       /* start path resolution from backend fs root. */
+       result = dt->dd_ops->dt_root_get(env, dt, fid);
+       if (result == 0) {
+               /* get md object for root */
+               obj = llo_locate(env, md, fid);
+               if (!IS_ERR(obj)) {
+                       /* start path parser from root md */
+                       lfh->lfh_pobj = obj;
+                       result = dt_path_parser(env, local, llo_find_entry, lfh);
+                       if (result != 0)
+                               obj = ERR_PTR(result);
+                       else
+                               obj = lfh->lfh_pobj;
+               }
+       } else {
+               obj = ERR_PTR(result);
+       }
+       return obj;
+}
+EXPORT_SYMBOL(llo_store_resolve);
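+
+/*
+ * Example (illustrative sketch; env, md and dt come from the caller's
+ * context and the path is hypothetical):
+ *
+ *        struct lu_fid fid;
+ *        struct md_object *dir;
+ *
+ *        dir = llo_store_resolve(env, md, dt, "ROOT/subdir", &fid);
+ *        if (IS_ERR(dir))
+ *                return PTR_ERR(dir);
+ *        ... use dir; fid now holds the fid of the last directory ...
+ *        lu_object_put(env, &dir->mo_lu);
+ */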
+
+/**
+ * Returns the md object for \a objname in the given \a dirname.
+ */
+struct md_object *llo_store_open(const struct lu_env *env,
+                                struct md_device *md,
+                                struct dt_device *dt,
+                                const char *dirname,
+                                const char *objname,
+                                struct lu_fid *fid)
+{
+       struct md_object *obj;
+       struct md_object *dir;
+
+       /* search md object for parent dir */
+       dir = llo_store_resolve(env, md, dt, dirname, fid);
+       if (!IS_ERR(dir)) {
+               obj = llo_reg_open(env, md, dir, objname, fid);
+               lu_object_put(env, &dir->mo_lu);
+       } else
+               obj = dir;
+
+       return obj;
+}
+EXPORT_SYMBOL(llo_store_open);
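+
+/*
+ * Example (illustrative sketch; the directory and object names are
+ * hypothetical):
+ *
+ *        struct lu_fid fid;
+ *        struct md_object *obj;
+ *
+ *        obj = llo_store_open(env, md, dt, "", "example_obj", &fid);
+ *        if (IS_ERR(obj))
+ *                return PTR_ERR(obj);
+ *        ... fid now holds the fid of "example_obj" ...
+ *        lu_object_put(env, &obj->mo_lu);
+ */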
+
+static struct md_object *llo_create_obj(const struct lu_env *env,
+                                       struct md_device *md,
+                                       struct md_object *dir,
+                                       const char *objname,
+                                       const struct lu_fid *fid,
+                                       const struct dt_index_features *feat)
+{
+       struct llo_thread_info *info = llo_env_info(env);
+       struct md_object        *mdo;
+       struct md_attr    *ma = &info->lti_ma;
+       struct md_op_spec       *spec = &info->lti_spc;
+       struct lu_name    *lname = &info->lti_lname;
+       struct lu_attr    *la = &ma->ma_attr;
+       int rc;
+
+       mdo = llo_locate(env, md, fid);
+       if (IS_ERR(mdo))
+               return mdo;
+
+       lname->ln_name = objname;
+       lname->ln_namelen = strlen(objname);
+
+       spec->sp_feat = feat;
+       spec->sp_cr_flags = 0;
+       spec->sp_cr_lookup = 1;
+       spec->sp_cr_mode = 0;
+
+       if (feat == &dt_directory_features)
+               la->la_mode = S_IFDIR | S_IXUGO;
+       else
+               la->la_mode = S_IFREG;
+
+       la->la_mode |= S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+       la->la_uid = la->la_gid = 0;
+       la->la_valid = LA_MODE | LA_UID | LA_GID;
+
+       ma->ma_valid = 0;
+       ma->ma_need = 0;
+
+       rc = mdo_create(env, dir, lname, mdo, spec, ma);
+
+       if (rc) {
+               lu_object_put(env, &mdo->mo_lu);
+               mdo = ERR_PTR(rc);
+       }
+
+       return mdo;
+}
+
+/**
+ * Create an md object; the object may be a directory or a
+ * special index defined by \a feat, created in the parent \a dirname.
+ *
+ *       \param  md       device
+ *       \param  dirname  parent directory
+ *       \param  objname  file name
+ *       \param  fid      object fid
+ *       \param  feat     index features required for directory create
+ */
+
+struct md_object *llo_store_create_index(const struct lu_env *env,
+                                        struct md_device *md,
+                                        struct dt_device *dt,
+                                        const char *dirname,
+                                        const char *objname,
+                                        const struct lu_fid *fid,
+                                        const struct dt_index_features *feat)
+{
+       struct llo_thread_info *info = llo_env_info(env);
+       struct md_object *obj;
+       struct md_object *dir;
+       struct lu_fid *ignore = &info->lti_fid;
+
+       dir = llo_store_resolve(env, md, dt, dirname, ignore);
+       if (!IS_ERR(dir)) {
+               obj = llo_create_obj(env, md, dir, objname, fid, feat);
+               lu_object_put(env, &dir->mo_lu);
+       } else {
+               obj = dir;
+       }
+       return obj;
+}
+
+EXPORT_SYMBOL(llo_store_create_index);
+
+/**
+ * Create an md object for a regular file in \a dirname.
+ *
+ *       \param  md       device
+ *       \param  dirname  parent directory
+ *       \param  objname  file name
+ *       \param  fid      object fid.
+ */
+
+struct md_object *llo_store_create(const struct lu_env *env,
+                                  struct md_device *md,
+                                  struct dt_device *dt,
+                                  const char *dirname,
+                                  const char *objname,
+                                  const struct lu_fid *fid)
+{
+       return llo_store_create_index(env, md, dt, dirname,
+                                     objname, fid, NULL);
+}
+
+EXPORT_SYMBOL(llo_store_create);
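+
+/*
+ * Example (illustrative sketch; the object name is hypothetical and the
+ * fid is assumed to have been chosen by the caller, e.g. via
+ * lu_local_obj_fid() as in llo_local_objects_setup() below):
+ *
+ *        struct md_object *o;
+ *
+ *        o = llo_store_create(env, md, dt, "", "example_obj", &fid);
+ *        if (IS_ERR(o) && PTR_ERR(o) != -EEXIST)
+ *                return PTR_ERR(o);
+ *        if (!IS_ERR(o))
+ *                lu_object_put(env, &o->mo_lu);
+ */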
+
+/**
+ * Register object for 'create on first mount' facility.
+ * Objects are created in the order of registration.
+ */
+
+void llo_local_obj_register(struct lu_local_obj_desc *llod)
+{
+       mutex_lock(&llo_lock);
+       list_add_tail(&llod->llod_linkage, &llo_lobj_list);
+       mutex_unlock(&llo_lock);
+}
+
+EXPORT_SYMBOL(llo_local_obj_register);
+
+void llo_local_obj_unregister(struct lu_local_obj_desc *llod)
+{
+       mutex_lock(&llo_lock);
+       list_del(&llod->llod_linkage);
+       mutex_unlock(&llo_lock);
+}
+
+EXPORT_SYMBOL(llo_local_obj_unregister);
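+
+/*
+ * Example (illustrative sketch): a module can register a descriptor so its
+ * object is created on first mount; only the fields consumed by
+ * llo_local_objects_setup() below are shown, and the name/oid values are
+ * hypothetical:
+ *
+ *        static struct lu_local_obj_desc llod_example = {
+ *                .llod_name     = "example_obj",
+ *                .llod_oid      = EXAMPLE_LOCAL_OID,
+ *                .llod_is_index = 1,
+ *                .llod_feat     = &dt_directory_features,
+ *        };
+ *
+ *        llo_local_obj_register(&llod_example);
+ *        ...
+ *        llo_local_obj_unregister(&llod_example);
+ */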
+
+/**
+ * Create registered objects.
+ */
+
+int llo_local_objects_setup(const struct lu_env *env,
+                            struct md_device * md,
+                            struct dt_device *dt)
+{
+       struct llo_thread_info *info = llo_env_info(env);
+       struct lu_fid *fid;
+       struct lu_local_obj_desc *scan;
+       struct md_object *mdo;
+       const char *dir;
+       int rc = 0;
+
+       fid = &info->lti_cfid;
+       mutex_lock(&llo_lock);
+
+       list_for_each_entry(scan, &llo_lobj_list, llod_linkage) {
+               lu_local_obj_fid(fid, scan->llod_oid);
+               dir = "";
+               if (scan->llod_dir)
+                       dir = scan->llod_dir;
+
+               if (scan->llod_is_index)
+                       mdo = llo_store_create_index(env, md, dt,
+                                                    dir, scan->llod_name,
+                                                    fid,
+                                                    scan->llod_feat);
+               else
+                       mdo = llo_store_create(env, md, dt,
+                                              dir, scan->llod_name,
+                                              fid);
+               if (IS_ERR(mdo) && PTR_ERR(mdo) != -EEXIST) {
+                       rc = PTR_ERR(mdo);
+                       CERROR("creating obj [%s] fid = "DFID" rc = %d\n",
+                              scan->llod_name, PFID(fid), rc);
+                       goto out;
+               }
+
+               if (!IS_ERR(mdo))
+                       lu_object_put(env, &mdo->mo_lu);
+       }
+
+out:
+       mutex_unlock(&llo_lock);
+       return rc;
+}
+
+EXPORT_SYMBOL(llo_local_objects_setup);
+
+int llo_global_init(void)
+{
+       int result;
+
+       INIT_LIST_HEAD(&llo_lobj_list);
+       mutex_init(&llo_lock);
+
+       LU_CONTEXT_KEY_INIT(&llod_key);
+       result = lu_context_key_register(&llod_key);
+       return result;
+}
+
+void llo_global_fini(void)
+{
+       lu_context_key_degister(&llod_key);
+       LASSERT(list_empty(&llo_lobj_list));
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/mea.c b/drivers/staging/lustre/lustre/obdclass/mea.c
new file mode 100644 (file)
index 0000000..c4f0dbc
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/kmod.h>   /* for request_module() */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+static int mea_last_char_hash(int count, char *name, int namelen)
+{
+       unsigned int c;
+
+       c = name[namelen - 1];
+       if (c == 0)
+               CWARN("looks like wrong len is passed\n");
+       c = c % count;
+       return c;
+}
+
+static int mea_all_chars_hash(int count, char *name, int namelen)
+{
+       unsigned int c = 0;
+
+       while (--namelen >= 0)
+               c += name[namelen];
+       c = c % count;
+       return c;
+}
+
+int raw_name2idx(int hashtype, int count, const char *name, int namelen)
+{
+       unsigned int    c = 0;
+       int             idx;
+
+       LASSERT(namelen > 0);
+
+       if (filename_is_volatile(name, namelen, &idx)) {
+               if ((idx >= 0) && (idx < count))
+                       return idx;
+               goto hashchoice;
+       }
+
+       if (count <= 1)
+               return 0;
+
+hashchoice:
+       switch (hashtype) {
+       case MEA_MAGIC_LAST_CHAR:
+               c = mea_last_char_hash(count, (char *)name, namelen);
+               break;
+       case MEA_MAGIC_ALL_CHARS:
+               c = mea_all_chars_hash(count, (char *)name, namelen);
+               break;
+       case MEA_MAGIC_HASH_SEGMENT:
+               CERROR("Unsupported hash type MEA_MAGIC_HASH_SEGMENT\n");
+               break;
+       default:
+               CERROR("Unknown hash type 0x%x\n", hashtype);
+       }
+
+       LASSERT(c < count);
+       return c;
+}
+EXPORT_SYMBOL(raw_name2idx);
+
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen)
+{
+       unsigned int c;
+
+       LASSERT(mea && mea->mea_count);
+
+       c = raw_name2idx(mea->mea_magic, mea->mea_count, name, namelen);
+
+       LASSERT(c < mea->mea_count);
+       return c;
+}
+EXPORT_SYMBOL(mea_name2idx);
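+
+/*
+ * Example (illustrative sketch): picking a stripe index for a name over
+ * four stripes with the "all chars" hash; the name is hypothetical:
+ *
+ *        int idx = raw_name2idx(MEA_MAGIC_ALL_CHARS, 4, "foo", 3);
+ *
+ * idx is the sum of the characters of "foo" modulo 4.
+ */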
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c
new file mode 100644 (file)
index 0000000..9636aa9
--- /dev/null
@@ -0,0 +1,1899 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_config.c
+ *
+ * Config API
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/string.h>
+#include <lustre_log.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+
+#include "llog_internal.h"
+
+static cfs_hash_ops_t uuid_hash_ops;
+static cfs_hash_ops_t nid_hash_ops;
+static cfs_hash_ops_t nid_stat_hash_ops;
+
+/*********** string parsing utils *********/
+
+/* returns 0 if we find this key in the buffer, else 1 */
+int class_find_param(char *buf, char *key, char **valp)
+{
+       char *ptr;
+
+       if (!buf)
+               return 1;
+
+       if ((ptr = strstr(buf, key)) == NULL)
+               return 1;
+
+       if (valp)
+               *valp = ptr + strlen(key);
+
+       return 0;
+}
+EXPORT_SYMBOL(class_find_param);
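+
+/*
+ * Example (illustrative sketch; the parameter key and buffer contents are
+ * hypothetical):
+ *
+ *        char *val;
+ *
+ *        if (class_find_param(buf, "some.param=", &val) == 0)
+ *                val now points just past "some.param=" inside buf
+ */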
+
+/**
+ * Check whether the proc parameter \a param is an old parameter or not from
+ * the array \a ptr which contains the mapping from old parameters to new ones.
+ * If it is an old one, then return the pointer to the cfg_interop_param
+ * structure which contains both the old and new parameters.
+ *
+ * \param param                        proc parameter
+ * \param ptr                  an array which contains the mapping from
+ *                             old parameters to new ones
+ *
+ * \retval valid-pointer       pointer to the cfg_interop_param structure
+ *                             which contains the old and new parameters
+ * \retval NULL                        \a param or \a ptr is NULL,
+ *                             or \a param is not an old parameter
+ */
+struct cfg_interop_param *class_find_old_param(const char *param,
+                                              struct cfg_interop_param *ptr)
+{
+       char *value = NULL;
+       int   name_len = 0;
+
+       if (param == NULL || ptr == NULL)
+               RETURN(NULL);
+
+       value = strchr(param, '=');
+       if (value == NULL)
+               name_len = strlen(param);
+       else
+               name_len = value - param;
+
+       while (ptr->old_param != NULL) {
+               if (strncmp(param, ptr->old_param, name_len) == 0 &&
+                   name_len == strlen(ptr->old_param))
+                       RETURN(ptr);
+               ptr++;
+       }
+
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(class_find_old_param);
+
+/**
+ * Finds a parameter in \a params and copies it to \a copy.
+ *
+ * Leading spaces are skipped. Next space or end of string is the
+ * parameter terminator with the exception that spaces inside single or double
+ * quotes get included into a parameter. The parameter is copied into \a copy
+ * which has to be allocated big enough by a caller, quotes are stripped in
+ * the copy and the copy is terminated by 0.
+ *
+ * On return \a params is set to next parameter or to NULL if last
+ * parameter is returned.
+ *
+ * \retval 0 if parameter is returned in \a copy
+ * \retval 1 otherwise
+ * \retval -EINVAL if an unbalanced quote is found
+ */
+int class_get_next_param(char **params, char *copy)
+{
+       char *q1, *q2, *str;
+       int len;
+
+       str = *params;
+       while (*str == ' ')
+               str++;
+
+       if (*str == '\0') {
+               *params = NULL;
+               return 1;
+       }
+
+       while (1) {
+               q1 = strpbrk(str, " '\"");
+               if (q1 == NULL) {
+                       len = strlen(str);
+                       memcpy(copy, str, len);
+                       copy[len] = '\0';
+                       *params = NULL;
+                       return 0;
+               }
+               len = q1 - str;
+               if (*q1 == ' ') {
+                       memcpy(copy, str, len);
+                       copy[len] = '\0';
+                       *params = str + len;
+                       return 0;
+               }
+
+               memcpy(copy, str, len);
+               copy += len;
+
+               /* search for the matching closing quote */
+               str = q1 + 1;
+               q2 = strchr(str, *q1);
+               if (q2 == NULL) {
+                       CERROR("Unbalanced quote in parameters: \"%s\"\n",
+                              *params);
+                       return -EINVAL;
+               }
+               len = q2 - str;
+               memcpy(copy, str, len);
+               copy += len;
+               str = q2 + 1;
+       }
+       return 1;
+}
+EXPORT_SYMBOL(class_get_next_param);
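+
+/*
+ * Example (illustrative sketch): walking a space-separated parameter
+ * string; "params" and "copy" are provided by the caller and "copy" must
+ * be large enough for the longest parameter:
+ *
+ *        while (params != NULL) {
+ *                rc = class_get_next_param(&params, copy);
+ *                if (rc != 0)
+ *                        break;  1 = no more parameters, < 0 = parse error
+ *                handle the NUL-terminated parameter in "copy"
+ *        }
+ */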
+
+/* returns 0 if this is the first key in the buffer, else 1.
+   valp points to first char after key. */
+int class_match_param(char *buf, char *key, char **valp)
+{
+       if (!buf)
+               return 1;
+
+       if (memcmp(buf, key, strlen(key)) != 0)
+               return 1;
+
+       if (valp)
+               *valp = buf + strlen(key);
+
+       return 0;
+}
+EXPORT_SYMBOL(class_match_param);
+
+static int parse_nid(char *buf, void *value, int quiet)
+{
+       lnet_nid_t *nid = (lnet_nid_t *)value;
+
+       *nid = libcfs_str2nid(buf);
+       if (*nid != LNET_NID_ANY)
+               return 0;
+
+       if (!quiet)
+               LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf);
+       return -EINVAL;
+}
+
+static int parse_net(char *buf, void *value)
+{
+       __u32 *net = (__u32 *)value;
+
+       *net = libcfs_str2net(buf);
+       CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net));
+       return 0;
+}
+
+enum {
+       CLASS_PARSE_NID = 1,
+       CLASS_PARSE_NET,
+};
+
+/* Returns 0 if a good value (nid or net) was parsed,
+ * 1 if nothing was found,
+ * < 0 on error.
+ * endh is set to the next separator. */
+static int class_parse_value(char *buf, int opc, void *value, char **endh,
+                            int quiet)
+{
+       char *endp;
+       char  tmp;
+       int   rc = 0;
+
+       if (!buf)
+               return 1;
+       while (*buf == ',' || *buf == ':')
+               buf++;
+       if (*buf == ' ' || *buf == '/' || *buf == '\0')
+               return 1;
+
+       /* nid separators or end of nids */
+       endp = strpbrk(buf, ",: /");
+       if (endp == NULL)
+               endp = buf + strlen(buf);
+
+       tmp = *endp;
+       *endp = '\0';
+       switch (opc) {
+       default:
+               LBUG();
+       case CLASS_PARSE_NID:
+               rc = parse_nid(buf, value, quiet);
+               break;
+       case CLASS_PARSE_NET:
+               rc = parse_net(buf, value);
+               break;
+       }
+       *endp = tmp;
+       if (rc != 0)
+               return rc;
+       if (endh)
+               *endh = endp;
+       return 0;
+}
+
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
+{
+       return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0);
+}
+EXPORT_SYMBOL(class_parse_nid);
+
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh)
+{
+       return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1);
+}
+EXPORT_SYMBOL(class_parse_nid_quiet);
+
+int class_parse_net(char *buf, __u32 *net, char **endh)
+{
+       return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0);
+}
+EXPORT_SYMBOL(class_parse_net);
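+
+/*
+ * Example (illustrative sketch), as used by class_match_nid() below:
+ * walk a ','/':'-separated nid list, advancing buf past each parsed nid:
+ *
+ *        lnet_nid_t nid;
+ *
+ *        while (class_parse_nid(buf, &nid, &buf) == 0)
+ *                handle nid;
+ */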
+
+/* Returns 1 if param contains the key and an entry matches,
+ * 0 if param contains the key but nothing matches,
+ * -1 if param does not contain the key.
+ */
+int class_match_nid(char *buf, char *key, lnet_nid_t nid)
+{
+       lnet_nid_t tmp;
+       int   rc = -1;
+
+       while (class_find_param(buf, key, &buf) == 0) {
+               /* please restrict to the nids pertaining to
+                * the specified nids */
+               while (class_parse_nid(buf, &tmp, &buf) == 0) {
+                       if (tmp == nid)
+                               return 1;
+               }
+               rc = 0;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(class_match_nid);
+
+int class_match_net(char *buf, char *key, __u32 net)
+{
+       __u32 tmp;
+       int   rc = -1;
+
+       while (class_find_param(buf, key, &buf) == 0) {
+               /* please restrict to the nids pertaining to
+                * the specified networks */
+               while (class_parse_net(buf, &tmp, &buf) == 0) {
+                       if (tmp == net)
+                               return 1;
+               }
+               rc = 0;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(class_match_net);
+
+/********************** class fns **********************/
+
+/**
+ * Create a new obd device and set the type, name and uuid.  If successful,
+ * the new device can be accessed by either name or uuid.
+ */
+int class_attach(struct lustre_cfg *lcfg)
+{
+       struct obd_device *obd = NULL;
+       char *typename, *name, *uuid;
+       int rc, len;
+       ENTRY;
+
+       if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+               CERROR("No type passed!\n");
+               RETURN(-EINVAL);
+       }
+       typename = lustre_cfg_string(lcfg, 1);
+
+       if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) {
+               CERROR("No name passed!\n");
+               RETURN(-EINVAL);
+       }
+       name = lustre_cfg_string(lcfg, 0);
+
+       if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
+               CERROR("No UUID passed!\n");
+               RETURN(-EINVAL);
+       }
+       uuid = lustre_cfg_string(lcfg, 2);
+
+       CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
+              MKSTR(typename), MKSTR(name), MKSTR(uuid));
+
+       obd = class_newdev(typename, name);
+       if (IS_ERR(obd)) {
+               /* Already exists or out of obds */
+               rc = PTR_ERR(obd);
+               obd = NULL;
+               CERROR("Cannot create device %s of type %s : %d\n",
+                      name, typename, rc);
+               GOTO(out, rc);
+       }
+       LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
+                name, typename);
+       LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+                "obd %p obd_magic %08X != %08X\n",
+                obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+       LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
+                "%p obd_name %s != %s\n", obd, obd->obd_name, name);
+
+       rwlock_init(&obd->obd_pool_lock);
+       obd->obd_pool_limit = 0;
+       obd->obd_pool_slv = 0;
+
+       INIT_LIST_HEAD(&obd->obd_exports);
+       INIT_LIST_HEAD(&obd->obd_unlinked_exports);
+       INIT_LIST_HEAD(&obd->obd_delayed_exports);
+       INIT_LIST_HEAD(&obd->obd_exports_timed);
+       INIT_LIST_HEAD(&obd->obd_nid_stats);
+       spin_lock_init(&obd->obd_nid_lock);
+       spin_lock_init(&obd->obd_dev_lock);
+       mutex_init(&obd->obd_dev_mutex);
+       spin_lock_init(&obd->obd_osfs_lock);
+       /* obd->obd_osfs_age must be set to a value in the distant
+        * past to guarantee a fresh statfs is fetched on mount. */
+       obd->obd_osfs_age = cfs_time_shift_64(-1000);
+
+       /* XXX belongs in setup not attach  */
+       init_rwsem(&obd->obd_observer_link_sem);
+       /* recovery data */
+       cfs_init_timer(&obd->obd_recovery_timer);
+       spin_lock_init(&obd->obd_recovery_task_lock);
+       init_waitqueue_head(&obd->obd_next_transno_waitq);
+       init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
+       INIT_LIST_HEAD(&obd->obd_req_replay_queue);
+       INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
+       INIT_LIST_HEAD(&obd->obd_final_req_queue);
+       INIT_LIST_HEAD(&obd->obd_evict_list);
+
+       llog_group_init(&obd->obd_olg, FID_SEQ_LLOG);
+
+       obd->obd_conn_inprogress = 0;
+
+       len = strlen(uuid);
+       if (len >= sizeof(obd->obd_uuid)) {
+               CERROR("uuid must be < %d bytes long\n",
+                      (int)sizeof(obd->obd_uuid));
+               GOTO(out, rc = -EINVAL);
+       }
+       memcpy(obd->obd_uuid.uuid, uuid, len);
+
+       /* do the attach */
+       if (OBP(obd, attach)) {
+               rc = OBP(obd, attach)(obd, sizeof(*lcfg), lcfg);
+               if (rc)
+                       GOTO(out, rc = -EINVAL);
+       }
+
+       /* Detach drops this */
+       spin_lock(&obd->obd_dev_lock);
+       atomic_set(&obd->obd_refcount, 1);
+       spin_unlock(&obd->obd_dev_lock);
+       lu_ref_init(&obd->obd_reference);
+       lu_ref_add(&obd->obd_reference, "attach", obd);
+
+       obd->obd_attached = 1;
+       CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+              obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
+       RETURN(0);
+ out:
+       if (obd != NULL) {
+               class_release_dev(obd);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(class_attach);
+
+/** Create hashes, self-export, and call type-specific setup.
+ * Setup is effectively the "start this obd" call.
+ */
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       int err = 0;
+       struct obd_export *exp;
+       ENTRY;
+
+       LASSERT(obd != NULL);
+       LASSERTF(obd == class_num2obd(obd->obd_minor),
+                "obd %p != obd_devs[%d] %p\n",
+                obd, obd->obd_minor, class_num2obd(obd->obd_minor));
+       LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+                "obd %p obd_magic %08x != %08x\n",
+                obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+       /* have we attached a type to this device? */
+       if (!obd->obd_attached) {
+               CERROR("Device %d not attached\n", obd->obd_minor);
+               RETURN(-ENODEV);
+       }
+
+       if (obd->obd_set_up) {
+               CERROR("Device %d already setup (type %s)\n",
+                      obd->obd_minor, obd->obd_type->typ_name);
+               RETURN(-EEXIST);
+       }
+
+       /* is someone else setting us up right now? (attach inits spinlock) */
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_starting) {
+               spin_unlock(&obd->obd_dev_lock);
+               CERROR("Device %d setup in progress (type %s)\n",
+                      obd->obd_minor, obd->obd_type->typ_name);
+               RETURN(-EEXIST);
+       }
+       /* just leave this on forever.  I can't use obd_set_up here because
+          other fns check that status, and we're not actually set up yet. */
+       obd->obd_starting = 1;
+       obd->obd_uuid_hash = NULL;
+       obd->obd_nid_hash = NULL;
+       obd->obd_nid_stats_hash = NULL;
+       spin_unlock(&obd->obd_dev_lock);
+
+       /* create a uuid-export lustre hash */
+       obd->obd_uuid_hash = cfs_hash_create("UUID_HASH",
+                                            HASH_UUID_CUR_BITS,
+                                            HASH_UUID_MAX_BITS,
+                                            HASH_UUID_BKT_BITS, 0,
+                                            CFS_HASH_MIN_THETA,
+                                            CFS_HASH_MAX_THETA,
+                                            &uuid_hash_ops, CFS_HASH_DEFAULT);
+       if (!obd->obd_uuid_hash)
+               GOTO(err_hash, err = -ENOMEM);
+
+       /* create a nid-export lustre hash */
+       obd->obd_nid_hash = cfs_hash_create("NID_HASH",
+                                           HASH_NID_CUR_BITS,
+                                           HASH_NID_MAX_BITS,
+                                           HASH_NID_BKT_BITS, 0,
+                                           CFS_HASH_MIN_THETA,
+                                           CFS_HASH_MAX_THETA,
+                                           &nid_hash_ops, CFS_HASH_DEFAULT);
+       if (!obd->obd_nid_hash)
+               GOTO(err_hash, err = -ENOMEM);
+
+       /* create a nid-stats lustre hash */
+       obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
+                                                 HASH_NID_STATS_CUR_BITS,
+                                                 HASH_NID_STATS_MAX_BITS,
+                                                 HASH_NID_STATS_BKT_BITS, 0,
+                                                 CFS_HASH_MIN_THETA,
+                                                 CFS_HASH_MAX_THETA,
+                                                 &nid_stat_hash_ops, CFS_HASH_DEFAULT);
+       if (!obd->obd_nid_stats_hash)
+               GOTO(err_hash, err = -ENOMEM);
+
+       exp = class_new_export(obd, &obd->obd_uuid);
+       if (IS_ERR(exp))
+               GOTO(err_hash, err = PTR_ERR(exp));
+
+       obd->obd_self_export = exp;
+       list_del_init(&exp->exp_obd_chain_timed);
+       class_export_put(exp);
+
+       err = obd_setup(obd, lcfg);
+       if (err)
+               GOTO(err_exp, err);
+
+       obd->obd_set_up = 1;
+
+       spin_lock(&obd->obd_dev_lock);
+       /* cleanup drops this */
+       class_incref(obd, "setup", obd);
+       spin_unlock(&obd->obd_dev_lock);
+
+       CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n",
+              obd->obd_name, obd->obd_uuid.uuid);
+
+       RETURN(0);
+err_exp:
+       if (obd->obd_self_export) {
+               class_unlink_export(obd->obd_self_export);
+               obd->obd_self_export = NULL;
+       }
+err_hash:
+       if (obd->obd_uuid_hash) {
+               cfs_hash_putref(obd->obd_uuid_hash);
+               obd->obd_uuid_hash = NULL;
+       }
+       if (obd->obd_nid_hash) {
+               cfs_hash_putref(obd->obd_nid_hash);
+               obd->obd_nid_hash = NULL;
+       }
+       if (obd->obd_nid_stats_hash) {
+               cfs_hash_putref(obd->obd_nid_stats_hash);
+               obd->obd_nid_stats_hash = NULL;
+       }
+       obd->obd_starting = 0;
+       CERROR("setup %s failed (%d)\n", obd->obd_name, err);
+       return err;
+}
+EXPORT_SYMBOL(class_setup);
+
+/** We have finished using this obd and are ready to destroy it.
+ * There can be no more references to this obd.
+ */
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       ENTRY;
+
+       if (obd->obd_set_up) {
+               CERROR("OBD device %d still set up\n", obd->obd_minor);
+               RETURN(-EBUSY);
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       if (!obd->obd_attached) {
+               spin_unlock(&obd->obd_dev_lock);
+               CERROR("OBD device %d not attached\n", obd->obd_minor);
+               RETURN(-ENODEV);
+       }
+       obd->obd_attached = 0;
+       spin_unlock(&obd->obd_dev_lock);
+
+       CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
+              obd->obd_name, obd->obd_uuid.uuid);
+
+       class_decref(obd, "attach", obd);
+       RETURN(0);
+}
+EXPORT_SYMBOL(class_detach);
+
+/** Start shutting down the obd.  There may be in-progress ops when
+ * this is called.  We tell them to start shutting down with a call
+ * to class_disconnect_exports().
+ */
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       int err = 0;
+       char *flag;
+       ENTRY;
+
+       OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS);
+
+       if (!obd->obd_set_up) {
+               CERROR("Device %d not setup\n", obd->obd_minor);
+               RETURN(-ENODEV);
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_stopping) {
+               spin_unlock(&obd->obd_dev_lock);
+               CERROR("OBD %d already stopping\n", obd->obd_minor);
+               RETURN(-ENODEV);
+       }
+       /* Leave this on forever */
+       obd->obd_stopping = 1;
+
+       /* wait for already-arrived-connections to finish. */
+       while (obd->obd_conn_inprogress > 0) {
+               spin_unlock(&obd->obd_dev_lock);
+
+               cond_resched();
+
+               spin_lock(&obd->obd_dev_lock);
+       }
+       spin_unlock(&obd->obd_dev_lock);
+
+       if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) {
+               for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++)
+                       switch (*flag) {
+                       case 'F':
+                               obd->obd_force = 1;
+                               break;
+                       case 'A':
+                               LCONSOLE_WARN("Failing over %s\n",
+                                             obd->obd_name);
+                               obd->obd_fail = 1;
+                               obd->obd_no_transno = 1;
+                               obd->obd_no_recov = 1;
+                               if (OBP(obd, iocontrol)) {
+                                       obd_iocontrol(OBD_IOC_SYNC,
+                                                     obd->obd_self_export,
+                                                     0, NULL, NULL);
+                               }
+                               break;
+                       default:
+                               CERROR("Unrecognised flag '%c'\n", *flag);
+                       }
+       }
+
+       LASSERT(obd->obd_self_export);
+
+       /* The three references that should be remaining are the
+        * obd_self_export and the attach and setup references. */
+       if (atomic_read(&obd->obd_refcount) > 3) {
+               /* refcount - 3 might be the number of real exports
+                  (excluding self export). But class_incref is called
+                  by other things as well, so don't count on it. */
+               CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
+                      obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
+               dump_exports(obd, 0);
+               class_disconnect_exports(obd);
+       }
+
+       /* Precleanup, we must make sure all exports get destroyed. */
+       err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS);
+       if (err)
+               CERROR("Precleanup %s returned %d\n",
+                      obd->obd_name, err);
+
+       /* destroy a uuid-export hash body */
+       if (obd->obd_uuid_hash) {
+               cfs_hash_putref(obd->obd_uuid_hash);
+               obd->obd_uuid_hash = NULL;
+       }
+
+       /* destroy a nid-export hash body */
+       if (obd->obd_nid_hash) {
+               cfs_hash_putref(obd->obd_nid_hash);
+               obd->obd_nid_hash = NULL;
+       }
+
+       /* destroy a nid-stats hash body */
+       if (obd->obd_nid_stats_hash) {
+               cfs_hash_putref(obd->obd_nid_stats_hash);
+               obd->obd_nid_stats_hash = NULL;
+       }
+
+       class_decref(obd, "setup", obd);
+       obd->obd_set_up = 0;
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(class_cleanup);
+
+struct obd_device *class_incref(struct obd_device *obd,
+                               const char *scope, const void *source)
+{
+       lu_ref_add_atomic(&obd->obd_reference, scope, source);
+       atomic_inc(&obd->obd_refcount);
+       CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
+              atomic_read(&obd->obd_refcount));
+
+       return obd;
+}
+EXPORT_SYMBOL(class_incref);
+
+void class_decref(struct obd_device *obd, const char *scope, const void *source)
+{
+       int err;
+       int refs;
+
+       spin_lock(&obd->obd_dev_lock);
+       atomic_dec(&obd->obd_refcount);
+       refs = atomic_read(&obd->obd_refcount);
+       spin_unlock(&obd->obd_dev_lock);
+       lu_ref_del(&obd->obd_reference, scope, source);
+
+       CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
+
+       if ((refs == 1) && obd->obd_stopping) {
+               /* All exports have been destroyed; there should
+                  be no more in-progress ops by this point.*/
+
+               spin_lock(&obd->obd_self_export->exp_lock);
+               obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
+               spin_unlock(&obd->obd_self_export->exp_lock);
+
+               /* note that we'll recurse into class_decref again */
+               class_unlink_export(obd->obd_self_export);
+               return;
+       }
+
+       if (refs == 0) {
+               CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+                      obd->obd_name, obd->obd_uuid.uuid);
+               LASSERT(!obd->obd_attached);
+               if (obd->obd_stopping) {
+                       /* If we're not stopping, we were never set up */
+                       err = obd_cleanup(obd);
+                       if (err)
+                               CERROR("Cleanup %s returned %d\n",
+                                      obd->obd_name, err);
+               }
+               if (OBP(obd, detach)) {
+                       err = OBP(obd, detach)(obd);
+                       if (err)
+                               CERROR("Detach returned %d\n", err);
+               }
+               class_release_dev(obd);
+       }
+}
+EXPORT_SYMBOL(class_decref);
+
+/** Add a failover nid location.
+ * Client obd types contact server obd types using this nid list.
+ */
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct obd_import *imp;
+       struct obd_uuid uuid;
+       int rc;
+       ENTRY;
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+           LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+               CERROR("invalid conn_uuid\n");
+               RETURN(-EINVAL);
+       }
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
+               CERROR("can't add connection on non-client dev\n");
+               RETURN(-EINVAL);
+       }
+
+       imp = obd->u.cli.cl_import;
+       if (!imp) {
+               CERROR("try to add conn on immature client dev\n");
+               RETURN(-EINVAL);
+       }
+
+       obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+       rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_add_conn);
+
+/** Remove a failover nid location.
+ */
+int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct obd_import *imp;
+       struct obd_uuid uuid;
+       int rc;
+       ENTRY;
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+           LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+               CERROR("invalid conn_uuid\n");
+               RETURN(-EINVAL);
+       }
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+               CERROR("can't del connection on non-client dev\n");
+               RETURN(-EINVAL);
+       }
+
+       imp = obd->u.cli.cl_import;
+       if (!imp) {
+               CERROR("try to del conn on immature client dev\n");
+               RETURN(-EINVAL);
+       }
+
+       obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+       rc = obd_del_conn(imp, &uuid);
+
+       RETURN(rc);
+}
+
+LIST_HEAD(lustre_profile_list);
+
+struct lustre_profile *class_get_profile(const char *prof)
+{
+       struct lustre_profile *lprof;
+
+       ENTRY;
+       list_for_each_entry(lprof, &lustre_profile_list, lp_list) {
+               if (!strcmp(lprof->lp_profile, prof)) {
+                       RETURN(lprof);
+               }
+       }
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(class_get_profile);
+
+/** Create a named "profile".
+ * This defines the mdc and osc names to use for a client.
+ * It is also used to define the lov to be used by an mdt.
+ */
+int class_add_profile(int proflen, char *prof, int osclen, char *osc,
+                     int mdclen, char *mdc)
+{
+       struct lustre_profile *lprof;
+       int err = 0;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "Add profile %s\n", prof);
+
+       OBD_ALLOC(lprof, sizeof(*lprof));
+       if (lprof == NULL)
+               RETURN(-ENOMEM);
+       INIT_LIST_HEAD(&lprof->lp_list);
+
+       LASSERT(proflen == (strlen(prof) + 1));
+       OBD_ALLOC(lprof->lp_profile, proflen);
+       if (lprof->lp_profile == NULL)
+               GOTO(out, err = -ENOMEM);
+       memcpy(lprof->lp_profile, prof, proflen);
+
+       LASSERT(osclen == (strlen(osc) + 1));
+       OBD_ALLOC(lprof->lp_dt, osclen);
+       if (lprof->lp_dt == NULL)
+               GOTO(out, err = -ENOMEM);
+       memcpy(lprof->lp_dt, osc, osclen);
+
+       if (mdclen > 0) {
+               LASSERT(mdclen == (strlen(mdc) + 1));
+               OBD_ALLOC(lprof->lp_md, mdclen);
+               if (lprof->lp_md == NULL)
+                       GOTO(out, err = -ENOMEM);
+               memcpy(lprof->lp_md, mdc, mdclen);
+       }
+
+       list_add(&lprof->lp_list, &lustre_profile_list);
+       RETURN(err);
+
+out:
+       if (lprof->lp_md)
+               OBD_FREE(lprof->lp_md, mdclen);
+       if (lprof->lp_dt)
+               OBD_FREE(lprof->lp_dt, osclen);
+       if (lprof->lp_profile)
+               OBD_FREE(lprof->lp_profile, proflen);
+       OBD_FREE(lprof, sizeof(*lprof));
+       RETURN(err);
+}
+
+void class_del_profile(const char *prof)
+{
+       struct lustre_profile *lprof;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "Del profile %s\n", prof);
+
+       lprof = class_get_profile(prof);
+       if (lprof) {
+               list_del(&lprof->lp_list);
+               OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+               OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+               if (lprof->lp_md)
+                       OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+               OBD_FREE(lprof, sizeof *lprof);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(class_del_profile);
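+
+/*
+ * Example (illustrative sketch): the profile API is normally driven by an
+ * LCFG_MOUNTOPT record (see class_process_config() below); the names here
+ * are hypothetical:
+ *
+ *        rc = class_add_profile(strlen("example-client") + 1, "example-client",
+ *                               strlen("example-lov") + 1, "example-lov",
+ *                               strlen("example-lmv") + 1, "example-lmv");
+ *        ...
+ *        lprof = class_get_profile("example-client");
+ *        ...
+ *        class_del_profile("example-client");
+ */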
+
+/* COMPAT_146 */
+void class_del_profiles(void)
+{
+       struct lustre_profile *lprof, *n;
+       ENTRY;
+
+       list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) {
+               list_del(&lprof->lp_list);
+               OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+               OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+               if (lprof->lp_md)
+                       OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+               OBD_FREE(lprof, sizeof *lprof);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(class_del_profiles);
+
+static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg)
+{
+       ENTRY;
+       if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0)
+               at_min = val;
+       else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0)
+               at_max = val;
+       else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0)
+               at_extra = val;
+       else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0)
+               at_early_margin = val;
+       else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0)
+               at_history = val;
+       else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0)
+               strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2),
+                       JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+       else
+               RETURN(-EINVAL);
+
+       CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val);
+       RETURN(0);
+}
+
+
+/* We can't call ll_process_config or lquota_process_config directly because
+ * they live in modules that must be loaded after this one. */
+static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL;
+static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL;
+
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg))
+{
+       client_process_config = cpc;
+}
+EXPORT_SYMBOL(lustre_register_client_process_config);
+
+/**
+ * Rename the proc parameter in \a cfg with a new name \a new_name.
+ *
+ * \param cfg     config structure which contains the proc parameter
+ * \param new_name new name of the proc parameter
+ *
+ * \retval valid-pointer    pointer to the newly-allocated config structure
+ *                         which contains the renamed proc parameter
+ * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does
+ *                         not contain a proc parameter
+ * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs
+ */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+                                    const char *new_name)
+{
+       struct lustre_cfg_bufs  *bufs = NULL;
+       struct lustre_cfg       *new_cfg = NULL;
+       char                    *param = NULL;
+       char                    *new_param = NULL;
+       char                    *value = NULL;
+       int                      name_len = 0;
+       int                      new_len = 0;
+       ENTRY;
+
+       if (cfg == NULL || new_name == NULL)
+               RETURN(ERR_PTR(-EINVAL));
+
+       param = lustre_cfg_string(cfg, 1);
+       if (param == NULL)
+               RETURN(ERR_PTR(-EINVAL));
+
+       value = strchr(param, '=');
+       if (value == NULL)
+               name_len = strlen(param);
+       else
+               name_len = value - param;
+
+       new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len;
+
+       OBD_ALLOC(new_param, new_len);
+       if (new_param == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       strcpy(new_param, new_name);
+       if (value != NULL)
+               strcat(new_param, value);
+
+       OBD_ALLOC_PTR(bufs);
+       if (bufs == NULL) {
+               OBD_FREE(new_param, new_len);
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       lustre_cfg_bufs_reset(bufs, NULL);
+       lustre_cfg_bufs_init(bufs, cfg);
+       lustre_cfg_bufs_set_string(bufs, 1, new_param);
+
+       new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs);
+
+       OBD_FREE(new_param, new_len);
+       OBD_FREE_PTR(bufs);
+       if (new_cfg == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       new_cfg->lcfg_num = cfg->lcfg_num;
+       new_cfg->lcfg_flags = cfg->lcfg_flags;
+       new_cfg->lcfg_nid = cfg->lcfg_nid;
+       new_cfg->lcfg_nal = cfg->lcfg_nal;
+
+       RETURN(new_cfg);
+}
+EXPORT_SYMBOL(lustre_cfg_rename);
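+
+/*
+ * Example (illustrative sketch; "cfg" is an existing LCFG_PARAM record and
+ * the replacement parameter name is hypothetical):
+ *
+ *        struct lustre_cfg *new_cfg;
+ *
+ *        new_cfg = lustre_cfg_rename(cfg, "new_param_name");
+ *        if (IS_ERR(new_cfg))
+ *                return PTR_ERR(new_cfg);
+ *        process new_cfg, then release it (lustre_cfg_free() is assumed
+ *        to be available for records built with lustre_cfg_new())
+ */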
+
+void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg))
+{
+       quota_process_config = qpc;
+}
+EXPORT_SYMBOL(lustre_register_quota_process_config);
+
+/** Process configuration commands given in lustre_cfg form.
+ * These may come from direct calls (e.g. class_manual_cleanup)
+ * or processing the config llog, or ioctl from lctl.
+ */
+int class_process_config(struct lustre_cfg *lcfg)
+{
+       struct obd_device *obd;
+       int err;
+
+       LASSERT(lcfg && !IS_ERR(lcfg));
+       CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command);
+
+       /* Commands that don't need a device */
+       switch (lcfg->lcfg_command) {
+       case LCFG_ATTACH: {
+               err = class_attach(lcfg);
+               GOTO(out, err);
+       }
+       case LCFG_ADD_UUID: {
+               CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64
+                      " (%s)\n", lustre_cfg_string(lcfg, 1),
+                      lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid));
+
+               err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid);
+               GOTO(out, err);
+       }
+       case LCFG_DEL_UUID: {
+               CDEBUG(D_IOCTL, "removing mappings for uuid %s\n",
+                      (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0)
+                      ? "<all uuids>" : lustre_cfg_string(lcfg, 1));
+
+               err = class_del_uuid(lustre_cfg_string(lcfg, 1));
+               GOTO(out, err);
+       }
+       case LCFG_MOUNTOPT: {
+               CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n",
+                      lustre_cfg_string(lcfg, 1),
+                      lustre_cfg_string(lcfg, 2),
+                      lustre_cfg_string(lcfg, 3));
+               /* set these mount options somewhere, so ll_fill_super
+                * can find them. */
+               err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1),
+                                       lustre_cfg_string(lcfg, 1),
+                                       LUSTRE_CFG_BUFLEN(lcfg, 2),
+                                       lustre_cfg_string(lcfg, 2),
+                                       LUSTRE_CFG_BUFLEN(lcfg, 3),
+                                       lustre_cfg_string(lcfg, 3));
+               GOTO(out, err);
+       }
+       case LCFG_DEL_MOUNTOPT: {
+               CDEBUG(D_IOCTL, "mountopt: profile %s\n",
+                      lustre_cfg_string(lcfg, 1));
+               class_del_profile(lustre_cfg_string(lcfg, 1));
+               GOTO(out, err = 0);
+       }
+       case LCFG_SET_TIMEOUT: {
+               CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
+                      obd_timeout, lcfg->lcfg_num);
+               obd_timeout = max(lcfg->lcfg_num, 1U);
+               obd_timeout_set = 1;
+               GOTO(out, err = 0);
+       }
+       case LCFG_SET_LDLM_TIMEOUT: {
+               CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n",
+                      ldlm_timeout, lcfg->lcfg_num);
+               ldlm_timeout = max(lcfg->lcfg_num, 1U);
+               if (ldlm_timeout >= obd_timeout)
+                       ldlm_timeout = max(obd_timeout / 3, 1U);
+               ldlm_timeout_set = 1;
+               GOTO(out, err = 0);
+       }
+       case LCFG_SET_UPCALL: {
+               LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n");
+               /* COMPAT_146 Don't fail on old configs */
+               GOTO(out, err = 0);
+       }
+       case LCFG_MARKER: {
+               struct cfg_marker *marker;
+               marker = lustre_cfg_buf(lcfg, 1);
+               CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step,
+                      marker->cm_flags, marker->cm_tgtname, marker->cm_comment);
+               GOTO(out, err = 0);
+       }
+       case LCFG_PARAM: {
+               char *tmp;
+               /* llite has no obd */
+               if ((class_match_param(lustre_cfg_string(lcfg, 1),
+                                      PARAM_LLITE, 0) == 0) &&
+                   client_process_config) {
+                       err = (*client_process_config)(lcfg);
+                       GOTO(out, err);
+               } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+                                             PARAM_SYS, &tmp) == 0)) {
+                       /* Global param settings */
+                       err = class_set_global(tmp, lcfg->lcfg_num, lcfg);
+                       /*
+                        * Client or server should not fail to mount if
+                        * it hits an unknown configuration parameter.
+                        */
+                       if (err != 0)
+                               CWARN("Ignoring unknown param %s\n", tmp);
+
+                       GOTO(out, err = 0);
+               } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+                                             PARAM_QUOTA, &tmp) == 0) &&
+                          quota_process_config) {
+                       err = (*quota_process_config)(lcfg);
+                       GOTO(out, err);
+               }
+               /* Fall through */
+               break;
+       }
+       }
+
+       /* Commands that require a device */
+       obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+       if (obd == NULL) {
+               if (!LUSTRE_CFG_BUFLEN(lcfg, 0))
+                       CERROR("this lcfg command requires a device name\n");
+               else
+                       CERROR("no device for: %s\n",
+                              lustre_cfg_string(lcfg, 0));
+
+               GOTO(out, err = -EINVAL);
+       }
+
+       switch (lcfg->lcfg_command) {
+       case LCFG_SETUP: {
+               err = class_setup(obd, lcfg);
+               GOTO(out, err);
+       }
+       case LCFG_DETACH: {
+               err = class_detach(obd, lcfg);
+               GOTO(out, err = 0);
+       }
+       case LCFG_CLEANUP: {
+               err = class_cleanup(obd, lcfg);
+               GOTO(out, err = 0);
+       }
+       case LCFG_ADD_CONN: {
+               err = class_add_conn(obd, lcfg);
+               GOTO(out, err = 0);
+       }
+       case LCFG_DEL_CONN: {
+               err = class_del_conn(obd, lcfg);
+               GOTO(out, err = 0);
+       }
+       case LCFG_POOL_NEW: {
+               err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2));
+               GOTO(out, err = 0);
+               break;
+       }
+       case LCFG_POOL_ADD: {
+               err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2),
+                                  lustre_cfg_string(lcfg, 3));
+               GOTO(out, err = 0);
+               break;
+       }
+       case LCFG_POOL_REM: {
+               err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2),
+                                  lustre_cfg_string(lcfg, 3));
+               GOTO(out, err = 0);
+               break;
+       }
+       case LCFG_POOL_DEL: {
+               err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
+               GOTO(out, err = 0);
+               break;
+       }
+       default: {
+               err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+               GOTO(out, err);
+
+       }
+       }
+out:
+       if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
+               CWARN("Ignoring error %d on optional command %#x\n", err,
+                     lcfg->lcfg_command);
+               err = 0;
+       }
+       return err;
+}
+EXPORT_SYMBOL(class_process_config);
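+
+/*
+ * Example (illustrative sketch): building and feeding an LCFG_ATTACH
+ * record by hand; the device/type/uuid names are hypothetical and such
+ * records normally come from a config llog or lctl instead:
+ *
+ *        struct lustre_cfg_bufs bufs;
+ *        struct lustre_cfg *lcfg;
+ *
+ *        lustre_cfg_bufs_reset(&bufs, "example-dev");          device name
+ *        lustre_cfg_bufs_set_string(&bufs, 1, "obdecho");      obd type
+ *        lustre_cfg_bufs_set_string(&bufs, 2, "example-uuid");
+ *        lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
+ *        rc = class_process_config(lcfg);
+ *
+ * The usual life cycle then continues with LCFG_SETUP, and later
+ * LCFG_CLEANUP and LCFG_DETACH to stop and destroy the device.
+ */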
+
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+                            struct lustre_cfg *lcfg, void *data)
+{
+       struct lprocfs_vars *var;
+       char *key, *sval;
+       int i, keylen, vallen;
+       int matched = 0, j = 0;
+       int rc = 0;
+       int skip = 0;
+       ENTRY;
+
+       if (lcfg->lcfg_command != LCFG_PARAM) {
+               CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+               RETURN(-EINVAL);
+       }
+
+       /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt
+          or   lctl conf_param lustre-MDT0000.mdt.group_upcall=bar
+          or   lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */
+       for (i = 1; i < lcfg->lcfg_bufcount; i++) {
+               key = lustre_cfg_buf(lcfg, i);
+               /* Strip off prefix */
+               class_match_param(key, prefix, &key);
+               sval = strchr(key, '=');
+               if (!sval || (*(sval + 1) == 0)) {
+                       CERROR("Can't parse param %s (missing '=')\n", key);
+                       /* rc = -EINVAL;        continue parsing other params */
+                       continue;
+               }
+               keylen = sval - key;
+               sval++;
+               vallen = strlen(sval);
+               matched = 0;
+               j = 0;
+               /* Search proc entries */
+               while (lvars[j].name) {
+                       var = &lvars[j];
+                       if (class_match_param(key, (char *)var->name, 0) == 0 &&
+                           keylen == strlen(var->name)) {
+                               matched++;
+                               rc = -EROFS;
+                               if (var->write_fptr) {
+                                       mm_segment_t oldfs;
+                                       oldfs = get_fs();
+                                       set_fs(KERNEL_DS);
+                                       rc = (var->write_fptr)(NULL, sval,
+                                                              vallen, data);
+                                       set_fs(oldfs);
+                               }
+                               break;
+                       }
+                       j++;
+               }
+               if (!matched) {
+                       /* If the prefix doesn't match, return error so we
+                          can pass it down the stack */
+                       if (strnchr(key, keylen, '.'))
+                           RETURN(-ENOSYS);
+                       CERROR("%s: unknown param %s\n",
+                              (char *)lustre_cfg_string(lcfg, 0), key);
+                       /* rc = -EINVAL;        continue parsing other params */
+                       skip++;
+               } else if (rc < 0) {
+                       CERROR("writing proc entry %s err %d\n",
+                              var->name, rc);
+                       rc = 0;
+               } else {
+                       CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n",
+                                        lustre_cfg_string(lcfg, 0),
+                                        (int)strlen(prefix) - 1, prefix,
+                                        (int)(sval - key - 1), key, sval);
+               }
+       }
+
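+       /* Return the number of unknown parameters that were skipped,
+          or 0 if every parameter matched a proc entry. */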
+       if (rc > 0)
+               rc = 0;
+       if (!rc && skip)
+               rc = skip;
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_process_proc_param);
+
+extern int lustre_check_exclusion(struct super_block *sb, char *svname);
+
+/** Handle a record from a configuration llog.  Various manipulations are
+ * applied first (compatibility fixes, skipping obsolete records, changing
+ * uuids, etc.), and the resulting record is then passed to
+ * class_process_config().
+ */
+int class_config_llog_handler(const struct lu_env *env,
+                             struct llog_handle *handle,
+                             struct llog_rec_hdr *rec, void *data)
+{
+       struct config_llog_instance *clli = data;
+       int cfg_len = rec->lrh_len;
+       char *cfg_buf = (char*) (rec + 1);
+       int rc = 0;
+       ENTRY;
+
+       //class_config_dump_handler(handle, rec, data);
+
+       switch (rec->lrh_type) {
+       case OBD_CFG_REC: {
+               struct lustre_cfg *lcfg, *lcfg_new;
+               struct lustre_cfg_bufs bufs;
+               char *inst_name = NULL;
+               int inst_len = 0;
+               int inst = 0, swab = 0;
+
+               lcfg = (struct lustre_cfg *)cfg_buf;
+               if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+                       lustre_swab_lustre_cfg(lcfg);
+                       swab = 1;
+               }
+
+               rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+               if (rc)
+                       GOTO(out, rc);
+
+               /* Figure out config state info */
+               if (lcfg->lcfg_command == LCFG_MARKER) {
+                       struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+                       lustre_swab_cfg_marker(marker, swab,
+                                              LUSTRE_CFG_BUFLEN(lcfg, 1));
+                       CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n",
+                              clli->cfg_flags, marker->cm_flags);
+                       if (marker->cm_flags & CM_START) {
+                               /* all previous flags off */
+                               clli->cfg_flags = CFG_F_MARKER;
+                               if (marker->cm_flags & CM_SKIP) {
+                                       clli->cfg_flags |= CFG_F_SKIP;
+                                       CDEBUG(D_CONFIG, "SKIP #%d\n",
+                                              marker->cm_step);
+                               } else if ((marker->cm_flags & CM_EXCLUDE) ||
+                                          (clli->cfg_sb &&
+                                           lustre_check_exclusion(clli->cfg_sb,
+                                                        marker->cm_tgtname))) {
+                                       clli->cfg_flags |= CFG_F_EXCLUDE;
+                                       CDEBUG(D_CONFIG, "EXCLUDE %d\n",
+                                              marker->cm_step);
+                               }
+                       } else if (marker->cm_flags & CM_END) {
+                               clli->cfg_flags = 0;
+                       }
+               }
+               /* A config command without a start marker before it is
+                  illegal (post 146) */
+               if (!(clli->cfg_flags & CFG_F_COMPAT146) &&
+                   !(clli->cfg_flags & CFG_F_MARKER) &&
+                   (lcfg->lcfg_command != LCFG_MARKER)) {
+                       CWARN("Config not inside markers, ignoring! "
+                             "(inst: %p, uuid: %s, flags: %#x)\n",
+                             clli->cfg_instance,
+                             clli->cfg_uuid.uuid, clli->cfg_flags);
+                       clli->cfg_flags |= CFG_F_SKIP;
+               }
+               if (clli->cfg_flags & CFG_F_SKIP) {
+                       CDEBUG(D_CONFIG, "skipping %#x\n",
+                              clli->cfg_flags);
+                       rc = 0;
+                       /* No processing! */
+                       break;
+               }
+
+               /*
+                * For interoperability between 1.8 and 2.0,
+                * rename "mds" obd device type to "mdt".
+                */
+               {
+                       char *typename = lustre_cfg_string(lcfg, 1);
+                       char *index = lustre_cfg_string(lcfg, 2);
+
+                       if ((lcfg->lcfg_command == LCFG_ATTACH && typename &&
+                            strcmp(typename, "mds") == 0)) {
+                               CWARN("For 1.8 interoperability, rename obd "
+                                      "type from mds to mdt\n");
+                               typename[2] = 't';
+                       }
+                       if ((lcfg->lcfg_command == LCFG_SETUP && index &&
+                            strcmp(index, "type") == 0)) {
+                               CDEBUG(D_INFO, "For 1.8 interoperability, "
+                                      "set this index to '0'\n");
+                               index[0] = '0';
+                               index[1] = 0;
+                       }
+               }
+
+               if ((clli->cfg_flags & CFG_F_EXCLUDE) &&
+                   (lcfg->lcfg_command == LCFG_LOV_ADD_OBD))
+                       /* Add inactive instead */
+                       lcfg->lcfg_command = LCFG_LOV_ADD_INA;
+
+               lustre_cfg_bufs_init(&bufs, lcfg);
+
+               if (clli && clli->cfg_instance &&
+                   LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) {
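+                       /* room for the obd name, a '-', the instance
+                          pointer printed as hex (two chars per byte)
+                          and a terminating NUL */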
+                       inst = 1;
+                       inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
+                                  sizeof(clli->cfg_instance) * 2 + 4;
+                       OBD_ALLOC(inst_name, inst_len);
+                       if (inst_name == NULL)
+                               GOTO(out, rc = -ENOMEM);
+                       sprintf(inst_name, "%s-%p",
+                               lustre_cfg_string(lcfg, 0),
+                               clli->cfg_instance);
+                       lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
+                       CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n",
+                              lcfg->lcfg_command, inst_name);
+               }
+
+               /* we override the llog's uuid for clients, to ensure they
+                * are unique */
+               if (clli && clli->cfg_instance != NULL &&
+                   lcfg->lcfg_command == LCFG_ATTACH) {
+                       lustre_cfg_bufs_set_string(&bufs, 2,
+                                                  clli->cfg_uuid.uuid);
+               }
+               /*
+                * sptlrpc config record, we expect 2 data segments:
+                *  [0]: fs_name/target_name,
+                *  [1]: rule string
+                * moving them to index [1] and [2], and insert MGC's
+                * obdname at index [0].
+                */
+               if (clli && clli->cfg_instance == NULL &&
+                   lcfg->lcfg_command == LCFG_SPTLRPC_CONF) {
+                       lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1],
+                                           bufs.lcfg_buflen[1]);
+                       lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0],
+                                           bufs.lcfg_buflen[0]);
+                       lustre_cfg_bufs_set_string(&bufs, 0,
+                                                  clli->cfg_obdname);
+               }
+
+               lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs);
+
+               lcfg_new->lcfg_num   = lcfg->lcfg_num;
+               lcfg_new->lcfg_flags = lcfg->lcfg_flags;
+
+               /* XXX Hack to try to remain binary compatible with
+                * pre-newconfig logs */
+               if (lcfg->lcfg_nal != 0 &&      /* pre-newconfig log? */
+                   (lcfg->lcfg_nid >> 32) == 0) {
+                       __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff);
+
+                       lcfg_new->lcfg_nid =
+                               LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr);
+                       CWARN("Converted pre-newconfig NAL %d NID %x to %s\n",
+                             lcfg->lcfg_nal, addr,
+                             libcfs_nid2str(lcfg_new->lcfg_nid));
+               } else {
+                       lcfg_new->lcfg_nid = lcfg->lcfg_nid;
+               }
+
+               lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */
+
+               rc = class_process_config(lcfg_new);
+               lustre_cfg_free(lcfg_new);
+
+               if (inst)
+                       OBD_FREE(inst_name, inst_len);
+               break;
+       }
+       default:
+               CERROR("Unknown llog record type %#x encountered\n",
+                      rec->lrh_type);
+               break;
+       }
+out:
+       if (rc) {
+               CERROR("%s: cfg command failed: rc = %d\n",
+                      handle->lgh_ctxt->loc_obd->obd_name, rc);
+               class_config_dump_handler(NULL, handle, rec, data);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_llog_handler);
+
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+                           char *name, struct config_llog_instance *cfg)
+{
+       struct llog_process_cat_data     cd = {0, 0};
+       struct llog_handle              *llh;
+       llog_cb_t                        callback;
+       int                              rc;
+       ENTRY;
+
+       CDEBUG(D_INFO, "looking up llog %s\n", name);
+       rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc)
+               GOTO(parse_out, rc);
+
+       /* continue processing from where we last stopped to end-of-log */
+       if (cfg) {
+               cd.lpcd_first_idx = cfg->cfg_last_idx;
+               callback = cfg->cfg_callback;
+               LASSERT(callback != NULL);
+       } else {
+               callback = class_config_llog_handler;
+       }
+
+       cd.lpcd_last_idx = 0;
+
+       rc = llog_process(env, llh, callback, cfg, &cd);
+
+       CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name,
+              cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc);
+       if (cfg)
+               cfg->cfg_last_idx = cd.lpcd_last_idx;
+
+parse_out:
+       llog_close(env, llh);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_parse_llog);
+
+/**
+ * Parse a config record and write a human-readable dump into the supplied
+ * buffer.  This is kept separate from class_config_dump_handler() so that
+ * it can also be used from the ioctl path.
+ */
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size)
+{
+       struct lustre_cfg       *lcfg = (struct lustre_cfg *)(rec + 1);
+       char                    *ptr = buf;
+       char                    *end = buf + size;
+       int                      rc = 0;
+
+       ENTRY;
+
+       LASSERT(rec->lrh_type == OBD_CFG_REC);
+       rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len);
+       if (rc < 0)
+               RETURN(rc);
+
+       ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command);
+       if (lcfg->lcfg_flags)
+               ptr += snprintf(ptr, end-ptr, "flags=%#08x ",
+                               lcfg->lcfg_flags);
+
+       if (lcfg->lcfg_num)
+               ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num);
+
+       if (lcfg->lcfg_nid)
+               ptr += snprintf(ptr, end-ptr, "nid=%s("LPX64")\n     ",
+                               libcfs_nid2str(lcfg->lcfg_nid),
+                               lcfg->lcfg_nid);
+
+       if (lcfg->lcfg_command == LCFG_MARKER) {
+               struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+               ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'",
+                               marker->cm_step, marker->cm_flags,
+                               marker->cm_tgtname, marker->cm_comment);
+       } else {
+               int i;
+
+               for (i = 0; i <  lcfg->lcfg_bufcount; i++) {
+                       ptr += snprintf(ptr, end-ptr, "%d:%s  ", i,
+                                       lustre_cfg_string(lcfg, i));
+               }
+       }
+       /* return consumed bytes */
+       rc = ptr - buf;
+       RETURN(rc);
+}
+
+int class_config_dump_handler(const struct lu_env *env,
+                             struct llog_handle *handle,
+                             struct llog_rec_hdr *rec, void *data)
+{
+       char    *outstr;
+       int      rc = 0;
+
+       ENTRY;
+
+       OBD_ALLOC(outstr, 256);
+       if (outstr == NULL)
+               RETURN(-ENOMEM);
+
+       if (rec->lrh_type == OBD_CFG_REC) {
+               class_config_parse_rec(rec, outstr, 256);
+               LCONSOLE(D_WARNING, "   %s\n", outstr);
+       } else {
+               LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type);
+               rc = -EINVAL;
+       }
+
+       OBD_FREE(outstr, 256);
+       RETURN(rc);
+}
+
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+                          char *name, struct config_llog_instance *cfg)
+{
+       struct llog_handle      *llh;
+       int                      rc;
+
+       ENTRY;
+
+       LCONSOLE_INFO("Dumping config log %s\n", name);
+
+       rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc)
+               GOTO(parse_out, rc);
+
+       rc = llog_process(env, llh, class_config_dump_handler, cfg, NULL);
+parse_out:
+       llog_close(env, llh);
+
+       LCONSOLE_INFO("End config log %s\n", name);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_dump_llog);
+
+/** Call class_cleanup and class_detach.
+ * "Manual" only in the sense that we're faking lcfg commands.
+ */
+int class_manual_cleanup(struct obd_device *obd)
+{
+       char                flags[3] = "";
+       struct lustre_cfg      *lcfg;
+       struct lustre_cfg_bufs  bufs;
+       int                  rc;
+       ENTRY;
+
+       if (!obd) {
+               CERROR("empty cleanup\n");
+               RETURN(-EALREADY);
+       }
+
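+       /* Encode obd_force as "F" and obd_fail as "A" in the flags
+          string passed along with the LCFG_CLEANUP command. */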
+       if (obd->obd_force)
+               strcat(flags, "F");
+       if (obd->obd_fail)
+               strcat(flags, "A");
+
+       CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n",
+              obd->obd_name, flags);
+
+       lustre_cfg_bufs_reset(&bufs, obd->obd_name);
+       lustre_cfg_bufs_set_string(&bufs, 1, flags);
+       lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+       if (!lcfg)
+               RETURN(-ENOMEM);
+
+       rc = class_process_config(lcfg);
+       if (rc) {
+               CERROR("cleanup failed %d: %s\n", rc, obd->obd_name);
+               GOTO(out, rc);
+       }
+
+       /* the lcfg is almost the same for both ops */
+       lcfg->lcfg_command = LCFG_DETACH;
+       rc = class_process_config(lcfg);
+       if (rc)
+               CERROR("detach failed %d: %s\n", rc, obd->obd_name);
+out:
+       lustre_cfg_free(lcfg);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_manual_cleanup);
+
+/*
+ * uuid<->export lustre hash operations
+ */
+
+static unsigned
+uuid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid,
+                                 sizeof(((struct obd_uuid *)key)->uuid), mask);
+}
+
+static void *
+uuid_key(struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+       return &exp->exp_client_uuid;
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int
+uuid_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       LASSERT(key);
+       exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+       return obd_uuid_equals(key, &exp->exp_client_uuid) &&
+              !exp->exp_failed;
+}
+
+static void *
+uuid_export_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+}
+
+static void
+uuid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+       class_export_get(exp);
+}
+
+static void
+uuid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+       class_export_put(exp);
+}
+
+static cfs_hash_ops_t uuid_hash_ops = {
+       .hs_hash        = uuid_hash,
+       .hs_key  = uuid_key,
+       .hs_keycmp      = uuid_keycmp,
+       .hs_object      = uuid_export_object,
+       .hs_get  = uuid_export_get,
+       .hs_put_locked  = uuid_export_put_locked,
+};
+
+
+/*
+ * nid<->export hash operations
+ */
+
+static unsigned
+nid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask);
+}
+
+static void *
+nid_key(struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+       RETURN(&exp->exp_connection->c_peer.nid);
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int
+nid_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       LASSERT(key);
+       exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+       RETURN(exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key &&
+              !exp->exp_failed);
+}
+
+static void *
+nid_export_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct obd_export, exp_nid_hash);
+}
+
+static void
+nid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+       class_export_get(exp);
+}
+
+static void
+nid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+       class_export_put(exp);
+}
+
+static cfs_hash_ops_t nid_hash_ops = {
+       .hs_hash        = nid_hash,
+       .hs_key  = nid_key,
+       .hs_keycmp      = nid_keycmp,
+       .hs_object      = nid_export_object,
+       .hs_get  = nid_export_get,
+       .hs_put_locked  = nid_export_put_locked,
+};
+
+
+/*
+ * nid<->nidstats hash operations
+ */
+
+static void *
+nidstats_key(struct hlist_node *hnode)
+{
+       struct nid_stat *ns;
+
+       ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+
+       return &ns->nid;
+}
+
+static int
+nidstats_keycmp(const void *key, struct hlist_node *hnode)
+{
+       return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key;
+}
+
+static void *
+nidstats_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct nid_stat, nid_hash);
+}
+
+static void
+nidstats_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct nid_stat *ns;
+
+       ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+       nidstat_getref(ns);
+}
+
+static void
+nidstats_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct nid_stat *ns;
+
+       ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+       nidstat_putref(ns);
+}
+
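+/* nid_stat entries are keyed by lnet_nid_t as well, so nid_hash() above is
+ * reused for the nidstats table. */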
+static cfs_hash_ops_t nid_stat_hash_ops = {
+       .hs_hash        = nid_hash,
+       .hs_key  = nidstats_key,
+       .hs_keycmp      = nidstats_keycmp,
+       .hs_object      = nidstats_object,
+       .hs_get  = nidstats_get,
+       .hs_put_locked  = nidstats_put_locked,
+};
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
new file mode 100644 (file)
index 0000000..99adad9
--- /dev/null
@@ -0,0 +1,1321 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount.c
+ *
+ * Client mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */)
+#define PRINT_CMD CDEBUG
+
+#include <obd.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <obd_class.h>
+#include <lustre/lustre_user.h>
+#include <linux/version.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+
+static int (*client_fill_super)(struct super_block *sb,
+                               struct vfsmount *mnt);
+
+static void (*kill_super_cb)(struct super_block *sb);
+
+/**************** config llog ********************/
+
+/** Get a config log from the MGS and process it.
+ * This func is called for both clients and servers.
+ * Continue to process new statements appended to the logs
+ * (whenever the config lock is revoked) until lustre_end_log
+ * is called.
+ * @param sb The superblock is used by the MGC to write to the local copy of
+ *   the config log
+ * @param logname The name of the llog to replicate from the MGS
+ * @param cfg Since the same mgc may be used to follow multiple config logs
+ *   (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
+ *   this log, and is added to the mgc's list of logs to follow.
+ */
+int lustre_process_log(struct super_block *sb, char *logname,
+                    struct config_llog_instance *cfg)
+{
+       struct lustre_cfg *lcfg;
+       struct lustre_cfg_bufs *bufs;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *mgc = lsi->lsi_mgc;
+       int rc;
+       ENTRY;
+
+       LASSERT(mgc);
+       LASSERT(cfg);
+
+       OBD_ALLOC_PTR(bufs);
+       if (bufs == NULL)
+               RETURN(-ENOMEM);
+
+       /* mgc_process_config */
+       lustre_cfg_bufs_reset(bufs, mgc->obd_name);
+       lustre_cfg_bufs_set_string(bufs, 1, logname);
+       lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
+       lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
+       lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
+       rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+       lustre_cfg_free(lcfg);
+
+       OBD_FREE_PTR(bufs);
+
+       if (rc == -EINVAL)
+               LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' "
+                                  "failed from the MGS (%d).  Make sure this "
+                                  "client and the MGS are running compatible "
+                                  "versions of Lustre.\n",
+                                  mgc->obd_name, logname, rc);
+
+       if (rc)
+               LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
+                                  "failed (%d). This may be the result of "
+                                  "communication errors between this node and "
+                                  "the MGS, a bad configuration, or other "
+                                  "errors. See the syslog for more "
+                                  "information.\n", mgc->obd_name, logname,
+                                  rc);
+
+       /* class_obd_list(); */
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_process_log);
+
+/* Stop watching this config log for updates */
+int lustre_end_log(struct super_block *sb, char *logname,
+                      struct config_llog_instance *cfg)
+{
+       struct lustre_cfg *lcfg;
+       struct lustre_cfg_bufs bufs;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *mgc = lsi->lsi_mgc;
+       int rc;
+       ENTRY;
+
+       if (!mgc)
+               RETURN(-ENOENT);
+
+       /* mgc_process_config */
+       lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
+       lustre_cfg_bufs_set_string(&bufs, 1, logname);
+       if (cfg)
+               lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
+       lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
+       rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+       lustre_cfg_free(lcfg);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_end_log);
+
+/**************** obd start *******************/
+
+/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
+ * lctl (and do for echo cli/srv).
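+ * e.g. do_lcfg(mgcname, nid, LCFG_ADD_UUID, niduuid, 0, 0, 0) adds an MGS
+ * NID under the "niduuid" name, as done in lustre_start_mgc() below.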
+ */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+           char *s1, char *s2, char *s3, char *s4)
+{
+       struct lustre_cfg_bufs bufs;
+       struct lustre_cfg    * lcfg = NULL;
+       int rc;
+
+       CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
+              cmd, s1, s2, s3, s4);
+
+       lustre_cfg_bufs_reset(&bufs, cfgname);
+       if (s1)
+               lustre_cfg_bufs_set_string(&bufs, 1, s1);
+       if (s2)
+               lustre_cfg_bufs_set_string(&bufs, 2, s2);
+       if (s3)
+               lustre_cfg_bufs_set_string(&bufs, 3, s3);
+       if (s4)
+               lustre_cfg_bufs_set_string(&bufs, 4, s4);
+
+       lcfg = lustre_cfg_new(cmd, &bufs);
+       lcfg->lcfg_nid = nid;
+       rc = class_process_config(lcfg);
+       lustre_cfg_free(lcfg);
+       return(rc);
+}
+EXPORT_SYMBOL(do_lcfg);
+
+/** Call class_attach and class_setup.  These methods in turn call
+ * obd type-specific methods.
+ */
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+                       char *s1, char *s2, char *s3, char *s4)
+{
+       int rc;
+       CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
+
+       rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
+       if (rc) {
+               CERROR("%s attach error %d\n", obdname, rc);
+               return rc;
+       }
+       rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4);
+       if (rc) {
+               CERROR("%s setup error %d\n", obdname, rc);
+               do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
+       }
+       return rc;
+}
+
+DEFINE_MUTEX(mgc_start_lock);
+
+/** Set up a mgc obd to process startup logs
+ *
+ * \param sb [in] super block of the mgc obd
+ *
+ * \retval 0 success, otherwise error code
+ */
+int lustre_start_mgc(struct super_block *sb)
+{
+       struct obd_connect_data *data = NULL;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *obd;
+       struct obd_export *exp;
+       struct obd_uuid *uuid;
+       class_uuid_t uuidc;
+       lnet_nid_t nid;
+       char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
+       char *ptr;
+       int recov_bk;
+       int rc = 0, i = 0, j, len;
+       ENTRY;
+
+       LASSERT(lsi->lsi_lmd);
+
+       /* Find the first non-lo MGS nid for our MGC name */
+       if (IS_SERVER(lsi)) {
+               /* mount -o mgsnode=nid */
+               ptr = lsi->lsi_lmd->lmd_mgs;
+               if (lsi->lsi_lmd->lmd_mgs &&
+                   (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) {
+                       i++;
+               } else if (IS_MGS(lsi)) {
+                       lnet_process_id_t id;
+                       while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+                               if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
+                                       continue;
+                               nid = id.nid;
+                               i++;
+                               break;
+                       }
+               }
+       } else { /* client */
+               /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+               ptr = lsi->lsi_lmd->lmd_dev;
+               if (class_parse_nid(ptr, &nid, &ptr) == 0)
+                       i++;
+       }
+       if (i == 0) {
+               CERROR("No valid MGS nids found.\n");
+               RETURN(-EINVAL);
+       }
+
+       mutex_lock(&mgc_start_lock);
+
+       len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
+       OBD_ALLOC(mgcname, len);
+       OBD_ALLOC(niduuid, len + 2);
+       if (!mgcname || !niduuid)
+               GOTO(out_free, rc = -ENOMEM);
+       sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
+
+       mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
+
+       OBD_ALLOC_PTR(data);
+       if (data == NULL)
+               GOTO(out_free, rc = -ENOMEM);
+
+       obd = class_name2obd(mgcname);
+       if (obd && !obd->obd_stopping) {
+               rc = obd_set_info_async(NULL, obd->obd_self_export,
+                                       strlen(KEY_MGSSEC), KEY_MGSSEC,
+                                       strlen(mgssec), mgssec, NULL);
+               if (rc)
+                       GOTO(out_free, rc);
+
+               /* Re-using an existing MGC */
+               atomic_inc(&obd->u.cli.cl_mgc_refcount);
+
+               /* IR compatibility check, only for clients */
+               if (lmd_is_client(lsi->lsi_lmd)) {
+                       int has_ir;
+                       int vallen = sizeof(*data);
+                       __u32 *flags = &lsi->lsi_lmd->lmd_flags;
+
+                       rc = obd_get_info(NULL, obd->obd_self_export,
+                                         strlen(KEY_CONN_DATA), KEY_CONN_DATA,
+                                         &vallen, data, NULL);
+                       LASSERT(rc == 0);
+                       has_ir = OCD_HAS_FLAG(data, IMP_RECOV);
+                       if (has_ir ^ !(*flags & LMD_FLG_NOIR)) {
+                               /* LMD_FLG_NOIR is for test purpose only */
+                               LCONSOLE_WARN(
+                                   "Trying to mount a client with an IR "
+                                   "setting that is not compatible with "
+                                   "the current MGC. Forcing the current "
+                                   "MGC setting: IR %s.\n",
+                                   has_ir ? "enabled" : "disabled");
+                               if (has_ir)
+                                       *flags &= ~LMD_FLG_NOIR;
+                               else
+                                       *flags |= LMD_FLG_NOIR;
+                       }
+               }
+
+               recov_bk = 0;
+               /* If we are restarting the MGS, don't try to keep the MGC's
+                  old connection, or registration will fail. */
+               if (IS_MGS(lsi)) {
+                       CDEBUG(D_MOUNT, "New MGS with live MGC\n");
+                       recov_bk = 1;
+               }
+
+               /* Try all connections, but only once (again).
+                  We don't want to block another target from starting
+                  (using its local copy of the log), but we do want to connect
+                  if at all possible. */
+               recov_bk++;
+               CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
+               rc = obd_set_info_async(NULL, obd->obd_self_export,
+                                       sizeof(KEY_INIT_RECOV_BACKUP),
+                                       KEY_INIT_RECOV_BACKUP,
+                                       sizeof(recov_bk), &recov_bk, NULL);
+               GOTO(out, rc = 0);
+       }
+
+       CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
+
+       /* Add the primary nids for the MGS */
+       i = 0;
+       sprintf(niduuid, "%s_%x", mgcname, i);
+       if (IS_SERVER(lsi)) {
+               ptr = lsi->lsi_lmd->lmd_mgs;
+               if (IS_MGS(lsi)) {
+                       /* Use local nids (including LO) */
+                       lnet_process_id_t id;
+                       while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+                               rc = do_lcfg(mgcname, id.nid,
+                                            LCFG_ADD_UUID, niduuid, 0,0,0);
+                       }
+               } else {
+                       /* Use mgsnode= nids */
+                       /* mount -o mgsnode=nid */
+                       if (lsi->lsi_lmd->lmd_mgs) {
+                               ptr = lsi->lsi_lmd->lmd_mgs;
+                       } else if (class_find_param(ptr, PARAM_MGSNODE,
+                                                   &ptr) != 0) {
+                               CERROR("No MGS nids given.\n");
+                               GOTO(out_free, rc = -EINVAL);
+                       }
+                       while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                               rc = do_lcfg(mgcname, nid,
+                                            LCFG_ADD_UUID, niduuid, 0,0,0);
+                               i++;
+                       }
+               }
+       } else { /* client */
+               /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+               ptr = lsi->lsi_lmd->lmd_dev;
+               while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                       rc = do_lcfg(mgcname, nid,
+                                    LCFG_ADD_UUID, niduuid, 0,0,0);
+                       i++;
+                       /* Stop at the first failover nid */
+                       if (*ptr == ':')
+                               break;
+               }
+       }
+       if (i == 0) {
+               CERROR("No valid MGS nids found.\n");
+               GOTO(out_free, rc = -EINVAL);
+       }
+       lsi->lsi_lmd->lmd_mgs_failnodes = 1;
+
+       /* Random uuid for MGC allows easier reconnects */
+       OBD_ALLOC_PTR(uuid);
+       ll_generate_random_uuid(uuidc);
+       class_uuid_unparse(uuidc, uuid);
+
+       /* Start the MGC */
+       rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
+                                (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
+                                niduuid, 0, 0);
+       OBD_FREE_PTR(uuid);
+       if (rc)
+               GOTO(out_free, rc);
+
+       /* Add any failover MGS nids */
+       i = 1;
+       while (ptr && ((*ptr == ':' ||
+              class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) {
+               /* New failover node */
+               sprintf(niduuid, "%s_%x", mgcname, i);
+               j = 0;
+               while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
+                       j++;
+                       rc = do_lcfg(mgcname, nid,
+                                    LCFG_ADD_UUID, niduuid, 0,0,0);
+                       if (*ptr == ':')
+                               break;
+               }
+               if (j > 0) {
+                       rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
+                                    niduuid, 0, 0, 0);
+                       i++;
+               } else {
+                       /* at ":/fsname" */
+                       break;
+               }
+       }
+       lsi->lsi_lmd->lmd_mgs_failnodes = i;
+
+       obd = class_name2obd(mgcname);
+       if (!obd) {
+               CERROR("Can't find mgcobd %s\n", mgcname);
+               GOTO(out_free, rc = -ENOTCONN);
+       }
+
+       rc = obd_set_info_async(NULL, obd->obd_self_export,
+                               strlen(KEY_MGSSEC), KEY_MGSSEC,
+                               strlen(mgssec), mgssec, NULL);
+       if (rc)
+               GOTO(out_free, rc);
+
+       /* Keep a refcount of servers/clients who started with "mount",
+          so we know when we can get rid of the mgc. */
+       atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
+
+       /* Try all connections, but only once. */
+       recov_bk = 1;
+       rc = obd_set_info_async(NULL, obd->obd_self_export,
+                               sizeof(KEY_INIT_RECOV_BACKUP),
+                               KEY_INIT_RECOV_BACKUP,
+                               sizeof(recov_bk), &recov_bk, NULL);
+       if (rc)
+               /* nonfatal */
+               CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
+
+       /* We connect to the MGS at setup, and don't disconnect until cleanup */
+       data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
+                                 OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
+                                 OBD_CONNECT_LVB_TYPE;
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+       data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+       if (lmd_is_client(lsi->lsi_lmd) &&
+           lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)
+               data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV;
+       data->ocd_version = LUSTRE_VERSION_CODE;
+       rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
+       if (rc) {
+               CERROR("connect failed %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       obd->u.cli.cl_mgc_mgsexp = exp;
+
+out:
+       /* Keep the mgc info in the sb. Note that many lsi's can point
+          to the same mgc.*/
+       lsi->lsi_mgc = obd;
+out_free:
+       mutex_unlock(&mgc_start_lock);
+
+       if (data)
+               OBD_FREE_PTR(data);
+       if (mgcname)
+               OBD_FREE(mgcname, len);
+       if (niduuid)
+               OBD_FREE(niduuid, len + 2);
+       RETURN(rc);
+}
+
+static int lustre_stop_mgc(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *obd;
+       char *niduuid = NULL, *ptr = NULL;
+       int i, rc = 0, len = 0;
+       ENTRY;
+
+       if (!lsi)
+               RETURN(-ENOENT);
+       obd = lsi->lsi_mgc;
+       if (!obd)
+               RETURN(-ENOENT);
+       lsi->lsi_mgc = NULL;
+
+       mutex_lock(&mgc_start_lock);
+       LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
+       if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
+               /* This is not fatal, every client that stops
+                  will call in here. */
+               CDEBUG(D_MOUNT, "mgc still has %d references.\n",
+                      atomic_read(&obd->u.cli.cl_mgc_refcount));
+               GOTO(out, rc = -EBUSY);
+       }
+
+       /* The MGC has no recoverable data in any case.
+        * Force shutdown set in umount_begin. */
+       obd->obd_no_recov = 1;
+
+       if (obd->u.cli.cl_mgc_mgsexp) {
+               /* An error is not fatal, if we are unable to send the
+                  disconnect mgs ping evictor cleans up the export */
+               rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
+               if (rc)
+                       CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
+       }
+
+       /* Save the obdname for cleaning the nid uuids, which are
+          obdname_XX */
+       len = strlen(obd->obd_name) + 6;
+       OBD_ALLOC(niduuid, len);
+       if (niduuid) {
+               strcpy(niduuid, obd->obd_name);
+               ptr = niduuid + strlen(niduuid);
+       }
+
+       rc = class_manual_cleanup(obd);
+       if (rc)
+               GOTO(out, rc);
+
+       /* Clean the nid uuids */
+       if (!niduuid)
+               GOTO(out, rc = -ENOMEM);
+
+       for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
+               sprintf(ptr, "_%x", i);
+               rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
+                            niduuid, 0, 0, 0);
+               if (rc)
+                       CERROR("del MGC UUID %s failed: rc = %d\n",
+                              niduuid, rc);
+       }
+out:
+       if (niduuid)
+               OBD_FREE(niduuid, len);
+
+       /* class_import_put will get rid of the additional connections */
+       mutex_unlock(&mgc_start_lock);
+       RETURN(rc);
+}
+
+/***************** lustre superblock **************/
+
+struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi;
+       ENTRY;
+
+       OBD_ALLOC_PTR(lsi);
+       if (!lsi)
+               RETURN(NULL);
+       OBD_ALLOC_PTR(lsi->lsi_lmd);
+       if (!lsi->lsi_lmd) {
+               OBD_FREE_PTR(lsi);
+               RETURN(NULL);
+       }
+
+       lsi->lsi_lmd->lmd_exclude_count = 0;
+       lsi->lsi_lmd->lmd_recovery_time_soft = 0;
+       lsi->lsi_lmd->lmd_recovery_time_hard = 0;
+       s2lsi_nocast(sb) = lsi;
+       /* we take 1 extra ref for our setup */
+       atomic_set(&lsi->lsi_mounts, 1);
+
+       /* Default umount style */
+       lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
+
+       RETURN(lsi);
+}
+
+static int lustre_free_lsi(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       ENTRY;
+
+       LASSERT(lsi != NULL);
+       CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
+
+       /* someone didn't call server_put_mount. */
+       LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
+
+       if (lsi->lsi_lmd != NULL) {
+               if (lsi->lsi_lmd->lmd_dev != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_dev,
+                                strlen(lsi->lsi_lmd->lmd_dev) + 1);
+               if (lsi->lsi_lmd->lmd_profile != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_profile,
+                                strlen(lsi->lsi_lmd->lmd_profile) + 1);
+               if (lsi->lsi_lmd->lmd_mgssec != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
+                                strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
+               if (lsi->lsi_lmd->lmd_opts != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_opts,
+                                strlen(lsi->lsi_lmd->lmd_opts) + 1);
+               if (lsi->lsi_lmd->lmd_exclude_count)
+                       OBD_FREE(lsi->lsi_lmd->lmd_exclude,
+                                sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
+                                lsi->lsi_lmd->lmd_exclude_count);
+               if (lsi->lsi_lmd->lmd_mgs != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_mgs,
+                                strlen(lsi->lsi_lmd->lmd_mgs) + 1);
+               if (lsi->lsi_lmd->lmd_osd_type != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_osd_type,
+                                strlen(lsi->lsi_lmd->lmd_osd_type) + 1);
+               if (lsi->lsi_lmd->lmd_params != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_params, 4096);
+
+               OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+       }
+
+       LASSERT(lsi->lsi_llsbi == NULL);
+       OBD_FREE(lsi, sizeof(*lsi));
+       s2lsi_nocast(sb) = NULL;
+
+       RETURN(0);
+}
+
+/* The lsi has one reference for every server that is using the disk -
+   e.g. MDT, MGS, and potentially MGC */
+int lustre_put_lsi(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       ENTRY;
+
+       LASSERT(lsi != NULL);
+
+       CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
+       if (atomic_dec_and_test(&lsi->lsi_mounts)) {
+               if (IS_SERVER(lsi) && lsi->lsi_osd_exp) {
+                       obd_disconnect(lsi->lsi_osd_exp);
+                       /* wait till OSD is gone */
+                       obd_zombie_barrier();
+               }
+               lustre_free_lsi(sb);
+               RETURN(1);
+       }
+       RETURN(0);
+}
+
+/** Get the fsname ("lustre") from the server name ("lustre-OST003F").
+ * @param [in] svname server name including type and index
+ * @param [out] fsname Buffer to copy filesystem name prefix into.
+ *  Must have at least 'strlen(fsname) + 1' chars.
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname
+ * rc < 0  on error
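+ *  e.g. svname "lustre-OST003F" copies "lustre" into fsname and sets
+ *  *endptr to point at "-OST003F"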
+ */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr)
+{
+       const char *dash = strrchr(svname, '-');
+       if (!dash) {
+               dash = strrchr(svname, ':');
+               if (!dash)
+                       return -EINVAL;
+       }
+
+       /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
+        * in the fsname, then determine the server index */
+       if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
+               dash--;
+               for (; dash > svname && *dash != '-' && *dash != ':'; dash--)
+                       ;
+               if (dash == svname)
+                       return -EINVAL;
+       }
+
+       if (fsname != NULL) {
+               strncpy(fsname, svname, dash - svname);
+               fsname[dash - svname] = '\0';
+       }
+
+       if (endptr != NULL)
+               *endptr = dash;
+
+       return 0;
+}
+EXPORT_SYMBOL(server_name2fsname);
+
+/**
+ * Get service name (svname) from string
+ * rc < 0 on error
+ * if endptr isn't NULL it is set to the end of the fsname
+ */
+int server_name2svname(const char *label, char *svname, const char **endptr,
+                      size_t svsize)
+{
+       int rc;
+       const char *dash;
+
+       /* We use server_name2fsname() just for parsing */
+       rc = server_name2fsname(label, NULL, &dash);
+       if (rc != 0)
+               return rc;
+
+       if (*dash != '-')
+               return -1;
+
+       if (strlcpy(svname, dash + 1, svsize) >= svsize)
+               return -E2BIG;
+
+       return 0;
+}
+EXPORT_SYMBOL(server_name2svname);
+
+/* Get the index from the obd name.
+   rc = server type, or
+   rc < 0  on error
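+   e.g. "lustre-MDT0000" sets *idx = 0 and returns LDD_F_SV_TYPE_MDT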
+   if endptr isn't NULL it is set to end of name */
+int server_name2index(const char *svname, __u32 *idx, const char **endptr)
+{
+       unsigned long index;
+       int rc;
+       const char *dash;
+
+       /* We use server_name2fsname() just for parsing */
+       rc = server_name2fsname(svname, NULL, &dash);
+       if (rc != 0)
+               return rc;
+
+       if (*dash != '-')
+               return -EINVAL;
+
+       dash++;
+
+       if (strncmp(dash, "MDT", 3) == 0)
+               rc = LDD_F_SV_TYPE_MDT;
+       else if (strncmp(dash, "OST", 3) == 0)
+               rc = LDD_F_SV_TYPE_OST;
+       else
+               return -EINVAL;
+
+       dash += 3;
+
+       if (strcmp(dash, "all") == 0)
+               return rc | LDD_F_SV_ALL;
+
+       index = simple_strtoul(dash, (char **)endptr, 16);
+       *idx = index;
+
+       return rc;
+}
+EXPORT_SYMBOL(server_name2index);
+
+/*************** mount common between server and client ***************/
+
+/* Common umount */
+int lustre_common_put_super(struct super_block *sb)
+{
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+
+       /* Drop a ref to the MGC */
+       rc = lustre_stop_mgc(sb);
+       if (rc && (rc != -ENOENT)) {
+               if (rc != -EBUSY) {
+                       CERROR("Can't stop MGC: %d\n", rc);
+                       RETURN(rc);
+               }
+               /* BUSY just means that there's some other obd that
+                  needs the mgc.  Let him clean it up. */
+               CDEBUG(D_MOUNT, "MGC still in use\n");
+       }
+       /* Drop a ref to the mounted disk */
+       lustre_put_lsi(sb);
+       lu_types_stop();
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_common_put_super);
+
+static void lmd_print(struct lustre_mount_data *lmd)
+{
+       int i;
+
+       PRINT_CMD(D_MOUNT, "  mount data:\n");
+       if (lmd_is_client(lmd))
+               PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile);
+       PRINT_CMD(D_MOUNT, "device:  %s\n", lmd->lmd_dev);
+       PRINT_CMD(D_MOUNT, "flags:   %x\n", lmd->lmd_flags);
+
+       if (lmd->lmd_opts)
+               PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts);
+
+       if (lmd->lmd_recovery_time_soft)
+               PRINT_CMD(D_MOUNT, "recovery time soft: %d\n",
+                         lmd->lmd_recovery_time_soft);
+
+       if (lmd->lmd_recovery_time_hard)
+               PRINT_CMD(D_MOUNT, "recovery time hard: %d\n",
+                         lmd->lmd_recovery_time_hard);
+
+       for (i = 0; i < lmd->lmd_exclude_count; i++) {
+               PRINT_CMD(D_MOUNT, "exclude %d:  OST%04x\n", i,
+                         lmd->lmd_exclude[i]);
+       }
+}
+
+/* Is this server on the exclusion list */
+int lustre_check_exclusion(struct super_block *sb, char *svname)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct lustre_mount_data *lmd = lsi->lsi_lmd;
+       __u32 index;
+       int i, rc;
+       ENTRY;
+
+       rc = server_name2index(svname, &index, NULL);
+       if (rc != LDD_F_SV_TYPE_OST)
+               /* Only exclude OSTs */
+               RETURN(0);
+
+       CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
+              index, lmd->lmd_exclude_count, lmd->lmd_dev);
+
+       for (i = 0; i < lmd->lmd_exclude_count; i++) {
+               if (index == lmd->lmd_exclude[i]) {
+                       CWARN("Excluding %s (on exclusion list)\n", svname);
+                       RETURN(1);
+               }
+       }
+       RETURN(0);
+}
+
+/* mount -v  -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
+static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr)
+{
+       const char *s1 = ptr, *s2;
+       __u32 index, *exclude_list;
+       int rc = 0, devmax;
+       ENTRY;
+
+       /* The shortest an ost name can be is 8 chars: -OST0000.
+          We don't actually know the fsname at this time, so in fact
+          a user could specify any fsname. */
+       devmax = strlen(ptr) / 8 + 1;
+
+       /* temp storage until we figure out how many we have */
+       OBD_ALLOC(exclude_list, sizeof(index) * devmax);
+       if (!exclude_list)
+               RETURN(-ENOMEM);
+
+       /* we enter this fn pointing at the '=' */
+       while (*s1 && *s1 != ' ' && *s1 != ',') {
+               s1++;
+               rc = server_name2index(s1, &index, &s2);
+               if (rc < 0) {
+                       CERROR("Can't parse server name '%s'\n", s1);
+                       break;
+               }
+               if (rc == LDD_F_SV_TYPE_OST)
+                       exclude_list[lmd->lmd_exclude_count++] = index;
+               else
+                       CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
+               s1 = s2;
+               /* now we are pointing at ':' (next exclude)
+                  or ',' (end of excludes) */
+               if (lmd->lmd_exclude_count >= devmax)
+                       break;
+       }
+       if (rc >= 0) /* non-err */
+               rc = 0;
+
+       if (lmd->lmd_exclude_count) {
+               /* permanent, freed in lustre_free_lsi */
+               OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
+                         lmd->lmd_exclude_count);
+               if (lmd->lmd_exclude) {
+                       memcpy(lmd->lmd_exclude, exclude_list,
+                              sizeof(index) * lmd->lmd_exclude_count);
+               } else {
+                       rc = -ENOMEM;
+                       lmd->lmd_exclude_count = 0;
+               }
+       }
+       OBD_FREE(exclude_list, sizeof(index) * devmax);
+       RETURN(rc);
+}
+
+static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
+{
+       char   *tail;
+       int     length;
+
+       if (lmd->lmd_mgssec != NULL) {
+               OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
+               lmd->lmd_mgssec = NULL;
+       }
+
+       tail = strchr(ptr, ',');
+       if (tail == NULL)
+               length = strlen(ptr);
+       else
+               length = tail - ptr;
+
+       OBD_ALLOC(lmd->lmd_mgssec, length + 1);
+       if (lmd->lmd_mgssec == NULL)
+               return -ENOMEM;
+
+       memcpy(lmd->lmd_mgssec, ptr, length);
+       lmd->lmd_mgssec[length] = '\0';
+       return 0;
+}
+
+static int lmd_parse_string(char **handle, char *ptr)
+{
+       char   *tail;
+       int     length;
+
+       if ((handle == NULL) || (ptr == NULL))
+               return -EINVAL;
+
+       if (*handle != NULL) {
+               OBD_FREE(*handle, strlen(*handle) + 1);
+               *handle = NULL;
+       }
+
+       tail = strchr(ptr, ',');
+       if (tail == NULL)
+               length = strlen(ptr);
+       else
+               length = tail - ptr;
+
+       OBD_ALLOC(*handle, length + 1);
+       if (*handle == NULL)
+               return -ENOMEM;
+
+       memcpy(*handle, ptr, length);
+       (*handle)[length] = '\0';
+
+       return 0;
+}
+
+/* Collect multiple values for mgsnid specifiers */
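+/* The resulting lmd_mgs string ("nid1,nid2:nid3,...") is parsed again by
+   lustre_start_mgc() above once the MGC is set up. */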
+static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr)
+{
+       lnet_nid_t nid;
+       char *tail = *ptr;
+       char *mgsnid;
+       int   length;
+       int   oldlen = 0;
+
+       /* Find end of nidlist */
+       while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {}
+       length = tail - *ptr;
+       if (length == 0) {
+               LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr);
+               return -EINVAL;
+       }
+
+       if (lmd->lmd_mgs != NULL)
+               oldlen = strlen(lmd->lmd_mgs) + 1;
+
+       OBD_ALLOC(mgsnid, oldlen + length + 1);
+       if (mgsnid == NULL)
+               return -ENOMEM;
+
+       if (lmd->lmd_mgs != NULL) {
+               /* Multiple mgsnid= are taken to mean failover locations */
+               memcpy(mgsnid, lmd->lmd_mgs, oldlen);
+               mgsnid[oldlen - 1] = ':';
+               OBD_FREE(lmd->lmd_mgs, oldlen);
+       }
+       memcpy(mgsnid + oldlen, *ptr, length);
+       mgsnid[oldlen + length] = '\0';
+       lmd->lmd_mgs = mgsnid;
+       *ptr = tail;
+
+       return 0;
+}
+
+/** Parse mount line options
+ * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
+ * dev is passed as device=uml1:/lustre by mount.lustre
+ */
+static int lmd_parse(char *options, struct lustre_mount_data *lmd)
+{
+       char *s1, *s2, *devname = NULL;
+       struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(lmd);
+       if (!options) {
+               LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
+                                  "/sbin/mount.lustre is installed.\n");
+               RETURN(-EINVAL);
+       }
+
+       /* Options should be a string - try to detect old lmd data */
+       if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
+               LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
+                                  "/sbin/mount.lustre.  Please install "
+                                  "version %s\n", LUSTRE_VERSION_STRING);
+               RETURN(-EINVAL);
+       }
+       lmd->lmd_magic = LMD_MAGIC;
+
+       OBD_ALLOC(lmd->lmd_params, 4096);
+       if (lmd->lmd_params == NULL)
+               RETURN(-ENOMEM);
+       lmd->lmd_params[0] = '\0';
+
+       /* Set default flags here */
+
+       s1 = options;
+       while (*s1) {
+               int clear = 0;
+               int time_min = OBD_RECOVERY_TIME_MIN;
+
+               /* Skip whitespace and extra commas */
+               while (*s1 == ' ' || *s1 == ',')
+                       s1++;
+
+               /* Client options are parsed in ll_options: eg. flock,
+                  user_xattr, acl */
+
+               /* Parse non-ldiskfs options here. Rather than modifying
+                  ldiskfs, we just zero these out here */
+               if (strncmp(s1, "abort_recov", 11) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+                       clear++;
+               } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+                       lmd->lmd_recovery_time_soft = max_t(int,
+                               simple_strtoul(s1 + 19, NULL, 10), time_min);
+                       clear++;
+               } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+                       lmd->lmd_recovery_time_hard = max_t(int,
+                               simple_strtoul(s1 + 19, NULL, 10), time_min);
+                       clear++;
+               } else if (strncmp(s1, "noir", 4) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
+                       clear++;
+               } else if (strncmp(s1, "nosvc", 5) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NOSVC;
+                       clear++;
+               } else if (strncmp(s1, "nomgs", 5) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NOMGS;
+                       clear++;
+               } else if (strncmp(s1, "noscrub", 7) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NOSCRUB;
+                       clear++;
+               } else if (strncmp(s1, PARAM_MGSNODE,
+                                  sizeof(PARAM_MGSNODE) - 1) == 0) {
+                       s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
+                       /* Assume the next mount opt is the first
+                          invalid nid we get to. */
+                       rc = lmd_parse_mgs(lmd, &s2);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               } else if (strncmp(s1, "writeconf", 9) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_WRITECONF;
+                       clear++;
+               } else if (strncmp(s1, "update", 6) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_UPDATE;
+                       clear++;
+               } else if (strncmp(s1, "virgin", 6) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_VIRGIN;
+                       clear++;
+               } else if (strncmp(s1, "noprimnode", 10) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE;
+                       clear++;
+               } else if (strncmp(s1, "mgssec=", 7) == 0) {
+                       rc = lmd_parse_mgssec(lmd, s1 + 7);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               /* ost exclusion list */
+               } else if (strncmp(s1, "exclude=", 8) == 0) {
+                       rc = lmd_make_exclusion(lmd, s1 + 7);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               } else if (strncmp(s1, "mgs", 3) == 0) {
+                       /* We are an MGS */
+                       lmd->lmd_flags |= LMD_FLG_MGS;
+                       clear++;
+               } else if (strncmp(s1, "svname=", 7) == 0) {
+                       rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               } else if (strncmp(s1, "param=", 6) == 0) {
+                       int length;
+                       char *tail = strchr(s1 + 6, ',');
+                       if (tail == NULL)
+                               length = strlen(s1);
+                       else
+                               length = tail - s1;
+                       length -= 6;
+                       strncat(lmd->lmd_params, s1 + 6, length);
+                       strcat(lmd->lmd_params, " ");
+                       clear++;
+               } else if (strncmp(s1, "osd=", 4) == 0) {
+                       rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               }
+               /* Linux 2.4 doesn't pass the device, so we stuck it at the
+                  end of the options. */
+               else if (strncmp(s1, "device=", 7) == 0) {
+                       devname = s1 + 7;
+                       /* terminate options right before device.  device
+                          must be the last one. */
+                       *s1 = '\0';
+                       break;
+               }
+
+               /* Find next opt */
+               s2 = strchr(s1, ',');
+               if (s2 == NULL) {
+                       if (clear)
+                               *s1 = '\0';
+                       break;
+               }
+               s2++;
+               if (clear)
+                       memmove(s1, s2, strlen(s2) + 1);
+               else
+                       s1 = s2;
+       }
+
+       if (!devname) {
+               LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
+                                  "(need mount option 'device=...')\n");
+               goto invalid;
+       }
+
+       s1 = strstr(devname, ":/");
+       if (s1) {
+               ++s1;
+               lmd->lmd_flags |= LMD_FLG_CLIENT;
+               /* Remove leading /s from fsname */
+               while (*++s1 == '/') ;
+               /* Freed in lustre_free_lsi */
+               OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
+               if (!lmd->lmd_profile)
+                       RETURN(-ENOMEM);
+               sprintf(lmd->lmd_profile, "%s-client", s1);
+       }
+
+       /* Freed in lustre_free_lsi */
+       OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
+       if (!lmd->lmd_dev)
+               RETURN(-ENOMEM);
+       strcpy(lmd->lmd_dev, devname);
+
+       /* Save mount options */
+       s1 = options + strlen(options) - 1;
+       while (s1 >= options && (*s1 == ',' || *s1 == ' '))
+               *s1-- = 0;
+       if (*options != 0) {
+               /* Freed in lustre_free_lsi */
+               OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
+               if (!lmd->lmd_opts)
+                       RETURN(-ENOMEM);
+               strcpy(lmd->lmd_opts, options);
+       }
+
+       lmd_print(lmd);
+       lmd->lmd_magic = LMD_MAGIC;
+
+       RETURN(rc);
+
+invalid:
+       CERROR("Bad mount options %s\n", options);
+       RETURN(-EINVAL);
+}
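
A compact userspace sketch of the in-place option consumption done above: options recognized by this layer are stripped with memmove() so that only the remainder is saved later as lmd_opts, while client options such as flock or user_xattr are left for ll_options. The option list below is illustrative only:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char opts[] = "abort_recov,flock,writeconf,user_xattr";
        char *s1 = opts;

        while (*s1) {
                int clear;
                char *s2;

                while (*s1 == ' ' || *s1 == ',')
                        s1++;
                /* pretend only abort_recov and writeconf are "ours" */
                clear = (strncmp(s1, "abort_recov", 11) == 0 ||
                         strncmp(s1, "writeconf", 9) == 0);

                s2 = strchr(s1, ',');
                if (s2 == NULL) {
                        if (clear)
                                *s1 = '\0';
                        break;
                }
                s2++;
                if (clear)
                        memmove(s1, s2, strlen(s2) + 1);
                else
                        s1 = s2;
        }
        printf("left for the client: %s\n", opts);  /* flock,user_xattr */
        return 0;
}
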
+
+struct lustre_mount_data2 {
+       void *lmd2_data;
+       struct vfsmount *lmd2_mnt;
+};
+
+/** This is the entry point for the mount call into Lustre.
+ * This is called when a server or client is mounted,
+ * and this is where we start setting things up.
+ * @param data Mount options (e.g. -o flock,abort_recov)
+ */
+int lustre_fill_super(struct super_block *sb, void *data, int silent)
+{
+       struct lustre_mount_data *lmd;
+       struct lustre_mount_data2 *lmd2 = data;
+       struct lustre_sb_info *lsi;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+       lsi = lustre_init_lsi(sb);
+       if (!lsi)
+               RETURN(-ENOMEM);
+       lmd = lsi->lsi_lmd;
+
+       /*
+        * Disable lockdep during mount, because mount locking patterns are
+        * `special'.
+        */
+       lockdep_off();
+
+       /*
+        * LU-639: the obd cleanup of last mount may not finish yet, wait here.
+        */
+       obd_zombie_barrier();
+
+       /* Figure out the lmd from the mount options */
+       if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
+               lustre_put_lsi(sb);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (lmd_is_client(lmd)) {
+               CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+               if (!client_fill_super) {
+                       LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
+                                          "client mount! Is the 'lustre' "
+                                          "module loaded?\n");
+                       lustre_put_lsi(sb);
+                       rc = -ENODEV;
+               } else {
+                       rc = lustre_start_mgc(sb);
+                       if (rc) {
+                               lustre_put_lsi(sb);
+                               GOTO(out, rc);
+                       }
+                       /* Connect and start */
+                       /* (should always be ll_fill_super) */
+                       rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
+                       /* c_f_s will call lustre_common_put_super on failure */
+               }
+       } else {
+               CERROR("This is a client-side-only module; it "
+                      "cannot handle server mounts.\n");
+               rc = -EINVAL;
+       }
+
+       /* If an error happens in the fill_super() call, @lsi will be killed
+        * there; this is why we do not put it here. */
+       GOTO(out, rc);
+out:
+       if (rc) {
+               CERROR("Unable to mount %s (%d)\n",
+                      s2lsi(sb) ? lmd->lmd_dev : "", rc);
+       } else {
+               CDEBUG(D_SUPER, "Mount %s complete\n",
+                      lmd->lmd_dev);
+       }
+       lockdep_on();
+       return rc;
+}
+
+
+/* We can't call ll_fill_super by name because it lives in a module that
+   must be loaded after this one. */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+                                                 struct vfsmount *mnt))
+{
+       client_fill_super = cfs;
+}
+EXPORT_SYMBOL(lustre_register_client_fill_super);
+
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
+{
+       kill_super_cb = cfs;
+}
+EXPORT_SYMBOL(lustre_register_kill_super_cb);
+
+/***************** FS registration ******************/
+struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
+                               const char *devname, void *data)
+{
+       struct lustre_mount_data2 lmd2 = { data, NULL };
+
+       return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super);
+}
+
+void lustre_kill_super(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+
+       if (kill_super_cb && lsi && !IS_SERVER(lsi))
+               (*kill_super_cb)(sb);
+
+       kill_anon_super(sb);
+}
+
+/** Register the "lustre" fs type
+ */
+struct file_system_type lustre_fs_type = {
+       .owner  = THIS_MODULE,
+       .name    = "lustre",
+       .mount  = lustre_mount,
+       .kill_sb      = lustre_kill_super,
+       .fs_flags     = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
+                       FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE,
+};
+
+int lustre_register_fs(void)
+{
+       return register_filesystem(&lustre_fs_type);
+}
+
+int lustre_unregister_fs(void)
+{
+       return unregister_filesystem(&lustre_fs_type);
+}
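
For orientation, a hedged sketch of how a client module would be expected to hook into the registration helpers exported above; ll_fill_super is named in the "(should always be ll_fill_super)" comment, while ll_kill_super and the init function name are assumptions, not something this patch defines.

/* Sketch only: the client module, loaded after obdclass, installs its
 * callbacks before any lustre mount can reach lustre_fill_super(). */
extern int ll_fill_super(struct super_block *sb, struct vfsmount *mnt);
extern void ll_kill_super(struct super_block *sb);      /* assumed name */

static int __init example_client_init(void)
{
        lustre_register_client_fill_super(ll_fill_super);
        lustre_register_kill_super_cb(ll_kill_super);
        return 0;
}
module_init(example_client_init);

/* A mount such as "mount -t lustre mgsnode@tcp:/fsname /mnt/lustre" then
 * flows through lustre_mount() -> mount_nodev() -> lustre_fill_super()
 * -> lmd_parse() -> lustre_start_mgc() -> (*client_fill_super)(sb, mnt). */
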
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustre/lustre/obdclass/obd_mount_server.c
new file mode 100644 (file)
index 0000000..a3a4409
--- /dev/null
@@ -0,0 +1,1783 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount_server.c
+ *
+ * Server mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_MOUNT (D_SUPER | D_CONFIG /* | D_WARNING */)
+#define PRINT_CMD CDEBUG
+#define PRINT_MASK (D_SUPER | D_CONFIG)
+
+#include <obd.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <obd_class.h>
+#include <lustre/lustre_user.h>
+#include <linux/version.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+
+/*********** mount lookup *********/
+
+DEFINE_MUTEX(lustre_mount_info_lock);
+static LIST_HEAD(server_mount_info_list);
+
+static struct lustre_mount_info *server_find_mount(const char *name)
+{
+       struct list_head *tmp;
+       struct lustre_mount_info *lmi;
+       ENTRY;
+
+       list_for_each(tmp, &server_mount_info_list) {
+               lmi = list_entry(tmp, struct lustre_mount_info,
+                                    lmi_list_chain);
+               if (strcmp(name, lmi->lmi_name) == 0)
+                       RETURN(lmi);
+       }
+       RETURN(NULL);
+}
+
+/* We must register an obd for a mount before we call the setup routine.
+ * *_setup will call lustre_get_mount to get the mnt struct by obd_name,
+ * since we can't pass the pointer to setup. */
+static int server_register_mount(const char *name, struct super_block *sb,
+                                struct vfsmount *mnt)
+{
+       struct lustre_mount_info *lmi;
+       char *name_cp;
+       ENTRY;
+
+       LASSERT(sb);
+
+       OBD_ALLOC(lmi, sizeof(*lmi));
+       if (!lmi)
+               RETURN(-ENOMEM);
+       OBD_ALLOC(name_cp, strlen(name) + 1);
+       if (!name_cp) {
+               OBD_FREE(lmi, sizeof(*lmi));
+               RETURN(-ENOMEM);
+       }
+       strcpy(name_cp, name);
+
+       mutex_lock(&lustre_mount_info_lock);
+
+       if (server_find_mount(name)) {
+               mutex_unlock(&lustre_mount_info_lock);
+               OBD_FREE(lmi, sizeof(*lmi));
+               OBD_FREE(name_cp, strlen(name) + 1);
+               CERROR("Already registered %s\n", name);
+               RETURN(-EEXIST);
+       }
+       lmi->lmi_name = name_cp;
+       lmi->lmi_sb = sb;
+       lmi->lmi_mnt = mnt;
+       list_add(&lmi->lmi_list_chain, &server_mount_info_list);
+
+       mutex_unlock(&lustre_mount_info_lock);
+
+       CDEBUG(D_MOUNT, "reg_mnt %p from %s\n", lmi->lmi_mnt, name);
+
+       RETURN(0);
+}
+
+/* when an obd no longer needs a mount */
+static int server_deregister_mount(const char *name)
+{
+       struct lustre_mount_info *lmi;
+       ENTRY;
+
+       mutex_lock(&lustre_mount_info_lock);
+       lmi = server_find_mount(name);
+       if (!lmi) {
+               mutex_unlock(&lustre_mount_info_lock);
+               CERROR("%s not registered\n", name);
+               RETURN(-ENOENT);
+       }
+
+       CDEBUG(D_MOUNT, "dereg_mnt %p from %s\n", lmi->lmi_mnt, name);
+
+       OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
+       list_del(&lmi->lmi_list_chain);
+       OBD_FREE(lmi, sizeof(*lmi));
+       mutex_unlock(&lustre_mount_info_lock);
+
+       RETURN(0);
+}
+
+/* obds look up a registered mount using their obdname. This is just
+   for initial obd setup to find the mount struct.  It should not be
+   called every time you want to mntget. */
+struct lustre_mount_info *server_get_mount(const char *name)
+{
+       struct lustre_mount_info *lmi;
+       struct lustre_sb_info *lsi;
+       ENTRY;
+
+       mutex_lock(&lustre_mount_info_lock);
+       lmi = server_find_mount(name);
+       mutex_unlock(&lustre_mount_info_lock);
+       if (!lmi) {
+               CERROR("Can't find mount for %s\n", name);
+               RETURN(NULL);
+       }
+       lsi = s2lsi(lmi->lmi_sb);
+
+       atomic_inc(&lsi->lsi_mounts);
+
+       CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d\n", lmi->lmi_mnt,
+              name, atomic_read(&lsi->lsi_mounts));
+
+       RETURN(lmi);
+}
+EXPORT_SYMBOL(server_get_mount);
+
+/*
+ * Used by mdt to get mount_info from obdname.
+ * There is no blocking when using the mount_info.
+ * Do not use server_get_mount for this purpose.
+ */
+struct lustre_mount_info *server_get_mount_2(const char *name)
+{
+       struct lustre_mount_info *lmi;
+       ENTRY;
+
+       mutex_lock(&lustre_mount_info_lock);
+       lmi = server_find_mount(name);
+       mutex_unlock(&lustre_mount_info_lock);
+       if (!lmi)
+               CERROR("Can't find mount for %s\n", name);
+
+       RETURN(lmi);
+}
+EXPORT_SYMBOL(server_get_mount_2);
+
+/* to be called from obd_cleanup methods */
+int server_put_mount(const char *name, struct vfsmount *mnt)
+{
+       struct lustre_mount_info *lmi;
+       struct lustre_sb_info *lsi;
+       ENTRY;
+
+       mutex_lock(&lustre_mount_info_lock);
+       lmi = server_find_mount(name);
+       mutex_unlock(&lustre_mount_info_lock);
+       if (!lmi) {
+               CERROR("Can't find mount for %s\n", name);
+               RETURN(-ENOENT);
+       }
+       lsi = s2lsi(lmi->lmi_sb);
+
+       CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d\n",
+              lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts));
+
+       if (lustre_put_lsi(lmi->lmi_sb))
+               CDEBUG(D_MOUNT, "Last put of mnt %p from %s\n",
+                      lmi->lmi_mnt, name);
+
+       /* this obd should never need the mount again */
+       server_deregister_mount(name);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(server_put_mount);
+
+/* Corresponding to server_get_mount_2 */
+int server_put_mount_2(const char *name, struct vfsmount *mnt)
+{
+       ENTRY;
+       RETURN(0);
+}
+EXPORT_SYMBOL(server_put_mount_2);
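
To make the registry lifecycle explicit, a hedged usage sketch (the target name and function names are illustrative): server_start_targets() registers the mount under the target name, the target's setup method takes an lsi reference with server_get_mount(), and its cleanup method drops it with server_put_mount(), which also deregisters the entry:

/* Sketch only; error handling trimmed and the target name is made up. */
static int example_target_setup(void)
{
        struct lustre_mount_info *lmi;

        lmi = server_get_mount("lustre-OST0000");   /* bumps lsi_mounts */
        if (lmi == NULL)
                return -ENOENT;
        /* ... use lmi->lmi_sb and lmi->lmi_mnt to set up the target ... */
        return 0;
}

static int example_target_cleanup(void)
{
        /* drops the lsi reference and calls server_deregister_mount() */
        return server_put_mount("lustre-OST0000", NULL);
}
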
+
+/* Set up a MGS to serve startup logs */
+static int server_start_mgs(struct super_block *sb)
+{
+       struct lustre_sb_info    *lsi = s2lsi(sb);
+       struct vfsmount   *mnt = lsi->lsi_srv_mnt;
+       struct lustre_mount_info *lmi;
+       int    rc = 0;
+       ENTRY;
+
+       /* It is impossible to have more than 1 MGS per node, since
+          MGC wouldn't know which to connect to */
+       lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
+       if (lmi) {
+               lsi = s2lsi(lmi->lmi_sb);
+               LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started"
+                                  " from server %s\n", lsi->lsi_svname);
+               RETURN(-EALREADY);
+       }
+
+       CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
+
+       rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
+
+       if (!rc) {
+               rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME,
+                                        LUSTRE_MGS_OBDNAME, 0, 0,
+                                        lsi->lsi_osd_obdname, 0);
+               /* Do NOT call server_deregister_mount() here. Doing so makes
+                * it impossible to clean up cleanly and to free the lsi and
+                * other state when the MGS calls server_put_mount() in the
+                * error handling case. -umka */
+       }
+
+       if (rc)
+               LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). "
+                                  "Is the 'mgs' module loaded?\n",
+                                  LUSTRE_MGS_OBDNAME, rc);
+       RETURN(rc);
+}
+
+static int server_stop_mgs(struct super_block *sb)
+{
+       struct obd_device *obd;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
+
+       /* There better be only one MGS */
+       obd = class_name2obd(LUSTRE_MGS_OBDNAME);
+       if (!obd) {
+               CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
+               RETURN(-EALREADY);
+       }
+
+       /* The MGS should always stop when we say so */
+       obd->obd_force = 1;
+       rc = class_manual_cleanup(obd);
+       RETURN(rc);
+}
+
+/* Since there's only one mgc per node, we have to change its fs to get
+   access to the right disk. */
+static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
+
+       /* cl_mgc_sem in mgc ensures we sleep if the mgc_fs is busy */
+       rc = obd_set_info_async(NULL, mgc->obd_self_export,
+                               sizeof(KEY_SET_FS), KEY_SET_FS,
+                               sizeof(*sb), sb, NULL);
+       if (rc != 0)
+               CERROR("can't set_fs %d\n", rc);
+
+       RETURN(rc);
+}
+
+static int server_mgc_clear_fs(struct obd_device *mgc)
+{
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT, "Unassign mgc disk\n");
+
+       rc = obd_set_info_async(NULL, mgc->obd_self_export,
+                               sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS,
+                               0, NULL, NULL);
+       RETURN(rc);
+}
+
+static int is_mdc_device(const char *devname)
+{
+       char *ptr;
+
+       ptr = strrchr(devname, '-');
+       if (ptr != NULL && strcmp(ptr, "-mdc") == 0)
+               return 1;
+
+       return 0;
+}
+
+static inline int tgt_is_mdt0(const char *tgtname)
+{
+       __u32 idx;
+       int   type;
+
+       type = server_name2index(tgtname, &idx, NULL);
+       if (type != LDD_F_SV_TYPE_MDT)
+               return 0;
+
+       return idx == 0;
+}
+
+static inline int is_mdc_for_mdt0(const char *devname)
+{
+       char   *ptr;
+
+       if (!is_mdc_device(devname))
+               return 0;
+
+       ptr = strrchr(devname, '-');
+       if (ptr == NULL)
+               return 0;
+
+       *ptr = 0;
+       if (tgt_is_mdt0(devname)) {
+               *ptr = '-';
+               return 1;
+       }
+       *ptr = '-';
+       return 0;
+}
+
+/**
+ * Convert an OST/MDT name (fsname-OSTxxxx) to an lwp name
+ * (fsname-MDT0000-lwp-OSTxxxx).
+ **/
+int tgt_name2lwpname(const char *svname, char *lwpname)
+{
+       char            *fsname;
+       const char      *tgt;
+       int             rc;
+       ENTRY;
+
+       OBD_ALLOC(fsname, MTI_NAME_MAXLEN);
+       if (fsname == NULL)
+               RETURN(-ENOMEM);
+
+       rc = server_name2fsname(svname, fsname, &tgt);
+       if (rc != 0) {
+               CERROR("%s: failed to get fsname from svname. %d\n",
+                      svname, rc);
+               GOTO(cleanup, rc);
+       }
+
+       if (*tgt != '-' && *tgt != ':') {
+               CERROR("%s: invalid svname!\n", svname);
+               GOTO(cleanup, rc = -EINVAL);
+       }
+
+       tgt++;
+       if (strncmp(tgt, "OST", 3) != 0 && strncmp(tgt, "MDT", 3) != 0) {
+               CERROR("%s is not an OST or MDT target!\n", svname);
+               GOTO(cleanup, rc = -EINVAL);
+       }
+       sprintf(lwpname, "%s-MDT0000-%s-%s", fsname, LUSTRE_LWP_NAME, tgt);
+cleanup:
+       if (fsname != NULL)
+               OBD_FREE(fsname, MTI_NAME_MAXLEN);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(tgt_name2lwpname);
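
A standalone userspace sketch of the mapping tgt_name2lwpname() performs, e.g. lustre-OST0001 becomes lustre-MDT0000-lwp-OST0001; the parsing below is simplified to the plain fsname-TGTxxxx form and does not handle the ':' separator case:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *svname = "lustre-OST0001";
        char fsname[64], lwpname[128];
        const char *tgt = strrchr(svname, '-');

        if (tgt == NULL || (strncmp(tgt + 1, "OST", 3) != 0 &&
                            strncmp(tgt + 1, "MDT", 3) != 0)) {
                fprintf(stderr, "%s is not an OST or MDT target\n", svname);
                return 1;
        }
        snprintf(fsname, sizeof(fsname), "%.*s", (int)(tgt - svname), svname);
        snprintf(lwpname, sizeof(lwpname), "%s-MDT0000-lwp-%s",
                 fsname, tgt + 1);
        printf("%s -> %s\n", svname, lwpname);
        return 0;
}
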
+
+static LIST_HEAD(lwp_register_list);
+DEFINE_MUTEX(lwp_register_list_lock);
+
+int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp,
+                            register_lwp_cb cb_func, void *cb_data)
+{
+       struct obd_device        *lwp;
+       struct lwp_register_item *lri;
+       ENTRY;
+
+       LASSERTF(strlen(lwpname) < MTI_NAME_MAXLEN, "lwpname is too long %s\n",
+                lwpname);
+       LASSERT(exp != NULL && *exp == NULL);
+
+       OBD_ALLOC_PTR(lri);
+       if (lri == NULL)
+               RETURN(-ENOMEM);
+
+       mutex_lock(&lwp_register_list_lock);
+
+       lwp = class_name2obd(lwpname);
+       if (lwp != NULL && lwp->obd_set_up == 1) {
+               struct obd_uuid *uuid;
+
+               OBD_ALLOC_PTR(uuid);
+               if (uuid == NULL) {
+                       mutex_unlock(&lwp_register_list_lock);
+                       OBD_FREE_PTR(lri);
+                       RETURN(-ENOMEM);
+               }
+               memcpy(uuid->uuid, lwpname, strlen(lwpname));
+               *exp = cfs_hash_lookup(lwp->obd_uuid_hash, uuid);
+               OBD_FREE_PTR(uuid);
+       }
+
+       memcpy(lri->lri_name, lwpname, strlen(lwpname));
+       lri->lri_exp = exp;
+       lri->lri_cb_func = cb_func;
+       lri->lri_cb_data = cb_data;
+       INIT_LIST_HEAD(&lri->lri_list);
+       list_add(&lri->lri_list, &lwp_register_list);
+
+       if (*exp != NULL && cb_func != NULL)
+               cb_func(cb_data);
+
+       mutex_unlock(&lwp_register_list_lock);
+       RETURN(0);
+}
+EXPORT_SYMBOL(lustre_register_lwp_item);
+
+void lustre_deregister_lwp_item(struct obd_export **exp)
+{
+       struct lwp_register_item *lri, *tmp;
+
+       mutex_lock(&lwp_register_list_lock);
+       list_for_each_entry_safe(lri, tmp, &lwp_register_list, lri_list) {
+               if (exp == lri->lri_exp) {
+                       if (*exp)
+                               class_export_put(*exp);
+                       list_del(&lri->lri_list);
+                       OBD_FREE_PTR(lri);
+                       break;
+               }
+       }
+       mutex_unlock(&lwp_register_list_lock);
+}
+EXPORT_SYMBOL(lustre_deregister_lwp_item);
+
+static void lustre_notify_lwp_list(struct obd_export *exp)
+{
+       struct lwp_register_item *lri, *tmp;
+       LASSERT(exp != NULL);
+
+       mutex_lock(&lwp_register_list_lock);
+       list_for_each_entry_safe(lri, tmp, &lwp_register_list, lri_list) {
+               if (strcmp(exp->exp_obd->obd_name, lri->lri_name))
+                       continue;
+               if (*lri->lri_exp != NULL)
+                       continue;
+               *lri->lri_exp = class_export_get(exp);
+               if (lri->lri_cb_func != NULL)
+                       lri->lri_cb_func(lri->lri_cb_data);
+       }
+       mutex_unlock(&lwp_register_list_lock);
+}
+
+static int lustre_lwp_connect(struct obd_device *lwp)
+{
+       struct lu_env            env;
+       struct lu_context        session_ctx;
+       struct obd_export       *exp;
+       struct obd_uuid         *uuid = NULL;
+       struct obd_connect_data *data = NULL;
+       int                      rc;
+       ENTRY;
+
+       /* log has been fully processed, let clients connect */
+       rc = lu_env_init(&env, lwp->obd_lu_dev->ld_type->ldt_ctx_tags);
+       if (rc != 0)
+               RETURN(rc);
+
+       lu_context_init(&session_ctx, LCT_SESSION);
+       session_ctx.lc_thread = NULL;
+       lu_context_enter(&session_ctx);
+       env.le_ses = &session_ctx;
+
+       OBD_ALLOC_PTR(data);
+       if (data == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX;
+       data->ocd_version = LUSTRE_VERSION_CODE;
+       data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID |
+               OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE |
+               OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE |
+               OBD_CONNECT_LIGHTWEIGHT;
+       OBD_ALLOC_PTR(uuid);
+       if (uuid == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       if (strlen(lwp->obd_name) >= sizeof(uuid->uuid)) {
+               CERROR("%s: Too long lwp name %s, max_size is %d\n",
+                      lwp->obd_name, lwp->obd_name, (int)sizeof(uuid->uuid));
+               GOTO(out, rc = -EINVAL);
+       }
+
+       /* Use lwp name as the uuid, so we find the export by lwp name later */
+       memcpy(uuid->uuid, lwp->obd_name, strlen(lwp->obd_name));
+       rc = obd_connect(&env, &exp, lwp, uuid, data, NULL);
+       if (rc != 0)
+               CERROR("%s: connect failed: rc = %d\n", lwp->obd_name, rc);
+       else
+               lustre_notify_lwp_list(exp);
+
+out:
+       if (data != NULL)
+               OBD_FREE_PTR(data);
+       if (uuid != NULL)
+               OBD_FREE_PTR(uuid);
+
+       lu_env_fini(&env);
+       lu_context_exit(&session_ctx);
+       lu_context_fini(&session_ctx);
+
+       RETURN(rc);
+}
+
+/**
+ * lwp is used by slaves (Non-MDT0 targets) to manage the connection
+ * to MDT0.
+ **/
+static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi)
+{
+       struct obd_connect_data *data = NULL;
+       struct obd_device       *obd;
+       char                    *lwpname = NULL;
+       char                    *lwpuuid = NULL;
+       int                      rc;
+       ENTRY;
+
+       rc = class_add_uuid(lustre_cfg_string(lcfg, 1),
+                           lcfg->lcfg_nid);
+       if (rc) {
+               CERROR("%s: Can't add uuid: rc = %d\n", lsi->lsi_svname, rc);
+               GOTO(out, rc);
+       }
+
+       OBD_ALLOC(lwpname, MTI_NAME_MAXLEN);
+       if (lwpname == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = tgt_name2lwpname(lsi->lsi_svname, lwpname);
+       if (rc != 0) {
+               CERROR("%s: failed to generate lwp name. %d\n",
+                      lsi->lsi_svname, rc);
+               GOTO(out, rc);
+       }
+
+       OBD_ALLOC(lwpuuid, MTI_NAME_MAXLEN);
+       if (lwpuuid == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       sprintf(lwpuuid, "%s_UUID", lwpname);
+       rc = lustre_start_simple(lwpname, LUSTRE_LWP_NAME,
+                                lwpuuid, lustre_cfg_string(lcfg, 1),
+                                0, 0, 0);
+       if (rc) {
+               CERROR("%s: setup failed: rc = %d\n", lwpname, rc);
+               GOTO(out, rc);
+       }
+
+       obd = class_name2obd(lwpname);
+       LASSERT(obd != NULL);
+
+       rc = lustre_lwp_connect(obd);
+       if (rc != 0)
+               CERROR("%s: connect failed: rc = %d\n", lwpname, rc);
+out:
+       if (data != NULL)
+               OBD_FREE_PTR(data);
+       if (lwpname != NULL)
+               OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+       if (lwpuuid != NULL)
+               OBD_FREE(lwpuuid, MTI_NAME_MAXLEN);
+
+       RETURN(rc);
+}
+
+/* the caller is responsible for memory free */
+static struct obd_device *lustre_find_lwp(struct lustre_sb_info *lsi,
+                                         char **lwpname, char **logname)
+{
+       struct obd_device       *lwp;
+       int                      rc = 0;
+       ENTRY;
+
+       LASSERT(lwpname != NULL);
+       LASSERT(IS_OST(lsi) || IS_MDT(lsi));
+
+       OBD_ALLOC(*lwpname, MTI_NAME_MAXLEN);
+       if (*lwpname == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       if (logname != NULL) {
+               OBD_ALLOC(*logname, MTI_NAME_MAXLEN);
+               if (*logname == NULL)
+                       GOTO(out, rc = -ENOMEM);
+               rc = server_name2fsname(lsi->lsi_svname, *lwpname, NULL);
+               if (rc != 0) {
+                       CERROR("%s: failed to get fsname from svname. %d\n",
+                              lsi->lsi_svname, rc);
+                       GOTO(out, rc = -EINVAL);
+               }
+               sprintf(*logname, "%s-client", *lwpname);
+       }
+
+       rc = tgt_name2lwpname(lsi->lsi_svname, *lwpname);
+       if (rc != 0) {
+               CERROR("%s: failed to generate lwp name. %d\n",
+                      lsi->lsi_svname, rc);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       lwp = class_name2obd(*lwpname);
+
+out:
+       if (rc != 0) {
+               if (*lwpname != NULL) {
+                       OBD_FREE(*lwpname, MTI_NAME_MAXLEN);
+                       *lwpname = NULL;
+               }
+               if (logname != NULL && *logname != NULL) {
+                       OBD_FREE(*logname, MTI_NAME_MAXLEN);
+                       *logname = NULL;
+               }
+               lwp = ERR_PTR(rc);
+       }
+
+       RETURN(lwp != NULL ? lwp : ERR_PTR(-ENOENT));
+}
+
+static int lustre_lwp_add_conn(struct lustre_cfg *cfg,
+                              struct lustre_sb_info *lsi)
+{
+       struct lustre_cfg_bufs *bufs = NULL;
+       struct lustre_cfg      *lcfg = NULL;
+       char                   *lwpname = NULL;
+       struct obd_device      *lwp;
+       int                     rc;
+       ENTRY;
+
+       lwp = lustre_find_lwp(lsi, &lwpname, NULL);
+       if (IS_ERR(lwp)) {
+               CERROR("%s: can't find lwp device.\n", lsi->lsi_svname);
+               GOTO(out, rc = PTR_ERR(lwp));
+       }
+       LASSERT(lwpname != NULL);
+
+       OBD_ALLOC_PTR(bufs);
+       if (bufs == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       lustre_cfg_bufs_reset(bufs, lwpname);
+       lustre_cfg_bufs_set_string(bufs, 1,
+                                  lustre_cfg_string(cfg, 1));
+
+       lcfg = lustre_cfg_new(LCFG_ADD_CONN, bufs);
+
+       rc = class_add_conn(lwp, lcfg);
+       if (rc)
+               CERROR("%s: can't add conn: rc = %d\n", lwpname, rc);
+
+out:
+       if (bufs != NULL)
+               OBD_FREE_PTR(bufs);
+       if (lcfg != NULL)
+               lustre_cfg_free(lcfg);
+       if (lwpname != NULL)
+               OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+       RETURN(rc);
+}
+
+/**
+ * Retrieve MDT nids from the client log, then start the lwp device.
+ * There are only two scenarios in which the records include the MDT nid:
+ * 1.
+ * marker   5 (flags=0x01, v2.1.54.0) lustre-MDT0000  'add mdc' xxx-
+ * add_uuid  nid=192.168.122.162@tcp(0x20000c0a87aa2)  0:  1:192.168.122.162@tcp
+ * attach    0:lustre-MDT0000-mdc  1:mdc  2:lustre-clilmv_UUID
+ * setup     0:lustre-MDT0000-mdc  1:lustre-MDT0000_UUID  2:192.168.122.162@tcp
+ * add_uuid  nid=192.168.172.1@tcp(0x20000c0a8ac01)  0:  1:192.168.172.1@tcp
+ * add_conn  0:lustre-MDT0000-mdc  1:192.168.172.1@tcp
+ * modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDT0000_UUID xxxx
+ * marker   5 (flags=0x02, v2.1.54.0) lustre-MDT0000  'add mdc' xxxx-
+ * 2.
+ * marker   7 (flags=0x01, v2.1.54.0) lustre-MDT0000  'add failnid' xxxx-
+ * add_uuid  nid=192.168.122.2@tcp(0x20000c0a87a02)  0:  1:192.168.122.2@tcp
+ * add_conn  0:lustre-MDT0000-mdc  1:192.168.122.2@tcp
+ * marker   7 (flags=0x02, v2.1.54.0) lustre-MDT0000  'add failnid' xxxx-
+ **/
+static int client_lwp_config_process(const struct lu_env *env,
+                                    struct llog_handle *handle,
+                                    struct llog_rec_hdr *rec, void *data)
+{
+       struct config_llog_instance *clli = data;
+       int                          cfg_len = rec->lrh_len;
+       char                        *cfg_buf = (char *) (rec + 1);
+       struct lustre_cfg           *lcfg = NULL;
+       struct lustre_sb_info       *lsi;
+       int                          rc = 0, swab = 0;
+       ENTRY;
+
+       if (rec->lrh_type != OBD_CFG_REC) {
+               CERROR("Unknown llog record type %#x encountered\n",
+                      rec->lrh_type);
+               RETURN(-EINVAL);
+       }
+
+       LASSERT(clli->cfg_sb != NULL);
+       lsi = s2lsi(clli->cfg_sb);
+
+       lcfg = (struct lustre_cfg *)cfg_buf;
+       if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+               lustre_swab_lustre_cfg(lcfg);
+               swab = 1;
+       }
+
+       rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+       if (rc)
+               GOTO(out, rc);
+
+       switch (lcfg->lcfg_command) {
+       case LCFG_MARKER: {
+               struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+               lustre_swab_cfg_marker(marker, swab,
+                                      LUSTRE_CFG_BUFLEN(lcfg, 1));
+               if (marker->cm_flags & CM_SKIP ||
+                   marker->cm_flags & CM_EXCLUDE)
+                       GOTO(out, rc = 0);
+
+               if (!tgt_is_mdt0(marker->cm_tgtname))
+                       GOTO(out, rc = 0);
+
+               if (!strncmp(marker->cm_comment, "add mdc", 7) ||
+                   !strncmp(marker->cm_comment, "add failnid", 11)) {
+                       if (marker->cm_flags & CM_START) {
+                               clli->cfg_flags = CFG_F_MARKER;
+                               /* This hack differentiates whether the
+                                * ADD_UUID comes from an "add mdc" record
+                                * or from an "add failnid" record. */
+                               if (!strncmp(marker->cm_comment,
+                                            "add failnid", 11))
+                                       clli->cfg_flags |= CFG_F_SKIP;
+                       } else if (marker->cm_flags & CM_END) {
+                               clli->cfg_flags = 0;
+                       }
+               }
+               break;
+       }
+       case LCFG_ADD_UUID: {
+               if (clli->cfg_flags == CFG_F_MARKER) {
+                       rc = lustre_lwp_setup(lcfg, lsi);
+                       /* XXX: process only the first nid as
+                        * we don't need another instance of lwp */
+                       clli->cfg_flags |= CFG_F_SKIP;
+               } else if (clli->cfg_flags == (CFG_F_MARKER | CFG_F_SKIP)) {
+                       rc = class_add_uuid(lustre_cfg_string(lcfg, 1),
+                                           lcfg->lcfg_nid);
+                       if (rc)
+                               CERROR("%s: failed to add uuid: rc = %d\n",
+                                      lsi->lsi_svname, rc);
+               }
+               break;
+       }
+       case LCFG_ADD_CONN: {
+               if (is_mdc_for_mdt0(lustre_cfg_string(lcfg, 0)))
+                       rc = lustre_lwp_add_conn(lcfg, lsi);
+               break;
+       }
+       default:
+               break;
+       }
+out:
+       RETURN(rc);
+}
+
+static int lustre_disconnect_lwp(struct super_block *sb)
+{
+       struct lustre_sb_info           *lsi = s2lsi(sb);
+       struct obd_device               *lwp;
+       char                            *lwpname = NULL;
+       char                            *logname = NULL;
+       struct lustre_cfg               *lcfg = NULL;
+       struct lustre_cfg_bufs          *bufs = NULL;
+       struct config_llog_instance     *cfg = NULL;
+       int                              rc;
+       ENTRY;
+
+       lwp = lustre_find_lwp(lsi, &lwpname, &logname);
+       if (IS_ERR(lwp) && PTR_ERR(lwp) != -ENOENT)
+               GOTO(out, rc = PTR_ERR(lwp));
+
+       LASSERT(lwpname != NULL);
+       LASSERT(logname != NULL);
+
+       OBD_ALLOC_PTR(cfg);
+       if (cfg == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       /* end log first */
+       cfg->cfg_instance = sb;
+       rc = lustre_end_log(sb, logname, cfg);
+       if (rc != 0) {
+               CERROR("%s: Can't end config log %s.\n", lwpname, logname);
+               GOTO(out, rc);
+       }
+
+       if (PTR_ERR(lwp) == -ENOENT) {
+               CDEBUG(D_CONFIG, "%s: lwp device wasn't started.\n",
+                      lsi->lsi_svname);
+               GOTO(out, rc = 0);
+       }
+
+       OBD_ALLOC_PTR(bufs);
+       if (bufs == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       lustre_cfg_bufs_reset(bufs, lwp->obd_name);
+       lustre_cfg_bufs_set_string(bufs, 1, NULL);
+       lcfg = lustre_cfg_new(LCFG_CLEANUP, bufs);
+       if (!lcfg)
+               GOTO(out, rc = -ENOMEM);
+
+       /* Disconnect import first. NULL is passed for the '@env', since
+        * it will not be used. */
+       rc = lwp->obd_lu_dev->ld_ops->ldo_process_config(NULL, lwp->obd_lu_dev,
+                                                        lcfg);
+out:
+       if (lcfg)
+               lustre_cfg_free(lcfg);
+       if (bufs)
+               OBD_FREE_PTR(bufs);
+       if (cfg)
+               OBD_FREE_PTR(cfg);
+       if (lwpname)
+               OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+       if (logname)
+               OBD_FREE(logname, MTI_NAME_MAXLEN);
+       RETURN(rc);
+}
+
+/**
+ * Stop the lwp for an OST/MDT target.
+ **/
+static int lustre_stop_lwp(struct super_block *sb)
+{
+       struct lustre_sb_info   *lsi = s2lsi(sb);
+       struct obd_device       *lwp = NULL;
+       char                    *lwpname = NULL;
+       int                      rc = 0;
+       ENTRY;
+
+       lwp = lustre_find_lwp(lsi, &lwpname, NULL);
+       if (IS_ERR(lwp)) {
+               CDEBUG(PTR_ERR(lwp) == -ENOENT ? D_CONFIG : D_ERROR,
+                      "%s: lwp wasn't started.\n", lsi->lsi_svname);
+               GOTO(out, rc = 0);
+       }
+
+       lwp->obd_force = 1;
+       rc = class_manual_cleanup(lwp);
+
+out:
+       if (lwpname != NULL)
+               OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+       RETURN(rc);
+}
+
+/**
+ * Start the lwp(fsname-MDT0000-lwp-OSTxxxx) for an OST or MDT target,
+ * which would be used to establish connection from OST to MDT0.
+ **/
+static int lustre_start_lwp(struct super_block *sb)
+{
+       struct lustre_sb_info       *lsi = s2lsi(sb);
+       struct config_llog_instance *cfg = NULL;
+       struct obd_device           *lwp;
+       char                        *lwpname = NULL;
+       char                        *logname = NULL;
+       int                          rc;
+       ENTRY;
+
+       lwp = lustre_find_lwp(lsi, &lwpname, &logname);
+
+       /* the lwp device has already been started */
+       if (lwp && !IS_ERR(lwp))
+               GOTO(out, rc = 0);
+
+       if (PTR_ERR(lwp) != -ENOENT)
+               GOTO(out, rc = PTR_ERR(lwp));
+
+       LASSERT(lwpname != NULL);
+       LASSERT(logname != NULL);
+
+       OBD_ALLOC_PTR(cfg);
+       if (cfg == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       cfg->cfg_callback = client_lwp_config_process;
+       cfg->cfg_instance = sb;
+
+       rc = lustre_process_log(sb, logname, cfg);
+out:
+       if (lwpname != NULL)
+               OBD_FREE(lwpname, MTI_NAME_MAXLEN);
+       if (logname != NULL)
+               OBD_FREE(logname, MTI_NAME_MAXLEN);
+       if (cfg != NULL)
+               OBD_FREE_PTR(cfg);
+       RETURN(rc);
+}
+
+DEFINE_MUTEX(server_start_lock);
+
+/* Stop MDS/OSS if nobody is using them */
+static int server_stop_servers(int lsiflags)
+{
+       struct obd_device *obd = NULL;
+       struct obd_type *type = NULL;
+       int rc = 0;
+       ENTRY;
+
+       mutex_lock(&server_start_lock);
+
+       /* Either an MDT or an OST or neither  */
+       /* if this was an MDT, and there are no more MDT's, clean up the MDS */
+       if (lsiflags & LDD_F_SV_TYPE_MDT) {
+               obd = class_name2obd(LUSTRE_MDS_OBDNAME);
+               if (obd != NULL)
+                       type = class_search_type(LUSTRE_MDT_NAME);
+       }
+
+       /* if this was an OST, and there are no more OST's, clean up the OSS */
+       if (lsiflags & LDD_F_SV_TYPE_OST) {
+               obd = class_name2obd(LUSTRE_OSS_OBDNAME);
+               if (obd != NULL)
+                       type = class_search_type(LUSTRE_OST_NAME);
+       }
+
+       if (obd != NULL && (type == NULL || type->typ_refcnt == 0)) {
+               int err;
+
+               obd->obd_force = 1;
+               /* obd_fail doesn't mean much on a server obd */
+               err = class_manual_cleanup(obd);
+               if (err != 0)
+                       rc = err;
+       }
+
+       mutex_unlock(&server_start_lock);
+
+       RETURN(rc);
+}
+
+int server_mti_print(const char *title, struct mgs_target_info *mti)
+{
+       PRINT_CMD(PRINT_MASK, "mti %s\n", title);
+       PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname);
+       PRINT_CMD(PRINT_MASK, "fs:     %s\n", mti->mti_fsname);
+       PRINT_CMD(PRINT_MASK, "uuid:   %s\n", mti->mti_uuid);
+       PRINT_CMD(PRINT_MASK, "ver: %d  flags: %#x\n",
+                 mti->mti_config_ver, mti->mti_flags);
+       return 0;
+}
+EXPORT_SYMBOL(server_mti_print);
+
+/* Generate data for registration */
+static int server_lsi2mti(struct lustre_sb_info *lsi,
+                         struct mgs_target_info *mti)
+{
+       lnet_process_id_t id;
+       int rc, i = 0;
+       int cplen = 0;
+       ENTRY;
+
+       if (!IS_SERVER(lsi))
+               RETURN(-EINVAL);
+
+       if (strlcpy(mti->mti_svname, lsi->lsi_svname, sizeof(mti->mti_svname))
+           >= sizeof(mti->mti_svname))
+               RETURN(-E2BIG);
+
+       mti->mti_nid_count = 0;
+       while (LNetGetId(i++, &id) != -ENOENT) {
+               if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
+                       continue;
+
+               /* If the server uses the --servicenode param, only allow the
+                * specified nids to be registered */
+               if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) != 0 &&
+                   class_match_nid(lsi->lsi_lmd->lmd_params,
+                                   PARAM_FAILNODE, id.nid) < 1)
+                       continue;
+
+               /* match specified network */
+               if (!class_match_net(lsi->lsi_lmd->lmd_params,
+                                    PARAM_NETWORK, LNET_NIDNET(id.nid)))
+                       continue;
+
+               mti->mti_nids[mti->mti_nid_count] = id.nid;
+               mti->mti_nid_count++;
+               if (mti->mti_nid_count >= MTI_NIDS_MAX) {
+                       CWARN("Only using first %d nids for %s\n",
+                             mti->mti_nid_count, mti->mti_svname);
+                       break;
+               }
+       }
+
+       mti->mti_lustre_ver = LUSTRE_VERSION_CODE;
+       mti->mti_config_ver = 0;
+
+       rc = server_name2fsname(lsi->lsi_svname, mti->mti_fsname, NULL);
+       if (rc != 0)
+               return rc;
+
+       rc = server_name2index(lsi->lsi_svname, &mti->mti_stripe_index, NULL);
+       if (rc < 0)
+               return rc;
+       /* Orion requires index to be set */
+       LASSERT(!(rc & LDD_F_NEED_INDEX));
+       /* keep only LDD flags */
+       mti->mti_flags = lsi->lsi_flags & LDD_F_MASK;
+       if (mti->mti_flags & (LDD_F_WRITECONF | LDD_F_VIRGIN))
+               mti->mti_flags |= LDD_F_UPDATE;
+       cplen = strlcpy(mti->mti_params, lsi->lsi_lmd->lmd_params,
+                       sizeof(mti->mti_params));
+       if (cplen >= sizeof(mti->mti_params))
+               return -E2BIG;
+       return 0;
+}
+
+/* Register an old or new target with the MGS. If needed MGS will construct
+   startup logs and assign index */
+static int server_register_target(struct lustre_sb_info *lsi)
+{
+       struct obd_device *mgc = lsi->lsi_mgc;
+       struct mgs_target_info *mti = NULL;
+       bool writeconf;
+       int rc;
+       ENTRY;
+
+       LASSERT(mgc);
+
+       if (!IS_SERVER(lsi))
+               RETURN(-EINVAL);
+
+       OBD_ALLOC_PTR(mti);
+       if (!mti)
+               RETURN(-ENOMEM);
+
+       rc = server_lsi2mti(lsi, mti);
+       if (rc)
+               GOTO(out, rc);
+
+       CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
+              mti->mti_svname, mti->mti_fsname,
+              libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
+              mti->mti_flags);
+
+       /* if write_conf is true, the registration must succeed */
+       writeconf = !!(lsi->lsi_flags & (LDD_F_NEED_INDEX | LDD_F_UPDATE));
+       mti->mti_flags |= LDD_F_OPC_REG;
+
+       /* Register the target */
+       /* FIXME use mgc_process_config instead */
+       rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
+                               sizeof(KEY_REGISTER_TARGET),
+                               KEY_REGISTER_TARGET,
+                               sizeof(*mti), mti, NULL);
+       if (rc) {
+               if (mti->mti_flags & LDD_F_ERROR) {
+                       LCONSOLE_ERROR_MSG(0x160,
+                               "%s: the MGS refuses to allow this server "
+                               "to start: rc = %d. Please see messages on "
+                               "the MGS.\n", lsi->lsi_svname, rc);
+               } else if (writeconf) {
+                       LCONSOLE_ERROR_MSG(0x15f,
+                               "%s: cannot register this server with the MGS: "
+                               "rc = %d. Is the MGS running?\n",
+                               lsi->lsi_svname, rc);
+               } else {
+                       CERROR("%s: error registering with the MGS: rc = %d "
+                              "(not fatal)\n", lsi->lsi_svname, rc);
+                       /* reset the error code for non-fatal error. */
+                       rc = 0;
+               }
+               GOTO(out, rc);
+       }
+
+out:
+       if (mti)
+               OBD_FREE_PTR(mti);
+       RETURN(rc);
+}
+
+/**
+ * Notify the MGS that this target is ready.
+ * Used by IR - if the MGS receives this message, it will notify clients.
+ */
+static int server_notify_target(struct super_block *sb, struct obd_device *obd)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *mgc = lsi->lsi_mgc;
+       struct mgs_target_info *mti = NULL;
+       int rc;
+       ENTRY;
+
+       LASSERT(mgc);
+
+       if (!(IS_SERVER(lsi)))
+               RETURN(-EINVAL);
+
+       OBD_ALLOC_PTR(mti);
+       if (!mti)
+               RETURN(-ENOMEM);
+       rc = server_lsi2mti(lsi, mti);
+       if (rc)
+               GOTO(out, rc);
+
+       mti->mti_instance = obd->u.obt.obt_instance;
+       mti->mti_flags |= LDD_F_OPC_READY;
+
+       /* FIXME use mgc_process_config instead */
+       rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
+                               sizeof(KEY_REGISTER_TARGET),
+                               KEY_REGISTER_TARGET,
+                               sizeof(*mti), mti, NULL);
+
+       /* Imperative recovery: check whether the MGS told us to use IR */
+       if (!rc && !(mti->mti_flags & LDD_F_ERROR) &&
+           (mti->mti_flags & LDD_F_IR_CAPABLE))
+               lsi->lsi_flags |= LDD_F_IR_CAPABLE;
+
+out:
+       if (mti)
+               OBD_FREE_PTR(mti);
+       RETURN(rc);
+
+}
+
+/** Start server targets: MDTs and OSTs
+ */
+static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
+{
+       struct obd_device *obd;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct config_llog_instance cfg;
+       struct lu_env env;
+       struct lu_device *dev;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname);
+
+       if (IS_MDT(lsi)) {
+               /* make sure the MDS is started */
+               mutex_lock(&server_start_lock);
+               obd = class_name2obd(LUSTRE_MDS_OBDNAME);
+               if (!obd) {
+                       rc = lustre_start_simple(LUSTRE_MDS_OBDNAME,
+                                                LUSTRE_MDS_NAME,
+                                                LUSTRE_MDS_OBDNAME"_uuid",
+                                                0, 0, 0, 0);
+                       if (rc) {
+                               mutex_unlock(&server_start_lock);
+                               CERROR("failed to start MDS: %d\n", rc);
+                               RETURN(rc);
+                       }
+               }
+               mutex_unlock(&server_start_lock);
+       }
+
+       /* If we're an OST, make sure the global OSS is running */
+       if (IS_OST(lsi)) {
+               /* make sure OSS is started */
+               mutex_lock(&server_start_lock);
+               obd = class_name2obd(LUSTRE_OSS_OBDNAME);
+               if (!obd) {
+                       rc = lustre_start_simple(LUSTRE_OSS_OBDNAME,
+                                                LUSTRE_OSS_NAME,
+                                                LUSTRE_OSS_OBDNAME"_uuid",
+                                                0, 0, 0, 0);
+                       if (rc) {
+                               mutex_unlock(&server_start_lock);
+                               CERROR("failed to start OSS: %d\n", rc);
+                               RETURN(rc);
+                       }
+               }
+               mutex_unlock(&server_start_lock);
+       }
+
+       /* Set the mgc fs to our server disk.  This allows the MGC to
+        * read and write configs locally, in case it can't talk to the MGS. */
+       if (lsi->lsi_srv_mnt) {
+               rc = server_mgc_set_fs(lsi->lsi_mgc, sb);
+               if (rc)
+                       GOTO(out_stop_service, rc);
+       }
+
+       /* Register with MGS */
+       rc = server_register_target(lsi);
+       if (rc)
+               GOTO(out_mgc, rc);
+
+       /* Let the target look up the mount using the target's name
+          (we can't pass the sb or mnt through class_process_config.) */
+       rc = server_register_mount(lsi->lsi_svname, sb, mnt);
+       if (rc)
+               GOTO(out_mgc, rc);
+
+       /* Start targets using the llog named for the target */
+       memset(&cfg, 0, sizeof(cfg));
+       cfg.cfg_callback = class_config_llog_handler;
+       rc = lustre_process_log(sb, lsi->lsi_svname, &cfg);
+       if (rc) {
+               CERROR("failed to start server %s: %d\n",
+                      lsi->lsi_svname, rc);
+               /* Do NOT call server_deregister_mount() here. This makes it
+                * impossible to find the mount later at cleanup time and
+                * leaves @lsi and other stuff leaked. -umka */
+               GOTO(out_mgc, rc);
+       }
+
+       obd = class_name2obd(lsi->lsi_svname);
+       if (!obd) {
+               CERROR("no server named %s was started\n", lsi->lsi_svname);
+               GOTO(out_mgc, rc = -ENXIO);
+       }
+
+       if (IS_OST(lsi) || IS_MDT(lsi)) {
+               rc = lustre_start_lwp(sb);
+               if (rc) {
+                       CERROR("%s: failed to start LWP: %d\n",
+                              lsi->lsi_svname, rc);
+                       GOTO(out_mgc, rc);
+               }
+       }
+
+       server_notify_target(sb, obd);
+
+       /* calculate recovery timeout, do it after lustre_process_log */
+       server_calc_timeout(lsi, obd);
+
+       /* log has been fully processed */
+       obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
+
+       /* log has been fully processed, let clients connect */
+       dev = obd->obd_lu_dev;
+       if (dev && dev->ld_ops->ldo_prepare) {
+               rc = lu_env_init(&env, dev->ld_type->ldt_ctx_tags);
+               if (rc == 0) {
+                       struct lu_context  session_ctx;
+
+                       lu_context_init(&session_ctx, LCT_SESSION);
+                       session_ctx.lc_thread = NULL;
+                       lu_context_enter(&session_ctx);
+                       env.le_ses = &session_ctx;
+
+                       rc = dev->ld_ops->ldo_prepare(&env, NULL, dev);
+
+                       lu_env_fini(&env);
+                       lu_context_exit(&session_ctx);
+                       lu_context_fini(&session_ctx);
+               }
+       }
+
+       /* abort recovery only on the complete stack:
+        * many devices can be involved */
+       if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) &&
+           (OBP(obd, iocontrol))) {
+               obd_iocontrol(OBD_IOC_ABORT_RECOVERY, obd->obd_self_export, 0,
+                             NULL, NULL);
+       }
+
+out_mgc:
+       /* Release the mgc fs for others to use */
+       if (lsi->lsi_srv_mnt)
+               server_mgc_clear_fs(lsi->lsi_mgc);
+
+out_stop_service:
+       if (rc != 0)
+               server_stop_servers(lsi->lsi_flags);
+
+       RETURN(rc);
+}
+
+static int lsi_prepare(struct lustre_sb_info *lsi)
+{
+       __u32 index;
+       int rc;
+       ENTRY;
+
+       LASSERT(lsi);
+       LASSERT(lsi->lsi_lmd);
+
+       /* The server name is given as a mount line option */
+       if (lsi->lsi_lmd->lmd_profile == NULL) {
+               LCONSOLE_ERROR("Can't determine server name\n");
+               RETURN(-EINVAL);
+       }
+
+       if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(lsi->lsi_svname))
+               RETURN(-ENAMETOOLONG);
+
+       strcpy(lsi->lsi_svname, lsi->lsi_lmd->lmd_profile);
+
+       /* Determine osd type */
+       if (lsi->lsi_lmd->lmd_osd_type != NULL) {
+               if (strlen(lsi->lsi_lmd->lmd_osd_type) >=
+                   sizeof(lsi->lsi_osd_type))
+                       RETURN(-ENAMETOOLONG);
+
+               strcpy(lsi->lsi_osd_type, lsi->lsi_lmd->lmd_osd_type);
+       } else {
+               strcpy(lsi->lsi_osd_type, LUSTRE_OSD_LDISKFS_NAME);
+       }
+
+       /* XXX: a temp. solution for components using fsfilt
+        *      to be removed in one of the subsequent patches */
+       if (!strcmp(lsi->lsi_osd_type, "osd-ldiskfs"))
+               strcpy(lsi->lsi_fstype, "ldiskfs");
+       else
+               strcpy(lsi->lsi_fstype, lsi->lsi_osd_type);
+
+       /* Determine server type */
+       rc = server_name2index(lsi->lsi_svname, &index, NULL);
+       if (rc < 0) {
+               if (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) {
+                       /* Assume we're a bare MGS */
+                       rc = 0;
+                       lsi->lsi_lmd->lmd_flags |= LMD_FLG_NOSVC;
+               } else {
+                       LCONSOLE_ERROR("Can't determine server type of '%s'\n",
+                                      lsi->lsi_svname);
+                       RETURN(rc);
+               }
+       }
+       lsi->lsi_flags |= rc;
+
+       /* Add mount line flags that used to be in ldd:
+        * writeconf, mgs, anything else?
+        */
+       lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ?
+               LDD_F_WRITECONF : 0;
+       lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_VIRGIN) ?
+               LDD_F_VIRGIN : 0;
+       lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_UPDATE) ?
+               LDD_F_UPDATE : 0;
+       lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ?
+               LDD_F_SV_TYPE_MGS : 0;
+       lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ?
+               LDD_F_NO_PRIMNODE : 0;
+
+       RETURN(0);
+}
+
+/*************** server mount ******************/
+
+/** Start the shutdown of servers at umount.
+ */
+static void server_put_super(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device     *obd;
+       char *tmpname, *extraname = NULL;
+       int tmpname_sz;
+       int lsiflags = lsi->lsi_flags;
+       ENTRY;
+
+       LASSERT(IS_SERVER(lsi));
+
+       tmpname_sz = strlen(lsi->lsi_svname) + 1;
+       OBD_ALLOC(tmpname, tmpname_sz);
+       memcpy(tmpname, lsi->lsi_svname, tmpname_sz);
+       CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
+       if (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
+               snprintf(tmpname, tmpname_sz, "MGS");
+
+       /* disconnect the lwp first to drain off the inflight request */
+       if (IS_OST(lsi) || IS_MDT(lsi)) {
+               int     rc;
+
+               rc = lustre_disconnect_lwp(sb);
+               if (rc && rc != -ETIMEDOUT)
+                       CERROR("%s: failed to disconnect lwp. (rc=%d)\n",
+                              tmpname, rc);
+       }
+
+       /* Stop the target */
+       if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
+           (IS_MDT(lsi) || IS_OST(lsi))) {
+               struct lustre_profile *lprof = NULL;
+
+               /* tell the mgc to drop the config log */
+               lustre_end_log(sb, lsi->lsi_svname, NULL);
+
+               /* COMPAT_146 - profile may get deleted in mgc_cleanup.
+                  If there are any setup/cleanup errors, save the lov
+                  name for safety cleanup later. */
+               lprof = class_get_profile(lsi->lsi_svname);
+               if (lprof && lprof->lp_dt) {
+                       OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1);
+                       strcpy(extraname, lprof->lp_dt);
+               }
+
+               obd = class_name2obd(lsi->lsi_svname);
+               if (obd) {
+                       CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
+                       if (lsiflags & LSI_UMOUNT_FAILOVER)
+                               obd->obd_fail = 1;
+                       /* We can't seem to give an error return code
+                        * to .put_super, so we better make sure we clean up! */
+                       obd->obd_force = 1;
+                       class_manual_cleanup(obd);
+               } else {
+                       CERROR("no obd %s\n", lsi->lsi_svname);
+                       server_deregister_mount(lsi->lsi_svname);
+               }
+       }
+
+       /* If they wanted the mgs to stop separately from the mdt, they
+          should have put it on a different device. */
+       if (IS_MGS(lsi)) {
+               /* if the MDS was started with --nomgs, don't stop the MGS here */
+               if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS))
+                       server_stop_mgs(sb);
+       }
+
+       if (IS_OST(lsi) || IS_MDT(lsi)) {
+               if (lustre_stop_lwp(sb) < 0)
+                       CERROR("%s: failed to stop lwp!\n", tmpname);
+       }
+
+       /* Clean the mgc and sb */
+       lustre_common_put_super(sb);
+
+       /* Wait till all in-progress cleanups are done;
+        * specifically we're interested in ofd cleanup,
+        * as it pins the OSS. */
+       obd_zombie_barrier();
+
+       /* Stop the servers (MDS, OSS) if no longer needed.  We must wait
+          until the target is really gone so that our type refcount check
+          is right. */
+       server_stop_servers(lsiflags);
+
+       /* In case of startup or cleanup err, stop related obds */
+       if (extraname) {
+               obd = class_name2obd(extraname);
+               if (obd) {
+                       CWARN("Cleaning orphaned obd %s\n", extraname);
+                       obd->obd_force = 1;
+                       class_manual_cleanup(obd);
+               }
+               OBD_FREE(extraname, strlen(extraname) + 1);
+       }
+
+       LCONSOLE_WARN("server umount %s complete\n", tmpname);
+       OBD_FREE(tmpname, tmpname_sz);
+       EXIT;
+}
+
+/** Called only for 'umount -f'
+ */
+static void server_umount_begin(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       ENTRY;
+
+       CDEBUG(D_MOUNT, "umount -f\n");
+       /* umount = failover
+          umount -f = force
+          no third way to do non-force, non-failover */
+       lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
+       EXIT;
+}
+
+static int server_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+       struct super_block *sb = dentry->d_sb;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_statfs statfs;
+       int rc;
+       ENTRY;
+
+       if (lsi->lsi_dt_dev) {
+               rc = dt_statfs(NULL, lsi->lsi_dt_dev, &statfs);
+               if (rc == 0) {
+                       statfs_unpack(buf, &statfs);
+                       buf->f_type = sb->s_magic;
+                       RETURN(0);
+               }
+       }
+
+       /* just return 0 */
+       buf->f_type = sb->s_magic;
+       buf->f_bsize = sb->s_blocksize;
+       buf->f_blocks = 1;
+       buf->f_bfree = 0;
+       buf->f_bavail = 0;
+       buf->f_files = 1;
+       buf->f_ffree = 0;
+       buf->f_namelen = NAME_MAX;
+       RETURN(0);
+}
+
+/** The operations we support directly on the superblock:
+ * mount, umount, and df.
+ */
+static struct super_operations server_ops = {
+       .put_super      = server_put_super,
+       .umount_begin   = server_umount_begin, /* umount -f */
+       .statfs         = server_statfs,
+};
+
+#define log2(n) ffz(~(n))
+#define LUSTRE_SUPER_MAGIC 0x0BD00BD1
+
+static int server_fill_super_common(struct super_block *sb)
+{
+       struct inode *root = NULL;
+       ENTRY;
+
+       CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
+
+       sb->s_blocksize = 4096;
+       sb->s_blocksize_bits = log2(sb->s_blocksize);
+       sb->s_magic = LUSTRE_SUPER_MAGIC;
+       sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */
+       sb->s_flags |= MS_RDONLY;
+       sb->s_op = &server_ops;
+
+       root = new_inode(sb);
+       if (!root) {
+               CERROR("Can't make root inode\n");
+               RETURN(-EIO);
+       }
+
+       /* returns -EIO for every operation */
+       /* make_bad_inode(root); -- badness - can't umount */
+       /* apparently we need to be a directory for the mount to finish */
+       root->i_mode = S_IFDIR;
+
+       sb->s_root = d_make_root(root);
+       if (!sb->s_root) {
+               CERROR("%s: can't make root dentry\n", sb->s_id);
+               RETURN(-EIO);
+       }
+
+       RETURN(0);
+}
+
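+/* Start (if not already running) the OSD device backing this server,
+ * connect to it, and cache its dt_device and backing mount in the lsi. */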
+static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags)
+{
+       struct lustre_mount_data *lmd = lsi->lsi_lmd;
+       struct obd_device        *obd;
+       struct dt_device_param    p;
+       char                      flagstr[16];
+       int                       rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT,
+              "Attempting to start %s, type=%s, lsifl=%x, mountfl=%lx\n",
+              lsi->lsi_svname, lsi->lsi_osd_type, lsi->lsi_flags, mflags);
+
+       sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname);
+       strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname);
+       strcat(lsi->lsi_osd_uuid, "_UUID");
+       sprintf(flagstr, "%lu:%lu", mflags, (unsigned long) lmd->lmd_flags);
+
+       obd = class_name2obd(lsi->lsi_osd_obdname);
+       if (obd == NULL) {
+               rc = lustre_start_simple(lsi->lsi_osd_obdname,
+                                        lsi->lsi_osd_type,
+                                        lsi->lsi_osd_uuid, lmd->lmd_dev,
+                                        flagstr, lsi->lsi_lmd->lmd_opts,
+                                        lsi->lsi_svname);
+               if (rc)
+                       GOTO(out, rc);
+               obd = class_name2obd(lsi->lsi_osd_obdname);
+               LASSERT(obd);
+       }
+
+       rc = obd_connect(NULL, &lsi->lsi_osd_exp,
+                        obd, &obd->obd_uuid, NULL, NULL);
+       if (rc) {
+               obd->obd_force = 1;
+               class_manual_cleanup(obd);
+               lsi->lsi_dt_dev = NULL;
+               GOTO(out, rc);
+       }
+
+       /* XXX: to keep supporting old components relying on lsi_srv_mnt,
+        *      we get this info from the OSD we just started */
+       LASSERT(obd->obd_lu_dev);
+       lsi->lsi_dt_dev = lu2dt_dev(obd->obd_lu_dev);
+       LASSERT(lsi->lsi_dt_dev);
+
+       dt_conf_get(NULL, lsi->lsi_dt_dev, &p);
+
+       lsi->lsi_srv_mnt = p.ddp_mnt;
+
+out:
+       RETURN(rc);
+}
+
+/** Fill in the superblock info for a Lustre server.
+ * Mount the device with the correct options.
+ * Read the on-disk config file.
+ * Start the services.
+ */
+int server_fill_super(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       int rc;
+       ENTRY;
+
+       rc = lsi_prepare(lsi);
+       if (rc)
+               RETURN(rc);
+
+       /* Start low level OSD */
+       rc = osd_start(lsi, sb->s_flags);
+       if (rc) {
+               CERROR("Unable to start osd on %s: %d\n",
+                      lsi->lsi_lmd->lmd_dev, rc);
+               lustre_put_lsi(sb);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_MOUNT, "Found service %s on device %s\n",
+              lsi->lsi_svname, lsi->lsi_lmd->lmd_dev);
+
+       if (class_name2obd(lsi->lsi_svname)) {
+               LCONSOLE_ERROR_MSG(0x161, "The target named %s is already "
+                                  "running. Double-mount may have compromised"
+                                  " the disk journal.\n",
+                                  lsi->lsi_svname);
+               lustre_put_lsi(sb);
+               RETURN(-EALREADY);
+       }
+
+       /* Start MGS before MGC */
+       if (IS_MGS(lsi) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) {
+               rc = server_start_mgs(sb);
+               if (rc)
+                       GOTO(out_mnt, rc);
+       }
+
+       /* Start MGC before servers */
+       rc = lustre_start_mgc(sb);
+       if (rc)
+               GOTO(out_mnt, rc);
+
+       /* Set up all obd devices for service */
+       if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
+           (IS_OST(lsi) || IS_MDT(lsi))) {
+               rc = server_start_targets(sb, lsi->lsi_srv_mnt);
+               if (rc < 0) {
+                       CERROR("Unable to start targets: %d\n", rc);
+                       GOTO(out_mnt, rc);
+               }
+               /* FIXME overmount client here, or can we just start a
+                * client log and client_fill_super on this sb?  We
+                * need to make sure server_put_super gets called too
+                * - ll_put_super calls lustre_common_put_super; check
+                * there for LSI_SERVER flag, call s_p_s if so.
+                *
+                * Probably should start client from new thread so we
+                * can return.  Client will not finish until all
+                * servers are connected.  Note - MGS-only server does
+                * NOT get a client, since there is no lustre fs
+                * associated - the MGS is for all lustre fs's */
+       }
+
+       rc = server_fill_super_common(sb);
+       if (rc)
+               GOTO(out_mnt, rc);
+
+       RETURN(0);
+out_mnt:
+       /* We jump here in case of failure while starting targets or MGS.
+        * In this case we can't just put @mnt and have to do real cleanup
+        * with stopping targets, etc. */
+       server_put_super(sb);
+       return rc;
+}
+
+/*
+ * Calculate timeout value for a target.
+ */
+void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd)
+{
+       struct lustre_mount_data *lmd;
+       int soft = 0;
+       int hard = 0;
+       int factor = 0;
+       bool has_ir = !!(lsi->lsi_flags & LDD_F_IR_CAPABLE);
+       int min = OBD_RECOVERY_TIME_MIN;
+
+       LASSERT(IS_SERVER(lsi));
+
+       lmd = lsi->lsi_lmd;
+       if (lmd) {
+               soft   = lmd->lmd_recovery_time_soft;
+               hard   = lmd->lmd_recovery_time_hard;
+               has_ir = has_ir && !(lmd->lmd_flags & LMD_FLG_NOIR);
+               obd->obd_no_ir = !has_ir;
+       }
+
+       if (soft == 0)
+               soft = OBD_RECOVERY_TIME_SOFT;
+       if (hard == 0)
+               hard = OBD_RECOVERY_TIME_HARD;
+
+       /* target may have ir_factor configured. */
+       factor = OBD_IR_FACTOR_DEFAULT;
+       if (obd->obd_recovery_ir_factor)
+               factor = obd->obd_recovery_ir_factor;
+
+       if (has_ir) {
+               int new_soft = soft;
+               int new_hard = hard;
+
+               /* adjust timeout value by imperative recovery */
+
+               new_soft = (soft * factor) / OBD_IR_FACTOR_MAX;
+               new_hard = (hard * factor) / OBD_IR_FACTOR_MAX;
+
+               /* make sure the timeout is not too short */
+               new_soft = max(min, new_soft);
+               new_hard = max(new_soft, new_hard);
+
+               LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery "
+                             "window shrunk from %d-%d down to %d-%d\n",
+                             obd->obd_name, soft, hard, new_soft, new_hard);
+
+               soft = new_soft;
+               hard = new_hard;
+       }
+
+       /* we're done */
+       obd->obd_recovery_timeout   = max(obd->obd_recovery_timeout, soft);
+       obd->obd_recovery_time_hard = hard;
+       obd->obd_recovery_ir_factor = factor;
+}
+EXPORT_SYMBOL(server_calc_timeout);
diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c
new file mode 100644 (file)
index 0000000..01a0e1f
--- /dev/null
@@ -0,0 +1,362 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent)
+{
+       dst->o_parent_oid = fid_oid(parent);
+       dst->o_parent_seq = fid_seq(parent);
+       dst->o_parent_ver = fid_ver(parent);
+       dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID;
+}
+EXPORT_SYMBOL(obdo_set_parent_fid);
+
+/* WARNING: the file systems must take care not to tinker with
+   attributes they don't manage (such as blocks). */
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
+{
+       obd_flag newvalid = 0;
+
+       if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+               CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
+                      valid, LTIME_S(src->i_mtime),
+                      LTIME_S(src->i_ctime));
+
+       if (valid & OBD_MD_FLATIME) {
+               dst->o_atime = LTIME_S(src->i_atime);
+               newvalid |= OBD_MD_FLATIME;
+       }
+       if (valid & OBD_MD_FLMTIME) {
+               dst->o_mtime = LTIME_S(src->i_mtime);
+               newvalid |= OBD_MD_FLMTIME;
+       }
+       if (valid & OBD_MD_FLCTIME) {
+               dst->o_ctime = LTIME_S(src->i_ctime);
+               newvalid |= OBD_MD_FLCTIME;
+       }
+       if (valid & OBD_MD_FLSIZE) {
+               dst->o_size = i_size_read(src);
+               newvalid |= OBD_MD_FLSIZE;
+       }
+       if (valid & OBD_MD_FLBLOCKS) {  /* allocation of space (x512 bytes) */
+               dst->o_blocks = src->i_blocks;
+               newvalid |= OBD_MD_FLBLOCKS;
+       }
+       if (valid & OBD_MD_FLBLKSZ) {   /* optimal block size */
+               dst->o_blksize = ll_inode_blksize(src);
+               newvalid |= OBD_MD_FLBLKSZ;
+       }
+       if (valid & OBD_MD_FLTYPE) {
+               dst->o_mode = (dst->o_mode & S_IALLUGO) |
+                             (src->i_mode & S_IFMT);
+               newvalid |= OBD_MD_FLTYPE;
+       }
+       if (valid & OBD_MD_FLMODE) {
+               dst->o_mode = (dst->o_mode & S_IFMT) |
+                             (src->i_mode & S_IALLUGO);
+               newvalid |= OBD_MD_FLMODE;
+       }
+       if (valid & OBD_MD_FLUID) {
+               dst->o_uid = src->i_uid;
+               newvalid |= OBD_MD_FLUID;
+       }
+       if (valid & OBD_MD_FLGID) {
+               dst->o_gid = src->i_gid;
+               newvalid |= OBD_MD_FLGID;
+       }
+       if (valid & OBD_MD_FLFLAGS) {
+               dst->o_flags = ll_inode_flags(src);
+               newvalid |= OBD_MD_FLFLAGS;
+       }
+       dst->o_valid |= newvalid;
+}
+EXPORT_SYMBOL(obdo_from_inode);
+
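+/* Copy the fields selected by @valid from one obdo to another, merging the
+ * file type and permission bits and updating dst->o_valid accordingly. */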
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid)
+{
+       CDEBUG(D_INODE, "src obdo "DOSTID" valid "LPX64", dst obdo "DOSTID"\n",
+              POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi));
+       if (valid & OBD_MD_FLATIME)
+               dst->o_atime = src->o_atime;
+       if (valid & OBD_MD_FLMTIME)
+               dst->o_mtime = src->o_mtime;
+       if (valid & OBD_MD_FLCTIME)
+               dst->o_ctime = src->o_ctime;
+       if (valid & OBD_MD_FLSIZE)
+               dst->o_size = src->o_size;
+       if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+               dst->o_blocks = src->o_blocks;
+       if (valid & OBD_MD_FLBLKSZ)
+               dst->o_blksize = src->o_blksize;
+       if (valid & OBD_MD_FLTYPE)
+               dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT);
+       if (valid & OBD_MD_FLMODE)
+               dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+       if (valid & OBD_MD_FLUID)
+               dst->o_uid = src->o_uid;
+       if (valid & OBD_MD_FLGID)
+               dst->o_gid = src->o_gid;
+       if (valid & OBD_MD_FLFLAGS)
+               dst->o_flags = src->o_flags;
+       if (valid & OBD_MD_FLFID) {
+               dst->o_parent_seq = src->o_parent_seq;
+               dst->o_parent_ver = src->o_parent_ver;
+       }
+       if (valid & OBD_MD_FLGENER)
+               dst->o_parent_oid = src->o_parent_oid;
+       if (valid & OBD_MD_FLHANDLE)
+               dst->o_handle = src->o_handle;
+       if (valid & OBD_MD_FLCOOKIE)
+               dst->o_lcookie = src->o_lcookie;
+
+       dst->o_valid |= valid;
+}
+EXPORT_SYMBOL(obdo_cpy_md);
+
+/* returns FALSE if all fields selected by @compare are equal,
+ * TRUE if any of them differ */
+int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare)
+{
+       int res = 0;
+
+       if ( compare & OBD_MD_FLATIME )
+               res = (res || (dst->o_atime != src->o_atime));
+       if ( compare & OBD_MD_FLMTIME )
+               res = (res || (dst->o_mtime != src->o_mtime));
+       if ( compare & OBD_MD_FLCTIME )
+               res = (res || (dst->o_ctime != src->o_ctime));
+       if ( compare & OBD_MD_FLSIZE )
+               res = (res || (dst->o_size != src->o_size));
+       if ( compare & OBD_MD_FLBLOCKS ) /* allocation of space */
+               res = (res || (dst->o_blocks != src->o_blocks));
+       if ( compare & OBD_MD_FLBLKSZ )
+               res = (res || (dst->o_blksize != src->o_blksize));
+       if ( compare & OBD_MD_FLTYPE )
+               res = (res || (((dst->o_mode ^ src->o_mode) & S_IFMT) != 0));
+       if ( compare & OBD_MD_FLMODE )
+               res = (res || (((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0));
+       if ( compare & OBD_MD_FLUID )
+               res = (res || (dst->o_uid != src->o_uid));
+       if ( compare & OBD_MD_FLGID )
+               res = (res || (dst->o_gid != src->o_gid));
+       if ( compare & OBD_MD_FLFLAGS )
+               res = (res || (dst->o_flags != src->o_flags));
+       if ( compare & OBD_MD_FLNLINK )
+               res = (res || (dst->o_nlink != src->o_nlink));
+       if ( compare & OBD_MD_FLFID ) {
+               res = (res || (dst->o_parent_seq != src->o_parent_seq));
+               res = (res || (dst->o_parent_ver != src->o_parent_ver));
+       }
+       if ( compare & OBD_MD_FLGENER )
+               res = (res || (dst->o_parent_oid != src->o_parent_oid));
+       /* XXX Don't know if these should be included here - wasn't previously
+       if ( compare & OBD_MD_FLINLINE )
+               res = (res || memcmp(dst->o_inline, src->o_inline));
+       */
+       return res;
+}
+EXPORT_SYMBOL(obdo_cmp_md);
+
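+/* Initialize an obd_ioobj from an obdo for a bulk I/O request, defaulting
+ * the object sequence to MDT0 when no group is valid and clearing the
+ * max_brw field. */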
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj)
+{
+       ioobj->ioo_oid = oa->o_oi;
+       if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP)))
+               ostid_set_seq_mdt0(&ioobj->ioo_oid);
+
+       /* Since 2.4 this does not contain o_mode in the low 16 bits.
+        * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */
+       ioobj->ioo_max_brw = 0;
+}
+EXPORT_SYMBOL(obdo_to_ioobj);
+
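+/* Copy the attributes selected by @ia_valid from a kernel iattr into an
+ * obdo, setting the corresponding OBD_MD_* bits in oa->o_valid. */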
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid)
+{
+       if (ia_valid & ATTR_ATIME) {
+               oa->o_atime = LTIME_S(attr->ia_atime);
+               oa->o_valid |= OBD_MD_FLATIME;
+       }
+       if (ia_valid & ATTR_MTIME) {
+               oa->o_mtime = LTIME_S(attr->ia_mtime);
+               oa->o_valid |= OBD_MD_FLMTIME;
+       }
+       if (ia_valid & ATTR_CTIME) {
+               oa->o_ctime = LTIME_S(attr->ia_ctime);
+               oa->o_valid |= OBD_MD_FLCTIME;
+       }
+       if (ia_valid & ATTR_SIZE) {
+               oa->o_size = attr->ia_size;
+               oa->o_valid |= OBD_MD_FLSIZE;
+       }
+       if (ia_valid & ATTR_MODE) {
+               oa->o_mode = attr->ia_mode;
+               oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE;
+               if (!current_is_in_group(oa->o_gid) &&
+                   !cfs_capable(CFS_CAP_FSETID))
+                       oa->o_mode &= ~S_ISGID;
+       }
+       if (ia_valid & ATTR_UID) {
+               oa->o_uid = attr->ia_uid;
+               oa->o_valid |= OBD_MD_FLUID;
+       }
+       if (ia_valid & ATTR_GID) {
+               oa->o_gid = attr->ia_gid;
+               oa->o_valid |= OBD_MD_FLGID;
+       }
+}
+EXPORT_SYMBOL(obdo_from_iattr);
+
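+/* Inverse of obdo_from_iattr(): fill a kernel iattr from the obdo fields
+ * that are both requested in @valid and marked valid in oa->o_valid. */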
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid)
+{
+       valid &= oa->o_valid;
+
+       if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+               CDEBUG(D_INODE, "valid "LPX64", new time "LPU64"/"LPU64"\n",
+                      oa->o_valid, oa->o_mtime, oa->o_ctime);
+
+       attr->ia_valid = 0;
+       if (valid & OBD_MD_FLATIME) {
+               LTIME_S(attr->ia_atime) = oa->o_atime;
+               attr->ia_valid |= ATTR_ATIME;
+       }
+       if (valid & OBD_MD_FLMTIME) {
+               LTIME_S(attr->ia_mtime) = oa->o_mtime;
+               attr->ia_valid |= ATTR_MTIME;
+       }
+       if (valid & OBD_MD_FLCTIME) {
+               LTIME_S(attr->ia_ctime) = oa->o_ctime;
+               attr->ia_valid |= ATTR_CTIME;
+       }
+       if (valid & OBD_MD_FLSIZE) {
+               attr->ia_size = oa->o_size;
+               attr->ia_valid |= ATTR_SIZE;
+       }
+#if 0   /* you shouldn't be able to change a file's type with setattr */
+       if (valid & OBD_MD_FLTYPE) {
+               attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT);
+               attr->ia_valid |= ATTR_MODE;
+       }
+#endif
+       if (valid & OBD_MD_FLMODE) {
+               attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT);
+               attr->ia_valid |= ATTR_MODE;
+               if (!current_is_in_group(oa->o_gid) &&
+                   !cfs_capable(CFS_CAP_FSETID))
+                       attr->ia_mode &= ~S_ISGID;
+       }
+       if (valid & OBD_MD_FLUID) {
+               attr->ia_uid = oa->o_uid;
+               attr->ia_valid |= ATTR_UID;
+       }
+       if (valid & OBD_MD_FLGID) {
+               attr->ia_gid = oa->o_gid;
+               attr->ia_valid |= ATTR_GID;
+       }
+}
+EXPORT_SYMBOL(iattr_from_obdo);
+
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid)
+{
+       iattr_from_obdo(&op_data->op_attr, oa, valid);
+       if (valid & OBD_MD_FLBLOCKS) {
+               op_data->op_attr_blocks = oa->o_blocks;
+               op_data->op_attr.ia_valid |= ATTR_BLOCKS;
+       }
+       if (valid & OBD_MD_FLFLAGS) {
+               ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
+                       oa->o_flags;
+               op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+       }
+}
+EXPORT_SYMBOL(md_from_obdo);
+
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+                 unsigned int valid)
+{
+       obdo_from_iattr(oa, &op_data->op_attr, valid);
+       if (valid & ATTR_BLOCKS) {
+               oa->o_blocks = op_data->op_attr_blocks;
+               oa->o_valid |= OBD_MD_FLBLOCKS;
+       }
+       if (valid & ATTR_ATTR_FLAG) {
+               oa->o_flags =
+                       ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+               oa->o_valid |= OBD_MD_FLFLAGS;
+       }
+}
+EXPORT_SYMBOL(obdo_from_md);
+
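+/* Byte-swap the numeric obdo fields from CPU to little-endian order. */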
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo)
+{
+       dobdo->o_size = cpu_to_le64(sobdo->o_size);
+       dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime);
+       dobdo->o_atime = cpu_to_le64(sobdo->o_atime);
+       dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime);
+       dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks);
+       dobdo->o_mode = cpu_to_le32(sobdo->o_mode);
+       dobdo->o_uid = cpu_to_le32(sobdo->o_uid);
+       dobdo->o_gid = cpu_to_le32(sobdo->o_gid);
+       dobdo->o_flags = cpu_to_le32(sobdo->o_flags);
+       dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink);
+       dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize);
+       dobdo->o_valid = cpu_to_le64(sobdo->o_valid);
+}
+EXPORT_SYMBOL(obdo_cpu_to_le);
+
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo)
+{
+       dobdo->o_size = le64_to_cpu(sobdo->o_size);
+       dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime);
+       dobdo->o_atime = le64_to_cpu(sobdo->o_atime);
+       dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime);
+       dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks);
+       dobdo->o_mode = le32_to_cpu(sobdo->o_mode);
+       dobdo->o_uid = le32_to_cpu(sobdo->o_uid);
+       dobdo->o_gid = le32_to_cpu(sobdo->o_gid);
+       dobdo->o_flags = le32_to_cpu(sobdo->o_flags);
+       dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink);
+       dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize);
+       dobdo->o_valid = le64_to_cpu(sobdo->o_valid);
+}
+EXPORT_SYMBOL(obdo_le_to_cpu);
diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
new file mode 100644 (file)
index 0000000..c3b7a78
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/statfs_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <lustre_export.h>
+#include <lustre_net.h>
+#include <obd_support.h>
+#include <obd_class.h>
+
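+/* Pack a kernel kstatfs into the Lustre obd_statfs structure. */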
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs)
+{
+       memset(osfs, 0, sizeof(*osfs));
+       osfs->os_type = sfs->f_type;
+       osfs->os_blocks = sfs->f_blocks;
+       osfs->os_bfree = sfs->f_bfree;
+       osfs->os_bavail = sfs->f_bavail;
+       osfs->os_files = sfs->f_files;
+       osfs->os_ffree = sfs->f_ffree;
+       osfs->os_bsize = sfs->f_bsize;
+       osfs->os_namelen = sfs->f_namelen;
+}
+EXPORT_SYMBOL(statfs_pack);
+
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs)
+{
+       memset(sfs, 0, sizeof(*sfs));
+       sfs->f_type = osfs->os_type;
+       sfs->f_blocks = osfs->os_blocks;
+       sfs->f_bfree = osfs->os_bfree;
+       sfs->f_bavail = osfs->os_bavail;
+       sfs->f_files = osfs->os_files;
+       sfs->f_ffree = osfs->os_ffree;
+       sfs->f_bsize = osfs->os_bsize;
+       sfs->f_namelen = osfs->os_namelen;
+}
+EXPORT_SYMBOL(statfs_unpack);
diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c
new file mode 100644 (file)
index 0000000..af5f27f
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/uuid.c
+ *
+ * UUID helper routines for the OBD class driver
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+
+
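+/* Read @nob bytes from *ptr as a big-endian value, advancing the pointer. */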
+static inline __u32 consume(int nob, __u8 **ptr)
+{
+       __u32 value;
+
+       LASSERT(nob <= sizeof value);
+
+       for (value = 0; nob > 0; --nob)
+               value = (value << 8) | *((*ptr)++);
+       return value;
+}
+
+#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr))
+
+static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr)
+{
+       __u8 *ptr = in;
+
+       LASSERT(nr * sizeof *uu == sizeof(class_uuid_t));
+
+       while (nr-- > 0)
+               CONSUME(uu[nr], &ptr);
+}
+
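+/* Format a binary class_uuid_t into the canonical 8-4-4-4-12 hex string. */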
+void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out)
+{
+       /* uu as an array of __u16's */
+       __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)];
+
+       CLASSERT(ARRAY_SIZE(uuid) == 8);
+
+       uuid_unpack(uu, uuid, ARRAY_SIZE(uuid));
+       sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x",
+               uuid[0], uuid[1], uuid[2], uuid[3],
+               uuid[4], uuid[5], uuid[6], uuid[7]);
+}
+EXPORT_SYMBOL(class_uuid_unparse);
diff --git a/drivers/staging/lustre/lustre/obdecho/Makefile b/drivers/staging/lustre/lustre/obdecho/Makefile
new file mode 100644 (file)
index 0000000..4c48e24
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += obdecho.o
+obdecho-y := echo_client.o lproc_echo.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/obdecho/echo.c b/drivers/staging/lustre/lustre/obdecho/echo.c
new file mode 100644 (file)
index 0000000..9e64939
--- /dev/null
@@ -0,0 +1,679 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+
+#include "echo_internal.h"
+
+/* The echo objid needs to be below 2^32, because regular FID numbers are
+ * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. b=23335 */
+#define ECHO_INIT_OID  0x10000000ULL
+#define ECHO_HANDLE_MAGIC    0xabcd0123fedc9876ULL
+
+#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_CACHE_SHIFT)
+static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES];
+
+enum {
+       LPROC_ECHO_READ_BYTES = 1,
+       LPROC_ECHO_WRITE_BYTES = 2,
+       LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES + 1
+};
+
+static int echo_connect(const struct lu_env *env,
+                       struct obd_export **exp, struct obd_device *obd,
+                       struct obd_uuid *cluuid, struct obd_connect_data *data,
+                       void *localdata)
+{
+       struct lustre_handle conn = { 0 };
+       int rc;
+
+       data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED;
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc) {
+               CERROR("can't connect %d\n", rc);
+               return rc;
+       }
+       *exp = class_conn2export(&conn);
+
+       return 0;
+}
+
+static int echo_disconnect(struct obd_export *exp)
+{
+       LASSERT (exp != NULL);
+
+       return server_disconnect_export(exp);
+}
+
+static int echo_init_export(struct obd_export *exp)
+{
+       return ldlm_init_export(exp);
+}
+
+static int echo_destroy_export(struct obd_export *exp)
+{
+       ENTRY;
+
+       target_destroy_export(exp);
+       ldlm_destroy_export(exp);
+
+       RETURN(0);
+}
+
+static __u64 echo_next_id(struct obd_device *obddev)
+{
+       obd_id id;
+
+       spin_lock(&obddev->u.echo.eo_lock);
+       id = ++obddev->u.echo.eo_lastino;
+       spin_unlock(&obddev->u.echo.eo_lock);
+
+       return id;
+}
+
+static int echo_create(const struct lu_env *env, struct obd_export *exp,
+                      struct obdo *oa, struct lov_stripe_md **ea,
+                      struct obd_trans_info *oti)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+
+       if (!obd) {
+               CERROR("invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               return -EINVAL;
+       }
+
+       if (!(oa->o_mode & S_IFMT)) {
+               CERROR("echo obd: no type!\n");
+               return -ENOENT;
+       }
+
+       if (!(oa->o_valid & OBD_MD_FLTYPE)) {
+               CERROR("invalid o_valid "LPX64"\n", oa->o_valid);
+               return -EINVAL;
+       }
+
+       ostid_set_seq_echo(&oa->o_oi);
+       ostid_set_id(&oa->o_oi, echo_next_id(obd));
+       oa->o_valid = OBD_MD_FLID;
+
+       return 0;
+}
+
+static int echo_destroy(const struct lu_env *env, struct obd_export *exp,
+                       struct obdo *oa, struct lov_stripe_md *ea,
+                       struct obd_trans_info *oti, struct obd_export *md_exp,
+                       void *capa)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+
+       ENTRY;
+       if (!obd) {
+               CERROR("invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+
+       if (!(oa->o_valid & OBD_MD_FLID)) {
+               CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid);
+               RETURN(-EINVAL);
+       }
+
+       if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino ||
+           ostid_id(&oa->o_oi) < ECHO_INIT_OID) {
+               CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi));
+               RETURN(-EINVAL);
+       }
+
+       RETURN(0);
+}
+
+static int echo_getattr(const struct lu_env *env, struct obd_export *exp,
+                       struct obd_info *oinfo)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       obd_id id = ostid_id(&oinfo->oi_oa->o_oi);
+
+       ENTRY;
+       if (!obd) {
+               CERROR("invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+
+       if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
+               CERROR("obdo missing FLID valid flag: "LPX64"\n",
+                      oinfo->oi_oa->o_valid);
+               RETURN(-EINVAL);
+       }
+
+       obdo_cpy_md(oinfo->oi_oa, &obd->u.echo.eo_oa, oinfo->oi_oa->o_valid);
+       ostid_set_seq_echo(&oinfo->oi_oa->o_oi);
+       ostid_set_id(&oinfo->oi_oa->o_oi, id);
+
+       RETURN(0);
+}
+
+static int echo_setattr(const struct lu_env *env, struct obd_export *exp,
+                       struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+
+       ENTRY;
+       if (!obd) {
+               CERROR("invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+
+       if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
+               CERROR("obdo missing FLID valid flag: "LPX64"\n",
+                      oinfo->oi_oa->o_valid);
+               RETURN(-EINVAL);
+       }
+
+       memcpy(&obd->u.echo.eo_oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
+
+       if (ostid_id(&oinfo->oi_oa->o_oi) & 4) {
+               /* Save lock to force ACKed reply */
+               ldlm_lock_addref (&obd->u.echo.eo_nl_lock, LCK_NL);
+               oti->oti_ack_locks[0].mode = LCK_NL;
+               oti->oti_ack_locks[0].lock = obd->u.echo.eo_nl_lock;
+       }
+
+       RETURN(0);
+}
+
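+/* Fill each OBD_ECHO_BLOCK_SIZE block of the page with a debug pattern:
+ * the object id and offset for reads, or a fixed 0xecc0... marker for
+ * writes. */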
+static void
+echo_page_debug_setup(struct page *page, int rw, obd_id id,
+                     __u64 offset, int len)
+{
+       int   page_offset = offset & ~CFS_PAGE_MASK;
+       char *addr      = ((char *)kmap(page)) + page_offset;
+
+       if (len % OBD_ECHO_BLOCK_SIZE != 0)
+               CERROR("Unexpected block size %d\n", len);
+
+       while (len > 0) {
+               if (rw & OBD_BRW_READ)
+                       block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE,
+                                         offset, id);
+               else
+                       block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE,
+                                         0xecc0ecc0ecc0ecc0ULL,
+                                         0xecc0ecc0ecc0ecc0ULL);
+
+               addr   += OBD_ECHO_BLOCK_SIZE;
+               offset += OBD_ECHO_BLOCK_SIZE;
+               len    -= OBD_ECHO_BLOCK_SIZE;
+       }
+
+       kunmap(page);
+}
+
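+/* Verify the debug pattern written by echo_page_debug_setup(); returns the
+ * first non-zero error from block_debug_check(), or 0 if all blocks match. */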
+static int
+echo_page_debug_check(struct page *page, obd_id id,
+                     __u64 offset, int len)
+{
+       int   page_offset = offset & ~CFS_PAGE_MASK;
+       char *addr      = ((char *)kmap(page)) + page_offset;
+       int   rc          = 0;
+       int   rc2;
+
+       if (len % OBD_ECHO_BLOCK_SIZE != 0)
+               CERROR("Unexpected block size %d\n", len);
+
+       while (len > 0) {
+               rc2 = block_debug_check("echo", addr, OBD_ECHO_BLOCK_SIZE,
+                                       offset, id);
+
+               if (rc2 != 0 && rc == 0)
+                       rc = rc2;
+
+               addr   += OBD_ECHO_BLOCK_SIZE;
+               offset += OBD_ECHO_BLOCK_SIZE;
+               len    -= OBD_ECHO_BLOCK_SIZE;
+       }
+
+       kunmap(page);
+
+       return (rc);
+}
+
+/* This allows us to verify that desc_private is passed unmolested */
+#define DESC_PRIV 0x10293847
+
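+/* Expand one remote niobuf into page-sized local niobufs, reusing the
+ * preallocated persistent pages for ECHO_PERSISTENT_OBJID and allocating
+ * fresh pages otherwise. */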
+static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj,
+                            struct niobuf_remote *nb, int *pages,
+                            struct niobuf_local *lb, int cmd, int *left)
+{
+       int gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ?
+                       GFP_HIGHUSER : GFP_IOFS;
+       int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID;
+       int debug_setup = (!ispersistent &&
+                          (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+                          (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+       struct niobuf_local *res = lb;
+       obd_off offset = nb->offset;
+       int len = nb->len;
+
+       while (len > 0) {
+               int plen = PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1));
+               if (len < plen)
+                       plen = len;
+
+               /* check for local buf overflow */
+               if (*left == 0)
+                       return -EINVAL;
+
+               res->lnb_file_offset = offset;
+               res->len = plen;
+               LASSERT((res->lnb_file_offset & ~CFS_PAGE_MASK) + res->len <=
+                       PAGE_CACHE_SIZE);
+
+               if (ispersistent &&
+                   ((res->lnb_file_offset >> PAGE_CACHE_SHIFT) <
+                     ECHO_PERSISTENT_PAGES)) {
+                       res->page =
+                               echo_persistent_pages[res->lnb_file_offset >>
+                                                     PAGE_CACHE_SHIFT];
+                       /* Take extra ref so __free_pages() can be called OK */
+                       get_page (res->page);
+               } else {
+                       OBD_PAGE_ALLOC(res->page, gfp_mask);
+                       if (res->page == NULL) {
+                               CERROR("can't get page for id " DOSTID"\n",
+                                      POSTID(&obj->ioo_oid));
+                               return -ENOMEM;
+                       }
+               }
+
+               CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n",
+                      res->page, res->lnb_file_offset, res->len);
+
+               if (cmd & OBD_BRW_READ)
+                       res->rc = res->len;
+
+               if (debug_setup)
+                       echo_page_debug_setup(res->page, cmd,
+                                             ostid_id(&obj->ioo_oid),
+                                             res->lnb_file_offset, res->len);
+
+               offset += plen;
+               len -= plen;
+               res++;
+
+               (*left)--;
+               (*pages)++;
+       }
+
+       return 0;
+}
+
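+/* Check (optionally) and free the local pages covering one remote niobuf. */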
+static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj,
+                           struct niobuf_remote *rb, int *pgs,
+                           struct niobuf_local *lb, int verify)
+{
+       struct niobuf_local *res = lb;
+       obd_off start  = rb->offset >> PAGE_CACHE_SHIFT;
+       obd_off end    = (rb->offset + rb->len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       int     count  = (int)(end - start);
+       int     rc     = 0;
+       int     i;
+
+       for (i = 0; i < count; i++, (*pgs)++, res++) {
+               struct page *page = res->page;
+               void       *addr;
+
+               if (page == NULL) {
+                       CERROR("null page objid "LPU64":%p, buf %d/%d\n",
+                              ostid_id(&obj->ioo_oid), page, i,
+                              obj->ioo_bufcnt);
+                       return -EFAULT;
+               }
+
+               addr = kmap(page);
+
+               CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n",
+                      res->page, addr, res->lnb_file_offset);
+
+               if (verify) {
+                       int vrc = echo_page_debug_check(page,
+                                                       ostid_id(&obj->ioo_oid),
+                                                       res->lnb_file_offset,
+                                                       res->len);
+                       /* check all the pages always */
+                       if (vrc != 0 && rc == 0)
+                               rc = vrc;
+               }
+
+               kunmap(page);
+               /* NB see comment above regarding persistent pages */
+               OBD_PAGE_FREE(page);
+       }
+
+       return rc;
+}
+
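+/* obd_preprw() method: set up local niobufs for a bulk read or write and
+ * record the transfer size in the per-device stats. */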
+static int echo_preprw(const struct lu_env *env, int cmd,
+                      struct obd_export *export, struct obdo *oa,
+                      int objcount, struct obd_ioobj *obj,
+                      struct niobuf_remote *nb, int *pages,
+                      struct niobuf_local *res, struct obd_trans_info *oti,
+                      struct lustre_capa *unused)
+{
+       struct obd_device *obd;
+       int tot_bytes = 0;
+       int rc = 0;
+       int i, left;
+       ENTRY;
+
+       obd = export->exp_obd;
+       if (obd == NULL)
+               RETURN(-EINVAL);
+
+       /* Temp fix to stop falling foul of osc_announce_cached() */
+       oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT);
+
+       memset(res, 0, sizeof(*res) * *pages);
+
+       CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n",
+              cmd == OBD_BRW_READ ? "reading" : "writing", objcount, *pages);
+
+       if (oti)
+               oti->oti_handle = (void *)DESC_PRIV;
+
+       left = *pages;
+       *pages = 0;
+
+       for (i = 0; i < objcount; i++, obj++) {
+               int j;
+
+               for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) {
+
+                       rc = echo_map_nb_to_lb(oa, obj, nb, pages,
+                                              res + *pages, cmd, &left);
+                       if (rc)
+                               GOTO(preprw_cleanup, rc);
+
+                       tot_bytes += nb->len;
+               }
+       }
+
+       atomic_add(*pages, &obd->u.echo.eo_prep);
+
+       if (cmd & OBD_BRW_READ)
+               lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES,
+                                   tot_bytes);
+       else
+               lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES,
+                                   tot_bytes);
+
+       CDEBUG(D_PAGE, "%d pages allocated after prep\n",
+              atomic_read(&obd->u.echo.eo_prep));
+
+       RETURN(0);
+
+preprw_cleanup:
+       /* It is possible that we would rather handle errors by allowing
+        * any already-set-up pages to complete, rather than tearing them
+        * all down again.  I believe that this is what the in-kernel
+        * prep/commit operations do.
+        */
+       CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount);
+       for (i = 0; i < *pages; i++) {
+               kunmap(res[i].page);
+               /* NB if this is a persistent page, __free_pages will just
+                * lose the extra ref gained above */
+               OBD_PAGE_FREE(res[i].page);
+               res[i].page = NULL;
+               atomic_dec(&obd->u.echo.eo_prep);
+       }
+
+       return rc;
+}
+
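+/* obd_commitrw() method: optionally verify the debug pattern in the pages
+ * set up by echo_preprw(), then release them. */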
+static int echo_commitrw(const struct lu_env *env, int cmd,
+                        struct obd_export *export, struct obdo *oa,
+                        int objcount, struct obd_ioobj *obj,
+                        struct niobuf_remote *rb, int niocount,
+                        struct niobuf_local *res, struct obd_trans_info *oti,
+                        int rc)
+{
+       struct obd_device *obd;
+       int pgs = 0;
+       int i;
+       ENTRY;
+
+       obd = export->exp_obd;
+       if (obd == NULL)
+               RETURN(-EINVAL);
+
+       if (rc)
+               GOTO(commitrw_cleanup, rc);
+
+       if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) {
+               CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n",
+                      objcount, niocount);
+       } else {
+               CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n",
+                      objcount, niocount);
+       }
+
+       if (niocount && res == NULL) {
+               CERROR("NULL res niobuf with niocount %d\n", niocount);
+               RETURN(-EINVAL);
+       }
+
+       LASSERT(oti == NULL || oti->oti_handle == (void *)DESC_PRIV);
+
+       for (i = 0; i < objcount; i++, obj++) {
+               int verify = (rc == 0 &&
+                            ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID &&
+                             (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+                             (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+               int j;
+
+               for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) {
+                       int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs],
+                                                  verify);
+                       if (vrc == 0)
+                               continue;
+
+                       if (vrc == -EFAULT)
+                               GOTO(commitrw_cleanup, rc = vrc);
+
+                       if (rc == 0)
+                               rc = vrc;
+               }
+
+       }
+
+       atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+       CDEBUG(D_PAGE, "%d pages remain after commit\n",
+              atomic_read(&obd->u.echo.eo_prep));
+       RETURN(rc);
+
+commitrw_cleanup:
+       atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+       CERROR("cleaning up %d pages (%d obdos)\n",
+              niocount - pgs - 1, objcount);
+
+       while (pgs < niocount) {
+               struct page *page = res[pgs++].page;
+
+               if (page == NULL)
+                       continue;
+
+               /* NB see comment above regarding persistent pages */
+               OBD_PAGE_FREE(page);
+               atomic_dec(&obd->u.echo.eo_prep);
+       }
+       return rc;
+}
+
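+/* obd_setup() method: initialize the echo target state, create its LDLM
+ * namespace, take the NL lock used later to force ACKed replies, and
+ * register the procfs stats counters. */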
+static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars;
+       int                     rc;
+       __u64                 lock_flags = 0;
+       struct ldlm_res_id       res_id = {.name = {1}};
+       char                   ns_name[48];
+       ENTRY;
+
+       obd->u.echo.eo_obt.obt_magic = OBT_MAGIC;
+       spin_lock_init(&obd->u.echo.eo_lock);
+       obd->u.echo.eo_lastino = ECHO_INIT_OID;
+
+       sprintf(ns_name, "echotgt-%s", obd->obd_uuid.uuid);
+       obd->obd_namespace = ldlm_namespace_new(obd, ns_name,
+                                               LDLM_NAMESPACE_SERVER,
+                                               LDLM_NAMESPACE_MODEST,
+                                               LDLM_NS_TYPE_OST);
+       if (obd->obd_namespace == NULL) {
+               LBUG();
+               RETURN(-ENOMEM);
+       }
+
+       rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN,
+                                   NULL, LCK_NL, &lock_flags, NULL,
+                                   ldlm_completion_ast, NULL, NULL, 0,
+                                   LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock);
+       LASSERT (rc == ELDLM_OK);
+
+       lprocfs_echo_init_vars(&lvars);
+       if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0 &&
+           lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) {
+               lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES,
+                                    LPROCFS_CNTR_AVGMINMAX,
+                                    "read_bytes", "bytes");
+               lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES,
+                                    LPROCFS_CNTR_AVGMINMAX,
+                                    "write_bytes", "bytes");
+       }
+
+       ptlrpc_init_client (LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
+                           "echo_ldlm_cb_client", &obd->obd_ldlm_client);
+       RETURN(0);
+}
+
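+/* obd_cleanup() method: tear down procfs entries, drop the NL lock, free
+ * the LDLM namespace, and report any leaked prep/commitrw pages. */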
+static int echo_cleanup(struct obd_device *obd)
+{
+       int leaked;
+       ENTRY;
+
+       lprocfs_obd_cleanup(obd);
+       lprocfs_free_obd_stats(obd);
+
+       ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL);
+
+       /* XXX Bug 3413; wait for a bit to ensure the BL callback has
+        * happened before calling ldlm_namespace_free() */
+       schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, cfs_time_seconds(1));
+
+       ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force);
+       obd->obd_namespace = NULL;
+
+       leaked = atomic_read(&obd->u.echo.eo_prep);
+       if (leaked != 0)
+               CERROR("%d prep/commitrw pages leaked\n", leaked);
+
+       RETURN(0);
+}
+
+struct obd_ops echo_obd_ops = {
+       .o_owner           = THIS_MODULE,
+       .o_connect       = echo_connect,
+       .o_disconnect      = echo_disconnect,
+       .o_init_export     = echo_init_export,
+       .o_destroy_export  = echo_destroy_export,
+       .o_create         = echo_create,
+       .o_destroy       = echo_destroy,
+       .o_getattr       = echo_getattr,
+       .o_setattr       = echo_setattr,
+       .o_preprw         = echo_preprw,
+       .o_commitrw     = echo_commitrw,
+       .o_setup           = echo_setup,
+       .o_cleanup       = echo_cleanup
+};
+
+void echo_persistent_pages_fini(void)
+{
+       int     i;
+
+       for (i = 0; i < ECHO_PERSISTENT_PAGES; i++)
+               if (echo_persistent_pages[i] != NULL) {
+                       OBD_PAGE_FREE(echo_persistent_pages[i]);
+                       echo_persistent_pages[i] = NULL;
+               }
+}
+
+int echo_persistent_pages_init(void)
+{
+       struct page *pg;
+       int       i;
+
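+       /* NB: half of the pool is allocated GFP_IOFS and half GFP_HIGHUSER,
+        * presumably so that both lowmem and highmem pages get exercised. */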
+       for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) {
+               int gfp_mask = (i < ECHO_PERSISTENT_PAGES/2) ?
+                       GFP_IOFS : GFP_HIGHUSER;
+
+               OBD_PAGE_ALLOC(pg, gfp_mask);
+               if (pg == NULL) {
+                       echo_persistent_pages_fini ();
+                       return (-ENOMEM);
+               }
+
+               memset (kmap (pg), 0, PAGE_CACHE_SIZE);
+               kunmap (pg);
+
+               echo_persistent_pages[i] = pg;
+       }
+
+       return (0);
+}
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c
new file mode 100644 (file)
index 0000000..0545d16
--- /dev/null
@@ -0,0 +1,3217 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <cl_object.h>
+#include <lustre_fid.h>
+#include <lustre_acl.h>
+#include <lustre_net.h>
+#include <obd_lov.h>
+
+#include "echo_internal.h"
+
+/** \defgroup echo_client Echo Client
+ * @{
+ */
+
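+/* Echo client device: the top cl_device of the echo client stack.  ed_next
+ * is the lu_device stacked below (lov/osc, or a device inside an MD stack);
+ * ed_next_islov/ed_next_ismd record which flavour it is, and ed_cl_seq is
+ * the client-side sequence manager used only in the MD case. */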
+struct echo_device {
+       struct cl_device        ed_cl;
+       struct echo_client_obd *ed_ec;
+
+       struct cl_site    ed_site_myself;
+       struct cl_site   *ed_site;
+       struct lu_device       *ed_next;
+       int                  ed_next_islov;
+       int                  ed_next_ismd;
+       struct lu_client_seq   *ed_cl_seq;
+};
+
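+/* Per-object state: eo_obj_chain links the object on ec_objects of the
+ * owning echo_client_obd, eo_lsm keeps the stripe MD taken over from the
+ * object configuration, eo_npages counts pages still attached (asserted to
+ * be zero on free), and eo_deleted is set at device teardown so that cached
+ * objects get evicted. */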
+struct echo_object {
+       struct cl_object        eo_cl;
+       struct cl_object_header eo_hdr;
+
+       struct echo_device     *eo_dev;
+       struct list_head              eo_obj_chain;
+       struct lov_stripe_md   *eo_lsm;
+       atomic_t            eo_npages;
+       int                  eo_deleted;
+};
+
+struct echo_object_conf {
+       struct cl_object_conf  eoc_cl;
+       struct lov_stripe_md **eoc_md;
+};
+
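+/* Per-page slice: ep_lock implements page ownership for cpo_own/cpo_disown,
+ * and ep_vmpage pins the backing VM page until echo_page_fini(). */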
+struct echo_page {
+       struct cl_page_slice   ep_cl;
+       struct mutex            ep_lock;
+       struct page         *ep_vmpage;
+};
+
+struct echo_lock {
+       struct cl_lock_slice   el_cl;
+       struct list_head             el_chain;
+       struct echo_object    *el_object;
+       __u64             el_cookie;
+       atomic_t           el_refcount;
+};
+
+struct echo_io {
+       struct cl_io_slice     ei_cl;
+};
+
+#if 0
+struct echo_req {
+       struct cl_req_slice er_cl;
+};
+#endif
+
+static int echo_client_setup(const struct lu_env *env,
+                            struct obd_device *obddev,
+                            struct lustre_cfg *lcfg);
+static int echo_client_cleanup(struct obd_device *obddev);
+
+
+/** \defgroup echo_helpers Helper functions
+ * @{
+ */
+static inline struct echo_device *cl2echo_dev(const struct cl_device *dev)
+{
+       return container_of0(dev, struct echo_device, ed_cl);
+}
+
+static inline struct cl_device *echo_dev2cl(struct echo_device *d)
+{
+       return &d->ed_cl;
+}
+
+static inline struct echo_device *obd2echo_dev(const struct obd_device *obd)
+{
+       return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev));
+}
+
+static inline struct cl_object *echo_obj2cl(struct echo_object *eco)
+{
+       return &eco->eo_cl;
+}
+
+static inline struct echo_object *cl2echo_obj(const struct cl_object *o)
+{
+       return container_of(o, struct echo_object, eo_cl);
+}
+
+static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s)
+{
+       return container_of(s, struct echo_page, ep_cl);
+}
+
+static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s)
+{
+       return container_of(s, struct echo_lock, el_cl);
+}
+
+static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl)
+{
+       return ecl->el_cl.cls_lock;
+}
+
+static struct lu_context_key echo_thread_key;
+static inline struct echo_thread_info *echo_env_info(const struct lu_env *env)
+{
+       struct echo_thread_info *info;
+       info = lu_context_key_get(&env->le_ctx, &echo_thread_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline
+struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c)
+{
+       return container_of(c, struct echo_object_conf, eoc_cl);
+}
+
+/** @} echo_helpers */
+
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+                                              struct lov_stripe_md **lsm);
+static int cl_echo_object_put(struct echo_object *eco);
+static int cl_echo_enqueue   (struct echo_object *eco, obd_off start,
+                             obd_off end, int mode, __u64 *cookie);
+static int cl_echo_cancel    (struct echo_device *d, __u64 cookie);
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+                             struct page **pages, int npages, int async);
+
+static struct echo_thread_info *echo_env_info(const struct lu_env *env);
+
+struct echo_thread_info {
+       struct echo_object_conf eti_conf;
+       struct lustre_md        eti_md;
+
+       struct cl_2queue        eti_queue;
+       struct cl_io        eti_io;
+       struct cl_lock_descr    eti_descr;
+       struct lu_fid      eti_fid;
+       struct lu_fid           eti_fid2;
+       struct md_op_spec       eti_spec;
+       struct lov_mds_md_v3    eti_lmm;
+       struct lov_user_md_v3   eti_lum;
+       struct md_attr    eti_ma;
+       struct lu_name    eti_lname;
+       /* per-thread values, can be re-used */
+       void                    *eti_big_lmm;
+       int                     eti_big_lmmsize;
+       char                eti_name[20];
+       struct lu_buf      eti_buf;
+       char                eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE];
+};
+
+/* No session used right now */
+struct echo_session_info {
+       unsigned long dummy;
+};
+
+static struct kmem_cache *echo_lock_kmem;
+static struct kmem_cache *echo_object_kmem;
+static struct kmem_cache *echo_thread_kmem;
+static struct kmem_cache *echo_session_kmem;
+//static struct kmem_cache *echo_req_kmem;
+
+static struct lu_kmem_descr echo_caches[] = {
+       {
+               .ckd_cache = &echo_lock_kmem,
+               .ckd_name  = "echo_lock_kmem",
+               .ckd_size  = sizeof (struct echo_lock)
+       },
+       {
+               .ckd_cache = &echo_object_kmem,
+               .ckd_name  = "echo_object_kmem",
+               .ckd_size  = sizeof (struct echo_object)
+       },
+       {
+               .ckd_cache = &echo_thread_kmem,
+               .ckd_name  = "echo_thread_kmem",
+               .ckd_size  = sizeof (struct echo_thread_info)
+       },
+       {
+               .ckd_cache = &echo_session_kmem,
+               .ckd_name  = "echo_session_kmem",
+               .ckd_size  = sizeof (struct echo_session_info)
+       },
+#if 0
+       {
+               .ckd_cache = &echo_req_kmem,
+               .ckd_name  = "echo_req_kmem",
+               .ckd_size  = sizeof (struct echo_req)
+       },
+#endif
+       {
+               .ckd_cache = NULL
+       }
+};
+
+/** \defgroup echo_page Page operations
+ *
+ * Echo page operations.
+ *
+ * @{
+ */
+static struct page *echo_page_vmpage(const struct lu_env *env,
+                                   const struct cl_page_slice *slice)
+{
+       return cl2echo_page(slice)->ep_vmpage;
+}
+
+static int echo_page_own(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        struct cl_io *io, int nonblock)
+{
+       struct echo_page *ep = cl2echo_page(slice);
+
+       if (!nonblock)
+               mutex_lock(&ep->ep_lock);
+       else if (!mutex_trylock(&ep->ep_lock))
+               return -EAGAIN;
+       return 0;
+}
+
+static void echo_page_disown(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *io)
+{
+       struct echo_page *ep = cl2echo_page(slice);
+
+       LASSERT(mutex_is_locked(&ep->ep_lock));
+       mutex_unlock(&ep->ep_lock);
+}
+
+static void echo_page_discard(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *unused)
+{
+       cl_page_delete(env, slice->cpl_page);
+}
+
+static int echo_page_is_vmlocked(const struct lu_env *env,
+                                const struct cl_page_slice *slice)
+{
+       if (mutex_is_locked(&cl2echo_page(slice)->ep_lock))
+               return -EBUSY;
+       return -ENODATA;
+}
+
+static void echo_page_completion(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                int ioret)
+{
+       LASSERT(slice->cpl_page->cp_sync_io != NULL);
+}
+
+static void echo_page_fini(const struct lu_env *env,
+                          struct cl_page_slice *slice)
+{
+       struct echo_page *ep    = cl2echo_page(slice);
+       struct echo_object *eco = cl2echo_obj(slice->cpl_obj);
+       struct page *vmpage      = ep->ep_vmpage;
+       ENTRY;
+
+       atomic_dec(&eco->eo_npages);
+       page_cache_release(vmpage);
+       EXIT;
+}
+
+static int echo_page_prep(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         struct cl_io *unused)
+{
+       return 0;
+}
+
+static int echo_page_print(const struct lu_env *env,
+                          const struct cl_page_slice *slice,
+                          void *cookie, lu_printer_t printer)
+{
+       struct echo_page *ep = cl2echo_page(slice);
+
+       (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n",
+                  ep, mutex_is_locked(&ep->ep_lock), ep->ep_vmpage);
+       return 0;
+}
+
+static const struct cl_page_operations echo_page_ops = {
+       .cpo_own           = echo_page_own,
+       .cpo_disown     = echo_page_disown,
+       .cpo_discard       = echo_page_discard,
+       .cpo_vmpage     = echo_page_vmpage,
+       .cpo_fini         = echo_page_fini,
+       .cpo_print       = echo_page_print,
+       .cpo_is_vmlocked   = echo_page_is_vmlocked,
+       .io = {
+               [CRT_READ] = {
+                       .cpo_prep       = echo_page_prep,
+                       .cpo_completion  = echo_page_completion,
+               },
+               [CRT_WRITE] = {
+                       .cpo_prep       = echo_page_prep,
+                       .cpo_completion  = echo_page_completion,
+               }
+       }
+};
+/** @} echo_page */
+
+/** \defgroup echo_lock Locking
+ *
+ * echo lock operations
+ *
+ * @{
+ */
+static void echo_lock_fini(const struct lu_env *env,
+                          struct cl_lock_slice *slice)
+{
+       struct echo_lock *ecl = cl2echo_lock(slice);
+
+       LASSERT(list_empty(&ecl->el_chain));
+       OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem);
+}
+
+static void echo_lock_delete(const struct lu_env *env,
+                            const struct cl_lock_slice *slice)
+{
+       struct echo_lock *ecl      = cl2echo_lock(slice);
+
+       LASSERT(list_empty(&ecl->el_chain));
+}
+
+static int echo_lock_fits_into(const struct lu_env *env,
+                              const struct cl_lock_slice *slice,
+                              const struct cl_lock_descr *need,
+                              const struct cl_io *unused)
+{
+       return 1;
+}
+
+static struct cl_lock_operations echo_lock_ops = {
+       .clo_fini      = echo_lock_fini,
+       .clo_delete    = echo_lock_delete,
+       .clo_fits_into = echo_lock_fits_into
+};
+
+/** @} echo_lock */
+
+/** \defgroup echo_cl_ops cl_object operations
+ *
+ * operations for cl_object
+ *
+ * @{
+ */
+static int echo_page_init(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_page *page, struct page *vmpage)
+{
+       struct echo_page *ep = cl_object_page_slice(obj, page);
+       struct echo_object *eco = cl2echo_obj(obj);
+       ENTRY;
+
+       ep->ep_vmpage = vmpage;
+       page_cache_get(vmpage);
+       mutex_init(&ep->ep_lock);
+       cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops);
+       atomic_inc(&eco->eo_npages);
+       RETURN(0);
+}
+
+static int echo_io_init(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_io *io)
+{
+       return 0;
+}
+
+static int echo_lock_init(const struct lu_env *env,
+                         struct cl_object *obj, struct cl_lock *lock,
+                         const struct cl_io *unused)
+{
+       struct echo_lock *el;
+       ENTRY;
+
+       OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, __GFP_IO);
+       if (el != NULL) {
+               cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops);
+               el->el_object = cl2echo_obj(obj);
+               INIT_LIST_HEAD(&el->el_chain);
+               atomic_set(&el->el_refcount, 0);
+       }
+       RETURN(el == NULL ? -ENOMEM : 0);
+}
+
+static int echo_conf_set(const struct lu_env *env, struct cl_object *obj,
+                        const struct cl_object_conf *conf)
+{
+       return 0;
+}
+
+static const struct cl_object_operations echo_cl_obj_ops = {
+       .coo_page_init = echo_page_init,
+       .coo_lock_init = echo_lock_init,
+       .coo_io_init   = echo_io_init,
+       .coo_conf_set  = echo_conf_set
+};
+/** @} echo_cl_ops */
+
+/** \defgroup echo_lu_ops lu_object operations
+ *
+ * operations for echo lu object.
+ *
+ * @{
+ */
+static int echo_object_init(const struct lu_env *env, struct lu_object *obj,
+                           const struct lu_object_conf *conf)
+{
+       struct echo_device *ed   = cl2echo_dev(lu2cl_dev(obj->lo_dev));
+       struct echo_client_obd *ec     = ed->ed_ec;
+       struct echo_object *eco = cl2echo_obj(lu2cl(obj));
+       ENTRY;
+
+       if (ed->ed_next) {
+               struct lu_object  *below;
+               struct lu_device  *under;
+
+               under = ed->ed_next;
+               below = under->ld_ops->ldo_object_alloc(env, obj->lo_header,
+                                                       under);
+               if (below == NULL)
+                       RETURN(-ENOMEM);
+               lu_object_add(obj, below);
+       }
+
+       if (!ed->ed_next_ismd) {
+               const struct cl_object_conf *cconf = lu2cl_conf(conf);
+               struct echo_object_conf *econf = cl2echo_conf(cconf);
+
+               LASSERT(econf->eoc_md);
+               eco->eo_lsm = *econf->eoc_md;
+               /* clear the lsm pointer so that it won't get freed. */
+               *econf->eoc_md = NULL;
+       } else {
+               eco->eo_lsm = NULL;
+       }
+
+       eco->eo_dev = ed;
+       atomic_set(&eco->eo_npages, 0);
+       cl_object_page_init(lu2cl(obj), sizeof(struct echo_page));
+
+       spin_lock(&ec->ec_lock);
+       list_add_tail(&eco->eo_obj_chain, &ec->ec_objects);
+       spin_unlock(&ec->ec_lock);
+
+       RETURN(0);
+}
+
+/* taken from osc_unpackmd() */
+static int echo_alloc_memmd(struct echo_device *ed,
+                           struct lov_stripe_md **lsmp)
+{
+       int lsm_size;
+
+       ENTRY;
+
+       /* If export is lov/osc then use their obd method */
+       if (ed->ed_next != NULL)
+               return obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp);
+       /* OFD has no unpackmd method, do everything here */
+       lsm_size = lov_stripe_md_size(1);
+
+       LASSERT(*lsmp == NULL);
+       OBD_ALLOC(*lsmp, lsm_size);
+       if (*lsmp == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+       if ((*lsmp)->lsm_oinfo[0] == NULL) {
+               OBD_FREE(*lsmp, lsm_size);
+               RETURN(-ENOMEM);
+       }
+
+       loi_init((*lsmp)->lsm_oinfo[0]);
+       (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+       ostid_set_seq_echo(&(*lsmp)->lsm_oi);
+
+       RETURN(lsm_size);
+}
+
+static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp)
+{
+       int lsm_size;
+
+       ENTRY;
+
+       /* If export is lov/osc then use their obd method */
+       if (ed->ed_next != NULL)
+               return obd_free_memmd(ed->ed_ec->ec_exp, lsmp);
+       /* OFD has no unpackmd method, do everything here */
+       lsm_size = lov_stripe_md_size(1);
+
+       LASSERT(*lsmp != NULL);
+       OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+       OBD_FREE(*lsmp, lsm_size);
+       *lsmp = NULL;
+       RETURN(0);
+}
+
+static void echo_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct echo_object *eco    = cl2echo_obj(lu2cl(obj));
+       struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+       ENTRY;
+
+       LASSERT(atomic_read(&eco->eo_npages) == 0);
+
+       spin_lock(&ec->ec_lock);
+       list_del_init(&eco->eo_obj_chain);
+       spin_unlock(&ec->ec_lock);
+
+       lu_object_fini(obj);
+       lu_object_header_fini(obj->lo_header);
+
+       if (eco->eo_lsm)
+               echo_free_memmd(eco->eo_dev, &eco->eo_lsm);
+       OBD_SLAB_FREE_PTR(eco, echo_object_kmem);
+       EXIT;
+}
+
+static int echo_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *o)
+{
+       struct echo_object *obj = cl2echo_obj(lu2cl(o));
+
+       return (*p)(env, cookie, "echoclient-object@%p", obj);
+}
+
+static const struct lu_object_operations echo_lu_obj_ops = {
+       .loo_object_init      = echo_object_init,
+       .loo_object_delete    = NULL,
+       .loo_object_release   = NULL,
+       .loo_object_free      = echo_object_free,
+       .loo_object_print     = echo_object_print,
+       .loo_object_invariant = NULL
+};
+/** @} echo_lu_ops */
+
+/** \defgroup echo_lu_dev_ops  lu_device operations
+ *
+ * Operations for echo lu device.
+ *
+ * @{
+ */
+static struct lu_object *echo_object_alloc(const struct lu_env *env,
+                                          const struct lu_object_header *hdr,
+                                          struct lu_device *dev)
+{
+       struct echo_object *eco;
+       struct lu_object *obj = NULL;
+       ENTRY;
+
+       /* we're the top dev. */
+       LASSERT(hdr == NULL);
+       OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, __GFP_IO);
+       if (eco != NULL) {
+               struct cl_object_header *hdr = &eco->eo_hdr;
+
+               obj = &echo_obj2cl(eco)->co_lu;
+               cl_object_header_init(hdr);
+               lu_object_init(obj, &hdr->coh_lu, dev);
+               lu_object_add_top(&hdr->coh_lu, obj);
+
+               eco->eo_cl.co_ops = &echo_cl_obj_ops;
+               obj->lo_ops       = &echo_lu_obj_ops;
+       }
+       RETURN(obj);
+}
+
+static struct lu_device_operations echo_device_lu_ops = {
+       .ldo_object_alloc   = echo_object_alloc,
+};
+
+/** @} echo_lu_dev_ops */
+
+static struct cl_device_operations echo_device_cl_ops = {
+};
+
+/** \defgroup echo_init Setup and teardown
+ *
+ * Init and fini functions for echo client.
+ *
+ * @{
+ */
+static int echo_site_init(const struct lu_env *env, struct echo_device *ed)
+{
+       struct cl_site *site = &ed->ed_site_myself;
+       int rc;
+
+       /* initialize site */
+       rc = cl_site_init(site, &ed->ed_cl);
+       if (rc) {
+               CERROR("Cannot initialize site for echo client(%d)\n", rc);
+               return rc;
+       }
+
+       rc = lu_site_init_finish(&site->cs_lu);
+       if (rc)
+               return rc;
+
+       ed->ed_site = site;
+       return 0;
+}
+
+static void echo_site_fini(const struct lu_env *env, struct echo_device *ed)
+{
+       if (ed->ed_site) {
+               if (!ed->ed_next_ismd)
+                       cl_site_fini(ed->ed_site);
+               ed->ed_site = NULL;
+       }
+}
+
+static void *echo_thread_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+       struct echo_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void echo_thread_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct echo_thread_info *info = data;
+       OBD_SLAB_FREE_PTR(info, echo_thread_kmem);
+}
+
+static void echo_thread_key_exit(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+}
+
+static struct lu_context_key echo_thread_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = echo_thread_key_init,
+       .lct_fini = echo_thread_key_fini,
+       .lct_exit = echo_thread_key_exit
+};
+
+static void *echo_session_key_init(const struct lu_context *ctx,
+                                 struct lu_context_key *key)
+{
+       struct echo_session_info *session;
+
+       OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, __GFP_IO);
+       if (session == NULL)
+               session = ERR_PTR(-ENOMEM);
+       return session;
+}
+
+static void echo_session_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+       struct echo_session_info *session = data;
+       OBD_SLAB_FREE_PTR(session, echo_session_kmem);
+}
+
+static void echo_session_key_exit(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+}
+
+static struct lu_context_key echo_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = echo_session_key_init,
+       .lct_fini = echo_session_key_fini,
+       .lct_exit = echo_session_key_exit
+};
+
+LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key);
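+/* LU_TYPE_INIT_FINI() above generates echo_type_{init,fini,start,stop}()
+ * for the thread and session context keys; they are wired into
+ * echo_device_type_ops below. */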
+
+#define ECHO_SEQ_WIDTH 0xffffffff
+static int echo_fid_init(struct echo_device *ed, char *obd_name,
+                        struct seq_server_site *ss)
+{
+       char *prefix;
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(ed->ed_cl_seq);
+       if (ed->ed_cl_seq == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
+       if (prefix == NULL)
+               GOTO(out_free_seq, rc = -ENOMEM);
+
+       snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", obd_name);
+
+       /* Init client side sequence-manager */
+       rc = seq_client_init(ed->ed_cl_seq, NULL,
+                            LUSTRE_SEQ_METADATA,
+                            prefix, ss->ss_server_seq);
+       ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH;
+       OBD_FREE(prefix, MAX_OBD_NAME + 5);
+       if (rc)
+               GOTO(out_free_seq, rc);
+
+       RETURN(0);
+
+out_free_seq:
+       OBD_FREE_PTR(ed->ed_cl_seq);
+       ed->ed_cl_seq = NULL;
+       RETURN(rc);
+}
+
+static int echo_fid_fini(struct obd_device *obddev)
+{
+       struct echo_device *ed = obd2echo_dev(obddev);
+       ENTRY;
+
+       if (ed->ed_cl_seq != NULL) {
+               seq_client_fini(ed->ed_cl_seq);
+               OBD_FREE_PTR(ed->ed_cl_seq);
+               ed->ed_cl_seq = NULL;
+       }
+
+       RETURN(0);
+}
+
+static struct lu_device *echo_device_alloc(const struct lu_env *env,
+                                          struct lu_device_type *t,
+                                          struct lustre_cfg *cfg)
+{
+       struct lu_device   *next;
+       struct echo_device *ed;
+       struct cl_device   *cd;
+       struct obd_device  *obd = NULL; /* to keep compiler happy */
+       struct obd_device  *tgt;
+       const char *tgt_type_name;
+       int rc;
+       int cleanup = 0;
+       ENTRY;
+
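+       /* 'cleanup' records how far setup has progressed; on error the
+        * switch at the out: label falls through from that stage downwards
+        * to undo what has been done. */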
+       OBD_ALLOC_PTR(ed);
+       if (ed == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       cleanup = 1;
+       cd = &ed->ed_cl;
+       rc = cl_device_init(cd, t);
+       if (rc)
+               GOTO(out, rc);
+
+       cd->cd_lu_dev.ld_ops = &echo_device_lu_ops;
+       cd->cd_ops = &echo_device_cl_ops;
+
+       cleanup = 2;
+       obd = class_name2obd(lustre_cfg_string(cfg, 0));
+       LASSERT(obd != NULL);
+       LASSERT(env != NULL);
+
+       tgt = class_name2obd(lustre_cfg_string(cfg, 1));
+       if (tgt == NULL) {
+               CERROR("Can not find tgt device %s\n",
+                       lustre_cfg_string(cfg, 1));
+               GOTO(out, rc = -ENODEV);
+       }
+
+       next = tgt->obd_lu_dev;
+       if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) {
+               ed->ed_next_ismd = 1;
+       } else {
+               ed->ed_next_ismd = 0;
+               rc = echo_site_init(env, ed);
+               if (rc)
+                       GOTO(out, rc);
+       }
+       cleanup = 3;
+
+       rc = echo_client_setup(env, obd, cfg);
+       if (rc)
+               GOTO(out, rc);
+
+       ed->ed_ec = &obd->u.echo_client;
+       cleanup = 4;
+
+       if (ed->ed_next_ismd) {
+               /* Supposed to connect to some Metadata layer */
+               struct lu_site *ls;
+               struct lu_device *ld;
+               int    found = 0;
+
+               if (next == NULL) {
+                       CERROR("%s is not lu device type!\n",
+                              lustre_cfg_string(cfg, 1));
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               tgt_type_name = lustre_cfg_string(cfg, 2);
+               if (!tgt_type_name) {
+                       CERROR("%s no type name for echo %s setup\n",
+                               lustre_cfg_string(cfg, 1),
+                               tgt->obd_type->typ_name);
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               ls = next->ld_site;
+
+               spin_lock(&ls->ls_ld_lock);
+               list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) {
+                       if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) {
+                               found = 1;
+                               break;
+                       }
+               }
+               spin_unlock(&ls->ls_ld_lock);
+
+               if (found == 0) {
+                       CERROR("%s is not lu device type!\n",
+                              lustre_cfg_string(cfg, 1));
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               next = ld;
+               /* For MD echo client, it will use the site in MDS stack */
+               ed->ed_site_myself.cs_lu = *ls;
+               ed->ed_site = &ed->ed_site_myself;
+               ed->ed_cl.cd_lu_dev.ld_site = &ed->ed_site_myself.cs_lu;
+               rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls));
+               if (rc) {
+                       CERROR("echo fid init error %d\n", rc);
+                       GOTO(out, rc);
+               }
+       } else {
+               /* If the echo client is to be stacked upon an OST device,
+                * next is NULL since OST is not a CLIO device so far. */
+               if (next != NULL && !lu_device_is_cl(next))
+                       next = NULL;
+
+               tgt_type_name = tgt->obd_type->typ_name;
+               if (next != NULL) {
+                       LASSERT(next != NULL);
+                       if (next->ld_site != NULL)
+                               GOTO(out, rc = -EBUSY);
+
+                       next->ld_site = &ed->ed_site->cs_lu;
+                       rc = next->ld_type->ldt_ops->ldto_device_init(env, next,
+                                                    next->ld_type->ldt_name,
+                                                    NULL);
+                       if (rc)
+                               GOTO(out, rc);
+
+                       /* Tricky case: we have to determine the obd type
+                        * since CLIO uses different parameters to initialize
+                        * lov and osc objects. */
+                       if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0)
+                               ed->ed_next_islov = 1;
+                       else
+                               LASSERT(strcmp(tgt_type_name,
+                                              LUSTRE_OSC_NAME) == 0);
+               } else
+                       LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0);
+       }
+
+       ed->ed_next = next;
+       RETURN(&cd->cd_lu_dev);
+out:
+       switch(cleanup) {
+       case 4: {
+               int rc2;
+               rc2 = echo_client_cleanup(obd);
+               if (rc2)
+                       CERROR("Cleanup obd device %s error(%d)\n",
+                              obd->obd_name, rc2);
+       }
+
+       case 3:
+               echo_site_fini(env, ed);
+       case 2:
+               cl_device_fini(&ed->ed_cl);
+       case 1:
+               OBD_FREE_PTR(ed);
+       case 0:
+       default:
+               break;
+       }
+       return(ERR_PTR(rc));
+}
+
+static int echo_device_init(const struct lu_env *env, struct lu_device *d,
+                         const char *name, struct lu_device *next)
+{
+       LBUG();
+       return 0;
+}
+
+static struct lu_device *echo_device_fini(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+       struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+       struct lu_device *next = ed->ed_next;
+
+       while (next && !ed->ed_next_ismd)
+               next = next->ld_type->ldt_ops->ldto_device_fini(env, next);
+       return NULL;
+}
+
+static void echo_lock_release(const struct lu_env *env,
+                             struct echo_lock *ecl,
+                             int still_used)
+{
+       struct cl_lock *clk = echo_lock2cl(ecl);
+
+       cl_lock_get(clk);
+       cl_unuse(env, clk);
+       cl_lock_release(env, clk, "ec enqueue", ecl->el_object);
+       if (!still_used) {
+               cl_lock_mutex_get(env, clk);
+               cl_lock_cancel(env, clk);
+               cl_lock_delete(env, clk);
+               cl_lock_mutex_put(env, clk);
+       }
+       cl_lock_put(env, clk);
+}
+
+static struct lu_device *echo_device_free(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+       struct echo_device     *ed   = cl2echo_dev(lu2cl_dev(d));
+       struct echo_client_obd *ec   = ed->ed_ec;
+       struct echo_object     *eco;
+       struct lu_device       *next = ed->ed_next;
+
+       CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n",
+              ed, next);
+
+       lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+       /* Check if there are objects still alive.
+        * There shouldn't be any, because lu_site_purge() cleans up all
+        * cached objects; if there are, the echo device is probably being
+        * accessed in parallel.
+        */
+       spin_lock(&ec->ec_lock);
+       list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain)
+               eco->eo_deleted = 1;
+       spin_unlock(&ec->ec_lock);
+
+       /* purge again */
+       lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+       CDEBUG(D_INFO,
+              "Waiting for the reference of echo object to be dropped\n");
+
+       /* Wait for the last reference to be dropped. */
+       spin_lock(&ec->ec_lock);
+       while (!list_empty(&ec->ec_objects)) {
+               spin_unlock(&ec->ec_lock);
+               CERROR("echo_client still has objects at cleanup time, "
+                      "wait for 1 second\n");
+               schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+                                                  cfs_time_seconds(1));
+               lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+               spin_lock(&ec->ec_lock);
+       }
+       spin_unlock(&ec->ec_lock);
+
+       LASSERT(list_empty(&ec->ec_locks));
+
+       CDEBUG(D_INFO, "No object exists, exiting...\n");
+
+       echo_client_cleanup(d->ld_obd);
+       echo_fid_fini(d->ld_obd);
+       while (next && !ed->ed_next_ismd)
+               next = next->ld_type->ldt_ops->ldto_device_free(env, next);
+
+       LASSERT(ed->ed_site == lu2cl_site(d->ld_site));
+       echo_site_fini(env, ed);
+       cl_device_fini(&ed->ed_cl);
+       OBD_FREE_PTR(ed);
+
+       return NULL;
+}
+
+static const struct lu_device_type_operations echo_device_type_ops = {
+       .ldto_init = echo_type_init,
+       .ldto_fini = echo_type_fini,
+
+       .ldto_start = echo_type_start,
+       .ldto_stop  = echo_type_stop,
+
+       .ldto_device_alloc = echo_device_alloc,
+       .ldto_device_free  = echo_device_free,
+       .ldto_device_init  = echo_device_init,
+       .ldto_device_fini  = echo_device_fini
+};
+
+static struct lu_device_type echo_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_ECHO_CLIENT_NAME,
+       .ldt_ops      = &echo_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD,
+};
+/** @} echo_init */
+
+/** \defgroup echo_exports Exported operations
+ *
+ * exporting functions to echo client
+ *
+ * @{
+ */
+
+/* Interfaces to echo client obd device */
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+                                              struct lov_stripe_md **lsmp)
+{
+       struct lu_env *env;
+       struct echo_thread_info *info;
+       struct echo_object_conf *conf;
+       struct lov_stripe_md    *lsm;
+       struct echo_object *eco;
+       struct cl_object   *obj;
+       struct lu_fid *fid;
+       int refcheck;
+       int rc;
+       ENTRY;
+
+       LASSERT(lsmp);
+       lsm = *lsmp;
+       LASSERT(lsm);
+       LASSERT(ostid_id(&lsm->lsm_oi) != 0);
+       LASSERT(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO);
+
+       /* Never return an object if the obd is to be freed. */
+       if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping)
+               RETURN(ERR_PTR(-ENODEV));
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN((void *)env);
+
+       info = echo_env_info(env);
+       conf = &info->eti_conf;
+       if (d->ed_next) {
+               if (!d->ed_next_islov) {
+                       struct lov_oinfo *oinfo = lsm->lsm_oinfo[0];
+                       LASSERT(oinfo != NULL);
+                       oinfo->loi_oi = lsm->lsm_oi;
+                       conf->eoc_cl.u.coc_oinfo = oinfo;
+               } else {
+                       struct lustre_md *md;
+                       md = &info->eti_md;
+                       memset(md, 0, sizeof *md);
+                       md->lsm = lsm;
+                       conf->eoc_cl.u.coc_md = md;
+               }
+       }
+       conf->eoc_md = lsmp;
+
+       fid  = &info->eti_fid;
+       rc = ostid_to_fid(fid, &lsm->lsm_oi, 0);
+       if (rc != 0)
+               GOTO(out, eco = ERR_PTR(rc));
+
+       /* In the function below, .hs_keycmp resolves to
+        * lu_obj_hop_keycmp() */
+       /* coverity[overrun-buffer-val] */
+       obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl);
+       if (IS_ERR(obj))
+               GOTO(out, eco = (void*)obj);
+
+       eco = cl2echo_obj(obj);
+       if (eco->eo_deleted) {
+               cl_object_put(env, obj);
+               eco = ERR_PTR(-EAGAIN);
+       }
+
+out:
+       cl_env_put(env, &refcheck);
+       RETURN(eco);
+}
+
+static int cl_echo_object_put(struct echo_object *eco)
+{
+       struct lu_env *env;
+       struct cl_object *obj = echo_obj2cl(eco);
+       int refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       /* an external function to kill an object? */
+       if (eco->eo_deleted) {
+               struct lu_object_header *loh = obj->co_lu.lo_header;
+               LASSERT(&eco->eo_hdr == luh2coh(loh));
+               set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags);
+       }
+
+       cl_object_put(env, obj);
+       cl_env_put(env, &refcheck);
+       RETURN(0);
+}
+
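+/* Enqueue a cl_lock covering [start, end] on the object and publish it on
+ * ec_locks under a cookie; the cookie is later passed to cl_echo_cancel0()
+ * to drop the lock again. */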
+static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco,
+                           obd_off start, obd_off end, int mode,
+                           __u64 *cookie, __u32 enqflags)
+{
+       struct cl_io *io;
+       struct cl_lock *lck;
+       struct cl_object *obj;
+       struct cl_lock_descr *descr;
+       struct echo_thread_info *info;
+       int rc = -ENOMEM;
+       ENTRY;
+
+       info = echo_env_info(env);
+       io = &info->eti_io;
+       descr = &info->eti_descr;
+       obj = echo_obj2cl(eco);
+
+       descr->cld_obj   = obj;
+       descr->cld_start = cl_index(obj, start);
+       descr->cld_end   = cl_index(obj, end);
+       descr->cld_mode  = mode == LCK_PW ? CLM_WRITE : CLM_READ;
+       descr->cld_enq_flags = enqflags;
+       io->ci_obj = obj;
+
+       lck = cl_lock_request(env, io, descr, "ec enqueue", eco);
+       if (lck) {
+               struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+               struct echo_lock *el;
+
+               rc = cl_wait(env, lck);
+               if (rc == 0) {
+                       el = cl2echo_lock(cl_lock_at(lck, &echo_device_type));
+                       spin_lock(&ec->ec_lock);
+                       if (list_empty(&el->el_chain)) {
+                               list_add(&el->el_chain, &ec->ec_locks);
+                               el->el_cookie = ++ec->ec_unique;
+                       }
+                       atomic_inc(&el->el_refcount);
+                       *cookie = el->el_cookie;
+                       spin_unlock(&ec->ec_lock);
+               } else {
+                       cl_lock_release(env, lck, "ec enqueue", current);
+               }
+       }
+       RETURN(rc);
+}
+
+static int cl_echo_enqueue(struct echo_object *eco, obd_off start, obd_off end,
+                          int mode, __u64 *cookie)
+{
+       struct echo_thread_info *info;
+       struct lu_env *env;
+       struct cl_io *io;
+       int refcheck;
+       int result;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       info = echo_env_info(env);
+       io = &info->eti_io;
+
+       io->ci_ignore_layout = 1;
+       result = cl_io_init(env, io, CIT_MISC, echo_obj2cl(eco));
+       if (result < 0)
+               GOTO(out, result);
+       LASSERT(result == 0);
+
+       result = cl_echo_enqueue0(env, eco, start, end, mode, cookie, 0);
+       cl_io_fini(env, io);
+
+       EXIT;
+out:
+       cl_env_put(env, &refcheck);
+       return result;
+}
+
+static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed,
+                          __u64 cookie)
+{
+       struct echo_client_obd *ec = ed->ed_ec;
+       struct echo_lock       *ecl = NULL;
+       struct list_head             *el;
+       int found = 0, still_used = 0;
+       ENTRY;
+
+       LASSERT(ec != NULL);
+       spin_lock(&ec->ec_lock);
+       list_for_each (el, &ec->ec_locks) {
+               ecl = list_entry (el, struct echo_lock, el_chain);
+               CDEBUG(D_INFO, "ecl: %p, cookie: "LPX64"\n", ecl, ecl->el_cookie);
+               found = (ecl->el_cookie == cookie);
+               if (found) {
+                       if (atomic_dec_and_test(&ecl->el_refcount))
+                               list_del_init(&ecl->el_chain);
+                       else
+                               still_used = 1;
+                       break;
+               }
+       }
+       spin_unlock(&ec->ec_lock);
+
+       if (!found)
+               RETURN(-ENOENT);
+
+       echo_lock_release(env, ecl, still_used);
+       RETURN(0);
+}
+
+static int cl_echo_cancel(struct echo_device *ed, __u64 cookie)
+{
+       struct lu_env *env;
+       int refcheck;
+       int rc;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       rc = cl_echo_cancel0(env, ed, cookie);
+
+       cl_env_put(env, &refcheck);
+       RETURN(rc);
+}
+
+static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io,
+                            enum cl_req_type unused, struct cl_2queue *queue)
+{
+       struct cl_page *clp;
+       struct cl_page *temp;
+       int result = 0;
+       ENTRY;
+
+       cl_page_list_for_each_safe(clp, temp, &queue->c2_qin) {
+               int rc;
+               rc = cl_page_cache_add(env, io, clp, CRT_WRITE);
+               if (rc == 0)
+                       continue;
+               result = result ?: rc;
+       }
+       RETURN(result);
+}
+
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+                             struct page **pages, int npages, int async)
+{
+       struct lu_env      *env;
+       struct echo_thread_info *info;
+       struct cl_object        *obj = echo_obj2cl(eco);
+       struct echo_device      *ed  = eco->eo_dev;
+       struct cl_2queue        *queue;
+       struct cl_io        *io;
+       struct cl_page    *clp;
+       struct lustre_handle    lh = { 0 };
+       int page_size = cl_page_size(obj);
+       int refcheck;
+       int rc;
+       int i;
+       ENTRY;
+
+       LASSERT((offset & ~CFS_PAGE_MASK) == 0);
+       LASSERT(ed->ed_next != NULL);
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       info    = echo_env_info(env);
+       io      = &info->eti_io;
+       queue   = &info->eti_queue;
+
+       cl_2queue_init(queue);
+
+       io->ci_ignore_layout = 1;
+       rc = cl_io_init(env, io, CIT_MISC, obj);
+       if (rc < 0)
+               GOTO(out, rc);
+       LASSERT(rc == 0);
+
+
+       rc = cl_echo_enqueue0(env, eco, offset,
+                             offset + npages * PAGE_CACHE_SIZE - 1,
+                             rw == READ ? LCK_PR : LCK_PW, &lh.cookie,
+                             CEF_NEVER);
+       if (rc < 0)
+               GOTO(error_lock, rc);
+
+       for (i = 0; i < npages; i++) {
+               LASSERT(pages[i]);
+               clp = cl_page_find(env, obj, cl_index(obj, offset),
+                                  pages[i], CPT_TRANSIENT);
+               if (IS_ERR(clp)) {
+                       rc = PTR_ERR(clp);
+                       break;
+               }
+               LASSERT(clp->cp_type == CPT_TRANSIENT);
+
+               rc = cl_page_own(env, io, clp);
+               if (rc) {
+                       LASSERT(clp->cp_state == CPS_FREEING);
+                       cl_page_put(env, clp);
+                       break;
+               }
+
+               cl_2queue_add(queue, clp);
+
+               /* drop the reference count for cl_page_find, so that the page
+                * will be freed in cl_2queue_fini. */
+               cl_page_put(env, clp);
+               cl_page_clip(env, clp, 0, page_size);
+
+               offset += page_size;
+       }
+
+       if (rc == 0) {
+               enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE;
+
+               async = async && (typ == CRT_WRITE);
+               if (async)
+                       rc = cl_echo_async_brw(env, io, typ, queue);
+               else
+                       rc = cl_io_submit_sync(env, io, typ, queue, 0);
+               CDEBUG(D_INFO, "echo_client %s write returns %d\n",
+                      async ? "async" : "sync", rc);
+       }
+
+       cl_echo_cancel0(env, ed, lh.cookie);
+       EXIT;
+error_lock:
+       cl_2queue_discard(env, io, queue);
+       cl_2queue_disown(env, io, queue);
+       cl_2queue_fini(env, queue);
+       cl_io_fini(env, io);
+out:
+       cl_env_put(env, &refcheck);
+       return rc;
+}
+/** @} echo_exports */
+
+
+static obd_id last_object_id;
+
+static int
+echo_copyout_lsm (struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob)
+{
+       struct lov_stripe_md *ulsm = _ulsm;
+       int nob, i;
+
+       nob = offsetof (struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]);
+       if (nob > ulsm_nob)
+               return (-EINVAL);
+
+       if (copy_to_user (ulsm, lsm, sizeof(*ulsm)))
+               return (-EFAULT);
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               if (copy_to_user (ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i],
+                                     sizeof(lsm->lsm_oinfo[0])))
+                       return (-EFAULT);
+       }
+       return 0;
+}
+
+static int
+echo_copyin_lsm (struct echo_device *ed, struct lov_stripe_md *lsm,
+                void *ulsm, int ulsm_nob)
+{
+       struct echo_client_obd *ec = ed->ed_ec;
+       int                  i;
+
+       if (ulsm_nob < sizeof (*lsm))
+               return (-EINVAL);
+
+       if (copy_from_user (lsm, ulsm, sizeof (*lsm)))
+               return (-EFAULT);
+
+       if (lsm->lsm_stripe_count > ec->ec_nstripes ||
+           lsm->lsm_magic != LOV_MAGIC ||
+           (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 ||
+           ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL))
+               return (-EINVAL);
+
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               if (copy_from_user(lsm->lsm_oinfo[i],
+                                      ((struct lov_stripe_md *)ulsm)->lsm_oinfo[i],
+                                      sizeof(lsm->lsm_oinfo[0])))
+                       return (-EFAULT);
+       }
+       return (0);
+}
+
+static inline void echo_md_build_name(struct lu_name *lname, char *name,
+                                     __u64 id)
+{
+       sprintf(name, LPU64, id);
+       lname->ln_name = name;
+       lname->ln_namelen = strlen(name);
+}
+
+/* similar to mdt_attr_get_complex */
+static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o,
+                           struct md_attr *ma)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(ma->ma_lmm_size > 0);
+
+       rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV);
+       if (rc < 0)
+               RETURN(rc);
+
+       /* big_lmm may need to be grown */
+       if (info->eti_big_lmmsize < rc) {
+               int size = size_roundup_power2(rc);
+
+               if (info->eti_big_lmmsize > 0) {
+                       /* free old buffer */
+                       LASSERT(info->eti_big_lmm);
+                       OBD_FREE_LARGE(info->eti_big_lmm,
+                                      info->eti_big_lmmsize);
+                       info->eti_big_lmm = NULL;
+                       info->eti_big_lmmsize = 0;
+               }
+
+               OBD_ALLOC_LARGE(info->eti_big_lmm, size);
+               if (info->eti_big_lmm == NULL)
+                       RETURN(-ENOMEM);
+               info->eti_big_lmmsize = size;
+       }
+       LASSERT(info->eti_big_lmmsize >= rc);
+
+       info->eti_buf.lb_buf = info->eti_big_lmm;
+       info->eti_buf.lb_len = info->eti_big_lmmsize;
+       rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV);
+       if (rc < 0)
+               RETURN(rc);
+
+       ma->ma_valid |= MA_LOV;
+       ma->ma_lmm = info->eti_big_lmm;
+       ma->ma_lmm_size = rc;
+
+       RETURN(0);
+}
+
+int echo_attr_get_complex(const struct lu_env *env, struct md_object *next,
+                         struct md_attr *ma)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_buf           *buf = &info->eti_buf;
+       umode_t          mode = lu_object_attr(&next->mo_lu);
+       int                      need = ma->ma_need;
+       int                      rc = 0, rc2;
+
+       ENTRY;
+
+       ma->ma_valid = 0;
+
+       if (need & MA_INODE) {
+               ma->ma_need = MA_INODE;
+               rc = mo_attr_get(env, next, ma);
+               if (rc)
+                       GOTO(out, rc);
+               ma->ma_valid |= MA_INODE;
+       }
+
+       if (need & MA_LOV) {
+               if (S_ISREG(mode) || S_ISDIR(mode)) {
+                       LASSERT(ma->ma_lmm_size > 0);
+                       buf->lb_buf = ma->ma_lmm;
+                       buf->lb_len = ma->ma_lmm_size;
+                       rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV);
+                       if (rc2 > 0) {
+                               ma->ma_lmm_size = rc2;
+                               ma->ma_valid |= MA_LOV;
+                       } else if (rc2 == -ENODATA) {
+                               /* no LOV EA */
+                               ma->ma_lmm_size = 0;
+                       } else if (rc2 == -ERANGE) {
+                               rc2 = echo_big_lmm_get(env, next, ma);
+                               if (rc2 < 0)
+                                       GOTO(out, rc = rc2);
+                       } else {
+                               GOTO(out, rc = rc2);
+                       }
+               }
+       }
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (need & MA_ACL_DEF && S_ISDIR(mode)) {
+               buf->lb_buf = ma->ma_acl;
+               buf->lb_len = ma->ma_acl_size;
+               rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT);
+               if (rc2 > 0) {
+                       ma->ma_acl_size = rc2;
+                       ma->ma_valid |= MA_ACL_DEF;
+               } else if (rc2 == -ENODATA) {
+                       /* no ACLs */
+                       ma->ma_acl_size = 0;
+               } else {
+                       GOTO(out, rc = rc2);
+               }
+       }
+#endif
+out:
+       ma->ma_need = need;
+       CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = "LPX64" ma_lmm=%p\n",
+              rc, ma->ma_valid, ma->ma_lmm);
+       RETURN(rc);
+}
+
+static int
+echo_md_create_internal(const struct lu_env *env, struct echo_device *ed,
+                       struct md_object *parent, struct lu_fid *fid,
+                       struct lu_name *lname, struct md_op_spec *spec,
+                       struct md_attr *ma)
+{
+       struct lu_object        *ec_child, *child;
+       struct lu_device        *ld = ed->ed_next;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_fid           *fid2 = &info->eti_fid2;
+       struct lu_object_conf    conf = { .loc_flags = LOC_F_NEW };
+       int                      rc;
+
+       ENTRY;
+
+       rc = mdo_lookup(env, parent, lname, fid2, spec);
+       if (rc == 0)
+               return -EEXIST;
+       else if (rc != -ENOENT)
+               return rc;
+
+       ec_child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev,
+                                    fid, &conf);
+       if (IS_ERR(ec_child)) {
+               CERROR("Can not find the child "DFID": rc = %ld\n", PFID(fid),
+                       PTR_ERR(ec_child));
+               RETURN(PTR_ERR(ec_child));
+       }
+
+       child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+       if (child == NULL) {
+               CERROR("Can not locate the child "DFID"\n", PFID(fid));
+               GOTO(out_put, rc = -EINVAL);
+       }
+
+       CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n",
+              PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+
+       /*
+        * Do not perform lookup sanity check. We know that name does not exist.
+        */
+       spec->sp_cr_lookup = 0;
+       rc = mdo_create(env, parent, lname, lu2md(child), spec, ma);
+       if (rc) {
+               CERROR("Can not create child "DFID": rc = %d\n", PFID(fid), rc);
+               GOTO(out_put, rc);
+       }
+       CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc  = %d\n",
+              PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc);
+       EXIT;
+out_put:
+       lu_object_put(env, ec_child);
+       return rc;
+}
+
+static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld,
+                            struct md_attr *ma)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+
+       if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) {
+               ma->ma_lmm = (void *)&info->eti_lmm;
+               ma->ma_lmm_size = sizeof(info->eti_lmm);
+       } else {
+               LASSERT(info->eti_big_lmmsize);
+               ma->ma_lmm = info->eti_big_lmm;
+               ma->ma_lmm_size = info->eti_big_lmmsize;
+       }
+
+       return 0;
+}
+
+static int echo_create_md_object(const struct lu_env *env,
+                                struct echo_device *ed,
+                                struct lu_object *ec_parent,
+                                struct lu_fid *fid,
+                                char *name, int namelen,
+                                __u64 id, __u32 mode, int count,
+                                int stripe_count, int stripe_offset)
+{
+       struct lu_object        *parent;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name    *lname = &info->eti_lname;
+       struct md_op_spec       *spec = &info->eti_spec;
+       struct md_attr    *ma = &info->eti_ma;
+       struct lu_device        *ld = ed->ed_next;
+       int                   rc = 0;
+       int                   i;
+
+       ENTRY;
+
+       if (ec_parent == NULL)
+               return -1;
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               RETURN(-ENXIO);
+
+       memset(ma, 0, sizeof(*ma));
+       memset(spec, 0, sizeof(*spec));
+       if (stripe_count != 0) {
+               spec->sp_cr_flags |= FMODE_WRITE;
+               echo_set_lmm_size(env, ld, ma);
+               if (stripe_count != -1) {
+                       struct lov_user_md_v3 *lum = &info->eti_lum;
+
+                       lum->lmm_magic = LOV_USER_MAGIC_V3;
+                       lum->lmm_stripe_count = stripe_count;
+                       lum->lmm_stripe_offset = stripe_offset;
+                       lum->lmm_pattern = 0;
+                       spec->u.sp_ea.eadata = lum;
+                       spec->u.sp_ea.eadatalen = sizeof(*lum);
+                       spec->sp_cr_flags |= MDS_OPEN_HAS_EA;
+               }
+       }
+
+       ma->ma_attr.la_mode = mode;
+       ma->ma_attr.la_valid = LA_CTIME | LA_MODE;
+       ma->ma_attr.la_ctime = cfs_time_current_64();
+
+       if (name != NULL) {
+               lname->ln_name = name;
+               lname->ln_namelen = namelen;
+               /* If name is specified, only create one object by name */
+               rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname,
+                                            spec, ma);
+               RETURN(rc);
+       }
+
+       /* Create multiple objects sequenced by id */
+       for (i = 0; i < count; i++) {
+               char *tmp_name = info->eti_name;
+
+               echo_md_build_name(lname, tmp_name, id);
+
+               rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname,
+                                            spec, ma);
+               if (rc) {
+                       CERROR("Can not create child %s: rc = %d\n", tmp_name,
+                               rc);
+                       break;
+               }
+               id++;
+               fid->f_oid++;
+       }
+
+       RETURN(rc);
+}
+
+static struct lu_object *echo_md_lookup(const struct lu_env *env,
+                                       struct echo_device *ed,
+                                       struct md_object *parent,
+                                       struct lu_name *lname)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_fid      *fid = &info->eti_fid;
+       struct lu_object        *child;
+       int    rc;
+       ENTRY;
+
+       CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name,
+              PFID(fid), parent);
+       rc = mdo_lookup(env, parent, lname, fid, NULL);
+       if (rc) {
+               CERROR("lookup %s: rc = %d\n", lname->ln_name, rc);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* In the function below, .hs_keycmp resolves to
+        * lu_obj_hop_keycmp() */
+       /* coverity[overrun-buffer-val] */
+       child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL);
+
+       RETURN(child);
+}
+
+static int echo_setattr_object(const struct lu_env *env,
+                              struct echo_device *ed,
+                              struct lu_object *ec_parent,
+                              __u64 id, int count)
+{
+       struct lu_object        *parent;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name    *lname = &info->eti_lname;
+       char                *name = info->eti_name;
+       struct lu_device        *ld = ed->ed_next;
+       struct lu_buf      *buf = &info->eti_buf;
+       int                   rc = 0;
+       int                   i;
+
+       ENTRY;
+
+       if (ec_parent == NULL)
+               return -1;
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               RETURN(-ENXIO);
+
+       for (i = 0; i < count; i++) {
+               struct lu_object *ec_child, *child;
+
+               echo_md_build_name(lname, name, id);
+
+               ec_child = echo_md_lookup(env, ed, lu2md(parent), lname);
+               if (IS_ERR(ec_child)) {
+                       CERROR("Can't find child %s: rc = %ld\n",
+                               lname->ln_name, PTR_ERR(ec_child));
+                       RETURN(PTR_ERR(ec_child));
+               }
+
+               child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+               if (child == NULL) {
+                       CERROR("Can not locate the child %s\n", lname->ln_name);
+                       lu_object_put(env, ec_child);
+                       rc = -EINVAL;
+                       break;
+               }
+
+               CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n",
+                      PFID(lu_object_fid(child)));
+
+               buf->lb_buf = info->eti_xattr_buf;
+               buf->lb_len = sizeof(info->eti_xattr_buf);
+
+               sprintf(name, "%s.test1", XATTR_USER_PREFIX);
+               rc = mo_xattr_set(env, lu2md(child), buf, name,
+                                 LU_XATTR_CREATE);
+               if (rc < 0) {
+                       CERROR("Can not setattr child "DFID": rc = %d\n",
+                               PFID(lu_object_fid(child)), rc);
+                       lu_object_put(env, ec_child);
+                       break;
+               }
+               CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n",
+                      PFID(lu_object_fid(child)));
+               id++;
+               lu_object_put(env, ec_child);
+       }
+       RETURN(rc);
+}
+
+static int echo_getattr_object(const struct lu_env *env,
+                              struct echo_device *ed,
+                              struct lu_object *ec_parent,
+                              __u64 id, int count)
+{
+       struct lu_object        *parent;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name    *lname = &info->eti_lname;
+       char                *name = info->eti_name;
+       struct md_attr    *ma = &info->eti_ma;
+       struct lu_device        *ld = ed->ed_next;
+       int                   rc = 0;
+       int                   i;
+
+       ENTRY;
+
+       if (ec_parent == NULL)
+               return -1;
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               RETURN(-ENXIO);
+
+       memset(ma, 0, sizeof(*ma));
+       ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF;
+       ma->ma_acl = info->eti_xattr_buf;
+       ma->ma_acl_size = sizeof(info->eti_xattr_buf);
+
+       for (i = 0; i < count; i++) {
+               struct lu_object *ec_child, *child;
+
+               ma->ma_valid = 0;
+               echo_md_build_name(lname, name, id);
+               echo_set_lmm_size(env, ld, ma);
+
+               ec_child = echo_md_lookup(env, ed, lu2md(parent), lname);
+               if (IS_ERR(ec_child)) {
+                       CERROR("Can't find child %s: rc = %ld\n",
+                              lname->ln_name, PTR_ERR(ec_child));
+                       RETURN(PTR_ERR(ec_child));
+               }
+
+               child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+               if (child == NULL) {
+                       CERROR("Can not locate the child %s\n", lname->ln_name);
+                       lu_object_put(env, ec_child);
+                       RETURN(-EINVAL);
+               }
+
+               CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n",
+                      PFID(lu_object_fid(child)));
+               rc = echo_attr_get_complex(env, lu2md(child), ma);
+               if (rc) {
+                       CERROR("Can not getattr child "DFID": rc = %d\n",
+                               PFID(lu_object_fid(child)), rc);
+                       lu_object_put(env, ec_child);
+                       break;
+               }
+               CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n",
+                      PFID(lu_object_fid(child)));
+               id++;
+               lu_object_put(env, ec_child);
+       }
+
+       RETURN(rc);
+}
+
+static int echo_lookup_object(const struct lu_env *env,
+                             struct echo_device *ed,
+                             struct lu_object *ec_parent,
+                             __u64 id, int count)
+{
+       struct lu_object        *parent;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name    *lname = &info->eti_lname;
+       char                *name = info->eti_name;
+       struct lu_fid      *fid = &info->eti_fid;
+       struct lu_device        *ld = ed->ed_next;
+       int                   rc = 0;
+       int                   i;
+
+       if (ec_parent == NULL)
+               return -1;
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               return -ENXIO;
+
+       /*prepare the requests*/
+       for (i = 0; i < count; i++) {
+               echo_md_build_name(lname, name, id);
+
+               CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n",
+                      PFID(lu_object_fid(parent)), lname->ln_name, parent);
+
+               rc = mdo_lookup(env, lu2md(parent), lname, fid, NULL);
+               if (rc) {
+                       CERROR("Can not lookup child %s: rc = %d\n", name, rc);
+                       break;
+               }
+               CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n",
+                      PFID(lu_object_fid(parent)), lname->ln_name, parent);
+
+               id++;
+       }
+       return rc;
+}
+
+static int echo_md_destroy_internal(const struct lu_env *env,
+                                   struct echo_device *ed,
+                                   struct md_object *parent,
+                                   struct lu_name *lname,
+                                   struct md_attr *ma)
+{
+       struct lu_device   *ld = ed->ed_next;
+       struct lu_object   *ec_child;
+       struct lu_object   *child;
+       int              rc;
+
+       ENTRY;
+
+       ec_child = echo_md_lookup(env, ed, parent, lname);
+       if (IS_ERR(ec_child)) {
+               CERROR("Can't find child %s: rc = %ld\n", lname->ln_name,
+                       PTR_ERR(ec_child));
+               RETURN(PTR_ERR(ec_child));
+       }
+
+       child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+       if (child == NULL) {
+               CERROR("Can not locate the child %s\n", lname->ln_name);
+               GOTO(out_put, rc = -EINVAL);
+       }
+
+       CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n",
+              PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+
+       rc = mdo_unlink(env, parent, lu2md(child), lname, ma, 0);
+       if (rc) {
+               CERROR("Can not unlink child %s: rc = %d\n",
+                       lname->ln_name, rc);
+               GOTO(out_put, rc);
+       }
+       CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n",
+              PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+out_put:
+       lu_object_put(env, ec_child);
+       return rc;
+}
+
+static int echo_destroy_object(const struct lu_env *env,
+                              struct echo_device *ed,
+                              struct lu_object *ec_parent,
+                              char *name, int namelen,
+                              __u64 id, __u32 mode,
+                              int count)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name    *lname = &info->eti_lname;
+       struct md_attr    *ma = &info->eti_ma;
+       struct lu_device        *ld = ed->ed_next;
+       struct lu_object        *parent;
+       int                   rc = 0;
+       int                   i;
+       ENTRY;
+
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               RETURN(-EINVAL);
+
+       memset(ma, 0, sizeof(*ma));
+       ma->ma_attr.la_mode = mode;
+       ma->ma_attr.la_valid = LA_CTIME;
+       ma->ma_attr.la_ctime = cfs_time_current_64();
+       ma->ma_need = MA_INODE;
+       ma->ma_valid = 0;
+
+       if (name != NULL) {
+               lname->ln_name = name;
+               lname->ln_namelen = namelen;
+               rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname,
+                                             ma);
+               RETURN(rc);
+       }
+
+       /*prepare the requests*/
+       for (i = 0; i < count; i++) {
+               char *tmp_name = info->eti_name;
+
+               ma->ma_valid = 0;
+               echo_md_build_name(lname, tmp_name, id);
+
+               rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname,
+                                             ma);
+               if (rc) {
+                       CERROR("Can not unlink child %s: rc = %d\n", name, rc);
+                       break;
+               }
+               id++;
+       }
+
+       RETURN(rc);
+}
+
+static struct lu_object *echo_resolve_path(const struct lu_env *env,
+                                          struct echo_device *ed, char *path,
+                                          int path_len)
+{
+       struct lu_device        *ld = ed->ed_next;
+       struct md_device        *md = lu2md_dev(ld);
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_fid      *fid = &info->eti_fid;
+       struct lu_name    *lname = &info->eti_lname;
+       struct lu_object        *parent = NULL;
+       struct lu_object        *child = NULL;
+       int rc = 0;
+       ENTRY;
+
+       /*Only support MDD layer right now*/
+       rc = md->md_ops->mdo_root_get(env, md, fid);
+       if (rc) {
+               CERROR("get root error: rc = %d\n", rc);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* In the function below, .hs_keycmp resolves to
+        * lu_obj_hop_keycmp() */
+       /* coverity[overrun-buffer-val] */
+       parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL);
+       if (IS_ERR(parent)) {
+               CERROR("Can not find the parent "DFID": rc = %ld\n",
+                       PFID(fid), PTR_ERR(parent));
+               RETURN(parent);
+       }
+
+       while (1) {
+               struct lu_object *ld_parent;
+               char *e;
+
+               e = strsep(&path, "/");
+               if (e == NULL)
+                       break;
+
+               if (e[0] == 0) {
+                       if (!path || path[0] == '\0')
+                               break;
+                       continue;
+               }
+
+               lname->ln_name = e;
+               lname->ln_namelen = strlen(e);
+
+               ld_parent = lu_object_locate(parent->lo_header, ld->ld_type);
+               if (ld_parent == NULL) {
+                       lu_object_put(env, parent);
+                       rc = -EINVAL;
+                       break;
+               }
+
+               child = echo_md_lookup(env, ed, lu2md(ld_parent), lname);
+               lu_object_put(env, parent);
+               if (IS_ERR(child)) {
+                       rc = (int)PTR_ERR(child);
+                       CERROR("lookup %s under parent "DFID": rc = %d\n",
+                               lname->ln_name, PFID(lu_object_fid(ld_parent)),
+                               rc);
+                       break;
+               }
+               parent = child;
+       }
+       if (rc)
+               RETURN(ERR_PTR(rc));
+
+       RETURN(parent);
+}
+
+static void echo_ucred_init(struct lu_env *env)
+{
+       struct lu_ucred *ucred = lu_ucred(env);
+
+       ucred->uc_valid = UCRED_INVALID;
+
+       ucred->uc_suppgids[0] = -1;
+       ucred->uc_suppgids[1] = -1;
+
+       ucred->uc_uid   = ucred->uc_o_uid   = current_uid();
+       ucred->uc_gid   = ucred->uc_o_gid   = current_gid();
+       ucred->uc_fsuid = ucred->uc_o_fsuid = current_fsuid();
+       ucred->uc_fsgid = ucred->uc_o_fsgid = current_fsgid();
+       ucred->uc_cap   = cfs_curproc_cap_pack();
+
+       /* remove fs privilege for non-root user. */
+       if (ucred->uc_fsuid)
+               ucred->uc_cap &= ~CFS_CAP_FS_MASK;
+       ucred->uc_valid = UCRED_NEW;
+}
+
+static void echo_ucred_fini(struct lu_env *env)
+{
+       struct lu_ucred *ucred = lu_ucred(env);
+       ucred->uc_valid = UCRED_INIT;
+}
+
+#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD)
+#define ECHO_MD_SES_TAG (LCT_REMEMBER | LCT_SESSION)
+static int echo_md_handler(struct echo_device *ed, int command,
+                          char *path, int path_len, __u64 id, int count,
+                          struct obd_ioctl_data *data)
+{
+       struct echo_thread_info *info;
+       struct lu_device      *ld = ed->ed_next;
+       struct lu_env    *env;
+       int                 refcheck;
+       struct lu_object      *parent;
+       char              *name = NULL;
+       int                 namelen = data->ioc_plen2;
+       int                 rc = 0;
+       ENTRY;
+
+       if (ld == NULL) {
+               CERROR("MD echo client is not being initialized properly\n");
+               RETURN(-EINVAL);
+       }
+
+       if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) {
+               CERROR("Only support MDD layer right now!\n");
+               RETURN(-EINVAL);
+       }
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_MD_SES_TAG);
+       if (rc != 0)
+               GOTO(out_env, rc);
+
+       /* init big_lmm buffer */
+       info = echo_env_info(env);
+       LASSERT(info->eti_big_lmm == NULL);
+       OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE);
+       if (info->eti_big_lmm == NULL)
+               GOTO(out_env, rc = -ENOMEM);
+       info->eti_big_lmmsize = MIN_MD_SIZE;
+
+       parent = echo_resolve_path(env, ed, path, path_len);
+       if (IS_ERR(parent)) {
+               CERROR("Can not resolve the path %s: rc = %ld\n", path,
+                       PTR_ERR(parent));
+               GOTO(out_free, rc = PTR_ERR(parent));
+       }
+
+       if (namelen > 0) {
+               OBD_ALLOC(name, namelen + 1);
+               if (name == NULL)
+                       GOTO(out_put, rc = -ENOMEM);
+               if (copy_from_user(name, data->ioc_pbuf2, namelen))
+                       GOTO(out_name, rc = -EFAULT);
+       }
+
+       echo_ucred_init(env);
+
+       switch (command) {
+       case ECHO_MD_CREATE:
+       case ECHO_MD_MKDIR: {
+               struct echo_thread_info *info = echo_env_info(env);
+               __u32 mode = data->ioc_obdo2.o_mode;
+               struct lu_fid *fid = &info->eti_fid;
+               int stripe_count = (int)data->ioc_obdo2.o_misc;
+               int stripe_index = (int)data->ioc_obdo2.o_stripe_idx;
+
+               rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0);
+               if (rc != 0)
+                       break;
+
+               /* In the function below, .hs_keycmp resolves to
+                * lu_obj_hop_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               rc = echo_create_md_object(env, ed, parent, fid, name, namelen,
+                                          id, mode, count, stripe_count,
+                                          stripe_index);
+               break;
+       }
+       case ECHO_MD_DESTROY:
+       case ECHO_MD_RMDIR: {
+               __u32 mode = data->ioc_obdo2.o_mode;
+
+               rc = echo_destroy_object(env, ed, parent, name, namelen,
+                                        id, mode, count);
+               break;
+       }
+       case ECHO_MD_LOOKUP:
+               rc = echo_lookup_object(env, ed, parent, id, count);
+               break;
+       case ECHO_MD_GETATTR:
+               rc = echo_getattr_object(env, ed, parent, id, count);
+               break;
+       case ECHO_MD_SETATTR:
+               rc = echo_setattr_object(env, ed, parent, id, count);
+               break;
+       default:
+               CERROR("unknown command %d\n", command);
+               rc = -EINVAL;
+               break;
+       }
+       echo_ucred_fini(env);
+
+out_name:
+       if (name != NULL)
+               OBD_FREE(name, namelen + 1);
+out_put:
+       lu_object_put(env, parent);
+out_free:
+       LASSERT(info->eti_big_lmm);
+       OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize);
+       info->eti_big_lmm = NULL;
+       info->eti_big_lmmsize = 0;
+out_env:
+       cl_env_put(env, &refcheck);
+       return rc;
+}
+
+static int echo_create_object(const struct lu_env *env, struct echo_device *ed,
+                             int on_target, struct obdo *oa, void *ulsm,
+                             int ulsm_nob, struct obd_trans_info *oti)
+{
+       struct echo_object     *eco;
+       struct echo_client_obd *ec = ed->ed_ec;
+       struct lov_stripe_md   *lsm = NULL;
+       int                  rc;
+       int                  created = 0;
+       ENTRY;
+
+       if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */
+           (on_target ||                      /* set_stripe */
+            ec->ec_nstripes != 0)) {      /* LOV */
+               CERROR ("No valid oid\n");
+               RETURN(-EINVAL);
+       }
+
+       rc = echo_alloc_memmd(ed, &lsm);
+       if (rc < 0) {
+               CERROR("Cannot allocate md: rc = %d\n", rc);
+               GOTO(failed, rc);
+       }
+
+       if (ulsm != NULL) {
+               int i, idx;
+
+               rc = echo_copyin_lsm (ed, lsm, ulsm, ulsm_nob);
+               if (rc != 0)
+                       GOTO(failed, rc);
+
+               if (lsm->lsm_stripe_count == 0)
+                       lsm->lsm_stripe_count = ec->ec_nstripes;
+
+               if (lsm->lsm_stripe_size == 0)
+                       lsm->lsm_stripe_size = PAGE_CACHE_SIZE;
+
+               idx = cfs_rand();
+
+               /* setup stripes: indices + default ids if required */
+               for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                       if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) == 0)
+                               lsm->lsm_oinfo[i]->loi_oi = lsm->lsm_oi;
+
+                       lsm->lsm_oinfo[i]->loi_ost_idx =
+                               (idx + i) % ec->ec_nstripes;
+               }
+       }
+
+       /* setup object ID here for !on_target and LOV hint */
+       if (oa->o_valid & OBD_MD_FLID) {
+               LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+               lsm->lsm_oi = oa->o_oi;
+       }
+
+       if (ostid_id(&lsm->lsm_oi) == 0)
+               ostid_set_id(&lsm->lsm_oi, ++last_object_id);
+
+       rc = 0;
+       if (on_target) {
+               /* Only echo objects are allowed to be created */
+               LASSERT((oa->o_valid & OBD_MD_FLGROUP) &&
+                       (ostid_seq(&oa->o_oi) == FID_SEQ_ECHO));
+               rc = obd_create(env, ec->ec_exp, oa, &lsm, oti);
+               if (rc != 0) {
+                       CERROR("Cannot create objects: rc = %d\n", rc);
+                       GOTO(failed, rc);
+               }
+               created = 1;
+       }
+
+       /* See what object ID we were given */
+       oa->o_oi = lsm->lsm_oi;
+       oa->o_valid |= OBD_MD_FLID;
+
+       eco = cl_echo_object_find(ed, &lsm);
+       if (IS_ERR(eco))
+               GOTO(failed, rc = PTR_ERR(eco));
+       cl_echo_object_put(eco);
+
+       CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi));
+       EXIT;
+
+ failed:
+       if (created && rc)
+               obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL, NULL);
+       if (lsm)
+               echo_free_memmd(ed, &lsm);
+       if (rc)
+               CERROR("create object failed with: rc = %d\n", rc);
+       return (rc);
+}
+
+static int echo_get_object(struct echo_object **ecop, struct echo_device *ed,
+                          struct obdo *oa)
+{
+       struct lov_stripe_md   *lsm = NULL;
+       struct echo_object     *eco;
+       int                  rc;
+       ENTRY;
+
+       if ((oa->o_valid & OBD_MD_FLID) == 0 || ostid_id(&oa->o_oi) == 0) {
+               /* disallow use of object id 0 */
+               CERROR ("No valid oid\n");
+               RETURN(-EINVAL);
+       }
+
+       rc = echo_alloc_memmd(ed, &lsm);
+       if (rc < 0)
+               RETURN(rc);
+
+       lsm->lsm_oi = oa->o_oi;
+       if (!(oa->o_valid & OBD_MD_FLGROUP))
+               ostid_set_seq_echo(&lsm->lsm_oi);
+
+       rc = 0;
+       eco = cl_echo_object_find(ed, &lsm);
+       if (!IS_ERR(eco))
+               *ecop = eco;
+       else
+               rc = PTR_ERR(eco);
+       if (lsm)
+               echo_free_memmd(ed, &lsm);
+       RETURN(rc);
+}
+
+static void echo_put_object(struct echo_object *eco)
+{
+       if (cl_echo_object_put(eco))
+               CERROR("echo client: drop an object failed\n");
+}
+
+static void
+echo_get_stripe_off_id (struct lov_stripe_md *lsm, obd_off *offp, obd_id *idp)
+{
+       unsigned long stripe_count;
+       unsigned long stripe_size;
+       unsigned long width;
+       unsigned long woffset;
+       int        stripe_index;
+       obd_off       offset;
+
+       if (lsm->lsm_stripe_count <= 1)
+               return;
+
+       offset       = *offp;
+       stripe_size  = lsm->lsm_stripe_size;
+       stripe_count = lsm->lsm_stripe_count;
+
+       /* width = # bytes in all stripes */
+       width = stripe_size * stripe_count;
+
+       /* woffset = offset within a width; offset = whole number of widths */
+       woffset = do_div (offset, width);
+
+       stripe_index = woffset / stripe_size;
+
+       *idp = ostid_id(&lsm->lsm_oinfo[stripe_index]->loi_oi);
+       *offp = offset * stripe_size + woffset % stripe_size;
+}
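echo_get_stripe_off_id() above is the standard RAID0 mapping: the file offset modulo the stripe width (stripe_size * stripe_count) selects the stripe object and the position inside its current stripe unit, while the number of whole widths already consumed gives the unit index within that object. A standalone sketch of the same arithmetic follows; stripe_map() and the small driver are illustrative names only, and the real function additionally reads the chosen stripe's object id from lsm->lsm_oinfo[].

	/* Illustrative sketch of the mapping done by echo_get_stripe_off_id() */
	#include <stdio.h>

	static void stripe_map(unsigned long long off, unsigned long stripe_size,
			       unsigned long stripe_count,
			       unsigned long *index, unsigned long long *obj_off)
	{
		unsigned long long width = (unsigned long long)stripe_size * stripe_count;
		unsigned long long woffset = off % width;	/* offset inside one width */
		unsigned long long nwidths = off / width;	/* whole widths consumed */

		*index   = woffset / stripe_size;		/* which stripe object */
		*obj_off = nwidths * stripe_size + woffset % stripe_size;
	}

	int main(void)
	{
		unsigned long idx;
		unsigned long long obj_off;

		/* 4 stripes of 1 MiB; file offset 5 MiB + 4 KiB */
		stripe_map(5ULL * 1048576 + 4096, 1048576, 4, &idx, &obj_off);
		printf("stripe %lu, object offset %llu\n", idx, obj_off);
		/* prints: stripe 1, object offset 1052672 (1 MiB + 4 KiB) */
		return 0;
	}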
+
+static void
+echo_client_page_debug_setup(struct lov_stripe_md *lsm,
+                            struct page *page, int rw, obd_id id,
+                            obd_off offset, obd_off count)
+{
+       char    *addr;
+       obd_off  stripe_off;
+       obd_id   stripe_id;
+       int      delta;
+
+       /* no partial pages on the client */
+       LASSERT(count == PAGE_CACHE_SIZE);
+
+       addr = kmap(page);
+
+       for (delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
+               if (rw == OBD_BRW_WRITE) {
+                       stripe_off = offset + delta;
+                       stripe_id = id;
+                       echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id);
+               } else {
+                       stripe_off = 0xdeadbeef00c0ffeeULL;
+                       stripe_id = 0xdeadbeef00c0ffeeULL;
+               }
+               block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE,
+                                 stripe_off, stripe_id);
+       }
+
+       kunmap(page);
+}
+
+static int echo_client_page_debug_check(struct lov_stripe_md *lsm,
+                                       struct page *page, obd_id id,
+                                       obd_off offset, obd_off count)
+{
+       obd_off stripe_off;
+       obd_id  stripe_id;
+       char   *addr;
+       int     delta;
+       int     rc;
+       int     rc2;
+
+       /* no partial pages on the client */
+       LASSERT(count == PAGE_CACHE_SIZE);
+
+       addr = kmap(page);
+
+       for (rc = delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
+               stripe_off = offset + delta;
+               stripe_id = id;
+               echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id);
+
+               rc2 = block_debug_check("test_brw",
+                                       addr + delta, OBD_ECHO_BLOCK_SIZE,
+                                       stripe_off, stripe_id);
+               if (rc2 != 0) {
+                       CERROR ("Error in echo object "LPX64"\n", id);
+                       rc = rc2;
+               }
+       }
+
+       kunmap(page);
+       return rc;
+}
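echo_client_page_debug_setup()/_check() above implement the OBD_FL_DEBUG_CHECK verification scheme: on writes every OBD_ECHO_BLOCK_SIZE block of a page is stamped with values derived from the (per-stripe) object id and offset, and on read-back each block is re-derived and compared, so misplaced or corrupted data is detected. block_debug_setup()/block_debug_check() are defined elsewhere in the tree; the sketch below only mimics the idea with simplified stand-in helpers (fill_block()/check_block() and the tag layout are illustrative, not the real pattern).

	/* Simplified stand-in for the per-block stamp-and-verify scheme */
	#include <stdio.h>
	#include <string.h>

	#define PAGE_SIZE		4096
	#define OBD_ECHO_BLOCK_SIZE	(4 << 10)

	struct block_tag { unsigned long long id, off; };

	static void fill_block(char *addr, unsigned long long off, unsigned long long id)
	{
		struct block_tag tag = { .id = id, .off = off };

		memcpy(addr, &tag, sizeof(tag));		/* stamp the block */
	}

	static int check_block(const char *addr, unsigned long long off,
			       unsigned long long id)
	{
		struct block_tag tag;

		memcpy(&tag, addr, sizeof(tag));
		return (tag.id == id && tag.off == off) ? 0 : -1;
	}

	int main(void)
	{
		static char page[PAGE_SIZE];
		unsigned long long id = 42, off = 3 * PAGE_SIZE;  /* page 3 of object 42 */
		int delta, rc = 0;

		for (delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE)
			fill_block(page + delta, off + delta, id);

		for (delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE)
			rc |= check_block(page + delta, off + delta, id);

		printf("verify %s\n", rc ? "FAILED" : "ok");
		return rc;
	}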
+
+static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
+                           struct echo_object *eco, obd_off offset,
+                           obd_size count, int async,
+                           struct obd_trans_info *oti)
+{
+       struct lov_stripe_md   *lsm = eco->eo_lsm;
+       obd_count              npages;
+       struct brw_page *pga;
+       struct brw_page *pgp;
+       struct page         **pages;
+       obd_off          off;
+       int                  i;
+       int                  rc;
+       int                  verify;
+       int                  gfp_mask;
+       int                  brw_flags = 0;
+       ENTRY;
+
+       verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID &&
+                 (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+                 (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+
+       gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER;
+
+       LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
+       LASSERT(lsm != NULL);
+       LASSERT(ostid_id(&lsm->lsm_oi) == ostid_id(&oa->o_oi));
+
+       if (count <= 0 ||
+           (count & (~CFS_PAGE_MASK)) != 0)
+               RETURN(-EINVAL);
+
+       /* XXX think again with misaligned I/O */
+       npages = count >> PAGE_CACHE_SHIFT;
+
+       if (rw == OBD_BRW_WRITE)
+               brw_flags = OBD_BRW_ASYNC;
+
+       OBD_ALLOC(pga, npages * sizeof(*pga));
+       if (pga == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC(pages, npages * sizeof(*pages));
+       if (pages == NULL) {
+               OBD_FREE(pga, npages * sizeof(*pga));
+               RETURN(-ENOMEM);
+       }
+
+       for (i = 0, pgp = pga, off = offset;
+            i < npages;
+            i++, pgp++, off += PAGE_CACHE_SIZE) {
+
+               LASSERT (pgp->pg == NULL);      /* for cleanup */
+
+               rc = -ENOMEM;
+               OBD_PAGE_ALLOC(pgp->pg, gfp_mask);
+               if (pgp->pg == NULL)
+                       goto out;
+
+               pages[i] = pgp->pg;
+               pgp->count = PAGE_CACHE_SIZE;
+               pgp->off = off;
+               pgp->flag = brw_flags;
+
+               if (verify)
+                       echo_client_page_debug_setup(lsm, pgp->pg, rw,
+                                                    ostid_id(&oa->o_oi), off,
+                                                    pgp->count);
+       }
+
+       /* brw mode can only be used at client */
+       LASSERT(ed->ed_next != NULL);
+       rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async);
+
+ out:
+       if (rc != 0 || rw != OBD_BRW_READ)
+               verify = 0;
+
+       for (i = 0, pgp = pga; i < npages; i++, pgp++) {
+               if (pgp->pg == NULL)
+                       continue;
+
+               if (verify) {
+                       int vrc;
+                       vrc = echo_client_page_debug_check(lsm, pgp->pg,
+                                                          ostid_id(&oa->o_oi),
+                                                          pgp->off, pgp->count);
+                       if (vrc != 0 && rc == 0)
+                               rc = vrc;
+               }
+               OBD_PAGE_FREE(pgp->pg);
+       }
+       OBD_FREE(pga, npages * sizeof(*pga));
+       OBD_FREE(pages, npages * sizeof(*pages));
+       RETURN(rc);
+}
+
+static int echo_client_prep_commit(const struct lu_env *env,
+                                  struct obd_export *exp, int rw,
+                                  struct obdo *oa, struct echo_object *eco,
+                                  obd_off offset, obd_size count,
+                                  obd_size batch, struct obd_trans_info *oti,
+                                  int async)
+{
+       struct lov_stripe_md *lsm = eco->eo_lsm;
+       struct obd_ioobj ioo;
+       struct niobuf_local *lnb;
+       struct niobuf_remote *rnb;
+       obd_off off;
+       obd_size npages, tot_pages;
+       int i, ret = 0, brw_flags = 0;
+
+       ENTRY;
+
+       if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0 ||
+           (lsm != NULL && ostid_id(&lsm->lsm_oi) != ostid_id(&oa->o_oi)))
+               RETURN(-EINVAL);
+
+       npages = batch >> PAGE_CACHE_SHIFT;
+       tot_pages = count >> PAGE_CACHE_SHIFT;
+
+       OBD_ALLOC(lnb, npages * sizeof(struct niobuf_local));
+       OBD_ALLOC(rnb, npages * sizeof(struct niobuf_remote));
+
+       if (lnb == NULL || rnb == NULL)
+               GOTO(out, ret = -ENOMEM);
+
+       if (rw == OBD_BRW_WRITE && async)
+               brw_flags |= OBD_BRW_ASYNC;
+
+       obdo_to_ioobj(oa, &ioo);
+
+       off = offset;
+
+       for(; tot_pages; tot_pages -= npages) {
+               int lpages;
+
+               if (tot_pages < npages)
+                       npages = tot_pages;
+
+               for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) {
+                       rnb[i].offset = off;
+                       rnb[i].len = PAGE_CACHE_SIZE;
+                       rnb[i].flags = brw_flags;
+               }
+
+               ioo.ioo_bufcnt = npages;
+               oti->oti_transno = 0;
+
+               lpages = npages;
+               ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages,
+                                lnb, oti, NULL);
+               if (ret != 0)
+                       GOTO(out, ret);
+               LASSERT(lpages == npages);
+
+               for (i = 0; i < lpages; i++) {
+                       struct page *page = lnb[i].page;
+
+                       /* read past eof? */
+                       if (page == NULL && lnb[i].rc == 0)
+                               continue;
+
+                       if (async)
+                               lnb[i].flags |= OBD_BRW_ASYNC;
+
+                       if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID ||
+                           (oa->o_valid & OBD_MD_FLFLAGS) == 0 ||
+                           (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0)
+                               continue;
+
+                       if (rw == OBD_BRW_WRITE)
+                               echo_client_page_debug_setup(lsm, page, rw,
+                                                           ostid_id(&oa->o_oi),
+                                                            rnb[i].offset,
+                                                            rnb[i].len);
+                       else
+                               echo_client_page_debug_check(lsm, page,
+                                                           ostid_id(&oa->o_oi),
+                                                            rnb[i].offset,
+                                                            rnb[i].len);
+               }
+
+               ret = obd_commitrw(env, rw, exp, oa, 1, &ioo,
+                                  rnb, npages, lnb, oti, ret);
+               if (ret != 0)
+                       GOTO(out, ret);
+
+               /* Reset oti otherwise it would confuse ldiskfs. */
+               memset(oti, 0, sizeof(*oti));
+       }
+
+out:
+       if (lnb)
+               OBD_FREE(lnb, npages * sizeof(struct niobuf_local));
+       if (rnb)
+               OBD_FREE(rnb, npages * sizeof(struct niobuf_remote));
+       RETURN(ret);
+}
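echo_client_prep_commit() above drives the target directly through obd_preprw()/obd_commitrw(), cutting the total transfer into rounds of at most "batch" bytes (ioc_plen1 from the ioctl, capped at PTLRPC_MAX_BRW_SIZE by the caller below). A small standalone sketch of just the batching arithmetic, with illustrative values:

	/* Illustrative sketch of the prep/commit batching loop */
	#include <stdio.h>

	#define PAGE_SHIFT  12				/* assume 4 KiB pages */

	int main(void)
	{
		unsigned long long count = 2621440;	/* 2.5 MiB, page aligned */
		unsigned long long batch = 1048576;	/* 1 MiB per prep/commit round */
		unsigned long long npages = batch >> PAGE_SHIFT;
		unsigned long long tot_pages = count >> PAGE_SHIFT;
		int round = 0;

		for (; tot_pages; tot_pages -= npages) {
			if (tot_pages < npages)
				npages = tot_pages;	/* short final round */
			/* here the real code calls obd_preprw()/obd_commitrw() */
			printf("round %d: %llu pages\n", ++round, npages);
		}
		/* prints rounds of 256, 256 and 128 pages */
		return 0;
	}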
+
+static int echo_client_brw_ioctl(const struct lu_env *env, int rw,
+                                struct obd_export *exp,
+                                struct obd_ioctl_data *data,
+                                struct obd_trans_info *dummy_oti)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct echo_device *ed = obd2echo_dev(obd);
+       struct echo_client_obd *ec = ed->ed_ec;
+       struct obdo *oa = &data->ioc_obdo1;
+       struct echo_object *eco;
+       int rc;
+       int async = 1;
+       long test_mode;
+       ENTRY;
+
+       LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+       rc = echo_get_object(&eco, ed, oa);
+       if (rc)
+               RETURN(rc);
+
+       oa->o_valid &= ~OBD_MD_FLHANDLE;
+
+       /* OFD/obdfilter works only via prep/commit */
+       test_mode = (long)data->ioc_pbuf1;
+       if (test_mode == 1)
+               async = 0;
+
+       if (ed->ed_next == NULL && test_mode != 3) {
+               test_mode = 3;
+               data->ioc_plen1 = data->ioc_count;
+       }
+
+       /* Truncate batch size to maximum */
+       if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE)
+               data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE;
+
+       switch (test_mode) {
+       case 1:
+               /* fall through */
+       case 2:
+               rc = echo_client_kbrw(ed, rw, oa,
+                                     eco, data->ioc_offset,
+                                     data->ioc_count, async, dummy_oti);
+               break;
+       case 3:
+               rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa,
+                                            eco, data->ioc_offset,
+                                            data->ioc_count, data->ioc_plen1,
+                                            dummy_oti, async);
+               break;
+       default:
+               rc = -EINVAL;
+       }
+       echo_put_object(eco);
+       RETURN(rc);
+}
+
+static int
+echo_client_enqueue(struct obd_export *exp, struct obdo *oa,
+                   int mode, obd_off offset, obd_size nob)
+{
+       struct echo_device     *ed = obd2echo_dev(exp->exp_obd);
+       struct lustre_handle   *ulh = &oa->o_handle;
+       struct echo_object     *eco;
+       obd_off          end;
+       int                  rc;
+       ENTRY;
+
+       if (ed->ed_next == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       if (!(mode == LCK_PR || mode == LCK_PW))
+               RETURN(-EINVAL);
+
+       if ((offset & (~CFS_PAGE_MASK)) != 0 ||
+           (nob & (~CFS_PAGE_MASK)) != 0)
+               RETURN(-EINVAL);
+
+       rc = echo_get_object (&eco, ed, oa);
+       if (rc != 0)
+               RETURN(rc);
+
+       end = (nob == 0) ? ((obd_off) -1) : (offset + nob - 1);
+       rc = cl_echo_enqueue(eco, offset, end, mode, &ulh->cookie);
+       if (rc == 0) {
+               oa->o_valid |= OBD_MD_FLHANDLE;
+               CDEBUG(D_INFO, "Cookie is "LPX64"\n", ulh->cookie);
+       }
+       echo_put_object(eco);
+       RETURN(rc);
+}
+
+static int
+echo_client_cancel(struct obd_export *exp, struct obdo *oa)
+{
+       struct echo_device *ed     = obd2echo_dev(exp->exp_obd);
+       __u64          cookie = oa->o_handle.cookie;
+
+       if ((oa->o_valid & OBD_MD_FLHANDLE) == 0)
+               return -EINVAL;
+
+       CDEBUG(D_INFO, "Cookie is "LPX64"\n", cookie);
+       return cl_echo_cancel(ed, cookie);
+}
+
+static int
+echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                     void *karg, void *uarg)
+{
+       struct obd_device      *obd = exp->exp_obd;
+       struct echo_device     *ed = obd2echo_dev(obd);
+       struct echo_client_obd *ec = ed->ed_ec;
+       struct echo_object     *eco;
+       struct obd_ioctl_data  *data = karg;
+       struct obd_trans_info   dummy_oti;
+       struct lu_env     *env;
+       struct oti_req_ack_lock *ack_lock;
+       struct obdo         *oa;
+       struct lu_fid      fid;
+       int                  rw = OBD_BRW_READ;
+       int                  rc = 0;
+       int                  i;
+       ENTRY;
+
+       memset(&dummy_oti, 0, sizeof(dummy_oti));
+
+       oa = &data->ioc_obdo1;
+       if (!(oa->o_valid & OBD_MD_FLGROUP)) {
+               oa->o_valid |= OBD_MD_FLGROUP;
+               ostid_set_seq_echo(&oa->o_oi);
+       }
+
+       /* This FID is unpacked just for validation at this point */
+       rc = ostid_to_fid(&fid, &oa->o_oi, 0);
+       if (rc < 0)
+               RETURN(rc);
+
+       OBD_ALLOC_PTR(env);
+       if (env == NULL)
+               RETURN(-ENOMEM);
+
+       rc = lu_env_init(env, LCT_DT_THREAD);
+       if (rc)
+               GOTO(out, rc = -ENOMEM);
+
+       switch (cmd) {
+       case OBD_IOC_CREATE:                /* may create echo object */
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO (out, rc = -EPERM);
+
+               rc = echo_create_object(env, ed, 1, oa, data->ioc_pbuf1,
+                                       data->ioc_plen1, &dummy_oti);
+               GOTO(out, rc);
+
+       case OBD_IOC_ECHO_MD: {
+               int count;
+               int cmd;
+               char *dir = NULL;
+               int dirlen;
+               __u64 id;
+
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               count = data->ioc_count;
+               cmd = data->ioc_command;
+
+               id = ostid_id(&data->ioc_obdo2.o_oi);
+
+               dirlen = data->ioc_plen1;
+               OBD_ALLOC(dir, dirlen + 1);
+               if (dir == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) {
+                       OBD_FREE(dir, data->ioc_plen1 + 1);
+                       GOTO(out, rc = -EFAULT);
+               }
+
+               rc = echo_md_handler(ed, cmd, dir, dirlen, id, count, data);
+               OBD_FREE(dir, dirlen + 1);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_ECHO_ALLOC_SEQ: {
+               struct lu_env   *cl_env;
+               int           refcheck;
+               __u64       seq;
+               int           max_count;
+
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               cl_env = cl_env_get(&refcheck);
+               if (IS_ERR(cl_env))
+                       GOTO(out, rc = PTR_ERR(cl_env));
+
+               rc = lu_env_refill_by_tags(cl_env, ECHO_MD_CTX_TAG,
+                                           ECHO_MD_SES_TAG);
+               if (rc != 0) {
+                       cl_env_put(cl_env, &refcheck);
+                       GOTO(out, rc);
+               }
+
+               rc = seq_client_get_seq(cl_env, ed->ed_cl_seq, &seq);
+               cl_env_put(cl_env, &refcheck);
+               if (rc < 0) {
+                       CERROR("%s: Can not alloc seq: rc = %d\n",
+                              obd->obd_name, rc);
+                       GOTO(out, rc);
+               }
+
+               if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1))
+                       GOTO(out, rc = -EFAULT);
+
+               max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+               if (copy_to_user(data->ioc_pbuf2, &max_count,
+                                    data->ioc_plen2))
+                       GOTO(out, rc = -EFAULT);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_DESTROY:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO (out, rc = -EPERM);
+
+               rc = echo_get_object(&eco, ed, oa);
+               if (rc == 0) {
+                       rc = obd_destroy(env, ec->ec_exp, oa, eco->eo_lsm,
+                                        &dummy_oti, NULL, NULL);
+                       if (rc == 0)
+                               eco->eo_deleted = 1;
+                       echo_put_object(eco);
+               }
+               GOTO(out, rc);
+
+       case OBD_IOC_GETATTR:
+               rc = echo_get_object(&eco, ed, oa);
+               if (rc == 0) {
+                       struct obd_info oinfo = { { { 0 } } };
+                       oinfo.oi_md = eco->eo_lsm;
+                       oinfo.oi_oa = oa;
+                       rc = obd_getattr(env, ec->ec_exp, &oinfo);
+                       echo_put_object(eco);
+               }
+               GOTO(out, rc);
+
+       case OBD_IOC_SETATTR:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO (out, rc = -EPERM);
+
+               rc = echo_get_object(&eco, ed, oa);
+               if (rc == 0) {
+                       struct obd_info oinfo = { { { 0 } } };
+                       oinfo.oi_oa = oa;
+                       oinfo.oi_md = eco->eo_lsm;
+
+                       rc = obd_setattr(env, ec->ec_exp, &oinfo, NULL);
+                       echo_put_object(eco);
+               }
+               GOTO(out, rc);
+
+       case OBD_IOC_BRW_WRITE:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO (out, rc = -EPERM);
+
+               rw = OBD_BRW_WRITE;
+               /* fall through */
+       case OBD_IOC_BRW_READ:
+               rc = echo_client_brw_ioctl(env, rw, exp, data, &dummy_oti);
+               GOTO(out, rc);
+
+       case ECHO_IOC_GET_STRIPE:
+               rc = echo_get_object(&eco, ed, oa);
+               if (rc == 0) {
+                       rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1,
+                                             data->ioc_plen1);
+                       echo_put_object(eco);
+               }
+               GOTO(out, rc);
+
+       case ECHO_IOC_SET_STRIPE:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO (out, rc = -EPERM);
+
+               if (data->ioc_pbuf1 == NULL) {  /* unset */
+                       rc = echo_get_object(&eco, ed, oa);
+                       if (rc == 0) {
+                               eco->eo_deleted = 1;
+                               echo_put_object(eco);
+                       }
+               } else {
+                       rc = echo_create_object(env, ed, 0, oa,
+                                               data->ioc_pbuf1,
+                                               data->ioc_plen1, &dummy_oti);
+               }
+               GOTO (out, rc);
+
+       case ECHO_IOC_ENQUEUE:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO (out, rc = -EPERM);
+
+               rc = echo_client_enqueue(exp, oa,
+                                        data->ioc_conn1, /* lock mode */
+                                        data->ioc_offset,
+                                        data->ioc_count);/*extent*/
+               GOTO (out, rc);
+
+       case ECHO_IOC_CANCEL:
+               rc = echo_client_cancel(exp, oa);
+               GOTO (out, rc);
+
+       default:
+               CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd);
+               GOTO (out, rc = -ENOTTY);
+       }
+
+       EXIT;
+out:
+       lu_env_fini(env);
+       OBD_FREE_PTR(env);
+
+       /* XXX this should be in a helper also called by target_send_reply */
+       for (ack_lock = dummy_oti.oti_ack_locks, i = 0; i < 4;
+            i++, ack_lock++) {
+               if (!ack_lock->mode)
+                       break;
+               ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+       }
+
+       return rc;
+}
+
+static int echo_client_setup(const struct lu_env *env,
+                            struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+       struct echo_client_obd *ec = &obddev->u.echo_client;
+       struct obd_device *tgt;
+       struct obd_uuid echo_uuid = { "ECHO_UUID" };
+       struct obd_connect_data *ocd = NULL;
+       int rc;
+       ENTRY;
+
+       if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+               CERROR("requires a TARGET OBD name\n");
+               RETURN(-EINVAL);
+       }
+
+       tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+       if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+               CERROR("device not attached or not set up (%s)\n",
+                      lustre_cfg_string(lcfg, 1));
+               RETURN(-EINVAL);
+       }
+
+       spin_lock_init(&ec->ec_lock);
+       INIT_LIST_HEAD (&ec->ec_objects);
+       INIT_LIST_HEAD (&ec->ec_locks);
+       ec->ec_unique = 0;
+       ec->ec_nstripes = 0;
+
+       if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) {
+               lu_context_tags_update(ECHO_MD_CTX_TAG);
+               lu_session_tags_update(ECHO_MD_SES_TAG);
+               RETURN(0);
+       }
+
+       OBD_ALLOC(ocd, sizeof(*ocd));
+       if (ocd == NULL) {
+               CERROR("Can't alloc ocd connecting to %s\n",
+                      lustre_cfg_string(lcfg, 1));
+               return -ENOMEM;
+       }
+
+       ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL |
+                                OBD_CONNECT_BRW_SIZE |
+                                OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 |
+                                OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE;
+       ocd->ocd_brw_size = DT_MAX_BRW_SIZE;
+       ocd->ocd_version = LUSTRE_VERSION_CODE;
+       ocd->ocd_group = FID_SEQ_ECHO;
+
+       rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
+       if (rc == 0) {
+               /* Turn off pinger because it connects to tgt obd directly. */
+               spin_lock(&tgt->obd_dev_lock);
+               list_del_init(&ec->ec_exp->exp_obd_chain_timed);
+               spin_unlock(&tgt->obd_dev_lock);
+       }
+
+       OBD_FREE(ocd, sizeof(*ocd));
+
+       if (rc != 0) {
+               CERROR("fail to connect to device %s\n",
+                      lustre_cfg_string(lcfg, 1));
+               return (rc);
+       }
+
+       RETURN(rc);
+}
+
+static int echo_client_cleanup(struct obd_device *obddev)
+{
+       struct echo_device *ed = obd2echo_dev(obddev);
+       struct echo_client_obd *ec = &obddev->u.echo_client;
+       int rc;
+       ENTRY;
+
+       /*Do nothing for Metadata echo client*/
+       if (ed == NULL )
+               RETURN(0);
+
+       if (ed->ed_next_ismd) {
+               lu_context_tags_clear(ECHO_MD_CTX_TAG);
+               lu_session_tags_clear(ECHO_MD_SES_TAG);
+               RETURN(0);
+       }
+
+       if (!list_empty(&obddev->obd_exports)) {
+               CERROR("still has clients!\n");
+               RETURN(-EBUSY);
+       }
+
+       LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0);
+       rc = obd_disconnect(ec->ec_exp);
+       if (rc != 0)
+               CERROR("fail to disconnect device: %d\n", rc);
+
+       RETURN(rc);
+}
+
+static int echo_client_connect(const struct lu_env *env,
+                              struct obd_export **exp,
+                              struct obd_device *src, struct obd_uuid *cluuid,
+                              struct obd_connect_data *data, void *localdata)
+{
+       int             rc;
+       struct lustre_handle conn = { 0 };
+
+       ENTRY;
+       rc = class_connect(&conn, src, cluuid);
+       if (rc == 0) {
+               *exp = class_conn2export(&conn);
+       }
+
+       RETURN (rc);
+}
+
+static int echo_client_disconnect(struct obd_export *exp)
+{
+#if 0
+       struct obd_device      *obd;
+       struct echo_client_obd *ec;
+       struct ec_lock   *ecl;
+#endif
+       int                  rc;
+       ENTRY;
+
+       if (exp == NULL)
+               GOTO(out, rc = -EINVAL);
+
+#if 0
+       obd = exp->exp_obd;
+       ec = &obd->u.echo_client;
+
+       /* no more contention on export's lock list */
+       while (!list_empty (&exp->exp_ec_data.eced_locks)) {
+               ecl = list_entry (exp->exp_ec_data.eced_locks.next,
+                                     struct ec_lock, ecl_exp_chain);
+               list_del (&ecl->ecl_exp_chain);
+
+               rc = obd_cancel(ec->ec_exp, ecl->ecl_object->eco_lsm,
+                                ecl->ecl_mode, &ecl->ecl_lock_handle);
+
+               CDEBUG (D_INFO, "Cancel lock on object "LPX64" on disconnect "
+                       "(%d)\n", ecl->ecl_object->eco_id, rc);
+
+               echo_put_object (ecl->ecl_object);
+               OBD_FREE (ecl, sizeof (*ecl));
+       }
+#endif
+
+       rc = class_disconnect(exp);
+       GOTO(out, rc);
+ out:
+       return rc;
+}
+
+static struct obd_ops echo_client_obd_ops = {
+       .o_owner       = THIS_MODULE,
+
+#if 0
+       .o_setup       = echo_client_setup,
+       .o_cleanup     = echo_client_cleanup,
+#endif
+
+       .o_iocontrol   = echo_client_iocontrol,
+       .o_connect     = echo_client_connect,
+       .o_disconnect  = echo_client_disconnect
+};
+
+int echo_client_init(void)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc;
+
+       lprocfs_echo_init_vars(&lvars);
+
+       rc = lu_kmem_init(echo_caches);
+       if (rc == 0) {
+               rc = class_register_type(&echo_client_obd_ops, NULL,
+                                        lvars.module_vars,
+                                        LUSTRE_ECHO_CLIENT_NAME,
+                                        &echo_device_type);
+               if (rc)
+                       lu_kmem_fini(echo_caches);
+       }
+       return rc;
+}
+
+void echo_client_exit(void)
+{
+       class_unregister_type(LUSTRE_ECHO_CLIENT_NAME);
+       lu_kmem_fini(echo_caches);
+}
+
+static int __init obdecho_init(void)
+{
+       struct lprocfs_static_vars lvars;
+       int rc;
+
+       ENTRY;
+       LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n");
+
+       LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0);
+
+       lprocfs_echo_init_vars(&lvars);
+
+
+       rc = echo_client_init();
+
+       RETURN(rc);
+}
+
+static void /*__exit*/ obdecho_exit(void)
+{
+       echo_client_exit();
+
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Testing Echo OBD driver");
+MODULE_LICENSE("GPL");
+
+cfs_module(obdecho, LUSTRE_VERSION_STRING, obdecho_init, obdecho_exit);
+
+/** @} echo_client */
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/drivers/staging/lustre/lustre/obdecho/echo_internal.h
new file mode 100644 (file)
index 0000000..8e9dbc2
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo_internal.h
+ */
+
+#ifndef _ECHO_INTERNAL_H
+#define _ECHO_INTERNAL_H
+
+/* The persistent object (i.e. actually stores stuff!) */
+#define ECHO_PERSISTENT_OBJID    1ULL
+#define ECHO_PERSISTENT_SIZE     ((__u64)(1<<20))
+
+/* block size to use for data verification */
+#define OBD_ECHO_BLOCK_SIZE    (4<<10)
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdecho/lproc_echo.c b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c
new file mode 100644 (file)
index 0000000..e23ed32
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include <lprocfs_status.h>
+#include <obd_class.h>
+
+#ifdef LPROCFS
+static struct lprocfs_vars lprocfs_echo_obd_vars[] = {
+       { "uuid",        lprocfs_rd_uuid,       0, 0 },
+       { 0 }
+};
+
+static struct lprocfs_vars lprocfs_echo_module_vars[] = {
+       { "num_refs",     lprocfs_rd_numrefs,     0, 0 },
+       { 0 }
+};
+
+void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+    lvars->module_vars  = lprocfs_echo_module_vars;
+    lvars->obd_vars     = lprocfs_echo_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile
new file mode 100644 (file)
index 0000000..bbd2f77
--- /dev/null
@@ -0,0 +1,7 @@
+obj-$(CONFIG_LUSTRE_FS) += osc.o
+osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o \
+        osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o
+
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c
new file mode 100644 (file)
index 0000000..016ad02
--- /dev/null
@@ -0,0 +1,715 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <linux/seq_file.h>
+#include "osc_internal.h"
+
+#ifdef LPROCFS
+static int osc_rd_active(char *page, char **start, off_t off,
+                        int count, int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       int rc;
+
+       LPROCFS_CLIMP_CHECK(dev);
+       rc = snprintf(page, count, "%d\n", !dev->u.cli.cl_import->imp_deactive);
+       LPROCFS_CLIMP_EXIT(dev);
+       return rc;
+}
+
+static int osc_wr_active(struct file *file, const char *buffer,
+                        unsigned long count, void *data)
+{
+       struct obd_device *dev = data;
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+       if (val < 0 || val > 1)
+               return -ERANGE;
+
+       /* imp_deactive has the opposite sense of "active", so equality
+        * here means the import state actually needs to change */
+       if (dev->u.cli.cl_import->imp_deactive == val)
+               rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val);
+       else
+               CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", val);
+
+       return count;
+}
+
+static int osc_rd_max_rpcs_in_flight(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = snprintf(page, count, "%u\n", cli->cl_max_rpcs_in_flight);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static int osc_wr_max_rpcs_in_flight(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool;
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val < 1 || val > OSC_MAX_RIF_MAX)
+               return -ERANGE;
+
+       LPROCFS_CLIMP_CHECK(dev);
+       if (pool && val > cli->cl_max_rpcs_in_flight)
+               pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_max_rpcs_in_flight = val;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       LPROCFS_CLIMP_EXIT(dev);
+       return count;
+}
+
+static int osc_rd_max_dirty_mb(char *page, char **start, off_t off, int count,
+                              int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       long val;
+       int mult;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       val = cli->cl_dirty_max;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       mult = 1 << 20;
+       return lprocfs_read_frac_helper(page, count, val, mult);
+}
+
+static int osc_wr_max_dirty_mb(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int pages_number, mult, rc;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       if (pages_number <= 0 ||
+           pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT) ||
+           pages_number > num_physpages / 4) /* 1/4 of RAM */
+               return -ERANGE;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_dirty_max = (obd_count)(pages_number << PAGE_CACHE_SHIFT);
+       osc_wake_cache_waiters(cli);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return count;
+}
+
+static int osc_rd_cached_mb(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int shift = 20 - PAGE_CACHE_SHIFT;
+       int rc;
+
+       rc = snprintf(page, count,
+                     "used_mb: %d\n"
+                     "busy_cnt: %d\n",
+                     (atomic_read(&cli->cl_lru_in_list) +
+                       atomic_read(&cli->cl_lru_busy)) >> shift,
+                     atomic_read(&cli->cl_lru_busy));
+
+       return rc;
+}
+
+/* shrink the number of cached pages to a specific number */
+static int osc_wr_cached_mb(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int pages_number, mult, rc;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       buffer = lprocfs_find_named_value(buffer, "used_mb:", &count);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       if (pages_number < 0)
+               return -ERANGE;
+
+       rc = atomic_read(&cli->cl_lru_in_list) - pages_number;
+       if (rc > 0)
+               (void)osc_lru_shrink(cli, rc);
+
+       return count;
+}
+
+static int osc_rd_cur_dirty_bytes(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = snprintf(page, count, "%lu\n", cli->cl_dirty);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static int osc_rd_cur_grant_bytes(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = snprintf(page, count, "%lu\n", cli->cl_avail_grant);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static int osc_wr_cur_grant_bytes(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       struct client_obd *cli = &obd->u.cli;
+       int             rc;
+       __u64         val;
+
+       if (obd == NULL)
+               return 0;
+
+       rc = lprocfs_write_u64_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       /* this is only for shrinking grant */
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (val >= cli->cl_avail_grant) {
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               return 0;
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       LPROCFS_CLIMP_CHECK(obd);
+       if (cli->cl_import->imp_state == LUSTRE_IMP_FULL)
+               rc = osc_shrink_grant_to_target(cli, val);
+       LPROCFS_CLIMP_EXIT(obd);
+       if (rc)
+               return rc;
+       return count;
+}
+
+static int osc_rd_cur_lost_grant_bytes(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = snprintf(page, count, "%lu\n", cli->cl_lost_grant);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static int osc_rd_grant_shrink_interval(char *page, char **start, off_t off,
+                                       int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+
+       if (obd == NULL)
+               return 0;
+       return snprintf(page, count, "%d\n",
+                       obd->u.cli.cl_grant_shrink_interval);
+}
+
+static int osc_wr_grant_shrink_interval(struct file *file, const char *buffer,
+                                       unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       int val, rc;
+
+       if (obd == NULL)
+               return 0;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val <= 0)
+               return -ERANGE;
+
+       obd->u.cli.cl_grant_shrink_interval = val;
+
+       return count;
+}
+
+static int osc_rd_checksum(char *page, char **start, off_t off, int count,
+                          int *eof, void *data)
+{
+       struct obd_device *obd = data;
+
+       if (obd == NULL)
+               return 0;
+
+       return snprintf(page, count, "%d\n",
+                       obd->u.cli.cl_checksum ? 1 : 0);
+}
+
+static int osc_wr_checksum(struct file *file, const char *buffer,
+                          unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       int val, rc;
+
+       if (obd == NULL)
+               return 0;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       obd->u.cli.cl_checksum = (val ? 1 : 0);
+
+       return count;
+}
+
+static int osc_rd_checksum_type(char *page, char **start, off_t off, int count,
+                               int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       int i, len = 0;
+       DECLARE_CKSUM_NAME;
+
+       if (obd == NULL)
+               return 0;
+
+       for (i = 0; i < ARRAY_SIZE(cksum_name) && len < count; i++) {
+               if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
+                       continue;
+               if (obd->u.cli.cl_cksum_type == (1 << i))
+                       len += snprintf(page + len, count - len, "[%s] ",
+                                       cksum_name[i]);
+               else
+                       len += snprintf(page + len, count - len, "%s ",
+                                       cksum_name[i]);
+       }
+       if (len < count)
+               len += sprintf(page + len, "\n");
+       return len;
+}
+
+static int osc_wd_checksum_type(struct file *file, const char *buffer,
+                               unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       int i;
+       DECLARE_CKSUM_NAME;
+       char kernbuf[10];
+
+       if (obd == NULL)
+               return 0;
+
+       if (count > sizeof(kernbuf) - 1)
+               return -EINVAL;
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+       if (count > 0 && kernbuf[count - 1] == '\n')
+               kernbuf[count - 1] = '\0';
+       else
+               kernbuf[count] = '\0';
+
+       for (i = 0; i < ARRAY_SIZE(cksum_name); i++) {
+               if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
+                       continue;
+               if (!strcmp(kernbuf, cksum_name[i])) {
+                      obd->u.cli.cl_cksum_type = 1 << i;
+                      return count;
+               }
+       }
+       return -EINVAL;
+}
+
+static int osc_rd_resend_count(char *page, char **start, off_t off, int count,
+                              int *eof, void *data)
+{
+       struct obd_device *obd = data;
+
+       return snprintf(page, count, "%u\n",
+                       atomic_read(&obd->u.cli.cl_resends));
+}
+
+static int osc_wr_resend_count(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val < 0)
+              return -EINVAL;
+
+       atomic_set(&obd->u.cli.cl_resends, val);
+
+       return count;
+}
+
+static int osc_rd_contention_seconds(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct osc_device *od  = obd2osc_dev(obd);
+
+       return snprintf(page, count, "%u\n", od->od_contention_time);
+}
+
+static int osc_wr_contention_seconds(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       struct osc_device *od  = obd2osc_dev(obd);
+
+       return lprocfs_write_helper(buffer, count, &od->od_contention_time) ?:
+               count;
+}
+
+static int osc_rd_lockless_truncate(char *page, char **start, off_t off,
+                                   int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct osc_device *od  = obd2osc_dev(obd);
+
+       return snprintf(page, count, "%u\n", od->od_lockless_truncate);
+}
+
+static int osc_wr_lockless_truncate(struct file *file, const char *buffer,
+                                   unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       struct osc_device *od  = obd2osc_dev(obd);
+
+       return lprocfs_write_helper(buffer, count, &od->od_lockless_truncate) ?:
+               count;
+}
+
+static int osc_rd_destroys_in_flight(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       return snprintf(page, count, "%u\n",
+                       atomic_read(&obd->u.cli.cl_destroy_in_flight));
+}
+
+static int lprocfs_osc_wr_max_pages_per_rpc(struct file *file,
+       const char *buffer, unsigned long count, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data;
+       int chunk_mask, rc;
+       __u64 val;
+
+       rc = lprocfs_write_u64_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       /* if the max_pages is specified in bytes, convert to pages */
+       if (val >= ONE_MB_BRW_SIZE)
+               val >>= PAGE_CACHE_SHIFT;
+
+       LPROCFS_CLIMP_CHECK(dev);
+
+       chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1);
+       /* max_pages_per_rpc must be chunk aligned */
+       val = (val + ~chunk_mask) & chunk_mask;
+       if (val == 0 || val > ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) {
+               LPROCFS_CLIMP_EXIT(dev);
+               return -ERANGE;
+       }
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_max_pages_per_rpc = val;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       LPROCFS_CLIMP_EXIT(dev);
+       return count;
+}
+
+static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
+       { "uuid",           lprocfs_rd_uuid,    0, 0 },
+       { "ping",           0, lprocfs_wr_ping,     0, 0, 0222 },
+       { "connect_flags",   lprocfs_rd_connect_flags, 0, 0 },
+       { "blocksize",       lprocfs_rd_blksize,     0, 0 },
+       { "kbytestotal",     lprocfs_rd_kbytestotal, 0, 0 },
+       { "kbytesfree",      lprocfs_rd_kbytesfree,  0, 0 },
+       { "kbytesavail",     lprocfs_rd_kbytesavail, 0, 0 },
+       { "filestotal",      lprocfs_rd_filestotal,  0, 0 },
+       { "filesfree",       lprocfs_rd_filesfree,   0, 0 },
+       //{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },
+       { "ost_server_uuid", lprocfs_rd_server_uuid, 0, 0 },
+       { "ost_conn_uuid",   lprocfs_rd_conn_uuid, 0, 0 },
+       { "active",       osc_rd_active,
+                            osc_wr_active, 0 },
+       { "max_pages_per_rpc", lprocfs_obd_rd_max_pages_per_rpc,
+                              lprocfs_osc_wr_max_pages_per_rpc, 0 },
+       { "max_rpcs_in_flight", osc_rd_max_rpcs_in_flight,
+                               osc_wr_max_rpcs_in_flight, 0 },
+       { "destroys_in_flight", osc_rd_destroys_in_flight, 0, 0 },
+       { "max_dirty_mb",    osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 },
+       { "osc_cached_mb",   osc_rd_cached_mb,     osc_wr_cached_mb, 0 },
+       { "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 },
+       { "cur_grant_bytes", osc_rd_cur_grant_bytes,
+                            osc_wr_cur_grant_bytes, 0 },
+       { "cur_lost_grant_bytes", osc_rd_cur_lost_grant_bytes, 0, 0},
+       { "grant_shrink_interval", osc_rd_grant_shrink_interval,
+                                  osc_wr_grant_shrink_interval, 0 },
+       { "checksums",       osc_rd_checksum, osc_wr_checksum, 0 },
+       { "checksum_type",   osc_rd_checksum_type, osc_wd_checksum_type, 0 },
+       { "resend_count",    osc_rd_resend_count, osc_wr_resend_count, 0},
+       { "timeouts",   lprocfs_rd_timeouts,      0, 0 },
+       { "contention_seconds", osc_rd_contention_seconds,
+                               osc_wr_contention_seconds, 0 },
+       { "lockless_truncate",  osc_rd_lockless_truncate,
+                               osc_wr_lockless_truncate, 0 },
+       { "import",       lprocfs_rd_import,    lprocfs_wr_import, 0 },
+       { "state",         lprocfs_rd_state,     0, 0 },
+       { "pinger_recov",    lprocfs_rd_pinger_recov,
+                            lprocfs_wr_pinger_recov,  0, 0 },
+       { 0 }
+};
+
+static struct lprocfs_vars lprocfs_osc_module_vars[] = {
+       { "num_refs",   lprocfs_rd_numrefs,     0, 0 },
+       { 0 }
+};
+
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
+
+static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct obd_device *dev = seq->private;
+       struct client_obd *cli = &dev->u.cli;
+       unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
+       int i;
+
+       do_gettimeofday(&now);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+
+       seq_printf(seq, "snapshot_time:  %lu.%lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+       seq_printf(seq, "read RPCs in flight:  %d\n",
+                  cli->cl_r_in_flight);
+       seq_printf(seq, "write RPCs in flight: %d\n",
+                  cli->cl_w_in_flight);
+       seq_printf(seq, "pending write pages:  %d\n",
+                  atomic_read(&cli->cl_pending_w_pages));
+       seq_printf(seq, "pending read pages:   %d\n",
+                  atomic_read(&cli->cl_pending_r_pages));
+
+       seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+       seq_printf(seq, "pages per rpc   rpcs   %% cum %% |");
+       seq_printf(seq, "       rpcs   %% cum %%\n");
+
+       read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist);
+       write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist);
+
+       read_cum = 0;
+       write_cum = 0;
+       for (i = 0; i < OBD_HIST_MAX; i++) {
+               unsigned long r = cli->cl_read_page_hist.oh_buckets[i];
+               unsigned long w = cli->cl_write_page_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+                                1 << i, r, pct(r, read_tot),
+                                pct(read_cum, read_tot), w,
+                                pct(w, write_tot),
+                                pct(write_cum, write_tot));
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+
+       seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+       seq_printf(seq, "rpcs in flight rpcs   %% cum %% |");
+       seq_printf(seq, "       rpcs   %% cum %%\n");
+
+       read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist);
+       write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist);
+
+       read_cum = 0;
+       write_cum = 0;
+       for (i = 0; i < OBD_HIST_MAX; i++) {
+               unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i];
+               unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+                                i, r, pct(r, read_tot),
+                                pct(read_cum, read_tot), w,
+                                pct(w, write_tot),
+                                pct(write_cum, write_tot));
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+
+       seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+       seq_printf(seq, "offset         rpcs   %% cum %% |");
+       seq_printf(seq, "       rpcs   %% cum %%\n");
+
+       read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist);
+       write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist);
+
+       read_cum = 0;
+       write_cum = 0;
+       for (i = 0; i < OBD_HIST_MAX; i++) {
+               unsigned long r = cli->cl_read_offset_hist.oh_buckets[i];
+               unsigned long w = cli->cl_write_offset_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+                          (i == 0) ? 0 : 1 << (i - 1),
+                          r, pct(r, read_tot), pct(read_cum, read_tot),
+                          w, pct(w, write_tot), pct(write_cum, write_tot));
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return 0;
+}
+#undef pct
+
+static ssize_t osc_rpc_stats_seq_write(struct file *file, const char *buf,
+                                      size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct obd_device *dev = seq->private;
+       struct client_obd *cli = &dev->u.cli;
+
+       lprocfs_oh_clear(&cli->cl_read_rpc_hist);
+       lprocfs_oh_clear(&cli->cl_write_rpc_hist);
+       lprocfs_oh_clear(&cli->cl_read_page_hist);
+       lprocfs_oh_clear(&cli->cl_write_page_hist);
+       lprocfs_oh_clear(&cli->cl_read_offset_hist);
+       lprocfs_oh_clear(&cli->cl_write_offset_hist);
+
+       return len;
+}
+
+LPROC_SEQ_FOPS(osc_rpc_stats);
+
+static int osc_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct obd_device *dev = seq->private;
+       struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
+
+       do_gettimeofday(&now);
+
+       seq_printf(seq, "snapshot_time:  %lu.%lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+       seq_printf(seq, "lockless_write_bytes\t\t"LPU64"\n",
+                  stats->os_lockless_writes);
+       seq_printf(seq, "lockless_read_bytes\t\t"LPU64"\n",
+                  stats->os_lockless_reads);
+       seq_printf(seq, "lockless_truncate\t\t"LPU64"\n",
+                  stats->os_lockless_truncates);
+       return 0;
+}
+
+static ssize_t osc_stats_seq_write(struct file *file, const char *buf,
+                                  size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct obd_device *dev = seq->private;
+       struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
+
+       memset(stats, 0, sizeof(*stats));
+       return len;
+}
+
+LPROC_SEQ_FOPS(osc_stats);
+
+int lproc_osc_attach_seqstat(struct obd_device *dev)
+{
+       int rc;
+
+       rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644,
+                               &osc_stats_fops, dev);
+       if (rc == 0)
+               rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0644,
+                                           &osc_rpc_stats_fops, dev);
+
+       return rc;
+}
+
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars = lprocfs_osc_module_vars;
+       lvars->obd_vars    = lprocfs_osc_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
new file mode 100644 (file)
index 0000000..206fead
--- /dev/null
@@ -0,0 +1,3002 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * osc cache management.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+#include "osc_internal.h"
+
+static int extent_debug; /* set it to be true for more debug */
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+                          int state);
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+                             struct osc_async_page *oap, int sent, int rc);
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+                         int cmd);
+static int osc_refresh_count(const struct lu_env *env,
+                            struct osc_async_page *oap, int cmd);
+static int osc_io_unplug_async(const struct lu_env *env,
+                              struct client_obd *cli, struct osc_object *osc);
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+                          unsigned int lost_grant);
+
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+                                 const char *func, int line);
+#define osc_extent_tree_dump(lvl, obj) \
+       osc_extent_tree_dump0(lvl, obj, __func__, __LINE__)
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/* ------------------ osc extent ------------------ */
+static inline char *ext_flags(struct osc_extent *ext, char *flags)
+{
+       char *buf = flags;
+       *buf++ = ext->oe_rw ? 'r' : 'w';
+       if (ext->oe_intree)
+               *buf++ = 'i';
+       if (ext->oe_srvlock)
+               *buf++ = 's';
+       if (ext->oe_hp)
+               *buf++ = 'h';
+       if (ext->oe_urgent)
+               *buf++ = 'u';
+       if (ext->oe_memalloc)
+               *buf++ = 'm';
+       if (ext->oe_trunc_pending)
+               *buf++ = 't';
+       if (ext->oe_fsync_wait)
+               *buf++ = 'Y';
+       *buf = 0;
+       return flags;
+}
+
+static inline char list_empty_marker(struct list_head *list)
+{
+       return list_empty(list) ? '-' : '+';
+}
+
+#define EXTSTR       "[%lu -> %lu/%lu]"
+#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end
+
+#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do {                          \
+       struct osc_extent *__ext = (extent);                                  \
+       const char *__str[] = OES_STRINGS;                                    \
+       char __buf[16];                                                       \
+                                                                             \
+       CDEBUG(lvl,                                                           \
+               "extent %p@{" EXTSTR ", "                                     \
+               "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt,          \
+               /* ----- extent part 0 ----- */                               \
+               __ext, EXTPARA(__ext),                                        \
+               /* ----- part 1 ----- */                                      \
+               atomic_read(&__ext->oe_refc),                         \
+               atomic_read(&__ext->oe_users),                        \
+               list_empty_marker(&__ext->oe_link),                           \
+               __str[__ext->oe_state], ext_flags(__ext, __buf),              \
+               __ext->oe_obj,                                                \
+               /* ----- part 2 ----- */                                      \
+               __ext->oe_grants, __ext->oe_nr_pages,                         \
+               list_empty_marker(&__ext->oe_pages),                          \
+               waitqueue_active(&__ext->oe_waitq) ? '+' : '-',               \
+               __ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner,           \
+               /* ----- part 4 ----- */                                      \
+               ## __VA_ARGS__);                                              \
+} while (0)
+
+#undef EASSERTF
+#define EASSERTF(expr, ext, fmt, args...) do {                         \
+       if (!(expr)) {                                                  \
+               OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args);            \
+               osc_extent_tree_dump(D_ERROR, (ext)->oe_obj);            \
+               LASSERT(expr);                                          \
+       }                                                                    \
+} while (0)
+
+#undef EASSERT
+#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n")
+
+static inline struct osc_extent *rb_extent(struct rb_node *n)
+{
+       if (n == NULL)
+               return NULL;
+
+       return container_of(n, struct osc_extent, oe_node);
+}
+
+static inline struct osc_extent *next_extent(struct osc_extent *ext)
+{
+       if (ext == NULL)
+               return NULL;
+
+       LASSERT(ext->oe_intree);
+       return rb_extent(rb_next(&ext->oe_node));
+}
+
+static inline struct osc_extent *prev_extent(struct osc_extent *ext)
+{
+       if (ext == NULL)
+               return NULL;
+
+       LASSERT(ext->oe_intree);
+       return rb_extent(rb_prev(&ext->oe_node));
+}
+
+static inline struct osc_extent *first_extent(struct osc_object *obj)
+{
+       return rb_extent(rb_first(&obj->oo_root));
+}
+
+/* object must be locked by caller. */
+static int osc_extent_sanity_check0(struct osc_extent *ext,
+                                   const char *func, const int line)
+{
+       struct osc_object *obj = ext->oe_obj;
+       struct osc_async_page *oap;
+       int page_count;
+       int rc = 0;
+
+       if (!osc_object_is_locked(obj))
+               GOTO(out, rc = 9);
+
+       if (ext->oe_state >= OES_STATE_MAX)
+               GOTO(out, rc = 10);
+
+       if (atomic_read(&ext->oe_refc) <= 0)
+               GOTO(out, rc = 20);
+
+       if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users))
+               GOTO(out, rc = 30);
+
+       switch (ext->oe_state) {
+       case OES_INV:
+               if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages))
+                       GOTO(out, rc = 35);
+               GOTO(out, rc = 0);
+               break;
+       case OES_ACTIVE:
+               if (atomic_read(&ext->oe_users) == 0)
+                       GOTO(out, rc = 40);
+               if (ext->oe_hp)
+                       GOTO(out, rc = 50);
+               if (ext->oe_fsync_wait && !ext->oe_urgent)
+                       GOTO(out, rc = 55);
+               break;
+       case OES_CACHE:
+               if (ext->oe_grants == 0)
+                       GOTO(out, rc = 60);
+               if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp)
+                       GOTO(out, rc = 65);
+       default:
+               if (atomic_read(&ext->oe_users) > 0)
+                       GOTO(out, rc = 70);
+       }
+
+       if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start)
+               GOTO(out, rc = 80);
+
+       if (ext->oe_osclock == NULL && ext->oe_grants > 0)
+               GOTO(out, rc = 90);
+
+       if (ext->oe_osclock) {
+               struct cl_lock_descr *descr;
+               descr = &ext->oe_osclock->cll_descr;
+               if (!(descr->cld_start <= ext->oe_start &&
+                     descr->cld_end >= ext->oe_max_end))
+                       GOTO(out, rc = 100);
+       }
+
+       if (ext->oe_nr_pages > ext->oe_mppr)
+               GOTO(out, rc = 105);
+
+       /* Do not verify page list if extent is in RPC. This is because an
+        * in-RPC extent is supposed to be exclusively accessible w/o lock. */
+       if (ext->oe_state > OES_CACHE)
+               GOTO(out, rc = 0);
+
+       if (!extent_debug)
+               GOTO(out, rc = 0);
+
+       page_count = 0;
+       list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+               pgoff_t index = oap2cl_page(oap)->cp_index;
+               ++page_count;
+               if (index > ext->oe_end || index < ext->oe_start)
+                       GOTO(out, rc = 110);
+       }
+       if (page_count != ext->oe_nr_pages)
+               GOTO(out, rc = 120);
+
+out:
+       if (rc != 0)
+               OSC_EXTENT_DUMP(D_ERROR, ext,
+                               "%s:%d sanity check %p failed with rc = %d\n",
+                               func, line, ext, rc);
+       return rc;
+}
+
+#define sanity_check_nolock(ext) \
+       osc_extent_sanity_check0(ext, __func__, __LINE__)
+
+#define sanity_check(ext) ({                                              \
+       int __res;                                                           \
+       osc_object_lock((ext)->oe_obj);                                 \
+       __res = sanity_check_nolock(ext);                                     \
+       osc_object_unlock((ext)->oe_obj);                                     \
+       __res;                                                           \
+})
+
+
+/**
+ * sanity check - to make sure there are no overlapping extents in the tree.
+ */
+static int osc_extent_is_overlapped(struct osc_object *obj,
+                                   struct osc_extent *ext)
+{
+       struct osc_extent *tmp;
+
+       LASSERT(osc_object_is_locked(obj));
+
+       if (!extent_debug)
+               return 0;
+
+       for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) {
+               if (tmp == ext)
+                       continue;
+               if (tmp->oe_end >= ext->oe_start &&
+                   tmp->oe_start <= ext->oe_end)
+                       return 1;
+       }
+       return 0;
+}
+
+static void osc_extent_state_set(struct osc_extent *ext, int state)
+{
+       LASSERT(osc_object_is_locked(ext->oe_obj));
+       LASSERT(state >= OES_INV && state < OES_STATE_MAX);
+
+       /* Never try to sanity check a state changing extent :-) */
+       /* LASSERT(sanity_check_nolock(ext) == 0); */
+
+       /* TODO: validate the state machine */
+       ext->oe_state = state;
+       wake_up_all(&ext->oe_waitq);
+}
+
+static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
+{
+       struct osc_extent *ext;
+
+       OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS);
+       if (ext == NULL)
+               return NULL;
+
+       RB_CLEAR_NODE(&ext->oe_node);
+       ext->oe_obj = obj;
+       atomic_set(&ext->oe_refc, 1);
+       atomic_set(&ext->oe_users, 0);
+       INIT_LIST_HEAD(&ext->oe_link);
+       ext->oe_state = OES_INV;
+       INIT_LIST_HEAD(&ext->oe_pages);
+       init_waitqueue_head(&ext->oe_waitq);
+       ext->oe_osclock = NULL;
+
+       return ext;
+}
+
+static void osc_extent_free(struct osc_extent *ext)
+{
+       OBD_SLAB_FREE_PTR(ext, osc_extent_kmem);
+}
+
+static struct osc_extent *osc_extent_get(struct osc_extent *ext)
+{
+       LASSERT(atomic_read(&ext->oe_refc) >= 0);
+       atomic_inc(&ext->oe_refc);
+       return ext;
+}
+
+static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext)
+{
+       LASSERT(atomic_read(&ext->oe_refc) > 0);
+       if (atomic_dec_and_test(&ext->oe_refc)) {
+               LASSERT(list_empty(&ext->oe_link));
+               LASSERT(atomic_read(&ext->oe_users) == 0);
+               LASSERT(ext->oe_state == OES_INV);
+               LASSERT(!ext->oe_intree);
+
+               if (ext->oe_osclock) {
+                       cl_lock_put(env, ext->oe_osclock);
+                       ext->oe_osclock = NULL;
+               }
+               osc_extent_free(ext);
+       }
+}
+
+/**
+ * osc_extent_put_trust() is a special version of osc_extent_put() for use
+ * when it's known that the caller is not the last user. This is to address
+ * the problem of lacking a lu_env ;-).
+ */
+static void osc_extent_put_trust(struct osc_extent *ext)
+{
+       LASSERT(atomic_read(&ext->oe_refc) > 1);
+       LASSERT(osc_object_is_locked(ext->oe_obj));
+       atomic_dec(&ext->oe_refc);
+}
+
+/**
+ * Return the extent which includes pgoff @index, or return the greatest
+ * previous extent in the tree.
+ */
+static struct osc_extent *osc_extent_search(struct osc_object *obj,
+                                           pgoff_t index)
+{
+       struct rb_node    *n = obj->oo_root.rb_node;
+       struct osc_extent *tmp, *p = NULL;
+
+       LASSERT(osc_object_is_locked(obj));
+       while (n != NULL) {
+               tmp = rb_extent(n);
+               if (index < tmp->oe_start) {
+                       n = n->rb_left;
+               } else if (index > tmp->oe_end) {
+                       p = rb_extent(n);
+                       n = n->rb_right;
+               } else {
+                       return tmp;
+               }
+       }
+       return p;
+}
+
+/*
+ * Return the extent covering @index, otherwise return NULL.
+ * The caller must hold the object lock.
+ */
+static struct osc_extent *osc_extent_lookup(struct osc_object *obj,
+                                           pgoff_t index)
+{
+       struct osc_extent *ext;
+
+       ext = osc_extent_search(obj, index);
+       if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end)
+               return osc_extent_get(ext);
+       return NULL;
+}
+
+/* The caller must hold the object lock. */
+static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext)
+{
+       struct rb_node   **n      = &obj->oo_root.rb_node;
+       struct rb_node    *parent = NULL;
+       struct osc_extent *tmp;
+
+       LASSERT(ext->oe_intree == 0);
+       LASSERT(ext->oe_obj == obj);
+       LASSERT(osc_object_is_locked(obj));
+       while (*n != NULL) {
+               tmp = rb_extent(*n);
+               parent = *n;
+
+               if (ext->oe_end < tmp->oe_start)
+                       n = &(*n)->rb_left;
+               else if (ext->oe_start > tmp->oe_end)
+                       n = &(*n)->rb_right;
+               else
+                       EASSERTF(0, tmp, EXTSTR, EXTPARA(ext));
+       }
+       rb_link_node(&ext->oe_node, parent, n);
+       rb_insert_color(&ext->oe_node, &obj->oo_root);
+       osc_extent_get(ext);
+       ext->oe_intree = 1;
+}
+
+/* The caller must hold the object lock. */
+static void osc_extent_erase(struct osc_extent *ext)
+{
+       struct osc_object *obj = ext->oe_obj;
+       LASSERT(osc_object_is_locked(obj));
+       if (ext->oe_intree) {
+               rb_erase(&ext->oe_node, &obj->oo_root);
+               ext->oe_intree = 0;
+               /* rbtree held a refcount */
+               osc_extent_put_trust(ext);
+       }
+}
+
+static struct osc_extent *osc_extent_hold(struct osc_extent *ext)
+{
+       struct osc_object *obj = ext->oe_obj;
+
+       LASSERT(osc_object_is_locked(obj));
+       LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE);
+       if (ext->oe_state == OES_CACHE) {
+               osc_extent_state_set(ext, OES_ACTIVE);
+               osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages);
+       }
+       atomic_inc(&ext->oe_users);
+       list_del_init(&ext->oe_link);
+       return osc_extent_get(ext);
+}
+
+static void __osc_extent_remove(struct osc_extent *ext)
+{
+       LASSERT(osc_object_is_locked(ext->oe_obj));
+       LASSERT(list_empty(&ext->oe_pages));
+       osc_extent_erase(ext);
+       list_del_init(&ext->oe_link);
+       osc_extent_state_set(ext, OES_INV);
+       OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n");
+}
+
+static void osc_extent_remove(struct osc_extent *ext)
+{
+       struct osc_object *obj = ext->oe_obj;
+
+       osc_object_lock(obj);
+       __osc_extent_remove(ext);
+       osc_object_unlock(obj);
+}
+
+/**
+ * This function is used to merge extents to get better performance. It checks
+ * if @cur and @victim are contiguous at chunk level.
+ */
+static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
+                           struct osc_extent *victim)
+{
+       struct osc_object *obj = cur->oe_obj;
+       pgoff_t chunk_start;
+       pgoff_t chunk_end;
+       int ppc_bits;
+
+       LASSERT(cur->oe_state == OES_CACHE);
+       LASSERT(osc_object_is_locked(obj));
+       if (victim == NULL)
+               return -EINVAL;
+
+       if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)
+               return -EBUSY;
+
+       if (cur->oe_max_end != victim->oe_max_end)
+               return -ERANGE;
+
+       LASSERT(cur->oe_osclock == victim->oe_osclock);
+       ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+       chunk_start = cur->oe_start >> ppc_bits;
+       chunk_end   = cur->oe_end   >> ppc_bits;
+       if (chunk_start   != (victim->oe_end >> ppc_bits) + 1 &&
+           chunk_end + 1 != victim->oe_start >> ppc_bits)
+               return -ERANGE;
+
+       OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur);
+
+       cur->oe_start     = min(cur->oe_start, victim->oe_start);
+       cur->oe_end       = max(cur->oe_end,   victim->oe_end);
+       cur->oe_grants   += victim->oe_grants;
+       cur->oe_nr_pages += victim->oe_nr_pages;
+       /* only the following bits are needed to merge */
+       cur->oe_urgent   |= victim->oe_urgent;
+       cur->oe_memalloc |= victim->oe_memalloc;
+       list_splice_init(&victim->oe_pages, &cur->oe_pages);
+       list_del_init(&victim->oe_link);
+       victim->oe_nr_pages = 0;
+
+       osc_extent_get(victim);
+       __osc_extent_remove(victim);
+       osc_extent_put(env, victim);
+
+       OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim);
+       return 0;
+}
+
+/**
+ * Drop user count of osc_extent, and unplug IO asynchronously.
+ */
+int osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
+{
+       struct osc_object *obj = ext->oe_obj;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(atomic_read(&ext->oe_users) > 0);
+       LASSERT(sanity_check(ext) == 0);
+       LASSERT(ext->oe_grants > 0);
+
+       if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
+               LASSERT(ext->oe_state == OES_ACTIVE);
+               if (ext->oe_trunc_pending) {
+                       /* a truncate process is waiting for this extent.
+                        * This may happen due to a race, check
+                        * osc_cache_truncate_start(). */
+                       osc_extent_state_set(ext, OES_TRUNC);
+                       ext->oe_trunc_pending = 0;
+               } else {
+                       osc_extent_state_set(ext, OES_CACHE);
+                       osc_update_pending(obj, OBD_BRW_WRITE,
+                                          ext->oe_nr_pages);
+
+                       /* try to merge the previous and next extent. */
+                       osc_extent_merge(env, ext, prev_extent(ext));
+                       osc_extent_merge(env, ext, next_extent(ext));
+
+                       if (ext->oe_urgent)
+                               list_move_tail(&ext->oe_link,
+                                                  &obj->oo_urgent_exts);
+               }
+               osc_object_unlock(obj);
+
+               osc_io_unplug_async(env, osc_cli(obj), obj);
+       }
+       osc_extent_put(env, ext);
+       RETURN(rc);
+}
+
+static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
+{
+       return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start);
+}
+
+/**
+ * Find or create an extent which includes @index; this is the core function
+ * for managing the extent tree.
+ */
+struct osc_extent *osc_extent_find(const struct lu_env *env,
+                                  struct osc_object *obj, pgoff_t index,
+                                  int *grants)
+
+{
+       struct client_obd *cli = osc_cli(obj);
+       struct cl_lock    *lock;
+       struct osc_extent *cur;
+       struct osc_extent *ext;
+       struct osc_extent *conflict = NULL;
+       struct osc_extent *found = NULL;
+       pgoff_t    chunk;
+       pgoff_t    max_end;
+       int     max_pages; /* max_pages_per_rpc */
+       int     chunksize;
+       int     ppc_bits; /* pages per chunk bits */
+       int     chunk_mask;
+       int     rc;
+       ENTRY;
+
+       cur = osc_extent_alloc(obj);
+       if (cur == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0);
+       LASSERT(lock != NULL);
+       LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+
+       LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
+       ppc_bits   = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+       chunk_mask = ~((1 << ppc_bits) - 1);
+       chunksize  = 1 << cli->cl_chunkbits;
+       chunk      = index >> ppc_bits;
+
+       /* align end to the RPC edge; the RPC size may not be a power of 2. */
+       max_pages = cli->cl_max_pages_per_rpc;
+       LASSERT((max_pages & ~chunk_mask) == 0);
+       max_end = index - (index % max_pages) + max_pages - 1;
+       max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end);
+
+       /* initialize new extent by parameters so far */
+       cur->oe_max_end = max_end;
+       cur->oe_start   = index & chunk_mask;
+       cur->oe_end     = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
+       if (cur->oe_start < lock->cll_descr.cld_start)
+               cur->oe_start = lock->cll_descr.cld_start;
+       if (cur->oe_end > max_end)
+               cur->oe_end = max_end;
+       cur->oe_osclock = lock;
+       cur->oe_grants  = 0;
+       cur->oe_mppr    = max_pages;
+
+       /* grants have been allocated by the caller */
+       LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
+                "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
+       LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur));
+
+restart:
+       osc_object_lock(obj);
+       ext = osc_extent_search(obj, cur->oe_start);
+       if (ext == NULL)
+               ext = first_extent(obj);
+       while (ext != NULL) {
+               loff_t ext_chk_start = ext->oe_start >> ppc_bits;
+               loff_t ext_chk_end   = ext->oe_end   >> ppc_bits;
+
+               LASSERT(sanity_check_nolock(ext) == 0);
+               if (chunk > ext_chk_end + 1)
+                       break;
+
+               /* if covered by different locks, no chance to match */
+               if (lock != ext->oe_osclock) {
+                       EASSERTF(!overlapped(ext, cur), ext,
+                                EXTSTR, EXTPARA(cur));
+
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               /* discontiguous chunks? */
+               if (chunk + 1 < ext_chk_start) {
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               /* ok, from now on, ext and cur have these attrs:
+                * 1. covered by the same lock
+                * 2. contiguous at chunk level or overlapping. */
+
+               if (overlapped(ext, cur)) {
+                       /* cur is the minimum unit, so overlapping means
+                        * full containment. */
+                       EASSERTF((ext->oe_start <= cur->oe_start &&
+                                 ext->oe_end >= cur->oe_end),
+                                ext, EXTSTR, EXTPARA(cur));
+
+                       if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
+                               /* for simplicity, we wait for this extent to
+                                * finish before going forward. */
+                               conflict = osc_extent_get(ext);
+                               break;
+                       }
+
+                       found = osc_extent_hold(ext);
+                       break;
+               }
+
+               /* non-overlapped extent */
+               if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
+                       /* we can't do anything for a non-OES_CACHE extent, or
+                        * if someone is waiting for this extent to be
+                        * flushed; try the next one. */
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               /* check if they belong to the same RPC slot before trying to
+                * merge. To get here, the extents must be non-overlapping and
+                * contiguous at chunk level. */
+               if (ext->oe_max_end != max_end) {
+                       /* if they don't belong to the same RPC slot or
+                        * max_pages_per_rpc has ever changed, do not merge. */
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               /* an extent is required to be contiguous at chunk level so
+                * that we know the whole extent is covered by grant (the
+                * pages in the extent are NOT required to be contiguous).
+                * Otherwise, it would be too difficult to know which chunks
+                * have grants allocated. */
+
+               /* try to do front merge - extend ext's start */
+               if (chunk + 1 == ext_chk_start) {
+                       /* ext must be chunk size aligned */
+                       EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);
+
+                       /* pull ext's start back to cover cur */
+                       ext->oe_start   = cur->oe_start;
+                       ext->oe_grants += chunksize;
+                       *grants -= chunksize;
+
+                       found = osc_extent_hold(ext);
+               } else if (chunk == ext_chk_end + 1) {
+                       /* rear merge */
+                       ext->oe_end     = cur->oe_end;
+                       ext->oe_grants += chunksize;
+                       *grants -= chunksize;
+
+                       /* try to merge with the next one because we just
+                        * filled in a gap */
+                       if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
+                               /* we can save extent tax from next extent */
+                               *grants += cli->cl_extent_tax;
+
+                       found = osc_extent_hold(ext);
+               }
+               if (found != NULL)
+                       break;
+
+               ext = next_extent(ext);
+       }
+
+       osc_extent_tree_dump(D_CACHE, obj);
+       if (found != NULL) {
+               LASSERT(conflict == NULL);
+               if (!IS_ERR(found)) {
+                       LASSERT(found->oe_osclock == cur->oe_osclock);
+                       OSC_EXTENT_DUMP(D_CACHE, found,
+                                       "found caching ext for %lu.\n", index);
+               }
+       } else if (conflict == NULL) {
+               /* create a new extent */
+               EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
+               cur->oe_grants = chunksize + cli->cl_extent_tax;
+               *grants -= cur->oe_grants;
+               LASSERT(*grants >= 0);
+
+               cur->oe_state = OES_CACHE;
+               found = osc_extent_hold(cur);
+               osc_extent_insert(obj, cur);
+               OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
+                               index, lock->cll_descr.cld_end);
+       }
+       osc_object_unlock(obj);
+
+       if (conflict != NULL) {
+               LASSERT(found == NULL);
+
+               /* wait for IO to finish. Note that it cannot be an
+                * OES_TRUNC extent. */
+               rc = osc_extent_wait(env, conflict, OES_INV);
+               osc_extent_put(env, conflict);
+               conflict = NULL;
+               if (rc < 0)
+                       GOTO(out, found = ERR_PTR(rc));
+
+               goto restart;
+       }
+       EXIT;
+
+out:
+       osc_extent_put(env, cur);
+       LASSERT(*grants >= 0);
+       return found;
+}
+
+/**
+ * Called when IO to an extent has finished.
+ */
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+                     int sent, int rc)
+{
+       struct client_obd *cli = osc_cli(ext->oe_obj);
+       struct osc_async_page *oap;
+       struct osc_async_page *tmp;
+       int nr_pages = ext->oe_nr_pages;
+       int lost_grant = 0;
+       int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096;
+       __u64 last_off = 0;
+       int last_count = -1;
+       ENTRY;
+
+       OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n");
+
+       ext->oe_rc = rc ?: ext->oe_nr_pages;
+       EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext);
+       list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+                                    oap_pending_item) {
+               list_del_init(&oap->oap_rpc_item);
+               list_del_init(&oap->oap_pending_item);
+               if (last_off <= oap->oap_obj_off) {
+                       last_off = oap->oap_obj_off;
+                       last_count = oap->oap_count;
+               }
+
+               --ext->oe_nr_pages;
+               osc_ap_completion(env, cli, oap, sent, rc);
+       }
+       EASSERT(ext->oe_nr_pages == 0, ext);
+
+       if (!sent) {
+               lost_grant = ext->oe_grants;
+       } else if (blocksize < PAGE_CACHE_SIZE &&
+                  last_count != PAGE_CACHE_SIZE) {
+               /* For short writes we shouldn't count parts of pages that
+                * span a whole chunk on the OST side, or our accounting goes
+                * wrong.  Should match the code in filter_grant_check. */
+               int offset = last_off & ~CFS_PAGE_MASK;
+               int count = last_count + (offset & (blocksize - 1));
+               int end = (offset + last_count) & (blocksize - 1);
+               if (end)
+                       count += blocksize - end;
+
+               lost_grant = PAGE_CACHE_SIZE - count;
+       }
+       if (ext->oe_grants > 0)
+               osc_free_grant(cli, nr_pages, lost_grant);
+
+       osc_extent_remove(ext);
+       /* put the refcount for RPC */
+       osc_extent_put(env, ext);
+       RETURN(0);
+}
+
+static int extent_wait_cb(struct osc_extent *ext, int state)
+{
+       int ret;
+
+       osc_object_lock(ext->oe_obj);
+       ret = ext->oe_state == state;
+       osc_object_unlock(ext->oe_obj);
+
+       return ret;
+}
+
+/**
+ * Wait for the extent's state to become @state.
+ */
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+                          int state)
+{
+       struct osc_object *obj = ext->oe_obj;
+       struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
+                                                 LWI_ON_SIGNAL_NOOP, NULL);
+       int rc = 0;
+       ENTRY;
+
+       osc_object_lock(obj);
+       LASSERT(sanity_check_nolock(ext) == 0);
+       /* `Kick' this extent only if the caller is waiting for it to be
+        * written out. */
+       if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) {
+               if (ext->oe_state == OES_ACTIVE) {
+                       ext->oe_urgent = 1;
+               } else if (ext->oe_state == OES_CACHE) {
+                       ext->oe_urgent = 1;
+                       osc_extent_hold(ext);
+                       rc = 1;
+               }
+       }
+       osc_object_unlock(obj);
+       if (rc == 1)
+               osc_extent_release(env, ext);
+
+       /* wait for the extent until its state becomes @state */
+       rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi);
+       if (rc == -ETIMEDOUT) {
+               OSC_EXTENT_DUMP(D_ERROR, ext,
+                       "%s: wait ext to %d timed out, recovery in progress?\n",
+                       osc_export(obj)->exp_obd->obd_name, state);
+
+               lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+               rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state),
+                                 &lwi);
+       }
+       if (rc == 0 && ext->oe_rc < 0)
+               rc = ext->oe_rc;
+       RETURN(rc);
+}
+
+/**
+ * Discard pages with index greater than @trunc_index. If @ext overlaps
+ * @trunc_index, then a partial truncate happens.
+ */
+static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
+                               bool partial)
+{
+       struct cl_env_nest     nest;
+       struct lu_env    *env;
+       struct cl_io      *io;
+       struct osc_object     *obj = ext->oe_obj;
+       struct client_obd     *cli = osc_cli(obj);
+       struct osc_async_page *oap;
+       struct osc_async_page *tmp;
+       int                 pages_in_chunk = 0;
+       int                 ppc_bits    = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+       __u64             trunc_chunk = trunc_index >> ppc_bits;
+       int                 grants   = 0;
+       int                 nr_pages = 0;
+       int                 rc       = 0;
+       ENTRY;
+
+       LASSERT(sanity_check(ext) == 0);
+       LASSERT(ext->oe_state == OES_TRUNC);
+       LASSERT(!ext->oe_urgent);
+
+       /* Request a new lu_env.
+        * We can't use the env from osc_cache_truncate_start() because
+        * it comes from lov_io_sub and is not fully initialized. */
+       env = cl_env_nested_get(&nest);
+       io  = &osc_env_info(env)->oti_io;
+       io->ci_obj = cl_object_top(osc2cl(obj));
+       rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* discard all pages with index greater than trunc_index */
+       list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+                                    oap_pending_item) {
+               struct cl_page  *sub  = oap2cl_page(oap);
+               struct cl_page  *page = cl_page_top(sub);
+
+               LASSERT(list_empty(&oap->oap_rpc_item));
+
+               /* only discard pages whose index is greater than
+                * trunc_index, and ... */
+               if (sub->cp_index < trunc_index ||
+                   (sub->cp_index == trunc_index && partial)) {
+                       /* account for how many pages remain in the chunk
+                        * so that we can calculate grants correctly. */
+                       if (sub->cp_index >> ppc_bits == trunc_chunk)
+                               ++pages_in_chunk;
+                       continue;
+               }
+
+               list_del_init(&oap->oap_pending_item);
+
+               cl_page_get(page);
+               lu_ref_add(&page->cp_reference, "truncate", current);
+
+               if (cl_page_own(env, io, page) == 0) {
+                       cl_page_unmap(env, io, page);
+                       cl_page_discard(env, io, page);
+                       cl_page_disown(env, io, page);
+               } else {
+                       LASSERT(page->cp_state == CPS_FREEING);
+                       LASSERT(0);
+               }
+
+               lu_ref_del(&page->cp_reference, "truncate", current);
+               cl_page_put(env, page);
+
+               --ext->oe_nr_pages;
+               ++nr_pages;
+       }
+       EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
+                     ext->oe_nr_pages == 0),
+               ext, "trunc_index %lu, partial %d\n", trunc_index, partial);
+
+       osc_object_lock(obj);
+       if (ext->oe_nr_pages == 0) {
+               LASSERT(pages_in_chunk == 0);
+               grants = ext->oe_grants;
+               ext->oe_grants = 0;
+       } else { /* calculate how many grants we can free */
+               int     chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
+               pgoff_t last_index;
+
+               /* if there are no pages in this chunk, we can also free grants
+                * for the last chunk */
+               if (pages_in_chunk == 0) {
+                       /* if this is the 1st chunk and it has no pages,
+                        * ext->oe_nr_pages must be zero, so we should have
+                        * taken the other if-clause. */
+                       LASSERT(trunc_chunk > 0);
+                       --trunc_chunk;
+                       ++chunks;
+               }
+
+               /* this is what we can free from this extent */
+               grants    = chunks << cli->cl_chunkbits;
+               ext->oe_grants -= grants;
+               last_index      = ((trunc_chunk + 1) << ppc_bits) - 1;
+               ext->oe_end     = min(last_index, ext->oe_max_end);
+               LASSERT(ext->oe_end >= ext->oe_start);
+               LASSERT(ext->oe_grants > 0);
+       }
+       osc_object_unlock(obj);
+
+       if (grants > 0 || nr_pages > 0)
+               osc_free_grant(cli, nr_pages, grants);
+
+out:
+       cl_io_fini(env, io);
+       cl_env_nested_put(&nest, env);
+       RETURN(rc);
+}
+
+/**
+ * Prepare the extent for transfer.
+ * A race with page flushing - ll_writepage() - has to be handled cautiously.
+ */
+static int osc_extent_make_ready(const struct lu_env *env,
+                                struct osc_extent *ext)
+{
+       struct osc_async_page *oap;
+       struct osc_async_page *last = NULL;
+       struct osc_object *obj = ext->oe_obj;
+       int page_count = 0;
+       int rc;
+       ENTRY;
+
+       /* we're going to grab page lock, so object lock must not be taken. */
+       LASSERT(sanity_check(ext) == 0);
+       /* in the locking state, no other process should touch this extent. */
+       EASSERT(ext->oe_state == OES_LOCKING, ext);
+       EASSERT(ext->oe_owner != NULL, ext);
+
+       OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");
+
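+       /* first pass: make every page in the extent ready for write-out */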
+       list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+               ++page_count;
+               if (last == NULL || last->oap_obj_off < oap->oap_obj_off)
+                       last = oap;
+
+               /* checking ASYNC_READY is race safe */
+               if ((oap->oap_async_flags & ASYNC_READY) != 0)
+                       continue;
+
+               rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
+               switch (rc) {
+               case 0:
+                       spin_lock(&oap->oap_lock);
+                       oap->oap_async_flags |= ASYNC_READY;
+                       spin_unlock(&oap->oap_lock);
+                       break;
+               case -EALREADY:
+                       LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
+                       break;
+               default:
+                       LASSERTF(0, "unknown return code: %d\n", rc);
+               }
+       }
+
+       LASSERT(page_count == ext->oe_nr_pages);
+       LASSERT(last != NULL);
+       /* the last page is the only one whose count may need to be
+        * refreshed against the file size. */
+       if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
+               last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
+               LASSERT(last->oap_count > 0);
+               LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE);
+               last->oap_async_flags |= ASYNC_COUNT_STABLE;
+       }
+
+       /* for the rest of the pages, we don't need to call osc_refresh_count()
+        * because they are known not to be the last page */
+       list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+               if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
+                       oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+                       oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+               }
+       }
+
+       osc_object_lock(obj);
+       osc_extent_state_set(ext, OES_RPC);
+       osc_object_unlock(obj);
+       /* get a refcount for RPC. */
+       osc_extent_get(ext);
+
+       RETURN(0);
+}
+
+/**
+ * Quick and simple version of osc_extent_find(). This function is frequently
+ * called to expand the extent for the same IO. To expand the extent, the
+ * page index must be in the same chunk as ext->oe_end or in the next one.
+ */
+static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants)
+{
+       struct osc_object *obj = ext->oe_obj;
+       struct client_obd *cli = osc_cli(obj);
+       struct osc_extent *next;
+       int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+       pgoff_t chunk = index >> ppc_bits;
+       pgoff_t end_chunk;
+       pgoff_t end_index;
+       int chunksize = 1 << cli->cl_chunkbits;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
+       osc_object_lock(obj);
+       LASSERT(sanity_check_nolock(ext) == 0);
+       end_chunk = ext->oe_end >> ppc_bits;
+       if (chunk > end_chunk + 1)
+               GOTO(out, rc = -ERANGE);
+
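+       /* @index is already covered by this extent, nothing to expand */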
+       if (end_chunk >= chunk)
+               GOTO(out, rc = 0);
+
+       LASSERT(end_chunk + 1 == chunk);
+       /* try to expand this extent to cover @index */
+       end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1);
+
+       next = next_extent(ext);
+       if (next != NULL && next->oe_start <= end_index)
+               /* complex mode - overlapped with the next extent,
+                * this case will be handled by osc_extent_find() */
+               GOTO(out, rc = -EAGAIN);
+
+       ext->oe_end = end_index;
+       ext->oe_grants += chunksize;
+       *grants -= chunksize;
+       LASSERT(*grants >= 0);
+       EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext,
+                "overlapped after expanding for %lu.\n", index);
+       EXIT;
+
+out:
+       osc_object_unlock(obj);
+       RETURN(rc);
+}
+
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+                                 const char *func, int line)
+{
+       struct osc_extent *ext;
+       int cnt;
+
+       CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n",
+              obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc);
+
+       /* osc_object_lock(obj); */
+       cnt = 1;
+       for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext))
+               OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++);
+
+       cnt = 1;
+       list_for_each_entry(ext, &obj->oo_hp_exts, oe_link)
+               OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++);
+
+       cnt = 1;
+       list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link)
+               OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++);
+
+       cnt = 1;
+       list_for_each_entry(ext, &obj->oo_reading_exts, oe_link)
+               OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++);
+       /* osc_object_unlock(obj); */
+}
+
+/* ------------------ osc extent end ------------------ */
+
+static inline int osc_is_ready(struct osc_object *osc)
+{
+       return !list_empty(&osc->oo_ready_item) ||
+              !list_empty(&osc->oo_hp_ready_item);
+}
+
+#define OSC_IO_DEBUG(OSC, STR, args...)                                               \
+       CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR,     \
+              (OSC), osc_is_ready(OSC),                                       \
+              list_empty_marker(&(OSC)->oo_hp_ready_item),                    \
+              list_empty_marker(&(OSC)->oo_ready_item),                       \
+              atomic_read(&(OSC)->oo_nr_writes),                              \
+              list_empty_marker(&(OSC)->oo_hp_exts),                          \
+              list_empty_marker(&(OSC)->oo_urgent_exts),                      \
+              atomic_read(&(OSC)->oo_nr_reads),                               \
+              list_empty_marker(&(OSC)->oo_reading_exts),                     \
+              ##args)
+
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+                         int cmd)
+{
+       struct osc_page *opg  = oap2osc_page(oap);
+       struct cl_page  *page = cl_page_top(oap2cl_page(oap));
+       int result;
+
+       LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */
+
+       ENTRY;
+       result = cl_page_make_ready(env, page, CRT_WRITE);
+       if (result == 0)
+               opg->ops_submit_time = cfs_time_current();
+       RETURN(result);
+}
+
+static int osc_refresh_count(const struct lu_env *env,
+                            struct osc_async_page *oap, int cmd)
+{
+       struct osc_page  *opg = oap2osc_page(oap);
+       struct cl_page   *page = oap2cl_page(oap);
+       struct cl_object *obj;
+       struct cl_attr   *attr = &osc_env_info(env)->oti_attr;
+
+       int result;
+       loff_t kms;
+
+       /* readpage queues with _COUNT_STABLE, so reads shouldn't get here. */
+       LASSERT(!(cmd & OBD_BRW_READ));
+       LASSERT(opg != NULL);
+       obj = opg->ops_cl.cpl_obj;
+
+       cl_object_attr_lock(obj);
+       result = cl_object_attr_get(env, obj, attr);
+       cl_object_attr_unlock(obj);
+       if (result < 0)
+               return result;
+       kms = attr->cat_kms;
+       if (cl_offset(obj, page->cp_index) >= kms)
+               /* catch race with truncate */
+               return 0;
+       else if (cl_offset(obj, page->cp_index + 1) > kms)
+               /* catch sub-page write at end of file */
+               return kms % PAGE_CACHE_SIZE;
+       else
+               return PAGE_CACHE_SIZE;
+}
+
+static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
+                         int cmd, int rc)
+{
+       struct osc_page   *opg  = oap2osc_page(oap);
+       struct cl_page    *page = cl_page_top(oap2cl_page(oap));
+       struct osc_object *obj  = cl2osc(opg->ops_cl.cpl_obj);
+       enum cl_req_type   crt;
+       int srvlock;
+
+       ENTRY;
+
+       cmd &= ~OBD_BRW_NOQUOTA;
+       LASSERT(equi(page->cp_state == CPS_PAGEIN,  cmd == OBD_BRW_READ));
+       LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
+       LASSERT(opg->ops_transfer_pinned);
+
+       /*
+        * page->cp_req can be NULL if io submission failed before
+        * cl_req was allocated.
+        */
+       if (page->cp_req != NULL)
+               cl_req_page_done(env, page);
+       LASSERT(page->cp_req == NULL);
+
+       crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
+       /* Clear opg->ops_transfer_pinned before VM lock is released. */
+       opg->ops_transfer_pinned = 0;
+
+       spin_lock(&obj->oo_seatbelt);
+       LASSERT(opg->ops_submitter != NULL);
+       LASSERT(!list_empty(&opg->ops_inflight));
+       list_del_init(&opg->ops_inflight);
+       opg->ops_submitter = NULL;
+       spin_unlock(&obj->oo_seatbelt);
+
+       opg->ops_submit_time = 0;
+       srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK;
+
+       /* statistics */
+       if (rc == 0 && srvlock) {
+               struct lu_device *ld    = opg->ops_cl.cpl_obj->co_lu.lo_dev;
+               struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
+               int bytes = oap->oap_count;
+
+               if (crt == CRT_READ)
+                       stats->os_lockless_reads += bytes;
+               else
+                       stats->os_lockless_writes += bytes;
+       }
+
+       /*
+        * This has to be the last operation with the page, as locks are
+        * released in cl_page_completion() and nothing except for the
+        * reference counter protects page from concurrent reclaim.
+        */
+       lu_ref_del(&page->cp_reference, "transfer", page);
+
+       cl_page_completion(env, page, crt, rc);
+
+       RETURN(0);
+}
+
+#define OSC_DUMP_GRANT(cli, fmt, args...) do {                               \
+       struct client_obd *__tmp = (cli);                                     \
+       CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "            \
+              "unstable_pages: %d/%d dropped: %ld avail: %ld, "              \
+              "reserved: %ld, flight: %d } " fmt,                            \
+              __tmp->cl_import->imp_obd->obd_name,                           \
+              __tmp->cl_dirty, __tmp->cl_dirty_max,                          \
+              atomic_read(&obd_dirty_pages), obd_max_dirty_pages,            \
+              atomic_read(&obd_unstable_pages), obd_max_dirty_pages,     \
+              __tmp->cl_lost_grant, __tmp->cl_avail_grant,                   \
+              __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args);      \
+} while (0)
+
+/* caller must hold loi_list_lock */
+static void osc_consume_write_grant(struct client_obd *cli,
+                                   struct brw_page *pga)
+{
+       LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock));
+       LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
+       atomic_inc(&obd_dirty_pages);
+       cli->cl_dirty += PAGE_CACHE_SIZE;
+       pga->flag |= OBD_BRW_FROM_GRANT;
+       CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
+              PAGE_CACHE_SIZE, pga, pga->pg);
+       osc_update_next_shrink(cli);
+}
+
+/* the companion to osc_consume_write_grant, called when a brw has completed.
+ * must be called with the loi lock held. */
+static void osc_release_write_grant(struct client_obd *cli,
+                                   struct brw_page *pga)
+{
+       ENTRY;
+
+       LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock));
+       if (!(pga->flag & OBD_BRW_FROM_GRANT)) {
+               EXIT;
+               return;
+       }
+
+       pga->flag &= ~OBD_BRW_FROM_GRANT;
+       atomic_dec(&obd_dirty_pages);
+       cli->cl_dirty -= PAGE_CACHE_SIZE;
+       if (pga->flag & OBD_BRW_NOCACHE) {
+               pga->flag &= ~OBD_BRW_NOCACHE;
+               atomic_dec(&obd_dirty_transit_pages);
+               cli->cl_dirty_transit -= PAGE_CACHE_SIZE;
+       }
+       EXIT;
+}
+
+/**
+ * To avoid sleeping with the object lock held, it's better to allocate enough
+ * grants before entering the critical section.
+ *
+ * client_obd_list_lock held by caller
+ */
+static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes)
+{
+       int rc = -EDQUOT;
+
+       if (cli->cl_avail_grant >= bytes) {
+               cli->cl_avail_grant    -= bytes;
+               cli->cl_reserved_grant += bytes;
+               rc = 0;
+       }
+       return rc;
+}
+
+static void __osc_unreserve_grant(struct client_obd *cli,
+                                 unsigned int reserved, unsigned int unused)
+{
+       /* it's quite normal for us to get back more grant than we reserved.
+        * Consider the case where two extents are merged by adding a new
+        * chunk: we save one extent tax. If the extent tax is greater than
+        * one chunk, we save even more grant by adding that chunk */
+       cli->cl_reserved_grant -= reserved;
+       if (unused > reserved) {
+               cli->cl_avail_grant += reserved;
+               cli->cl_lost_grant  += unused - reserved;
+       } else {
+               cli->cl_avail_grant += unused;
+       }
+}
+
+void osc_unreserve_grant(struct client_obd *cli,
+                        unsigned int reserved, unsigned int unused)
+{
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       __osc_unreserve_grant(cli, reserved, unused);
+       if (unused > 0)
+               osc_wake_cache_waiters(cli);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Free grant after IO is finished or canceled.
+ *
+ * @lost_grant is used to remember how much grant we have allocated but not
+ * used; we should return these grants to the OST. There are two cases where
+ * grants can be lost:
+ * 1. truncate;
+ * 2. the blocksize at the OST is less than PAGE_CACHE_SIZE and a partial page
+ *    was written. In this case the OST may use fewer chunks to serve this
+ *    partial write. OSTs don't actually know the page size on the client
+ *    side, so clients have to calculate the lost grant by the blocksize on
+ *    the OST.
+ *    See filter_grant_check() for details.
+ */
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+                          unsigned int lost_grant)
+{
+       int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       atomic_sub(nr_pages, &obd_dirty_pages);
+       cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT;
+       cli->cl_lost_grant += lost_grant;
+       if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) {
+               /* borrow some grant from truncate to avoid the case where
+                * truncate uses up all available grant */
+               cli->cl_lost_grant -= grant;
+               cli->cl_avail_grant += grant;
+       }
+       osc_wake_cache_waiters(cli);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n",
+              lost_grant, cli->cl_lost_grant,
+              cli->cl_avail_grant, cli->cl_dirty);
+}
+
+/**
+ * The companion to osc_enter_cache(), called when @oap is no longer part of
+ * the dirty accounting due to error.
+ */
+static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap)
+{
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       osc_release_write_grant(cli, &oap->oap_brw_page);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Non-blocking version of osc_enter_cache() that consumes grant only when it
+ * is available.
+ */
+static int osc_enter_cache_try(struct client_obd *cli,
+                              struct osc_async_page *oap,
+                              int bytes, int transient)
+{
+       int rc;
+
+       OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+       rc = osc_reserve_grant(cli, bytes);
+       if (rc < 0)
+               return 0;
+
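+       /* grant reserved; also check the per-OSC and global dirty limits */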
+       if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max &&
+           atomic_read(&obd_unstable_pages) + 1 +
+           atomic_read(&obd_dirty_pages) <= obd_max_dirty_pages) {
+               osc_consume_write_grant(cli, &oap->oap_brw_page);
+               if (transient) {
+                       cli->cl_dirty_transit += PAGE_CACHE_SIZE;
+                       atomic_inc(&obd_dirty_transit_pages);
+                       oap->oap_brw_flags |= OBD_BRW_NOCACHE;
+               }
+               rc = 1;
+       } else {
+               __osc_unreserve_grant(cli, bytes, bytes);
+               rc = 0;
+       }
+       return rc;
+}
+
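+/* the waiter has been handled once osc_wake_cache_waiters() has taken it
+ * off the waiter list */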
+static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
+{
+       int rc;
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = list_empty(&ocw->ocw_entry);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+/**
+ * The main entry to reserve dirty page accounting. Usually the grant reserved
+ * in this function will be freed in bulk in osc_free_grant(); if adding the
+ * page to the osc cache fails, it will be freed in osc_exit_cache() instead.
+ *
+ * The process will be put to sleep if it has already run out of grant.
+ */
+static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
+                          struct osc_async_page *oap, int bytes)
+{
+       struct osc_object *osc = oap->oap_obj;
+       struct lov_oinfo  *loi = osc->oo_oinfo;
+       struct osc_cache_waiter ocw;
+       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+       int rc = -EDQUOT;
+       ENTRY;
+
+       OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+
+       /* force the caller to try sync io.  this can jump the list
+        * of queued writes and create a discontiguous rpc stream */
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
+           cli->cl_dirty_max < PAGE_CACHE_SIZE     ||
+           cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync)
+               GOTO(out, rc = -EDQUOT);
+
+       /* Hopefully normal case - cache space and write credits available */
+       if (osc_enter_cache_try(cli, oap, bytes, 0))
+               GOTO(out, rc = 0);
+
+       /* We can get here for two reasons: too many dirty pages in the cache,
+        * or we have run out of grants. In both cases we should write dirty
+        * pages out. Adding a cache waiter will trigger urgent write-out no
+        * matter what the RPC size will be.
+        * The exit condition is no available grant and no cached dirty pages,
+        * which really means there is no space left on the OST. */
+       init_waitqueue_head(&ocw.ocw_waitq);
+       ocw.ocw_oap   = oap;
+       ocw.ocw_grant = bytes;
+       while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) {
+               list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
+               ocw.ocw_rc = 0;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+               osc_io_unplug_async(env, cli, NULL);
+
+               CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
+                      cli->cl_import->imp_obd->obd_name, &ocw, oap);
+
+               rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);
+
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+
+               /* l_wait_event was interrupted by a signal */
+               if (rc < 0) {
+                       list_del_init(&ocw.ocw_entry);
+                       GOTO(out, rc);
+               }
+
+               LASSERT(list_empty(&ocw.ocw_entry));
+               rc = ocw.ocw_rc;
+
+               if (rc != -EDQUOT)
+                       GOTO(out, rc);
+               if (osc_enter_cache_try(cli, oap, bytes, 0))
+                       GOTO(out, rc = 0);
+       }
+       EXIT;
+out:
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       OSC_DUMP_GRANT(cli, "returned %d.\n", rc);
+       RETURN(rc);
+}
+
+/* caller must hold loi_list_lock */
+void osc_wake_cache_waiters(struct client_obd *cli)
+{
+       struct list_head *l, *tmp;
+       struct osc_cache_waiter *ocw;
+
+       ENTRY;
+       list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+               ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
+               list_del_init(&ocw->ocw_entry);
+
+               ocw->ocw_rc = -EDQUOT;
+               /* we can't dirty more */
+               if (cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max ||
+                   atomic_read(&obd_unstable_pages) + 1 +
+                   atomic_read(&obd_dirty_pages) > obd_max_dirty_pages) {
+                       CDEBUG(D_CACHE, "no dirty room: dirty: %ld "
+                              "osc max %ld, sys max %d\n", cli->cl_dirty,
+                              cli->cl_dirty_max, obd_max_dirty_pages);
+                       goto wakeup;
+               }
+
+               ocw->ocw_rc = 0;
+               if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0))
+                       ocw->ocw_rc = -EDQUOT;
+
+wakeup:
+               CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
+                      ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
+
+               wake_up(&ocw->ocw_waitq);
+       }
+
+       EXIT;
+}
+
+static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
+{
+       int hprpc = !!list_empty(&osc->oo_hp_exts);
+       return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc;
+}
+
+/* This maintains the lists of pending pages to read/write for a given object
+ * (lop).  This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint()
+ * to quickly find objects that are ready to send an RPC. */
+static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
+                        int cmd)
+{
+       int invalid_import = 0;
+       ENTRY;
+
+       /* if we have an invalid import we want to drain the queued pages
+        * by forcing them through rpcs that immediately fail and complete
+        * the pages.  Recovery relies on this to empty the queued pages
+        * before canceling the locks and evicting the llite pages */
+       if ((cli->cl_import == NULL || cli->cl_import->imp_invalid))
+               invalid_import = 1;
+
+       if (cmd & OBD_BRW_WRITE) {
+               if (atomic_read(&osc->oo_nr_writes) == 0)
+                       RETURN(0);
+               if (invalid_import) {
+                       CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+                       RETURN(1);
+               }
+               if (!list_empty(&osc->oo_hp_exts)) {
+                       CDEBUG(D_CACHE, "high prio request forcing RPC\n");
+                       RETURN(1);
+               }
+               if (!list_empty(&osc->oo_urgent_exts)) {
+                       CDEBUG(D_CACHE, "urgent request forcing RPC\n");
+                       RETURN(1);
+               }
+               /* trigger a write rpc stream as long as there are dirtiers
+                * waiting for space.  As they're waiting, they're not going to
+                * create more pages to coalesce with what's already waiting. */
+               if (!list_empty(&cli->cl_cache_waiters)) {
+                       CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
+                       RETURN(1);
+               }
+               if (atomic_read(&osc->oo_nr_writes) >=
+                   cli->cl_max_pages_per_rpc)
+                       RETURN(1);
+       } else {
+               if (atomic_read(&osc->oo_nr_reads) == 0)
+                       RETURN(0);
+               if (invalid_import) {
+                       CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+                       RETURN(1);
+               }
+               /* all reads are urgent. */
+               if (!list_empty(&osc->oo_reading_exts))
+                       RETURN(1);
+       }
+
+       RETURN(0);
+}
+
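+/* adjust the per-object and per-client pending page counters */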
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta)
+{
+       struct client_obd *cli = osc_cli(obj);
+       if (cmd & OBD_BRW_WRITE) {
+               atomic_add(delta, &obj->oo_nr_writes);
+               atomic_add(delta, &cli->cl_pending_w_pages);
+               LASSERT(atomic_read(&obj->oo_nr_writes) >= 0);
+       } else {
+               atomic_add(delta, &obj->oo_nr_reads);
+               atomic_add(delta, &cli->cl_pending_r_pages);
+               LASSERT(atomic_read(&obj->oo_nr_reads) >= 0);
+       }
+       OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta);
+}
+
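+/* true if the object has high-priority extents queued */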
+static int osc_makes_hprpc(struct osc_object *obj)
+{
+       return !list_empty(&obj->oo_hp_exts);
+}
+
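+/* keep @item's membership of @list in sync with @should_be_on */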
+static void on_list(struct list_head *item, struct list_head *list, int should_be_on)
+{
+       if (list_empty(item) && should_be_on)
+               list_add_tail(item, list);
+       else if (!list_empty(item) && !should_be_on)
+               list_del_init(item);
+}
+
+/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc
+ * can find pages to build into rpcs quickly */
+static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+       if (osc_makes_hprpc(osc)) {
+               /* HP rpc */
+               on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0);
+               on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
+       } else {
+               on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
+               on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list,
+                       osc_makes_rpc(cli, osc, OBD_BRW_WRITE) ||
+                       osc_makes_rpc(cli, osc, OBD_BRW_READ));
+       }
+
+       on_list(&osc->oo_write_item, &cli->cl_loi_write_list,
+               atomic_read(&osc->oo_nr_writes) > 0);
+
+       on_list(&osc->oo_read_item, &cli->cl_loi_read_list,
+               atomic_read(&osc->oo_nr_reads) > 0);
+
+       return osc_is_ready(osc);
+}
+
+static int osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+       int is_ready;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       is_ready = __osc_list_maint(cli, osc);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return is_ready;
+}
+
+/* This propagates async writeback errors back up to the application.  When an
+ * async write fails we record the error code for later in case the app does
+ * an fsync.  As long as errors persist we force future rpcs to be sync so
+ * that the app can get a sync error and break the cycle of queueing pages
+ * for which writeback will fail. */
+static void osc_process_ar(struct osc_async_rc *ar, __u64 xid,
+                          int rc)
+{
+       if (rc) {
+               if (!ar->ar_rc)
+                       ar->ar_rc = rc;
+
+               ar->ar_force_sync = 1;
+               ar->ar_min_xid = ptlrpc_sample_next_xid();
+               return;
+
+       }
+
+       if (ar->ar_force_sync && (xid >= ar->ar_min_xid))
+               ar->ar_force_sync = 0;
+}
+
+/* Performs "unstable" page accounting. This function balances the
+ * increment operations performed in osc_inc_unstable_pages. It is
+ * registered as the RPC request callback, and is executed when the
+ * bulk RPC is committed on the server. Thus at this point, the pages
+ * involved in the bulk transfer are no longer considered unstable. */
+void osc_dec_unstable_pages(struct ptlrpc_request *req)
+{
+       struct ptlrpc_bulk_desc *desc       = req->rq_bulk;
+       struct client_obd       *cli    = &req->rq_import->imp_obd->u.cli;
+       obd_count               page_count = desc->bd_iov_count;
+       int i;
+
+       /* No unstable page tracking */
+       if (cli->cl_cache == NULL)
+               return;
+
+       LASSERT(page_count >= 0);
+
+       for (i = 0; i < page_count; i++)
+               dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+
+       atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr);
+       LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
+
+       atomic_sub(page_count, &obd_unstable_pages);
+       LASSERT(atomic_read(&obd_unstable_pages) >= 0);
+
+       spin_lock(&req->rq_lock);
+       req->rq_committed = 1;
+       req->rq_unstable  = 0;
+       spin_unlock(&req->rq_lock);
+
+       wake_up_all(&cli->cl_cache->ccc_unstable_waitq);
+}
+
+/* "unstable" page accounting. See: osc_dec_unstable_pages. */
+void osc_inc_unstable_pages(struct ptlrpc_request *req)
+{
+       struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+       struct client_obd       *cli  = &req->rq_import->imp_obd->u.cli;
+       obd_count               page_count = desc->bd_iov_count;
+       int i;
+
+       /* No unstable page tracking */
+       if (cli->cl_cache == NULL)
+               return;
+
+       LASSERT(page_count >= 0);
+
+       for (i = 0; i < page_count; i++)
+               inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+
+       LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
+       atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr);
+
+       LASSERT(atomic_read(&obd_unstable_pages) >= 0);
+       atomic_add(page_count, &obd_unstable_pages);
+
+       spin_lock(&req->rq_lock);
+
+       /* If the request has already been committed (i.e. brw_commit
+        * called via rq_commit_cb), we need to undo the unstable page
+        * increments we just performed because rq_commit_cb won't be
+        * called again. Otherwise, just set the commit callback so the
+        * unstable page accounting is properly updated when the request
+        * is committed */
+       if (req->rq_committed) {
+               /* Drop lock before calling osc_dec_unstable_pages */
+               spin_unlock(&req->rq_lock);
+               osc_dec_unstable_pages(req);
+               spin_lock(&req->rq_lock);
+       } else {
+               req->rq_unstable  = 1;
+               req->rq_commit_cb = osc_dec_unstable_pages;
+       }
+
+       spin_unlock(&req->rq_lock);
+}
+
+/* this must be called holding the loi list lock to give coverage to exit_cache,
+ * async_flag maintenance, and oap_request */
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+                             struct osc_async_page *oap, int sent, int rc)
+{
+       struct osc_object *osc = oap->oap_obj;
+       struct lov_oinfo  *loi = osc->oo_oinfo;
+       __u64 xid = 0;
+
+       ENTRY;
+       if (oap->oap_request != NULL) {
+               if (rc == 0)
+                       osc_inc_unstable_pages(oap->oap_request);
+
+               xid = ptlrpc_req_xid(oap->oap_request);
+               ptlrpc_req_finished(oap->oap_request);
+               oap->oap_request = NULL;
+       }
+
+       /* As the transfer for this page is being done, clear the flags */
+       spin_lock(&oap->oap_lock);
+       oap->oap_async_flags = 0;
+       spin_unlock(&oap->oap_lock);
+       oap->oap_interrupted = 0;
+
+       if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) {
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               osc_process_ar(&cli->cl_ar, xid, rc);
+               osc_process_ar(&loi->loi_ar, xid, rc);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+       }
+
+       rc = osc_completion(env, oap, oap->oap_cmd, rc);
+       if (rc)
+               CERROR("completion on oap %p obj %p returns %d.\n",
+                      oap, osc, rc);
+
+       EXIT;
+}
+
+/**
+ * Try to add the extent to one RPC. The following constraints apply:
+ * - the number of pages must not exceed max_pages_per_rpc;
+ * - the extent must be compatible with the previous ones.
+ */
+static int try_to_add_extent_for_io(struct client_obd *cli,
+                                   struct osc_extent *ext, struct list_head *rpclist,
+                                   int *pc, unsigned int *max_pages)
+{
+       struct osc_extent *tmp;
+       ENTRY;
+
+       EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
+               ext);
+
+       *max_pages = max(ext->oe_mppr, *max_pages);
+       if (*pc + ext->oe_nr_pages > *max_pages)
+               RETURN(0);
+
+       list_for_each_entry(tmp, rpclist, oe_link) {
+               EASSERT(tmp->oe_owner == current, tmp);
+#if 0
+               if (overlapped(tmp, ext)) {
+                       OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext);
+                       EASSERT(0, ext);
+               }
+#endif
+
+               if (tmp->oe_srvlock != ext->oe_srvlock ||
+                   !tmp->oe_grants != !ext->oe_grants)
+                       RETURN(0);
+
+               /* remove break for strict check */
+               break;
+       }
+
+       *pc += ext->oe_nr_pages;
+       list_move_tail(&ext->oe_link, rpclist);
+       ext->oe_owner = current;
+       RETURN(1);
+}
+
+/**
+ * In order to prevent multiple ptlrpcd threads from breaking contiguous
+ * extents, get_write_extents() takes all appropriate extents atomically.
+ *
+ * The following policy is used to collect extents for IO:
+ * 1. Add as many HP extents as possible;
+ * 2. Add the first urgent extent in the urgent extent list and take it out of
+ *    that list;
+ * 3. Add subsequent extents of this urgent extent;
+ * 4. If the urgent list is not empty, goto 2;
+ * 5. Traverse the extent tree from the 1st extent;
+ * 6. The above steps stop as soon as there is no more space in this RPC.
+ */
+static int get_write_extents(struct osc_object *obj, struct list_head *rpclist)
+{
+       struct client_obd *cli = osc_cli(obj);
+       struct osc_extent *ext;
+       int page_count = 0;
+       unsigned int max_pages = cli->cl_max_pages_per_rpc;
+
+       LASSERT(osc_object_is_locked(obj));
+       while (!list_empty(&obj->oo_hp_exts)) {
+               ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
+                                    oe_link);
+               LASSERT(ext->oe_state == OES_CACHE);
+               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+                                             &max_pages))
+                       return page_count;
+               EASSERT(ext->oe_nr_pages <= max_pages, ext);
+       }
+       if (page_count == max_pages)
+               return page_count;
+
+       while (!list_empty(&obj->oo_urgent_exts)) {
+               ext = list_entry(obj->oo_urgent_exts.next,
+                                    struct osc_extent, oe_link);
+               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+                                             &max_pages))
+                       return page_count;
+
+               if (!ext->oe_intree)
+                       continue;
+
+               while ((ext = next_extent(ext)) != NULL) {
+                       if ((ext->oe_state != OES_CACHE) ||
+                           (!list_empty(&ext->oe_link) &&
+                            ext->oe_owner != NULL))
+                               continue;
+
+                       if (!try_to_add_extent_for_io(cli, ext, rpclist,
+                                                     &page_count, &max_pages))
+                               return page_count;
+               }
+       }
+       if (page_count == max_pages)
+               return page_count;
+
+       ext = first_extent(obj);
+       while (ext != NULL) {
+               if ((ext->oe_state != OES_CACHE) ||
+                   /* this extent may already be in the current rpclist */
+                   (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) {
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+                                             &max_pages))
+                       return page_count;
+
+               ext = next_extent(ext);
+       }
+       return page_count;
+}
+
+static int
+osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli,
+                  struct osc_object *osc, pdl_policy_t pol)
+{
+       LIST_HEAD(rpclist);
+       struct osc_extent *ext;
+       struct osc_extent *tmp;
+       struct osc_extent *first = NULL;
+       obd_count page_count = 0;
+       int srvlock = 0;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(osc_object_is_locked(osc));
+
+       page_count = get_write_extents(osc, &rpclist);
+       LASSERT(equi(page_count == 0, list_empty(&rpclist)));
+
+       if (list_empty(&rpclist))
+               RETURN(0);
+
+       osc_update_pending(osc, OBD_BRW_WRITE, -page_count);
+
+       list_for_each_entry(ext, &rpclist, oe_link) {
+               LASSERT(ext->oe_state == OES_CACHE ||
+                       ext->oe_state == OES_LOCK_DONE);
+               if (ext->oe_state == OES_CACHE)
+                       osc_extent_state_set(ext, OES_LOCKING);
+               else
+                       osc_extent_state_set(ext, OES_RPC);
+       }
+
+       /* we're going to grab page lock, so release object lock because
+        * lock order is page lock -> object lock. */
+       osc_object_unlock(osc);
+
+       list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) {
+               if (ext->oe_state == OES_LOCKING) {
+                       rc = osc_extent_make_ready(env, ext);
+                       if (unlikely(rc < 0)) {
+                               list_del_init(&ext->oe_link);
+                               osc_extent_finish(env, ext, 0, rc);
+                               continue;
+                       }
+               }
+               if (first == NULL) {
+                       first = ext;
+                       srvlock = ext->oe_srvlock;
+               } else {
+                       LASSERT(srvlock == ext->oe_srvlock);
+               }
+       }
+
+       if (!list_empty(&rpclist)) {
+               LASSERT(page_count > 0);
+               rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol);
+               LASSERT(list_empty(&rpclist));
+       }
+
+       osc_object_lock(osc);
+       RETURN(rc);
+}
+
+/**
+ * Prepare pages for ASYNC io and put them in the send queue.
+ *
+ * \param cmd OBD_BRW_* macros
+ * \param lop pending pages
+ *
+ * \return zero if no page was added to the send queue.
+ * \return 1 if pages were successfully added to the send queue.
+ * \return negative on errors.
+ */
+static int
+osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli,
+                 struct osc_object *osc, pdl_policy_t pol)
+{
+       struct osc_extent *ext;
+       struct osc_extent *next;
+       LIST_HEAD(rpclist);
+       int page_count = 0;
+       unsigned int max_pages = cli->cl_max_pages_per_rpc;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(osc_object_is_locked(osc));
+       list_for_each_entry_safe(ext, next,
+                                    &osc->oo_reading_exts, oe_link) {
+               EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
+               if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
+                                             &max_pages))
+                       break;
+               osc_extent_state_set(ext, OES_RPC);
+               EASSERT(ext->oe_nr_pages <= max_pages, ext);
+       }
+       LASSERT(page_count <= max_pages);
+
+       osc_update_pending(osc, OBD_BRW_READ, -page_count);
+
+       if (!list_empty(&rpclist)) {
+               osc_object_unlock(osc);
+
+               LASSERT(page_count > 0);
+               rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol);
+               LASSERT(list_empty(&rpclist));
+
+               osc_object_lock(osc);
+       }
+       RETURN(rc);
+}
+
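+/* detach the first osc_object from @list and return it */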
+#define list_to_obj(list, item) ({                                           \
+       struct list_head *__tmp = (list)->next;                               \
+       list_del_init(__tmp);                                         \
+       list_entry(__tmp, struct osc_object, oo_##item);                      \
+})
+
+/* This is called by osc_check_rpcs() to find which objects have pages that
+ * we could be sending.  These lists are maintained by osc_makes_rpc(). */
+static struct osc_object *osc_next_obj(struct client_obd *cli)
+{
+       ENTRY;
+
+       /* First return objects that have blocked locks so that they
+        * will be flushed quickly and other clients can get the lock,
+        * then objects which have pages ready to be stuffed into RPCs */
+       if (!list_empty(&cli->cl_loi_hp_ready_list))
+               RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item));
+       if (!list_empty(&cli->cl_loi_ready_list))
+               RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item));
+
+       /* then if we have cache waiters, return all objects with queued
+        * writes.  This is especially important when many small files
+        * have filled up the cache and not been fired into rpcs because
+        * they don't pass the nr_pending/object threshold */
+       if (!list_empty(&cli->cl_cache_waiters) &&
+           !list_empty(&cli->cl_loi_write_list))
+               RETURN(list_to_obj(&cli->cl_loi_write_list, write_item));
+
+       /* then return all queued objects when we have an invalid import
+        * so that they get flushed */
+       if (cli->cl_import == NULL || cli->cl_import->imp_invalid) {
+               if (!list_empty(&cli->cl_loi_write_list))
+                       RETURN(list_to_obj(&cli->cl_loi_write_list,
+                                          write_item));
+               if (!list_empty(&cli->cl_loi_read_list))
+                       RETURN(list_to_obj(&cli->cl_loi_read_list,
+                                          read_item));
+       }
+       RETURN(NULL);
+}
+
+/* called with the loi list lock held */
+static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli,
+                          pdl_policy_t pol)
+{
+       struct osc_object *osc;
+       int rc = 0;
+       ENTRY;
+
+       while ((osc = osc_next_obj(cli)) != NULL) {
+               struct cl_object *obj = osc2cl(osc);
+               struct lu_ref_link *link;
+
+               OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli));
+
+               if (osc_max_rpc_in_flight(cli, osc)) {
+                       __osc_list_maint(cli, osc);
+                       break;
+               }
+
+               cl_object_get(obj);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               link = lu_object_ref_add(&obj->co_lu, "check", current);
+
+               /* attempt some read/write balancing by alternating between
+                * reads and writes in an object.  The makes_rpc checks here
+                * would be redundant if we were getting read/write work items
+                * instead of objects.  We don't want send_oap_rpc to drain a
+                * partial read pending queue when we're given this object to
+                * do io on for writes while there are cache waiters */
+               osc_object_lock(osc);
+               if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) {
+                       rc = osc_send_write_rpc(env, cli, osc, pol);
+                       if (rc < 0) {
+                               CERROR("Write request failed with %d\n", rc);
+
+                               /* osc_send_write_rpc failed, mostly because of
+                                * memory pressure.
+                                *
+                                * We can't break out here, because if:
+                                *  - a page was submitted by osc_io_submit, so
+                                *    the page is locked;
+                                *  - no request is in flight;
+                                *  - no subsequent request is coming;
+                                * the system will be in a live-lock state,
+                                * because there is no chance to call
+                                * osc_io_unplug() and osc_check_rpcs() any
+                                * more. pdflush can't help in this case,
+                                * because it might be blocked at grabbing
+                                * the page lock as we mentioned.
+                                *
+                                * Anyway, continue to drain pages. */
+                               /* break; */
+                       }
+               }
+               if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) {
+                       rc = osc_send_read_rpc(env, cli, osc, pol);
+                       if (rc < 0)
+                               CERROR("Read request failed with %d\n", rc);
+               }
+               osc_object_unlock(osc);
+
+               osc_list_maint(cli, osc);
+               lu_object_ref_del_at(&obj->co_lu, link, "check", current);
+               cl_object_put(env, obj);
+
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+       }
+}
+
+static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
+                         struct osc_object *osc, pdl_policy_t pol, int async)
+{
+       int has_rpcs = 1;
+       int rc = 0;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (osc != NULL)
+               has_rpcs = __osc_list_maint(cli, osc);
+       if (has_rpcs) {
+               if (!async) {
+                       /* disable osc_lru_shrink() temporarily to avoid
+                        * potential stack overrun problem. LU-2859 */
+                       atomic_inc(&cli->cl_lru_shrinkers);
+                       osc_check_rpcs(env, cli, pol);
+                       atomic_dec(&cli->cl_lru_shrinkers);
+               } else {
+                       CDEBUG(D_CACHE, "Queue writeback work for client %p.\n",
+                              cli);
+                       LASSERT(cli->cl_writeback_work != NULL);
+                       rc = ptlrpcd_queue_work(cli->cl_writeback_work);
+               }
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static int osc_io_unplug_async(const struct lu_env *env,
+                               struct client_obd *cli, struct osc_object *osc)
+{
+       /* XXX: the policy is not actually used. */
+       return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1);
+}
+
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+                  struct osc_object *osc, pdl_policy_t pol)
+{
+       (void)osc_io_unplug0(env, cli, osc, pol, 0);
+}
+
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+                       struct page *page, loff_t offset)
+{
+       struct obd_export     *exp = osc_export(osc);
+       struct osc_async_page *oap = &ops->ops_oap;
+       ENTRY;
+
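+       /* without a page, just report the space needed for one oap */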
+       if (!page)
+               return cfs_size_round(sizeof(*oap));
+
+       oap->oap_magic = OAP_MAGIC;
+       oap->oap_cli = &exp->exp_obd->u.cli;
+       oap->oap_obj = osc;
+
+       oap->oap_page = page;
+       oap->oap_obj_off = offset;
+       LASSERT(!(offset & ~CFS_PAGE_MASK));
+
+       if (!client_is_remote(exp) && cfs_capable(CFS_CAP_SYS_RESOURCE))
+               oap->oap_brw_flags = OBD_BRW_NOQUOTA;
+
+       INIT_LIST_HEAD(&oap->oap_pending_item);
+       INIT_LIST_HEAD(&oap->oap_rpc_item);
+
+       spin_lock_init(&oap->oap_lock);
+       CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n",
+              oap, page, oap->oap_obj_off);
+       RETURN(0);
+}
+
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+                      struct osc_page *ops)
+{
+       struct osc_io *oio = osc_env_io(env);
+       struct osc_extent     *ext = NULL;
+       struct osc_async_page *oap = &ops->ops_oap;
+       struct client_obd     *cli = oap->oap_cli;
+       struct osc_object     *osc = oap->oap_obj;
+       pgoff_t index;
+       int    grants = 0;
+       int    brw_flags = OBD_BRW_ASYNC;
+       int    cmd = OBD_BRW_WRITE;
+       int    need_release = 0;
+       int    rc = 0;
+       ENTRY;
+
+       if (oap->oap_magic != OAP_MAGIC)
+               RETURN(-EINVAL);
+
+       if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
+               RETURN(-EIO);
+
+       if (!list_empty(&oap->oap_pending_item) ||
+           !list_empty(&oap->oap_rpc_item))
+               RETURN(-EBUSY);
+
+       /* Set the OBD_BRW_SRVLOCK before the page is queued. */
+       brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
+       if (!client_is_remote(osc_export(osc)) &&
+           cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+               brw_flags |= OBD_BRW_NOQUOTA;
+               cmd |= OBD_BRW_NOQUOTA;
+       }
+
+       /* check if the file's owner/group is over quota */
+       if (!(cmd & OBD_BRW_NOQUOTA)) {
+               struct cl_object *obj;
+               struct cl_attr   *attr;
+               unsigned int qid[MAXQUOTAS];
+
+               obj = cl_object_top(&osc->oo_cl);
+               attr = &osc_env_info(env)->oti_attr;
+
+               cl_object_attr_lock(obj);
+               rc = cl_object_attr_get(env, obj, attr);
+               cl_object_attr_unlock(obj);
+
+               qid[USRQUOTA] = attr->cat_uid;
+               qid[GRPQUOTA] = attr->cat_gid;
+               if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA)
+                       rc = -EDQUOT;
+               if (rc)
+                       RETURN(rc);
+       }
+
+       oap->oap_cmd = cmd;
+       oap->oap_page_off = ops->ops_from;
+       oap->oap_count = ops->ops_to - ops->ops_from;
+       oap->oap_async_flags = 0;
+       oap->oap_brw_flags = brw_flags;
+
+       OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n",
+                    oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK);
+
+       index = oap2cl_page(oap)->cp_index;
+
+       /* Add this page into an extent by the following steps:
+        * 1. if there exists an active extent for this IO, this page can
+        *    usually be added to it, though sometimes we need to expand the
+        *    extent to accommodate this page;
+        * 2. otherwise, a new extent will be allocated. */
+
+       ext = oio->oi_active;
+       if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) {
+               /* one chunk plus extent overhead must be enough to write this
+                * page */
+               grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+               if (ext->oe_end >= index)
+                       grants = 0;
+
+               /* it doesn't need any grant to dirty this page */
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               rc = osc_enter_cache_try(cli, oap, grants, 0);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               if (rc == 0) { /* try failed */
+                       grants = 0;
+                       need_release = 1;
+               } else if (ext->oe_end < index) {
+                       int tmp = grants;
+                       /* try to expand this extent */
+                       rc = osc_extent_expand(ext, index, &tmp);
+                       if (rc < 0) {
+                               need_release = 1;
+                               /* don't free reserved grant */
+                       } else {
+                               OSC_EXTENT_DUMP(D_CACHE, ext,
+                                               "expanded for %lu.\n", index);
+                               osc_unreserve_grant(cli, grants, tmp);
+                               grants = 0;
+                       }
+               }
+               rc = 0;
+       } else if (ext != NULL) {
+               /* index is located outside of active extent */
+               need_release = 1;
+       }
+       if (need_release) {
+               osc_extent_release(env, ext);
+               oio->oi_active = NULL;
+               ext = NULL;
+       }
+
+       if (ext == NULL) {
+               int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+               /* try to find new extent to cover this page */
+               LASSERT(oio->oi_active == NULL);
+               /* we may have allocated grant for this page if we failed
+                * to expand the previous active extent. */
+               LASSERT(ergo(grants > 0, grants >= tmp));
+
+               rc = 0;
+               if (grants == 0) {
+                       /* we haven't allocated grant for this page. */
+                       rc = osc_enter_cache(env, cli, oap, tmp);
+                       if (rc == 0)
+                               grants = tmp;
+               }
+
+               tmp = grants;
+               if (rc == 0) {
+                       ext = osc_extent_find(env, osc, index, &tmp);
+                       if (IS_ERR(ext)) {
+                               LASSERT(tmp == grants);
+                               osc_exit_cache(cli, oap);
+                               rc = PTR_ERR(ext);
+                               ext = NULL;
+                       } else {
+                               oio->oi_active = ext;
+                       }
+               }
+               if (grants > 0)
+                       osc_unreserve_grant(cli, grants, tmp);
+       }
+
+       LASSERT(ergo(rc == 0, ext != NULL));
+       if (ext != NULL) {
+               EASSERTF(ext->oe_end >= index && ext->oe_start <= index,
+                        ext, "index = %lu.\n", index);
+               LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0);
+
+               osc_object_lock(osc);
+               if (ext->oe_nr_pages == 0)
+                       ext->oe_srvlock = ops->ops_srvlock;
+               else
+                       LASSERT(ext->oe_srvlock == ops->ops_srvlock);
+               ++ext->oe_nr_pages;
+               list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
+               osc_object_unlock(osc);
+       }
+       RETURN(rc);
+}
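+
+/*
+ * Illustrative sketch (hypothetical caller, not taken from this file): a
+ * write path would typically prepare the async page once and then queue it:
+ *
+ *     rc = osc_prep_async_page(osc, ops, vmpage,
+ *                              cl_offset(osc2cl(osc), index));
+ *     if (rc == 0)
+ *             rc = osc_queue_async_io(env, io, ops);
+ *
+ * where "vmpage" and "index" are assumed to come from the upper cl_page layer.
+ */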
+
+int osc_teardown_async_page(const struct lu_env *env,
+                           struct osc_object *obj, struct osc_page *ops)
+{
+       struct osc_async_page *oap = &ops->ops_oap;
+       struct osc_extent     *ext = NULL;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oap->oap_magic == OAP_MAGIC);
+
+       CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n",
+              oap, ops, oap2cl_page(oap)->cp_index);
+
+       osc_object_lock(obj);
+       if (!list_empty(&oap->oap_rpc_item)) {
+               CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap);
+               rc = -EBUSY;
+       } else if (!list_empty(&oap->oap_pending_item)) {
+               ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index);
+               /* only truncated pages are allowed to be taken out.
+                * See osc_extent_truncate() and osc_cache_truncate_start()
+                * for details. */
+               if (ext != NULL && ext->oe_state != OES_TRUNC) {
+                       OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n",
+                                       oap2cl_page(oap)->cp_index);
+                       rc = -EBUSY;
+               }
+       }
+       osc_object_unlock(obj);
+       if (ext != NULL)
+               osc_extent_put(env, ext);
+       RETURN(rc);
+}
+
+/**
+ * This is called when a page is picked up by the kernel to be written out.
+ *
+ * We should find the corresponding extent and add the whole extent to the
+ * urgent list. The extent may be being truncated or in use; handle it
+ * carefully.
+ */
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+                        struct osc_page *ops)
+{
+       struct osc_extent *ext   = NULL;
+       struct osc_object *obj   = cl2osc(ops->ops_cl.cpl_obj);
+       struct cl_page    *cp    = ops->ops_cl.cpl_page;
+       pgoff_t     index = cp->cp_index;
+       struct osc_async_page *oap = &ops->ops_oap;
+       bool unplug = false;
+       int rc = 0;
+       ENTRY;
+
+       osc_object_lock(obj);
+       ext = osc_extent_lookup(obj, index);
+       if (ext == NULL) {
+               osc_extent_tree_dump(D_ERROR, obj);
+               LASSERTF(0, "page index %lu is NOT covered.\n", index);
+       }
+
+       switch (ext->oe_state) {
+       case OES_RPC:
+       case OES_LOCK_DONE:
+               CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp),
+                             "flush an in-rpc page?\n");
+               LASSERT(0);
+               break;
+       case OES_LOCKING:
+               /* If we know this extent is being written out, we should abort
+                * so that the writer can make this page ready. Otherwise a
+                * deadlock is possible: another process may wait for the page
+                * writeback bit while holding the page lock, and meanwhile in
+                * vvp_page_make_ready() we need to grab the page lock before
+                * really sending the RPC. */
+       case OES_TRUNC:
+               /* race with truncate, page will be redirtied */
+               GOTO(out, rc = -EAGAIN);
+       default:
+               break;
+       }
+
+       rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE);
+       if (rc)
+               GOTO(out, rc);
+
+       spin_lock(&oap->oap_lock);
+       oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT;
+       spin_unlock(&oap->oap_lock);
+
+       if (memory_pressure_get())
+               ext->oe_memalloc = 1;
+
+       ext->oe_urgent = 1;
+       if (ext->oe_state == OES_CACHE) {
+               OSC_EXTENT_DUMP(D_CACHE, ext,
+                               "flush page %p make it urgent.\n", oap);
+               if (list_empty(&ext->oe_link))
+                       list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+               unplug = true;
+       }
+       rc = 0;
+       EXIT;
+
+out:
+       osc_object_unlock(obj);
+       osc_extent_put(env, ext);
+       if (unplug)
+               osc_io_unplug_async(env, osc_cli(obj), obj);
+       return rc;
+}
+
+/**
+ * This is called when a sync waiter receives an interruption.  Its job is to
+ * get the caller woken as soon as possible.  If its page hasn't been put in an
+ * RPC yet, it can dequeue immediately.  Otherwise it has to mark the RPC as
+ * desiring interruption, which will forcefully complete the RPC once the RPC
+ * has timed out.
+ */
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops)
+{
+       struct osc_async_page *oap = &ops->ops_oap;
+       struct osc_object     *obj = oap->oap_obj;
+       struct client_obd     *cli = osc_cli(obj);
+       struct osc_extent     *ext;
+       struct osc_extent     *found = NULL;
+       struct list_head            *plist;
+       pgoff_t index = oap2cl_page(oap)->cp_index;
+       int     rc = -EBUSY;
+       int     cmd;
+       ENTRY;
+
+       LASSERT(!oap->oap_interrupted);
+       oap->oap_interrupted = 1;
+
+       /* Find out the caching extent */
+       osc_object_lock(obj);
+       if (oap->oap_cmd & OBD_BRW_WRITE) {
+               plist = &obj->oo_urgent_exts;
+               cmd   = OBD_BRW_WRITE;
+       } else {
+               plist = &obj->oo_reading_exts;
+               cmd   = OBD_BRW_READ;
+       }
+       list_for_each_entry(ext, plist, oe_link) {
+               if (ext->oe_start <= index && ext->oe_end >= index) {
+                       LASSERT(ext->oe_state == OES_LOCK_DONE);
+                       /* An extent in OES_LOCK_DONE state already holds
+                        * a refcount for the RPC. */
+                       found = osc_extent_get(ext);
+                       break;
+               }
+       }
+       if (found != NULL) {
+               list_del_init(&found->oe_link);
+               osc_update_pending(obj, cmd, -found->oe_nr_pages);
+               osc_object_unlock(obj);
+
+               osc_extent_finish(env, found, 0, -EINTR);
+               osc_extent_put(env, found);
+               rc = 0;
+       } else {
+               osc_object_unlock(obj);
+               /* ok, it's been put in an rpc. only one oap gets a request
+                * reference */
+               if (oap->oap_request != NULL) {
+                       ptlrpc_mark_interrupted(oap->oap_request);
+                       ptlrpcd_wake(oap->oap_request);
+                       ptlrpc_req_finished(oap->oap_request);
+                       oap->oap_request = NULL;
+               }
+       }
+
+       osc_list_maint(cli, obj);
+       RETURN(rc);
+}
+
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+                        struct list_head *list, int cmd, int brw_flags)
+{
+       struct client_obd     *cli = osc_cli(obj);
+       struct osc_extent     *ext;
+       struct osc_async_page *oap;
+       int     page_count = 0;
+       int     mppr       = cli->cl_max_pages_per_rpc;
+       pgoff_t start      = CL_PAGE_EOF;
+       pgoff_t end     = 0;
+       ENTRY;
+
+       list_for_each_entry(oap, list, oap_pending_item) {
+               struct cl_page *cp = oap2cl_page(oap);
+               if (cp->cp_index > end)
+                       end = cp->cp_index;
+               if (cp->cp_index < start)
+                       start = cp->cp_index;
+               ++page_count;
+               mppr <<= (page_count > mppr);
+       }
+
+       ext = osc_extent_alloc(obj);
+       if (ext == NULL) {
+               list_for_each_entry(oap, list, oap_pending_item) {
+                       list_del_init(&oap->oap_pending_item);
+                       osc_ap_completion(env, cli, oap, 0, -ENOMEM);
+               }
+               RETURN(-ENOMEM);
+       }
+
+       ext->oe_rw = !!(cmd & OBD_BRW_READ);
+       ext->oe_urgent = 1;
+       ext->oe_start = start;
+       ext->oe_end = ext->oe_max_end = end;
+       ext->oe_obj = obj;
+       ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
+       ext->oe_nr_pages = page_count;
+       ext->oe_mppr = mppr;
+       list_splice_init(list, &ext->oe_pages);
+
+       osc_object_lock(obj);
+       /* Reuse the initial refcount for RPC, don't drop it */
+       osc_extent_state_set(ext, OES_LOCK_DONE);
+       if (cmd & OBD_BRW_WRITE) {
+               list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+               osc_update_pending(obj, OBD_BRW_WRITE, page_count);
+       } else {
+               list_add_tail(&ext->oe_link, &obj->oo_reading_exts);
+               osc_update_pending(obj, OBD_BRW_READ, page_count);
+       }
+       osc_object_unlock(obj);
+
+       osc_io_unplug(env, cli, obj, PDL_POLICY_ROUND);
+       RETURN(0);
+}
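+
+/*
+ * Illustrative sketch (hypothetical caller): pages destined for sync IO are
+ * collected on a local list via their oap_pending_item linkage and handed to
+ * osc_queue_sync_pages() in one go, e.g. for a read:
+ *
+ *     LIST_HEAD(pages);
+ *     list_add_tail(&oap->oap_pending_item, &pages);
+ *     rc = osc_queue_sync_pages(env, osc, &pages, OBD_BRW_READ, 0);
+ */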
+
+/**
+ * Called by osc_io_setattr_start() to freeze and destroy covering extents.
+ */
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+                            struct osc_object *obj, __u64 size)
+{
+       struct client_obd *cli = osc_cli(obj);
+       struct osc_extent *ext;
+       struct osc_extent *waiting = NULL;
+       pgoff_t index;
+       LIST_HEAD(list);
+       int result = 0;
+       bool partial;
+       ENTRY;
+
+       /* pages with index greater than or equal to index will be truncated. */
+       index = cl_index(osc2cl(obj), size);
+       partial = size > cl_offset(osc2cl(obj), index);
+
+again:
+       osc_object_lock(obj);
+       ext = osc_extent_search(obj, index);
+       if (ext == NULL)
+               ext = first_extent(obj);
+       else if (ext->oe_end < index)
+               ext = next_extent(ext);
+       while (ext != NULL) {
+               EASSERT(ext->oe_state != OES_TRUNC, ext);
+
+               if (ext->oe_state > OES_CACHE || ext->oe_urgent) {
+                       /* if ext is in urgent state, a page in it must already
+                        * have been flushed by write_page(). We have to wait
+                        * for this extent because we can't truncate that
+                        * page. */
+                       LASSERT(!ext->oe_hp);
+                       OSC_EXTENT_DUMP(D_CACHE, ext,
+                                       "waiting for busy extent\n");
+                       waiting = osc_extent_get(ext);
+                       break;
+               }
+
+               OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size);
+
+               osc_extent_get(ext);
+               if (ext->oe_state == OES_ACTIVE) {
+                       /* we grab the inode mutex for the write path, but we
+                        * release it before releasing the extent (in
+                        * osc_io_end()), so there is a race window in which an
+                        * extent is still in OES_ACTIVE when truncate
+                        * starts. */
+                       LASSERT(!ext->oe_trunc_pending);
+                       ext->oe_trunc_pending = 1;
+               } else {
+                       EASSERT(ext->oe_state == OES_CACHE, ext);
+                       osc_extent_state_set(ext, OES_TRUNC);
+                       osc_update_pending(obj, OBD_BRW_WRITE,
+                                          -ext->oe_nr_pages);
+               }
+               EASSERT(list_empty(&ext->oe_link), ext);
+               list_add_tail(&ext->oe_link, &list);
+
+               ext = next_extent(ext);
+       }
+       osc_object_unlock(obj);
+
+       osc_list_maint(cli, obj);
+
+       while (!list_empty(&list)) {
+               int rc;
+
+               ext = list_entry(list.next, struct osc_extent, oe_link);
+               list_del_init(&ext->oe_link);
+
+               /* the extent may be in OES_ACTIVE state because the inode
+                * mutex is released before osc_io_end() in the file write
+                * case */
+               if (ext->oe_state != OES_TRUNC)
+                       osc_extent_wait(env, ext, OES_TRUNC);
+
+               rc = osc_extent_truncate(ext, index, partial);
+               if (rc < 0) {
+                       if (result == 0)
+                               result = rc;
+
+                       OSC_EXTENT_DUMP(D_ERROR, ext,
+                                       "truncate error %d\n", rc);
+               } else if (ext->oe_nr_pages == 0) {
+                       osc_extent_remove(ext);
+               } else {
+                       /* this must be an overlapping extent, which means only
+                        * part of the pages in this extent have been truncated.
+                        */
+                       EASSERTF(ext->oe_start <= index, ext,
+                                "trunc index = %lu/%d.\n", index, partial);
+                       /* fix index to skip this partially truncated extent */
+                       index = ext->oe_end + 1;
+                       partial = false;
+
+                       /* we need to hold this extent in OES_TRUNC state so
+                        * that no writeback will happen. This is to avoid
+                        * BUG 17397. */
+                       LASSERT(oio->oi_trunc == NULL);
+                       oio->oi_trunc = osc_extent_get(ext);
+                       OSC_EXTENT_DUMP(D_CACHE, ext,
+                                       "trunc at "LPU64"\n", size);
+               }
+               osc_extent_put(env, ext);
+       }
+       if (waiting != NULL) {
+               int rc;
+
+               /* ignore the result of osc_extent_wait; the write initiator
+                * should take care of it. */
+               rc = osc_extent_wait(env, waiting, OES_INV);
+               if (rc < 0)
+                       OSC_EXTENT_DUMP(D_CACHE, ext, "wait error: %d.\n", rc);
+
+               osc_extent_put(env, waiting);
+               waiting = NULL;
+               goto again;
+       }
+       RETURN(result);
+}
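+
+/*
+ * Illustrative sketch (hypothetical setattr path, error handling omitted):
+ * osc_cache_truncate_start() and osc_cache_truncate_end() are meant to be
+ * paired around the actual size change, e.g.:
+ *
+ *     rc = osc_cache_truncate_start(env, oio, osc, size);
+ *     if (rc == 0) {
+ *             ... perform the setattr ...
+ *     }
+ *     osc_cache_truncate_end(env, oio, osc);
+ */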
+
+/**
+ * Called after osc_io_setattr_end to add oio->oi_trunc back to cache.
+ */
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+                           struct osc_object *obj)
+{
+       struct osc_extent *ext = oio->oi_trunc;
+
+       oio->oi_trunc = NULL;
+       if (ext != NULL) {
+               bool unplug = false;
+
+               EASSERT(ext->oe_nr_pages > 0, ext);
+               EASSERT(ext->oe_state == OES_TRUNC, ext);
+               EASSERT(!ext->oe_urgent, ext);
+
+               OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n");
+               osc_object_lock(obj);
+               osc_extent_state_set(ext, OES_CACHE);
+               if (ext->oe_fsync_wait && !ext->oe_urgent) {
+                       ext->oe_urgent = 1;
+                       list_move_tail(&ext->oe_link, &obj->oo_urgent_exts);
+                       unplug = true;
+               }
+               osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages);
+               osc_object_unlock(obj);
+               osc_extent_put(env, ext);
+
+               if (unplug)
+                       osc_io_unplug_async(env, osc_cli(obj), obj);
+       }
+}
+
+/**
+ * Wait for extents in a specific range to be written out.
+ * The caller must have called osc_cache_writeback_range() to issue IO,
+ * otherwise this function may take a long time to finish.
+ *
+ * The caller must hold the inode mutex, or cancel the exclusive dlm lock, so
+ * that nobody else can dirty this range of the file while we're waiting for
+ * the extents to be written.
+ */
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+                        pgoff_t start, pgoff_t end)
+{
+       struct osc_extent *ext;
+       pgoff_t index = start;
+       int     result = 0;
+       ENTRY;
+
+again:
+       osc_object_lock(obj);
+       ext = osc_extent_search(obj, index);
+       if (ext == NULL)
+               ext = first_extent(obj);
+       else if (ext->oe_end < index)
+               ext = next_extent(ext);
+       while (ext != NULL) {
+               int rc;
+
+               if (ext->oe_start > end)
+                       break;
+
+               if (!ext->oe_fsync_wait) {
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               EASSERT(ergo(ext->oe_state == OES_CACHE,
+                            ext->oe_hp || ext->oe_urgent), ext);
+               EASSERT(ergo(ext->oe_state == OES_ACTIVE,
+                            !ext->oe_hp && ext->oe_urgent), ext);
+
+               index = ext->oe_end + 1;
+               osc_extent_get(ext);
+               osc_object_unlock(obj);
+
+               rc = osc_extent_wait(env, ext, OES_INV);
+               if (result == 0)
+                       result = rc;
+               osc_extent_put(env, ext);
+               goto again;
+       }
+       osc_object_unlock(obj);
+
+       OSC_IO_DEBUG(obj, "sync file range.\n");
+       RETURN(result);
+}
+
+/**
+ * Called to write out a range of an osc object.
+ *
+ * @hp     : should be set if this is caused by lock cancellation;
+ * @discard: is set if dirty pages should be dropped - the file will be
+ *        deleted or truncated; this implies that no extents are partially
+ *        discarded.
+ *
+ * Returns how many pages will be issued, or an error code if an error occurred.
+ */
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+                             pgoff_t start, pgoff_t end, int hp, int discard)
+{
+       struct osc_extent *ext;
+       LIST_HEAD(discard_list);
+       bool unplug = false;
+       int result = 0;
+       ENTRY;
+
+       osc_object_lock(obj);
+       ext = osc_extent_search(obj, start);
+       if (ext == NULL)
+               ext = first_extent(obj);
+       else if (ext->oe_end < start)
+               ext = next_extent(ext);
+       while (ext != NULL) {
+               if (ext->oe_start > end)
+                       break;
+
+               ext->oe_fsync_wait = 1;
+               switch (ext->oe_state) {
+               case OES_CACHE:
+                       result += ext->oe_nr_pages;
+                       if (!discard) {
+                               struct list_head *list = NULL;
+                               if (hp) {
+                                       EASSERT(!ext->oe_hp, ext);
+                                       ext->oe_hp = 1;
+                                       list = &obj->oo_hp_exts;
+                               } else if (!ext->oe_urgent) {
+                                       ext->oe_urgent = 1;
+                                       list = &obj->oo_urgent_exts;
+                               }
+                               if (list != NULL)
+                                       list_move_tail(&ext->oe_link, list);
+                               unplug = true;
+                       } else {
+                               /* the only discarder is lock cancelling, so
+                                * [start, end] must contain this extent */
+                               EASSERT(ext->oe_start >= start &&
+                                       ext->oe_max_end <= end, ext);
+                               osc_extent_state_set(ext, OES_LOCKING);
+                               ext->oe_owner = current;
+                               list_move_tail(&ext->oe_link,
+                                                  &discard_list);
+                               osc_update_pending(obj, OBD_BRW_WRITE,
+                                                  -ext->oe_nr_pages);
+                       }
+                       break;
+               case OES_ACTIVE:
+                       /* It's pretty bad to wait for ACTIVE extents, because
+                        * we don't know how long it will take to be flushed
+                        * since it may be blocked waiting for more grants.
+                        * We do this for the correctness of fsync. */
+                       LASSERT(hp == 0 && discard == 0);
+                       ext->oe_urgent = 1;
+                       break;
+               case OES_TRUNC:
+                       /* this extent is being truncated, so we can't do
+                        * anything for it now. It will be set to urgent after
+                        * truncate finishes in osc_cache_truncate_end(). */
+               default:
+                       break;
+               }
+               ext = next_extent(ext);
+       }
+       osc_object_unlock(obj);
+
+       LASSERT(ergo(!discard, list_empty(&discard_list)));
+       if (!list_empty(&discard_list)) {
+               struct osc_extent *tmp;
+               int rc;
+
+               osc_list_maint(osc_cli(obj), obj);
+               list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) {
+                       list_del_init(&ext->oe_link);
+                       EASSERT(ext->oe_state == OES_LOCKING, ext);
+
+                       /* Discard cached pages. We don't actually write this
+                        * extent out, but we complete it as if we did. */
+                       rc = osc_extent_make_ready(env, ext);
+                       if (unlikely(rc < 0)) {
+                               OSC_EXTENT_DUMP(D_ERROR, ext,
+                                               "make_ready returned %d\n", rc);
+                               if (result >= 0)
+                                       result = rc;
+                       }
+
+                       /* finish the extent as if the pages were sent */
+                       osc_extent_finish(env, ext, 0, 0);
+               }
+       }
+
+       if (unplug)
+               osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND);
+
+       if (hp || discard) {
+               int rc;
+               rc = osc_cache_wait_range(env, obj, start, end);
+               if (result >= 0 && rc < 0)
+                       result = rc;
+       }
+
+       OSC_IO_DEBUG(obj, "cache page out.\n");
+       RETURN(result);
+}
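+
+/*
+ * Illustrative sketch (hypothetical fsync-style caller): flush a whole object
+ * and then wait for the extents to be written, e.g.:
+ *
+ *     rc = osc_cache_writeback_range(env, osc, 0, CL_PAGE_EOF, 0, 0);
+ *     if (rc >= 0)
+ *             rc = osc_cache_wait_range(env, osc, 0, CL_PAGE_EOF);
+ *
+ * A lock-cancel caller would instead pass hp=1 (and discard=1 if the dirty
+ * pages can simply be dropped), in which case the wait is done internally.
+ */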
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
new file mode 100644 (file)
index 0000000..001a9c8
--- /dev/null
@@ -0,0 +1,679 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#ifndef OSC_CL_INTERNAL_H
+#define OSC_CL_INTERNAL_H
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+/* osc_build_res_name() */
+#include <obd_ost.h>
+#include <cl_object.h>
+#include <lclient.h>
+#include "osc_internal.h"
+
+/** \defgroup osc osc
+ *  @{
+ */
+
+struct osc_extent;
+
+/**
+ * State maintained by osc layer for each IO context.
+ */
+struct osc_io {
+       /** super class */
+       struct cl_io_slice oi_cl;
+       /** true if this io is lockless. */
+       int             oi_lockless;
+       /** active extent; we know how many bytes are going to be written,
+        * so having an active extent will prevent it from being fragmented */
+       struct osc_extent *oi_active;
+       /** partially truncated extent, we need to hold this extent to prevent
+        * page writeback from happening. */
+       struct osc_extent *oi_trunc;
+
+       struct obd_info    oi_info;
+       struct obdo     oi_oa;
+       struct osc_async_cbargs {
+               bool              opc_rpc_sent;
+               int            opc_rc;
+               struct completion       opc_sync;
+       } oi_cbarg;
+};
+
+/**
+ * State of transfer for osc.
+ */
+struct osc_req {
+       struct cl_req_slice    or_cl;
+};
+
+/**
+ * State maintained by osc layer for the duration of a system call.
+ */
+struct osc_session {
+       struct osc_io       os_io;
+};
+
+#define OTI_PVEC_SIZE 64
+struct osc_thread_info {
+       struct ldlm_res_id      oti_resname;
+       ldlm_policy_data_t      oti_policy;
+       struct cl_lock_descr    oti_descr;
+       struct cl_attr    oti_attr;
+       struct lustre_handle    oti_handle;
+       struct cl_page_list     oti_plist;
+       struct cl_io            oti_io;
+       struct cl_page         *oti_pvec[OTI_PVEC_SIZE];
+};
+
+struct osc_object {
+       struct cl_object   oo_cl;
+       struct lov_oinfo  *oo_oinfo;
+       /**
+        * True if locking against this stripe got -EUSERS.
+        */
+       int             oo_contended;
+       cfs_time_t       oo_contention_time;
+       /**
+        * List of pages in transfer.
+        */
+       struct list_head         oo_inflight[CRT_NR];
+       /**
+        * Lock protecting osc_object::oo_inflight, because a seat-belt is
+        * locked during take-off and landing.
+        */
+       spinlock_t         oo_seatbelt;
+
+       /**
+        * used by the osc to keep track of what objects to build into rpcs.
+        * Protected by client_obd->cli_loi_list_lock.
+        */
+       struct list_head           oo_ready_item;
+       struct list_head           oo_hp_ready_item;
+       struct list_head           oo_write_item;
+       struct list_head           oo_read_item;
+
+       /**
+        * Red-black tree of extents used to manage (async) dirty pages.
+        */
+       struct rb_root       oo_root;
+       /**
+        * Manage write(dirty) extents.
+        */
+       struct list_head           oo_hp_exts; /* list of hp extents */
+       struct list_head           oo_urgent_exts; /* list of writeback extents */
+       struct list_head           oo_rpc_exts;
+
+       struct list_head           oo_reading_exts;
+
+       atomic_t         oo_nr_reads;
+       atomic_t         oo_nr_writes;
+
+       /** Protect extent tree. Will be used to protect
+        * oo_{read|write}_pages soon. */
+       spinlock_t          oo_lock;
+};
+
+static inline void osc_object_lock(struct osc_object *obj)
+{
+       spin_lock(&obj->oo_lock);
+}
+
+static inline int osc_object_trylock(struct osc_object *obj)
+{
+       return spin_trylock(&obj->oo_lock);
+}
+
+static inline void osc_object_unlock(struct osc_object *obj)
+{
+       spin_unlock(&obj->oo_lock);
+}
+
+static inline int osc_object_is_locked(struct osc_object *obj)
+{
+       return spin_is_locked(&obj->oo_lock);
+}
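+
+/*
+ * Illustrative usage sketch: the helpers above bracket updates to the extent
+ * tree and the per-object extent lists, e.g.:
+ *
+ *     osc_object_lock(obj);
+ *     list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+ *     osc_object_unlock(obj);
+ */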
+
+/*
+ * Lock "micro-states" for osc layer.
+ */
+enum osc_lock_state {
+       OLS_NEW,
+       OLS_ENQUEUED,
+       OLS_UPCALL_RECEIVED,
+       OLS_GRANTED,
+       OLS_RELEASED,
+       OLS_BLOCKED,
+       OLS_CANCELLED
+};
+
+/**
+ * osc-private state of cl_lock.
+ *
+ * Interaction with DLM.
+ *
+ * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode).
+ *
+ * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in
+ * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock.
+ *
+ * This pointer is protected through a reference, acquired by
+ * osc_lock_upcall0(). Also, an additional reference is acquired by
+ * ldlm_lock_addref() call protecting the lock from cancellation, until
+ * osc_lock_unuse() releases it.
+ *
+ * Below is a description of how lock references are acquired and released
+ * inside of DLM.
+ *
+ * - When new lock is created and enqueued to the server (ldlm_cli_enqueue())
+ *      - ldlm_lock_create()
+ *       - ldlm_lock_new(): initializes a lock with 2 references. One for
+ *         the caller (released when reply from the server is received, or on
+ *         error), and another for the hash table.
+ *      - ldlm_lock_addref_internal(): protects the lock from cancellation.
+ *
+ * - When reply is received from the server (osc_enqueue_interpret())
+ *      - ldlm_cli_enqueue_fini()
+ *       - LDLM_LOCK_PUT(): releases caller reference acquired by
+ *         ldlm_lock_new().
+ *       - if (rc != 0)
+ *             ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
+ *      - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
+ *
+ * - When lock is being cancelled (ldlm_lock_cancel())
+ *      - ldlm_lock_destroy()
+ *       - LDLM_LOCK_PUT(): releases hash-table reference acquired by
+ *         ldlm_lock_new().
+ *
+ * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called
+ * either when the lock is cancelled (osc_lock_blocking()), or when the lock
+ * is deleted without cancellation (e.g., from cl_locks_prune()). In the latter
+ * case ldlm lock remains in memory, and can be re-attached to osc_lock in the
+ * future.
+ */
+struct osc_lock {
+       struct cl_lock_slice     ols_cl;
+       /** underlying DLM lock */
+       struct ldlm_lock        *ols_lock;
+       /** lock value block */
+       struct ost_lvb     ols_lvb;
+       /** DLM flags with which osc_lock::ols_lock was enqueued */
+       __u64               ols_flags;
+       /** osc_lock::ols_lock handle */
+       struct lustre_handle     ols_handle;
+       struct ldlm_enqueue_info ols_einfo;
+       enum osc_lock_state      ols_state;
+
+       /**
+        * How many pages are using this lock for io, currently only used by
+        * read-ahead. If non-zero, the underlying dlm lock won't be cancelled
+        * during recovery to avoid deadlock. see bz16774.
+        *
+        * \see osc_page::ops_lock
+        * \see osc_page_addref_lock(), osc_page_putref_lock()
+        */
+       atomic_t             ols_pageref;
+
+       /**
+        * true, if ldlm_lock_addref() was called against
+        * osc_lock::ols_lock. This is used for sanity checking.
+        *
+        * \see osc_lock::ols_has_ref
+        */
+       unsigned                  ols_hold :1,
+       /**
+        * this is much like osc_lock::ols_hold, except that this bit is
+        * cleared _after_ the reference is released in osc_lock_unuse(). This
+        * fine distinction is needed because:
+        *
+        *     - if ldlm lock still has a reference, osc_ast_data_get() needs
+        *       to return associated cl_lock (so that a flag is needed that is
+        *       cleared after ldlm_lock_decref() returned), and
+        *
+        *     - ldlm_lock_decref() can invoke blocking ast (for a
+        *       LDLM_FL_CBPENDING lock), and osc_lock functions like
+        *       osc_lock_cancel() called from there need to know whether to
+        *       release lock reference (so that a flag is needed that is
+        *       cleared before ldlm_lock_decref() is called).
+        */
+                                ols_has_ref:1,
+       /**
+        * inherit the lockless attribute from top level cl_io.
+        * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
+        */
+                                ols_locklessable:1,
+       /**
+        * set by osc_lock_use() to wait until blocking AST enters into
+        * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for
+        * further synchronization.
+        */
+                                ols_ast_wait:1,
+       /**
+        * If the data of this lock has been flushed to server side.
+        */
+                                ols_flush:1,
+       /**
+        * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat
+        * the EVAVAIL error as tolerable; this lets the upper-level logic
+        * wait for the glimpse locks to each OST to complete.
+        * A glimpse lock converts to a normal lock if the server lock is
+        * granted.
+        * A glimpse lock should be destroyed immediately after use.
+        */
+                                ols_glimpse:1,
+       /**
+        * For async glimpse lock.
+        */
+                                ols_agl:1;
+       /**
+        * IO that owns this lock. This field is used for dead-lock
+        * avoidance by osc_lock_enqueue_wait().
+        *
+        * XXX: unfortunately, the owner of an osc_lock is not unique;
+        * the lock may have multiple users, if the lock is granted and
+        * then matched.
+        */
+       struct osc_io      *ols_owner;
+};
+
+
+/**
+ * Page state private for osc layer.
+ */
+struct osc_page {
+       struct cl_page_slice  ops_cl;
+       /**
+        * Page queues used by osc to detect when RPC can be formed.
+        */
+       struct osc_async_page ops_oap;
+       /**
+        * An offset within page from which next transfer starts. This is used
+        * by cl_page_clip() to submit partial page transfers.
+        */
+       int                ops_from;
+       /**
+        * An offset within page at which next transfer ends.
+        *
+        * \see osc_page::ops_from.
+        */
+       int                ops_to;
+       /**
+        * Boolean, true iff page is under transfer. Used for sanity checking.
+        */
+       unsigned              ops_transfer_pinned:1,
+       /**
+        * True for a `temporary page' created by read-ahead code, probably
+        * outside of any DLM lock.
+        */
+                             ops_temp:1,
+       /**
+        * in LRU?
+        */
+                             ops_in_lru:1,
+       /**
+        * Set if the page must be transferred with OBD_BRW_SRVLOCK.
+        */
+                             ops_srvlock:1;
+       union {
+               /**
+                * lru page list. ops_inflight and ops_lru are exclusive so
+                * that they can share the same data.
+                */
+               struct list_head              ops_lru;
+               /**
+                * Linkage into a per-osc_object list of pages in flight. For
+                * debugging.
+                */
+               struct list_head            ops_inflight;
+       };
+       /**
+        * Thread that submitted this page for transfer. For debugging.
+        */
+       task_t     *ops_submitter;
+       /**
+        * Submit time - the time when the page is starting RPC. For debugging.
+        */
+       cfs_time_t          ops_submit_time;
+
+       /**
+        * A lock of which we hold a reference covers this page. Only used by
+        * read-ahead: for a readahead page, we hold its covering lock to
+        * prevent it from being canceled during recovery.
+        *
+        * \see osc_lock::ols_pageref
+        * \see osc_page_addref_lock(), osc_page_putref_lock().
+        */
+       struct cl_lock       *ops_lock;
+};
+
+extern struct kmem_cache *osc_lock_kmem;
+extern struct kmem_cache *osc_object_kmem;
+extern struct kmem_cache *osc_thread_kmem;
+extern struct kmem_cache *osc_session_kmem;
+extern struct kmem_cache *osc_req_kmem;
+extern struct kmem_cache *osc_extent_kmem;
+
+extern struct lu_device_type osc_device_type;
+extern struct lu_context_key osc_key;
+extern struct lu_context_key osc_session_key;
+
+#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
+
+int osc_lock_init(const struct lu_env *env,
+                 struct cl_object *obj, struct cl_lock *lock,
+                 const struct cl_io *io);
+int osc_io_init  (const struct lu_env *env,
+                 struct cl_object *obj, struct cl_io *io);
+int osc_req_init (const struct lu_env *env, struct cl_device *dev,
+                 struct cl_req *req);
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *hdr,
+                                  struct lu_device *dev);
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_page *page, struct page *vmpage);
+
+void osc_index2policy  (ldlm_policy_data_t *policy, const struct cl_object *obj,
+                       pgoff_t start, pgoff_t end);
+int  osc_lvb_print     (const struct lu_env *env, void *cookie,
+                       lu_printer_t p, const struct ost_lvb *lvb);
+
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+                    enum cl_req_type crt, int brw_flags);
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
+int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
+                       obd_flag async_flags);
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+                       struct page *page, loff_t offset);
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+                      struct osc_page *ops);
+int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj,
+                           struct osc_page *ops);
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+                        struct osc_page *ops);
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+                        struct list_head *list, int cmd, int brw_flags);
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+                            struct osc_object *obj, __u64 size);
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+                           struct osc_object *obj);
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+                             pgoff_t start, pgoff_t end, int hp, int discard);
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+                        pgoff_t start, pgoff_t end);
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+                  struct osc_object *osc, pdl_policy_t pol);
+
+void osc_object_set_contended  (struct osc_object *obj);
+void osc_object_clear_contended(struct osc_object *obj);
+int  osc_object_is_contended   (struct osc_object *obj);
+
+int  osc_lock_is_lockless      (const struct osc_lock *olck);
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct osc_thread_info *osc_env_info(const struct lu_env *env)
+{
+       struct osc_thread_info *info;
+
+       info = lu_context_key_get(&env->le_ctx, &osc_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline struct osc_session *osc_env_session(const struct lu_env *env)
+{
+       struct osc_session *ses;
+
+       ses = lu_context_key_get(env->le_ses, &osc_session_key);
+       LASSERT(ses != NULL);
+       return ses;
+}
+
+static inline struct osc_io *osc_env_io(const struct lu_env *env)
+{
+       return &osc_env_session(env)->os_io;
+}
+
+static inline int osc_is_object(const struct lu_object *obj)
+{
+       return obj->lo_dev->ld_type == &osc_device_type;
+}
+
+static inline struct osc_device *lu2osc_dev(const struct lu_device *d)
+{
+       LINVRNT(d->ld_type == &osc_device_type);
+       return container_of0(d, struct osc_device, od_cl.cd_lu_dev);
+}
+
+static inline struct obd_export *osc_export(const struct osc_object *obj)
+{
+       return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp;
+}
+
+static inline struct client_obd *osc_cli(const struct osc_object *obj)
+{
+       return &osc_export(obj)->exp_obd->u.cli;
+}
+
+static inline struct osc_object *cl2osc(const struct cl_object *obj)
+{
+       LINVRNT(osc_is_object(&obj->co_lu));
+       return container_of0(obj, struct osc_object, oo_cl);
+}
+
+static inline struct cl_object *osc2cl(const struct osc_object *obj)
+{
+       return (struct cl_object *)&obj->oo_cl;
+}
+
+static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode)
+{
+       LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP);
+       if (mode == CLM_READ)
+               return LCK_PR;
+       else if (mode == CLM_WRITE)
+               return LCK_PW;
+       else
+               return LCK_GROUP;
+}
+
+static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode)
+{
+       LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP);
+       if (mode == LCK_PR)
+               return CLM_READ;
+       else if (mode == LCK_PW)
+               return CLM_WRITE;
+       else
+               return CLM_GROUP;
+}
+
+static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
+{
+       LINVRNT(osc_is_object(&slice->cpl_obj->co_lu));
+       return container_of0(slice, struct osc_page, ops_cl);
+}
+
+static inline struct osc_page *oap2osc(struct osc_async_page *oap)
+{
+       return container_of0(oap, struct osc_page, ops_oap);
+}
+
+static inline struct cl_page *oap2cl_page(struct osc_async_page *oap)
+{
+       return oap2osc(oap)->ops_cl.cpl_page;
+}
+
+static inline struct osc_page *oap2osc_page(struct osc_async_page *oap)
+{
+       return (struct osc_page *)container_of(oap, struct osc_page, ops_oap);
+}
+
+static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice)
+{
+       LINVRNT(osc_is_object(&slice->cls_obj->co_lu));
+       return container_of0(slice, struct osc_lock, ols_cl);
+}
+
+static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock)
+{
+       return cl2osc_lock(cl_lock_at(lock, &osc_device_type));
+}
+
+static inline int osc_io_srvlock(struct osc_io *oio)
+{
+       return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock);
+}
+
+enum osc_extent_state {
+       OES_INV       = 0, /** extent is just initialized or destroyed */
+       OES_ACTIVE    = 1, /** process is using this extent */
+       OES_CACHE     = 2, /** extent is ready for IO */
+       OES_LOCKING   = 3, /** locking page to prepare IO */
+       OES_LOCK_DONE = 4, /** locking finished, ready to send */
+       OES_RPC       = 5, /** in RPC */
+       OES_TRUNC     = 6, /** being truncated */
+       OES_STATE_MAX
+};
+#define OES_STRINGS { "inv", "active", "cache", "locking", "lockdone", "rpc", \
+                     "trunc", NULL }
+
+/**
+ * osc_extent data to manage dirty pages.
+ * osc_extent has the following attributes:
+ * 1. all pages in the same extent must be in one RPC in write back;
+ * 2. # of pages must be less than max_pages_per_rpc - implied by 1;
+ * 3. must be covered by only 1 osc_lock;
+ * 4. exclusive. It's impossible to have overlapping osc_extents.
+ *
+ * The lifetime of an extent is from when the 1st page is dirtied to when
+ * all pages inside it are written out.
+ *
+ * LOCKING ORDER
+ * =============
+ * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock)
+ */
+struct osc_extent {
+       /** red-black tree node */
+       struct rb_node     oe_node;
+       /** osc_object of this extent */
+       struct osc_object *oe_obj;
+       /** refcount, removed from red-black tree if reaches zero. */
+       atomic_t       oe_refc;
+       /** busy if non-zero */
+       atomic_t       oe_users;
+       /** link list of osc_object's oo_{hp|urgent|locking}_exts. */
+       struct list_head         oe_link;
+       /** state of this extent */
+       unsigned int       oe_state;
+       /** flags for this extent. */
+       unsigned int       oe_intree:1,
+       /** 0 is write, 1 is read */
+                          oe_rw:1,
+                          oe_srvlock:1,
+                          oe_memalloc:1,
+       /** an ACTIVE extent is going to be truncated, so when this extent
+        * is released, it will turn into TRUNC state instead of CACHE. */
+                          oe_trunc_pending:1,
+       /** this extent should be written asap and someone may wait for the
+        * write to finish. This bit is usually set along with urgent if
+        * the extent was in CACHE state.
+        * An fsync_wait extent can't be merged because the new extent region
+        * may exceed the fsync range. */
+                          oe_fsync_wait:1,
+       /** covering lock is being canceled */
+                          oe_hp:1,
+       /** this extent should be written back asap. Set if one of its pages
+        * is picked by the page writeback daemon, or by sync write or read
+        * requests. */
+                          oe_urgent:1;
+       /** how much grant is allocated for this extent. There is no grant
+        *  allocated for reading extents or sync write extents. */
+       unsigned int       oe_grants;
+       /** # of dirty pages in this extent */
+       unsigned int       oe_nr_pages;
+       /** list of pending oap pages. Pages in this list are NOT sorted. */
+       struct list_head         oe_pages;
+       /** Since an extent has to be written out atomically, this is used to
+        * remember the next page that needs to be locked to write this extent
+        * out. Not used right now.
+        */
+       struct osc_page   *oe_next_page;
+       /** start and end index of this extent, including start and end
+        * themselves. Page offset here is the page index of osc_pages.
+        * oe_start is used as the key for the red-black tree. */
+       pgoff_t     oe_start;
+       pgoff_t     oe_end;
+       /** maximum ending index of this extent; this is limited by
+        * max_pages_per_rpc, the lock extent and the chunk size. */
+       pgoff_t     oe_max_end;
+       /** waitqueue - for those who want to be notified if this extent's
+        * state has changed. */
+       wait_queue_head_t       oe_waitq;
+       /** lock covering this extent */
+       struct cl_lock    *oe_osclock;
+       /** the task that owns this extent. Must be set if this extent is in IO. */
+       task_t  *oe_owner;
+       /** return value of writeback. If somebody is waiting for this extent,
+        * this value can be seen by the outside world. */
+       int             oe_rc;
+       /** max pages per rpc when this extent was created */
+       unsigned int       oe_mppr;
+};
+
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+                     int sent, int rc);
+int osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
+
+/** @} osc */
+
+#endif /* OSC_CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c
new file mode 100644 (file)
index 0000000..4208ddf
--- /dev/null
@@ -0,0 +1,261 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device, cl_req for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ * @{
+ */
+
+struct kmem_cache *osc_lock_kmem;
+struct kmem_cache *osc_object_kmem;
+struct kmem_cache *osc_thread_kmem;
+struct kmem_cache *osc_session_kmem;
+struct kmem_cache *osc_req_kmem;
+struct kmem_cache *osc_extent_kmem;
+struct kmem_cache *osc_quota_kmem;
+
+struct lu_kmem_descr osc_caches[] = {
+       {
+               .ckd_cache = &osc_lock_kmem,
+               .ckd_name  = "osc_lock_kmem",
+               .ckd_size  = sizeof (struct osc_lock)
+       },
+       {
+               .ckd_cache = &osc_object_kmem,
+               .ckd_name  = "osc_object_kmem",
+               .ckd_size  = sizeof (struct osc_object)
+       },
+       {
+               .ckd_cache = &osc_thread_kmem,
+               .ckd_name  = "osc_thread_kmem",
+               .ckd_size  = sizeof (struct osc_thread_info)
+       },
+       {
+               .ckd_cache = &osc_session_kmem,
+               .ckd_name  = "osc_session_kmem",
+               .ckd_size  = sizeof (struct osc_session)
+       },
+       {
+               .ckd_cache = &osc_req_kmem,
+               .ckd_name  = "osc_req_kmem",
+               .ckd_size  = sizeof (struct osc_req)
+       },
+       {
+               .ckd_cache = &osc_extent_kmem,
+               .ckd_name  = "osc_extent_kmem",
+               .ckd_size  = sizeof (struct osc_extent)
+       },
+       {
+               .ckd_cache = &osc_quota_kmem,
+               .ckd_name  = "osc_quota_kmem",
+               .ckd_size  = sizeof(struct osc_quota_info)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+struct lock_class_key osc_ast_guard_class;
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_device *osc2lu_dev(struct osc_device *osc)
+{
+       return &osc->od_cl.cd_lu_dev;
+}
+
+/*****************************************************************************
+ *
+ * Osc device and device type functions.
+ *
+ */
+
+static void *osc_key_init(const struct lu_context *ctx,
+                        struct lu_context_key *key)
+{
+       struct osc_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void osc_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct osc_thread_info *info = data;
+       OBD_SLAB_FREE_PTR(info, osc_thread_kmem);
+}
+
+struct lu_context_key osc_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = osc_key_init,
+       .lct_fini = osc_key_fini
+};
+
+static void *osc_session_init(const struct lu_context *ctx,
+                             struct lu_context_key *key)
+{
+       struct osc_session *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void osc_session_fini(const struct lu_context *ctx,
+                            struct lu_context_key *key, void *data)
+{
+       struct osc_session *info = data;
+       OBD_SLAB_FREE_PTR(info, osc_session_kmem);
+}
+
+struct lu_context_key osc_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = osc_session_init,
+       .lct_fini = osc_session_fini
+};
+
+/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key);
+
+static int osc_cl_process_config(const struct lu_env *env,
+                                struct lu_device *d, struct lustre_cfg *cfg)
+{
+       ENTRY;
+       RETURN(osc_process_config_base(d->ld_obd, cfg));
+}
+
+static const struct lu_device_operations osc_lu_ops = {
+       .ldo_object_alloc      = osc_object_alloc,
+       .ldo_process_config    = osc_cl_process_config,
+       .ldo_recovery_complete = NULL
+};
+
+static const struct cl_device_operations osc_cl_ops = {
+       .cdo_req_init = osc_req_init
+};
+
+static int osc_device_init(const struct lu_env *env, struct lu_device *d,
+                          const char *name, struct lu_device *next)
+{
+       RETURN(0);
+}
+
+static struct lu_device *osc_device_fini(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       return NULL;
+}
+
+static struct lu_device *osc_device_free(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct osc_device *od = lu2osc_dev(d);
+
+       cl_device_fini(lu2cl_dev(d));
+       OBD_FREE_PTR(od);
+       return NULL;
+}
+
+static struct lu_device *osc_device_alloc(const struct lu_env *env,
+                                         struct lu_device_type *t,
+                                         struct lustre_cfg *cfg)
+{
+       struct lu_device *d;
+       struct osc_device *od;
+       struct obd_device *obd;
+       int rc;
+
+       OBD_ALLOC_PTR(od);
+       if (od == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       cl_device_init(&od->od_cl, t);
+       d = osc2lu_dev(od);
+       d->ld_ops = &osc_lu_ops;
+       od->od_cl.cd_ops = &osc_cl_ops;
+
+       /* Setup OSC OBD */
+       obd = class_name2obd(lustre_cfg_string(cfg, 0));
+       LASSERT(obd != NULL);
+       rc = osc_setup(obd, cfg);
+       if (rc) {
+               osc_device_free(env, d);
+               RETURN(ERR_PTR(rc));
+       }
+       od->od_exp = obd->obd_self_export;
+       RETURN(d);
+}
+
+static const struct lu_device_type_operations osc_device_type_ops = {
+       .ldto_init = osc_type_init,
+       .ldto_fini = osc_type_fini,
+
+       .ldto_start = osc_type_start,
+       .ldto_stop  = osc_type_stop,
+
+       .ldto_device_alloc = osc_device_alloc,
+       .ldto_device_free  = osc_device_free,
+
+       .ldto_device_init    = osc_device_init,
+       .ldto_device_fini    = osc_device_fini
+};
+
+struct lu_device_type osc_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_OSC_NAME,
+       .ldt_ops      = &osc_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/** @} osc */
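The device code above registers two lu_context keys: osc_key holds per-thread scratch state (struct osc_thread_info) and osc_session_key holds per-session state (struct osc_session); both are slab-allocated in the key's lct_init hook and released in lct_fini. The osc_env_info() calls that appear later in this patch reach the per-thread buffer through osc_key. A minimal sketch of such an accessor is shown below; the real definition lives in osc_cl_internal.h (not part of this hunk) and is assumed here to use the generic lu_context_key_get() helper.

static inline struct osc_thread_info *osc_env_info(const struct lu_env *env)
{
        /* Look up the per-thread buffer registered under osc_key above. */
        return lu_context_key_get(&env->le_ctx, &osc_key);
}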
diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h
new file mode 100644 (file)
index 0000000..5343da2
--- /dev/null
@@ -0,0 +1,210 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef OSC_INTERNAL_H
+#define OSC_INTERNAL_H
+
+#define OAP_MAGIC 8675309
+
+struct lu_env;
+
+enum async_flags {
+       ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
+                             page is added to an rpc */
+       ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
+       ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
+                                    to give the caller a chance to update
+                                    or cancel the size of the io */
+       ASYNC_HP = 0x10,
+};
+
+struct osc_async_page {
+       int                  oap_magic;
+       unsigned short    oap_cmd;
+       unsigned short    oap_interrupted:1;
+
+       struct list_head              oap_pending_item;
+       struct list_head              oap_rpc_item;
+
+       obd_off          oap_obj_off;
+       unsigned                oap_page_off;
+       enum async_flags        oap_async_flags;
+
+       struct brw_page  oap_brw_page;
+
+       struct ptlrpc_request   *oap_request;
+       struct client_obd       *oap_cli;
+       struct osc_object       *oap_obj;
+
+       struct ldlm_lock        *oap_ldlm_lock;
+       spinlock_t               oap_lock;
+};
+
+#define oap_page       oap_brw_page.pg
+#define oap_count       oap_brw_page.count
+#define oap_brw_flags   oap_brw_page.flag
+
+struct osc_cache_waiter {
+       struct list_head              ocw_entry;
+       wait_queue_head_t            ocw_waitq;
+       struct osc_async_page  *ocw_oap;
+       int                  ocw_grant;
+       int                  ocw_rc;
+};
+
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+              struct obdo *oa, struct lov_stripe_md **ea,
+              struct obd_trans_info *oti);
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+                   struct lov_stripe_md **ea, struct obd_trans_info *oti);
+void osc_wake_cache_waiters(struct client_obd *cli);
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes);
+void osc_update_next_shrink(struct client_obd *cli);
+
+/*
+ * cl integration.
+ */
+#include <cl_object.h>
+
+extern struct ptlrpc_request_set *PTLRPCD_SET;
+
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                    __u64 *flags, ldlm_policy_data_t *policy,
+                    struct ost_lvb *lvb, int kms_valid,
+                    obd_enqueue_update_f upcall,
+                    void *cookie, struct ldlm_enqueue_info *einfo,
+                    struct lustre_handle *lockh,
+                    struct ptlrpc_request_set *rqset, int async, int agl);
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode);
+
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                  __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                  int *flags, void *data, struct lustre_handle *lockh,
+                  int unref);
+
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+                          struct obd_trans_info *oti,
+                          obd_enqueue_update_f upcall, void *cookie,
+                          struct ptlrpc_request_set *rqset);
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+                  obd_enqueue_update_f upcall, void *cookie,
+                  struct ptlrpc_request_set *rqset);
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+                 obd_enqueue_update_f upcall, void *cookie,
+                 struct ptlrpc_request_set *rqset);
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg);
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+                 struct list_head *ext_list, int cmd, pdl_policy_t p);
+int osc_lru_shrink(struct client_obd *cli, int target);
+
+extern spinlock_t osc_ast_guard;
+
+int osc_cleanup(struct obd_device *obd);
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+
+#ifdef LPROCFS
+int lproc_osc_attach_seqstat(struct obd_device *dev);
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;}
+static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+extern struct lu_device_type osc_device_type;
+
+static inline int osc_recoverable_error(int rc)
+{
+       return (rc == -EIO || rc == -EROFS || rc == -ENOMEM ||
+               rc == -EAGAIN || rc == -EINPROGRESS);
+}
+
+static inline unsigned long rpcs_in_flight(struct client_obd *cli)
+{
+       return cli->cl_r_in_flight + cli->cl_w_in_flight;
+}
+
+#ifndef min_t
+#define min_t(type,x,y) \
+       ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
+#endif
+
+struct osc_device {
+       struct cl_device    od_cl;
+       struct obd_export  *od_exp;
+
+       /* Write stats are actually protected by client_obd's lock. */
+       struct osc_stats {
+               uint64_t     os_lockless_writes;          /* by bytes */
+               uint64_t     os_lockless_reads;    /* by bytes */
+               uint64_t     os_lockless_truncates;       /* by times */
+       } od_stats;
+
+       /* configuration item(s) */
+       int              od_contention_time;
+       int              od_lockless_truncate;
+};
+
+static inline struct osc_device *obd2osc_dev(const struct obd_device *d)
+{
+       return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev);
+}
+
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm);
+
+extern struct kmem_cache *osc_quota_kmem;
+struct osc_quota_info {
+       /** linkage for quota hash table */
+       struct hlist_node oqi_hash;
+       obd_uid   oqi_id;
+};
+int osc_quota_setup(struct obd_device *obd);
+int osc_quota_cleanup(struct obd_device *obd);
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+                   obd_flag valid, obd_flag flags);
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]);
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+                struct obd_quotactl *oqctl);
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+                  struct obd_quotactl *oqctl);
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+
+void osc_inc_unstable_pages(struct ptlrpc_request *req);
+void osc_dec_unstable_pages(struct ptlrpc_request *req);
+#endif /* OSC_INTERNAL_H */
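osc_internal.h mostly collects prototypes shared between the OSC source files; the inline helpers near its end are small predicates used on RPC paths. A hedged sketch of how osc_recoverable_error() is meant to be consulted on a completion path follows; the caller name and the retry bound are illustrative and not taken from the patch.

/* Illustrative only: treat osc_recoverable_error() results as transient
 * and allow a bounded number of resends before surfacing the error. */
static int osc_should_resend_example(int rc, int *tried)
{
        if (rc == 0 || !osc_recoverable_error(rc))
                return 0;               /* done, or a fatal error */
        return (*tried)++ < 3;          /* transient: retry a few times */
}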
diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c
new file mode 100644 (file)
index 0000000..1b27704
--- /dev/null
@@ -0,0 +1,836 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct osc_req *cl2osc_req(const struct cl_req_slice *slice)
+{
+       LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type);
+       return container_of0(slice, struct osc_req, or_cl);
+}
+
+static struct osc_io *cl2osc_io(const struct lu_env *env,
+                               const struct cl_io_slice *slice)
+{
+       struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl);
+       LINVRNT(oio == osc_env_io(env));
+       return oio;
+}
+
+static struct osc_page *osc_cl_page_osc(struct cl_page *page)
+{
+       const struct cl_page_slice *slice;
+
+       slice = cl_page_at(page, &osc_device_type);
+       LASSERT(slice != NULL);
+
+       return cl2osc_page(slice);
+}
+
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io)
+{
+}
+
+/**
+ * An implementation of the cl_io_operations::cio_io_submit() method for the
+ * osc layer. Iterates over pages in the in-queue, prepares each for I/O by
+ * calling cl_page_prep(), marks it urgent and ready, and hands it to
+ * osc_page_submit(); the prepared pages are then queued for transfer in
+ * batches of at most cl_max_pages_per_rpc via osc_queue_sync_pages().
+ */
+static int osc_io_submit(const struct lu_env *env,
+                        const struct cl_io_slice *ios,
+                        enum cl_req_type crt, struct cl_2queue *queue)
+{
+       struct cl_page    *page;
+       struct cl_page    *tmp;
+       struct client_obd *cli  = NULL;
+       struct osc_object *osc  = NULL; /* to keep gcc happy */
+       struct osc_page   *opg;
+       struct cl_io      *io;
+       LIST_HEAD     (list);
+
+       struct cl_page_list *qin      = &queue->c2_qin;
+       struct cl_page_list *qout     = &queue->c2_qout;
+       int queued = 0;
+       int result = 0;
+       int cmd;
+       int brw_flags;
+       int max_pages;
+
+       LASSERT(qin->pl_nr > 0);
+
+       CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt);
+
+       osc = cl2osc(ios->cis_obj);
+       cli = osc_cli(osc);
+       max_pages = cli->cl_max_pages_per_rpc;
+
+       cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+       brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0;
+
+       /*
+        * NOTE: here @page is a top-level page. This is done to avoid
+        *       creation of sub-page-list.
+        */
+       cl_page_list_for_each_safe(page, tmp, qin) {
+               struct osc_async_page *oap;
+
+               /* Top level IO. */
+               io = page->cp_owner;
+               LASSERT(io != NULL);
+
+               opg = osc_cl_page_osc(page);
+               oap = &opg->ops_oap;
+               LASSERT(osc == oap->oap_obj);
+
+               if (!list_empty(&oap->oap_pending_item) ||
+                   !list_empty(&oap->oap_rpc_item)) {
+                       CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
+                              oap, opg);
+                       result = -EBUSY;
+                       break;
+               }
+
+               result = cl_page_prep(env, io, page, crt);
+               if (result != 0) {
+                       LASSERT(result < 0);
+                       if (result != -EALREADY)
+                               break;
+                       /*
+                        * Handle -EALREADY error: for read case, the page is
+                        * already in UPTODATE state; for write, the page
+                        * is not dirty.
+                        */
+                       result = 0;
+                       continue;
+               }
+
+               cl_page_list_move(qout, qin, page);
+               oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY;
+               oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+
+               osc_page_submit(env, opg, crt, brw_flags);
+               list_add_tail(&oap->oap_pending_item, &list);
+               if (++queued == max_pages) {
+                       queued = 0;
+                       result = osc_queue_sync_pages(env, osc, &list, cmd,
+                                                     brw_flags);
+                       if (result < 0)
+                               break;
+               }
+       }
+
+       if (queued > 0)
+               result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags);
+
+       CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result);
+       return qout->pl_nr > 0 ? 0 : result;
+}
+
+static void osc_page_touch_at(const struct lu_env *env,
+                             struct cl_object *obj, pgoff_t idx, unsigned to)
+{
+       struct lov_oinfo  *loi  = cl2osc(obj)->oo_oinfo;
+       struct cl_attr    *attr = &osc_env_info(env)->oti_attr;
+       int valid;
+       __u64 kms;
+
+       /* offset within stripe */
+       kms = cl_offset(obj, idx) + to;
+
+       cl_object_attr_lock(obj);
+       /*
+        * XXX old code used
+        *
+        *       ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm);
+        *
+        * here
+        */
+       CDEBUG(D_INODE, "stripe KMS %sincreasing "LPU64"->"LPU64" "LPU64"\n",
+              kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms,
+              loi->loi_lvb.lvb_size);
+
+       valid = 0;
+       if (kms > loi->loi_kms) {
+               attr->cat_kms = kms;
+               valid |= CAT_KMS;
+       }
+       if (kms > loi->loi_lvb.lvb_size) {
+               attr->cat_size = kms;
+               valid |= CAT_SIZE;
+       }
+       cl_object_attr_set(env, obj, attr, valid);
+       cl_object_attr_unlock(obj);
+}
+
+/**
+ * This is called when a page is accessed within a file in a way that creates
+ * a new page if one was missing (i.e., if there was a hole at that place in
+ * the file, or the accessed page is beyond the current file size). Examples:
+ * the ->commit_write() and ->nopage() methods.
+ *
+ * Expand stripe KMS if necessary.
+ */
+static void osc_page_touch(const struct lu_env *env,
+                          struct osc_page *opage, unsigned to)
+{
+       struct cl_page    *page = opage->ops_cl.cpl_page;
+       struct cl_object  *obj  = opage->ops_cl.cpl_obj;
+
+       osc_page_touch_at(env, obj, page->cp_index, to);
+}
+
+/**
+ * Implements cl_io_operations::cio_prepare_write() method for osc layer.
+ *
+ * \retval -EIO transfer initiated against this osc will most likely fail
+ * \retval 0    transfer initiated against this osc will most likely succeed.
+ *
+ * The reason for this check is to return an error to the caller immediately
+ * when the import is deactivated. Note that the import can also be
+ * deactivated later, while pages dirtied by this IO are still in the cache;
+ * that case is irrelevant here, because the application would then only see
+ * the error if it called fsync, and many applications skip fsync for
+ * performance reasons, so returning -EIO at write time is how we notify the
+ * application.
+ */
+static int osc_io_prepare_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+       struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev);
+       struct obd_import *imp = class_exp2cliimp(dev->od_exp);
+       struct osc_io     *oio = cl2osc_io(env, ios);
+       int result = 0;
+       ENTRY;
+
+       /*
+        * This implements OBD_BRW_CHECK logic from old client.
+        */
+
+       if (imp == NULL || imp->imp_invalid)
+               result = -EIO;
+       if (result == 0 && oio->oi_lockless)
+               /* This page contains `invalid' data, but who cares?
+                * Nobody can access the invalid data.
+                * In osc_io_commit_write(), we're going to write exactly
+                * the [from, to) bytes of this page to the OST. -jay */
+               cl_page_export(env, slice->cpl_page, 1);
+
+       RETURN(result);
+}
+
+static int osc_io_commit_write(const struct lu_env *env,
+                              const struct cl_io_slice *ios,
+                              const struct cl_page_slice *slice,
+                              unsigned from, unsigned to)
+{
+       struct osc_io    *oio = cl2osc_io(env, ios);
+       struct osc_page       *opg = cl2osc_page(slice);
+       struct osc_object     *obj = cl2osc(opg->ops_cl.cpl_obj);
+       struct osc_async_page *oap = &opg->ops_oap;
+       ENTRY;
+
+       LASSERT(to > 0);
+       /*
+        * XXX instead of calling osc_page_touch() here and in
+        * osc_io_fault_start() it might be more logical to introduce
+        * cl_page_touch() method, that generic cl_io_commit_write() and page
+        * fault code calls.
+        */
+       osc_page_touch(env, cl2osc_page(slice), to);
+       if (!client_is_remote(osc_export(obj)) &&
+           cfs_capable(CFS_CAP_SYS_RESOURCE))
+               oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+
+       if (oio->oi_lockless)
+               /* see osc_io_prepare_write() for lockless io handling. */
+               cl_page_clip(env, slice->cpl_page, from, to);
+
+       RETURN(0);
+}
+
+static int osc_io_fault_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct cl_io       *io;
+       struct cl_fault_io *fio;
+
+       ENTRY;
+
+       io  = ios->cis_io;
+       fio = &io->u.ci_fault;
+       CDEBUG(D_INFO, "%lu %d %d\n",
+              fio->ft_index, fio->ft_writable, fio->ft_nob);
+       /*
+        * If mapping is writeable, adjust kms to cover this page,
+        * but do not extend kms beyond actual file size.
+        * See bug 10919.
+        */
+       if (fio->ft_writable)
+               osc_page_touch_at(env, ios->cis_obj,
+                                 fio->ft_index, fio->ft_nob);
+       RETURN(0);
+}
+
+static int osc_async_upcall(void *a, int rc)
+{
+       struct osc_async_cbargs *args = a;
+
+       args->opc_rc = rc;
+       complete(&args->opc_sync);
+       return 0;
+}
+
+/**
+ * Checks that there are no pages being written in the extent being truncated.
+ */
+static int trunc_check_cb(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page, void *cbdata)
+{
+       const struct cl_page_slice *slice;
+       struct osc_page *ops;
+       struct osc_async_page *oap;
+       __u64 start = *(__u64 *)cbdata;
+
+       slice = cl_page_at(page, &osc_device_type);
+       LASSERT(slice != NULL);
+       ops = cl2osc_page(slice);
+       oap = &ops->ops_oap;
+
+       if (oap->oap_cmd & OBD_BRW_WRITE &&
+           !list_empty(&oap->oap_pending_item))
+               CL_PAGE_DEBUG(D_ERROR, env, page, "exists " LPU64 "/%s.\n",
+                               start, current->comm);
+
+       {
+               struct page *vmpage = cl_page_vmpage(env, page);
+               if (PageLocked(vmpage))
+                       CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n",
+                              ops, page->cp_index,
+                              (oap->oap_cmd & OBD_BRW_RWMASK));
+       }
+
+       return CLP_GANG_OKAY;
+}
+
+static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
+                           struct osc_io *oio, __u64 size)
+{
+       struct cl_object *clob;
+       int     partial;
+       pgoff_t start;
+
+       clob    = oio->oi_cl.cis_obj;
+       start   = cl_index(clob, size);
+       partial = cl_offset(clob, start) < size;
+
+       /*
+        * Complain if there are pages in the truncated region.
+        */
+       cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF,
+                           trunc_check_cb, (void *)&size);
+}
+
+static int osc_io_setattr_start(const struct lu_env *env,
+                               const struct cl_io_slice *slice)
+{
+       struct cl_io        *io     = slice->cis_io;
+       struct osc_io      *oio    = cl2osc_io(env, slice);
+       struct cl_object        *obj    = slice->cis_obj;
+       struct lov_oinfo        *loi    = cl2osc(obj)->oo_oinfo;
+       struct cl_attr    *attr   = &osc_env_info(env)->oti_attr;
+       struct obdo          *oa     = &oio->oi_oa;
+       struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+       __u64               size   = io->u.ci_setattr.sa_attr.lvb_size;
+       unsigned int         ia_valid = io->u.ci_setattr.sa_valid;
+       int                   result = 0;
+       struct obd_info   oinfo = { { { 0 } } };
+
+       /* truncate cache dirty pages first */
+       if (cl_io_is_trunc(io))
+               result = osc_cache_truncate_start(env, oio, cl2osc(obj), size);
+
+       if (result == 0 && oio->oi_lockless == 0) {
+               cl_object_attr_lock(obj);
+               result = cl_object_attr_get(env, obj, attr);
+               if (result == 0) {
+                       struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr;
+                       unsigned int cl_valid = 0;
+
+                       if (ia_valid & ATTR_SIZE) {
+                               attr->cat_size = attr->cat_kms = size;
+                               cl_valid = (CAT_SIZE | CAT_KMS);
+                       }
+                       if (ia_valid & ATTR_MTIME_SET) {
+                               attr->cat_mtime = lvb->lvb_mtime;
+                               cl_valid |= CAT_MTIME;
+                       }
+                       if (ia_valid & ATTR_ATIME_SET) {
+                               attr->cat_atime = lvb->lvb_atime;
+                               cl_valid |= CAT_ATIME;
+                       }
+                       if (ia_valid & ATTR_CTIME_SET) {
+                               attr->cat_ctime = lvb->lvb_ctime;
+                               cl_valid |= CAT_CTIME;
+                       }
+                       result = cl_object_attr_set(env, obj, attr, cl_valid);
+               }
+               cl_object_attr_unlock(obj);
+       }
+       memset(oa, 0, sizeof(*oa));
+       if (result == 0) {
+               oa->o_oi = loi->loi_oi;
+               oa->o_mtime = attr->cat_mtime;
+               oa->o_atime = attr->cat_atime;
+               oa->o_ctime = attr->cat_ctime;
+               oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME |
+                       OBD_MD_FLCTIME | OBD_MD_FLMTIME;
+               if (ia_valid & ATTR_SIZE) {
+                       oa->o_size = size;
+                       oa->o_blocks = OBD_OBJECT_EOF;
+                       oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+                       if (oio->oi_lockless) {
+                               oa->o_flags = OBD_FL_SRVLOCK;
+                               oa->o_valid |= OBD_MD_FLFLAGS;
+                       }
+               } else {
+                       LASSERT(oio->oi_lockless == 0);
+               }
+
+               oinfo.oi_oa = oa;
+               oinfo.oi_capa = io->u.ci_setattr.sa_capa;
+               init_completion(&cbargs->opc_sync);
+
+               if (ia_valid & ATTR_SIZE)
+                       result = osc_punch_base(osc_export(cl2osc(obj)),
+                                               &oinfo, osc_async_upcall,
+                                               cbargs, PTLRPCD_SET);
+               else
+                       result = osc_setattr_async_base(osc_export(cl2osc(obj)),
+                                                       &oinfo, NULL,
+                                                       osc_async_upcall,
+                                                       cbargs, PTLRPCD_SET);
+               cbargs->opc_rpc_sent = result == 0;
+       }
+       return result;
+}
+
+static void osc_io_setattr_end(const struct lu_env *env,
+                              const struct cl_io_slice *slice)
+{
+       struct cl_io     *io  = slice->cis_io;
+       struct osc_io    *oio = cl2osc_io(env, slice);
+       struct cl_object *obj = slice->cis_obj;
+       struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+       int result = 0;
+
+       if (cbargs->opc_rpc_sent) {
+               wait_for_completion(&cbargs->opc_sync);
+               result = io->ci_result = cbargs->opc_rc;
+       }
+       if (result == 0) {
+               if (oio->oi_lockless) {
+                       /* lockless truncate */
+                       struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+
+                       LASSERT(cl_io_is_trunc(io));
+                       /* XXX: Need a lock. */
+                       osd->od_stats.os_lockless_truncates++;
+               }
+       }
+
+       if (cl_io_is_trunc(io)) {
+               __u64 size = io->u.ci_setattr.sa_attr.lvb_size;
+               osc_trunc_check(env, io, oio, size);
+               if (oio->oi_trunc != NULL) {
+                       osc_cache_truncate_end(env, oio, cl2osc(obj));
+                       oio->oi_trunc = NULL;
+               }
+       }
+}
+
+static int osc_io_read_start(const struct lu_env *env,
+                            const struct cl_io_slice *slice)
+{
+       struct osc_io    *oio   = cl2osc_io(env, slice);
+       struct cl_object *obj   = slice->cis_obj;
+       struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+       int           result = 0;
+       ENTRY;
+
+       if (oio->oi_lockless == 0) {
+               cl_object_attr_lock(obj);
+               result = cl_object_attr_get(env, obj, attr);
+               if (result == 0) {
+                       attr->cat_atime = LTIME_S(CFS_CURRENT_TIME);
+                       result = cl_object_attr_set(env, obj, attr,
+                                                   CAT_ATIME);
+               }
+               cl_object_attr_unlock(obj);
+       }
+       RETURN(result);
+}
+
+static int osc_io_write_start(const struct lu_env *env,
+                             const struct cl_io_slice *slice)
+{
+       struct osc_io    *oio   = cl2osc_io(env, slice);
+       struct cl_object *obj   = slice->cis_obj;
+       struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+       int           result = 0;
+       ENTRY;
+
+       if (oio->oi_lockless == 0) {
+               OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
+               cl_object_attr_lock(obj);
+               result = cl_object_attr_get(env, obj, attr);
+               if (result == 0) {
+                       attr->cat_mtime = attr->cat_ctime =
+                               LTIME_S(CFS_CURRENT_TIME);
+                       result = cl_object_attr_set(env, obj, attr,
+                                                   CAT_MTIME | CAT_CTIME);
+               }
+               cl_object_attr_unlock(obj);
+       }
+       RETURN(result);
+}
+
+static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+                        struct cl_fsync_io *fio)
+{
+       struct osc_io    *oio   = osc_env_io(env);
+       struct obdo      *oa    = &oio->oi_oa;
+       struct obd_info  *oinfo = &oio->oi_info;
+       struct lov_oinfo *loi   = obj->oo_oinfo;
+       struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+       int rc = 0;
+       ENTRY;
+
+       memset(oa, 0, sizeof(*oa));
+       oa->o_oi = loi->loi_oi;
+       oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+       /* reload size and blocks for start and end of sync range */
+       oa->o_size = fio->fi_start;
+       oa->o_blocks = fio->fi_end;
+       oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+       obdo_set_parent_fid(oa, fio->fi_fid);
+
+       memset(oinfo, 0, sizeof(*oinfo));
+       oinfo->oi_oa = oa;
+       oinfo->oi_capa = fio->fi_capa;
+       init_completion(&cbargs->opc_sync);
+
+       rc = osc_sync_base(osc_export(obj), oinfo, osc_async_upcall, cbargs,
+                          PTLRPCD_SET);
+       RETURN(rc);
+}
+
+static int osc_io_fsync_start(const struct lu_env *env,
+                             const struct cl_io_slice *slice)
+{
+       struct cl_io       *io  = slice->cis_io;
+       struct cl_fsync_io *fio = &io->u.ci_fsync;
+       struct cl_object   *obj = slice->cis_obj;
+       struct osc_object  *osc = cl2osc(obj);
+       pgoff_t start  = cl_index(obj, fio->fi_start);
+       pgoff_t end    = cl_index(obj, fio->fi_end);
+       int     result = 0;
+       ENTRY;
+
+       if (fio->fi_end == OBD_OBJECT_EOF)
+               end = CL_PAGE_EOF;
+
+       result = osc_cache_writeback_range(env, osc, start, end, 0,
+                                          fio->fi_mode == CL_FSYNC_DISCARD);
+       if (result > 0) {
+               fio->fi_nr_written += result;
+               result = 0;
+       }
+       if (fio->fi_mode == CL_FSYNC_ALL) {
+               int rc;
+
+               /* We have to wait for writeback to finish before we can
+                * send the OST_SYNC RPC. This is bad because it causes
+                * extents to be written OSC by OSC. However, we usually
+                * start writeback before CL_FSYNC_ALL, so this is not a
+                * real problem. */
+               rc = osc_cache_wait_range(env, osc, start, end);
+               if (result == 0)
+                       result = rc;
+               rc = osc_fsync_ost(env, osc, fio);
+               if (result == 0)
+                       result = rc;
+       }
+
+       RETURN(result);
+}
+
+static void osc_io_fsync_end(const struct lu_env *env,
+                            const struct cl_io_slice *slice)
+{
+       struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync;
+       struct cl_object   *obj = slice->cis_obj;
+       pgoff_t start = cl_index(obj, fio->fi_start);
+       pgoff_t end   = cl_index(obj, fio->fi_end);
+       int result = 0;
+
+       if (fio->fi_mode == CL_FSYNC_LOCAL) {
+               result = osc_cache_wait_range(env, cl2osc(obj), start, end);
+       } else if (fio->fi_mode == CL_FSYNC_ALL) {
+               struct osc_io      *oio    = cl2osc_io(env, slice);
+               struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+
+               wait_for_completion(&cbargs->opc_sync);
+               if (result == 0)
+                       result = cbargs->opc_rc;
+       }
+       slice->cis_io->ci_result = result;
+}
+
+static void osc_io_end(const struct lu_env *env,
+                      const struct cl_io_slice *slice)
+{
+       struct osc_io *oio = cl2osc_io(env, slice);
+
+       if (oio->oi_active) {
+               osc_extent_release(env, oio->oi_active);
+               oio->oi_active = NULL;
+       }
+}
+
+static const struct cl_io_operations osc_io_ops = {
+       .op = {
+               [CIT_READ] = {
+                       .cio_start  = osc_io_read_start,
+                       .cio_fini   = osc_io_fini
+               },
+               [CIT_WRITE] = {
+                       .cio_start  = osc_io_write_start,
+                       .cio_end    = osc_io_end,
+                       .cio_fini   = osc_io_fini
+               },
+               [CIT_SETATTR] = {
+                       .cio_start  = osc_io_setattr_start,
+                       .cio_end    = osc_io_setattr_end
+               },
+               [CIT_FAULT] = {
+                       .cio_start  = osc_io_fault_start,
+                       .cio_end    = osc_io_end,
+                       .cio_fini   = osc_io_fini
+               },
+               [CIT_FSYNC] = {
+                       .cio_start  = osc_io_fsync_start,
+                       .cio_end    = osc_io_fsync_end,
+                       .cio_fini   = osc_io_fini
+               },
+               [CIT_MISC] = {
+                       .cio_fini   = osc_io_fini
+               }
+       },
+       .req_op = {
+                [CRT_READ] = {
+                        .cio_submit    = osc_io_submit
+                },
+                [CRT_WRITE] = {
+                        .cio_submit    = osc_io_submit
+                }
+        },
+       .cio_prepare_write = osc_io_prepare_write,
+       .cio_commit_write  = osc_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+static int osc_req_prep(const struct lu_env *env,
+                       const struct cl_req_slice *slice)
+{
+       return 0;
+}
+
+static void osc_req_completion(const struct lu_env *env,
+                              const struct cl_req_slice *slice, int ioret)
+{
+       struct osc_req *or;
+
+       or = cl2osc_req(slice);
+       OBD_SLAB_FREE_PTR(or, osc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for osc
+ * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq
+ * fields.
+ */
+static void osc_req_attr_set(const struct lu_env *env,
+                            const struct cl_req_slice *slice,
+                            const struct cl_object *obj,
+                            struct cl_req_attr *attr, obd_valid flags)
+{
+       struct lov_oinfo *oinfo;
+       struct cl_req    *clerq;
+       struct cl_page   *apage; /* _some_ page in @clerq */
+       struct cl_lock   *lock;  /* _some_ lock protecting @apage */
+       struct osc_lock  *olck;
+       struct osc_page  *opg;
+       struct obdo      *oa;
+       struct ost_lvb   *lvb;
+
+       oinfo   = cl2osc(obj)->oo_oinfo;
+       lvb     = &oinfo->loi_lvb;
+       oa      = attr->cra_oa;
+
+       if ((flags & OBD_MD_FLMTIME) != 0) {
+               oa->o_mtime = lvb->lvb_mtime;
+               oa->o_valid |= OBD_MD_FLMTIME;
+       }
+       if ((flags & OBD_MD_FLATIME) != 0) {
+               oa->o_atime = lvb->lvb_atime;
+               oa->o_valid |= OBD_MD_FLATIME;
+       }
+       if ((flags & OBD_MD_FLCTIME) != 0) {
+               oa->o_ctime = lvb->lvb_ctime;
+               oa->o_valid |= OBD_MD_FLCTIME;
+       }
+       if (flags & OBD_MD_FLGROUP) {
+               ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi));
+               oa->o_valid |= OBD_MD_FLGROUP;
+       }
+       if (flags & OBD_MD_FLID) {
+               ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi));
+               oa->o_valid |= OBD_MD_FLID;
+       }
+       if (flags & OBD_MD_FLHANDLE) {
+               clerq = slice->crs_req;
+               LASSERT(!list_empty(&clerq->crq_pages));
+               apage = container_of(clerq->crq_pages.next,
+                                    struct cl_page, cp_flight);
+               opg = osc_cl_page_osc(apage);
+               apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */
+               lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1);
+               if (lock == NULL) {
+                       struct cl_object_header *head;
+                       struct cl_lock    *scan;
+
+                       head = cl_object_header(apage->cp_obj);
+                       list_for_each_entry(scan, &head->coh_locks,
+                                               cll_linkage)
+                               CL_LOCK_DEBUG(D_ERROR, env, scan,
+                                             "no cover page!\n");
+                       CL_PAGE_DEBUG(D_ERROR, env, apage,
+                                     "dump uncover page!\n");
+                       libcfs_debug_dumpstack(NULL);
+                       LBUG();
+               }
+
+               olck = osc_lock_at(lock);
+               LASSERT(olck != NULL);
+               LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL));
+               /* check for lockless io. */
+               if (olck->ols_lock != NULL) {
+                       oa->o_handle = olck->ols_lock->l_remote_handle;
+                       oa->o_valid |= OBD_MD_FLHANDLE;
+               }
+               cl_lock_put(env, lock);
+       }
+}
+
+static const struct cl_req_operations osc_req_ops = {
+       .cro_prep       = osc_req_prep,
+       .cro_attr_set   = osc_req_attr_set,
+       .cro_completion = osc_req_completion
+};
+
+
+int osc_io_init(const struct lu_env *env,
+               struct cl_object *obj, struct cl_io *io)
+{
+       struct osc_io *oio = osc_env_io(env);
+
+       CL_IO_SLICE_CLEAN(oio, oi_cl);
+       cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
+       return 0;
+}
+
+int osc_req_init(const struct lu_env *env, struct cl_device *dev,
+                struct cl_req *req)
+{
+       struct osc_req *or;
+       int result;
+
+       OBD_SLAB_ALLOC_PTR_GFP(or, osc_req_kmem, __GFP_IO);
+       if (or != NULL) {
+               cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+/** @} osc */
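The CIT_SETATTR and CIT_FSYNC paths above share one asynchronous pattern: the *_start method fills an obdo, passes osc_async_upcall() as the RPC callback together with the per-IO osc_async_cbargs, and the matching *_end method waits on the completion that the upcall fires before reading back the return code. A reduced sketch of that pairing is shown below; the example_* function names are illustrative, while osc_punch_base(), osc_async_upcall(), PTLRPCD_SET and the opc_* fields are the ones used in this patch.

/* Sketch of the start half: send the RPC and remember whether it went out. */
static int example_setattr_start(struct obd_export *exp, struct obd_info *oinfo,
                                 struct osc_async_cbargs *cbargs)
{
        int rc;

        init_completion(&cbargs->opc_sync);
        rc = osc_punch_base(exp, oinfo, osc_async_upcall, cbargs, PTLRPCD_SET);
        cbargs->opc_rpc_sent = (rc == 0);
        return rc;
}

/* Sketch of the end half: wait for osc_async_upcall() to store the result. */
static int example_setattr_end(struct osc_async_cbargs *cbargs)
{
        if (!cbargs->opc_rpc_sent)
                return 0;
        wait_for_completion(&cbargs->opc_sync);
        return cbargs->opc_rc;
}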
diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c
new file mode 100644 (file)
index 0000000..640bc3d
--- /dev/null
@@ -0,0 +1,1663 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+# include <linux/libcfs/libcfs.h>
+/* fid_build_reg_res_name() */
+#include <lustre_fid.h>
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+#define _PAGEREF_MAGIC  (-10000000)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static const struct cl_lock_operations osc_lock_ops;
+static const struct cl_lock_operations osc_lock_lockless_ops;
+static void osc_lock_to_lockless(const struct lu_env *env,
+                                struct osc_lock *ols, int force);
+static int osc_lock_has_pages(struct osc_lock *olck);
+
+int osc_lock_is_lockless(const struct osc_lock *olck)
+{
+       return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops);
+}
+
+/**
+ * Returns a weak pointer to the ldlm lock identified by a handle. Returned
+ * pointer cannot be dereferenced, as lock is not protected from concurrent
+ * reclaim. This function is a helper for osc_lock_invariant().
+ */
+static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle)
+{
+       struct ldlm_lock *lock;
+
+       lock = ldlm_handle2lock(handle);
+       if (lock != NULL)
+               LDLM_LOCK_PUT(lock);
+       return lock;
+}
+
+/**
+ * Invariant that has to be true all of the time.
+ */
+static int osc_lock_invariant(struct osc_lock *ols)
+{
+       struct ldlm_lock *lock  = osc_handle_ptr(&ols->ols_handle);
+       struct ldlm_lock *olock       = ols->ols_lock;
+       int            handle_used = lustre_handle_is_used(&ols->ols_handle);
+
+       return
+               ergo(osc_lock_is_lockless(ols),
+                    ols->ols_locklessable && ols->ols_lock == NULL)  ||
+               (ergo(olock != NULL, handle_used) &&
+                ergo(olock != NULL,
+                     olock->l_handle.h_cookie == ols->ols_handle.cookie) &&
+                /*
+                 * Check that ->ols_handle and ->ols_lock are consistent, but
+                 * take into account that they are set at different times.
+                 */
+                ergo(handle_used,
+                     ergo(lock != NULL && olock != NULL, lock == olock) &&
+                     ergo(lock == NULL, olock == NULL)) &&
+                ergo(ols->ols_state == OLS_CANCELLED,
+                     olock == NULL && !handle_used) &&
+                /*
+                 * DLM lock is destroyed only after we have seen cancellation
+                 * ast.
+                 */
+                ergo(olock != NULL && ols->ols_state < OLS_CANCELLED,
+                     !olock->l_destroyed) &&
+                ergo(ols->ols_state == OLS_GRANTED,
+                     olock != NULL &&
+                     olock->l_req_mode == olock->l_granted_mode &&
+                     ols->ols_hold));
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/**
+ * Breaks a link between osc_lock and dlm_lock.
+ */
+static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck)
+{
+       struct ldlm_lock *dlmlock;
+
+       spin_lock(&osc_ast_guard);
+       dlmlock = olck->ols_lock;
+       if (dlmlock == NULL) {
+               spin_unlock(&osc_ast_guard);
+               return;
+       }
+
+       olck->ols_lock = NULL;
+       /* wb(); --- for everyone who checks (ols->ols_lock != NULL) before
+        * calling osc_lock_detach() */
+       dlmlock->l_ast_data = NULL;
+       olck->ols_handle.cookie = 0ULL;
+       spin_unlock(&osc_ast_guard);
+
+       lock_res_and_lock(dlmlock);
+       if (dlmlock->l_granted_mode == dlmlock->l_req_mode) {
+               struct cl_object *obj = olck->ols_cl.cls_obj;
+               struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+               __u64 old_kms;
+
+               cl_object_attr_lock(obj);
+               /* Must get the value under the lock to avoid possible races. */
+               old_kms = cl2osc(obj)->oo_oinfo->loi_kms;
+               /* Update the kms. Need to loop over all granted locks.
+                * Not a problem for the client. */
+               attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms);
+
+               cl_object_attr_set(env, obj, attr, CAT_KMS);
+               cl_object_attr_unlock(obj);
+       }
+       unlock_res_and_lock(dlmlock);
+
+       /* release a reference taken in osc_lock_upcall0(). */
+       LASSERT(olck->ols_has_ref);
+       lu_ref_del(&dlmlock->l_reference, "osc_lock", olck);
+       LDLM_LOCK_RELEASE(dlmlock);
+       olck->ols_has_ref = 0;
+}
+
+static int osc_lock_unhold(struct osc_lock *ols)
+{
+       int result = 0;
+
+       if (ols->ols_hold) {
+               ols->ols_hold = 0;
+               result = osc_cancel_base(&ols->ols_handle,
+                                        ols->ols_einfo.ei_mode);
+       }
+       return result;
+}
+
+static int osc_lock_unuse(const struct lu_env *env,
+                         const struct cl_lock_slice *slice)
+{
+       struct osc_lock *ols = cl2osc_lock(slice);
+
+       LINVRNT(osc_lock_invariant(ols));
+
+       switch (ols->ols_state) {
+       case OLS_NEW:
+               LASSERT(!ols->ols_hold);
+               LASSERT(ols->ols_agl);
+               return 0;
+       case OLS_UPCALL_RECEIVED:
+               osc_lock_unhold(ols);
+       case OLS_ENQUEUED:
+               LASSERT(!ols->ols_hold);
+               osc_lock_detach(env, ols);
+               ols->ols_state = OLS_NEW;
+               return 0;
+       case OLS_GRANTED:
+               LASSERT(!ols->ols_glimpse);
+               LASSERT(ols->ols_hold);
+               /*
+                * Move lock into OLS_RELEASED state before calling
+                * osc_cancel_base() so that possible synchronous cancellation
+                * (that always happens e.g., for liblustre) sees that lock is
+                * released.
+                */
+               ols->ols_state = OLS_RELEASED;
+               return osc_lock_unhold(ols);
+       default:
+               CERROR("Impossible state: %d\n", ols->ols_state);
+               LBUG();
+       }
+}
+
+static void osc_lock_fini(const struct lu_env *env,
+                         struct cl_lock_slice *slice)
+{
+       struct osc_lock  *ols = cl2osc_lock(slice);
+
+       LINVRNT(osc_lock_invariant(ols));
+       /*
+        * ->ols_hold can still be true at this point if, for example, a
+        * thread that requested a lock was killed (and released a reference
+        * to the lock), before reply from a server was received. In this case
+        * lock is destroyed immediately after upcall.
+        */
+       osc_lock_unhold(ols);
+       LASSERT(ols->ols_lock == NULL);
+       LASSERT(atomic_read(&ols->ols_pageref) == 0 ||
+               atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC);
+
+       OBD_SLAB_FREE_PTR(ols, osc_lock_kmem);
+}
+
+static void osc_lock_build_policy(const struct lu_env *env,
+                                 const struct cl_lock *lock,
+                                 ldlm_policy_data_t *policy)
+{
+       const struct cl_lock_descr *d = &lock->cll_descr;
+
+       osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end);
+       policy->l_extent.gid = d->cld_gid;
+}
+
+static __u64 osc_enq2ldlm_flags(__u32 enqflags)
+{
+       __u64 result = 0;
+
+       LASSERT((enqflags & ~CEF_MASK) == 0);
+
+       if (enqflags & CEF_NONBLOCK)
+               result |= LDLM_FL_BLOCK_NOWAIT;
+       if (enqflags & CEF_ASYNC)
+               result |= LDLM_FL_HAS_INTENT;
+       if (enqflags & CEF_DISCARD_DATA)
+               result |= LDLM_AST_DISCARD_DATA;
+       return result;
+}
+
+/**
+ * Global spin-lock protecting consistency of ldlm_lock::l_ast_data
+ * pointers. Initialized in osc_init().
+ */
+spinlock_t osc_ast_guard;
+
+static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock)
+{
+       struct osc_lock *olck;
+
+       lock_res_and_lock(dlm_lock);
+       spin_lock(&osc_ast_guard);
+       olck = dlm_lock->l_ast_data;
+       if (olck != NULL) {
+               struct cl_lock *lock = olck->ols_cl.cls_lock;
+               /*
+                * If osc_lock holds a reference on ldlm lock, return it even
+                * when cl_lock is in CLS_FREEING state. This way
+                *
+                *       osc_ast_data_get(dlmlock) == NULL
+                *
+                * guarantees that all osc references on dlmlock were
+                * released. osc_dlm_blocking_ast0() relies on that.
+                */
+               if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) {
+                       cl_lock_get_trust(lock);
+                       lu_ref_add_atomic(&lock->cll_reference,
+                                         "ast", current);
+               } else
+                       olck = NULL;
+       }
+       spin_unlock(&osc_ast_guard);
+       unlock_res_and_lock(dlm_lock);
+       return olck;
+}
+
+static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck)
+{
+       struct cl_lock *lock;
+
+       lock = olck->ols_cl.cls_lock;
+       lu_ref_del(&lock->cll_reference, "ast", current);
+       cl_lock_put(env, lock);
+}
+
+/**
+ * Updates object attributes from a lock value block (lvb) received together
+ * with the DLM lock reply from the server. Copy of osc_update_enqueue()
+ * logic.
+ *
+ * This can be optimized to not update attributes when lock is a result of a
+ * local match.
+ *
+ * Called under lock and resource spin-locks.
+ */
+static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
+                               int rc)
+{
+       struct ost_lvb    *lvb;
+       struct cl_object  *obj;
+       struct lov_oinfo  *oinfo;
+       struct cl_attr    *attr;
+       unsigned           valid;
+
+       ENTRY;
+
+       if (!(olck->ols_flags & LDLM_FL_LVB_READY))
+               RETURN_EXIT;
+
+       lvb   = &olck->ols_lvb;
+       obj   = olck->ols_cl.cls_obj;
+       oinfo = cl2osc(obj)->oo_oinfo;
+       attr  = &osc_env_info(env)->oti_attr;
+       valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE;
+       cl_lvb2attr(attr, lvb);
+
+       cl_object_attr_lock(obj);
+       if (rc == 0) {
+               struct ldlm_lock  *dlmlock;
+               __u64 size;
+
+               dlmlock = olck->ols_lock;
+               LASSERT(dlmlock != NULL);
+
+               /* re-grab LVB from a dlm lock under DLM spin-locks. */
+               *lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+               size = lvb->lvb_size;
+               /* Extend KMS up to the end of this lock and no further
+                * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+               if (size > dlmlock->l_policy_data.l_extent.end)
+                       size = dlmlock->l_policy_data.l_extent.end + 1;
+               if (size >= oinfo->loi_kms) {
+                       LDLM_DEBUG(dlmlock, "lock acquired, setting rss="LPU64
+                                  ", kms="LPU64, lvb->lvb_size, size);
+                       valid |= CAT_KMS;
+                       attr->cat_kms = size;
+               } else {
+                       LDLM_DEBUG(dlmlock, "lock acquired, setting rss="
+                                  LPU64"; leaving kms="LPU64", end="LPU64,
+                                  lvb->lvb_size, oinfo->loi_kms,
+                                  dlmlock->l_policy_data.l_extent.end);
+               }
+               ldlm_lock_allow_match_locked(dlmlock);
+       } else if (rc == -ENAVAIL && olck->ols_glimpse) {
+               CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+                      " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms);
+       } else
+               valid = 0;
+
+       if (valid != 0)
+               cl_object_attr_set(env, obj, attr, valid);
+
+       cl_object_attr_unlock(obj);
+
+       EXIT;
+}
+
+/**
+ * Called when a lock is granted, from an upcall (when server returned a
+ * granted lock), or from completion AST, when server returned a blocked lock.
+ *
+ * Called under lock and resource spin-locks, that are released temporarily
+ * here.
+ */
+static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
+                            struct ldlm_lock *dlmlock, int rc)
+{
+       struct ldlm_extent   *ext;
+       struct cl_lock       *lock;
+       struct cl_lock_descr *descr;
+
+       LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode);
+
+       ENTRY;
+       if (olck->ols_state < OLS_GRANTED) {
+               lock  = olck->ols_cl.cls_lock;
+               ext   = &dlmlock->l_policy_data.l_extent;
+               descr = &osc_env_info(env)->oti_descr;
+               descr->cld_obj = lock->cll_descr.cld_obj;
+
+               /* XXX check that ->l_granted_mode is valid. */
+               descr->cld_mode  = osc_ldlm2cl_lock(dlmlock->l_granted_mode);
+               descr->cld_start = cl_index(descr->cld_obj, ext->start);
+               descr->cld_end   = cl_index(descr->cld_obj, ext->end);
+               descr->cld_gid   = ext->gid;
+               /*
+                * tell upper layers the extent of the lock that was actually
+                * granted
+                */
+               olck->ols_state = OLS_GRANTED;
+               osc_lock_lvb_update(env, olck, rc);
+
+               /* release DLM spin-locks to allow cl_lock_{modify,signal}()
+                * to take a semaphore on a parent lock. This is safe, because
+                * spin-locks are needed to protect consistency of
+                * dlmlock->l_*_mode and LVB, and we have finished processing
+                * them. */
+               unlock_res_and_lock(dlmlock);
+               cl_lock_modify(env, lock, descr);
+               cl_lock_signal(env, lock);
+               LINVRNT(osc_lock_invariant(olck));
+               lock_res_and_lock(dlmlock);
+       }
+       EXIT;
+}
+
+static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
+
+{
+       struct ldlm_lock *dlmlock;
+
+       ENTRY;
+
+       dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0);
+       LASSERT(dlmlock != NULL);
+
+       lock_res_and_lock(dlmlock);
+       spin_lock(&osc_ast_guard);
+       LASSERT(dlmlock->l_ast_data == olck);
+       LASSERT(olck->ols_lock == NULL);
+       olck->ols_lock = dlmlock;
+       spin_unlock(&osc_ast_guard);
+
+       /*
+        * The lock might not be granted yet. In this case, the completion ast
+        * (osc_ldlm_completion_ast()) comes later and finishes lock
+        * granting.
+        */
+       if (dlmlock->l_granted_mode == dlmlock->l_req_mode)
+               osc_lock_granted(env, olck, dlmlock, 0);
+       unlock_res_and_lock(dlmlock);
+
+       /*
+        * osc_enqueue_interpret() decrefs asynchronous locks, counter
+        * this.
+        */
+       ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode);
+       olck->ols_hold = 1;
+
+       /* lock reference taken by ldlm_handle2lock_long() is owned by
+        * osc_lock and released in osc_lock_detach() */
+       lu_ref_add(&dlmlock->l_reference, "osc_lock", olck);
+       olck->ols_has_ref = 1;
+}
+
+/**
+ * Lock upcall function that is executed either when a reply to ENQUEUE rpc is
+ * received from a server, or after osc_enqueue_base() matched a local DLM
+ * lock.
+ */
+static int osc_lock_upcall(void *cookie, int errcode)
+{
+       struct osc_lock  *olck  = cookie;
+       struct cl_lock_slice    *slice = &olck->ols_cl;
+       struct cl_lock    *lock  = slice->cls_lock;
+       struct lu_env      *env;
+       struct cl_env_nest       nest;
+
+       ENTRY;
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               int rc;
+
+               cl_lock_mutex_get(env, lock);
+
+               LASSERT(lock->cll_state >= CLS_QUEUING);
+               if (olck->ols_state == OLS_ENQUEUED) {
+                       olck->ols_state = OLS_UPCALL_RECEIVED;
+                       rc = ldlm_error2errno(errcode);
+               } else if (olck->ols_state == OLS_CANCELLED) {
+                       rc = -EIO;
+               } else {
+                       CERROR("Impossible state: %d\n", olck->ols_state);
+                       LBUG();
+               }
+               if (rc) {
+                       struct ldlm_lock *dlmlock;
+
+                       dlmlock = ldlm_handle2lock(&olck->ols_handle);
+                       if (dlmlock != NULL) {
+                               lock_res_and_lock(dlmlock);
+                               spin_lock(&osc_ast_guard);
+                               LASSERT(olck->ols_lock == NULL);
+                               dlmlock->l_ast_data = NULL;
+                               olck->ols_handle.cookie = 0ULL;
+                               spin_unlock(&osc_ast_guard);
+                               ldlm_lock_fail_match_locked(dlmlock);
+                               unlock_res_and_lock(dlmlock);
+                               LDLM_LOCK_PUT(dlmlock);
+                       }
+               } else {
+                       if (olck->ols_glimpse)
+                               olck->ols_glimpse = 0;
+                       osc_lock_upcall0(env, olck);
+               }
+
+               /* Error handling, some errors are tolerable. */
+               if (olck->ols_locklessable && rc == -EUSERS) {
+                       /* This is a tolerable error, turn this lock into
+                        * a lockless lock.
+                        */
+                       osc_object_set_contended(cl2osc(slice->cls_obj));
+                       LASSERT(slice->cls_ops == &osc_lock_ops);
+
+                       /* Change this lock to ldlmlock-less lock. */
+                       osc_lock_to_lockless(env, olck, 1);
+                       olck->ols_state = OLS_GRANTED;
+                       rc = 0;
+               } else if (olck->ols_glimpse && rc == -ENAVAIL) {
+                       osc_lock_lvb_update(env, olck, rc);
+                       cl_lock_delete(env, lock);
+                       /* Hide the error. */
+                       rc = 0;
+               }
+
+               if (rc == 0) {
+                       /* In the AGL case, the RPC sponsor may exit cl_lock
+                        * processing without calling wait() before the related
+                        * OSC lock upcall(). So update the lock status
+                        * according to the enqueue result inside the AGL
+                        * upcall(). */
+                       if (olck->ols_agl) {
+                               lock->cll_flags |= CLF_FROM_UPCALL;
+                               cl_wait_try(env, lock);
+                               lock->cll_flags &= ~CLF_FROM_UPCALL;
+                               if (!olck->ols_glimpse)
+                                       olck->ols_agl = 0;
+                       }
+                       cl_lock_signal(env, lock);
+                       /* del user for lock upcall cookie */
+                       cl_unuse_try(env, lock);
+               } else {
+                       /* del user for lock upcall cookie */
+                       cl_lock_user_del(env, lock);
+                       cl_lock_error(env, lock, rc);
+               }
+
+               /* release cookie reference, acquired by osc_lock_enqueue() */
+               cl_lock_hold_release(env, lock, "upcall", lock);
+               cl_lock_mutex_put(env, lock);
+
+               lu_ref_del(&lock->cll_reference, "upcall", lock);
+               /* This may be the last reference, so it must be called after
+                * cl_lock_mutex_put(). */
+               cl_lock_put(env, lock);
+
+               cl_env_nested_put(&nest, env);
+       } else {
+               /* should never happen, similar to osc_ldlm_blocking_ast(). */
+               LBUG();
+       }
+       RETURN(errcode);
+}
+
+/**
+ * Core of osc_ldlm_blocking_ast() logic.
+ */
+static void osc_lock_blocking(const struct lu_env *env,
+                             struct ldlm_lock *dlmlock,
+                             struct osc_lock *olck, int blocking)
+{
+       struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+       LASSERT(olck->ols_lock == dlmlock);
+       CLASSERT(OLS_BLOCKED < OLS_CANCELLED);
+       LASSERT(!osc_lock_is_lockless(olck));
+
+       /*
+        * Lock might be still addref-ed here, if e.g., blocking ast
+        * is sent for a failed lock.
+        */
+       osc_lock_unhold(olck);
+
+       if (blocking && olck->ols_state < OLS_BLOCKED)
+               /*
+                * Move osc_lock into OLS_BLOCKED before canceling the lock,
+                * because it recursively re-enters osc_lock_blocking(), with
+                * the state set to OLS_CANCELLED.
+                */
+               olck->ols_state = OLS_BLOCKED;
+       /*
+        * cancel and destroy lock at least once no matter how blocking ast is
+        * entered (see comment above osc_ldlm_blocking_ast() for use
+        * cases). cl_lock_cancel() and cl_lock_delete() are idempotent.
+        */
+       cl_lock_cancel(env, lock);
+       cl_lock_delete(env, lock);
+}
+
+/**
+ * Helper for osc_ldlm_blocking_ast() handling discrepancies between cl_lock
+ * and ldlm_lock caches.
+ */
+static int osc_dlm_blocking_ast0(const struct lu_env *env,
+                                struct ldlm_lock *dlmlock,
+                                void *data, int flag)
+{
+       struct osc_lock *olck;
+       struct cl_lock  *lock;
+       int result;
+       int cancel;
+
+       LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING);
+
+       cancel = 0;
+       olck = osc_ast_data_get(dlmlock);
+       if (olck != NULL) {
+               lock = olck->ols_cl.cls_lock;
+               cl_lock_mutex_get(env, lock);
+               LINVRNT(osc_lock_invariant(olck));
+               if (olck->ols_ast_wait) {
+                       /* wake up osc_lock_use() */
+                       cl_lock_signal(env, lock);
+                       olck->ols_ast_wait = 0;
+               }
+               /*
+                * Lock might have been canceled while this thread was
+                * sleeping on the lock mutex, but olck is pinned in memory.
+                */
+               if (olck == dlmlock->l_ast_data) {
+                       /*
+                        * NOTE: DLM sends blocking AST's for failed locks
+                        *       (that are still in pre-OLS_GRANTED state)
+                        *       too, and they have to be canceled otherwise
+                        *       DLM lock is never destroyed and remains
+                        *       stuck in memory.
+                        *
+                        *       Alternatively, ldlm_cli_cancel() can be
+                        *       called here directly for osc_locks with
+                        *       ols_state < OLS_GRANTED to maintain an
+                        *       invariant that ->clo_cancel() is only called
+                        *       for locks that were granted.
+                        */
+                       LASSERT(data == olck);
+                       osc_lock_blocking(env, dlmlock,
+                                         olck, flag == LDLM_CB_BLOCKING);
+               } else
+                       cancel = 1;
+               cl_lock_mutex_put(env, lock);
+               osc_ast_data_put(env, olck);
+       } else
+               /*
+                * DLM lock exists, but there is no cl_lock attached to it.
+                * This is a `normal' race. cl_object and its cl_lock's can be
+                * removed by memory pressure, together with all pages.
+                */
+               cancel = (flag == LDLM_CB_BLOCKING);
+
+       if (cancel) {
+               struct lustre_handle *lockh;
+
+               lockh = &osc_env_info(env)->oti_handle;
+               ldlm_lock2handle(dlmlock, lockh);
+               result = ldlm_cli_cancel(lockh, LCF_ASYNC);
+       } else
+               result = 0;
+       return result;
+}
+
+/**
+ * Blocking ast invoked by ldlm when dlm lock is either blocking progress of
+ * some other lock, or is canceled. This function is installed as a
+ * ldlm_lock::l_blocking_ast() for client extent locks.
+ *
+ * Control flow is tricky, because ldlm uses the same call-back
+ * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's.
+ *
+ * \param dlmlock lock for which ast occurred.
+ *
+ * \param new description of a conflicting lock in case of blocking ast.
+ *
+ * \param data value of dlmlock->l_ast_data
+ *
+ * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish
+ *          cancellation and blocking ast's.
+ *
+ * Possible use cases:
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel
+ *       lock due to lock lru pressure, or explicit user request to purge
+ *       locks.
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify
+ *       us that dlmlock conflicts with another lock that some client is
+ *       enqueueing. Lock is canceled.
+ *
+ *        - cl_lock_cancel() is called. osc_lock_cancel() calls
+ *          ldlm_cli_cancel() that calls
+ *
+ *               dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ *          recursively entering osc_ldlm_blocking_ast().
+ *
+ *     - client cancels the lock voluntarily (e.g., as part of early cancellation):
+ *
+ *        cl_lock_cancel()->
+ *          osc_lock_cancel()->
+ *            ldlm_cli_cancel()->
+ *              dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ */
+static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
+                                struct ldlm_lock_desc *new, void *data,
+                                int flag)
+{
+       struct lu_env     *env;
+       struct cl_env_nest nest;
+       int             result;
+
+       /*
+        * This can be called in the context of outer IO, e.g.,
+        *
+        *     cl_enqueue()->...
+        *       ->osc_enqueue_base()->...
+        *       ->ldlm_prep_elc_req()->...
+        *         ->ldlm_cancel_callback()->...
+        *           ->osc_ldlm_blocking_ast()
+        *
+        * A new environment has to be created so as not to corrupt the
+        * outer context.
+        */
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               result = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
+               cl_env_nested_put(&nest, env);
+       } else {
+               result = PTR_ERR(env);
+               /*
+                * XXX This should never happen, as cl_lock is
+                * stuck. Pre-allocated environment a la vvp_inode_fini_env
+                * should be used.
+                */
+               LBUG();
+       }
+       if (result != 0) {
+               if (result == -ENODATA)
+                       result = 0;
+               else
+                       CERROR("BAST failed: %d\n", result);
+       }
+       return result;
+}
+
+static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
+                                  __u64 flags, void *data)
+{
+       struct cl_env_nest nest;
+       struct lu_env     *env;
+       struct osc_lock   *olck;
+       struct cl_lock    *lock;
+       int result;
+       int dlmrc;
+
+       /* first, do dlm part of the work */
+       dlmrc = ldlm_completion_ast_async(dlmlock, flags, data);
+       /* then, notify cl_lock */
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               olck = osc_ast_data_get(dlmlock);
+               if (olck != NULL) {
+                       lock = olck->ols_cl.cls_lock;
+                       cl_lock_mutex_get(env, lock);
+                       /*
+                        * ldlm_handle_cp_callback() copied LVB from request
+                        * to lock->l_lvb_data, store it in osc_lock.
+                        */
+                       LASSERT(dlmlock->l_lvb_data != NULL);
+                       lock_res_and_lock(dlmlock);
+                       olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+                       if (olck->ols_lock == NULL) {
+                               /*
+                                * upcall (osc_lock_upcall()) hasn't yet been
+                                * called. Do nothing now, upcall will bind
+                                * olck to dlmlock and signal the waiters.
+                                *
+                                * This maintains an invariant that osc_lock
+                                * and ldlm_lock are always bound when
+                                * osc_lock is in OLS_GRANTED state.
+                                */
+                       } else if (dlmlock->l_granted_mode ==
+                                  dlmlock->l_req_mode) {
+                               osc_lock_granted(env, olck, dlmlock, dlmrc);
+                       }
+                       unlock_res_and_lock(dlmlock);
+
+                       if (dlmrc != 0) {
+                               CL_LOCK_DEBUG(D_ERROR, env, lock,
+                                             "dlmlock returned %d\n", dlmrc);
+                               cl_lock_error(env, lock, dlmrc);
+                       }
+                       cl_lock_mutex_put(env, lock);
+                       osc_ast_data_put(env, olck);
+                       result = 0;
+               } else
+                       result = -ELDLM_NO_LOCK_DATA;
+               cl_env_nested_put(&nest, env);
+       } else
+               result = PTR_ERR(env);
+       return dlmrc ?: result;
+}
+
+static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
+{
+       struct ptlrpc_request  *req  = data;
+       struct osc_lock *olck;
+       struct cl_lock   *lock;
+       struct cl_object       *obj;
+       struct cl_env_nest      nest;
+       struct lu_env     *env;
+       struct ost_lvb   *lvb;
+       struct req_capsule     *cap;
+       int                  result;
+
+       LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
+
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               /* osc_ast_data_get() has to go after the environment is
+                * allocated, because osc_ast_data_get() acquires a
+                * reference to a lock, and that reference can only be
+                * released with an environment.
+                */
+               olck = osc_ast_data_get(dlmlock);
+               if (olck != NULL) {
+                       lock = olck->ols_cl.cls_lock;
+                       /* Do not grab the mutex of cl_lock for glimpse.
+                        * See LU-1274 for details.
+                        * BTW, it's okay for cl_lock to be cancelled during
+                        * this period because server can handle this race.
+                        * See ldlm_server_glimpse_ast() for details.
+                        * cl_lock_mutex_get(env, lock); */
+                       cap = &req->rq_pill;
+                       req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK);
+                       req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER,
+                                            sizeof(*lvb));
+                       result = req_capsule_server_pack(cap);
+                       if (result == 0) {
+                               lvb = req_capsule_server_get(cap, &RMF_DLM_LVB);
+                               obj = lock->cll_descr.cld_obj;
+                               result = cl_object_glimpse(env, obj, lvb);
+                       }
+                       if (!exp_connect_lvb_type(req->rq_export))
+                               req_capsule_shrink(&req->rq_pill,
+                                                  &RMF_DLM_LVB,
+                                                  sizeof(struct ost_lvb_v1),
+                                                  RCL_SERVER);
+                       osc_ast_data_put(env, olck);
+               } else {
+                       /*
+                        * These errors are normal races, so we don't want to
+                        * fill the console with messages by calling
+                        * ptlrpc_error()
+                        */
+                       lustre_pack_reply(req, 1, NULL, NULL);
+                       result = -ELDLM_NO_LOCK_DATA;
+               }
+               cl_env_nested_put(&nest, env);
+       } else
+               result = PTR_ERR(env);
+       req->rq_status = result;
+       return result;
+}
+
+static unsigned long osc_lock_weigh(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice)
+{
+       /*
+        * don't need to grab coh_page_guard since we don't care about the
+        * exact number of pages.
+        */
+       return cl_object_header(slice->cls_obj)->coh_pages;
+}
+
+/**
+ * Get the weight of dlm lock for early cancellation.
+ *
+ * XXX: it should return the pages covered by this \a dlmlock.
+ */
+static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
+{
+       struct cl_env_nest       nest;
+       struct lu_env      *env;
+       struct osc_lock  *lock;
+       struct cl_lock    *cll;
+       unsigned long       weight;
+       ENTRY;
+
+       might_sleep();
+       /*
+        * osc_ldlm_weigh_ast has a complex calling context since it might be
+        * called because of lock cancellation, or from user input. We have to
+        * make a new environment for it. It is probably safe to use the upper
+        * context because cl_lock_put() doesn't modify environment variables,
+        * but a new one is created just in case.
+        */
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               /* Mostly because of lack of memory; tend to eliminate this lock. */
+               RETURN(0);
+
+       LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT);
+       lock = osc_ast_data_get(dlmlock);
+       if (lock == NULL) {
+               /* cl_lock was destroyed because of memory pressure.
+                * It is more reasonable to assign this type of lock
+                * a lower cost.
+                */
+               GOTO(out, weight = 0);
+       }
+
+       cll = lock->ols_cl.cls_lock;
+       cl_lock_mutex_get(env, cll);
+       weight = cl_lock_weigh(env, cll);
+       cl_lock_mutex_put(env, cll);
+       osc_ast_data_put(env, lock);
+       EXIT;
+
+out:
+       cl_env_nested_put(&nest, env);
+       return weight;
+}
+
+static void osc_lock_build_einfo(const struct lu_env *env,
+                                const struct cl_lock *clock,
+                                struct osc_lock *lock,
+                                struct ldlm_enqueue_info *einfo)
+{
+       enum cl_lock_mode mode;
+
+       mode = clock->cll_descr.cld_mode;
+       if (mode == CLM_PHANTOM)
+               /*
+                * For now, enqueue all glimpse locks in read mode. In the
+                * future, client might choose to enqueue LCK_PW lock for
+                * glimpse on a file opened for write.
+                */
+               mode = CLM_READ;
+
+       einfo->ei_type   = LDLM_EXTENT;
+       einfo->ei_mode   = osc_cl_lock2ldlm(mode);
+       einfo->ei_cb_bl  = osc_ldlm_blocking_ast;
+       einfo->ei_cb_cp  = osc_ldlm_completion_ast;
+       einfo->ei_cb_gl  = osc_ldlm_glimpse_ast;
+       einfo->ei_cb_wg  = osc_ldlm_weigh_ast;
+       einfo->ei_cbdata = lock; /* value to be put into ->l_ast_data */
+}
+
+/**
+ * Determine if the lock should be converted into a lockless lock.
+ *
+ * Steps to check:
+ * - whether the lock has an explicit requirement for a non-lockless lock;
+ * - the io lock request type ci_lockreq;
+ * - send the enqueue rpc to ost to make the final decision;
+ * - special treatment for lockless truncate locks.
+ *
+ *  Additional policy can be implemented here, e.g., never do lockless-io
+ *  for large extents.
+ */
+static void osc_lock_to_lockless(const struct lu_env *env,
+                                struct osc_lock *ols, int force)
+{
+       struct cl_lock_slice *slice = &ols->ols_cl;
+
+       LASSERT(ols->ols_state == OLS_NEW ||
+               ols->ols_state == OLS_UPCALL_RECEIVED);
+
+       if (force) {
+               ols->ols_locklessable = 1;
+               slice->cls_ops = &osc_lock_lockless_ops;
+       } else {
+               struct osc_io *oio     = osc_env_io(env);
+               struct cl_io  *io      = oio->oi_cl.cis_io;
+               struct cl_object *obj  = slice->cls_obj;
+               struct osc_object *oob = cl2osc(obj);
+               const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+               struct obd_connect_data *ocd;
+
+               LASSERT(io->ci_lockreq == CILR_MANDATORY ||
+                       io->ci_lockreq == CILR_MAYBE ||
+                       io->ci_lockreq == CILR_NEVER);
+
+               ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data;
+               ols->ols_locklessable = (io->ci_type != CIT_SETATTR) &&
+                               (io->ci_lockreq == CILR_MAYBE) &&
+                               (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK);
+               if (io->ci_lockreq == CILR_NEVER ||
+                       /* lockless IO */
+                   (ols->ols_locklessable && osc_object_is_contended(oob)) ||
+                       /* lockless truncate */
+                   (cl_io_is_trunc(io) &&
+                    (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) &&
+                     osd->od_lockless_truncate)) {
+                       ols->ols_locklessable = 1;
+                       slice->cls_ops = &osc_lock_lockless_ops;
+               }
+       }
+       LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+}
+
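+/**
+ * Helper for osc_lock_enqueue_wait(): decide whether the lock being enqueued
+ * (\a qing) can coexist with an already enqueued lock (\a qed), so that the
+ * conflict scan may skip \a qed. From the checks below: a glimpse \a qed is
+ * compatible once it has received its upcall, or when \a qing is a read
+ * lock, and two read-mode locks are always compatible.
+ */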
+static int osc_lock_compatible(const struct osc_lock *qing,
+                              const struct osc_lock *qed)
+{
+       enum cl_lock_mode qing_mode;
+       enum cl_lock_mode qed_mode;
+
+       qing_mode = qing->ols_cl.cls_lock->cll_descr.cld_mode;
+       if (qed->ols_glimpse &&
+           (qed->ols_state >= OLS_UPCALL_RECEIVED || qing_mode == CLM_READ))
+               return 1;
+
+       qed_mode = qed->ols_cl.cls_lock->cll_descr.cld_mode;
+       return ((qing_mode == CLM_READ) && (qed_mode == CLM_READ));
+}
+
+/**
+ * Cancel all conflicting locks and wait for them to be destroyed.
+ *
+ * This function is used for two purposes:
+ *
+ *     - early cancel all conflicting locks before starting IO, and
+ *
+ *     - guarantee that pages added to the page cache by lockless IO are never
+ *       covered by locks other than lockless IO lock, and, hence, are not
+ *       visible to other threads.
+ */
+static int osc_lock_enqueue_wait(const struct lu_env *env,
+                                const struct osc_lock *olck)
+{
+       struct cl_lock    *lock    = olck->ols_cl.cls_lock;
+       struct cl_lock_descr    *descr   = &lock->cll_descr;
+       struct cl_object_header *hdr     = cl_object_header(descr->cld_obj);
+       struct cl_lock    *scan;
+       struct cl_lock    *conflict = NULL;
+       int lockless                 = osc_lock_is_lockless(olck);
+       int rc                     = 0;
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+
+       /* make it enqueue anyway for glimpse lock, because we actually
+        * don't need to cancel any conflicting locks. */
+       if (olck->ols_glimpse)
+               return 0;
+
+       spin_lock(&hdr->coh_lock_guard);
+       list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+               struct cl_lock_descr *cld = &scan->cll_descr;
+               const struct osc_lock *scan_ols;
+
+               if (scan == lock)
+                       break;
+
+               if (scan->cll_state < CLS_QUEUING ||
+                   scan->cll_state == CLS_FREEING ||
+                   cld->cld_start > descr->cld_end ||
+                   cld->cld_end < descr->cld_start)
+                       continue;
+
+               /* overlapped and living locks. */
+
+               /* We're not supposed to give up group lock. */
+               if (scan->cll_descr.cld_mode == CLM_GROUP) {
+                       LASSERT(descr->cld_mode != CLM_GROUP ||
+                               descr->cld_gid != scan->cll_descr.cld_gid);
+                       continue;
+               }
+
+               scan_ols = osc_lock_at(scan);
+
+               /* We need to cancel the compatible locks if we're enqueuing
+                * a lockless lock, for example:
+                * imagine that client has PR lock on [0, 1000], and thread T0
+                * is doing lockless IO in [500, 1500] region. Concurrent
+                * thread T1 can see lockless data in [500, 1000], which is
+                * wrong, because these data are possibly stale. */
+               if (!lockless && osc_lock_compatible(olck, scan_ols))
+                       continue;
+
+               cl_lock_get_trust(scan);
+               conflict = scan;
+               break;
+       }
+       spin_unlock(&hdr->coh_lock_guard);
+
+       if (conflict) {
+               if (lock->cll_descr.cld_mode == CLM_GROUP) {
+                       /* we want a group lock but a previous lock request
+                        * conflicts; we do not wait but return 0 so the
+                        * request is sent to the server
+                        */
+                       CDEBUG(D_DLMTRACE, "group lock %p is conflicted "
+                                          "with %p, no wait, send to server\n",
+                              lock, conflict);
+                       cl_lock_put(env, conflict);
+                       rc = 0;
+               } else {
+                       CDEBUG(D_DLMTRACE, "lock %p is conflicted with %p, "
+                                          "will wait\n",
+                              lock, conflict);
+                       LASSERT(lock->cll_conflict == NULL);
+                       lu_ref_add(&conflict->cll_reference, "cancel-wait",
+                                  lock);
+                       lock->cll_conflict = conflict;
+                       rc = CLO_WAIT;
+               }
+       }
+       RETURN(rc);
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() method for osc
+ * layer. This initiates ldlm enqueue:
+ *
+ *     - cancels conflicting locks early (osc_lock_enqueue_wait());
+ *
+ *     - calls osc_enqueue_base() to do actual enqueue.
+ *
+ * osc_enqueue_base() is supplied with an upcall function that is executed
+ * when lock is received either after a local cached ldlm lock is matched, or
+ * when a reply from the server is received.
+ *
+ * This function does not wait for the network communication to complete.
+ */
+static int osc_lock_enqueue(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_io *unused, __u32 enqflags)
+{
+       struct osc_lock   *ols     = cl2osc_lock(slice);
+       struct cl_lock     *lock    = ols->ols_cl.cls_lock;
+       int result;
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+       LASSERTF(ols->ols_state == OLS_NEW,
+                "Impossible state: %d\n", ols->ols_state);
+
+       LASSERTF(ergo(ols->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ),
+               "lock = %p, ols = %p\n", lock, ols);
+
+       result = osc_lock_enqueue_wait(env, ols);
+       if (result == 0) {
+               if (!osc_lock_is_lockless(ols)) {
+                       struct osc_object       *obj = cl2osc(slice->cls_obj);
+                       struct osc_thread_info   *info = osc_env_info(env);
+                       struct ldlm_res_id       *resname = &info->oti_resname;
+                       ldlm_policy_data_t       *policy = &info->oti_policy;
+                       struct ldlm_enqueue_info *einfo = &ols->ols_einfo;
+
+                       /* lock will be passed as upcall cookie,
+                        * hold a ref to prevent it from being released. */
+                       cl_lock_hold_add(env, lock, "upcall", lock);
+                       /* a user for lock also */
+                       cl_lock_user_add(env, lock);
+                       ols->ols_state = OLS_ENQUEUED;
+
+                       /*
+                        * XXX: this is a possible blocking point as
+                        * ldlm_lock_match(LDLM_FL_LVB_READY) waits for
+                        * LDLM_CP_CALLBACK.
+                        */
+                       ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname);
+                       osc_lock_build_policy(env, lock, policy);
+                       result = osc_enqueue_base(osc_export(obj), resname,
+                                         &ols->ols_flags, policy,
+                                         &ols->ols_lvb,
+                                         obj->oo_oinfo->loi_kms_valid,
+                                         osc_lock_upcall,
+                                         ols, einfo, &ols->ols_handle,
+                                         PTLRPCD_SET, 1, ols->ols_agl);
+                       if (result != 0) {
+                               cl_lock_user_del(env, lock);
+                               cl_lock_unhold(env, lock, "upcall", lock);
+                               if (unlikely(result == -ECANCELED)) {
+                                       ols->ols_state = OLS_NEW;
+                                       result = 0;
+                               }
+                       }
+               } else {
+                       ols->ols_state = OLS_GRANTED;
+                       ols->ols_owner = osc_env_io(env);
+               }
+       }
+       LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+       RETURN(result);
+}
+
+static int osc_lock_wait(const struct lu_env *env,
+                        const struct cl_lock_slice *slice)
+{
+       struct osc_lock *olck = cl2osc_lock(slice);
+       struct cl_lock  *lock = olck->ols_cl.cls_lock;
+
+       LINVRNT(osc_lock_invariant(olck));
+
+       if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) {
+               if (olck->ols_flags & LDLM_FL_LVB_READY) {
+                       return 0;
+               } else if (olck->ols_agl) {
+                       if (lock->cll_flags & CLF_FROM_UPCALL)
+                               /* It is from enqueue RPC reply upcall for
+                                * updating state. Do not re-enqueue. */
+                               return -ENAVAIL;
+                       else
+                               olck->ols_state = OLS_NEW;
+               } else {
+                       LASSERT(lock->cll_error);
+                       return lock->cll_error;
+               }
+       }
+
+       if (olck->ols_state == OLS_NEW) {
+               int rc;
+
+               LASSERT(olck->ols_agl);
+               olck->ols_agl = 0;
+               rc = osc_lock_enqueue(env, slice, NULL, CEF_ASYNC | CEF_MUST);
+               if (rc != 0)
+                       return rc;
+               else
+                       return CLO_REENQUEUED;
+       }
+
+       LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED &&
+                    lock->cll_error == 0, olck->ols_lock != NULL));
+
+       return lock->cll_error ?: olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_use() method that pins cached
+ * lock.
+ */
+static int osc_lock_use(const struct lu_env *env,
+                       const struct cl_lock_slice *slice)
+{
+       struct osc_lock *olck = cl2osc_lock(slice);
+       int rc;
+
+       LASSERT(!olck->ols_hold);
+
+       /*
+        * Atomically check for LDLM_FL_CBPENDING and addref a lock if this
+        * flag is not set. This protects us from a concurrent blocking ast.
+        */
+       rc = ldlm_lock_addref_try(&olck->ols_handle, olck->ols_einfo.ei_mode);
+       if (rc == 0) {
+               olck->ols_hold = 1;
+               olck->ols_state = OLS_GRANTED;
+       } else {
+               struct cl_lock *lock;
+
+               /*
+                * Lock is being cancelled somewhere within
+                * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is already
+                * set, but osc_ldlm_blocking_ast() hasn't yet acquired
+                * cl_lock mutex.
+                */
+               lock = slice->cls_lock;
+               LASSERT(lock->cll_state == CLS_INTRANSIT);
+               LASSERT(lock->cll_users > 0);
+               /* set a flag for osc_dlm_blocking_ast0() to signal the
+                * lock. */
+               olck->ols_ast_wait = 1;
+               rc = CLO_WAIT;
+       }
+       return rc;
+}
+
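+/**
+ * Flush pages covered by \a ols before the lock goes away: when the lock
+ * mode is CLM_WRITE or stronger, covered pages are written back (or
+ * discarded when \a discard is set) via osc_cache_writeback_range(), and
+ * remaining pages are dropped with cl_lock_discard_pages(). Called from
+ * osc_lock_cancel() and from the lockless cancel path below.
+ */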
+static int osc_lock_flush(struct osc_lock *ols, int discard)
+{
+       struct cl_lock       *lock  = ols->ols_cl.cls_lock;
+       struct cl_env_nest    nest;
+       struct lu_env   *env;
+       int result = 0;
+       ENTRY;
+
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               struct osc_object    *obj   = cl2osc(ols->ols_cl.cls_obj);
+               struct cl_lock_descr *descr = &lock->cll_descr;
+               int rc = 0;
+
+               if (descr->cld_mode >= CLM_WRITE) {
+                       result = osc_cache_writeback_range(env, obj,
+                                       descr->cld_start, descr->cld_end,
+                                       1, discard);
+                       LDLM_DEBUG(ols->ols_lock,
+                               "lock %p: %d pages were %s.\n", lock, result,
+                               discard ? "discarded" : "written");
+                       if (result > 0)
+                               result = 0;
+               }
+
+               rc = cl_lock_discard_pages(env, lock);
+               if (result == 0 && rc < 0)
+                       result = rc;
+
+               cl_env_nested_put(&nest, env);
+       } else
+               result = PTR_ERR(env);
+       if (result == 0) {
+               ols->ols_flush = 1;
+               LINVRNT(!osc_lock_has_pages(ols));
+       }
+       RETURN(result);
+}
+
+/**
+ * Implements cl_lock_operations::clo_cancel() method for osc layer. This is
+ * called (as part of cl_lock_cancel()) when the lock is canceled either
+ * voluntarily (LRU pressure, early cancellation, umount, etc.) or due to a
+ * conflict with some other lock somewhere in the cluster. This function
+ * does the following:
+ *
+ *     - invalidates all pages protected by this lock (after sending dirty
+ *       ones to the server, as necessary);
+ *
+ *     - decref's underlying ldlm lock;
+ *
+ *     - cancels ldlm lock (ldlm_cli_cancel()).
+ */
+static void osc_lock_cancel(const struct lu_env *env,
+                           const struct cl_lock_slice *slice)
+{
+       struct cl_lock   *lock    = slice->cls_lock;
+       struct osc_lock  *olck    = cl2osc_lock(slice);
+       struct ldlm_lock *dlmlock = olck->ols_lock;
+       int            result  = 0;
+       int            discard;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+       LINVRNT(osc_lock_invariant(olck));
+
+       if (dlmlock != NULL) {
+               int do_cancel;
+
+               discard = !!(dlmlock->l_flags & LDLM_FL_DISCARD_DATA);
+               if (olck->ols_state >= OLS_GRANTED)
+                       result = osc_lock_flush(olck, discard);
+               osc_lock_unhold(olck);
+
+               lock_res_and_lock(dlmlock);
+               /* Now that we're the only user of the dlm read/write
+                * reference, ->l_readers and ->l_writers should normally be
+                * zero. However, there is a corner case.
+                * See bug 18829 for details. */
+               do_cancel = (dlmlock->l_readers == 0 &&
+                            dlmlock->l_writers == 0);
+               dlmlock->l_flags |= LDLM_FL_CBPENDING;
+               unlock_res_and_lock(dlmlock);
+               if (do_cancel)
+                       result = ldlm_cli_cancel(&olck->ols_handle, LCF_ASYNC);
+               if (result < 0)
+                       CL_LOCK_DEBUG(D_ERROR, env, lock,
+                                     "lock %p cancel failure with error(%d)\n",
+                                     lock, result);
+       }
+       olck->ols_state = OLS_CANCELLED;
+       olck->ols_flags &= ~LDLM_FL_LVB_READY;
+       osc_lock_detach(env, olck);
+}
+
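+/*
+ * Page accounting for osc locks is not implemented here; this always
+ * reports "no pages", which keeps the LINVRNT() checks in osc_lock_flush()
+ * and osc_lock_delete() trivially true.
+ */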
+static int osc_lock_has_pages(struct osc_lock *olck)
+{
+       return 0;
+}
+
+static void osc_lock_delete(const struct lu_env *env,
+                           const struct cl_lock_slice *slice)
+{
+       struct osc_lock *olck;
+
+       olck = cl2osc_lock(slice);
+       if (olck->ols_glimpse) {
+               LASSERT(!olck->ols_hold);
+               LASSERT(!olck->ols_lock);
+               return;
+       }
+
+       LINVRNT(osc_lock_invariant(olck));
+       LINVRNT(!osc_lock_has_pages(olck));
+
+       osc_lock_unhold(olck);
+       osc_lock_detach(env, olck);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for osc layer.
+ *
+ * Maintains osc_lock::ols_owner field.
+ *
+ * This assumes that lock always enters CLS_HELD (from some other state) in
+ * the same IO context as one that requested the lock. This should not be a
+ * problem, because context is by definition shared by all activity pertaining
+ * to the same high-level IO.
+ */
+static void osc_lock_state(const struct lu_env *env,
+                          const struct cl_lock_slice *slice,
+                          enum cl_lock_state state)
+{
+       struct osc_lock *lock = cl2osc_lock(slice);
+
+       /*
+        * XXX multiple io contexts can use the lock at the same time.
+        */
+       LINVRNT(osc_lock_invariant(lock));
+       if (state == CLS_HELD && slice->cls_lock->cll_state != CLS_HELD) {
+               struct osc_io *oio = osc_env_io(env);
+
+               LASSERT(lock->ols_owner == NULL);
+               lock->ols_owner = oio;
+       } else if (state != CLS_HELD)
+               lock->ols_owner = NULL;
+}
+
+static int osc_lock_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t p, const struct cl_lock_slice *slice)
+{
+       struct osc_lock *lock = cl2osc_lock(slice);
+
+       /*
+        * XXX print ldlm lock and einfo properly.
+        */
+       (*p)(env, cookie, "%p %#16llx "LPX64" %d %p ",
+            lock->ols_lock, lock->ols_flags, lock->ols_handle.cookie,
+            lock->ols_state, lock->ols_owner);
+       osc_lvb_print(env, cookie, p, &lock->ols_lvb);
+       return 0;
+}
+
+static int osc_lock_fits_into(const struct lu_env *env,
+                             const struct cl_lock_slice *slice,
+                             const struct cl_lock_descr *need,
+                             const struct cl_io *io)
+{
+       struct osc_lock *ols = cl2osc_lock(slice);
+
+       if (need->cld_enq_flags & CEF_NEVER)
+               return 0;
+
+       if (ols->ols_state >= OLS_CANCELLED)
+               return 0;
+
+       if (need->cld_mode == CLM_PHANTOM) {
+               if (ols->ols_agl)
+                       return !(ols->ols_state > OLS_RELEASED);
+
+               /*
+                * Note: the QUEUED lock can't be matched here, otherwise
+                * it might cause deadlocks.
+                * For example, in the read process:
+                * P1: enqueued read lock, create sublock1
+                * P2: enqueued write lock, create sublock2 (conflicting
+                *     with sublock1).
+                * P1: Grant read lock.
+                * P1: enqueued glimpse lock (while holding sublock1_read),
+                *     matched with sublock2, waiting for sublock2 to be
+                *     granted. But sublock2 cannot be granted, because P1
+                *     will not release sublock1. Bang!
+                */
+               if (ols->ols_state < OLS_GRANTED ||
+                   ols->ols_state > OLS_RELEASED)
+                       return 0;
+       } else if (need->cld_enq_flags & CEF_MUST) {
+               /*
+                * If the lock has never been enqueued, it can't be matched,
+                * because the enqueue process brings in a lot of information
+                * that is used to determine things such as lockless mode,
+                * CEF_MUST, etc.
+                */
+               if (ols->ols_state < OLS_UPCALL_RECEIVED &&
+                   ols->ols_locklessable)
+                       return 0;
+       }
+       return 1;
+}
+
+static const struct cl_lock_operations osc_lock_ops = {
+       .clo_fini    = osc_lock_fini,
+       .clo_enqueue = osc_lock_enqueue,
+       .clo_wait    = osc_lock_wait,
+       .clo_unuse   = osc_lock_unuse,
+       .clo_use     = osc_lock_use,
+       .clo_delete  = osc_lock_delete,
+       .clo_state   = osc_lock_state,
+       .clo_cancel  = osc_lock_cancel,
+       .clo_weigh   = osc_lock_weigh,
+       .clo_print   = osc_lock_print,
+       .clo_fits_into = osc_lock_fits_into,
+};
+
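+/*
+ * Operations for lockless osc locks (installed by osc_lock_to_lockless()).
+ * A lockless lock has no DLM lock to cache, so unuse simply cancels and
+ * deletes the cl_lock instead of keeping it around.
+ */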
+static int osc_lock_lockless_unuse(const struct lu_env *env,
+                                  const struct cl_lock_slice *slice)
+{
+       struct osc_lock *ols = cl2osc_lock(slice);
+       struct cl_lock *lock = slice->cls_lock;
+
+       LASSERT(ols->ols_state == OLS_GRANTED);
+       LINVRNT(osc_lock_invariant(ols));
+
+       cl_lock_cancel(env, lock);
+       cl_lock_delete(env, lock);
+       return 0;
+}
+
+static void osc_lock_lockless_cancel(const struct lu_env *env,
+                                    const struct cl_lock_slice *slice)
+{
+       struct osc_lock   *ols  = cl2osc_lock(slice);
+       int result;
+
+       result = osc_lock_flush(ols, 0);
+       if (result)
+               CERROR("Pages for lockless lock %p were not purged(%d)\n",
+                      ols, result);
+       ols->ols_state = OLS_CANCELLED;
+}
+
+static int osc_lock_lockless_wait(const struct lu_env *env,
+                                 const struct cl_lock_slice *slice)
+{
+       struct osc_lock *olck = cl2osc_lock(slice);
+       struct cl_lock  *lock = olck->ols_cl.cls_lock;
+
+       LINVRNT(osc_lock_invariant(olck));
+       LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED);
+
+       return lock->cll_error;
+}
+
+static void osc_lock_lockless_state(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice,
+                                   enum cl_lock_state state)
+{
+       struct osc_lock *lock = cl2osc_lock(slice);
+
+       LINVRNT(osc_lock_invariant(lock));
+       if (state == CLS_HELD) {
+               struct osc_io *oio  = osc_env_io(env);
+
+               LASSERT(ergo(lock->ols_owner, lock->ols_owner == oio));
+               lock->ols_owner = oio;
+
+               /* set the io to be lockless if this lock is for io's
+                * host object */
+               if (cl_object_same(oio->oi_cl.cis_obj, slice->cls_obj))
+                       oio->oi_lockless = 1;
+       }
+}
+
+static int osc_lock_lockless_fits_into(const struct lu_env *env,
+                                      const struct cl_lock_slice *slice,
+                                      const struct cl_lock_descr *need,
+                                      const struct cl_io *io)
+{
+       struct osc_lock *lock = cl2osc_lock(slice);
+
+       if (!(need->cld_enq_flags & CEF_NEVER))
+               return 0;
+
+       /* lockless lock should only be used by its owning io. b22147 */
+       return (lock->ols_owner == osc_env_io(env));
+}
+
+static const struct cl_lock_operations osc_lock_lockless_ops = {
+       .clo_fini      = osc_lock_fini,
+       .clo_enqueue   = osc_lock_enqueue,
+       .clo_wait      = osc_lock_lockless_wait,
+       .clo_unuse     = osc_lock_lockless_unuse,
+       .clo_state     = osc_lock_lockless_state,
+       .clo_fits_into = osc_lock_lockless_fits_into,
+       .clo_cancel    = osc_lock_lockless_cancel,
+       .clo_print     = osc_lock_print
+};
+
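+/**
+ * Lock initialization for the osc layer, registered as coo_lock_init in the
+ * osc cl_object operations: allocates an osc_lock slice, translates the cl
+ * enqueue flags into LDLM flags, and optionally converts the lock to a
+ * lockless one via osc_lock_to_lockless().
+ */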
+int osc_lock_init(const struct lu_env *env,
+                 struct cl_object *obj, struct cl_lock *lock,
+                 const struct cl_io *unused)
+{
+       struct osc_lock *clk;
+       int result;
+
+       OBD_SLAB_ALLOC_PTR_GFP(clk, osc_lock_kmem, __GFP_IO);
+       if (clk != NULL) {
+               __u32 enqflags = lock->cll_descr.cld_enq_flags;
+
+               osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo);
+               atomic_set(&clk->ols_pageref, 0);
+               clk->ols_state = OLS_NEW;
+
+               clk->ols_flags = osc_enq2ldlm_flags(enqflags);
+               clk->ols_agl = !!(enqflags & CEF_AGL);
+               if (clk->ols_agl)
+                       clk->ols_flags |= LDLM_FL_BLOCK_NOWAIT;
+               if (clk->ols_flags & LDLM_FL_HAS_INTENT)
+                       clk->ols_glimpse = 1;
+
+               cl_lock_slice_add(lock, &clk->ols_cl, obj, &osc_lock_ops);
+
+               if (!(enqflags & CEF_MUST))
+                       /* try to convert this lock to a lockless lock */
+                       osc_lock_to_lockless(env, clk, (enqflags & CEF_NEVER));
+               if (clk->ols_locklessable && !(enqflags & CEF_DISCARD_DATA))
+                       clk->ols_flags |= LDLM_FL_DENY_ON_CONTENTION;
+
+               LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx\n",
+                               lock, clk, clk->ols_flags);
+
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
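+/**
+ * Report whether the osc_lock attached to \a dlm (if any) has a non-zero
+ * ols_pageref, i.e. whether pages still hold a reference on it. Returns 1
+ * in that case and 0 otherwise; the counter is probed by temporarily
+ * biasing it with _PAGEREF_MAGIC and restoring it afterwards.
+ */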
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm)
+{
+       struct osc_lock *olock;
+       int           rc = 0;
+
+       spin_lock(&osc_ast_guard);
+       olock = dlm->l_ast_data;
+       /*
+        * there's a very rare race with osc_page_addref_lock(), but that
+        * doesn't matter because in the worst case we don't cancel a lock
+        * which we actually can, that's no harm.
+        */
+       if (olock != NULL &&
+           atomic_add_return(_PAGEREF_MAGIC,
+                                 &olock->ols_pageref) != _PAGEREF_MAGIC) {
+               atomic_sub(_PAGEREF_MAGIC, &olock->ols_pageref);
+               rc = 1;
+       }
+       spin_unlock(&osc_ast_guard);
+       return rc;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c
new file mode 100644 (file)
index 0000000..ca94e63
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_object *osc2lu(struct osc_object *osc)
+{
+       return &osc->oo_cl.co_lu;
+}
+
+static struct osc_object *lu2osc(const struct lu_object *obj)
+{
+       LINVRNT(osc_is_object(obj));
+       return container_of0(obj, struct osc_object, oo_cl.co_lu);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+static int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+                          const struct lu_object_conf *conf)
+{
+       struct osc_object          *osc   = lu2osc(obj);
+       const struct cl_object_conf *cconf = lu2cl_conf(conf);
+       int i;
+
+       osc->oo_oinfo = cconf->u.coc_oinfo;
+       spin_lock_init(&osc->oo_seatbelt);
+       for (i = 0; i < CRT_NR; ++i)
+               INIT_LIST_HEAD(&osc->oo_inflight[i]);
+
+       INIT_LIST_HEAD(&osc->oo_ready_item);
+       INIT_LIST_HEAD(&osc->oo_hp_ready_item);
+       INIT_LIST_HEAD(&osc->oo_write_item);
+       INIT_LIST_HEAD(&osc->oo_read_item);
+
+       osc->oo_root.rb_node = NULL;
+       INIT_LIST_HEAD(&osc->oo_hp_exts);
+       INIT_LIST_HEAD(&osc->oo_urgent_exts);
+       INIT_LIST_HEAD(&osc->oo_rpc_exts);
+       INIT_LIST_HEAD(&osc->oo_reading_exts);
+       atomic_set(&osc->oo_nr_reads, 0);
+       atomic_set(&osc->oo_nr_writes, 0);
+       spin_lock_init(&osc->oo_lock);
+
+       cl_object_page_init(lu2cl(obj), sizeof(struct osc_page));
+
+       return 0;
+}
+
+static void osc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct osc_object *osc = lu2osc(obj);
+       int i;
+
+       for (i = 0; i < CRT_NR; ++i)
+               LASSERT(list_empty(&osc->oo_inflight[i]));
+
+       LASSERT(list_empty(&osc->oo_ready_item));
+       LASSERT(list_empty(&osc->oo_hp_ready_item));
+       LASSERT(list_empty(&osc->oo_write_item));
+       LASSERT(list_empty(&osc->oo_read_item));
+
+       LASSERT(osc->oo_root.rb_node == NULL);
+       LASSERT(list_empty(&osc->oo_hp_exts));
+       LASSERT(list_empty(&osc->oo_urgent_exts));
+       LASSERT(list_empty(&osc->oo_rpc_exts));
+       LASSERT(list_empty(&osc->oo_reading_exts));
+       LASSERT(atomic_read(&osc->oo_nr_reads) == 0);
+       LASSERT(atomic_read(&osc->oo_nr_writes) == 0);
+
+       lu_object_fini(obj);
+       OBD_SLAB_FREE_PTR(osc, osc_object_kmem);
+}
+
+int osc_lvb_print(const struct lu_env *env, void *cookie,
+                 lu_printer_t p, const struct ost_lvb *lvb)
+{
+       return (*p)(env, cookie, "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+                   "ctime: "LPU64" blocks: "LPU64,
+                   lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+                   lvb->lvb_ctime, lvb->lvb_blocks);
+}
+
+static int osc_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *obj)
+{
+       struct osc_object   *osc   = lu2osc(obj);
+       struct lov_oinfo    *oinfo = osc->oo_oinfo;
+       struct osc_async_rc *ar    = &oinfo->loi_ar;
+
+       (*p)(env, cookie, "id: "DOSTID" "
+            "idx: %d gen: %d kms_valid: %u kms "LPU64" "
+            "rc: %d force_sync: %d min_xid: "LPU64" ",
+            POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx,
+            oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms,
+            ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid);
+       osc_lvb_print(env, cookie, p, &oinfo->loi_lvb);
+       return 0;
+}
+
+
+static int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_attr *attr)
+{
+       struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+       cl_lvb2attr(attr, &oinfo->loi_lvb);
+       attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0;
+       return 0;
+}
+
+int osc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_attr *attr, unsigned valid)
+{
+       struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+       struct ost_lvb   *lvb   = &oinfo->loi_lvb;
+
+       if (valid & CAT_SIZE)
+               lvb->lvb_size = attr->cat_size;
+       if (valid & CAT_MTIME)
+               lvb->lvb_mtime = attr->cat_mtime;
+       if (valid & CAT_ATIME)
+               lvb->lvb_atime = attr->cat_atime;
+       if (valid & CAT_CTIME)
+               lvb->lvb_ctime = attr->cat_ctime;
+       if (valid & CAT_BLOCKS)
+               lvb->lvb_blocks = attr->cat_blocks;
+       if (valid & CAT_KMS) {
+               CDEBUG(D_CACHE, "set kms from "LPU64" to "LPU64"\n",
+                      oinfo->loi_kms, (__u64)attr->cat_kms);
+               loi_kms_set(oinfo, attr->cat_kms);
+       }
+       return 0;
+}
+
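+/*
+ * Glimpse for osc reports the locally known size: kms (known minimum size)
+ * is returned as the object size, together with the cached block count.
+ */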
+static int osc_object_glimpse(const struct lu_env *env,
+                             const struct cl_object *obj, struct ost_lvb *lvb)
+{
+       struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+       ENTRY;
+       lvb->lvb_size   = oinfo->loi_kms;
+       lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
+       RETURN(0);
+}
+
+
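+/*
+ * Contention tracking: osc_lock_upcall() marks an object contended when the
+ * server returns -EUSERS for a locklessable lock; osc_object_is_contended()
+ * lets the contention flag expire after od_contention_time seconds.
+ */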
+void osc_object_set_contended(struct osc_object *obj)
+{
+       obj->oo_contention_time = cfs_time_current();
+       /* mb(); */
+       obj->oo_contended = 1;
+}
+
+void osc_object_clear_contended(struct osc_object *obj)
+{
+       obj->oo_contended = 0;
+}
+
+int osc_object_is_contended(struct osc_object *obj)
+{
+       struct osc_device *dev  = lu2osc_dev(obj->oo_cl.co_lu.lo_dev);
+       int osc_contention_time = dev->od_contention_time;
+       cfs_time_t cur_time     = cfs_time_current();
+       cfs_time_t retry_time;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION))
+               return 1;
+
+       if (!obj->oo_contended)
+               return 0;
+
+       /*
+        * I like copy-paste. The code is copied from
+        * ll_file_is_contended.
+        */
+       retry_time = cfs_time_add(obj->oo_contention_time,
+                                 cfs_time_seconds(osc_contention_time));
+       if (cfs_time_after(cur_time, retry_time)) {
+               osc_object_clear_contended(obj);
+               return 0;
+       }
+       return 1;
+}
+
+static const struct cl_object_operations osc_ops = {
+       .coo_page_init = osc_page_init,
+       .coo_lock_init = osc_lock_init,
+       .coo_io_init   = osc_io_init,
+       .coo_attr_get  = osc_attr_get,
+       .coo_attr_set  = osc_attr_set,
+       .coo_glimpse   = osc_object_glimpse
+};
+
+static const struct lu_object_operations osc_lu_obj_ops = {
+       .loo_object_init      = osc_object_init,
+       .loo_object_delete    = NULL,
+       .loo_object_release   = NULL,
+       .loo_object_free      = osc_object_free,
+       .loo_object_print     = osc_object_print,
+       .loo_object_invariant = NULL
+};
+
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *unused,
+                                  struct lu_device *dev)
+{
+       struct osc_object *osc;
+       struct lu_object  *obj;
+
+       OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, __GFP_IO);
+       if (osc != NULL) {
+               obj = osc2lu(osc);
+               lu_object_init(obj, NULL, dev);
+               osc->oo_cl.co_ops = &osc_ops;
+               obj->lo_ops = &osc_lu_obj_ops;
+       } else
+               obj = NULL;
+       return obj;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c
new file mode 100644 (file)
index 0000000..07d3702
--- /dev/null
@@ -0,0 +1,926 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del);
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg);
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+                          struct osc_page *opg);
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*
+ * Comment out osc_page_protected because it may sleep inside the
+ * client_obd_list_lock.
+ * client_obd_list_lock -> osc_ap_completion -> osc_completion ->
+ *   -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base
+ *   -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep.
+ */
+#if 0
+static int osc_page_is_dlocked(const struct lu_env *env,
+                              const struct osc_page *opg,
+                              enum cl_lock_mode mode, int pending, int unref)
+{
+       struct cl_page   *page;
+       struct osc_object      *obj;
+       struct osc_thread_info *info;
+       struct ldlm_res_id     *resname;
+       struct lustre_handle   *lockh;
+       ldlm_policy_data_t     *policy;
+       ldlm_mode_t          dlmmode;
+       int                  flags;
+
+       might_sleep();
+
+       info = osc_env_info(env);
+       resname = &info->oti_resname;
+       policy = &info->oti_policy;
+       lockh = &info->oti_handle;
+       page = opg->ops_cl.cpl_page;
+       obj = cl2osc(opg->ops_cl.cpl_obj);
+
+       flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED;
+       if (pending)
+               flags |= LDLM_FL_CBPENDING;
+
+       dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW;
+       osc_lock_build_res(env, obj, resname);
+       osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index);
+       return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy,
+                             dlmmode, &flags, NULL, lockh, unref);
+}
+
+/**
+ * Checks an invariant that a page in the cache is covered by a lock, as
+ * needed.
+ */
+static int osc_page_protected(const struct lu_env *env,
+                             const struct osc_page *opg,
+                             enum cl_lock_mode mode, int unref)
+{
+       struct cl_object_header *hdr;
+       struct cl_lock    *scan;
+       struct cl_page    *page;
+       struct cl_lock_descr    *descr;
+       int result;
+
+       LINVRNT(!opg->ops_temp);
+
+       page = opg->ops_cl.cpl_page;
+       if (page->cp_owner != NULL &&
+           cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER)
+               /*
+                * If IO is done without locks (liblustre, or lloop), lock is
+                * not required.
+                */
+               result = 1;
+       else
+               /* otherwise check for a DLM lock */
+               result = osc_page_is_dlocked(env, opg, mode, 1, unref);
+       if (result == 0) {
+               /* maybe this page is a part of a lockless io? */
+               hdr = cl_object_header(opg->ops_cl.cpl_obj);
+               descr = &osc_env_info(env)->oti_descr;
+               descr->cld_mode = mode;
+               descr->cld_start = page->cp_index;
+               descr->cld_end   = page->cp_index;
+               spin_lock(&hdr->coh_lock_guard);
+               list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+                       /*
+                        * Lock-less sub-lock has to be either in HELD state
+                        * (when io is actively going on), or in CACHED state,
+                        * when top-lock is being unlocked:
+                        * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse().
+                        */
+                       if ((scan->cll_state == CLS_HELD ||
+                            scan->cll_state == CLS_CACHED) &&
+                           cl_lock_ext_match(&scan->cll_descr, descr)) {
+                               struct osc_lock *olck;
+
+                               olck = osc_lock_at(scan);
+                               result = osc_lock_is_lockless(olck);
+                               break;
+                       }
+               }
+               spin_unlock(&hdr->coh_lock_guard);
+       }
+       return result;
+}
+#else
+static int osc_page_protected(const struct lu_env *env,
+                             const struct osc_page *opg,
+                             enum cl_lock_mode mode, int unref)
+{
+       return 1;
+}
+#endif
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+static void osc_page_fini(const struct lu_env *env,
+                         struct cl_page_slice *slice)
+{
+       struct osc_page *opg = cl2osc_page(slice);
+       CDEBUG(D_TRACE, "%p\n", opg);
+       LASSERT(opg->ops_lock == NULL);
+}
+
+static void osc_page_transfer_get(struct osc_page *opg, const char *label)
+{
+       struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+       LASSERT(!opg->ops_transfer_pinned);
+       cl_page_get(page);
+       lu_ref_add_atomic(&page->cp_reference, label, page);
+       opg->ops_transfer_pinned = 1;
+}
+
+static void osc_page_transfer_put(const struct lu_env *env,
+                                 struct osc_page *opg)
+{
+       struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+       if (opg->ops_transfer_pinned) {
+               lu_ref_del(&page->cp_reference, "transfer", page);
+               opg->ops_transfer_pinned = 0;
+               cl_page_put(env, page);
+       }
+}
+
+/**
+ * This is called once for every page when it is submitted for a transfer
+ * either opportunistic (osc_page_cache_add()), or immediate
+ * (osc_page_submit()).
+ */
+static void osc_page_transfer_add(const struct lu_env *env,
+                                 struct osc_page *opg, enum cl_req_type crt)
+{
+       struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+       /* ops_lru and ops_inflight share the same field, so take it from LRU
+        * first and then use it as inflight. */
+       osc_lru_del(osc_cli(obj), opg, false);
+
+       spin_lock(&obj->oo_seatbelt);
+       list_add(&opg->ops_inflight, &obj->oo_inflight[crt]);
+       opg->ops_submitter = current;
+       spin_unlock(&obj->oo_seatbelt);
+}
+
+static int osc_page_cache_add(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *io)
+{
+       struct osc_io   *oio = osc_env_io(env);
+       struct osc_page *opg = cl2osc_page(slice);
+       int result;
+       ENTRY;
+
+       LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0));
+
+       osc_page_transfer_get(opg, "transfer\0cache");
+       result = osc_queue_async_io(env, io, opg);
+       if (result != 0)
+               osc_page_transfer_put(env, opg);
+       else
+               osc_page_transfer_add(env, opg, CRT_WRITE);
+
+       /* For a sync write, the kernel will wait for this page to be flushed
+        * before osc_io_end() is called, so release the active extent earlier.
+        * For mkwrite(), it is known that there are no further pages. */
+       if (cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) {
+               if (oio->oi_active != NULL) {
+                       osc_extent_release(env, oio->oi_active);
+                       oio->oi_active = NULL;
+               }
+       }
+
+       RETURN(result);
+}
+
+void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
+                     pgoff_t start, pgoff_t end)
+{
+       memset(policy, 0, sizeof *policy);
+       policy->l_extent.start = cl_offset(obj, start);
+       policy->l_extent.end   = cl_offset(obj, end + 1) - 1;
+}
+
+static int osc_page_addref_lock(const struct lu_env *env,
+                               struct osc_page *opg,
+                               struct cl_lock *lock)
+{
+       struct osc_lock *olock;
+       int           rc;
+
+       LASSERT(opg->ops_lock == NULL);
+
+       olock = osc_lock_at(lock);
+       if (atomic_inc_return(&olock->ols_pageref) <= 0) {
+               atomic_dec(&olock->ols_pageref);
+               rc = -ENODATA;
+       } else {
+               cl_lock_get(lock);
+               opg->ops_lock = lock;
+               rc = 0;
+       }
+       return rc;
+}
+
+static void osc_page_putref_lock(const struct lu_env *env,
+                                struct osc_page *opg)
+{
+       struct cl_lock  *lock = opg->ops_lock;
+       struct osc_lock *olock;
+
+       LASSERT(lock != NULL);
+       olock = osc_lock_at(lock);
+
+       atomic_dec(&olock->ols_pageref);
+       opg->ops_lock = NULL;
+
+       cl_lock_put(env, lock);
+}
+
+static int osc_page_is_under_lock(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *unused)
+{
+       struct cl_lock *lock;
+       int          result = -ENODATA;
+
+       ENTRY;
+       lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page,
+                              NULL, 1, 0);
+       if (lock != NULL) {
+               if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0)
+                       result = -EBUSY;
+               cl_lock_put(env, lock);
+       }
+       RETURN(result);
+}
+
+static void osc_page_disown(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io)
+{
+       struct osc_page *opg = cl2osc_page(slice);
+
+       if (unlikely(opg->ops_lock))
+               osc_page_putref_lock(env, opg);
+}
+
+static void osc_page_completion_read(const struct lu_env *env,
+                                    const struct cl_page_slice *slice,
+                                    int ioret)
+{
+       struct osc_page   *opg = cl2osc_page(slice);
+       struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+       if (likely(opg->ops_lock))
+               osc_page_putref_lock(env, opg);
+       osc_lru_add(osc_cli(obj), opg);
+}
+
+static void osc_page_completion_write(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     int ioret)
+{
+       struct osc_page   *opg = cl2osc_page(slice);
+       struct osc_object *obj = cl2osc(slice->cpl_obj);
+
+       osc_lru_add(osc_cli(obj), opg);
+}
+
+static int osc_page_fail(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        struct cl_io *unused)
+{
+       /*
+        * Cached read?
+        */
+       LBUG();
+       return 0;
+}
+
+
+static const char *osc_list(struct list_head *head)
+{
+       return list_empty(head) ? "-" : "+";
+}
+
+static inline cfs_time_t osc_submit_duration(struct osc_page *opg)
+{
+       if (opg->ops_submit_time == 0)
+               return 0;
+
+       return (cfs_time_current() - opg->ops_submit_time);
+}
+
+static int osc_page_print(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         void *cookie, lu_printer_t printer)
+{
+       struct osc_page       *opg = cl2osc_page(slice);
+       struct osc_async_page *oap = &opg->ops_oap;
+       struct osc_object     *obj = cl2osc(slice->cpl_obj);
+       struct client_obd     *cli = &osc_export(obj)->exp_obd->u.cli;
+
+       return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p: "
+                         "1< %#x %d %u %s %s > "
+                         "2< "LPU64" %u %u %#x %#x | %p %p %p > "
+                         "3< %s %p %d %lu %d > "
+                         "4< %d %d %d %lu %s | %s %s %s %s > "
+                         "5< %s %s %s %s | %d %s | %d %s %s>\n",
+                         opg,
+                         /* 1 */
+                         oap->oap_magic, oap->oap_cmd,
+                         oap->oap_interrupted,
+                         osc_list(&oap->oap_pending_item),
+                         osc_list(&oap->oap_rpc_item),
+                         /* 2 */
+                         oap->oap_obj_off, oap->oap_page_off, oap->oap_count,
+                         oap->oap_async_flags, oap->oap_brw_flags,
+                         oap->oap_request, oap->oap_cli, obj,
+                         /* 3 */
+                         osc_list(&opg->ops_inflight),
+                         opg->ops_submitter, opg->ops_transfer_pinned,
+                         osc_submit_duration(opg), opg->ops_srvlock,
+                         /* 4 */
+                         cli->cl_r_in_flight, cli->cl_w_in_flight,
+                         cli->cl_max_rpcs_in_flight,
+                         cli->cl_avail_grant,
+                         osc_list(&cli->cl_cache_waiters),
+                         osc_list(&cli->cl_loi_ready_list),
+                         osc_list(&cli->cl_loi_hp_ready_list),
+                         osc_list(&cli->cl_loi_write_list),
+                         osc_list(&cli->cl_loi_read_list),
+                         /* 5 */
+                         osc_list(&obj->oo_ready_item),
+                         osc_list(&obj->oo_hp_ready_item),
+                         osc_list(&obj->oo_write_item),
+                         osc_list(&obj->oo_read_item),
+                         atomic_read(&obj->oo_nr_reads),
+                         osc_list(&obj->oo_reading_exts),
+                         atomic_read(&obj->oo_nr_writes),
+                         osc_list(&obj->oo_hp_exts),
+                         osc_list(&obj->oo_urgent_exts));
+}
+
+static void osc_page_delete(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+       struct osc_page   *opg = cl2osc_page(slice);
+       struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+       int rc;
+
+       LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1));
+
+       ENTRY;
+       CDEBUG(D_TRACE, "%p\n", opg);
+       osc_page_transfer_put(env, opg);
+       rc = osc_teardown_async_page(env, obj, opg);
+       if (rc) {
+               CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page),
+                             "Trying to teardown failed: %d\n", rc);
+               LASSERT(0);
+       }
+
+       spin_lock(&obj->oo_seatbelt);
+       if (opg->ops_submitter != NULL) {
+               LASSERT(!list_empty(&opg->ops_inflight));
+               list_del_init(&opg->ops_inflight);
+               opg->ops_submitter = NULL;
+       }
+       spin_unlock(&obj->oo_seatbelt);
+
+       osc_lru_del(osc_cli(obj), opg, true);
+       EXIT;
+}
+
+void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice,
+                  int from, int to)
+{
+       struct osc_page       *opg = cl2osc_page(slice);
+       struct osc_async_page *oap = &opg->ops_oap;
+
+       LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+       opg->ops_from = from;
+       opg->ops_to   = to;
+       spin_lock(&oap->oap_lock);
+       oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+       spin_unlock(&oap->oap_lock);
+}
+
+static int osc_page_cancel(const struct lu_env *env,
+                          const struct cl_page_slice *slice)
+{
+       struct osc_page *opg = cl2osc_page(slice);
+       int rc = 0;
+
+       LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+       /* Check whether the transfer of this page has completed,
+        * or was never queued at all. */
+       if (opg->ops_transfer_pinned)
+               /* FIXME: may not be interrupted.. */
+               rc = osc_cancel_async_page(env, opg);
+       LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0));
+       return rc;
+}
+
+static int osc_page_flush(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         struct cl_io *io)
+{
+       struct osc_page *opg = cl2osc_page(slice);
+       int rc = 0;
+       ENTRY;
+       rc = osc_flush_async_page(env, io, opg);
+       RETURN(rc);
+}
+
+static const struct cl_page_operations osc_page_ops = {
+       .cpo_fini         = osc_page_fini,
+       .cpo_print       = osc_page_print,
+       .cpo_delete     = osc_page_delete,
+       .cpo_is_under_lock = osc_page_is_under_lock,
+       .cpo_disown     = osc_page_disown,
+       .io = {
+               [CRT_READ] = {
+                       .cpo_cache_add  = osc_page_fail,
+                       .cpo_completion = osc_page_completion_read
+               },
+               [CRT_WRITE] = {
+                       .cpo_cache_add  = osc_page_cache_add,
+                       .cpo_completion = osc_page_completion_write
+               }
+       },
+       .cpo_clip          = osc_page_clip,
+       .cpo_cancel      = osc_page_cancel,
+       .cpo_flush        = osc_page_flush
+};
+
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_page *page, struct page *vmpage)
+{
+       struct osc_object *osc = cl2osc(obj);
+       struct osc_page   *opg = cl_object_page_slice(obj, page);
+       int result;
+
+       opg->ops_from = 0;
+       opg->ops_to   = PAGE_CACHE_SIZE;
+
+       result = osc_prep_async_page(osc, opg, vmpage,
+                                       cl_offset(obj, page->cp_index));
+       if (result == 0) {
+               struct osc_io *oio = osc_env_io(env);
+               opg->ops_srvlock = osc_io_srvlock(oio);
+               cl_page_slice_add(page, &opg->ops_cl, obj,
+                               &osc_page_ops);
+       }
+       /*
+        * Cannot assert osc_page_protected() here as read-ahead
+        * creates temporary pages outside of a lock.
+        */
+       /* ops_inflight and ops_lru are the same field, but it doesn't
+        * hurt to initialize it twice :-) */
+       INIT_LIST_HEAD(&opg->ops_inflight);
+       INIT_LIST_HEAD(&opg->ops_lru);
+
+       /* reserve an LRU space for this page */
+       if (page->cp_type == CPT_CACHEABLE && result == 0)
+               result = osc_lru_reserve(env, osc, opg);
+
+       return result;
+}
+
+/**
+ * Helper function called by osc_io_submit() for every page in an immediate
+ * transfer (i.e., transferred synchronously).
+ */
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+                    enum cl_req_type crt, int brw_flags)
+{
+       struct osc_async_page *oap = &opg->ops_oap;
+       struct osc_object     *obj = oap->oap_obj;
+
+       LINVRNT(osc_page_protected(env, opg,
+                                  crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1));
+
+       LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, "
+                "magic 0x%x\n", oap, oap->oap_magic);
+       LASSERT(oap->oap_async_flags & ASYNC_READY);
+       LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE);
+
+       oap->oap_cmd       = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+       oap->oap_page_off  = opg->ops_from;
+       oap->oap_count     = opg->ops_to - opg->ops_from;
+       oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags;
+
+       if (!client_is_remote(osc_export(obj)) &&
+                       cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+               oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+               oap->oap_cmd |= OBD_BRW_NOQUOTA;
+       }
+
+       opg->ops_submit_time = cfs_time_current();
+       osc_page_transfer_get(opg, "transfer\0imm");
+       osc_page_transfer_add(env, opg, crt);
+}
+
+/* --------------- LRU page management ------------------ */
+
+/* The OSC is a natural place to manage LRU pages, as applications write
+ * through one OSC at a time. Ideally, an OSC that is used more frequently
+ * should occupy more LRU slots. On the other hand, we should avoid using up
+ * all LRU slots (client_obd::cl_lru_left); otherwise a process has to be put
+ * to sleep waiting for free LRU slots, which would be very bad. The algorithm
+ * therefore requires each OSC to free slots voluntarily so that a reasonable
+ * number of free slots is maintained at any time.
+ */
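+/* Per-client accounting used below: cl_lru_in_list counts pages currently on
+ * cl_lru_list, cl_lru_busy counts pages that hold an LRU slot but are not on
+ * the list (e.g. while in flight), and cl_lru_left is the remaining
+ * per-client budget of LRU slots. */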
+
+static CFS_DECL_WAITQ(osc_lru_waitq);
+static atomic_t osc_lru_waiters = ATOMIC_INIT(0);
+/* LRU pages are freed in batch mode. An OSC should free at least this
+ * number of pages to avoid running out of LRU budget, and ... */
+static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT);  /* 2M */
+/* ... at most this number, otherwise it will take too long to finish. */
+static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */
+
+/* Check if we can free LRU slots from this OSC. If there are LRU waiters,
+ * we should free slots aggressively. In this way, slots are freed at a steady
+ * pace and fairness among OSCs is maintained.
+ *
+ * Return how many LRU pages should be freed. */
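+/* The heuristic below works in two tiers: if other OSCs are waiting and this
+ * client's remaining budget (cl_lru_left) has dropped below lru_shrink_max,
+ * up to half of its in-list pages (capped at lru_shrink_max) are released;
+ * otherwise, once the remaining budget falls below 1/16 of the global
+ * maximum, up to lru_shrink_max pages are released when half of the in-list
+ * pages exceed the fair share (ccc_lru_max / ccc_users), or lru_shrink_min
+ * pages when at least that many can be spared. */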
+static int osc_cache_too_much(struct client_obd *cli)
+{
+       struct cl_client_cache *cache = cli->cl_cache;
+       int pages = atomic_read(&cli->cl_lru_in_list) >> 1;
+
+       if (atomic_read(&osc_lru_waiters) > 0 &&
+           atomic_read(cli->cl_lru_left) < lru_shrink_max)
+               /* drop lru pages aggressively */
+               return min(pages, lru_shrink_max);
+
+       /* if it is about to run out of LRU slots, we should free some, but
+        * not too many, to maintain fairness among OSCs. */
+       if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) {
+               unsigned long tmp;
+
+               tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users);
+               if (pages > tmp)
+                       return min(pages, lru_shrink_max);
+
+               return pages > lru_shrink_min ? lru_shrink_min : 0;
+       }
+
+       return 0;
+}
+
+/* Return how many pages are not discarded in @pvec. */
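+/* The caller (osc_lru_shrink) subtracts the returned value from its running
+ * count, so only the pages that were actually unmapped and discarded are
+ * charged against its @target. */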
+static int discard_pagevec(const struct lu_env *env, struct cl_io *io,
+                          struct cl_page **pvec, int max_index)
+{
+       int count;
+       int i;
+
+       for (count = 0, i = 0; i < max_index; i++) {
+               struct cl_page *page = pvec[i];
+               if (cl_page_own_try(env, io, page) == 0) {
+                       /* Free an LRU page only if nobody is using it.
+                        * This check is necessary to avoid freeing pages that
+                        * have already been removed from the LRU and pinned
+                        * for IO. */
+                       if (!cl_page_in_use(page)) {
+                               cl_page_unmap(env, io, page);
+                               cl_page_discard(env, io, page);
+                               ++count;
+                       }
+                       cl_page_disown(env, io, page);
+               }
+               cl_page_put(env, page);
+               pvec[i] = NULL;
+       }
+       return max_index - count;
+}
+
+/**
+ * Drop at most @target pages from the LRU.
+ */
+int osc_lru_shrink(struct client_obd *cli, int target)
+{
+       struct cl_env_nest nest;
+       struct lu_env *env;
+       struct cl_io *io;
+       struct cl_object *clobj = NULL;
+       struct cl_page **pvec;
+       struct osc_page *opg;
+       int maxscan = 0;
+       int count = 0;
+       int index = 0;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0);
+       if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+               RETURN(0);
+
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       pvec = osc_env_info(env)->oti_pvec;
+       io = &osc_env_info(env)->oti_io;
+
+       client_obd_list_lock(&cli->cl_lru_list_lock);
+       atomic_inc(&cli->cl_lru_shrinkers);
+       maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list));
+       while (!list_empty(&cli->cl_lru_list)) {
+               struct cl_page *page;
+
+               if (--maxscan < 0)
+                       break;
+
+               opg = list_entry(cli->cl_lru_list.next, struct osc_page,
+                                    ops_lru);
+               page = cl_page_top(opg->ops_cl.cpl_page);
+               if (cl_page_in_use_noref(page)) {
+                       list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+                       continue;
+               }
+
+               LASSERT(page->cp_obj != NULL);
+               if (clobj != page->cp_obj) {
+                       struct cl_object *tmp = page->cp_obj;
+
+                       cl_object_get(tmp);
+                       client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+                       if (clobj != NULL) {
+                               count -= discard_pagevec(env, io, pvec, index);
+                               index = 0;
+
+                               cl_io_fini(env, io);
+                               cl_object_put(env, clobj);
+                               clobj = NULL;
+                       }
+
+                       clobj = tmp;
+                       io->ci_obj = clobj;
+                       io->ci_ignore_layout = 1;
+                       rc = cl_io_init(env, io, CIT_MISC, clobj);
+
+                       client_obd_list_lock(&cli->cl_lru_list_lock);
+
+                       if (rc != 0)
+                               break;
+
+                       ++maxscan;
+                       continue;
+               }
+
+               /* Move this page to the end of the list as it will be
+                * discarded soon. The page will finally be removed from the
+                * LRU list in osc_page_delete(). */
+               list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+
+               /* It's okay to grab a refcount here without holding the lock,
+                * because whoever deletes the page must first take
+                * cl_lru_list_lock. */
+               cl_page_get(page);
+               pvec[index++] = page;
+               if (++count >= target)
+                       break;
+
+               if (unlikely(index == OTI_PVEC_SIZE)) {
+                       client_obd_list_unlock(&cli->cl_lru_list_lock);
+                       count -= discard_pagevec(env, io, pvec, index);
+                       index = 0;
+
+                       client_obd_list_lock(&cli->cl_lru_list_lock);
+               }
+       }
+       client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+       if (clobj != NULL) {
+               count -= discard_pagevec(env, io, pvec, index);
+
+               cl_io_fini(env, io);
+               cl_object_put(env, clobj);
+       }
+       cl_env_nested_put(&nest, env);
+
+       atomic_dec(&cli->cl_lru_shrinkers);
+       RETURN(count > 0 ? count : rc);
+}
+
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg)
+{
+       bool wakeup = false;
+
+       if (!opg->ops_in_lru)
+               return;
+
+       atomic_dec(&cli->cl_lru_busy);
+       client_obd_list_lock(&cli->cl_lru_list_lock);
+       if (list_empty(&opg->ops_lru)) {
+               list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+               atomic_inc_return(&cli->cl_lru_in_list);
+               wakeup = atomic_read(&osc_lru_waiters) > 0;
+       }
+       client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+       if (wakeup) {
+               osc_lru_shrink(cli, osc_cache_too_much(cli));
+               wake_up_all(&osc_lru_waitq);
+       }
+}
+
+/* Delete a page from the LRU list. A page can be deleted from the LRU list
+ * for two reasons: it was redirtied, or it was removed from the page cache. */
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del)
+{
+       if (opg->ops_in_lru) {
+               client_obd_list_lock(&cli->cl_lru_list_lock);
+               if (!list_empty(&opg->ops_lru)) {
+                       LASSERT(atomic_read(&cli->cl_lru_in_list) > 0);
+                       list_del_init(&opg->ops_lru);
+                       atomic_dec(&cli->cl_lru_in_list);
+                       if (!del)
+                               atomic_inc(&cli->cl_lru_busy);
+               } else if (del) {
+                       LASSERT(atomic_read(&cli->cl_lru_busy) > 0);
+                       atomic_dec(&cli->cl_lru_busy);
+               }
+               client_obd_list_unlock(&cli->cl_lru_list_lock);
+               if (del) {
+                       atomic_inc(cli->cl_lru_left);
+                       /* This is a good place to release more LRU pages if
+                        * this OSC occupies too many LRU slots and the kernel
+                        * is stealing one of them.
+                        * cl_lru_shrinkers avoids a recursive call in case we
+                        * are already in the context of osc_lru_shrink(). */
+                       if (atomic_read(&cli->cl_lru_shrinkers) == 0)
+                               osc_lru_shrink(cli, osc_cache_too_much(cli));
+                       wake_up(&osc_lru_waitq);
+               }
+       } else {
+               LASSERT(list_empty(&opg->ops_lru));
+       }
+}
+
+static inline int max_to_shrink(struct client_obd *cli)
+{
+       return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max);
+}
+
+static int osc_lru_reclaim(struct client_obd *cli)
+{
+       struct cl_client_cache *cache = cli->cl_cache;
+       int max_scans;
+       int rc;
+
+       LASSERT(cache != NULL);
+       LASSERT(!list_empty(&cache->ccc_lru));
+
+       rc = osc_lru_shrink(cli, lru_shrink_min);
+       if (rc != 0) {
+               CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n",
+                       cli->cl_import->imp_obd->obd_name, rc, cli);
+               return rc;
+       }
+
+       CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n",
+               cli->cl_import->imp_obd->obd_name, cli,
+               atomic_read(&cli->cl_lru_in_list),
+               atomic_read(&cli->cl_lru_busy));
+
+       /* Reclaim LRU slots from other client_obds, as this one cannot free
+        * enough from its own. This should rarely happen. */
+       spin_lock(&cache->ccc_lru_lock);
+       cache->ccc_lru_shrinkers++;
+       list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+
+       max_scans = atomic_read(&cache->ccc_users);
+       while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) {
+               cli = list_entry(cache->ccc_lru.next, struct client_obd,
+                                       cl_lru_osc);
+
+               CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n",
+                       cli->cl_import->imp_obd->obd_name, cli,
+                       atomic_read(&cli->cl_lru_in_list),
+                       atomic_read(&cli->cl_lru_busy));
+
+               list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+               if (atomic_read(&cli->cl_lru_in_list) > 0) {
+                       spin_unlock(&cache->ccc_lru_lock);
+
+                       rc = osc_lru_shrink(cli, max_to_shrink(cli));
+                       spin_lock(&cache->ccc_lru_lock);
+                       if (rc != 0)
+                               break;
+               }
+       }
+       spin_unlock(&cache->ccc_lru_lock);
+
+       CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n",
+               cli->cl_import->imp_obd->obd_name, cli, rc);
+       return rc;
+}
+
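+/* Reserve one LRU slot for @opg. If the per-client budget (cl_lru_left) is
+ * exhausted, first try to reclaim slots via osc_lru_reclaim(); if nothing can
+ * be reclaimed, wait on osc_lru_waitq until a slot is released or the set of
+ * pages on this client's LRU list changes. */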
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+                          struct osc_page *opg)
+{
+       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+       struct client_obd *cli = osc_cli(obj);
+       int rc = 0;
+       ENTRY;
+
+       if (cli->cl_cache == NULL) /* shall not be in LRU */
+               RETURN(0);
+
+       LASSERT(atomic_read(cli->cl_lru_left) >= 0);
+       while (!cfs_atomic_add_unless(cli->cl_lru_left, -1, 0)) {
+               int gen;
+
+               /* ran out of LRU slots, try to free some ourselves */
+               rc = osc_lru_reclaim(cli);
+               if (rc < 0)
+                       break;
+               if (rc > 0)
+                       continue;
+
+               cond_resched();
+
+               /* Slowest case: all cached pages are busy; notify other OSCs
+                * that we are short of LRU slots. */
+               atomic_inc(&osc_lru_waiters);
+
+               gen = atomic_read(&cli->cl_lru_in_list);
+               rc = l_wait_event(osc_lru_waitq,
+                               atomic_read(cli->cl_lru_left) > 0 ||
+                               (atomic_read(&cli->cl_lru_in_list) > 0 &&
+                                gen != atomic_read(&cli->cl_lru_in_list)),
+                               &lwi);
+
+               atomic_dec(&osc_lru_waiters);
+               if (rc < 0)
+                       break;
+       }
+
+       if (rc >= 0) {
+               atomic_inc(&cli->cl_lru_busy);
+               opg->ops_in_lru = 1;
+               rc = 0;
+       }
+
+       RETURN(rc);
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c
new file mode 100644 (file)
index 0000000..69caab7
--- /dev/null
@@ -0,0 +1,332 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Code originally extracted from quota directory
+ */
+
+#include <obd_ost.h>
+#include "osc_internal.h"
+
+static inline struct osc_quota_info *osc_oqi_alloc(obd_uid id)
+{
+       struct osc_quota_info *oqi;
+
+       OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem);
+       if (oqi != NULL)
+               oqi->oqi_id = id;
+
+       return oqi;
+}
+
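+/* Check whether the given uid/gid may still consume quota on this OST. An
+ * entry in cl_quota_hash means the server has flagged that ID as close to its
+ * limit (see osc_quota_setdq()), so NO_QUOTA is returned and the caller falls
+ * back to synchronous I/O for it. */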
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
+{
+       int type;
+       ENTRY;
+
+       for (type = 0; type < MAXQUOTAS; type++) {
+               struct osc_quota_info *oqi;
+
+               oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+               if (oqi) {
+                       obd_uid id = oqi->oqi_id;
+
+                       LASSERTF(id == qid[type],
+                                "The ids don't match %u != %u\n",
+                                id, qid[type]);
+
+                       /* the slot is busy, the user is about to run out of
+                        * quota space on this OST */
+                       CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n",
+                              type == USRQUOTA ? "user" : "group", qid[type]);
+                       RETURN(NO_QUOTA);
+               }
+       }
+
+       RETURN(QUOTA_OK);
+}
+
+#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \
+                                               : OBD_MD_FLGRPQUOTA)
+#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_FL_NO_USRQUOTA \
+                                               : OBD_FL_NO_GRPQUOTA)
+
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+                   obd_flag valid, obd_flag flags)
+{
+       int type;
+       int rc = 0;
+       ENTRY;
+
+       if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0)
+               RETURN(0);
+
+       for (type = 0; type < MAXQUOTAS; type++) {
+               struct osc_quota_info *oqi;
+
+               if ((valid & MD_QUOTA_FLAG(type)) == 0)
+                       continue;
+
+               /* lookup the ID in the per-type hash table */
+               oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+               if ((flags & FL_QUOTA_FLAG(type)) != 0) {
+                       /* This ID is getting close to its quota limit, let's
+                        * switch to sync I/O */
+                       if (oqi != NULL)
+                               continue;
+
+                       oqi = osc_oqi_alloc(qid[type]);
+                       if (oqi == NULL) {
+                               rc = -ENOMEM;
+                               break;
+                       }
+
+                       rc = cfs_hash_add_unique(cli->cl_quota_hash[type],
+                                                &qid[type], &oqi->oqi_hash);
+                       /* race with others? */
+                       if (rc == -EALREADY) {
+                               rc = 0;
+                               OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+                       }
+
+                       CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n",
+                              cli->cl_import->imp_obd->obd_name,
+                              type == USRQUOTA ? "user" : "group",
+                              qid[type], rc);
+               } else {
+                       /* This ID is now off the hook, let's remove it from
+                        * the hash table */
+                       if (oqi == NULL)
+                               continue;
+
+                       oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
+                                              &qid[type]);
+                       if (oqi)
+                               OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+
+                       CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n",
+                              cli->cl_import->imp_obd->obd_name,
+                              type == USRQUOTA ? "user" : "group",
+                              qid[type], oqi);
+               }
+       }
+
+       RETURN(rc);
+}
+
+/*
+ * Hash operations for uid/gid <-> osc_quota_info
+ */
+static unsigned
+oqi_hashfn(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_u32_hash(*((__u32*)key), mask);
+}
+
+static int
+oqi_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct osc_quota_info *oqi;
+       obd_uid uid;
+
+       LASSERT(key != NULL);
+       uid = *((obd_uid*)key);
+       oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+       return uid == oqi->oqi_id;
+}
+
+static void *
+oqi_key(struct hlist_node *hnode)
+{
+       struct osc_quota_info *oqi;
+       oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+       return &oqi->oqi_id;
+}
+
+static void *
+oqi_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+}
+
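+/* osc_quota_info entries are not reference counted: oqi_get() and
+ * oqi_put_locked() below are intentionally empty, and entries live until they
+ * are removed in osc_quota_setdq() or freed by oqi_exit() when the hash table
+ * is torn down. */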
+static void
+oqi_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct osc_quota_info *oqi;
+
+       oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+       OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+}
+
+#define HASH_QUOTA_BKT_BITS 5
+#define HASH_QUOTA_CUR_BITS 5
+#define HASH_QUOTA_MAX_BITS 15
+
+static cfs_hash_ops_t quota_hash_ops = {
+       .hs_hash        = oqi_hashfn,
+       .hs_keycmp      = oqi_keycmp,
+       .hs_key         = oqi_key,
+       .hs_object      = oqi_object,
+       .hs_get         = oqi_get,
+       .hs_put_locked  = oqi_put_locked,
+       .hs_exit        = oqi_exit,
+};
+
+int osc_quota_setup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int i, type;
+       ENTRY;
+
+       for (type = 0; type < MAXQUOTAS; type++) {
+               cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
+                                                          HASH_QUOTA_CUR_BITS,
+                                                          HASH_QUOTA_MAX_BITS,
+                                                          HASH_QUOTA_BKT_BITS,
+                                                          0,
+                                                          CFS_HASH_MIN_THETA,
+                                                          CFS_HASH_MAX_THETA,
+                                                          &quota_hash_ops,
+                                                          CFS_HASH_DEFAULT);
+               if (cli->cl_quota_hash[type] == NULL)
+                       break;
+       }
+
+       if (type == MAXQUOTAS)
+               RETURN(0);
+
+       for (i = 0; i < type; i++)
+               cfs_hash_putref(cli->cl_quota_hash[i]);
+
+       RETURN(-ENOMEM);
+}
+
+int osc_quota_cleanup(struct obd_device *obd)
+{
+       struct client_obd     *cli = &obd->u.cli;
+       int type;
+       ENTRY;
+
+       for (type = 0; type < MAXQUOTAS; type++)
+               cfs_hash_putref(cli->cl_quota_hash[type]);
+
+       RETURN(0);
+}
+
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+                struct obd_quotactl *oqctl)
+{
+       struct ptlrpc_request *req;
+       struct obd_quotactl   *oqc;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
+                                       OST_QUOTACTL);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       *oqc = *oqctl;
+
+       ptlrpc_request_set_replen(req);
+       ptlrpc_at_set_req_timeout(req);
+       req->rq_no_resend = 1;
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+       if (req->rq_repmsg &&
+           (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) {
+               *oqctl = *oqc;
+       } else if (!rc) {
+               CERROR("Can't unpack obd_quotactl\n");
+               rc = -EPROTO;
+       }
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+                  struct obd_quotactl *oqctl)
+{
+       struct client_obd       *cli = &exp->exp_obd->u.cli;
+       struct ptlrpc_request   *req;
+       struct obd_quotactl     *body;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_OST_QUOTACHECK, LUSTRE_OST_VERSION,
+                                       OST_QUOTACHECK);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       *body = *oqctl;
+
+       ptlrpc_request_set_replen(req);
+
+       /* the next poll will find -ENODATA, which means a quotacheck is
+        * in progress */
+       cli->cl_qchk_stat = -ENODATA;
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               cli->cl_qchk_stat = rc;
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
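+/* Return the current quotacheck status recorded in cl_qchk_stat:
+ * osc_quotacheck() sets it to -ENODATA while the check is in progress (or to
+ * the error code if the request failed); CL_NOT_QUOTACHECKED is mapped to
+ * -EINTR because the check was not started by this client. */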
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk)
+{
+       struct client_obd *cli = &exp->exp_obd->u.cli;
+       int rc;
+       ENTRY;
+
+       qchk->obd_uuid = cli->cl_target_uuid;
+       memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME));
+
+       rc = cli->cl_qchk_stat;
+       /* the client is not the previous one */
+       if (rc == CL_NOT_QUOTACHECKED)
+               rc = -EINTR;
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c
new file mode 100644 (file)
index 0000000..3062e47
--- /dev/null
@@ -0,0 +1,3668 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include <linux/libcfs/libcfs.h>
+
+
+#include <lustre_dlm.h>
+#include <lustre_net.h>
+#include <lustre/lustre_user.h>
+#include <obd_cksum.h>
+#include <obd_ost.h>
+#include <obd_lov.h>
+
+#ifdef  __CYGWIN__
+# include <ctype.h>
+#endif
+
+#include <lustre_ha.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include <lustre_debug.h>
+#include <lustre_param.h>
+#include <lustre_fid.h>
+#include "osc_internal.h"
+#include "osc_cl_internal.h"
+
+static void osc_release_ppga(struct brw_page **ppga, obd_count count);
+static int brw_interpret(const struct lu_env *env,
+                        struct ptlrpc_request *req, void *data, int rc);
+int osc_cleanup(struct obd_device *obd);
+
+/* Pack OSC object metadata for disk storage (LE byte order). */
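+/* Calling convention: if @lmmp is NULL only the required buffer size is
+ * returned; if *lmmp is already allocated and @lsm is NULL the buffer is
+ * freed; otherwise the buffer is (re)used to pack @lsm. */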
+static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+                     struct lov_stripe_md *lsm)
+{
+       int lmm_size;
+       ENTRY;
+
+       lmm_size = sizeof(**lmmp);
+       if (lmmp == NULL)
+               RETURN(lmm_size);
+
+       if (*lmmp != NULL && lsm == NULL) {
+               OBD_FREE(*lmmp, lmm_size);
+               *lmmp = NULL;
+               RETURN(0);
+       } else if (unlikely(lsm != NULL && ostid_id(&lsm->lsm_oi) == 0)) {
+               RETURN(-EBADF);
+       }
+
+       if (*lmmp == NULL) {
+               OBD_ALLOC(*lmmp, lmm_size);
+               if (*lmmp == NULL)
+                       RETURN(-ENOMEM);
+       }
+
+       if (lsm)
+               ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi);
+
+       RETURN(lmm_size);
+}
+
+/* Unpack OSC object metadata from disk storage (LE byte order). */
+static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+                       struct lov_mds_md *lmm, int lmm_bytes)
+{
+       int lsm_size;
+       struct obd_import *imp = class_exp2cliimp(exp);
+       ENTRY;
+
+       if (lmm != NULL) {
+               if (lmm_bytes < sizeof(*lmm)) {
+                       CERROR("%s: lov_mds_md too small: %d, need %d\n",
+                              exp->exp_obd->obd_name, lmm_bytes,
+                              (int)sizeof(*lmm));
+                       RETURN(-EINVAL);
+               }
+               /* XXX LOV_MAGIC etc check? */
+
+               if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) {
+                       CERROR("%s: zero lmm_object_id: rc = %d\n",
+                              exp->exp_obd->obd_name, -EINVAL);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       lsm_size = lov_stripe_md_size(1);
+       if (lsmp == NULL)
+               RETURN(lsm_size);
+
+       if (*lsmp != NULL && lmm == NULL) {
+               OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+               OBD_FREE(*lsmp, lsm_size);
+               *lsmp = NULL;
+               RETURN(0);
+       }
+
+       if (*lsmp == NULL) {
+               OBD_ALLOC(*lsmp, lsm_size);
+               if (unlikely(*lsmp == NULL))
+                       RETURN(-ENOMEM);
+               OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+               if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) {
+                       OBD_FREE(*lsmp, lsm_size);
+                       RETURN(-ENOMEM);
+               }
+               loi_init((*lsmp)->lsm_oinfo[0]);
+       } else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) {
+               RETURN(-EBADF);
+       }
+
+       if (lmm != NULL)
+               /* XXX zero *lsmp? */
+               ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi);
+
+       if (imp != NULL &&
+           (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES))
+               (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+       else
+               (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+
+       RETURN(lsm_size);
+}
+
+static inline void osc_pack_capa(struct ptlrpc_request *req,
+                                struct ost_body *body, void *capa)
+{
+       struct obd_capa *oc = (struct obd_capa *)capa;
+       struct lustre_capa *c;
+
+       if (!capa)
+               return;
+
+       c = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1);
+       LASSERT(c);
+       capa_cpy(c, oc);
+       body->oa.o_valid |= OBD_MD_FLOSSCAPA;
+       DEBUG_CAPA(D_SEC, c, "pack");
+}
+
+static inline void osc_pack_req_body(struct ptlrpc_request *req,
+                                    struct obd_info *oinfo)
+{
+       struct ost_body *body;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+
+       lustre_set_wire_obdo(&body->oa, oinfo->oi_oa);
+       osc_pack_capa(req, body, oinfo->oi_capa);
+}
+
+static inline void osc_set_capa_size(struct ptlrpc_request *req,
+                                    const struct req_msg_field *field,
+                                    struct obd_capa *oc)
+{
+       if (oc == NULL)
+               req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+       else
+               /* it is already calculated as sizeof struct obd_capa */
+               ;
+}
+
+static int osc_getattr_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req,
+                                struct osc_async_args *aa, int rc)
+{
+       struct ost_body *body;
+       ENTRY;
+
+       if (rc != 0)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body) {
+               CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+               lustre_get_wire_obdo(aa->aa_oi->oi_oa, &body->oa);
+
+               /* This should really be sent by the OST */
+               aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE;
+               aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+       } else {
+               CDEBUG(D_INFO, "can't unpack ost_body\n");
+               rc = -EPROTO;
+               aa->aa_oi->oi_oa->o_valid = 0;
+       }
+out:
+       rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+       RETURN(rc);
+}
+
+static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+                            struct ptlrpc_request_set *set)
+{
+       struct ptlrpc_request *req;
+       struct osc_async_args *aa;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       osc_pack_req_body(req, oinfo);
+
+       ptlrpc_request_set_replen(req);
+       req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_getattr_interpret;
+
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       aa->aa_oi = oinfo;
+
+       ptlrpc_set_add_req(set, req);
+       RETURN(0);
+}
+
+static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
+                      struct obd_info *oinfo)
+{
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       osc_pack_req_body(req, oinfo);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+       lustre_get_wire_obdo(oinfo->oi_oa, &body->oa);
+
+       oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd);
+       oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+
+       EXIT;
+ out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
+                      struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       osc_pack_req_body(req, oinfo);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       lustre_get_wire_obdo(oinfo->oi_oa, &body->oa);
+
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int osc_setattr_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req,
+                                struct osc_setattr_args *sa, int rc)
+{
+       struct ost_body *body;
+       ENTRY;
+
+       if (rc != 0)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       lustre_get_wire_obdo(sa->sa_oa, &body->oa);
+out:
+       rc = sa->sa_upcall(sa->sa_cookie, rc);
+       RETURN(rc);
+}
+
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+                          struct obd_trans_info *oti,
+                          obd_enqueue_update_f upcall, void *cookie,
+                          struct ptlrpc_request_set *rqset)
+{
+       struct ptlrpc_request   *req;
+       struct osc_setattr_args *sa;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       if (oti && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+               oinfo->oi_oa->o_lcookie = *oti->oti_logcookies;
+
+       osc_pack_req_body(req, oinfo);
+
+       ptlrpc_request_set_replen(req);
+
+       /* do the MDS-to-OST setattr asynchronously */
+       if (!rqset) {
+               /* Do not wait for response. */
+               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       } else {
+               req->rq_interpret_reply =
+                       (ptlrpc_interpterer_t)osc_setattr_interpret;
+
+               CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args));
+               sa = ptlrpc_req_async_args(req);
+               sa->sa_oa = oinfo->oi_oa;
+               sa->sa_upcall = upcall;
+               sa->sa_cookie = cookie;
+
+               if (rqset == PTLRPCD_SET)
+                       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+               else
+                       ptlrpc_set_add_req(rqset, req);
+       }
+
+       RETURN(0);
+}
+
+static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+                            struct obd_trans_info *oti,
+                            struct ptlrpc_request_set *rqset)
+{
+       return osc_setattr_async_base(exp, oinfo, oti,
+                                     oinfo->oi_cb_up, oinfo, rqset);
+}
+
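+/* Create an object on the OST with a synchronous OST_CREATE RPC.  A striping
+ * descriptor is allocated if the caller did not supply one, and the returned
+ * object id (plus the transno and llog cookie, when requested via @oti) is
+ * copied back to the caller. */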
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+                   struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       struct lov_stripe_md  *lsm;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(oa);
+       LASSERT(ea);
+
+       lsm = *ea;
+       if (!lsm) {
+               rc = obd_alloc_memmd(exp, &lsm);
+               if (rc < 0)
+                       RETURN(rc);
+       }
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
+       if (rc) {
+               ptlrpc_request_free(req);
+               GOTO(out, rc);
+       }
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       lustre_set_wire_obdo(&body->oa, oa);
+
+       ptlrpc_request_set_replen(req);
+
+       if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+           oa->o_flags == OBD_FL_DELORPHAN) {
+               DEBUG_REQ(D_HA, req,
+                         "delorphan from OST integration");
+               /* Don't resend the delorphan req */
+               req->rq_no_resend = req->rq_no_delay = 1;
+       }
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out_req, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL)
+               GOTO(out_req, rc = -EPROTO);
+
+       lustre_get_wire_obdo(oa, &body->oa);
+
+       oa->o_blksize = cli_brw_size(exp->exp_obd);
+       oa->o_valid |= OBD_MD_FLBLKSZ;
+
+       /* XXX LOV STACKING: the lsm that is passed to us from LOV does not
+        * have valid lsm_oinfo data structs, so don't go touching that.
+        * This needs to be fixed in a big way.
+        */
+       lsm->lsm_oi = oa->o_oi;
+       *ea = lsm;
+
+       if (oti != NULL) {
+               oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+
+               if (oa->o_valid & OBD_MD_FLCOOKIE) {
+                       if (!oti->oti_logcookies)
+                               oti_alloc_cookies(oti, 1);
+                       *oti->oti_logcookies = oa->o_lcookie;
+               }
+       }
+
+       CDEBUG(D_HA, "transno: "LPD64"\n",
+              lustre_msg_get_transno(req->rq_repmsg));
+out_req:
+       ptlrpc_req_finished(req);
+out:
+       if (rc && !*ea)
+               obd_free_memmd(exp, &lsm);
+       RETURN(rc);
+}
+
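+/* Send an asynchronous OST_PUNCH request.  The extent to punch is carried in
+ * the o_size/o_blocks fields of the caller's obdo (see osc_punch()), and the
+ * reply is handled by osc_setattr_interpret(), which invokes the upcall. */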
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+                  obd_enqueue_update_f upcall, void *cookie,
+                  struct ptlrpc_request_set *rqset)
+{
+       struct ptlrpc_request   *req;
+       struct osc_setattr_args *sa;
+       struct ost_body  *body;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+       ptlrpc_at_set_req_timeout(req);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       lustre_set_wire_obdo(&body->oa, oinfo->oi_oa);
+       osc_pack_capa(req, body, oinfo->oi_capa);
+
+       ptlrpc_request_set_replen(req);
+
+       req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
+       CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args));
+       sa = ptlrpc_req_async_args(req);
+       sa->sa_oa     = oinfo->oi_oa;
+       sa->sa_upcall = upcall;
+       sa->sa_cookie = cookie;
+       if (rqset == PTLRPCD_SET)
+               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       else
+               ptlrpc_set_add_req(rqset, req);
+
+       RETURN(0);
+}
+
+static int osc_punch(const struct lu_env *env, struct obd_export *exp,
+                    struct obd_info *oinfo, struct obd_trans_info *oti,
+                    struct ptlrpc_request_set *rqset)
+{
+       oinfo->oi_oa->o_size   = oinfo->oi_policy.l_extent.start;
+       oinfo->oi_oa->o_blocks = oinfo->oi_policy.l_extent.end;
+       oinfo->oi_oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+       return osc_punch_base(exp, oinfo,
+                             oinfo->oi_cb_up, oinfo, rqset);
+}
+
+static int osc_sync_interpret(const struct lu_env *env,
+                             struct ptlrpc_request *req,
+                             void *arg, int rc)
+{
+       struct osc_fsync_args *fa = arg;
+       struct ost_body *body;
+       ENTRY;
+
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL) {
+               CERROR("can't unpack ost_body\n");
+               GOTO(out, rc = -EPROTO);
+       }
+
+       *fa->fa_oi->oi_oa = body->oa;
+out:
+       rc = fa->fa_upcall(fa->fa_cookie, rc);
+       RETURN(rc);
+}
+
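+/* Send an asynchronous OST_SYNC request.  The range to flush is carried in
+ * the o_size/o_blocks fields of the obdo (see osc_sync()), and
+ * osc_sync_interpret() copies the reply obdo back before invoking the
+ * caller's upcall. */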
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+                 obd_enqueue_update_f upcall, void *cookie,
+                 struct ptlrpc_request_set *rqset)
+{
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       struct osc_fsync_args *fa;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       /* overload the size and blocks fields in the oa with start/end */
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       lustre_set_wire_obdo(&body->oa, oinfo->oi_oa);
+       osc_pack_capa(req, body, oinfo->oi_capa);
+
+       ptlrpc_request_set_replen(req);
+       req->rq_interpret_reply = osc_sync_interpret;
+
+       CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args));
+       fa = ptlrpc_req_async_args(req);
+       fa->fa_oi = oinfo;
+       fa->fa_upcall = upcall;
+       fa->fa_cookie = cookie;
+
+       if (rqset == PTLRPCD_SET)
+               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       else
+               ptlrpc_set_add_req(rqset, req);
+
+       RETURN(0);
+}
+
+static int osc_sync(const struct lu_env *env, struct obd_export *exp,
+                   struct obd_info *oinfo, obd_size start, obd_size end,
+                   struct ptlrpc_request_set *set)
+{
+       ENTRY;
+
+       if (!oinfo->oi_oa) {
+               CDEBUG(D_INFO, "oa NULL\n");
+               RETURN(-EINVAL);
+       }
+
+       oinfo->oi_oa->o_size = start;
+       oinfo->oi_oa->o_blocks = end;
+       oinfo->oi_oa->o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+
+       RETURN(osc_sync_base(exp, oinfo, oinfo->oi_cb_up, oinfo, set));
+}
+
+/* Find and locally cancel locks matching @mode in the resource derived from
+ * @oa. Found locks are added to the @cancels list. Returns the number of
+ * locks added to @cancels. */
+static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
+                                  struct list_head *cancels,
+                                  ldlm_mode_t mode, int lock_flags)
+{
+       struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+       struct ldlm_res_id res_id;
+       struct ldlm_resource *res;
+       int count;
+       ENTRY;
+
+       /* Return early, i.e. cancel nothing, only if ELC is supported (flag
+        * in the export) but has been disabled through procfs (flag in the
+        * namespace).
+        *
+        * This is different from the case where ELC is not supported at all,
+        * in which we still want to cancel locks in advance and simply cancel
+        * them locally, without sending any RPC. */
+       if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+               RETURN(0);
+
+       ostid_build_res_name(&oa->o_oi, &res_id);
+       res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+       if (res == NULL)
+               RETURN(0);
+
+       LDLM_RESOURCE_ADDREF(res);
+       count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
+                                          lock_flags, 0, NULL);
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+       RETURN(count);
+}
+
+static int osc_destroy_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req, void *data,
+                                int rc)
+{
+       struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+
+       atomic_dec(&cli->cl_destroy_in_flight);
+       wake_up(&cli->cl_destroy_waitq);
+       return 0;
+}
+
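+/* Flow control for OST_DESTROY RPCs: atomically try to reserve one of the
+ * cl_max_rpcs_in_flight destroy slots.  If the limit would be exceeded, give
+ * the slot back and wake up any waiter that may have raced with the
+ * decrement; osc_destroy() then sleeps until a slot becomes available. */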
+static int osc_can_send_destroy(struct client_obd *cli)
+{
+       if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
+           cli->cl_max_rpcs_in_flight) {
+               /* The destroy request can be sent */
+               return 1;
+       }
+       if (atomic_dec_return(&cli->cl_destroy_in_flight) <
+           cli->cl_max_rpcs_in_flight) {
+               /*
+                * The counter has been modified between the two atomic
+                * operations.
+                */
+               wake_up(&cli->cl_destroy_waitq);
+       }
+       return 0;
+}
+
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+              struct obdo *oa, struct lov_stripe_md **ea,
+              struct obd_trans_info *oti)
+{
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oa);
+       LASSERT(ea);
+       LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+       if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+           oa->o_flags == OBD_FL_RECREATE_OBJS) {
+               RETURN(osc_real_create(exp, oa, ea, oti));
+       }
+
+       if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi)))
+               RETURN(osc_real_create(exp, oa, ea, oti));
+
+       /* we should not get here anymore */
+       LBUG();
+
+       RETURN(rc);
+}
+
+/* Destroy requests can be async always on the client, and we don't even really
+ * care about the return code since the client cannot do anything at all about
+ * a destroy failure.
+ * When the MDS is unlinking a filename, it saves the file objects into a
+ * recovery llog, and these object records are cancelled when the OST reports
+ * they were destroyed and sync'd to disk (i.e. transaction committed).
+ * If the client dies, or the OST is down when the object should be destroyed,
+ * the records are not cancelled, and when the OST next reconnects to the MDS,
+ * it will retrieve the llog unlink logs and then send the log cancellation
+ * cookies to the MDS after committing the destroy transactions. */
+static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
+                      struct obdo *oa, struct lov_stripe_md *ea,
+                      struct obd_trans_info *oti, struct obd_export *md_export,
+                      void *capa)
+{
+       struct client_obd     *cli = &exp->exp_obd->u.cli;
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       LIST_HEAD(cancels);
+       int rc, count;
+       ENTRY;
+
+       if (!oa) {
+               CDEBUG(D_INFO, "oa NULL\n");
+               RETURN(-EINVAL);
+       }
+
+       count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
+                                       LDLM_FL_DISCARD_DATA);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+
+       osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa);
+       rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
+                              0, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+       ptlrpc_at_set_req_timeout(req);
+
+       if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE)
+               oa->o_lcookie = *oti->oti_logcookies;
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       lustre_set_wire_obdo(&body->oa, oa);
+
+       osc_pack_capa(req, body, (struct obd_capa *)capa);
+       ptlrpc_request_set_replen(req);
+
+       /* If osc_destroy() is destroying an unlink orphan on behalf of the
+        * MDT, it must not block here, because the request may have been
+        * triggered by ptlrpcd, and it is not good to block a ptlrpcd
+        * thread (b=16006). */
+       if (!(oa->o_flags & OBD_FL_DELORPHAN)) {
+               req->rq_interpret_reply = osc_destroy_interpret;
+               if (!osc_can_send_destroy(cli)) {
+                       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP,
+                                                         NULL);
+
+                       /*
+                        * Wait until the number of on-going destroy RPCs drops
+                        * under max_rpc_in_flight
+                        */
+                       l_wait_event_exclusive(cli->cl_destroy_waitq,
+                                              osc_can_send_destroy(cli), &lwi);
+               }
+       }
+
+       /* Do not wait for response */
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       RETURN(0);
+}
+
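+/* Fill in the cache accounting fields (o_dirty, o_undirty, o_grant and
+ * o_dropped) that are piggy-backed on outgoing requests, so the server can
+ * see how much dirty data and unused grant this client currently holds.  The
+ * CERROR()s are sanity checks on the accounting, with a small fudge factor
+ * for the lockless global counters. */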
+static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
+                               long writing_bytes)
+{
+       obd_flag bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT;
+
+       LASSERT(!(oa->o_valid & bits));
+
+       oa->o_valid |= bits;
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       oa->o_dirty = cli->cl_dirty;
+       if (unlikely(cli->cl_dirty - cli->cl_dirty_transit >
+                    cli->cl_dirty_max)) {
+               CERROR("dirty %lu - %lu > dirty_max %lu\n",
+                      cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max);
+               oa->o_undirty = 0;
+       } else if (unlikely(atomic_read(&obd_unstable_pages) +
+                           atomic_read(&obd_dirty_pages) -
+                           atomic_read(&obd_dirty_transit_pages) >
+                           (long)(obd_max_dirty_pages + 1))) {
+               /* The atomic_read()s and the atomic_inc()s are not covered
+                * by a lock, so they may harmlessly race and trip this
+                * CERROR() unless we add in a small fudge factor (+1). */
+               CERROR("%s: dirty %d + %d - %d > system dirty_max %d\n",
+                      cli->cl_import->imp_obd->obd_name,
+                      atomic_read(&obd_unstable_pages),
+                      atomic_read(&obd_dirty_pages),
+                      atomic_read(&obd_dirty_transit_pages),
+                      obd_max_dirty_pages);
+               oa->o_undirty = 0;
+       } else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) {
+               CERROR("dirty %lu - dirty_max %lu too big???\n",
+                      cli->cl_dirty, cli->cl_dirty_max);
+               oa->o_undirty = 0;
+       } else {
+               long max_in_flight = (cli->cl_max_pages_per_rpc <<
+                                     PAGE_CACHE_SHIFT)*
+                                    (cli->cl_max_rpcs_in_flight + 1);
+               oa->o_undirty = max(cli->cl_dirty_max, max_in_flight);
+       }
+       oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
+       oa->o_dropped = cli->cl_lost_grant;
+       cli->cl_lost_grant = 0;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       CDEBUG(D_CACHE, "dirty: "LPU64" undirty: %u dropped %u grant: "LPU64"\n",
+              oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant);
+
+}
+
+void osc_update_next_shrink(struct client_obd *cli)
+{
+       cli->cl_next_shrink_grant =
+               cfs_time_shift(cli->cl_grant_shrink_interval);
+       CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
+              cli->cl_next_shrink_grant);
+}
+
+static void __osc_update_grant(struct client_obd *cli, obd_size grant)
+{
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_avail_grant += grant;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
+{
+       if (body->oa.o_valid & OBD_MD_FLGRANT) {
+               CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant);
+               __osc_update_grant(cli, body->oa.o_grant);
+       }
+}
+
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                             obd_count keylen, void *key, obd_count vallen,
+                             void *val, struct ptlrpc_request_set *set);
+
+static int osc_shrink_grant_interpret(const struct lu_env *env,
+                                     struct ptlrpc_request *req,
+                                     void *aa, int rc)
+{
+       struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+       struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa;
+       struct ost_body *body;
+
+       if (rc != 0) {
+               __osc_update_grant(cli, oa->o_grant);
+               GOTO(out, rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       osc_update_grant(cli, body);
+out:
+       OBDO_FREE(oa);
+       return rc;
+}
+
+static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
+{
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       oa->o_grant = cli->cl_avail_grant / 4;
+       cli->cl_avail_grant -= oa->o_grant;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
+               oa->o_valid |= OBD_MD_FLFLAGS;
+               oa->o_flags = 0;
+       }
+       oa->o_flags |= OBD_FL_SHRINK_GRANT;
+       osc_update_next_shrink(cli);
+}
+
+/* Shrink the current grant, either from some large amount to enough for a
+ * full set of in-flight RPCs, or if we have already shrunk to that limit
+ * then to enough for a single RPC.  This avoids keeping more grant than
+ * needed, and avoids shrinking the grant piecemeal. */
+static int osc_shrink_grant(struct client_obd *cli)
+{
+       __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
+                            (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (cli->cl_avail_grant <= target_bytes)
+               target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return osc_shrink_grant_to_target(cli, target_bytes);
+}
+
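+/* Give grant back to the server: reduce cl_avail_grant to @target_bytes
+ * (never below a single RPC worth of pages), report the released amount
+ * through a KEY_GRANT_SHRINK set_info RPC, and restore the grant locally if
+ * that RPC cannot be sent. */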
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
+{
+       int                     rc = 0;
+       struct ost_body *body;
+       ENTRY;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       /* Don't shrink if we already hold no more grant than the target.
+        * We also don't want to shrink below a single RPC, as that would
+        * negatively impact block allocation and long-term performance. */
+       if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT)
+               target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+       if (target_bytes >= cli->cl_avail_grant) {
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               RETURN(0);
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       OBD_ALLOC_PTR(body);
+       if (!body)
+               RETURN(-ENOMEM);
+
+       osc_announce_cached(cli, &body->oa, 0);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       body->oa.o_grant = cli->cl_avail_grant - target_bytes;
+       cli->cl_avail_grant = target_bytes;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
+               body->oa.o_valid |= OBD_MD_FLFLAGS;
+               body->oa.o_flags = 0;
+       }
+       body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
+       osc_update_next_shrink(cli);
+
+       rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
+                               sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
+                               sizeof(*body), body, NULL);
+       if (rc != 0)
+               __osc_update_grant(cli, body->oa.o_grant);
+       OBD_FREE_PTR(body);
+       RETURN(rc);
+}
+
+static int osc_should_shrink_grant(struct client_obd *client)
+{
+       cfs_time_t time = cfs_time_current();
+       cfs_time_t next_shrink = client->cl_next_shrink_grant;
+
+       if ((client->cl_import->imp_connect_data.ocd_connect_flags &
+            OBD_CONNECT_GRANT_SHRINK) == 0)
+               return 0;
+
+       if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) {
+               /* Get the current RPC size directly, instead of going via:
+                * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
+                * Keep comment here so that it can be found by searching. */
+               int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+               if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
+                   client->cl_avail_grant > brw_size)
+                       return 1;
+               else
+                       osc_update_next_shrink(client);
+       }
+       return 0;
+}
+
+static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data)
+{
+       struct client_obd *client;
+
+       list_for_each_entry(client, &item->ti_obd_list,
+                               cl_grant_shrink_list) {
+               if (osc_should_shrink_grant(client))
+                       osc_shrink_grant(client);
+       }
+       return 0;
+}
+
+static int osc_add_shrink_grant(struct client_obd *client)
+{
+       int rc;
+
+       rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval,
+                                      TIMEOUT_GRANT,
+                                      osc_grant_shrink_grant_cb, NULL,
+                                      &client->cl_grant_shrink_list);
+       if (rc) {
+               CERROR("add grant client %s error %d\n",
+                       client->cl_import->imp_obd->obd_name, rc);
+               return rc;
+       }
+       CDEBUG(D_CACHE, "add grant client %s \n",
+              client->cl_import->imp_obd->obd_name);
+       osc_update_next_shrink(client);
+       return 0;
+}
+
+static int osc_del_shrink_grant(struct client_obd *client)
+{
+       return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list,
+                                        TIMEOUT_GRANT);
+}
+
+static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+{
+       /*
+        * ocd_grant is the total grant amount we expect to hold: if we have
+        * been evicted, it is the new avail_grant amount, and cl_dirty will
+        * drop to 0 as in-flight RPCs fail out; otherwise it is avail_grant
+        * + dirty.
+        *
+        * A race is tolerable here: if we are evicted but imp_state has
+        * already left the EVICTED state, then cl_dirty must already be 0.
+        */
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED)
+               cli->cl_avail_grant = ocd->ocd_grant;
+       else
+               cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty;
+
+       if (cli->cl_avail_grant < 0) {
+               CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n",
+                     cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant,
+                     ocd->ocd_grant, cli->cl_dirty);
+               /* workaround for servers which do not have the patch from
+                * LU-2679 */
+               cli->cl_avail_grant = ocd->ocd_grant;
+       }
+
+       /* determine the appropriate chunk size used by osc_extent. */
+       cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld. "
+               "chunk bits: %d.\n", cli->cl_import->imp_obd->obd_name,
+               cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits);
+
+       if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK &&
+           list_empty(&cli->cl_grant_shrink_list))
+               osc_add_shrink_grant(cli);
+}
+
+/* We assume that the reason this OSC got a short read is that it read
+ * beyond the end of a stripe file; i.e. lustre is reading a sparse file
+ * via the LOV, and it _knows_ it is reading inside the file; it is just
+ * that this stripe has never been written at or beyond this offset yet. */
+static void handle_short_read(int nob_read, obd_count page_count,
+                             struct brw_page **pga)
+{
+       char *ptr;
+       int i = 0;
+
+       /* skip bytes read OK */
+       while (nob_read > 0) {
+               LASSERT(page_count > 0);
+
+               if (pga[i]->count > nob_read) {
+                       /* EOF inside this page */
+                       ptr = kmap(pga[i]->pg) +
+                               (pga[i]->off & ~CFS_PAGE_MASK);
+                       memset(ptr + nob_read, 0, pga[i]->count - nob_read);
+                       kunmap(pga[i]->pg);
+                       page_count--;
+                       i++;
+                       break;
+               }
+
+               nob_read -= pga[i]->count;
+               page_count--;
+               i++;
+       }
+
+       /* zero remaining pages */
+       while (page_count-- > 0) {
+               ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK);
+               memset(ptr, 0, pga[i]->count);
+               kunmap(pga[i]->pg);
+               i++;
+       }
+}
+
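+/* Verify a BRW_WRITE reply: every per-niobuf return code must be zero and
+ * the bulk must have transferred exactly the number of bytes requested;
+ * anything else is treated as a protocol error. */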
+static int check_write_rcs(struct ptlrpc_request *req,
+                          int requested_nob, int niocount,
+                          obd_count page_count, struct brw_page **pga)
+{
+       int     i;
+       __u32   *remote_rcs;
+
+       remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
+                                                 sizeof(*remote_rcs) *
+                                                 niocount);
+       if (remote_rcs == NULL) {
+               CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
+               return -EPROTO;
+       }
+
+       /* return error if any niobuf was in error */
+       for (i = 0; i < niocount; i++) {
+               if ((int)remote_rcs[i] < 0)
+                       return remote_rcs[i];
+
+               if (remote_rcs[i] != 0) {
+                       CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
+                               i, remote_rcs[i], req);
+                       return -EPROTO;
+               }
+       }
+
+       if (req->rq_bulk->bd_nob_transferred != requested_nob) {
+               CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
+                      req->rq_bulk->bd_nob_transferred, requested_nob);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+
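+/* Two brw_pages may share a single remote niobuf only if their flags are
+ * identical and they are contiguous in the file; differing flags outside the
+ * known-safe set are additionally reported with a warning. */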
+static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
+{
+       if (p1->flag != p2->flag) {
+               unsigned mask = ~(OBD_BRW_FROM_GRANT| OBD_BRW_NOCACHE|
+                                 OBD_BRW_SYNC|OBD_BRW_ASYNC|OBD_BRW_NOQUOTA);
+
+               /* warn if we try to combine flags that we don't know to be
+                * safe to combine */
+               if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
+                       CWARN("Saw flags 0x%x and 0x%x in the same brw, please "
+                             "report this at http://bugs.whamcloud.com/\n",
+                             p1->flag, p2->flag);
+               }
+               return 0;
+       }
+
+       return (p1->off + p1->count == p2->off);
+}
+
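+/* Compute the bulk checksum over the first @nob bytes of the page array,
+ * using the algorithm selected by @cksum_type.  The OBD_FAIL_* hooks provide
+ * fault injection: corrupting a read buffer or returning a deliberately
+ * wrong checksum for a write. */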
+static obd_count osc_checksum_bulk(int nob, obd_count pg_count,
+                                  struct brw_page **pga, int opc,
+                                  cksum_type_t cksum_type)
+{
+       __u32                           cksum;
+       int                             i = 0;
+       struct cfs_crypto_hash_desc     *hdesc;
+       unsigned int                    bufsize;
+       int                             err;
+       unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);
+
+       LASSERT(pg_count > 0);
+
+       hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
+       if (IS_ERR(hdesc)) {
+               CERROR("Unable to initialize checksum hash %s\n",
+                      cfs_crypto_hash_name(cfs_alg));
+               return PTR_ERR(hdesc);
+       }
+
+       while (nob > 0 && pg_count > 0) {
+               int count = pga[i]->count > nob ? nob : pga[i]->count;
+
+               /* corrupt the data before we compute the checksum, to
+                * simulate an OST->client data error */
+               if (i == 0 && opc == OST_READ &&
+                   OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
+                       unsigned char *ptr = kmap(pga[i]->pg);
+                       int off = pga[i]->off & ~CFS_PAGE_MASK;
+                       memcpy(ptr + off, "bad1", min(4, nob));
+                       kunmap(pga[i]->pg);
+               }
+               cfs_crypto_hash_update_page(hdesc, pga[i]->pg,
+                                 pga[i]->off & ~CFS_PAGE_MASK,
+                                 count);
+               LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n",
+                              (int)(pga[i]->off & ~CFS_PAGE_MASK));
+
+               nob -= pga[i]->count;
+               pg_count--;
+               i++;
+       }
+
+       bufsize = 4;
+       err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize);
+
+       if (err)
+               cfs_crypto_hash_final(hdesc, NULL, NULL);
+
+       /* For sends we only compute a wrong checksum instead of corrupting
+        * the data, so the data is still correct on a redo. */
+       if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
+               cksum++;
+
+       return cksum;
+}
+
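+/* Build a BRW RPC and its bulk descriptor from a sorted page array:
+ * contiguous pages are merged into a single niobuf, grant/cache accounting
+ * is attached via osc_announce_cached(), and a bulk checksum is added when
+ * checksums are enabled for this client. */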
+static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
+                               struct lov_stripe_md *lsm, obd_count page_count,
+                               struct brw_page **pga,
+                               struct ptlrpc_request **reqp,
+                               struct obd_capa *ocapa, int reserve,
+                               int resend)
+{
+       struct ptlrpc_request   *req;
+       struct ptlrpc_bulk_desc *desc;
+       struct ost_body  *body;
+       struct obd_ioobj        *ioobj;
+       struct niobuf_remote    *niobuf;
+       int niocount, i, requested_nob, opc, rc;
+       struct osc_brw_async_args *aa;
+       struct req_capsule      *pill;
+       struct brw_page *pg_prev;
+
+       ENTRY;
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
+               RETURN(-ENOMEM); /* Recoverable */
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
+               RETURN(-EINVAL); /* Fatal */
+
+       if ((cmd & OBD_BRW_WRITE) != 0) {
+               opc = OST_WRITE;
+               req = ptlrpc_request_alloc_pool(cli->cl_import,
+                                               cli->cl_import->imp_rq_pool,
+                                               &RQF_OST_BRW_WRITE);
+       } else {
+               opc = OST_READ;
+               req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
+       }
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       for (niocount = i = 1; i < page_count; i++) {
+               if (!can_merge_pages(pga[i - 1], pga[i]))
+                       niocount++;
+       }
+
+       pill = &req->rq_pill;
+       req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
+                            sizeof(*ioobj));
+       req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
+                            niocount * sizeof(*niobuf));
+       osc_set_capa_size(req, &RMF_CAPA1, ocapa);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+       ptlrpc_at_set_req_timeout(req);
+       /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
+        * retry logic */
+       req->rq_no_retry_einprogress = 1;
+
+       desc = ptlrpc_prep_bulk_imp(req, page_count,
+               cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
+               opc == OST_WRITE ? BULK_GET_SOURCE : BULK_PUT_SINK,
+               OST_BULK_PORTAL);
+
+       if (desc == NULL)
+               GOTO(out, rc = -ENOMEM);
+       /* NB request now owns desc and will free it when it gets freed */
+
+       body = req_capsule_client_get(pill, &RMF_OST_BODY);
+       ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
+       niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
+       LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
+
+       lustre_set_wire_obdo(&body->oa, oa);
+
+       obdo_to_ioobj(oa, ioobj);
+       ioobj->ioo_bufcnt = niocount;
+       /* The high bits of ioo_max_brw tell the server the _maximum_ number
+        * of bulks that might be sent for this request.  The actual number is
+        * decided when the RPC is finally sent in ptlrpc_register_bulk().  It
+        * sends "max - 1" for compatibility with old clients sending "0", and
+        * also so that the actual maximum is a power-of-two number, not one
+        * less. LU-1431 */
+       ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+       osc_pack_capa(req, body, ocapa);
+       LASSERT(page_count > 0);
+       pg_prev = pga[0];
+       for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
+               struct brw_page *pg = pga[i];
+               int poff = pg->off & ~CFS_PAGE_MASK;
+
+               LASSERT(pg->count > 0);
+               /* make sure there is no gap in the middle of the page array */
+               LASSERTF(page_count == 1 ||
+                        (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) &&
+                         ergo(i > 0 && i < page_count - 1,
+                              poff == 0 && pg->count == PAGE_CACHE_SIZE)   &&
+                         ergo(i == page_count - 1, poff == 0)),
+                        "i: %d/%d pg: %p off: "LPU64", count: %u\n",
+                        i, page_count, pg, pg->off, pg->count);
+               LASSERTF(i == 0 || pg->off > pg_prev->off,
+                        "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64
+                        " prev_pg %p [pri %lu ind %lu] off "LPU64"\n",
+                        i, page_count,
+                        pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
+                        pg_prev->pg, page_private(pg_prev->pg),
+                        pg_prev->pg->index, pg_prev->off);
+               LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
+                       (pg->flag & OBD_BRW_SRVLOCK));
+
+               ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count);
+               requested_nob += pg->count;
+
+               if (i > 0 && can_merge_pages(pg_prev, pg)) {
+                       niobuf--;
+                       niobuf->len += pg->count;
+               } else {
+                       niobuf->offset = pg->off;
+                       niobuf->len    = pg->count;
+                       niobuf->flags  = pg->flag;
+               }
+               pg_prev = pg;
+       }
+
+       LASSERTF((void *)(niobuf - niocount) ==
+               req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
+               "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
+               &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
+
+       osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
+       if (resend) {
+               if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+                       body->oa.o_valid |= OBD_MD_FLFLAGS;
+                       body->oa.o_flags = 0;
+               }
+               body->oa.o_flags |= OBD_FL_RECOV_RESEND;
+       }
+
+       if (osc_should_shrink_grant(cli))
+               osc_shrink_grant_local(cli, &body->oa);
+
+       /* size[REQ_REC_OFF] is still sizeof(*body) */
+       if (opc == OST_WRITE) {
+               if (cli->cl_checksum &&
+                   !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+                       /* store cl_cksum_type in a local variable since
+                        * it can be changed via lprocfs */
+                       cksum_type_t cksum_type = cli->cl_cksum_type;
+
+                       if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+                               oa->o_flags &= OBD_FL_LOCAL_MASK;
+                               body->oa.o_flags = 0;
+                       }
+                       body->oa.o_flags |= cksum_type_pack(cksum_type);
+                       body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+                       body->oa.o_cksum = osc_checksum_bulk(requested_nob,
+                                                            page_count, pga,
+                                                            OST_WRITE,
+                                                            cksum_type);
+                       CDEBUG(D_PAGE, "checksum at write origin: %x\n",
+                              body->oa.o_cksum);
+                       /* save this in 'oa', too, for later checking */
+                       oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+                       oa->o_flags |= cksum_type_pack(cksum_type);
+               } else {
+                       /* clear out the checksum flag, in case this is a
+                        * resend but cl_checksum is no longer set. b=11238 */
+                       oa->o_valid &= ~OBD_MD_FLCKSUM;
+               }
+               oa->o_cksum = body->oa.o_cksum;
+               /* 1 RC per niobuf */
+               req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
+                                    sizeof(__u32) * niocount);
+       } else {
+               if (cli->cl_checksum &&
+                   !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+                       if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
+                               body->oa.o_flags = 0;
+                       body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type);
+                       body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+               }
+       }
+       ptlrpc_request_set_replen(req);
+
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       aa->aa_oa = oa;
+       aa->aa_requested_nob = requested_nob;
+       aa->aa_nio_count = niocount;
+       aa->aa_page_count = page_count;
+       aa->aa_resends = 0;
+       aa->aa_ppga = pga;
+       aa->aa_cli = cli;
+       INIT_LIST_HEAD(&aa->aa_oaps);
+       if (ocapa && reserve)
+               aa->aa_ocapa = capa_get(ocapa);
+
+       *reqp = req;
+       RETURN(0);
+
+ out:
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
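+/* The server reported a write checksum mismatch.  Recompute the checksum
+ * locally to guess where the data changed (on the client, in transit, or an
+ * mmap-induced false positive), log a console error, and return 1 so the
+ * caller can turn this into a resend. */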
+static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer,
+                               __u32 client_cksum, __u32 server_cksum, int nob,
+                               obd_count page_count, struct brw_page **pga,
+                               cksum_type_t client_cksum_type)
+{
+       __u32 new_cksum;
+       char *msg;
+       cksum_type_t cksum_type;
+
+       if (server_cksum == client_cksum) {
+               CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+               return 0;
+       }
+
+       cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
+                                      oa->o_flags : 0);
+       new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE,
+                                     cksum_type);
+
+       if (cksum_type != client_cksum_type)
+               msg = "the server did not use the checksum type specified in "
+                     "the original request - likely a protocol problem";
+       else if (new_cksum == server_cksum)
+               msg = "changed on the client after we checksummed it - "
+                     "likely false positive due to mmap IO (bug 11742)";
+       else if (new_cksum == client_cksum)
+               msg = "changed in transit before arrival at OST";
+       else
+               msg = "changed in transit AND doesn't match the original - "
+                     "likely false positive due to mmap IO (bug 11742)";
+
+       LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID
+                          " object "DOSTID" extent ["LPU64"-"LPU64"]\n",
+                          msg, libcfs_nid2str(peer->nid),
+                          oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
+                          oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
+                          oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
+                          POSTID(&oa->o_oi), pga[0]->off,
+                          pga[page_count-1]->off + pga[page_count-1]->count - 1);
+       CERROR("original client csum %x (type %x), server csum %x (type %x), "
+              "client csum now %x\n", client_cksum, client_cksum_type,
+              server_cksum, cksum_type, new_cksum);
+       return 1;
+}
+
+/* Note rc enters this function as number of bytes transferred */
+static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
+{
+       struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
+       const lnet_process_id_t *peer =
+                       &req->rq_import->imp_connection->c_peer;
+       struct client_obd *cli = aa->aa_cli;
+       struct ost_body *body;
+       __u32 client_cksum = 0;
+       ENTRY;
+
+       if (rc < 0 && rc != -EDQUOT) {
+               DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc);
+               RETURN(rc);
+       }
+
+       LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL) {
+               DEBUG_REQ(D_INFO, req, "Can't unpack body\n");
+               RETURN(-EPROTO);
+       }
+
+       /* set/clear over quota flag for a uid/gid */
+       if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
+           body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) {
+               unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid };
+
+               CDEBUG(D_QUOTA, "setdq for [%u %u] with valid "LPX64", flags %x\n",
+                      body->oa.o_uid, body->oa.o_gid, body->oa.o_valid,
+                      body->oa.o_flags);
+               osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags);
+       }
+
+       osc_update_grant(cli, body);
+
+       if (rc < 0)
+               RETURN(rc);
+
+       if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
+               client_cksum = aa->aa_oa->o_cksum; /* save for later */
+
+       if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
+               if (rc > 0) {
+                       CERROR("Unexpected +ve rc %d\n", rc);
+                       RETURN(-EPROTO);
+               }
+               LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
+
+               if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
+                       RETURN(-EAGAIN);
+
+               if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
+                   check_write_checksum(&body->oa, peer, client_cksum,
+                                        body->oa.o_cksum, aa->aa_requested_nob,
+                                        aa->aa_page_count, aa->aa_ppga,
+                                        cksum_type_unpack(aa->aa_oa->o_flags)))
+                       RETURN(-EAGAIN);
+
+               rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count,
+                                    aa->aa_page_count, aa->aa_ppga);
+               GOTO(out, rc);
+       }
+
+       /* The rest of this function executes only for OST_READs */
+
+       /* if unwrap_bulk failed, return -EAGAIN to retry */
+       rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+       if (rc < 0)
+               GOTO(out, rc = -EAGAIN);
+
+       if (rc > aa->aa_requested_nob) {
+               CERROR("Unexpected rc %d (%d requested)\n", rc,
+                      aa->aa_requested_nob);
+               RETURN(-EPROTO);
+       }
+
+       if (rc != req->rq_bulk->bd_nob_transferred) {
+               CERROR("Unexpected rc %d (%d transferred)\n",
+                      rc, req->rq_bulk->bd_nob_transferred);
+               return -EPROTO;
+       }
+
+       if (rc < aa->aa_requested_nob)
+               handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
+
+       if (body->oa.o_valid & OBD_MD_FLCKSUM) {
+               static int cksum_counter;
+               __u32      server_cksum = body->oa.o_cksum;
+               char      *via;
+               char      *router;
+               cksum_type_t cksum_type;
+
+               cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS?
+                                              body->oa.o_flags : 0);
+               client_cksum = osc_checksum_bulk(rc, aa->aa_page_count,
+                                                aa->aa_ppga, OST_READ,
+                                                cksum_type);
+
+               if (peer->nid == req->rq_bulk->bd_sender) {
+                       via = router = "";
+               } else {
+                       via = " via ";
+                       router = libcfs_nid2str(req->rq_bulk->bd_sender);
+               }
+
+               if (server_cksum == ~0 && rc > 0) {
+                       CERROR("Protocol error: server %s set the 'checksum' "
+                              "bit, but didn't send a checksum.  Not fatal, "
+                              "but please notify on http://bugs.whamcloud.com/\n",
+                              libcfs_nid2str(peer->nid));
+               } else if (server_cksum != client_cksum) {
+                       LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
+                                          "%s%s%s inode "DFID" object "DOSTID
+                                          " extent ["LPU64"-"LPU64"]\n",
+                                          req->rq_import->imp_obd->obd_name,
+                                          libcfs_nid2str(peer->nid),
+                                          via, router,
+                                          body->oa.o_valid & OBD_MD_FLFID ?
+                                               body->oa.o_parent_seq : (__u64)0,
+                                          body->oa.o_valid & OBD_MD_FLFID ?
+                                               body->oa.o_parent_oid : 0,
+                                          body->oa.o_valid & OBD_MD_FLFID ?
+                                               body->oa.o_parent_ver : 0,
+                                          POSTID(&body->oa.o_oi),
+                                          aa->aa_ppga[0]->off,
+                                          aa->aa_ppga[aa->aa_page_count-1]->off +
+                                          aa->aa_ppga[aa->aa_page_count-1]->count -
+                                                                       1);
+                       CERROR("client %x, server %x, cksum_type %x\n",
+                              client_cksum, server_cksum, cksum_type);
+                       cksum_counter = 0;
+                       aa->aa_oa->o_cksum = client_cksum;
+                       rc = -EAGAIN;
+               } else {
+                       cksum_counter++;
+                       CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+                       rc = 0;
+               }
+       } else if (unlikely(client_cksum)) {
+               static int cksum_missed;
+
+               cksum_missed++;
+               if ((cksum_missed & (-cksum_missed)) == cksum_missed)
+                       CERROR("Checksum %u requested from %s but not sent\n",
+                              cksum_missed, libcfs_nid2str(peer->nid));
+       } else {
+               rc = 0;
+       }
+out:
+       if (rc >= 0)
+               lustre_get_wire_obdo(aa->aa_oa, &body->oa);
+
+       RETURN(rc);
+}
+
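+/* Synchronous bulk I/O used by osc_brw(): build the request, queue it and
+ * wait.  Bulk timeouts are retried immediately; other recoverable errors
+ * (e.g. -EINPROGRESS) are retried with a progressively longer delay, as long
+ * as the resend limit is not exceeded and the import generation has not
+ * changed (i.e. no eviction happened in between). */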
+static int osc_brw_internal(int cmd, struct obd_export *exp, struct obdo *oa,
+                           struct lov_stripe_md *lsm,
+                           obd_count page_count, struct brw_page **pga,
+                           struct obd_capa *ocapa)
+{
+       struct ptlrpc_request *req;
+       int                 rc;
+       wait_queue_head_t           waitq;
+       int                 generation, resends = 0;
+       struct l_wait_info     lwi;
+
+       ENTRY;
+
+       init_waitqueue_head(&waitq);
+       generation = exp->exp_obd->u.cli.cl_import->imp_generation;
+
+restart_bulk:
+       rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm,
+                                 page_count, pga, &req, ocapa, 0, resends);
+       if (rc != 0)
+               return rc;
+
+       if (resends) {
+               req->rq_generation_set = 1;
+               req->rq_import_generation = generation;
+               req->rq_sent = cfs_time_current_sec() + resends;
+       }
+
+       rc = ptlrpc_queue_wait(req);
+
+       if (rc == -ETIMEDOUT && req->rq_resend) {
+               DEBUG_REQ(D_HA, req,  "BULK TIMEOUT");
+               ptlrpc_req_finished(req);
+               goto restart_bulk;
+       }
+
+       rc = osc_brw_fini_request(req, rc);
+
+       ptlrpc_req_finished(req);
+       /* When the server returns -EINPROGRESS, the client should always
+        * retry, regardless of how many times the bulk has already been
+        * resent. */
+       if (osc_recoverable_error(rc)) {
+               resends++;
+               if (rc != -EINPROGRESS &&
+                   !client_should_resend(resends, &exp->exp_obd->u.cli)) {
+                       CERROR("%s: too many resend retries for object: "
+                              ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name,
+                              POSTID(&oa->o_oi), rc);
+                       goto out;
+               }
+               if (generation !=
+                   exp->exp_obd->u.cli.cl_import->imp_generation) {
+                       CDEBUG(D_HA, "%s: resend cross eviction for object: "
+                              ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name,
+                              POSTID(&oa->o_oi), rc);
+                       goto out;
+               }
+
+               lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL,
+                                      NULL);
+               l_wait_event(waitq, 0, &lwi);
+
+               goto restart_bulk;
+       }
+out:
+       if (rc == -EAGAIN || rc == -EINPROGRESS)
+               rc = -EIO;
+       RETURN(rc);
+}
+
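+/* Rebuild a failed BRW request for resending: the new request takes over the
+ * page array, async args, extents and cached pages of the old one and is
+ * handed back to ptlrpcd.  The resend is delayed by aa_resends seconds,
+ * capped at the request timeout. */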
+static int osc_brw_redo_request(struct ptlrpc_request *request,
+                               struct osc_brw_async_args *aa, int rc)
+{
+       struct ptlrpc_request *new_req;
+       struct osc_brw_async_args *new_aa;
+       struct osc_async_page *oap;
+       ENTRY;
+
+       DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
+                 "redo for recoverable error %d", rc);
+
+       rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
+                                       OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ,
+                                 aa->aa_cli, aa->aa_oa,
+                                 NULL /* lsm unused by osc currently */,
+                                 aa->aa_page_count, aa->aa_ppga,
+                                 &new_req, aa->aa_ocapa, 0, 1);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+               if (oap->oap_request != NULL) {
+                       LASSERTF(request == oap->oap_request,
+                                "request %p != oap_request %p\n",
+                                request, oap->oap_request);
+                       if (oap->oap_interrupted) {
+                               ptlrpc_req_finished(new_req);
+                               RETURN(-EINTR);
+                       }
+               }
+       }
+       /* The new request takes over pga and oaps from the old request.
+        * Note that copying a list_head doesn't work; it needs to be moved. */
+       aa->aa_resends++;
+       new_req->rq_interpret_reply = request->rq_interpret_reply;
+       new_req->rq_async_args = request->rq_async_args;
+       new_req->rq_commit_cb = request->rq_commit_cb;
+       /* Cap the resend delay to the current request timeout; this is
+        * similar to what ptlrpc does (see after_reply()). */
+       if (aa->aa_resends > new_req->rq_timeout)
+               new_req->rq_sent = cfs_time_current_sec() + new_req->rq_timeout;
+       else
+               new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends;
+       new_req->rq_generation_set = 1;
+       new_req->rq_import_generation = request->rq_import_generation;
+
+       new_aa = ptlrpc_req_async_args(new_req);
+
+       INIT_LIST_HEAD(&new_aa->aa_oaps);
+       list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
+       INIT_LIST_HEAD(&new_aa->aa_exts);
+       list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
+       new_aa->aa_resends = aa->aa_resends;
+
+       list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
+               if (oap->oap_request) {
+                       ptlrpc_req_finished(oap->oap_request);
+                       oap->oap_request = ptlrpc_request_addref(new_req);
+               }
+       }
+
+       new_aa->aa_ocapa = aa->aa_ocapa;
+       aa->aa_ocapa = NULL;
+
+       /* XXX: This code will run into problems if we ever support adding
+        * a series of BRW RPCs to a caller-defined ptlrpc_request_set and
+        * waiting for all of them to finish. In that case the new request
+        * should inherit the request set from the old one. */
+       ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1);
+
+       DEBUG_REQ(D_INFO, new_req, "new request");
+       RETURN(0);
+}
+
+/*
+ * We want disk allocation on the target to happen in offset order, so sort
+ * the pages before sending.  Following Sedgewick's advice we stick to the
+ * dead simple shellsort -- it does fine for our small page arrays and
+ * doesn't require allocation.  It is an insertion sort that swaps elements
+ * that are strides apart, shrinking the stride down until it is 1 and the
+ * array is sorted.
+ */
+static void sort_brw_pages(struct brw_page **array, int num)
+{
+       int stride, i, j;
+       struct brw_page *tmp;
+
+       if (num == 1)
+               return;
+       for (stride = 1; stride < num ; stride = (stride * 3) + 1)
+               ;
+
+       do {
+               stride /= 3;
+               for (i = stride ; i < num ; i++) {
+                       tmp = array[i];
+                       j = i;
+                       while (j >= stride && array[j - stride]->off > tmp->off) {
+                               array[j] = array[j - stride];
+                               j -= stride;
+                       }
+                       array[j] = tmp;
+               }
+       } while (stride > 1);
+}
+
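+/* Count how many leading pages of the (offset-sorted) array form a single
+ * unfragmented run: only the first page may start part-way into a page and
+ * only the last may end part-way, so the run is cut at the first interior
+ * fragment. */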
+static obd_count max_unfragmented_pages(struct brw_page **pg, obd_count pages)
+{
+       int count = 1;
+       int offset;
+       int i = 0;
+
+       LASSERT(pages > 0);
+       offset = pg[i]->off & ~CFS_PAGE_MASK;
+
+       for (;;) {
+               pages--;
+               if (pages == 0)  /* that's all */
+                       return count;
+
+               if (offset + pg[i]->count < PAGE_CACHE_SIZE)
+                       return count;   /* doesn't end on page boundary */
+
+               i++;
+               offset = pg[i]->off & ~CFS_PAGE_MASK;
+               if (offset != 0)        /* doesn't start on page boundary */
+                       return count;
+
+               count++;
+       }
+}
+
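+/* Build an array of pointers into the flat brw_page array so that the pages
+ * can be sorted by offset without moving the pages themselves. */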
+static struct brw_page **osc_build_ppga(struct brw_page *pga, obd_count count)
+{
+       struct brw_page **ppga;
+       int i;
+
+       OBD_ALLOC(ppga, sizeof(*ppga) * count);
+       if (ppga == NULL)
+               return NULL;
+
+       for (i = 0; i < count; i++)
+               ppga[i] = pga + i;
+       return ppga;
+}
+
+static void osc_release_ppga(struct brw_page **ppga, obd_count count)
+{
+       LASSERT(ppga != NULL);
+       OBD_FREE(ppga, sizeof(*ppga) * count);
+}
+
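+/* Synchronous BRW path: split the page array into chunks of at most
+ * cl_max_pages_per_rpc pages (each further limited to one unfragmented run)
+ * and issue them back to back, saving and restoring the obdo because the
+ * BRW call clobbers it. */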
+static int osc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+                  obd_count page_count, struct brw_page *pga,
+                  struct obd_trans_info *oti)
+{
+       struct obdo *saved_oa = NULL;
+       struct brw_page **ppga, **orig;
+       struct obd_import *imp = class_exp2cliimp(exp);
+       struct client_obd *cli;
+       int rc, page_count_orig;
+       ENTRY;
+
+       LASSERT((imp != NULL) && (imp->imp_obd != NULL));
+       cli = &imp->imp_obd->u.cli;
+
+       if (cmd & OBD_BRW_CHECK) {
+               /* The caller just wants to know if there's a chance that this
+                * I/O can succeed */
+
+               if (imp->imp_invalid)
+                       RETURN(-EIO);
+               RETURN(0);
+       }
+
+       /* test_brw with a failed create can trip this, maybe others. */
+       LASSERT(cli->cl_max_pages_per_rpc);
+
+       rc = 0;
+
+       orig = ppga = osc_build_ppga(pga, page_count);
+       if (ppga == NULL)
+               RETURN(-ENOMEM);
+       page_count_orig = page_count;
+
+       sort_brw_pages(ppga, page_count);
+       while (page_count) {
+               obd_count pages_per_brw;
+
+               if (page_count > cli->cl_max_pages_per_rpc)
+                       pages_per_brw = cli->cl_max_pages_per_rpc;
+               else
+                       pages_per_brw = page_count;
+
+               pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw);
+
+               if (saved_oa != NULL) {
+                       /* restore previously saved oa */
+                       *oinfo->oi_oa = *saved_oa;
+               } else if (page_count > pages_per_brw) {
+                       /* save a copy of oa (brw will clobber it) */
+                       OBDO_ALLOC(saved_oa);
+                       if (saved_oa == NULL)
+                               GOTO(out, rc = -ENOMEM);
+                       *saved_oa = *oinfo->oi_oa;
+               }
+
+               rc = osc_brw_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md,
+                                     pages_per_brw, ppga, oinfo->oi_capa);
+
+               if (rc != 0)
+                       break;
+
+               page_count -= pages_per_brw;
+               ppga += pages_per_brw;
+       }
+
+out:
+       osc_release_ppga(orig, page_count_orig);
+
+       if (saved_oa != NULL)
+               OBDO_FREE(saved_oa);
+
+       RETURN(rc);
+}
+
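+/* Completion callback of an asynchronous BRW request: retry recoverable
+ * errors, finish all extents carried by the request, propagate the returned
+ * size/time attributes to the cl_object, and release the in-flight RPC slot
+ * so cache waiters can make progress. */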
+static int brw_interpret(const struct lu_env *env,
+                        struct ptlrpc_request *req, void *data, int rc)
+{
+       struct osc_brw_async_args *aa = data;
+       struct osc_extent *ext;
+       struct osc_extent *tmp;
+       struct cl_object  *obj = NULL;
+       struct client_obd *cli = aa->aa_cli;
+       ENTRY;
+
+       rc = osc_brw_fini_request(req, rc);
+       CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
+       /* When the server returns -EINPROGRESS, the client should always
+        * retry regardless of how many times the bulk was already resent. */
+       if (osc_recoverable_error(rc)) {
+               if (req->rq_import_generation !=
+                   req->rq_import->imp_generation) {
+                       CDEBUG(D_HA, "%s: resend cross eviction for object: "
+                              ""DOSTID", rc = %d.\n",
+                              req->rq_import->imp_obd->obd_name,
+                              POSTID(&aa->aa_oa->o_oi), rc);
+               } else if (rc == -EINPROGRESS ||
+                   client_should_resend(aa->aa_resends, aa->aa_cli)) {
+                       rc = osc_brw_redo_request(req, aa, rc);
+               } else {
+                       CERROR("%s: too many resent retries for object: "
+                              ""LPU64":"LPU64", rc = %d.\n",
+                              req->rq_import->imp_obd->obd_name,
+                              POSTID(&aa->aa_oa->o_oi), rc);
+               }
+
+               if (rc == 0)
+                       RETURN(0);
+               else if (rc == -EAGAIN || rc == -EINPROGRESS)
+                       rc = -EIO;
+       }
+
+       if (aa->aa_ocapa) {
+               capa_put(aa->aa_ocapa);
+               aa->aa_ocapa = NULL;
+       }
+
+       list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
+               if (obj == NULL && rc == 0) {
+                       obj = osc2cl(ext->oe_obj);
+                       cl_object_get(obj);
+               }
+
+               list_del_init(&ext->oe_link);
+               osc_extent_finish(env, ext, 1, rc);
+       }
+       LASSERT(list_empty(&aa->aa_exts));
+       LASSERT(list_empty(&aa->aa_oaps));
+
+       if (obj != NULL) {
+               struct obdo *oa = aa->aa_oa;
+               struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+               unsigned long valid = 0;
+
+               LASSERT(rc == 0);
+               if (oa->o_valid & OBD_MD_FLBLOCKS) {
+                       attr->cat_blocks = oa->o_blocks;
+                       valid |= CAT_BLOCKS;
+               }
+               if (oa->o_valid & OBD_MD_FLMTIME) {
+                       attr->cat_mtime = oa->o_mtime;
+                       valid |= CAT_MTIME;
+               }
+               if (oa->o_valid & OBD_MD_FLATIME) {
+                       attr->cat_atime = oa->o_atime;
+                       valid |= CAT_ATIME;
+               }
+               if (oa->o_valid & OBD_MD_FLCTIME) {
+                       attr->cat_ctime = oa->o_ctime;
+                       valid |= CAT_CTIME;
+               }
+               if (valid != 0) {
+                       cl_object_attr_lock(obj);
+                       cl_object_attr_set(env, obj, attr, valid);
+                       cl_object_attr_unlock(obj);
+               }
+               cl_object_put(env, obj);
+       }
+       OBDO_FREE(aa->aa_oa);
+
+       cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc :
+                         req->rq_bulk->bd_nob_transferred);
+       osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
+       ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
+        * is called so we know whether to go to sync BRWs or wait for more
+        * RPCs to complete */
+       if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
+               cli->cl_w_in_flight--;
+       else
+               cli->cl_r_in_flight--;
+       osc_wake_cache_waiters(cli);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+       RETURN(rc);
+}
+
+static void brw_commit(struct ptlrpc_request *req)
+{
+       spin_lock(&req->rq_lock);
+       /* If osc_inc_unstable_pages (via osc_extent_finish) races with
+        * this callback (invoked via rq_commit_cb), we need to ensure
+        * osc_dec_unstable_pages is still called. Otherwise unstable
+        * pages may be leaked. */
+       if (req->rq_unstable)
+               osc_dec_unstable_pages(req);
+       else
+               req->rq_committed = 1;
+       spin_unlock(&req->rq_lock);
+}
+
+/**
+ * Build an RPC from the list of extents @ext_list. The caller must ensure
+ * that the total number of pages in this list does not exceed the maximum
+ * number of pages per RPC. Extents in the list must be in OES_RPC state.
+ */
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+                 struct list_head *ext_list, int cmd, pdl_policy_t pol)
+{
+       struct ptlrpc_request *req = NULL;
+       struct osc_extent *ext;
+       LIST_HEAD(rpc_list);
+       struct brw_page **pga = NULL;
+       struct osc_brw_async_args *aa = NULL;
+       struct obdo *oa = NULL;
+       struct osc_async_page *oap;
+       struct osc_async_page *tmp;
+       struct cl_req *clerq = NULL;
+       enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
+       struct ldlm_lock *lock = NULL;
+       struct cl_req_attr crattr;
+       obd_off starting_offset = OBD_OBJECT_EOF;
+       obd_off ending_offset = 0;
+       int i, rc, mpflag = 0, mem_tight = 0, page_count = 0;
+
+       ENTRY;
+       LASSERT(!list_empty(ext_list));
+
+       /* add pages into rpc_list to build BRW rpc */
+       list_for_each_entry(ext, ext_list, oe_link) {
+               LASSERT(ext->oe_state == OES_RPC);
+               mem_tight |= ext->oe_memalloc;
+               list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+                       ++page_count;
+                       list_add_tail(&oap->oap_rpc_item, &rpc_list);
+                       if (starting_offset > oap->oap_obj_off)
+                               starting_offset = oap->oap_obj_off;
+                       else
+                               LASSERT(oap->oap_page_off == 0);
+                       if (ending_offset < oap->oap_obj_off + oap->oap_count)
+                               ending_offset = oap->oap_obj_off +
+                                               oap->oap_count;
+                       else
+                               LASSERT(oap->oap_page_off + oap->oap_count ==
+                                       PAGE_CACHE_SIZE);
+               }
+       }
+
+       if (mem_tight)
+               mpflag = cfs_memory_pressure_get_and_set();
+
+       memset(&crattr, 0, sizeof(crattr));
+       OBD_ALLOC(pga, sizeof(*pga) * page_count);
+       if (pga == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       OBDO_ALLOC(oa);
+       if (oa == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       i = 0;
+       list_for_each_entry(oap, &rpc_list, oap_rpc_item) {
+               struct cl_page *page = oap2cl_page(oap);
+               if (clerq == NULL) {
+                       clerq = cl_req_alloc(env, page, crt,
+                                            1 /* only 1-object rpcs for
+                                               * now */);
+                       if (IS_ERR(clerq))
+                               GOTO(out, rc = PTR_ERR(clerq));
+                       lock = oap->oap_ldlm_lock;
+               }
+               if (mem_tight)
+                       oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
+               pga[i] = &oap->oap_brw_page;
+               pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
+               CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n",
+                      pga[i]->pg, page_index(oap->oap_page), oap, pga[i]->flag);
+               i++;
+               cl_req_page_add(env, clerq, page);
+       }
+
+       /* always get the data for the obdo for the rpc */
+       LASSERT(clerq != NULL);
+       crattr.cra_oa = oa;
+       crattr.cra_capa = NULL;
+       memset(crattr.cra_jobid, 0, JOBSTATS_JOBID_SIZE);
+       cl_req_attr_set(env, clerq, &crattr, ~0ULL);
+       if (lock) {
+               oa->o_handle = lock->l_remote_handle;
+               oa->o_valid |= OBD_MD_FLHANDLE;
+       }
+
+       rc = cl_req_prep(env, clerq);
+       if (rc != 0) {
+               CERROR("cl_req_prep failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       sort_brw_pages(pga, page_count);
+       rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count,
+                       pga, &req, crattr.cra_capa, 1, 0);
+       if (rc != 0) {
+               CERROR("prep_req failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       req->rq_commit_cb = brw_commit;
+       req->rq_interpret_reply = brw_interpret;
+
+       if (mem_tight != 0)
+               req->rq_memalloc = 1;
+
+       /* Need to update the timestamps after the request is built in case
+        * we race with setattr (locally or in queue at OST).  If the OST
+        * gets the later setattr before the earlier BRW (as determined by the
+        * request xid), the OST will not use the BRW timestamps.  Sadly,
+        * there is no obvious way to do this in a single call.  bug 10150 */
+       cl_req_attr_set(env, clerq, &crattr,
+                       OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME);
+
+       lustre_msg_set_jobid(req->rq_reqmsg, crattr.cra_jobid);
+
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       INIT_LIST_HEAD(&aa->aa_oaps);
+       list_splice_init(&rpc_list, &aa->aa_oaps);
+       INIT_LIST_HEAD(&aa->aa_exts);
+       list_splice_init(ext_list, &aa->aa_exts);
+       aa->aa_clerq = clerq;
+
+       /* queued sync pages could have been torn down while the pages
+        * were between the pending list and the rpc */
+       tmp = NULL;
+       list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+               /* only one oap gets a request reference */
+               if (tmp == NULL)
+                       tmp = oap;
+               if (oap->oap_interrupted && !req->rq_intr) {
+                       CDEBUG(D_INODE, "oap %p in req %p interrupted\n",
+                                       oap, req);
+                       ptlrpc_mark_interrupted(req);
+               }
+       }
+       if (tmp != NULL)
+               tmp->oap_request = ptlrpc_request_addref(req);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       starting_offset >>= PAGE_CACHE_SHIFT;
+       if (cmd == OBD_BRW_READ) {
+               cli->cl_r_in_flight++;
+               lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
+               lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
+               lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
+                                     starting_offset + 1);
+       } else {
+               cli->cl_w_in_flight++;
+               lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
+               lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
+               lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
+                                     starting_offset + 1);
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight",
+                 page_count, aa, cli->cl_r_in_flight,
+                 cli->cl_w_in_flight);
+
+       /* XXX: Maybe the caller can check the RPC bulk descriptor to
+        * see which CPU/NUMA node the majority of pages were allocated
+        * on, and try to assign the async RPC to the CPU core
+        * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic.
+        *
+        * But on the other hand, we expect that multiple ptlrpcd
+        * threads and the initial write sponsor can run in parallel,
+        * especially when data checksum is enabled, which is CPU-bound
+        * operation and single ptlrpcd thread cannot process in time.
+        * So more ptlrpcd threads sharing BRW load
+        * (with PDL_POLICY_ROUND) seems better.
+        */
+       ptlrpcd_add_req(req, pol, -1);
+       rc = 0;
+       EXIT;
+
+out:
+       if (mem_tight != 0)
+               cfs_memory_pressure_restore(mpflag);
+
+       capa_put(crattr.cra_capa);
+       if (rc != 0) {
+               LASSERT(req == NULL);
+
+               if (oa)
+                       OBDO_FREE(oa);
+               if (pga)
+                       OBD_FREE(pga, sizeof(*pga) * page_count);
+               /* this should happen rarely and is pretty bad; it makes the
+                * pending list not follow the dirty order */
+               while (!list_empty(ext_list)) {
+                       ext = list_entry(ext_list->next, struct osc_extent,
+                                            oe_link);
+                       list_del_init(&ext->oe_link);
+                       osc_extent_finish(env, ext, 0, rc);
+               }
+               if (clerq && !IS_ERR(clerq))
+                       cl_req_completion(env, clerq, rc);
+       }
+       RETURN(rc);
+}
+
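+/* Attach einfo->ei_cbdata to the lock's l_ast_data if it is not set yet;
+ * return 1 if l_ast_data matches that data afterwards, 0 otherwise. */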
+static int osc_set_lock_data_with_check(struct ldlm_lock *lock,
+                                       struct ldlm_enqueue_info *einfo)
+{
+       void *data = einfo->ei_cbdata;
+       int set = 0;
+
+       LASSERT(lock != NULL);
+       LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl);
+       LASSERT(lock->l_resource->lr_type == einfo->ei_type);
+       LASSERT(lock->l_completion_ast == einfo->ei_cb_cp);
+       LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl);
+
+       lock_res_and_lock(lock);
+       spin_lock(&osc_ast_guard);
+
+       if (lock->l_ast_data == NULL)
+               lock->l_ast_data = data;
+       if (lock->l_ast_data == data)
+               set = 1;
+
+       spin_unlock(&osc_ast_guard);
+       unlock_res_and_lock(lock);
+
+       return set;
+}
+
+static int osc_set_data_with_check(struct lustre_handle *lockh,
+                                  struct ldlm_enqueue_info *einfo)
+{
+       struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+       int set = 0;
+
+       if (lock != NULL) {
+               set = osc_set_lock_data_with_check(lock, einfo);
+               LDLM_LOCK_PUT(lock);
+       } else
+               CERROR("lockh %p, data %p - client evicted?\n",
+                      lockh, einfo->ei_cbdata);
+       return set;
+}
+
+static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+                            ldlm_iterator_t replace, void *data)
+{
+       struct ldlm_res_id res_id;
+       struct obd_device *obd = class_exp2obd(exp);
+
+       ostid_build_res_name(&lsm->lsm_oi, &res_id);
+       ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data);
+       return 0;
+}
+
+/* Find any ldlm lock of the inode in osc.
+ * Return:  0    not found
+ *          1    found one
+ *        < 0    error */
+static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+                          ldlm_iterator_t replace, void *data)
+{
+       struct ldlm_res_id res_id;
+       struct obd_device *obd = class_exp2obd(exp);
+       int rc = 0;
+
+       ostid_build_res_name(&lsm->lsm_oi, &res_id);
+       rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data);
+       if (rc == LDLM_ITER_STOP)
+               return 1;
+       if (rc == LDLM_ITER_CONTINUE)
+               return 0;
+       return rc;
+}
+
+static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb,
+                           obd_enqueue_update_f upcall, void *cookie,
+                           __u64 *flags, int agl, int rc)
+{
+       int intent = *flags & LDLM_FL_HAS_INTENT;
+       ENTRY;
+
+       if (intent) {
+               /* The request was created before ldlm_cli_enqueue call. */
+               if (rc == ELDLM_LOCK_ABORTED) {
+                       struct ldlm_reply *rep;
+                       rep = req_capsule_server_get(&req->rq_pill,
+                                                    &RMF_DLM_REP);
+
+                       LASSERT(rep != NULL);
+                       if (rep->lock_policy_res1)
+                               rc = rep->lock_policy_res1;
+               }
+       }
+
+       if ((intent != 0 && rc == ELDLM_LOCK_ABORTED && agl == 0) ||
+           (rc == 0)) {
+               *flags |= LDLM_FL_LVB_READY;
+               CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n",
+                      lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime);
+       }
+
+       /* Call the update callback. */
+       rc = (*upcall)(cookie, rc);
+       RETURN(rc);
+}
+
+static int osc_enqueue_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req,
+                                struct osc_enqueue_args *aa, int rc)
+{
+       struct ldlm_lock *lock;
+       struct lustre_handle handle;
+       __u32 mode;
+       struct ost_lvb *lvb;
+       __u32 lvb_len;
+       __u64 *flags = aa->oa_flags;
+
+       /* Make a local copy of a lock handle and a mode, because aa->oa_*
+        * might be freed anytime after lock upcall has been called. */
+       lustre_handle_copy(&handle, aa->oa_lockh);
+       mode = aa->oa_ei->ei_mode;
+
+       /* ldlm_cli_enqueue is holding a reference on the lock, so it must
+        * be valid. */
+       lock = ldlm_handle2lock(&handle);
+
+       /* Take an additional reference so that a blocking AST that
+        * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
+        * to arrive after an upcall has been executed by
+        * osc_enqueue_fini(). */
+       ldlm_lock_addref(&handle, mode);
+
+       /* Let CP AST to grant the lock first. */
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
+
+       if (aa->oa_agl && rc == ELDLM_LOCK_ABORTED) {
+               lvb = NULL;
+               lvb_len = 0;
+       } else {
+               lvb = aa->oa_lvb;
+               lvb_len = sizeof(*aa->oa_lvb);
+       }
+
+       /* Complete obtaining the lock procedure. */
+       rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1,
+                                  mode, flags, lvb, lvb_len, &handle, rc);
+       /* Complete osc stuff. */
+       rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie,
+                             flags, aa->oa_agl, rc);
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+
+       /* Release the lock for async request. */
+       if (lustre_handle_is_used(&handle) && rc == ELDLM_OK)
+               /*
+                * Releases a reference taken by ldlm_cli_enqueue(), if it is
+                * not already released by
+                * ldlm_cli_enqueue_fini()->failed_lock_cleanup()
+                */
+               ldlm_lock_decref(&handle, mode);
+
+       LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n",
+                aa->oa_lockh, req, aa);
+       ldlm_lock_decref(&handle, mode);
+       LDLM_LOCK_PUT(lock);
+       return rc;
+}
+
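+/* Propagate the LVB returned by an enqueue into the lov_oinfo and, on
+ * success, extend the known minimum size (KMS) up to the end of the granted
+ * extent; a glimpse (intent enqueue aborted) only refreshes the LVB. */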
+void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+                       struct lov_oinfo *loi, int flags,
+                       struct ost_lvb *lvb, __u32 mode, int rc)
+{
+       struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp);
+
+       if (rc == ELDLM_OK) {
+               __u64 tmp;
+
+               LASSERT(lock != NULL);
+               loi->loi_lvb = *lvb;
+               tmp = loi->loi_lvb.lvb_size;
+               /* Extend KMS up to the end of this lock and no further
+                * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+               if (tmp > lock->l_policy_data.l_extent.end)
+                       tmp = lock->l_policy_data.l_extent.end + 1;
+               if (tmp >= loi->loi_kms) {
+                       LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64
+                                  ", kms="LPU64, loi->loi_lvb.lvb_size, tmp);
+                       loi_kms_set(loi, tmp);
+               } else {
+                       LDLM_DEBUG(lock, "lock acquired, setting rss="
+                                  LPU64"; leaving kms="LPU64", end="LPU64,
+                                  loi->loi_lvb.lvb_size, loi->loi_kms,
+                                  lock->l_policy_data.l_extent.end);
+               }
+               ldlm_lock_allow_match(lock);
+       } else if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)) {
+               LASSERT(lock != NULL);
+               loi->loi_lvb = *lvb;
+               ldlm_lock_allow_match(lock);
+               CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+                      " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms);
+               rc = ELDLM_OK;
+       }
+
+       if (lock != NULL) {
+               if (rc != ELDLM_OK)
+                       ldlm_lock_fail_match(lock);
+
+               LDLM_LOCK_PUT(lock);
+       }
+}
+EXPORT_SYMBOL(osc_update_enqueue);
+
+struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;
+
+/* When enqueuing asynchronously, locks are not ordered; we can obtain a lock
+ * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
+ * other synchronous requests, but keeping some locks while trying to obtain
+ * others may take a considerable amount of time in case of OST failure; and
+ * when a client does not release locks that other sync requests are waiting
+ * for, the client is excluded from the cluster -- such scenarios make life
+ * difficult, so release locks just after they are obtained. */
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                    __u64 *flags, ldlm_policy_data_t *policy,
+                    struct ost_lvb *lvb, int kms_valid,
+                    obd_enqueue_update_f upcall, void *cookie,
+                    struct ldlm_enqueue_info *einfo,
+                    struct lustre_handle *lockh,
+                    struct ptlrpc_request_set *rqset, int async, int agl)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct ptlrpc_request *req = NULL;
+       int intent = *flags & LDLM_FL_HAS_INTENT;
+       int match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY);
+       ldlm_mode_t mode;
+       int rc;
+       ENTRY;
+
+       /* Filesystem lock extents are extended to page boundaries so that
+        * dealing with the page cache is a little smoother.  */
+       policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+       policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+       /*
+        * kms is not valid when either object is completely fresh (so that no
+        * locks are cached), or object was evicted. In the latter case cached
+        * lock cannot be used, because it would prime inode state with
+        * potentially stale LVB.
+        */
+       if (!kms_valid)
+               goto no_match;
+
+       /* Next, search for already existing extent locks that will cover us */
+       /* If we're trying to read, we also search for an existing PW lock.  The
+        * VFS and page cache already protect us locally, so lots of readers/
+        * writers can share a single PW lock.
+        *
+        * There are problems with conversion deadlocks, so instead of
+        * converting a read lock to a write lock, we'll just enqueue a new
+        * one.
+        *
+        * At some point we should cancel the read lock instead of making them
+        * send us a blocking callback, but there are problems with canceling
+        * locks out from other users right now, too. */
+       mode = einfo->ei_mode;
+       if (einfo->ei_mode == LCK_PR)
+               mode |= LCK_PW;
+       mode = ldlm_lock_match(obd->obd_namespace, *flags | match_lvb, res_id,
+                              einfo->ei_type, policy, mode, lockh, 0);
+       if (mode) {
+               struct ldlm_lock *matched = ldlm_handle2lock(lockh);
+
+               if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) {
+                       /* For AGL, if the enqueue RPC was sent but the lock
+                        * is not granted, skip processing this stripe.
+                        * Return -ECANCELED to tell the caller. */
+                       ldlm_lock_decref(lockh, mode);
+                       LDLM_LOCK_PUT(matched);
+                       RETURN(-ECANCELED);
+               } else if (osc_set_lock_data_with_check(matched, einfo)) {
+                       *flags |= LDLM_FL_LVB_READY;
+                       /* addref the lock only if not async requests and PW
+                        * lock is matched whereas we asked for PR. */
+                       if (!rqset && einfo->ei_mode != mode)
+                               ldlm_lock_addref(lockh, LCK_PR);
+                       if (intent) {
+                               /* I would like to be able to ASSERT here that
+                                * rss <= kms, but I can't, for reasons which
+                                * are explained in lov_enqueue() */
+                       }
+
+                       /* We already have a lock, and it's referenced.
+                        *
+                        * At this point, the cl_lock::cll_state is CLS_QUEUING,
+                        * AGL upcall may change it to CLS_HELD directly. */
+                       (*upcall)(cookie, ELDLM_OK);
+
+                       if (einfo->ei_mode != mode)
+                               ldlm_lock_decref(lockh, LCK_PW);
+                       else if (rqset)
+                               /* For async requests, decref the lock. */
+                               ldlm_lock_decref(lockh, einfo->ei_mode);
+                       LDLM_LOCK_PUT(matched);
+                       RETURN(ELDLM_OK);
+               } else {
+                       ldlm_lock_decref(lockh, mode);
+                       LDLM_LOCK_PUT(matched);
+               }
+       }
+
+ no_match:
+       if (intent) {
+               LIST_HEAD(cancels);
+               req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                          &RQF_LDLM_ENQUEUE_LVB);
+               if (req == NULL)
+                       RETURN(-ENOMEM);
+
+               rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       RETURN(rc);
+               }
+
+               req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                                    sizeof *lvb);
+               ptlrpc_request_set_replen(req);
+       }
+
+       /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
+       *flags &= ~LDLM_FL_BLOCK_GRANTED;
+
+       rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
+                             sizeof(*lvb), LVB_T_OST, lockh, async);
+       if (rqset) {
+               if (!rc) {
+                       struct osc_enqueue_args *aa;
+                       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+                       aa = ptlrpc_req_async_args(req);
+                       aa->oa_ei = einfo;
+                       aa->oa_exp = exp;
+                       aa->oa_flags  = flags;
+                       aa->oa_upcall = upcall;
+                       aa->oa_cookie = cookie;
+                       aa->oa_lvb    = lvb;
+                       aa->oa_lockh  = lockh;
+                       aa->oa_agl    = !!agl;
+
+                       req->rq_interpret_reply =
+                               (ptlrpc_interpterer_t)osc_enqueue_interpret;
+                       if (rqset == PTLRPCD_SET)
+                               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+                       else
+                               ptlrpc_set_add_req(rqset, req);
+               } else if (intent) {
+                       ptlrpc_req_finished(req);
+               }
+               RETURN(rc);
+       }
+
+       rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, agl, rc);
+       if (intent)
+               ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+                      struct ldlm_enqueue_info *einfo,
+                      struct ptlrpc_request_set *rqset)
+{
+       struct ldlm_res_id res_id;
+       int rc;
+       ENTRY;
+
+       ostid_build_res_name(&oinfo->oi_md->lsm_oi, &res_id);
+       rc = osc_enqueue_base(exp, &res_id, &oinfo->oi_flags, &oinfo->oi_policy,
+                             &oinfo->oi_md->lsm_oinfo[0]->loi_lvb,
+                             oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid,
+                             oinfo->oi_cb_up, oinfo, einfo, oinfo->oi_lockh,
+                             rqset, rqset != NULL, 0);
+       RETURN(rc);
+}
+
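+/* Look for an already granted extent lock covering @policy without enqueuing
+ * a new one. A PR request also matches PW locks; if a PW lock matched a PR
+ * request (and this is not just a test), the PW reference is swapped for a
+ * PR reference. */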
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                  __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                  int *flags, void *data, struct lustre_handle *lockh,
+                  int unref)
+{
+       struct obd_device *obd = exp->exp_obd;
+       int lflags = *flags;
+       ldlm_mode_t rc;
+       ENTRY;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
+               RETURN(-EIO);
+
+       /* Filesystem lock extents are extended to page boundaries so that
+        * dealing with the page cache is a little smoother */
+       policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+       policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+       /* Next, search for already existing extent locks that will cover us */
+       /* If we're trying to read, we also search for an existing PW lock.  The
+        * VFS and page cache already protect us locally, so lots of readers/
+        * writers can share a single PW lock. */
+       rc = mode;
+       if (mode == LCK_PR)
+               rc |= LCK_PW;
+       rc = ldlm_lock_match(obd->obd_namespace, lflags,
+                            res_id, type, policy, rc, lockh, unref);
+       if (rc) {
+               if (data != NULL) {
+                       if (!osc_set_data_with_check(lockh, data)) {
+                               if (!(lflags & LDLM_FL_TEST_LOCK))
+                                       ldlm_lock_decref(lockh, rc);
+                               RETURN(0);
+                       }
+               }
+               if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) {
+                       ldlm_lock_addref(lockh, LCK_PR);
+                       ldlm_lock_decref(lockh, LCK_PW);
+               }
+               RETURN(rc);
+       }
+       RETURN(rc);
+}
+
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode)
+{
+       ENTRY;
+
+       if (unlikely(mode == LCK_GROUP))
+               ldlm_lock_decref_and_cancel(lockh, mode);
+       else
+               ldlm_lock_decref(lockh, mode);
+
+       RETURN(0);
+}
+
+static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+                     __u32 mode, struct lustre_handle *lockh)
+{
+       ENTRY;
+       RETURN(osc_cancel_base(lockh, mode));
+}
+
+static int osc_cancel_unused(struct obd_export *exp,
+                            struct lov_stripe_md *lsm,
+                            ldlm_cancel_flags_t flags,
+                            void *opaque)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct ldlm_res_id res_id, *resp = NULL;
+
+       if (lsm != NULL) {
+               ostid_build_res_name(&lsm->lsm_oi, &res_id);
+               resp = &res_id;
+       }
+
+       return ldlm_cli_cancel_unused(obd->obd_namespace, resp, flags, opaque);
+}
+
+static int osc_statfs_interpret(const struct lu_env *env,
+                               struct ptlrpc_request *req,
+                               struct osc_async_args *aa, int rc)
+{
+       struct obd_statfs *msfs;
+       ENTRY;
+
+       if (rc == -EBADR)
+               /* The request has in fact never been sent
+                * due to issues at a higher level (LOV).
+                * Exit immediately since the caller is
+                * aware of the problem and takes care
+                * of the clean up */
+                RETURN(rc);
+
+       if ((rc == -ENOTCONN || rc == -EAGAIN) &&
+           (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY))
+               GOTO(out, rc = 0);
+
+       if (rc != 0)
+               GOTO(out, rc);
+
+       msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+       if (msfs == NULL) {
+               GOTO(out, rc = -EPROTO);
+       }
+
+       *aa->aa_oi->oi_osfs = *msfs;
+out:
+       rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+       RETURN(rc);
+}
+
+static int osc_statfs_async(struct obd_export *exp,
+                           struct obd_info *oinfo, __u64 max_age,
+                           struct ptlrpc_request_set *rqset)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       struct osc_async_args *aa;
+       int                 rc;
+       ENTRY;
+
+       /* We could possibly pass max_age in the request (as an absolute
+        * timestamp or a "seconds.usec ago") so the target can avoid doing
+        * extra calls into the filesystem if that isn't necessary (e.g.
+        * during mount that would help a bit).  Having relative timestamps
+        * is not so great if request processing is slow, while absolute
+        * timestamps are not ideal because they need time synchronization. */
+       req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       ptlrpc_request_set_replen(req);
+       req->rq_request_portal = OST_CREATE_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
+               /* statfs requests from procfs must not be delayed or resent,
+                * to avoid deadlock */
+               req->rq_no_resend = 1;
+               req->rq_no_delay = 1;
+       }
+
+       req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       aa->aa_oi = oinfo;
+
+       ptlrpc_set_add_req(rqset, req);
+       RETURN(0);
+}
+
+static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
+                     struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct obd_statfs     *msfs;
+       struct ptlrpc_request *req;
+       struct obd_import     *imp = NULL;
+       int rc;
+       ENTRY;
+
+       /* Since the request might also come from lprocfs, we need to
+        * sync this with client_disconnect_export (Bug 15684). */
+       down_read(&obd->u.cli.cl_sem);
+       if (obd->u.cli.cl_import)
+               imp = class_import_get(obd->u.cli.cl_import);
+       up_read(&obd->u.cli.cl_sem);
+       if (!imp)
+               RETURN(-ENODEV);
+
+       /* We could possibly pass max_age in the request (as an absolute
+        * timestamp or a "seconds.usec ago") so the target can avoid doing
+        * extra calls into the filesystem if that isn't necessary (e.g.
+        * during mount that would help a bit).  Having relative timestamps
+        * is not so great if request processing is slow, while absolute
+        * timestamps are not ideal because they need time synchronization. */
+       req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
+
+       class_import_put(imp);
+
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       ptlrpc_request_set_replen(req);
+       req->rq_request_portal = OST_CREATE_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       if (flags & OBD_STATFS_NODELAY) {
+               /* statfs requests from procfs must not be delayed or resent,
+                * to avoid deadlock */
+               req->rq_no_resend = 1;
+               req->rq_no_delay = 1;
+       }
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+       if (msfs == NULL) {
+               GOTO(out, rc = -EPROTO);
+       }
+
+       *osfs = *msfs;
+
+       EXIT;
+ out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+/* Retrieve object striping information.
+ *
+ * @lump is a pointer to an in-core struct with lmm_ost_count indicating
+ * the maximum number of OST indices which will fit in the user buffer.
+ * lmm_magic must be LOV_MAGIC (we only use 1 slot here).
+ */
+static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
+{
+       /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+       struct lov_user_md_v3 lum, *lumk;
+       struct lov_user_ost_data_v1 *lmm_objects;
+       int rc = 0, lum_size;
+       ENTRY;
+
+       if (!lsm)
+               RETURN(-ENODATA);
+
+       /* we only need the header part from user space to get lmm_magic and
+        * lmm_stripe_count (the header part is common to v1 and v3) */
+       lum_size = sizeof(struct lov_user_md_v1);
+       if (copy_from_user(&lum, lump, lum_size))
+               RETURN(-EFAULT);
+
+       if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
+           (lum.lmm_magic != LOV_USER_MAGIC_V3))
+               RETURN(-EINVAL);
+
+       /* lov_user_md_vX and lov_mds_md_vX must have the same size */
+       LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1));
+       LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3));
+       LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0]));
+
+       /* we can use lov_mds_md_size() to compute lum_size
+        * because lov_user_md_vX and lov_mds_md_vX have the same size */
+       if (lum.lmm_stripe_count > 0) {
+               lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic);
+               OBD_ALLOC(lumk, lum_size);
+               if (!lumk)
+                       RETURN(-ENOMEM);
+
+               if (lum.lmm_magic == LOV_USER_MAGIC_V1)
+                       lmm_objects =
+                           &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]);
+               else
+                       lmm_objects = &(lumk->lmm_objects[0]);
+               lmm_objects->l_ost_oi = lsm->lsm_oi;
+       } else {
+               lum_size = lov_mds_md_size(0, lum.lmm_magic);
+               lumk = &lum;
+       }
+
+       lumk->lmm_oi = lsm->lsm_oi;
+       lumk->lmm_stripe_count = 1;
+
+       if (copy_to_user(lump, lumk, lum_size))
+               rc = -EFAULT;
+
+       if (lumk != &lum)
+               OBD_FREE(lumk, lum_size);
+
+       RETURN(rc);
+}
+
+
+static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                        void *karg, void *uarg)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct obd_ioctl_data *data = karg;
+       int err = 0;
+       ENTRY;
+
+       if (!try_module_get(THIS_MODULE)) {
+               CERROR("Can't get module. Is it alive?");
+               return -EINVAL;
+       }
+       switch (cmd) {
+       case OBD_IOC_LOV_GET_CONFIG: {
+               char *buf;
+               struct lov_desc *desc;
+               struct obd_uuid uuid;
+
+               buf = NULL;
+               len = 0;
+               if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+                       GOTO(out, err = -EINVAL);
+
+               data = (struct obd_ioctl_data *)buf;
+
+               if (sizeof(*desc) > data->ioc_inllen1) {
+                       obd_ioctl_freedata(buf, len);
+                       GOTO(out, err = -EINVAL);
+               }
+
+               if (data->ioc_inllen2 < sizeof(uuid)) {
+                       obd_ioctl_freedata(buf, len);
+                       GOTO(out, err = -EINVAL);
+               }
+
+               desc = (struct lov_desc *)data->ioc_inlbuf1;
+               desc->ld_tgt_count = 1;
+               desc->ld_active_tgt_count = 1;
+               desc->ld_default_stripe_count = 1;
+               desc->ld_default_stripe_size = 0;
+               desc->ld_default_stripe_offset = 0;
+               desc->ld_pattern = 0;
+               memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid));
+
+               memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid));
+
+               err = copy_to_user((void *)uarg, buf, len);
+               if (err)
+                       err = -EFAULT;
+               obd_ioctl_freedata(buf, len);
+               GOTO(out, err);
+       }
+       case LL_IOC_LOV_SETSTRIPE:
+               err = obd_alloc_memmd(exp, karg);
+               if (err > 0)
+                       err = 0;
+               GOTO(out, err);
+       case LL_IOC_LOV_GETSTRIPE:
+               err = osc_getstripe(karg, uarg);
+               GOTO(out, err);
+       case OBD_IOC_CLIENT_RECOVER:
+               err = ptlrpc_recover_import(obd->u.cli.cl_import,
+                                           data->ioc_inlbuf1, 0);
+               if (err > 0)
+                       err = 0;
+               GOTO(out, err);
+       case IOC_OSC_SET_ACTIVE:
+               err = ptlrpc_set_import_active(obd->u.cli.cl_import,
+                                              data->ioc_offset);
+               GOTO(out, err);
+       case OBD_IOC_POLL_QUOTACHECK:
+               err = osc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+               GOTO(out, err);
+       case OBD_IOC_PING_TARGET:
+               err = ptlrpc_obd_ping(obd);
+               GOTO(out, err);
+       default:
+               CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
+                      cmd, current_comm());
+               GOTO(out, err = -ENOTTY);
+       }
+out:
+       module_put(THIS_MODULE);
+       return err;
+}
+
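+/* obd_get_info handler: KEY_LOCK_TO_STRIPE is answered locally (a lone OSC
+ * is always stripe 0), while KEY_LAST_ID and KEY_FIEMAP are forwarded to the
+ * OST as OST_GET_INFO requests and the reply is copied back into @val. */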
+static int osc_get_info(const struct lu_env *env, struct obd_export *exp,
+                       obd_count keylen, void *key, __u32 *vallen, void *val,
+                       struct lov_stripe_md *lsm)
+{
+       ENTRY;
+       if (!vallen || !val)
+               RETURN(-EFAULT);
+
+       if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+               __u32 *stripe = val;
+               *vallen = sizeof(*stripe);
+               *stripe = 0;
+               RETURN(0);
+       } else if (KEY_IS(KEY_LAST_ID)) {
+               struct ptlrpc_request *req;
+               obd_id          *reply;
+               char              *tmp;
+               int                 rc;
+
+               req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                          &RQF_OST_GET_INFO_LAST_ID);
+               if (req == NULL)
+                       RETURN(-ENOMEM);
+
+               req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                                    RCL_CLIENT, keylen);
+               rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       RETURN(rc);
+               }
+
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+               memcpy(tmp, key, keylen);
+
+               req->rq_no_delay = req->rq_no_resend = 1;
+               ptlrpc_request_set_replen(req);
+               rc = ptlrpc_queue_wait(req);
+               if (rc)
+                       GOTO(out, rc);
+
+               reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID);
+               if (reply == NULL)
+                       GOTO(out, rc = -EPROTO);
+
+               *((obd_id *)val) = *reply;
+       out:
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       } else if (KEY_IS(KEY_FIEMAP)) {
+               struct ptlrpc_request *req;
+               struct ll_user_fiemap *reply;
+               char *tmp;
+               int rc;
+
+               req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                          &RQF_OST_GET_INFO_FIEMAP);
+               if (req == NULL)
+                       RETURN(-ENOMEM);
+
+               req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY,
+                                    RCL_CLIENT, keylen);
+               req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+                                    RCL_CLIENT, *vallen);
+               req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+                                    RCL_SERVER, *vallen);
+
+               rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       RETURN(rc);
+               }
+
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY);
+               memcpy(tmp, key, keylen);
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+               memcpy(tmp, val, *vallen);
+
+               ptlrpc_request_set_replen(req);
+               rc = ptlrpc_queue_wait(req);
+               if (rc)
+                       GOTO(out1, rc);
+
+               reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+               if (reply == NULL)
+                       GOTO(out1, rc = -EPROTO);
+
+               memcpy(val, reply, *vallen);
+       out1:
+               ptlrpc_req_finished(req);
+
+               RETURN(rc);
+       }
+
+       RETURN(-EINVAL);
+}
+
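+/* obd_set_info_async handler: checksum, sptlrpc, flush-ctx, cache and LRU
+ * shrink keys are handled locally; everything else (including grant shrink)
+ * is packed into an OST_SET_INFO RPC and sent to the OST. */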
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                             obd_count keylen, void *key, obd_count vallen,
+                             void *val, struct ptlrpc_request_set *set)
+{
+       struct ptlrpc_request *req;
+       struct obd_device     *obd = exp->exp_obd;
+       struct obd_import     *imp = class_exp2cliimp(exp);
+       char              *tmp;
+       int                 rc;
+       ENTRY;
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
+
+       if (KEY_IS(KEY_CHECKSUM)) {
+               if (vallen != sizeof(int))
+                       RETURN(-EINVAL);
+               exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
+               RETURN(0);
+       }
+
+       if (KEY_IS(KEY_SPTLRPC_CONF)) {
+               sptlrpc_conf_client_adapt(obd);
+               RETURN(0);
+       }
+
+       if (KEY_IS(KEY_FLUSH_CTX)) {
+               sptlrpc_import_flush_my_ctx(imp);
+               RETURN(0);
+       }
+
+       if (KEY_IS(KEY_CACHE_SET)) {
+               struct client_obd *cli = &obd->u.cli;
+
+               LASSERT(cli->cl_cache == NULL); /* only once */
+               cli->cl_cache = (struct cl_client_cache *)val;
+               atomic_inc(&cli->cl_cache->ccc_users);
+               cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
+
+               /* add this osc into entity list */
+               LASSERT(list_empty(&cli->cl_lru_osc));
+               spin_lock(&cli->cl_cache->ccc_lru_lock);
+               list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
+               spin_unlock(&cli->cl_cache->ccc_lru_lock);
+
+               RETURN(0);
+       }
+
+       if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
+               struct client_obd *cli = &obd->u.cli;
+               int nr = atomic_read(&cli->cl_lru_in_list) >> 1;
+               int target = *(int *)val;
+
+               nr = osc_lru_shrink(cli, min(nr, target));
+               *(int *)val -= nr;
+               RETURN(0);
+       }
+
+       if (!set && !KEY_IS(KEY_GRANT_SHRINK))
+               RETURN(-EINVAL);
+
+       /* We pass all other commands directly to OST. Since nobody calls osc
+          methods directly and everybody is supposed to go through LOV, we
+          assume lov checked invalid values for us.
+          The only recognised values so far are evict_by_nid and mds_conn.
+          Even if something bad goes through, we'd get a -EINVAL from OST
+          anyway. */
+
+       req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
+                                               &RQF_OST_SET_GRANT_INFO :
+                                               &RQF_OBD_SET_INFO);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                            RCL_CLIENT, keylen);
+       if (!KEY_IS(KEY_GRANT_SHRINK))
+               req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+                                    RCL_CLIENT, vallen);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+       memcpy(tmp, key, keylen);
+       tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
+                                                       &RMF_OST_BODY :
+                                                       &RMF_SETINFO_VAL);
+       memcpy(tmp, val, vallen);
+
+       if (KEY_IS(KEY_GRANT_SHRINK)) {
+               struct osc_grant_args *aa;
+               struct obdo *oa;
+
+               CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+               aa = ptlrpc_req_async_args(req);
+               OBDO_ALLOC(oa);
+               if (!oa) {
+                       ptlrpc_req_finished(req);
+                       RETURN(-ENOMEM);
+               }
+               *oa = ((struct ost_body *)val)->oa;
+               aa->aa_oa = oa;
+               req->rq_interpret_reply = osc_shrink_grant_interpret;
+       }
+
+       ptlrpc_request_set_replen(req);
+       if (!KEY_IS(KEY_GRANT_SHRINK)) {
+               LASSERT(set != NULL);
+               ptlrpc_set_add_req(set, req);
+               ptlrpc_check_set(NULL, set);
+       } else
+               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
+       RETURN(0);
+}
+
+
+static int osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                        struct obd_device *disk_obd, int *index)
+{
+       /* This code is not supposed to be used with LOD/OSP;
+        * it is to be removed soon. */
+       LBUG();
+       return 0;
+}
+
+static int osc_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+       if (ctxt) {
+               llog_cat_close(NULL, ctxt->loc_handle);
+               llog_cleanup(NULL, ctxt);
+       }
+
+       ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+       RETURN(0);
+}
+
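+/* On reconnect, request a grant that covers what the client currently has
+ * (available + dirty), or two full BRWs worth if that is zero, and reset the
+ * lost-grant accounting. */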
+static int osc_reconnect(const struct lu_env *env,
+                        struct obd_export *exp, struct obd_device *obd,
+                        struct obd_uuid *cluuid,
+                        struct obd_connect_data *data,
+                        void *localdata)
+{
+       struct client_obd *cli = &obd->u.cli;
+
+       if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+               long lost_grant;
+
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?:
+                               2 * cli_brw_size(obd);
+               lost_grant = cli->cl_lost_grant;
+               cli->cl_lost_grant = 0;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+               CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d"
+                      " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
+                      data->ocd_version, data->ocd_grant, lost_grant);
+       }
+
+       RETURN(0);
+}
+
+static int osc_disconnect(struct obd_export *exp)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct llog_ctxt  *ctxt;
+       int rc;
+
+       ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+       if (ctxt) {
+               if (obd->u.cli.cl_conn_count == 1) {
+                       /* Flush any remaining cancel messages out to the
+                        * target */
+                       llog_sync(ctxt, exp, 0);
+               }
+               llog_ctxt_put(ctxt);
+       } else {
+               CDEBUG(D_HA, "No LLOG_SIZE_REPL_CTXT found in obd %p\n",
+                      obd);
+       }
+
+       rc = client_disconnect_export(exp);
+       /**
+        * Initially we put del_shrink_grant before disconnect_export, but it
+        * causes the following problem if setup (connect) and cleanup
+        * (disconnect) are tangled together.
+        *      connect p1                   disconnect p2
+        *   ptlrpc_connect_import
+        *     ...............         class_manual_cleanup
+        *                                   osc_disconnect
+        *                                   del_shrink_grant
+        *   ptlrpc_connect_interrupt
+        *     init_grant_shrink
+        *   add this client to shrink list
+        *                                    cleanup_osc
+        * Bang! The pinger triggers the shrink.
+        * So the osc should be disconnected from the shrink list only after
+        * we are sure the import has been destroyed. BUG 18662
+        */
+       if (obd->u.cli.cl_import == NULL)
+               osc_del_shrink_grant(&obd->u.cli);
+       return rc;
+}
+
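+/* React to import state changes: drop grants on disconnect, flush cached
+ * pages and locks on invalidation, re-initialize grants and the request
+ * portal when the connect data (OCD) arrives, and notify the observer of
+ * (de)activation events. */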
+static int osc_import_event(struct obd_device *obd,
+                           struct obd_import *imp,
+                           enum obd_import_event event)
+{
+       struct client_obd *cli;
+       int rc = 0;
+
+       ENTRY;
+       LASSERT(imp->imp_obd == obd);
+
+       switch (event) {
+       case IMP_EVENT_DISCON: {
+               cli = &obd->u.cli;
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               cli->cl_avail_grant = 0;
+               cli->cl_lost_grant = 0;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               break;
+       }
+       case IMP_EVENT_INACTIVE: {
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+               break;
+       }
+       case IMP_EVENT_INVALIDATE: {
+               struct ldlm_namespace *ns = obd->obd_namespace;
+               struct lu_env    *env;
+               int                 refcheck;
+
+               env = cl_env_get(&refcheck);
+               if (!IS_ERR(env)) {
+                       /* Reset grants */
+                       cli = &obd->u.cli;
+                       /* all pages go to failing rpcs due to the invalid
+                        * import */
+                       osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND);
+
+                       ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+                       cl_env_put(env, &refcheck);
+               } else
+                       rc = PTR_ERR(env);
+               break;
+       }
+       case IMP_EVENT_ACTIVE: {
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+               break;
+       }
+       case IMP_EVENT_OCD: {
+               struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+               if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
+                       osc_init_grant(&obd->u.cli, ocd);
+
+               /* See bug 7198 */
+               if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+                       imp->imp_client->cli_request_portal =
+                                                       OST_REQUEST_PORTAL;
+
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+               break;
+       }
+       case IMP_EVENT_DEACTIVATE: {
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL);
+               break;
+       }
+       case IMP_EVENT_ACTIVATE: {
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL);
+               break;
+       }
+       default:
+               CERROR("Unknown import event %d\n", event);
+               LBUG();
+       }
+       RETURN(rc);
+}
+
+/**
+ * Determine whether the lock can be canceled before replaying the lock
+ * during recovery, see bug16774 for detailed information.
+ *
+ * \retval zero the lock can't be canceled
+ * \retval other ok to cancel
+ */
+static int osc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+       check_res_locked(lock->l_resource);
+
+       /*
+        * Cancel all unused extent locks in granted mode LCK_PR or LCK_CR.
+        *
+        * XXX as a future improvement, we can also cancel unused write lock
+        * if it doesn't have dirty data and active mmaps.
+        */
+       if (lock->l_resource->lr_type == LDLM_EXTENT &&
+           (lock->l_granted_mode == LCK_PR ||
+            lock->l_granted_mode == LCK_CR) &&
+           (osc_dlm_lock_pageref(lock) == 0))
+               RETURN(1);
+
+       RETURN(0);
+}
+
+static int brw_queue_work(const struct lu_env *env, void *data)
+{
+       struct client_obd *cli = data;
+
+       CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
+
+       osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+       RETURN(0);
+}
+
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       struct client_obd         *cli = &obd->u.cli;
+       void                   *handler;
+       int                     rc;
+       ENTRY;
+
+       rc = ptlrpcd_addref();
+       if (rc)
+               RETURN(rc);
+
+       rc = client_obd_setup(obd, lcfg);
+       if (rc)
+               GOTO(out_ptlrpcd, rc);
+
+       handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
+       if (IS_ERR(handler))
+               GOTO(out_client_setup, rc = PTR_ERR(handler));
+       cli->cl_writeback_work = handler;
+
+       rc = osc_quota_setup(obd);
+       if (rc)
+               GOTO(out_ptlrpcd_work, rc);
+
+       cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+       lprocfs_osc_init_vars(&lvars);
+       if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
+               lproc_osc_attach_seqstat(obd);
+               sptlrpc_lprocfs_cliobd_attach(obd);
+               ptlrpc_lprocfs_register_obd(obd);
+       }
+
+       /* We need to allocate a few extra requests, because
+        * brw_interpret tries to create new requests before freeing
+        * previous ones.  Ideally we want 2x max_rpcs_in_flight
+        * reserved, but that might waste too much RAM, so 2 is just a
+        * guess that should still work. */
+       cli->cl_import->imp_rq_pool =
+               ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+                                   OST_MAXREQSIZE,
+                                   ptlrpc_add_rqs_to_pool);
+
+       INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+       ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery);
+       RETURN(rc);
+
+out_ptlrpcd_work:
+       ptlrpcd_destroy_work(handler);
+out_client_setup:
+       client_obd_cleanup(obd);
+out_ptlrpcd:
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY: {
+               struct obd_import *imp;
+               imp = obd->u.cli.cl_import;
+               CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name);
+               /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */
+               ptlrpc_deactivate_import(imp);
+               spin_lock(&imp->imp_lock);
+               imp->imp_pingable = 0;
+               spin_unlock(&imp->imp_lock);
+               break;
+       }
+       case OBD_CLEANUP_EXPORTS: {
+               struct client_obd *cli = &obd->u.cli;
+               /* LU-464
+                * for echo client, export may be on zombie list, wait for
+                * zombie thread to cull it, because cli.cl_import will be
+                * cleared in client_disconnect_export():
+                *   class_export_destroy() -> obd_cleanup() ->
+                *   echo_device_free() -> echo_client_cleanup() ->
+                *   obd_disconnect() -> osc_disconnect() ->
+                *   client_disconnect_export()
+                */
+               obd_zombie_barrier();
+               if (cli->cl_writeback_work) {
+                       ptlrpcd_destroy_work(cli->cl_writeback_work);
+                       cli->cl_writeback_work = NULL;
+               }
+               obd_cleanup_client_import(obd);
+               ptlrpc_lprocfs_unregister_obd(obd);
+               lprocfs_obd_cleanup(obd);
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+               }
+       }
+       RETURN(rc);
+}
+
+int osc_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int rc;
+
+       ENTRY;
+
+       /* lru cleanup */
+       if (cli->cl_cache != NULL) {
+               LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
+               spin_lock(&cli->cl_cache->ccc_lru_lock);
+               list_del_init(&cli->cl_lru_osc);
+               spin_unlock(&cli->cl_cache->ccc_lru_lock);
+               cli->cl_lru_left = NULL;
+               atomic_dec(&cli->cl_cache->ccc_users);
+               cli->cl_cache = NULL;
+       }
+
+       /* free memory of osc quota cache */
+       osc_quota_cleanup(obd);
+
+       rc = client_obd_cleanup(obd);
+
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc = 0;
+
+       lprocfs_osc_init_vars(&lvars);
+
+       switch (lcfg->lcfg_command) {
+       default:
+               rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars,
+                                             lcfg, obd);
+               if (rc > 0)
+                       rc = 0;
+               break;
+       }
+
+       return(rc);
+}
+
+static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+       return osc_process_config_base(obd, buf);
+}
+
+struct obd_ops osc_obd_ops = {
+       .o_owner                = THIS_MODULE,
+       .o_setup                = osc_setup,
+       .o_precleanup           = osc_precleanup,
+       .o_cleanup              = osc_cleanup,
+       .o_add_conn             = client_import_add_conn,
+       .o_del_conn             = client_import_del_conn,
+       .o_connect              = client_connect_import,
+       .o_reconnect            = osc_reconnect,
+       .o_disconnect           = osc_disconnect,
+       .o_statfs               = osc_statfs,
+       .o_statfs_async         = osc_statfs_async,
+       .o_packmd               = osc_packmd,
+       .o_unpackmd             = osc_unpackmd,
+       .o_create               = osc_create,
+       .o_destroy              = osc_destroy,
+       .o_getattr              = osc_getattr,
+       .o_getattr_async        = osc_getattr_async,
+       .o_setattr              = osc_setattr,
+       .o_setattr_async        = osc_setattr_async,
+       .o_brw                  = osc_brw,
+       .o_punch                = osc_punch,
+       .o_sync                 = osc_sync,
+       .o_enqueue              = osc_enqueue,
+       .o_change_cbdata        = osc_change_cbdata,
+       .o_find_cbdata          = osc_find_cbdata,
+       .o_cancel               = osc_cancel,
+       .o_cancel_unused        = osc_cancel_unused,
+       .o_iocontrol            = osc_iocontrol,
+       .o_get_info             = osc_get_info,
+       .o_set_info_async       = osc_set_info_async,
+       .o_import_event         = osc_import_event,
+       .o_llog_init            = osc_llog_init,
+       .o_llog_finish          = osc_llog_finish,
+       .o_process_config       = osc_process_config,
+       .o_quotactl             = osc_quotactl,
+       .o_quotacheck           = osc_quotacheck,
+};
+
+extern struct lu_kmem_descr osc_caches[];
+extern spinlock_t osc_ast_guard;
+extern struct lock_class_key osc_ast_guard_class;
+
+int __init osc_init(void)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc;
+       ENTRY;
+
+       /* print an address of _any_ initialized kernel symbol from this
+        * module, to allow debugging with gdb that doesn't support data
+        * symbols from modules. */
+       CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
+
+       rc = lu_kmem_init(osc_caches);
+       if (rc)
+               RETURN(rc);
+
+       lprocfs_osc_init_vars(&lvars);
+
+       rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars,
+                                LUSTRE_OSC_NAME, &osc_device_type);
+       if (rc) {
+               lu_kmem_fini(osc_caches);
+               RETURN(rc);
+       }
+
+       spin_lock_init(&osc_ast_guard);
+       lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);
+
+       RETURN(rc);
+}
+
+static void /*__exit*/ osc_exit(void)
+{
+       class_unregister_type(LUSTRE_OSC_NAME);
+       lu_kmem_fini(osc_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
+MODULE_LICENSE("GPL");
+
+cfs_module(osc, LUSTRE_VERSION_STRING, osc_init, osc_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/Makefile b/drivers/staging/lustre/lustre/ptlrpc/Makefile
new file mode 100644 (file)
index 0000000..983eb66
--- /dev/null
@@ -0,0 +1,23 @@
+obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o
+LDLM := ../../lustre/ldlm/
+
+ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o
+ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o
+ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o
+ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o
+ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o
+ldlm_objs += $(LDLM)ldlm_pool.o
+ldlm_objs += $(LDLM)interval_tree.o
+ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o
+ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o
+ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
+ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o
+ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o
+ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o
+
+ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs)
+
+obj-$(CONFIG_PTLRPC_GSS) += gss/
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c
new file mode 100644 (file)
index 0000000..22f7e65
--- /dev/null
@@ -0,0 +1,3059 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** Implementation of client-side PortalRPC interfaces */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+static int ptlrpc_send_new_req(struct ptlrpc_request *req);
+
+/**
+ * Initialize passed in client structure \a cl.
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+                       struct ptlrpc_client *cl)
+{
+       cl->cli_request_portal = req_portal;
+       cl->cli_reply_portal   = rep_portal;
+       cl->cli_name       = name;
+}
+EXPORT_SYMBOL(ptlrpc_init_client);
+
+/**
+ * Return PortalRPC connection for remote uuid \a uuid
+ */
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
+{
+       struct ptlrpc_connection *c;
+       lnet_nid_t              self;
+       lnet_process_id_t        peer;
+       int                    err;
+
+       /* ptlrpc_uuid_to_peer() initializes its 2nd parameter
+        * before accessing its values. */
+       /* coverity[uninit_use_in_call] */
+       err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
+       if (err != 0) {
+               CNETERR("cannot find peer %s!\n", uuid->uuid);
+               return NULL;
+       }
+
+       c = ptlrpc_connection_get(peer, self, uuid);
+       if (c) {
+               memcpy(c->c_remote_uuid.uuid,
+                      uuid->uuid, sizeof(c->c_remote_uuid.uuid));
+       }
+
+       CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);
+
+       return c;
+}
+EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
+
+/**
+ * Allocate and initialize new bulk descriptor on the sender.
+ * Returns pointer to the descriptor or NULL on error.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+                                        unsigned type, unsigned portal)
+{
+       struct ptlrpc_bulk_desc *desc;
+       int i;
+
+       OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]));
+       if (!desc)
+               return NULL;
+
+       spin_lock_init(&desc->bd_lock);
+       init_waitqueue_head(&desc->bd_waitq);
+       desc->bd_max_iov = npages;
+       desc->bd_iov_count = 0;
+       desc->bd_portal = portal;
+       desc->bd_type = type;
+       desc->bd_md_count = 0;
+       LASSERT(max_brw > 0);
+       desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
+       /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
+        * node. Negotiated ocd_brw_size will always be <= this number. */
+       for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
+               LNetInvalidateHandle(&desc->bd_mds[i]);
+
+       return desc;
+}
+
+/**
+ * Prepare bulk descriptor for specified outgoing request \a req that
+ * can fit \a npages pages. \a type is the bulk type. \a portal is where
+ * the bulk is to be sent. Used on the client side.
+ * Returns pointer to newly allocated and initialized bulk descriptor or
+ * NULL on error.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+                                             unsigned npages, unsigned max_brw,
+                                             unsigned type, unsigned portal)
+{
+       struct obd_import *imp = req->rq_import;
+       struct ptlrpc_bulk_desc *desc;
+
+       ENTRY;
+       LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
+       desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
+       if (desc == NULL)
+               RETURN(NULL);
+
+       desc->bd_import_generation = req->rq_import_generation;
+       desc->bd_import = class_import_get(imp);
+       desc->bd_req = req;
+
+       desc->bd_cbid.cbid_fn  = client_bulk_callback;
+       desc->bd_cbid.cbid_arg = desc;
+
+       /* This makes req own desc, so desc is freed when req itself is freed */
+       req->rq_bulk = desc;
+
+       return desc;
+}
+EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
+
+/**
+ * Add a page \a page to the bulk descriptor \a desc.
+ * Data to transfer in the page starts at offset \a pageoffset and
+ * amount of data to transfer from the page is \a len
+ */
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+                            struct page *page, int pageoffset, int len, int pin)
+{
+       LASSERT(desc->bd_iov_count < desc->bd_max_iov);
+       LASSERT(page != NULL);
+       LASSERT(pageoffset >= 0);
+       LASSERT(len > 0);
+       LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);
+
+       desc->bd_nob += len;
+
+       if (pin)
+               page_cache_get(page);
+
+       ptlrpc_add_bulk_page(desc, page, pageoffset, len);
+}
+EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
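
A typical caller pairs ptlrpc_prep_bulk_imp() with a loop over
__ptlrpc_prep_bulk_page(). A minimal sketch, assuming a prepared request
"req", an array "pages[]" of npages pages and a caller-chosen "portal"
(these names are placeholders, not taken from the patch):

    struct ptlrpc_bulk_desc *desc;
    int i;

    desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_GET_SOURCE, portal);
    if (desc == NULL)
            return -ENOMEM;

    for (i = 0; i < npages; i++)
            /* pin == 1 takes a page reference; __ptlrpc_free_bulk()
             * drops it again when called with unpin == 1 */
            __ptlrpc_prep_bulk_page(desc, pages[i], 0, PAGE_CACHE_SIZE, 1);

Since ptlrpc_prep_bulk_imp() stores the descriptor in req->rq_bulk, the
caller does not free desc directly; it is torn down together with the
request.
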
+
+/**
+ * Uninitialize and free bulk descriptor \a desc.
+ * Works on bulk descriptors both from server and client side.
+ */
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
+{
+       int i;
+       ENTRY;
+
+       LASSERT(desc != NULL);
+       LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
+       LASSERT(desc->bd_md_count == 0);         /* network hands off */
+       LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
+
+       sptlrpc_enc_pool_put_pages(desc);
+
+       if (desc->bd_export)
+               class_export_put(desc->bd_export);
+       else
+               class_import_put(desc->bd_import);
+
+       if (unpin) {
+               for (i = 0; i < desc->bd_iov_count ; i++)
+                       page_cache_release(desc->bd_iov[i].kiov_page);
+       }
+
+       OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
+                               bd_iov[desc->bd_max_iov]));
+       EXIT;
+}
+EXPORT_SYMBOL(__ptlrpc_free_bulk);
+
+/**
+ * Set the server time limit for this req, i.e. how long we are willing to
+ * wait for a reply before timing out this request.
+ */
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
+{
+       __u32 serv_est;
+       int idx;
+       struct imp_at *at;
+
+       LASSERT(req->rq_import);
+
+       if (AT_OFF) {
+               /* non-AT settings */
+               /**
+                * \a imp_server_timeout means this is reverse import and
+                * we send (currently only) ASTs to the client and cannot afford
+                * to wait too long for the reply, otherwise the other client
+                * (because of which we are sending this request) would
+                * timeout waiting for us
+                */
+               req->rq_timeout = req->rq_import->imp_server_timeout ?
+                                 obd_timeout / 2 : obd_timeout;
+       } else {
+               at = &req->rq_import->imp_at;
+               idx = import_at_get_index(req->rq_import,
+                                         req->rq_request_portal);
+               serv_est = at_get(&at->iat_service_estimate[idx]);
+               req->rq_timeout = at_est2timeout(serv_est);
+       }
+       /* We could get even fancier here, using history to predict increased
+          loading... */
+
+       /* Let the server know what this RPC timeout is by putting it in the
+          reqmsg */
+       lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+}
+EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
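
With adaptive timeouts enabled, rq_timeout is derived from the per-portal
service estimate through at_est2timeout(). Using the description given in
the resend comment further down in this file (service estimate x 125% + 5s),
some illustrative values, not taken from the code:

    /* Assuming at_est2timeout(est) ~= est * 1.25 + 5:
     *   service estimate 20s  ->  rq_timeout ~ 30s
     *   service estimate 40s  ->  rq_timeout ~ 55s
     * With AT off, rq_timeout is simply obd_timeout, or obd_timeout / 2 for
     * a reverse import that sends ASTs back to a client. */
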
+
+/* Adjust max service estimate based on server value */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+                                 unsigned int serv_est)
+{
+       int idx;
+       unsigned int oldse;
+       struct imp_at *at;
+
+       LASSERT(req->rq_import);
+       at = &req->rq_import->imp_at;
+
+       idx = import_at_get_index(req->rq_import, req->rq_request_portal);
+       /* max service estimates are tracked on the server side,
+          so just keep minimal history here */
+       oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
+       if (oldse != 0)
+               CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
+                      "has changed from %d to %d\n",
+                      req->rq_import->imp_obd->obd_name,req->rq_request_portal,
+                      oldse, at_get(&at->iat_service_estimate[idx]));
+}
+
+/* Expected network latency per remote node (secs) */
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
+{
+       return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
+}
+
+/* Adjust expected network latency */
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+                                     unsigned int service_time)
+{
+       unsigned int nl, oldnl;
+       struct imp_at *at;
+       time_t now = cfs_time_current_sec();
+
+       LASSERT(req->rq_import);
+       at = &req->rq_import->imp_at;
+
+       /* Network latency is total time less server processing time */
+       nl = max_t(int, now - req->rq_sent - service_time, 0) +1/*st rounding*/;
+       if (service_time > now - req->rq_sent + 3 /* bz16408 */)
+               CWARN("Reported service time %u > total measured time "
+                     CFS_DURATION_T"\n", service_time,
+                     cfs_time_sub(now, req->rq_sent));
+
+       oldnl = at_measured(&at->iat_net_latency, nl);
+       if (oldnl != 0)
+               CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) "
+                      "has changed from %d to %d\n",
+                      req->rq_import->imp_obd->obd_name,
+                      obd_uuid2str(
+                              &req->rq_import->imp_connection->c_remote_uuid),
+                      oldnl, at_get(&at->iat_net_latency));
+}
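
A worked example of the latency sample computed above, with made-up numbers:

    /* A request sent at rq_sent = 100s gets its reply at now = 112s and the
     * server reports service_time = 10s, so
     *   nl = max(112 - 100 - 10, 0) + 1 = 3
     * seconds are fed into at_measured() as the network latency sample.
     * Had the server reported 16s (more than the 12s measured round trip
     * plus the 3s slack from bz16408), the CWARN above would fire. */
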
+
+static int unpack_reply(struct ptlrpc_request *req)
+{
+       int rc;
+
+       if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+               rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
+               if (rc) {
+                       DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
+                       return(-EPROTO);
+               }
+       }
+
+       rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+       if (rc) {
+               DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
+               return(-EPROTO);
+       }
+       return 0;
+}
+
+/**
+ * Handle an early reply message, called with the rq_lock held.
+ * If anything goes wrong just ignore it - same as if it never happened
+ */
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request *early_req;
+       time_t           olddl;
+       int                 rc;
+       ENTRY;
+
+       req->rq_early = 0;
+       spin_unlock(&req->rq_lock);
+
+       rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+       if (rc) {
+               spin_lock(&req->rq_lock);
+               RETURN(rc);
+       }
+
+       rc = unpack_reply(early_req);
+       if (rc == 0) {
+               /* Expecting to increase the service time estimate here */
+               ptlrpc_at_adj_service(req,
+                       lustre_msg_get_timeout(early_req->rq_repmsg));
+               ptlrpc_at_adj_net_latency(req,
+                       lustre_msg_get_service_time(early_req->rq_repmsg));
+       }
+
+       sptlrpc_cli_finish_early_reply(early_req);
+
+       if (rc != 0) {
+               spin_lock(&req->rq_lock);
+               RETURN(rc);
+       }
+
+       /* Adjust the local timeout for this req */
+       ptlrpc_at_set_req_timeout(req);
+
+       spin_lock(&req->rq_lock);
+       olddl = req->rq_deadline;
+       /* server assumes it now has rq_timeout from when it sent the
+        * early reply, so client should give it at least that long. */
+       req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+                          ptlrpc_at_get_net_latency(req);
+
+       DEBUG_REQ(D_ADAPTTO, req,
+                 "Early reply #%d, new deadline in "CFS_DURATION_T"s "
+                 "("CFS_DURATION_T"s)", req->rq_early_count,
+                 cfs_time_sub(req->rq_deadline, cfs_time_current_sec()),
+                 cfs_time_sub(req->rq_deadline, olddl));
+
+       RETURN(rc);
+}
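
A short numeric illustration of the deadline extension (made-up values,
ignoring any adjustment ptlrpc_at_set_req_timeout() makes to rq_timeout):

    /* A request with rq_timeout = 30s would normally expire at rq_sent + 30.
     * If an early reply arrives 25s after rq_sent with a 7s network latency
     * estimate, the new deadline becomes
     *   now + rq_timeout + net_latency = rq_sent + 25 + 30 + 7
     * i.e. the server is granted another full rq_timeout window counted from
     * the early reply. */
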
+
+/**
+ * Wind down request pool \a pool.
+ * Frees all requests from the pool too
+ */
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
+{
+       struct list_head *l, *tmp;
+       struct ptlrpc_request *req;
+
+       LASSERT(pool != NULL);
+
+       spin_lock(&pool->prp_lock);
+       list_for_each_safe(l, tmp, &pool->prp_req_list) {
+               req = list_entry(l, struct ptlrpc_request, rq_list);
+               list_del(&req->rq_list);
+               LASSERT(req->rq_reqbuf);
+               LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
+               OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
+               OBD_FREE(req, sizeof(*req));
+       }
+       spin_unlock(&pool->prp_lock);
+       OBD_FREE(pool, sizeof(*pool));
+}
+EXPORT_SYMBOL(ptlrpc_free_rq_pool);
+
+/**
+ * Allocates, initializes and adds \a num_rq requests to the pool \a pool
+ */
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
+{
+       int i;
+       int size = 1;
+
+       while (size < pool->prp_rq_size)
+               size <<= 1;
+
+       LASSERTF(list_empty(&pool->prp_req_list) ||
+                size == pool->prp_rq_size,
+                "Trying to change pool size with nonempty pool "
+                "from %d to %d bytes\n", pool->prp_rq_size, size);
+
+       spin_lock(&pool->prp_lock);
+       pool->prp_rq_size = size;
+       for (i = 0; i < num_rq; i++) {
+               struct ptlrpc_request *req;
+               struct lustre_msg *msg;
+
+               spin_unlock(&pool->prp_lock);
+               OBD_ALLOC(req, sizeof(struct ptlrpc_request));
+               if (!req)
+                       return;
+               OBD_ALLOC_LARGE(msg, size);
+               if (!msg) {
+                       OBD_FREE(req, sizeof(struct ptlrpc_request));
+                       return;
+               }
+               req->rq_reqbuf = msg;
+               req->rq_reqbuf_len = size;
+               req->rq_pool = pool;
+               spin_lock(&pool->prp_lock);
+               list_add_tail(&req->rq_list, &pool->prp_req_list);
+       }
+       spin_unlock(&pool->prp_lock);
+       return;
+}
+EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
+
+/**
+ * Create and initialize new request pool with given attributes:
+ * \a num_rq - initial number of requests to create for the pool
+ * \a msgsize - maximum message size possible for requests in this pool
+ * \a populate_pool - function to be called when more requests need to be added
+ *                 to the pool
+ * Returns pointer to newly created pool or NULL on error.
+ */
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int num_rq, int msgsize,
+                   void (*populate_pool)(struct ptlrpc_request_pool *, int))
+{
+       struct ptlrpc_request_pool *pool;
+
+       OBD_ALLOC(pool, sizeof (struct ptlrpc_request_pool));
+       if (!pool)
+               return NULL;
+
+       /* Request the next power of two for the allocation, because
+          internally the kernel would do exactly this anyway */
+
+       spin_lock_init(&pool->prp_lock);
+       INIT_LIST_HEAD(&pool->prp_req_list);
+       pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
+       pool->prp_populate = populate_pool;
+
+       populate_pool(pool, num_rq);
+
+       if (list_empty(&pool->prp_req_list)) {
+               /* have not allocated a single request for the pool */
+               OBD_FREE(pool, sizeof (struct ptlrpc_request_pool));
+               pool = NULL;
+       }
+       return pool;
+}
+EXPORT_SYMBOL(ptlrpc_init_rq_pool);
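
The pool lifecycle mirrors what osc_setup() does earlier in this patch. A
minimal sketch, with a hypothetical pool size of 10 requests:

    struct ptlrpc_request_pool *pool;

    /* 10 pre-allocated requests of up to OST_MAXREQSIZE bytes each,
     * populated through ptlrpc_add_rqs_to_pool() above.  A NULL result is
     * tolerated by callers: __ptlrpc_request_alloc() falls back to a plain
     * allocation when no pool is available. */
    pool = ptlrpc_init_rq_pool(10, OST_MAXREQSIZE, ptlrpc_add_rqs_to_pool);

    /* ... allocate requests against the pool ... */

    if (pool != NULL)
            ptlrpc_free_rq_pool(pool);
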
+
+/**
+ * Fetches one request from pool \a pool
+ */
+static struct ptlrpc_request *
+ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
+{
+       struct ptlrpc_request *request;
+       struct lustre_msg *reqbuf;
+
+       if (!pool)
+               return NULL;
+
+       spin_lock(&pool->prp_lock);
+
+       /* See if we have anything in the pool and bail out if not.
+        * In the writeout path, where this matters, this is safe to do
+        * because nothing is lost in this case; when some in-flight requests
+        * complete, this code will be called again. */
+       if (unlikely(list_empty(&pool->prp_req_list))) {
+               spin_unlock(&pool->prp_lock);
+               return NULL;
+       }
+
+       request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
+                                rq_list);
+       list_del_init(&request->rq_list);
+       spin_unlock(&pool->prp_lock);
+
+       LASSERT(request->rq_reqbuf);
+       LASSERT(request->rq_pool);
+
+       reqbuf = request->rq_reqbuf;
+       memset(request, 0, sizeof(*request));
+       request->rq_reqbuf = reqbuf;
+       request->rq_reqbuf_len = pool->prp_rq_size;
+       request->rq_pool = pool;
+
+       return request;
+}
+
+/**
+ * Returns freed \a request to pool.
+ */
+static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
+{
+       struct ptlrpc_request_pool *pool = request->rq_pool;
+
+       spin_lock(&pool->prp_lock);
+       LASSERT(list_empty(&request->rq_list));
+       LASSERT(!request->rq_receiving_reply);
+       list_add_tail(&request->rq_list, &pool->prp_req_list);
+       spin_unlock(&pool->prp_lock);
+}
+
+static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+                                     __u32 version, int opcode,
+                                     int count, __u32 *lengths, char **bufs,
+                                     struct ptlrpc_cli_ctx *ctx)
+{
+       struct obd_import  *imp = request->rq_import;
+       int              rc;
+       ENTRY;
+
+       if (unlikely(ctx))
+               request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
+       else {
+               rc = sptlrpc_req_get_ctx(request);
+               if (rc)
+                       GOTO(out_free, rc);
+       }
+
+       sptlrpc_req_set_flavor(request, opcode);
+
+       rc = lustre_pack_request(request, imp->imp_msg_magic, count,
+                                lengths, bufs);
+       if (rc) {
+               LASSERT(!request->rq_pool);
+               GOTO(out_ctx, rc);
+       }
+
+       lustre_msg_add_version(request->rq_reqmsg, version);
+       request->rq_send_state = LUSTRE_IMP_FULL;
+       request->rq_type = PTL_RPC_MSG_REQUEST;
+       request->rq_export = NULL;
+
+       request->rq_req_cbid.cbid_fn  = request_out_callback;
+       request->rq_req_cbid.cbid_arg = request;
+
+       request->rq_reply_cbid.cbid_fn  = reply_in_callback;
+       request->rq_reply_cbid.cbid_arg = request;
+
+       request->rq_reply_deadline = 0;
+       request->rq_phase = RQ_PHASE_NEW;
+       request->rq_next_phase = RQ_PHASE_UNDEFINED;
+
+       request->rq_request_portal = imp->imp_client->cli_request_portal;
+       request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+
+       ptlrpc_at_set_req_timeout(request);
+
+       spin_lock_init(&request->rq_lock);
+       INIT_LIST_HEAD(&request->rq_list);
+       INIT_LIST_HEAD(&request->rq_timed_list);
+       INIT_LIST_HEAD(&request->rq_replay_list);
+       INIT_LIST_HEAD(&request->rq_ctx_chain);
+       INIT_LIST_HEAD(&request->rq_set_chain);
+       INIT_LIST_HEAD(&request->rq_history_list);
+       INIT_LIST_HEAD(&request->rq_exp_list);
+       init_waitqueue_head(&request->rq_reply_waitq);
+       init_waitqueue_head(&request->rq_set_waitq);
+       request->rq_xid = ptlrpc_next_xid();
+       atomic_set(&request->rq_refcount, 1);
+
+       lustre_msg_set_opc(request->rq_reqmsg, opcode);
+
+       RETURN(0);
+out_ctx:
+       sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
+out_free:
+       class_import_put(imp);
+       return rc;
+}
+
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+                            __u32 version, int opcode, char **bufs,
+                            struct ptlrpc_cli_ctx *ctx)
+{
+       int count;
+
+       count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
+       return __ptlrpc_request_bufs_pack(request, version, opcode, count,
+                                         request->rq_pill.rc_area[RCL_CLIENT],
+                                         bufs, ctx);
+}
+EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
+
+/**
+ * Pack request buffers for network transfer, performing necessary encryption
+ * steps if necessary.
+ */
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+                       __u32 version, int opcode)
+{
+       int rc;
+       rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
+       if (rc)
+               return rc;
+
+       /* Some old 1.8 clients (< 1.8.7) LASSERT that the size of the
+        * ptlrpc_body sent from the server equals the local ptlrpc_body size,
+        * so we have to send the old ptlrpc_body to keep interoperability
+        * with these clients.
+        *
+        * Only three kinds of server->client RPCs so far:
+        *  - LDLM_BL_CALLBACK
+        *  - LDLM_CP_CALLBACK
+        *  - LDLM_GL_CALLBACK
+        *
+        * XXX This should be removed whenever we drop interoperability with
+        *     these old clients.
+        */
+       if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
+           opcode == LDLM_GL_CALLBACK)
+               req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
+                                  sizeof(struct ptlrpc_body_v2), RCL_CLIENT);
+
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_request_pack);
+
+/**
+ * Helper function to allocate new request on import \a imp
+ * possibly using an existing request from pool \a pool if provided.
+ * Returns allocated request structure with import field filled or
+ * NULL on error.
+ */
+static inline
+struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
+                                             struct ptlrpc_request_pool *pool)
+{
+       struct ptlrpc_request *request = NULL;
+
+       if (pool)
+               request = ptlrpc_prep_req_from_pool(pool);
+
+       if (!request)
+               OBD_ALLOC_PTR(request);
+
+       if (request) {
+               LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
+               LASSERT(imp != LP_POISON);
+               LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
+                       imp->imp_client);
+               LASSERT(imp->imp_client != LP_POISON);
+
+               request->rq_import = class_import_get(imp);
+       } else {
+               CERROR("request allocation out of memory\n");
+       }
+
+       return request;
+}
+
+/**
+ * Helper function for creating a request.
+ * Calls __ptlrpc_request_alloc to allocate a new request structure and inits
+ * buffer structures according to capsule template \a format.
+ * Returns allocated request structure pointer or NULL on error.
+ */
+static struct ptlrpc_request *
+ptlrpc_request_alloc_internal(struct obd_import *imp,
+                             struct ptlrpc_request_pool * pool,
+                             const struct req_format *format)
+{
+       struct ptlrpc_request *request;
+
+       request = __ptlrpc_request_alloc(imp, pool);
+       if (request == NULL)
+               return NULL;
+
+       req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
+       req_capsule_set(&request->rq_pill, format);
+       return request;
+}
+
+/**
+ * Allocate new request structure for import \a imp and initialize its
+ * buffer structure according to capsule template \a format.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+                                           const struct req_format *format)
+{
+       return ptlrpc_request_alloc_internal(imp, NULL, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc);
+
+/**
+ * Allocate new request structure for import \a imp from pool \a pool and
+ * initialize its buffer structure according to capsule template \a format.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+                                           struct ptlrpc_request_pool * pool,
+                                           const struct req_format *format)
+{
+       return ptlrpc_request_alloc_internal(imp, pool, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
+
+/**
+ * For requests not from pool, free memory of the request structure.
+ * For requests obtained from a pool earlier, return request back to pool.
+ */
+void ptlrpc_request_free(struct ptlrpc_request *request)
+{
+       if (request->rq_pool)
+               __ptlrpc_free_req_to_pool(request);
+       else
+               OBD_FREE_PTR(request);
+}
+EXPORT_SYMBOL(ptlrpc_request_free);
+
+/**
+ * Allocate new request for operation \a opcode and immediately pack it for
+ * network transfer.
+ * Only used for simple requests like OBD_PING where the only important
+ * part of the request is the operation itself.
+ * Returns allocated request or NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+                                               const struct req_format *format,
+                                               __u32 version, int opcode)
+{
+       struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
+       int                 rc;
+
+       if (req) {
+               rc = ptlrpc_request_pack(req, version, opcode);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       req = NULL;
+               }
+       }
+       return req;
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
+
+/**
+ * Prepare request (fetched from pool \a pool if not NULL) on import \a imp
+ * for operation \a opcode. Request would contain \a count buffers.
+ * Sizes of buffers are described in array \a lengths and buffers themselves
+ * are provided by a pointer \a bufs.
+ * Returns prepared request structure pointer or NULL on error.
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req_pool(struct obd_import *imp,
+                    __u32 version, int opcode,
+                    int count, __u32 *lengths, char **bufs,
+                    struct ptlrpc_request_pool *pool)
+{
+       struct ptlrpc_request *request;
+       int                 rc;
+
+       request = __ptlrpc_request_alloc(imp, pool);
+       if (!request)
+               return NULL;
+
+       rc = __ptlrpc_request_bufs_pack(request, version, opcode, count,
+                                       lengths, bufs, NULL);
+       if (rc) {
+               ptlrpc_request_free(request);
+               request = NULL;
+       }
+       return request;
+}
+EXPORT_SYMBOL(ptlrpc_prep_req_pool);
+
+/**
+ * Same as ptlrpc_prep_req_pool, but without pool
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
+               __u32 *lengths, char **bufs)
+{
+       return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
+                                   NULL);
+}
+EXPORT_SYMBOL(ptlrpc_prep_req);
+
+/**
+ * Allocate and initialize new request set structure.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_set(void)
+{
+       struct ptlrpc_request_set *set;
+
+       ENTRY;
+       OBD_ALLOC(set, sizeof *set);
+       if (!set)
+               RETURN(NULL);
+       atomic_set(&set->set_refcount, 1);
+       INIT_LIST_HEAD(&set->set_requests);
+       init_waitqueue_head(&set->set_waitq);
+       atomic_set(&set->set_new_count, 0);
+       atomic_set(&set->set_remaining, 0);
+       spin_lock_init(&set->set_new_req_lock);
+       INIT_LIST_HEAD(&set->set_new_requests);
+       INIT_LIST_HEAD(&set->set_cblist);
+       set->set_max_inflight = UINT_MAX;
+       set->set_producer     = NULL;
+       set->set_producer_arg = NULL;
+       set->set_rc        = 0;
+
+       RETURN(set);
+}
+EXPORT_SYMBOL(ptlrpc_prep_set);
+
+/**
+ * Allocate and initialize new request set structure with flow control
+ * extension. This extension makes it possible to control the number of
+ * requests in flight for the whole set. A callback function to generate
+ * requests must be provided and the request set will keep the number of
+ * requests sent over the wire at or below \a max_inflight.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+                                            void *arg)
+
+{
+       struct ptlrpc_request_set *set;
+
+       set = ptlrpc_prep_set();
+       if (!set)
+               RETURN(NULL);
+
+       set->set_max_inflight  = max;
+       set->set_producer      = func;
+       set->set_producer_arg  = arg;
+
+       RETURN(set);
+}
+EXPORT_SYMBOL(ptlrpc_prep_fcset);
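
A minimal sketch of how the request-set helpers fit together, issuing a few
pings in parallel. RQF_OBD_PING, LUSTRE_OBD_VERSION,
ptlrpc_request_set_replen() and ptlrpc_set_wait() are assumed here; they are
defined elsewhere in the tree and do not appear in this excerpt, and "imp"
stands for some existing import:

    struct ptlrpc_request_set *set;
    struct ptlrpc_request *req;
    int i, rc;

    set = ptlrpc_prep_set();
    if (set == NULL)
            return -ENOMEM;

    for (i = 0; i < 3; i++) {
            req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
                                            LUSTRE_OBD_VERSION, OBD_PING);
            if (req == NULL)
                    break;
            ptlrpc_request_set_replen(req);
            ptlrpc_set_add_req(set, req);   /* the set takes the reference */
    }

    rc = ptlrpc_set_wait(set);      /* send everything and wait for replies */
    ptlrpc_set_destroy(set);        /* releases any remaining references */
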
+
+/**
+ * Wind down and free request set structure previously allocated with
+ * ptlrpc_prep_set.
+ * Ensures that all requests on the set have completed and removes
+ * all requests from the request list in a set.
+ * If any unsent requests happen to be on the list, pretends that they got
+ * an error in flight and calls their completion handlers.
+ */
+void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
+{
+       struct list_head       *tmp;
+       struct list_head       *next;
+       int            expected_phase;
+       int            n = 0;
+       ENTRY;
+
+       /* Requests on the set should either all be completed, or all be new */
+       expected_phase = (atomic_read(&set->set_remaining) == 0) ?
+                        RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
+       list_for_each (tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               LASSERT(req->rq_phase == expected_phase);
+               n++;
+       }
+
+       LASSERTF(atomic_read(&set->set_remaining) == 0 ||
+                atomic_read(&set->set_remaining) == n, "%d / %d\n",
+                atomic_read(&set->set_remaining), n);
+
+       list_for_each_safe(tmp, next, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+               list_del_init(&req->rq_set_chain);
+
+               LASSERT(req->rq_phase == expected_phase);
+
+               if (req->rq_phase == RQ_PHASE_NEW) {
+                       ptlrpc_req_interpret(NULL, req, -EBADR);
+                       atomic_dec(&set->set_remaining);
+               }
+
+               spin_lock(&req->rq_lock);
+               req->rq_set = NULL;
+               req->rq_invalid_rqset = 0;
+               spin_unlock(&req->rq_lock);
+
+               ptlrpc_req_finished (req);
+       }
+
+       LASSERT(atomic_read(&set->set_remaining) == 0);
+
+       ptlrpc_reqset_put(set);
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_set_destroy);
+
+/**
+ * Add a callback function \a fn to the set.
+ * It will be called when all requests on this set have completed, and
+ * will be passed the \a data argument.
+ */
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+                     set_interpreter_func fn, void *data)
+{
+       struct ptlrpc_set_cbdata *cbdata;
+
+       OBD_ALLOC_PTR(cbdata);
+       if (cbdata == NULL)
+               RETURN(-ENOMEM);
+
+       cbdata->psc_interpret = fn;
+       cbdata->psc_data = data;
+       list_add_tail(&cbdata->psc_item, &set->set_cblist);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_cb);
+
+/**
+ * Add a new request to the general purpose request set.
+ * Assumes request reference from the caller.
+ */
+void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
+                       struct ptlrpc_request *req)
+{
+       LASSERT(list_empty(&req->rq_set_chain));
+
+       /* The set takes over the caller's request reference */
+       list_add_tail(&req->rq_set_chain, &set->set_requests);
+       req->rq_set = set;
+       atomic_inc(&set->set_remaining);
+       req->rq_queued_time = cfs_time_current();
+
+       if (req->rq_reqmsg != NULL)
+               lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+       if (set->set_producer != NULL)
+               /* If the request set has a producer callback, the RPC must be
+                * sent straight away */
+               ptlrpc_send_new_req(req);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_req);
+
+/**
+ * Add a request to a request set with a dedicated server thread
+ * and wake the thread to do any necessary processing.
+ * Currently only used for ptlrpcd.
+ */
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+                          struct ptlrpc_request *req)
+{
+       struct ptlrpc_request_set *set = pc->pc_set;
+       int count, i;
+
+       LASSERT(req->rq_set == NULL);
+       LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
+
+       spin_lock(&set->set_new_req_lock);
+       /*
+        * The set takes over the caller's request reference.
+        */
+       req->rq_set = set;
+       req->rq_queued_time = cfs_time_current();
+       list_add_tail(&req->rq_set_chain, &set->set_new_requests);
+       count = atomic_inc_return(&set->set_new_count);
+       spin_unlock(&set->set_new_req_lock);
+
+       /* Only need to call wakeup once for the first entry. */
+       if (count == 1) {
+               wake_up(&set->set_waitq);
+
+               /* XXX: It may be unnecessary to wake up all the partners.
+                *      But to guarantee the async RPC can be processed ASAP,
+                *      we have no better choice. It may be fixed in future. */
+               for (i = 0; i < pc->pc_npartners; i++)
+                       wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+       }
+}
+EXPORT_SYMBOL(ptlrpc_set_add_new_req);
+
+/**
+ * Based on the current state of the import, determine if the request
+ * can be sent, is an error, or should be delayed.
+ *
+ * Returns true if this request should be delayed. If false and
+ * *status is set, then the request cannot be sent and *status is the
+ * error code.  If false and *status is 0, then the request can be sent.
+ *
+ * The imp->imp_lock must be held.
+ */
+static int ptlrpc_import_delay_req(struct obd_import *imp,
+                                  struct ptlrpc_request *req, int *status)
+{
+       int delay = 0;
+       ENTRY;
+
+       LASSERT (status != NULL);
+       *status = 0;
+
+       if (req->rq_ctx_init || req->rq_ctx_fini) {
+               /* always allow ctx init/fini rpc go through */
+       } else if (imp->imp_state == LUSTRE_IMP_NEW) {
+               DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
+               *status = -EIO;
+       } else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+               /* pings may safely race with umount */
+               DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
+                         D_HA : D_ERROR, req, "IMP_CLOSED ");
+               *status = -EIO;
+       } else if (ptlrpc_send_limit_expired(req)) {
+               /* probably doesn't need to be a D_ERROR after initial testing */
+               DEBUG_REQ(D_ERROR, req, "send limit expired ");
+               *status = -EIO;
+       } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
+                  imp->imp_state == LUSTRE_IMP_CONNECTING) {
+               /* allow CONNECT even if import is invalid */ ;
+               if (atomic_read(&imp->imp_inval_count) != 0) {
+                       DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+                       *status = -EIO;
+               }
+       } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
+               if (!imp->imp_deactive)
+                       DEBUG_REQ(D_NET, req, "IMP_INVALID");
+               *status = -ESHUTDOWN; /* bz 12940 */
+       } else if (req->rq_import_generation != imp->imp_generation) {
+               DEBUG_REQ(D_ERROR, req, "req wrong generation:");
+               *status = -EIO;
+       } else if (req->rq_send_state != imp->imp_state) {
+               /* invalidate in progress - any requests should be dropped */
+               if (atomic_read(&imp->imp_inval_count) != 0) {
+                       DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+                       *status = -EIO;
+               } else if (imp->imp_dlm_fake || req->rq_no_delay) {
+                       *status = -EWOULDBLOCK;
+               } else if (req->rq_allow_replay &&
+                         (imp->imp_state == LUSTRE_IMP_REPLAY ||
+                          imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
+                          imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
+                          imp->imp_state == LUSTRE_IMP_RECOVER)) {
+                       DEBUG_REQ(D_HA, req, "allow during recovery.\n");
+               } else {
+                       delay = 1;
+               }
+       }
+
+       RETURN(delay);
+}
+
+/**
+ * Decide if the error message regarding provided request \a req
+ * should be printed to the console or not.
+ * Makes its decision based on request status and other properties.
+ * Returns 1 to print the error on the system console or 0 if not.
+ */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
+{
+       __u32 opc;
+       int err;
+
+       LASSERT(req->rq_reqmsg != NULL);
+       opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+       /* Suppress particular reconnect errors which are to be expected.  No
+        * errors are suppressed for the initial connection on an import */
+       if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
+           (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
+
+               /* Suppress timed out reconnect requests */
+               if (req->rq_timedout)
+                       return 0;
+
+               /* Suppress unavailable/again reconnect requests */
+               err = lustre_msg_get_status(req->rq_repmsg);
+               if (err == -ENODEV || err == -EAGAIN)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/**
+ * Check request processing status.
+ * Returns the status.
+ */
+static int ptlrpc_check_status(struct ptlrpc_request *req)
+{
+       int err;
+       ENTRY;
+
+       err = lustre_msg_get_status(req->rq_repmsg);
+       if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+               struct obd_import *imp = req->rq_import;
+               __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+               if (ptlrpc_console_allow(req))
+                       LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s,"
+                                          " operation %s failed with %d.\n",
+                                          imp->imp_obd->obd_name,
+                                          libcfs_nid2str(
+                                          imp->imp_connection->c_peer.nid),
+                                          ll_opcode2str(opc), err);
+               RETURN(err < 0 ? err : -EINVAL);
+       }
+
+       if (err < 0) {
+               DEBUG_REQ(D_INFO, req, "status is %d", err);
+       } else if (err > 0) {
+               /* XXX: translate this error from net to host */
+               DEBUG_REQ(D_INFO, req, "status is %d", err);
+       }
+
+       RETURN(err);
+}
+
+/**
+ * Save pre-versions of objects into request for replay.
+ * Versions are obtained from the server reply.
+ * Used for VBR (version-based recovery).
+ */
+static void ptlrpc_save_versions(struct ptlrpc_request *req)
+{
+       struct lustre_msg *repmsg = req->rq_repmsg;
+       struct lustre_msg *reqmsg = req->rq_reqmsg;
+       __u64 *versions = lustre_msg_get_versions(repmsg);
+       ENTRY;
+
+       if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
+               return;
+
+       LASSERT(versions);
+       lustre_msg_set_versions(reqmsg, versions);
+       CDEBUG(D_INFO, "Client save versions ["LPX64"/"LPX64"]\n",
+              versions[0], versions[1]);
+
+       EXIT;
+}
+
+/**
+ * Callback function called when client receives RPC reply for \a req.
+ * Returns 0 on success or error code.
+ * The return value is assigned to req->rq_status by the caller
+ * as request processing status.
+ * This function also decides if the request needs to be saved for later replay.
+ */
+static int after_reply(struct ptlrpc_request *req)
+{
+       struct obd_import *imp = req->rq_import;
+       struct obd_device *obd = req->rq_import->imp_obd;
+       int rc;
+       struct timeval work_start;
+       long timediff;
+       ENTRY;
+
+       LASSERT(obd != NULL);
+       /* repbuf must be unlinked */
+       LASSERT(!req->rq_receiving_reply && !req->rq_must_unlink);
+
+       if (req->rq_reply_truncate) {
+               if (ptlrpc_no_resend(req)) {
+                       DEBUG_REQ(D_ERROR, req, "reply buffer overflow,"
+                                 " expected: %d, actual size: %d",
+                                 req->rq_nob_received, req->rq_repbuf_len);
+                       RETURN(-EOVERFLOW);
+               }
+
+               sptlrpc_cli_free_repbuf(req);
+               /* Pass the required reply buffer size (including
+                * space for early reply).
+                * NB: no need to round up because alloc_repbuf
+                * will round it up */
+               req->rq_replen       = req->rq_nob_received;
+               req->rq_nob_received = 0;
+               req->rq_resend       = 1;
+               RETURN(0);
+       }
+
+       /*
+        * NB Until this point, the whole of the incoming message,
+        * including buflens, status etc is in the sender's byte order.
+        */
+       rc = sptlrpc_cli_unwrap_reply(req);
+       if (rc) {
+               DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
+               RETURN(rc);
+       }
+
+       /*
+        * Security layer unwrap might ask resend this request.
+        */
+       if (req->rq_resend)
+               RETURN(0);
+
+       rc = unpack_reply(req);
+       if (rc)
+               RETURN(rc);
+
+       /* retry indefinitely on EINPROGRESS */
+       if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
+           ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
+               time_t  now = cfs_time_current_sec();
+
+               DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
+               req->rq_resend = 1;
+               req->rq_nr_resend++;
+
+               /* allocate new xid to avoid reply reconstruction */
+               if (!req->rq_bulk) {
+                       /* new xid is already allocated for bulk in
+                        * ptlrpc_check_set() */
+                       req->rq_xid = ptlrpc_next_xid();
+                       DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for "
+                                 "resend on EINPROGRESS");
+               }
+
+               /* Readjust the timeout for current conditions */
+               ptlrpc_at_set_req_timeout(req);
+               /* delay the resend to give the server a chance to get ready.
+                * The delay is increased by 1s on every resend and is capped to
+                * the current request timeout (i.e. obd_timeout if AT is off,
+                * or AT service time x 125% + 5s, see at_est2timeout) */
+               if (req->rq_nr_resend > req->rq_timeout)
+                       req->rq_sent = now + req->rq_timeout;
+               else
+                       req->rq_sent = now + req->rq_nr_resend;
+
+               RETURN(0);
+       }
+
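+       /* Update the per-device RPC timing statistics for this request. */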
+       do_gettimeofday(&work_start);
+       timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
+       if (obd->obd_svc_stats != NULL) {
+               lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
+                                   timediff);
+               ptlrpc_lprocfs_rpc_sent(req, timediff);
+       }
+
+       if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
+           lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
+               DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
+                         lustre_msg_get_type(req->rq_repmsg));
+               RETURN(-EPROTO);
+       }
+
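+       /* Fault injection point: optionally delay reply processing for
+        * non-PING RPCs. */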
+       if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
+               CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
+       ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+       ptlrpc_at_adj_net_latency(req,
+                                 lustre_msg_get_service_time(req->rq_repmsg));
+
+       rc = ptlrpc_check_status(req);
+       imp->imp_connect_error = rc;
+
+       if (rc) {
+               /*
+                * Either we've been evicted, or the server has failed for
+                * some reason. Try to reconnect, and if that fails, punt to
+                * the upcall.
+                */
+               if (ll_rpc_recoverable_error(rc)) {
+                       if (req->rq_send_state != LUSTRE_IMP_FULL ||
+                           imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
+                               RETURN(rc);
+                       }
+                       ptlrpc_request_handle_notconn(req);
+                       RETURN(rc);
+               }
+       } else {
+               /*
+                * Check if the server sent an SLV (server lock volume).
+                * Only do this for RPCs with rc == 0.
+                */
+               ldlm_cli_update_pool(req);
+       }
+
+       /*
+        * Store transno in reqmsg for replay.
+        */
+       if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
+               req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
+               lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
+       }
+
+       if (imp->imp_replayable) {
+               spin_lock(&imp->imp_lock);
+               /*
+                * No point in adding already-committed requests to the replay
+                * list, we will just remove them immediately. b=9829
+                */
+               if (req->rq_transno != 0 &&
+                   (req->rq_transno >
+                    lustre_msg_get_last_committed(req->rq_repmsg) ||
+                    req->rq_replay)) {
+                       /** version recovery */
+                       ptlrpc_save_versions(req);
+                       ptlrpc_retain_replayable_request(req, imp);
+               } else if (req->rq_commit_cb != NULL) {
+                       spin_unlock(&imp->imp_lock);
+                       req->rq_commit_cb(req);
+                       spin_lock(&imp->imp_lock);
+               }
+
+               /*
+                * Replay-enabled imports return commit-status information.
+                */
+               if (lustre_msg_get_last_committed(req->rq_repmsg)) {
+                       imp->imp_peer_committed_transno =
+                               lustre_msg_get_last_committed(req->rq_repmsg);
+               }
+
+               ptlrpc_free_committed(imp);
+
+               if (!list_empty(&imp->imp_replay_list)) {
+                       struct ptlrpc_request *last;
+
+                       last = list_entry(imp->imp_replay_list.prev,
+                                             struct ptlrpc_request,
+                                             rq_replay_list);
+                       /*
+                        * Requests with rq_replay stay on the list even if no
+                        * commit is expected.
+                        */
+                       if (last->rq_transno > imp->imp_peer_committed_transno)
+                               ptlrpc_pinger_commit_expected(imp);
+               }
+
+               spin_unlock(&imp->imp_lock);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Helper function to send request \a req over the network for the first time.
+ * Also adjusts the request phase.
+ * Returns 0 on success or error code.
+ */
+static int ptlrpc_send_new_req(struct ptlrpc_request *req)
+{
+       struct obd_import     *imp = req->rq_import;
+       int rc;
+       ENTRY;
+
+       LASSERT(req->rq_phase == RQ_PHASE_NEW);
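+       /* If the request is scheduled to be sent later (rq_sent is in the
+        * future) and the import generation has not changed, do not send
+        * it yet. */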
+       if (req->rq_sent && (req->rq_sent > cfs_time_current_sec()) &&
+           (!req->rq_generation_set ||
+            req->rq_import_generation == imp->imp_generation))
+               RETURN (0);
+
+       ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
+
+       spin_lock(&imp->imp_lock);
+
+       if (!req->rq_generation_set)
+               req->rq_import_generation = imp->imp_generation;
+
+       if (ptlrpc_import_delay_req(imp, req, &rc)) {
+               spin_lock(&req->rq_lock);
+               req->rq_waiting = 1;
+               spin_unlock(&req->rq_lock);
+
+               DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: "
+                         "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg),
+                         ptlrpc_import_state_name(req->rq_send_state),
+                         ptlrpc_import_state_name(imp->imp_state));
+               LASSERT(list_empty(&req->rq_list));
+               list_add_tail(&req->rq_list, &imp->imp_delayed_list);
+               atomic_inc(&req->rq_import->imp_inflight);
+               spin_unlock(&imp->imp_lock);
+               RETURN(0);
+       }
+
+       if (rc != 0) {
+               spin_unlock(&imp->imp_lock);
+               req->rq_status = rc;
+               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+               RETURN(rc);
+       }
+
+       LASSERT(list_empty(&req->rq_list));
+       list_add_tail(&req->rq_list, &imp->imp_sending_list);
+       atomic_inc(&req->rq_import->imp_inflight);
+       spin_unlock(&imp->imp_lock);
+
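+       /* Record the sending process's PID in the request for
+        * distributed debugging. */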
+       lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+       rc = sptlrpc_req_refresh_ctx(req, -1);
+       if (rc) {
+               if (req->rq_err) {
+                       req->rq_status = rc;
+                       RETURN(1);
+               } else {
+                       req->rq_wait_ctx = 1;
+                       RETURN(0);
+               }
+       }
+
+       CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc"
+              " %s:%s:%d:"LPU64":%s:%d\n", current_comm(),
+              imp->imp_obd->obd_uuid.uuid,
+              lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+              libcfs_nid2str(imp->imp_connection->c_peer.nid),
+              lustre_msg_get_opc(req->rq_reqmsg));
+
+       rc = ptl_send_rpc(req, 0);
+       if (rc) {
+               DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
+               req->rq_net_err = 1;
+               RETURN(rc);
+       }
+       RETURN(0);
+}
+
+static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
+{
+       int remaining, rc;
+       ENTRY;
+
+       LASSERT(set->set_producer != NULL);
+
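+       /* Remember how many RPCs are already outstanding so the caller can
+        * be told how many new ones were produced. */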
+       remaining = atomic_read(&set->set_remaining);
+
+       /* populate the ->set_requests list with requests until we
+        * reach the maximum number of RPCs in flight for this set */
+       while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
+               rc = set->set_producer(set, set->set_producer_arg);
+               if (rc == -ENOENT) {
+                       /* no more RPC to produce */
+                       set->set_producer     = NULL;
+                       set->set_producer_arg = NULL;
+                       RETURN(0);
+               }
+       }
+
+       RETURN((atomic_read(&set->set_remaining) - remaining));
+}
+
+/**
+ * This sends any unsent RPCs in \a set and returns 1 if all are sent
+ * and no more replies are expected.
+ * (It is possible to get fewer replies than requests sent, e.g. due to
+ * timed-out requests or requests that we had trouble sending out.)
+ */
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp, *next;
+       int force_timer_recalc = 0;
+       ENTRY;
+
+       if (atomic_read(&set->set_remaining) == 0)
+               RETURN(1);
+
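+       /* Walk every request in the set and advance it through the RPC
+        * state machine as far as possible. */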
+       list_for_each_safe(tmp, next, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+               struct obd_import *imp = req->rq_import;
+               int unregistered = 0;
+               int rc = 0;
+
+               if (req->rq_phase == RQ_PHASE_NEW &&
+                   ptlrpc_send_new_req(req)) {
+                       force_timer_recalc = 1;
+               }
+
+               /* delayed send - skip */
+               if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
+                       continue;
+
+               /* delayed resend - skip */
+               if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
+                   req->rq_sent > cfs_time_current_sec())
+                       continue;
+
+               if (!(req->rq_phase == RQ_PHASE_RPC ||
+                     req->rq_phase == RQ_PHASE_BULK ||
+                     req->rq_phase == RQ_PHASE_INTERPRET ||
+                     req->rq_phase == RQ_PHASE_UNREGISTERING ||
+                     req->rq_phase == RQ_PHASE_COMPLETE)) {
+                       DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
+                       LBUG();
+               }
+
+               if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+                       LASSERT(req->rq_next_phase != req->rq_phase);
+                       LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
+
+                       /*
+                        * Skip processing until reply is unlinked. We
+                        * can't return to pool before that and we can't
+                        * call interpret before that. We need to make
+                        * sure that all rdma transfers finished and will
+                        * not corrupt any data.
+                        */
+                       if (ptlrpc_client_recv_or_unlink(req) ||
+                           ptlrpc_client_bulk_active(req))
+                               continue;
+
+                       /*
+                        * Turn fail_loc off to prevent it from looping
+                        * forever.
+                        */
+                       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+                               OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
+                                                    OBD_FAIL_ONCE);
+                       }
+                       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
+                               OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
+                                                    OBD_FAIL_ONCE);
+                       }
+
+                       /*
+                        * Move to next phase if reply was successfully
+                        * unlinked.
+                        */
+                       ptlrpc_rqphase_move(req, req->rq_next_phase);
+               }
+
+               if (req->rq_phase == RQ_PHASE_COMPLETE)
+                       continue;
+
+               if (req->rq_phase == RQ_PHASE_INTERPRET)
+                       GOTO(interpret, req->rq_status);
+
+               /*
+                * Note that this also will start async reply unlink.
+                */
+               if (req->rq_net_err && !req->rq_timedout) {
+                       ptlrpc_expire_one_request(req, 1);
+
+                       /*
+                        * Check if we still need to wait for unlink.
+                        */
+                       if (ptlrpc_client_recv_or_unlink(req) ||
+                           ptlrpc_client_bulk_active(req))
+                               continue;
+                       /* If there is no need to resend, fail it now. */
+                       if (req->rq_no_resend) {
+                               if (req->rq_status == 0)
+                                       req->rq_status = -EIO;
+                               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+                               GOTO(interpret, req->rq_status);
+                       } else {
+                               continue;
+                       }
+               }
+
+               if (req->rq_err) {
+                       spin_lock(&req->rq_lock);
+                       req->rq_replied = 0;
+                       spin_unlock(&req->rq_lock);
+                       if (req->rq_status == 0)
+                               req->rq_status = -EIO;
+                       ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+                       GOTO(interpret, req->rq_status);
+               }
+
+               /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
+                * so it sets rq_intr regardless of individual rpc
+                * timeouts. The synchronous IO waiting path sets
+                * rq_intr irrespective of whether ptlrpcd
+                * has seen a timeout.  Our policy is to only interpret
+                * interrupted rpcs after they have timed out, so we
+                * need to enforce that here.
+                */
+
+               if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
+                                    req->rq_wait_ctx)) {
+                       req->rq_status = -EINTR;
+                       ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+                       GOTO(interpret, req->rq_status);
+               }
+
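+               /* The request is in the RPC phase; first handle (re)sending
+                * if it timed out, is flagged for resend, was waiting for
+                * recovery, or was waiting for a security context. */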
+               if (req->rq_phase == RQ_PHASE_RPC) {
+                       if (req->rq_timedout || req->rq_resend ||
+                           req->rq_waiting || req->rq_wait_ctx) {
+                               int status;
+
+                               if (!ptlrpc_unregister_reply(req, 1))
+                                       continue;
+
+                               spin_lock(&imp->imp_lock);
+                               if (ptlrpc_import_delay_req(imp, req, &status)){
+                                       /* Put on the delayed list; send
+                                        * once recovery has finished. */
+                                       list_del_init(&req->rq_list);
+                                       list_add_tail(&req->rq_list,
+                                                         &imp->
+                                                         imp_delayed_list);
+                                       spin_unlock(&imp->imp_lock);
+                                       continue;
+                               }
+
+                               if (status != 0)  {
+                                       req->rq_status = status;
+                                       ptlrpc_rqphase_move(req,
+                                               RQ_PHASE_INTERPRET);
+                                       spin_unlock(&imp->imp_lock);
+                                       GOTO(interpret, req->rq_status);
+                               }
+                               if (ptlrpc_no_resend(req) &&
+                                   !req->rq_wait_ctx) {
+                                       req->rq_status = -ENOTCONN;
+                                       ptlrpc_rqphase_move(req,
+                                                           RQ_PHASE_INTERPRET);
+                                       spin_unlock(&imp->imp_lock);
+                                       GOTO(interpret, req->rq_status);
+                               }
+
+                               list_del_init(&req->rq_list);
+                               list_add_tail(&req->rq_list,
+                                                 &imp->imp_sending_list);
+
+                               spin_unlock(&imp->imp_lock);
+
+                               spin_lock(&req->rq_lock);
+                               req->rq_waiting = 0;
+                               spin_unlock(&req->rq_lock);
+
+                               if (req->rq_timedout || req->rq_resend) {
+                                       /* This is being re-sent anyway,
+                                        * so mark the req for resend. */
+                                       spin_lock(&req->rq_lock);
+                                       req->rq_resend = 1;
+                                       spin_unlock(&req->rq_lock);
+                                       if (req->rq_bulk) {
+                                               __u64 old_xid;
+
+                                               if (!ptlrpc_unregister_bulk(req, 1))
+                                                       continue;
+
+                                               /* ensure previous bulk fails */
+                                               old_xid = req->rq_xid;
+                                               req->rq_xid = ptlrpc_next_xid();
+                                               CDEBUG(D_HA, "resend bulk "
+                                                      "old x"LPU64
+                                                      " new x"LPU64"\n",
+                                                      old_xid, req->rq_xid);
+                                       }
+                               }
+                               /*
+                                * rq_wait_ctx is only touched by ptlrpcd,
+                                * so no lock is needed here.
+                                */
+                               status = sptlrpc_req_refresh_ctx(req, -1);
+                               if (status) {
+                                       if (req->rq_err) {
+                                               req->rq_status = status;
+                                               spin_lock(&req->rq_lock);
+                                               req->rq_wait_ctx = 0;
+                                               spin_unlock(&req->rq_lock);
+                                               force_timer_recalc = 1;
+                                       } else {
+                                               spin_lock(&req->rq_lock);
+                                               req->rq_wait_ctx = 1;
+                                               spin_unlock(&req->rq_lock);
+                                       }
+
+                                       continue;
+                               } else {
+                                       spin_lock(&req->rq_lock);
+                                       req->rq_wait_ctx = 0;
+                                       spin_unlock(&req->rq_lock);
+                               }
+
+                               rc = ptl_send_rpc(req, 0);
+                               if (rc) {
+                                       DEBUG_REQ(D_HA, req,
+                                                 "send failed: rc = %d", rc);
+                                       force_timer_recalc = 1;
+                                       spin_lock(&req->rq_lock);
+                                       req->rq_net_err = 1;
+                                       spin_unlock(&req->rq_lock);
+                               }
+                               /* need to reset the timeout */
+                               force_timer_recalc = 1;
+                       }
+
+                       spin_lock(&req->rq_lock);
+
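+                       /* An early reply only extends the deadline; process
+                        * it and keep waiting for the real reply. */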
+                       if (ptlrpc_client_early(req)) {
+                               ptlrpc_at_recv_early_reply(req);
+                               spin_unlock(&req->rq_lock);
+                               continue;
+                       }
+
+                       /* Still waiting for a reply? */
+                       if (ptlrpc_client_recv(req)) {
+                               spin_unlock(&req->rq_lock);
+                               continue;
+                       }
+
+                       /* Did we actually receive a reply? */
+                       if (!ptlrpc_client_replied(req)) {
+                               spin_unlock(&req->rq_lock);
+                               continue;
+                       }
+
+                       spin_unlock(&req->rq_lock);
+
+                       /* Unlink from the net because we are going to
+                        * swab the reply buffer in place. */
+                       unregistered = ptlrpc_unregister_reply(req, 1);
+                       if (!unregistered)
+                               continue;
+
+                       req->rq_status = after_reply(req);
+                       if (req->rq_resend)
+                               continue;
+
+                       /* If there is no bulk associated with this request,
+                        * then we're done and should let the interpreter
+                        * process the reply. Similarly if the RPC returned
+                        * an error, and therefore the bulk will never arrive.
+                        */
+                       if (req->rq_bulk == NULL || req->rq_status < 0) {
+                               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+                               GOTO(interpret, req->rq_status);
+                       }
+
+                       ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
+               }
+
+               LASSERT(req->rq_phase == RQ_PHASE_BULK);
+               if (ptlrpc_client_bulk_active(req))
+                       continue;
+
+               if (req->rq_bulk->bd_failure) {
+                       /* The RPC reply arrived OK, but the bulk transfer
+                        * failed!  This is strange, since the server told us
+                        * the RPC was good after getting the REPLY for its
+                        * GET or the ACK for its PUT. */
+                       DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+                       req->rq_status = -EIO;
+               }
+
+               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+
+       interpret:
+               LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
+
+               /* This moves the request to the "unregistering" phase; we
+                * need to wait for the reply to be unlinked. */
+               if (!unregistered && !ptlrpc_unregister_reply(req, 1)) {
+                       /* start async bulk unlink too */
+                       ptlrpc_unregister_bulk(req, 1);
+                       continue;
+               }
+
+               if (!ptlrpc_unregister_bulk(req, 1))
+                       continue;
+
+               /* By the time interpret is called, receiving should already
+                * be finished. */
+               LASSERT(!req->rq_receiving_reply);
+
+               ptlrpc_req_interpret(env, req, req->rq_status);
+
+               ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
+
+               CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0,
+                       "Completed RPC pname:cluuid:pid:xid:nid:"
+                       "opc %s:%s:%d:"LPU64":%s:%d\n",
+                       current_comm(), imp->imp_obd->obd_uuid.uuid,
+                       lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+                       libcfs_nid2str(imp->imp_connection->c_peer.nid),
+                       lustre_msg_get_opc(req->rq_reqmsg));
+
+               spin_lock(&imp->imp_lock);
+               /* The request may no longer be on the sending or delayed
+                * list. This can happen when it was marked erroneous because
+                * ptlrpc_import_delay_req(req, status) found it impossible to
+                * allow sending this RPC and returned *status != 0. */
+               if (!list_empty(&req->rq_list)) {
+                       list_del_init(&req->rq_list);
+                       atomic_dec(&imp->imp_inflight);
+               }
+               spin_unlock(&imp->imp_lock);
+
+               atomic_dec(&set->set_remaining);
+               wake_up_all(&imp->imp_recovery_waitq);
+
+               if (set->set_producer) {
+                       /* produce a new request if possible */
+                       if (ptlrpc_set_producer(set) > 0)
+                               force_timer_recalc = 1;
+
+                       /* free the request that has just been completed
+                        * in order not to pollute set->set_requests */
+                       list_del_init(&req->rq_set_chain);
+                       spin_lock(&req->rq_lock);
+                       req->rq_set = NULL;
+                       req->rq_invalid_rqset = 0;
+                       spin_unlock(&req->rq_lock);
+
+                       /* record rq_status to compute the final status later */
+                       if (req->rq_status != 0)
+                               set->set_rc = req->rq_status;
+                       ptlrpc_req_finished(req);
+               }
+       }
+
+       /* If we hit an error, we want to recover promptly. */
+       RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc);
+}
+EXPORT_SYMBOL(ptlrpc_check_set);
+
+/**
+ * Time out request \a req. If \a async_unlink is set, do not wait
+ * until LNet actually confirms network buffer unlinking.
+ * Returns 1 if we should give up further retry attempts, 0 otherwise.
+ */
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
+{
+       struct obd_import *imp = req->rq_import;
+       int rc = 0;
+       ENTRY;
+
+       spin_lock(&req->rq_lock);
+       req->rq_timedout = 1;
+       spin_unlock(&req->rq_lock);
+
+       DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T
+                 "/real "CFS_DURATION_T"]",
+                 req->rq_net_err ? "failed due to network error" :
+                    ((req->rq_real_sent == 0 ||
+                      cfs_time_before(req->rq_real_sent, req->rq_sent) ||
+                      cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ?
+                     "timed out for sent delay" : "timed out for slow reply"),
+                 req->rq_sent, req->rq_real_sent);
+
+       if (imp != NULL && obd_debug_peer_on_timeout)
+               LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
+
+       ptlrpc_unregister_reply(req, async_unlink);
+       ptlrpc_unregister_bulk(req, async_unlink);
+
+       if (obd_dump_on_timeout)
+               libcfs_debug_dumplog();
+
+       if (imp == NULL) {
+               DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
+               RETURN(1);
+       }
+
+       atomic_inc(&imp->imp_timeouts);
+
+       /* The DLM server doesn't want recovery run on its imports. */
+       if (imp->imp_dlm_fake)
+               RETURN(1);
+
+       /* If this request is for recovery or other primordial tasks,
+        * then error it out here. */
+       if (req->rq_ctx_init || req->rq_ctx_fini ||
+           req->rq_send_state != LUSTRE_IMP_FULL ||
+           imp->imp_obd->obd_no_recov) {
+               DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
+                         ptlrpc_import_state_name(req->rq_send_state),
+                         ptlrpc_import_state_name(imp->imp_state));
+               spin_lock(&req->rq_lock);
+               req->rq_status = -ETIMEDOUT;
+               req->rq_err = 1;
+               spin_unlock(&req->rq_lock);
+               RETURN(1);
+       }
+
+       /* if a request can't be resent we can't wait for an answer after
+          the timeout */
+       if (ptlrpc_no_resend(req)) {
+               DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
+               rc = 1;
+       }
+
+       ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
+
+       RETURN(rc);
+}
+
+/**
+ * Time out all uncompleted requests in the request set pointed to by \a data.
+ * Callback used when waiting on sets with l_wait_event.
+ * Always returns 1.
+ */
+int ptlrpc_expired_set(void *data)
+{
+       struct ptlrpc_request_set *set = data;
+       struct list_head                *tmp;
+       time_t               now = cfs_time_current_sec();
+       ENTRY;
+
+       LASSERT(set != NULL);
+
+       /*
+        * A timeout expired. See which reqs it applies to...
+        */
+       list_for_each (tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               /* don't expire request waiting for context */
+               if (req->rq_wait_ctx)
+                       continue;
+
+               /* Request in-flight? */
+               if (!((req->rq_phase == RQ_PHASE_RPC &&
+                      !req->rq_waiting && !req->rq_resend) ||
+                     (req->rq_phase == RQ_PHASE_BULK)))
+                       continue;
+
+               if (req->rq_timedout ||     /* already dealt with */
+                   req->rq_deadline > now) /* not expired */
+                       continue;
+
+               /* Deal with this request. Do it asynchronously so as not to
+                * block the ptlrpcd thread. */
+               ptlrpc_expire_one_request(req, 1);
+       }
+
+       /*
+        * When waiting for a whole set, we always break out of the
+        * sleep so we can recalculate the timeout, or enable interrupts
+        * if everyone's timed out.
+        */
+       RETURN(1);
+}
+EXPORT_SYMBOL(ptlrpc_expired_set);
+
+/**
+ * Sets rq_intr flag in \a req under spinlock.
+ */
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
+{
+       spin_lock(&req->rq_lock);
+       req->rq_intr = 1;
+       spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_mark_interrupted);
+
+/**
+ * Interrupts (sets interrupted flag) all uncompleted requests in
+ * a set \a data. Callback for l_wait_event for interruptible waits.
+ */
+void ptlrpc_interrupted_set(void *data)
+{
+       struct ptlrpc_request_set *set = data;
+       struct list_head *tmp;
+
+       LASSERT(set != NULL);
+       CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);
+
+       list_for_each(tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               if (req->rq_phase != RQ_PHASE_RPC &&
+                   req->rq_phase != RQ_PHASE_UNREGISTERING)
+                       continue;
+
+               ptlrpc_mark_interrupted(req);
+       }
+}
+EXPORT_SYMBOL(ptlrpc_interrupted_set);
+
+/**
+ * Get the smallest timeout in the set; this does NOT set a timeout.
+ */
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
+{
+       struct list_head            *tmp;
+       time_t           now = cfs_time_current_sec();
+       int                 timeout = 0;
+       struct ptlrpc_request *req;
+       int                 deadline;
+       ENTRY;
+
+       SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
+
+       list_for_each(tmp, &set->set_requests) {
+               req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+               /*
+                * Request in-flight?
+                */
+               if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+                     (req->rq_phase == RQ_PHASE_BULK) ||
+                     (req->rq_phase == RQ_PHASE_NEW)))
+                       continue;
+
+               /*
+                * Already timed out.
+                */
+               if (req->rq_timedout)
+                       continue;
+
+               /*
+                * Waiting for ctx.
+                */
+               if (req->rq_wait_ctx)
+                       continue;
+
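+               /* For requests not yet sent or scheduled for resend, rq_sent
+                * holds the earliest time they may go out; otherwise the
+                * deadline is the send time plus the request timeout. */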
+               if (req->rq_phase == RQ_PHASE_NEW)
+                       deadline = req->rq_sent;
+               else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
+                       deadline = req->rq_sent;
+               else
+                       deadline = req->rq_sent + req->rq_timeout;
+
+               if (deadline <= now)    /* actually expired already */
+                       timeout = 1;    /* ASAP */
+               else if (timeout == 0 || timeout > deadline - now)
+                       timeout = deadline - now;
+       }
+       RETURN(timeout);
+}
+EXPORT_SYMBOL(ptlrpc_set_next_timeout);
+
+/**
+ * Send all unsent requests from the set and then wait until all
+ * requests in the set complete (either get a reply, time out, get an
+ * error, or are otherwise interrupted).
+ * Returns 0 on success or an error code otherwise.
+ */
+int ptlrpc_set_wait(struct ptlrpc_request_set *set)
+{
+       struct list_head            *tmp;
+       struct ptlrpc_request *req;
+       struct l_wait_info     lwi;
+       int                 rc, timeout;
+       ENTRY;
+
+       if (set->set_producer)
+               (void)ptlrpc_set_producer(set);
+       else
+               list_for_each(tmp, &set->set_requests) {
+                       req = list_entry(tmp, struct ptlrpc_request,
+                                            rq_set_chain);
+                       if (req->rq_phase == RQ_PHASE_NEW)
+                               (void)ptlrpc_send_new_req(req);
+               }
+
+       if (list_empty(&set->set_requests))
+               RETURN(0);
+
+       do {
+               timeout = ptlrpc_set_next_timeout(set);
+
+               /* wait until all complete, interrupted, or an in-flight
+                * req times out */
+               CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
+                      set, timeout);
+
+               if (timeout == 0 && !cfs_signal_pending())
+                       /*
+                        * No requests are in-flight (either timed out
+                        * or delayed), so we can allow interrupts.
+                        * We still want to block for a limited time,
+                        * so we allow interrupts during the timeout.
+                        */
+                       lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1),
+                                                  ptlrpc_expired_set,
+                                                  ptlrpc_interrupted_set, set);
+               else
+                       /*
+                        * At least one request is in flight, so no
+                        * interrupts are allowed. Wait until all
+                        * complete, or an in-flight req times out.
+                        */
+                       lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? timeout : 1),
+                                         ptlrpc_expired_set, set);
+
+               rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi);
+
+               /* LU-769 - if we ignored the signal because it was already
+                * pending when we started, we need to handle it now or we risk
+                * it being ignored forever */
+               if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr &&
+                   cfs_signal_pending()) {
+                       sigset_t blocked_sigs =
+                                          cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+                       /* In fact we only interrupt for the "fatal" signals
+                        * like SIGINT or SIGKILL. We still ignore less
+                        * important signals since the ptlrpc set is not
+                        * easily re-entered from userspace. */
+                       if (cfs_signal_pending())
+                               ptlrpc_interrupted_set(set);
+                       cfs_restore_sigs(blocked_sigs);
+               }
+
+               LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
+
+               /* -EINTR => all requests have been flagged rq_intr so next
+                * check completes.
+                * -ETIMEDOUT => someone timed out.  When all reqs have
+                * timed out, signals are enabled allowing completion with
+                * EINTR.
+                * I don't really care if we go once more round the loop in
+                * the error cases -eeb. */
+               if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
+                       list_for_each(tmp, &set->set_requests) {
+                               req = list_entry(tmp, struct ptlrpc_request,
+                                                    rq_set_chain);
+                               spin_lock(&req->rq_lock);
+                               req->rq_invalid_rqset = 1;
+                               spin_unlock(&req->rq_lock);
+                       }
+               }
+       } while (rc != 0 || atomic_read(&set->set_remaining) != 0);
+
+       LASSERT(atomic_read(&set->set_remaining) == 0);
+
+       rc = set->set_rc; /* rq_status of already freed requests if any */
+       list_for_each(tmp, &set->set_requests) {
+               req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+               LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
+               if (req->rq_status != 0)
+                       rc = req->rq_status;
+       }
+
+       if (set->set_interpret != NULL) {
+               int (*interpreter)(struct ptlrpc_request_set *set,void *,int) =
+                       set->set_interpret;
+               rc = interpreter (set, set->set_arg, rc);
+       } else {
+               struct ptlrpc_set_cbdata *cbdata, *n;
+               int err;
+
+               list_for_each_entry_safe(cbdata, n,
+                                        &set->set_cblist, psc_item) {
+                       list_del_init(&cbdata->psc_item);
+                       err = cbdata->psc_interpret(set, cbdata->psc_data, rc);
+                       if (err && !rc)
+                               rc = err;
+                       OBD_FREE_PTR(cbdata);
+               }
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_set_wait);
+
+/**
+ * Helper function for request freeing.
+ * Called when the request reference count reaches zero and the request
+ * needs to be freed.
+ * Removes the request from any sending/replay lists it might be on and
+ * frees network buffers if any are present.
+ * If \a locked is set, the caller already holds the import imp_lock
+ * and we do not need to reacquire it (for certain list manipulations).
+ */
+static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
+{
+       ENTRY;
+       if (request == NULL) {
+               EXIT;
+               return;
+       }
+
+       LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
+       LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */
+       LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
+       LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+       LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
+       LASSERTF(!request->rq_replay, "req %p\n", request);
+
+       req_capsule_fini(&request->rq_pill);
+
+       /* We must take it off the imp_replay_list first.  Otherwise, we'll set
+        * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
+       if (request->rq_import != NULL) {
+               if (!locked)
+                       spin_lock(&request->rq_import->imp_lock);
+               list_del_init(&request->rq_replay_list);
+               if (!locked)
+                       spin_unlock(&request->rq_import->imp_lock);
+       }
+       LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);
+
+       if (atomic_read(&request->rq_refcount) != 0) {
+               DEBUG_REQ(D_ERROR, request,
+                         "freeing request with nonzero refcount");
+               LBUG();
+       }
+
+       if (request->rq_repbuf != NULL)
+               sptlrpc_cli_free_repbuf(request);
+       if (request->rq_export != NULL) {
+               class_export_put(request->rq_export);
+               request->rq_export = NULL;
+       }
+       if (request->rq_import != NULL) {
+               class_import_put(request->rq_import);
+               request->rq_import = NULL;
+       }
+       if (request->rq_bulk != NULL)
+               ptlrpc_free_bulk_pin(request->rq_bulk);
+
+       if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL)
+               sptlrpc_cli_free_reqbuf(request);
+
+       if (request->rq_cli_ctx)
+               sptlrpc_req_put_ctx(request, !locked);
+
+       if (request->rq_pool)
+               __ptlrpc_free_req_to_pool(request);
+       else
+               OBD_FREE(request, sizeof(*request));
+       EXIT;
+}
+
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
+/**
+ * Drop one request reference. Must be called with import imp_lock held.
+ * When the reference count drops to zero, the request is freed.
+ */
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
+{
+       LASSERT(spin_is_locked(&request->rq_import->imp_lock));
+       (void)__ptlrpc_req_finished(request, 1);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock);
+
+/**
+ * Helper function.
+ * Drops one reference count for request \a request.
+ * \a locked set indicates that the caller holds the import imp_lock.
+ * Frees the request when the reference count reaches zero.
+ */
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
+{
+       ENTRY;
+       if (request == NULL)
+               RETURN(1);
+
+       if (request == LP_POISON ||
+           request->rq_reqmsg == LP_POISON) {
+               CERROR("dereferencing freed request (bug 575)\n");
+               LBUG();
+               RETURN(1);
+       }
+
+       DEBUG_REQ(D_INFO, request, "refcount now %u",
+                 atomic_read(&request->rq_refcount) - 1);
+
+       if (atomic_dec_and_test(&request->rq_refcount)) {
+               __ptlrpc_free_req(request, locked);
+               RETURN(1);
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Drops one reference count for a request.
+ */
+void ptlrpc_req_finished(struct ptlrpc_request *request)
+{
+       __ptlrpc_req_finished(request, 0);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished);
+
+/**
+ * Returns the xid of \a request
+ */
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request)
+{
+       return request->rq_xid;
+}
+EXPORT_SYMBOL(ptlrpc_req_xid);
+
+/**
+ * Disengage the client's reply buffer from the network.
+ * NB does _NOT_ unregister any client-side bulk.
+ * IDEMPOTENT, but _not_ safe against concurrent callers.
+ * The request owner (i.e. the thread doing the I/O) must call...
+ * Returns 1 if the reply was fully unlinked, 0 if unlinking is still in
+ * progress (async unlink in flight).
+ */
+int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
+{
+       int             rc;
+       wait_queue_head_t       *wq;
+       struct l_wait_info lwi;
+
+       /*
+        * Might sleep.
+        */
+       LASSERT(!in_interrupt());
+
+       /*
+        * Let's setup deadline for reply unlink.
+        */
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           async && request->rq_reply_deadline == 0)
+               request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK;
+
+       /*
+        * Nothing left to do.
+        */
+       if (!ptlrpc_client_recv_or_unlink(request))
+               RETURN(1);
+
+       LNetMDUnlink(request->rq_reply_md_h);
+
+       /*
+        * Let's check it once again.
+        */
+       if (!ptlrpc_client_recv_or_unlink(request))
+               RETURN(1);
+
+       /*
+        * Move to "Unregistering" phase as reply was not unlinked yet.
+        */
+       ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);
+
+       /*
+        * Do not wait for unlink to finish.
+        */
+       if (async)
+               RETURN(0);
+
+       /*
+        * We have to l_wait_event() whatever the result, to give liblustre
+        * a chance to run reply_in_callback(), and to make sure we've
+        * unlinked before returning a req to the pool.
+        */
+       if (request->rq_set != NULL)
+               wq = &request->rq_set->set_waitq;
+       else
+               wq = &request->rq_reply_waitq;
+
+       for (;;) {
+               /* Network access will complete in finite time but the HUGE
+                * timeout lets us CWARN for visibility of sluggish NALs */
+               lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+                                          cfs_time_seconds(1), NULL, NULL);
+               rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
+                                 &lwi);
+               if (rc == 0) {
+                       ptlrpc_rqphase_move(request, request->rq_next_phase);
+                       RETURN(1);
+               }
+
+               LASSERT(rc == -ETIMEDOUT);
+               DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
+                         "rvcng=%d unlnk=%d", request->rq_receiving_reply,
+                         request->rq_must_unlink);
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_reply);
+
+/**
+ * Iterates through the replay_list on the import and prunes
+ * all requests that have a transno smaller than last_committed for the
+ * import and that do not have rq_replay set.
+ * Since requests are sorted in transno order, stops on meeting the first
+ * transno bigger than last_committed.
+ * The caller must hold imp->imp_lock.
+ */
+void ptlrpc_free_committed(struct obd_import *imp)
+{
+       struct list_head *tmp, *saved;
+       struct ptlrpc_request *req;
+       struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
+       ENTRY;
+
+       LASSERT(imp != NULL);
+
+       LASSERT(spin_is_locked(&imp->imp_lock));
+
+
+       if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
+           imp->imp_generation == imp->imp_last_generation_checked) {
+               CDEBUG(D_INFO, "%s: skip recheck: last_committed "LPU64"\n",
+                      imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+               EXIT;
+               return;
+       }
+       CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n",
+              imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
+              imp->imp_generation);
+       imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
+       imp->imp_last_generation_checked = imp->imp_generation;
+
+       list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
+               req = list_entry(tmp, struct ptlrpc_request,
+                                    rq_replay_list);
+
+               /* XXX ok to remove when 1357 resolved - rread 05/29/03  */
+               LASSERT(req != last_req);
+               last_req = req;
+
+               if (req->rq_transno == 0) {
+                       DEBUG_REQ(D_EMERG, req, "zero transno during replay");
+                       LBUG();
+               }
+               if (req->rq_import_generation < imp->imp_generation) {
+                       DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
+                       GOTO(free_req, 0);
+               }
+
+               if (req->rq_replay) {
+                       DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
+                       continue;
+               }
+
+               /* not yet committed */
+               if (req->rq_transno > imp->imp_peer_committed_transno) {
+                       DEBUG_REQ(D_RPCTRACE, req, "stopping search");
+                       break;
+               }
+
+               DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")",
+                         imp->imp_peer_committed_transno);
+free_req:
+               spin_lock(&req->rq_lock);
+               req->rq_replay = 0;
+               spin_unlock(&req->rq_lock);
+               if (req->rq_commit_cb != NULL)
+                       req->rq_commit_cb(req);
+               list_del_init(&req->rq_replay_list);
+               __ptlrpc_req_finished(req, 1);
+       }
+
+       EXIT;
+       return;
+}
+
+void ptlrpc_cleanup_client(struct obd_import *imp)
+{
+       ENTRY;
+       EXIT;
+       return;
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_client);
+
+/**
+ * Schedule previously sent request for resend.
+ * For bulk requests we assign a new xid (to avoid problems with
+ * lost replies and therefore several transfers landing in the same buffer
+ * from different sending attempts).
+ */
+void ptlrpc_resend_req(struct ptlrpc_request *req)
+{
+       DEBUG_REQ(D_HA, req, "going to resend");
+       lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 });
+       req->rq_status = -EAGAIN;
+
+       spin_lock(&req->rq_lock);
+       req->rq_resend = 1;
+       req->rq_net_err = 0;
+       req->rq_timedout = 0;
+       if (req->rq_bulk) {
+               __u64 old_xid = req->rq_xid;
+
+               /* ensure previous bulk fails */
+               req->rq_xid = ptlrpc_next_xid();
+               CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n",
+                      old_xid, req->rq_xid);
+       }
+       ptlrpc_client_wake_req(req);
+       spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_resend_req);
+
+/* XXX: this function and rq_status are currently unused */
+void ptlrpc_restart_req(struct ptlrpc_request *req)
+{
+       DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
+       req->rq_status = -ERESTARTSYS;
+
+       spin_lock(&req->rq_lock);
+       req->rq_restart = 1;
+       req->rq_timedout = 0;
+       ptlrpc_client_wake_req(req);
+       spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_restart_req);
+
+/**
+ * Grab an additional reference on request \a req.
+ */
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
+{
+       ENTRY;
+       atomic_inc(&req->rq_refcount);
+       RETURN(req);
+}
+EXPORT_SYMBOL(ptlrpc_request_addref);
+
+/**
+ * Add a request to import replay_list.
+ * Must be called under imp_lock
+ */
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+                                     struct obd_import *imp)
+{
+       struct list_head *tmp;
+
+       LASSERT(spin_is_locked(&imp->imp_lock));
+
+       if (req->rq_transno == 0) {
+               DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
+               LBUG();
+       }
+
+       /* clear this for new requests that were resent as well
+          as resent replayed requests. */
+       lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
+
+       /* don't re-add requests that have been replayed */
+       if (!list_empty(&req->rq_replay_list))
+               return;
+
+       lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
+
+       LASSERT(imp->imp_replayable);
+       /* Balanced in ptlrpc_free_committed, usually. */
+       ptlrpc_request_addref(req);
+       list_for_each_prev(tmp, &imp->imp_replay_list) {
+               struct ptlrpc_request *iter =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_replay_list);
+
+               /* We may have duplicate transnos if we create and then
+                * open a file, or for closes retained to match creating
+                * opens, so use req->rq_xid as a secondary key.
+                * (See bugs 684, 685, and 428.)
+                * XXX no longer needed, but all opens need transnos!
+                */
+               if (iter->rq_transno > req->rq_transno)
+                       continue;
+
+               if (iter->rq_transno == req->rq_transno) {
+                       LASSERT(iter->rq_xid != req->rq_xid);
+                       if (iter->rq_xid > req->rq_xid)
+                               continue;
+               }
+
+               list_add(&req->rq_replay_list, &iter->rq_replay_list);
+               return;
+       }
+
+       list_add(&req->rq_replay_list, &imp->imp_replay_list);
+}
+EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
+
+/**
+ * Send request and wait until it completes.
+ * Returns request processing status.
+ */
+int ptlrpc_queue_wait(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request_set *set;
+       int rc;
+       ENTRY;
+
+       LASSERT(req->rq_set == NULL);
+       LASSERT(!req->rq_receiving_reply);
+
+       set = ptlrpc_prep_set();
+       if (set == NULL) {
+               CERROR("Unable to allocate ptlrpc set.\n");
+               RETURN(-ENOMEM);
+       }
+
+       /* for distributed debugging */
+       lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+       /* add a ref for the set (see comment in ptlrpc_set_add_req) */
+       ptlrpc_request_addref(req);
+       ptlrpc_set_add_req(set, req);
+       rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_queue_wait);
+
+struct ptlrpc_replay_async_args {
+       int praa_old_state;
+       int praa_old_status;
+};
+
+/**
+ * Callback used for processing replies of replayed requests.
+ * In case of a successful reply, calls the registered request replay callback.
+ * In case of error, restarts the replay process.
+ */
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+                                  struct ptlrpc_request *req,
+                                  void * data, int rc)
+{
+       struct ptlrpc_replay_async_args *aa = data;
+       struct obd_import *imp = req->rq_import;
+
+       ENTRY;
+       atomic_dec(&imp->imp_replay_inflight);
+
+       if (!ptlrpc_client_replied(req)) {
+               CERROR("request replay timed out, restarting recovery\n");
+               GOTO(out, rc = -ETIMEDOUT);
+       }
+
+       if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
+           (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
+            lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
+               GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
+
+       /** VBR: check version failure */
+       if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
+               /** replay failed due to a version mismatch */
+               DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
+               spin_lock(&imp->imp_lock);
+               imp->imp_vbr_failed = 1;
+               imp->imp_no_lock_replay = 1;
+               spin_unlock(&imp->imp_lock);
+               lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+       } else {
+               /** The transno had better not change over replay. */
+               LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
+                        lustre_msg_get_transno(req->rq_repmsg) ||
+                        lustre_msg_get_transno(req->rq_repmsg) == 0,
+                        LPX64"/"LPX64"\n",
+                        lustre_msg_get_transno(req->rq_reqmsg),
+                        lustre_msg_get_transno(req->rq_repmsg));
+       }
+
+       spin_lock(&imp->imp_lock);
+       /** if replaying by version, a gap occurred on the server; do not trust locks */
+       if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY)
+               imp->imp_no_lock_replay = 1;
+       imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
+       spin_unlock(&imp->imp_lock);
+       LASSERT(imp->imp_last_replay_transno);
+
+       /* transaction number shouldn't be bigger than the latest replayed */
+       if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
+               DEBUG_REQ(D_ERROR, req,
+                         "Reported transno "LPU64" is bigger than the "
+                         "replayed one: "LPU64, req->rq_transno,
+                         lustre_msg_get_transno(req->rq_reqmsg));
+               GOTO(out, rc = -EINVAL);
+       }
+
+       DEBUG_REQ(D_HA, req, "got rep");
+
+       /* let the callback do fixups, possibly including in the request */
+       if (req->rq_replay_cb)
+               req->rq_replay_cb(req);
+
+       if (ptlrpc_client_replied(req) &&
+           lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
+               DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
+                         lustre_msg_get_status(req->rq_repmsg),
+                         aa->praa_old_status);
+       } else {
+               /* Put it back for re-replay. */
+               lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+       }
+
+       /*
+        * Errors during replay can set transno to 0, but
+        * imp_last_replay_transno should not be set to 0 in any case
+        */
+       if (req->rq_transno == 0)
+               CERROR("Transno is 0 during replay!\n");
+
+       /* continue with recovery */
+       rc = ptlrpc_import_recovery_state_machine(imp);
+ out:
+       req->rq_send_state = aa->praa_old_state;
+
+       if (rc != 0)
+               /* this replay failed, so restart recovery */
+               ptlrpc_connect_import(imp);
+
+       RETURN(rc);
+}
+
+/**
+ * Prepares and queues request for replay.
+ * Adds it to ptlrpcd queue for actual sending.
+ * Returns 0 on success.
+ */
+int ptlrpc_replay_req(struct ptlrpc_request *req)
+{
+       struct ptlrpc_replay_async_args *aa;
+       ENTRY;
+
+       LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
+
+       LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       memset(aa, 0, sizeof(*aa));
+
+       /* Prepare request to be resent with ptlrpcd */
+       aa->praa_old_state = req->rq_send_state;
+       req->rq_send_state = LUSTRE_IMP_REPLAY;
+       req->rq_phase = RQ_PHASE_NEW;
+       req->rq_next_phase = RQ_PHASE_UNDEFINED;
+       if (req->rq_repmsg)
+               aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
+       req->rq_status = 0;
+       req->rq_interpret_reply = ptlrpc_replay_interpret;
+       /* Readjust the timeout for current conditions */
+       ptlrpc_at_set_req_timeout(req);
+
+       /* Tell server the net_latency, so the server can calculate how long
+        * it should wait for next replay */
+       lustre_msg_set_service_time(req->rq_reqmsg,
+                                   ptlrpc_at_get_net_latency(req));
+       DEBUG_REQ(D_HA, req, "REPLAY");
+
+       atomic_inc(&req->rq_import->imp_replay_inflight);
+       ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
+
+       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_replay_req);
+
+/**
+ * Aborts all in-flight requests on the sending and delayed lists of import \a imp
+ */
+void ptlrpc_abort_inflight(struct obd_import *imp)
+{
+       struct list_head *tmp, *n;
+       ENTRY;
+
+       /* Make sure that no new requests get processed for this import.
+        * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
+        * this flag and then putting requests on sending_list or delayed_list.
+        */
+       spin_lock(&imp->imp_lock);
+
+       /* XXX locking?  Maybe we should remove each request with the list
+        * locked?  Also, how do we know if the requests on the list are
+        * being freed at this time?
+        */
+       list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request, rq_list);
+
+               DEBUG_REQ(D_RPCTRACE, req, "inflight");
+
+               spin_lock(&req->rq_lock);
+               if (req->rq_import_generation < imp->imp_generation) {
+                       req->rq_err = 1;
+                       req->rq_status = -EIO;
+                       ptlrpc_client_wake_req(req);
+               }
+               spin_unlock(&req->rq_lock);
+       }
+
+       list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request, rq_list);
+
+               DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
+
+               spin_lock(&req->rq_lock);
+               if (req->rq_import_generation < imp->imp_generation) {
+                       req->rq_err = 1;
+                       req->rq_status = -EIO;
+                       ptlrpc_client_wake_req(req);
+               }
+               spin_unlock(&req->rq_lock);
+       }
+
+       /* Last chance to free reqs left on the replay list, but we
+        * will still leak reqs that haven't committed.  */
+       if (imp->imp_replayable)
+               ptlrpc_free_committed(imp);
+
+       spin_unlock(&imp->imp_lock);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_abort_inflight);
+
+/**
+ * Abort all uncompleted requests in request set \a set
+ */
+void ptlrpc_abort_set(struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp, *pos;
+
+       LASSERT(set != NULL);
+
+       list_for_each_safe(pos, tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(pos, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               spin_lock(&req->rq_lock);
+               if (req->rq_phase != RQ_PHASE_RPC) {
+                       spin_unlock(&req->rq_lock);
+                       continue;
+               }
+
+               req->rq_err = 1;
+               req->rq_status = -EINTR;
+               ptlrpc_client_wake_req(req);
+               spin_unlock(&req->rq_lock);
+       }
+}
+
+static __u64 ptlrpc_last_xid;
+static spinlock_t ptlrpc_last_xid_lock;
+
+/**
+ * Initialize the XID for the node.  This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing.  It does not need to be sequential.  Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would deliver old data into the wrong RDMA buffer) initialize
+ * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
+ * If the time is clearly incorrect, we instead use a 62-bit random number.
+ * In the worst case the random number will overflow 1M RPCs per second in
+ * 9133 years, or permutations thereof.
+ */
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+       time_t now = cfs_time_current_sec();
+
+       spin_lock_init(&ptlrpc_last_xid_lock);
+       if (now < YEAR_2004) {
+               cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+               ptlrpc_last_xid >>= 2;
+               ptlrpc_last_xid |= (1ULL << 61);
+       } else {
+               ptlrpc_last_xid = (__u64)now << 20;
+       }
+
+       /* Need to always be aligned to a power-of-two for multi-bulk BRW */
+       CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
+       ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
+}
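+
+/*
+ * Worked example (illustrative numbers only): for now = 1,367,000,000
+ * (roughly May 2013) the initial XID is 1367000000 << 20, about 1.43e15.
+ * The shift by 20 reserves 2^20 (~1M) XID values per wall-clock second,
+ * which is where the "maximum RPC rate of 1M RPC/s" assumption in the
+ * comment above comes from.
+ */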
+
+/**
+ * Increases the xid and returns the resulting new value to the caller.
+ *
+ * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
+ * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
+ * itself uses the last bulk xid needed, so the server can determine
+ * the number of bulk transfers from the RPC XID and a bitmask.  The starting
+ * xid must align to a power-of-two value.
+ *
+ * This is assumed to be true due to the initial ptlrpc_last_xid
+ * value also being initialized to a power-of-two value. LU-1431
+ */
+__u64 ptlrpc_next_xid(void)
+{
+       __u64 next;
+
+       spin_lock(&ptlrpc_last_xid_lock);
+       next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+       ptlrpc_last_xid = next;
+       spin_unlock(&ptlrpc_last_xid_lock);
+
+       return next;
+}
+EXPORT_SYMBOL(ptlrpc_next_xid);
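+
+/*
+ * Example with illustrative values: if PTLRPC_BULK_OPS_COUNT were 4 and
+ * ptlrpc_next_xid() returned 0x100, a multi-bulk BRW starting at that xid
+ * could use match bits 0x100..0x103 for its bulk transfers, and the next
+ * call would return 0x104.  Per the comment above, the starting xid stays
+ * aligned because the seed is masked with PTLRPC_BULK_OPS_MASK in
+ * ptlrpc_init_xid() and each call advances it by exactly
+ * PTLRPC_BULK_OPS_COUNT.
+ */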
+
+/**
+ * Get a glimpse at what next xid value might have been.
+ * Returns possible next xid.
+ */
+__u64 ptlrpc_sample_next_xid(void)
+{
+#if BITS_PER_LONG == 32
+       /* need to avoid possible word tearing on 32-bit systems */
+       __u64 next;
+
+       spin_lock(&ptlrpc_last_xid_lock);
+       next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+       spin_unlock(&ptlrpc_last_xid_lock);
+
+       return next;
+#else
+       /* No need to lock, since returned value is racy anyways */
+       return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+#endif
+}
+EXPORT_SYMBOL(ptlrpc_sample_next_xid);
+
+/**
+ * Functions for operating ptlrpc workers.
+ *
+ * A ptlrpc work is a function which will be run inside a ptlrpc context.
+ * The callback shouldn't sleep, otherwise it will block that ptlrpcd thread.
+ *
+ * 1. after a work is created, it can be used many times, that is:
+ *      handler = ptlrpcd_alloc_work();
+ *      ptlrpcd_queue_work();
+ *
+ *    queue it again when necessary:
+ *      ptlrpcd_queue_work();
+ *      ptlrpcd_destroy_work();
+ * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently,
+ *    but the work will only be queued once at any given time. Also, as its
+ *    name implies, there may be a delay before it is actually run by a
+ *    ptlrpcd thread.
+ */
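+
+/*
+ * A fuller illustration of the sequence above (hypothetical caller code;
+ * my_cb and my_cbdata are placeholder names):
+ *
+ *      void *work = ptlrpcd_alloc_work(imp, my_cb, my_cbdata);
+ *      if (!IS_ERR(work)) {
+ *              ptlrpcd_queue_work(work);       - queue for execution
+ *              ...                             - re-queue later if needed
+ *              ptlrpcd_destroy_work(work);     - drop the final reference
+ *      }
+ */
+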
+struct ptlrpc_work_async_args {
+       __u64   magic;
+       int   (*cb)(const struct lu_env *, void *);
+       void   *cbdata;
+};
+
+#define PTLRPC_WORK_MAGIC 0x6655436b676f4f44ULL /* magic code */
+
+static int work_interpreter(const struct lu_env *env,
+                           struct ptlrpc_request *req, void *data, int rc)
+{
+       struct ptlrpc_work_async_args *arg = data;
+
+       LASSERT(arg->magic == PTLRPC_WORK_MAGIC);
+       LASSERT(arg->cb != NULL);
+
+       return arg->cb(env, arg->cbdata);
+}
+
+/**
+ * Create a work for ptlrpc.
+ */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+                        int (*cb)(const struct lu_env *, void *), void *cbdata)
+{
+       struct ptlrpc_request    *req = NULL;
+       struct ptlrpc_work_async_args *args;
+       ENTRY;
+
+       might_sleep();
+
+       if (cb == NULL)
+               RETURN(ERR_PTR(-EINVAL));
+
+       /* copy some code from deprecated fakereq. */
+       OBD_ALLOC_PTR(req);
+       if (req == NULL) {
+               CERROR("ptlrpc: run out of memory!\n");
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       req->rq_send_state = LUSTRE_IMP_FULL;
+       req->rq_type = PTL_RPC_MSG_REQUEST;
+       req->rq_import = class_import_get(imp);
+       req->rq_export = NULL;
+       req->rq_interpret_reply = work_interpreter;
+       /* don't want reply */
+       req->rq_receiving_reply = 0;
+       req->rq_must_unlink = 0;
+       req->rq_no_delay = req->rq_no_resend = 1;
+
+       spin_lock_init(&req->rq_lock);
+       INIT_LIST_HEAD(&req->rq_list);
+       INIT_LIST_HEAD(&req->rq_replay_list);
+       INIT_LIST_HEAD(&req->rq_set_chain);
+       INIT_LIST_HEAD(&req->rq_history_list);
+       INIT_LIST_HEAD(&req->rq_exp_list);
+       init_waitqueue_head(&req->rq_reply_waitq);
+       init_waitqueue_head(&req->rq_set_waitq);
+       atomic_set(&req->rq_refcount, 1);
+
+       CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
+       args = ptlrpc_req_async_args(req);
+       args->magic  = PTLRPC_WORK_MAGIC;
+       args->cb     = cb;
+       args->cbdata = cbdata;
+
+       RETURN(req);
+}
+EXPORT_SYMBOL(ptlrpcd_alloc_work);
+
+void ptlrpcd_destroy_work(void *handler)
+{
+       struct ptlrpc_request *req = handler;
+
+       if (req)
+               ptlrpc_req_finished(req);
+}
+EXPORT_SYMBOL(ptlrpcd_destroy_work);
+
+int ptlrpcd_queue_work(void *handler)
+{
+       struct ptlrpc_request *req = handler;
+
+       /*
+        * Check if the req is already being queued.
+        *
+        * Here comes a trick: ptlrpc lacks a reliable way of checking whether
+        * a req is being processed, so the refcount of the req is used for
+        * this purpose instead. This is okay because the caller should use
+        * this req as opaque data. - Jinshan
+        */
+       LASSERT(atomic_read(&req->rq_refcount) > 0);
+       if (atomic_read(&req->rq_refcount) > 1)
+               return -EBUSY;
+
+       if (atomic_inc_return(&req->rq_refcount) > 2) { /* race */
+               atomic_dec(&req->rq_refcount);
+               return -EBUSY;
+       }
+
+       /* re-initialize the req */
+       req->rq_timeout = obd_timeout;
+       req->rq_sent = cfs_time_current_sec();
+       req->rq_deadline = req->rq_sent + req->rq_timeout;
+       req->rq_reply_deadline = req->rq_deadline;
+       req->rq_phase = RQ_PHASE_INTERPRET;
+       req->rq_next_phase = RQ_PHASE_COMPLETE;
+       req->rq_xid = ptlrpc_next_xid();
+       req->rq_import_generation = req->rq_import->imp_generation;
+
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpcd_queue_work);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/connection.c b/drivers/staging/lustre/lustre/ptlrpc/connection.c
new file mode 100644 (file)
index 0000000..a0757f3
--- /dev/null
@@ -0,0 +1,248 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+
+#include "ptlrpc_internal.h"
+
+static cfs_hash_t *conn_hash = NULL;
+static cfs_hash_ops_t conn_hash_ops;
+
+struct ptlrpc_connection *
+ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
+                     struct obd_uuid *uuid)
+{
+       struct ptlrpc_connection *conn, *conn2;
+       ENTRY;
+
+       conn = cfs_hash_lookup(conn_hash, &peer);
+       if (conn)
+               GOTO(out, conn);
+
+       OBD_ALLOC_PTR(conn);
+       if (!conn)
+               RETURN(NULL);
+
+       conn->c_peer = peer;
+       conn->c_self = self;
+       INIT_HLIST_NODE(&conn->c_hash);
+       atomic_set(&conn->c_refcount, 1);
+       if (uuid)
+               obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
+
+       /*
+        * Add the newly created conn to the hash, on key collision we
+        * lost a racing addition and must destroy our newly allocated
+        * connection.  The object which exists in the hash will be
+        * returned and may be compared against our object.
+        */
+       /* In the function below, .hs_keycmp resolves to
+        * conn_keycmp() */
+       /* coverity[overrun-buffer-val] */
+       conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash);
+       if (conn != conn2) {
+               OBD_FREE_PTR(conn);
+               conn = conn2;
+       }
+       EXIT;
+out:
+       CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+              conn, atomic_read(&conn->c_refcount),
+              libcfs_nid2str(conn->c_peer.nid));
+       return conn;
+}
+EXPORT_SYMBOL(ptlrpc_connection_get);
+
+int ptlrpc_connection_put(struct ptlrpc_connection *conn)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (!conn)
+               RETURN(rc);
+
+       LASSERT(atomic_read(&conn->c_refcount) > 1);
+
+       /*
+        * We do not remove connection from hashtable and
+        * do not free it even if last caller released ref,
+        * as we want to have it cached for the case it is
+        * needed again.
+        *
+        * Deallocating it and later creating a new connection
+        * again would be wasteful. This way we also avoid
+        * expensive locking to protect things from a get/put
+        * race when a cached connection that was just found
+        * is freed by ptlrpc_connection_put().
+        *
+        * It will be freed later in module unload time,
+        * when ptlrpc_connection_fini()->lh_exit->conn_exit()
+        * path is called.
+        */
+       if (atomic_dec_return(&conn->c_refcount) == 1)
+               rc = 1;
+
+       CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n",
+              conn, atomic_read(&conn->c_refcount),
+              libcfs_nid2str(conn->c_peer.nid));
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_connection_put);
+
+struct ptlrpc_connection *
+ptlrpc_connection_addref(struct ptlrpc_connection *conn)
+{
+       ENTRY;
+
+       atomic_inc(&conn->c_refcount);
+       CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+              conn, atomic_read(&conn->c_refcount),
+              libcfs_nid2str(conn->c_peer.nid));
+
+       RETURN(conn);
+}
+EXPORT_SYMBOL(ptlrpc_connection_addref);
+
+int ptlrpc_connection_init(void)
+{
+       ENTRY;
+
+       conn_hash = cfs_hash_create("CONN_HASH",
+                                   HASH_CONN_CUR_BITS,
+                                   HASH_CONN_MAX_BITS,
+                                   HASH_CONN_BKT_BITS, 0,
+                                   CFS_HASH_MIN_THETA,
+                                   CFS_HASH_MAX_THETA,
+                                   &conn_hash_ops, CFS_HASH_DEFAULT);
+       if (!conn_hash)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_connection_init);
+
+void ptlrpc_connection_fini(void)
+{
+       ENTRY;
+       cfs_hash_putref(conn_hash);
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_connection_fini);
+
+/*
+ * Hash operations for net_peer<->connection
+ */
+static unsigned
+conn_hashfn(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_djb2_hash(key, sizeof(lnet_process_id_t), mask);
+}
+
+static int
+conn_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+       const lnet_process_id_t *conn_key;
+
+       LASSERT(key != NULL);
+       conn_key = (lnet_process_id_t*)key;
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+
+       return conn_key->nid == conn->c_peer.nid &&
+              conn_key->pid == conn->c_peer.pid;
+}
+
+static void *
+conn_key(struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+       return &conn->c_peer;
+}
+
+static void *
+conn_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+}
+
+static void
+conn_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+       atomic_inc(&conn->c_refcount);
+}
+
+static void
+conn_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+       atomic_dec(&conn->c_refcount);
+}
+
+static void
+conn_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+       /*
+        * Nothing should be left. The connection user has put it, and
+        * the connection was also deleted from the table by this time,
+        * so we should have 0 refs.
+        */
+       LASSERTF(atomic_read(&conn->c_refcount) == 0,
+                "Busy connection with %d refs\n",
+                atomic_read(&conn->c_refcount));
+       OBD_FREE_PTR(conn);
+}
+
+static cfs_hash_ops_t conn_hash_ops = {
+       .hs_hash        = conn_hashfn,
+       .hs_keycmp      = conn_keycmp,
+       .hs_key         = conn_key,
+       .hs_object      = conn_object,
+       .hs_get         = conn_get,
+       .hs_put_locked  = conn_put_locked,
+       .hs_exit        = conn_exit,
+};
diff --git a/drivers/staging/lustre/lustre/ptlrpc/events.c b/drivers/staging/lustre/lustre/ptlrpc/events.c
new file mode 100644 (file)
index 0000000..0264c10
--- /dev/null
@@ -0,0 +1,595 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# include <linux/libcfs/libcfs.h>
+# ifdef __mips64__
+#  include <linux/kernel.h>
+# endif
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+#include "ptlrpc_internal.h"
+
+lnet_handle_eq_t   ptlrpc_eq_h;
+
+/*
+ *  Client's outgoing request callback
+ */
+void request_out_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
+       struct ptlrpc_request *req = cbid->cbid_arg;
+       ENTRY;
+
+       LASSERT (ev->type == LNET_EVENT_SEND ||
+                ev->type == LNET_EVENT_UNLINK);
+       LASSERT (ev->unlinked);
+
+       DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+       sptlrpc_request_out_callback(req);
+       req->rq_real_sent = cfs_time_current_sec();
+
+       if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) {
+
+               /* Failed send: make it seem like the reply timed out, just
+                * like failing sends in client.c does currently...  */
+
+               spin_lock(&req->rq_lock);
+               req->rq_net_err = 1;
+               spin_unlock(&req->rq_lock);
+
+               ptlrpc_client_wake_req(req);
+       }
+
+       ptlrpc_req_finished(req);
+
+       EXIT;
+}
+
+/*
+ * Client's incoming reply callback
+ */
+void reply_in_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
+       struct ptlrpc_request *req = cbid->cbid_arg;
+       ENTRY;
+
+       DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+       LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
+       LASSERT (ev->md.start == req->rq_repbuf);
+       LASSERT (ev->offset + ev->mlength <= req->rq_repbuf_len);
+       /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+          for adaptive timeouts' early reply. */
+       LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);
+
+       spin_lock(&req->rq_lock);
+
+       req->rq_receiving_reply = 0;
+       req->rq_early = 0;
+       if (ev->unlinked)
+               req->rq_must_unlink = 0;
+
+       if (ev->status)
+               goto out_wake;
+
+       if (ev->type == LNET_EVENT_UNLINK) {
+               LASSERT(ev->unlinked);
+               DEBUG_REQ(D_NET, req, "unlink");
+               goto out_wake;
+       }
+
+       if (ev->mlength < ev->rlength) {
+               CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req,
+                      req->rq_replen, ev->rlength, ev->offset);
+               req->rq_reply_truncate = 1;
+               req->rq_replied = 1;
+               req->rq_status = -EOVERFLOW;
+               req->rq_nob_received = ev->rlength + ev->offset;
+               goto out_wake;
+       }
+
+       if ((ev->offset == 0) &&
+           ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) {
+               /* Early reply */
+               DEBUG_REQ(D_ADAPTTO, req,
+                         "Early reply received: mlen=%u offset=%d replen=%d "
+                         "replied=%d unlinked=%d", ev->mlength, ev->offset,
+                         req->rq_replen, req->rq_replied, ev->unlinked);
+
+               req->rq_early_count++; /* number received, client side */
+
+               if (req->rq_replied)   /* already got the real reply */
+                       goto out_wake;
+
+               req->rq_early = 1;
+               req->rq_reply_off = ev->offset;
+               req->rq_nob_received = ev->mlength;
+               /* And we're still receiving */
+               req->rq_receiving_reply = 1;
+       } else {
+               /* Real reply */
+               req->rq_rep_swab_mask = 0;
+               req->rq_replied = 1;
+               req->rq_reply_off = ev->offset;
+               req->rq_nob_received = ev->mlength;
+               /* LNetMDUnlink can't be called under the LNET_LOCK,
+                  so we must unlink in ptlrpc_unregister_reply */
+               DEBUG_REQ(D_INFO, req,
+                         "reply in flags=%x mlen=%u offset=%d replen=%d",
+                         lustre_msg_get_flags(req->rq_reqmsg),
+                         ev->mlength, ev->offset, req->rq_replen);
+       }
+
+       req->rq_import->imp_last_reply_time = cfs_time_current_sec();
+
+out_wake:
+       /* NB don't unlock till after wakeup; req can disappear under us
+        * since we don't have our own ref */
+       ptlrpc_client_wake_req(req);
+       spin_unlock(&req->rq_lock);
+       EXIT;
+}
+
+/*
+ * Client's bulk has been written/read
+ */
+void client_bulk_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id     *cbid = ev->md.user_ptr;
+       struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
+       struct ptlrpc_request   *req;
+       ENTRY;
+
+       LASSERT ((desc->bd_type == BULK_PUT_SINK &&
+                 ev->type == LNET_EVENT_PUT) ||
+                (desc->bd_type == BULK_GET_SOURCE &&
+                 ev->type == LNET_EVENT_GET) ||
+                ev->type == LNET_EVENT_UNLINK);
+       LASSERT (ev->unlinked);
+
+       if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+               ev->status = -EIO;
+
+       if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, CFS_FAIL_ONCE))
+               ev->status = -EIO;
+
+       CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+              "event type %d, status %d, desc %p\n",
+              ev->type, ev->status, desc);
+
+       spin_lock(&desc->bd_lock);
+       req = desc->bd_req;
+       LASSERT(desc->bd_md_count > 0);
+       desc->bd_md_count--;
+
+       if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) {
+               desc->bd_nob_transferred += ev->mlength;
+               desc->bd_sender = ev->sender;
+       } else {
+               /* start reconnect and resend if network error hit */
+               spin_lock(&req->rq_lock);
+               req->rq_net_err = 1;
+               spin_unlock(&req->rq_lock);
+       }
+
+       if (ev->status != 0)
+               desc->bd_failure = 1;
+
+       /* NB don't unlock till after wakeup; desc can disappear under us
+        * otherwise */
+       if (desc->bd_md_count == 0)
+               ptlrpc_client_wake_req(desc->bd_req);
+
+       spin_unlock(&desc->bd_lock);
+       EXIT;
+}
+
+/*
+ * We will have percpt request history list for ptlrpc service in upcoming
+ * patches because we don't want to be serialized by current per-service
+ * history operations. So we require that the history ID can (somehow) show
+ * arrival order without grabbing a global lock, and users can sort requests
+ * by it in userspace.
+ *
+ * This is how we generate history ID for ptlrpc_request:
+ * ----------------------------------------------------
+ * |  32 bits  |  16 bits  | (16 - X)bits  |  X bits  |
+ * ----------------------------------------------------
+ * |  seconds  | usec / 16 |   sequence    | CPT id   |
+ * ----------------------------------------------------
+ *
+ * it might not be precise but should be good enough.
+ */
+
+#define REQS_CPT_BITS(svcpt)   ((svcpt)->scp_service->srv_cpt_bits)
+
+#define REQS_SEC_SHIFT         32
+#define REQS_USEC_SHIFT                16
+#define REQS_SEQ_SHIFT(svcpt)  REQS_CPT_BITS(svcpt)
+
+static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt,
+                                  struct ptlrpc_request *req)
+{
+       __u64   sec = req->rq_arrival_time.tv_sec;
+       __u32   usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */
+       __u64   new_seq;
+
+       /* set sequence ID for request and add it to history list;
+        * this must be called with svcpt::scp_lock held */
+
+       new_seq = (sec << REQS_SEC_SHIFT) |
+                 (usec << REQS_USEC_SHIFT) |
+                 (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt);
+
+       if (new_seq > svcpt->scp_hist_seq) {
+               /* This handles the initial case of scp_hist_seq == 0 or
+                * we just jumped into a new time window */
+               svcpt->scp_hist_seq = new_seq;
+       } else {
+               LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT);
+               /* NB: increase sequence number in current usec bucket,
+                * however, it's possible that we used up all bits for
+                * sequence and jumped into the next usec bucket (future time),
+                * then we hope there will be less RPCs per bucket at some
+                * point, and sequence will catch up again */
+               svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt));
+               new_seq = svcpt->scp_hist_seq;
+       }
+
+       req->rq_history_seq = new_seq;
+
+       list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs);
+}
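+
+/*
+ * Worked example (hypothetical values): a request arriving at
+ * tv_sec = 100, tv_usec = 32 on CPT 1 of a service with 2 CPT bits gets
+ * new_seq = (100ULL << 32) | ((32 >> 4) << 16) | 1, i.e. the seconds in
+ * the top 32 bits, usec/16 in the next 16 bits, and the CPT id in the
+ * low bits.  A second request landing in the same usec bucket instead
+ * advances the sequence field by adding 1 << REQS_SEQ_SHIFT(svcpt)
+ * (here 1 << 2) to scp_hist_seq.
+ */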
+
+/*
+ * Server's incoming request callback
+ */
+void request_in_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id               *cbid = ev->md.user_ptr;
+       struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
+       struct ptlrpc_service_part        *svcpt = rqbd->rqbd_svcpt;
+       struct ptlrpc_service        *service = svcpt->scp_service;
+       struct ptlrpc_request        *req;
+       ENTRY;
+
+       LASSERT (ev->type == LNET_EVENT_PUT ||
+                ev->type == LNET_EVENT_UNLINK);
+       LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer);
+       LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <=
+                rqbd->rqbd_buffer + service->srv_buf_size);
+
+       CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+              "event type %d, status %d, service %s\n",
+              ev->type, ev->status, service->srv_name);
+
+       if (ev->unlinked) {
+               /* If this is the last request message to fit in the
+                * request buffer we can use the request object embedded in
+                * rqbd.  Note that if we failed to allocate a request,
+                * we'd have to re-post the rqbd, which we can't do in this
+                * context. */
+               req = &rqbd->rqbd_req;
+               memset(req, 0, sizeof (*req));
+       } else {
+               LASSERT (ev->type == LNET_EVENT_PUT);
+               if (ev->status != 0) {
+                       /* We moaned above already... */
+                       return;
+               }
+               OBD_ALLOC_GFP(req, sizeof(*req), ALLOC_ATOMIC_TRY);
+               if (req == NULL) {
+                       CERROR("Can't allocate incoming request descriptor: "
+                              "Dropping %s RPC from %s\n",
+                              service->srv_name,
+                              libcfs_id2str(ev->initiator));
+                       return;
+               }
+       }
+
+       /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
+        * flags are reset and scalars are zero.  We only set the message
+        * size to non-zero if this was a successful receive. */
+       req->rq_xid = ev->match_bits;
+       req->rq_reqbuf = ev->md.start + ev->offset;
+       if (ev->type == LNET_EVENT_PUT && ev->status == 0)
+               req->rq_reqdata_len = ev->mlength;
+       do_gettimeofday(&req->rq_arrival_time);
+       req->rq_peer = ev->initiator;
+       req->rq_self = ev->target.nid;
+       req->rq_rqbd = rqbd;
+       req->rq_phase = RQ_PHASE_NEW;
+       spin_lock_init(&req->rq_lock);
+       INIT_LIST_HEAD(&req->rq_timed_list);
+       INIT_LIST_HEAD(&req->rq_exp_list);
+       atomic_set(&req->rq_refcount, 1);
+       if (ev->type == LNET_EVENT_PUT)
+               CDEBUG(D_INFO, "incoming req@%p x"LPU64" msgsize %u\n",
+                      req, req->rq_xid, ev->mlength);
+
+       CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
+
+       spin_lock(&svcpt->scp_lock);
+
+       ptlrpc_req_add_history(svcpt, req);
+
+       if (ev->unlinked) {
+               svcpt->scp_nrqbds_posted--;
+               CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
+                      svcpt->scp_nrqbds_posted);
+
+               /* Normally, don't complain about 0 buffers posted; LNET won't
+                * drop incoming reqs since we set the portal lazy */
+               if (test_req_buffer_pressure &&
+                   ev->type != LNET_EVENT_UNLINK &&
+                   svcpt->scp_nrqbds_posted == 0)
+                       CWARN("All %s request buffers busy\n",
+                             service->srv_name);
+
+               /* req takes over the network's ref on rqbd */
+       } else {
+               /* req takes a ref on rqbd */
+               rqbd->rqbd_refcount++;
+       }
+
+       list_add_tail(&req->rq_list, &svcpt->scp_req_incoming);
+       svcpt->scp_nreqs_incoming++;
+
+       /* NB everything can disappear under us once the request
+        * has been queued and we unlock, so do the wake now... */
+       wake_up(&svcpt->scp_waitq);
+
+       spin_unlock(&svcpt->scp_lock);
+       EXIT;
+}
+
+/*
+ *  Server's outgoing reply callback
+ */
+void reply_out_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id       *cbid = ev->md.user_ptr;
+       struct ptlrpc_reply_state *rs = cbid->cbid_arg;
+       struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+       ENTRY;
+
+       LASSERT (ev->type == LNET_EVENT_SEND ||
+                ev->type == LNET_EVENT_ACK ||
+                ev->type == LNET_EVENT_UNLINK);
+
+       if (!rs->rs_difficult) {
+               /* 'Easy' replies have no further processing so I drop the
+                * net's ref on 'rs' */
+               LASSERT (ev->unlinked);
+               ptlrpc_rs_decref(rs);
+               EXIT;
+               return;
+       }
+
+       LASSERT (rs->rs_on_net);
+
+       if (ev->unlinked) {
+               /* Last network callback. The net's ref on 'rs' stays put
+                * until ptlrpc_handle_rs() is done with it */
+               spin_lock(&svcpt->scp_rep_lock);
+               spin_lock(&rs->rs_lock);
+
+               rs->rs_on_net = 0;
+               if (!rs->rs_no_ack ||
+                   rs->rs_transno <=
+                   rs->rs_export->exp_obd->obd_last_committed)
+                       ptlrpc_schedule_difficult_reply(rs);
+
+               spin_unlock(&rs->rs_lock);
+               spin_unlock(&svcpt->scp_rep_lock);
+       }
+       EXIT;
+}
+
+
+static void ptlrpc_master_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+       void (*callback)(lnet_event_t *ev) = cbid->cbid_fn;
+
+       /* Honestly, it's best to find out early. */
+       LASSERT (cbid->cbid_arg != LP_POISON);
+       LASSERT (callback == request_out_callback ||
+                callback == reply_in_callback ||
+                callback == client_bulk_callback ||
+                callback == request_in_callback ||
+                callback == reply_out_callback
+                );
+
+       callback(ev);
+}
+
+int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+                        lnet_process_id_t *peer, lnet_nid_t *self)
+{
+       int            best_dist = 0;
+       __u32        best_order = 0;
+       int            count = 0;
+       int            rc = -ENOENT;
+       int            portals_compatibility;
+       int            dist;
+       __u32        order;
+       lnet_nid_t      dst_nid;
+       lnet_nid_t      src_nid;
+
+       portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL);
+
+       peer->pid = LUSTRE_SRV_LNET_PID;
+
+       /* Choose the matching UUID that's closest */
+       while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) {
+               dist = LNetDist(dst_nid, &src_nid, &order);
+               if (dist < 0)
+                       continue;
+
+               if (dist == 0) {                /* local! use loopback LND */
+                       peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0);
+                       rc = 0;
+                       break;
+               }
+
+               if (rc < 0 ||
+                   dist < best_dist ||
+                   (dist == best_dist && order < best_order)) {
+                       best_dist = dist;
+                       best_order = order;
+
+                       if (portals_compatibility > 1) {
+                               /* Strong portals compatibility: Zero the nid's
+                                * NET, so if I'm reading new config logs, or
+                                * getting configured by (new) lconf I can
+                                * still talk to old servers. */
+                               dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid));
+                               src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid));
+                       }
+                       peer->nid = dst_nid;
+                       *self = src_nid;
+                       rc = 0;
+               }
+       }
+
+       CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
+       return rc;
+}
+
+void ptlrpc_ni_fini(void)
+{
+       wait_queue_head_t        waitq;
+       struct l_wait_info  lwi;
+       int              rc;
+       int              retries;
+
+       /* Wait for the event queue to become idle since there may still be
+        * messages in flight with pending events (i.e. the fire-and-forget
+        * messages == client requests and "non-difficult" server
+        * replies) */
+
+       for (retries = 0;; retries++) {
+               rc = LNetEQFree(ptlrpc_eq_h);
+               switch (rc) {
+               default:
+                       LBUG();
+
+               case 0:
+                       LNetNIFini();
+                       return;
+
+               case -EBUSY:
+                       if (retries != 0)
+                               CWARN("Event queue still busy\n");
+
+                       /* Wait for a bit */
+                       init_waitqueue_head(&waitq);
+                       lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL);
+                       l_wait_event(waitq, 0, &lwi);
+                       break;
+               }
+       }
+       /* notreached */
+}
+
+lnet_pid_t ptl_get_pid(void)
+{
+       lnet_pid_t      pid;
+
+       pid = LUSTRE_SRV_LNET_PID;
+       return pid;
+}
+
+int ptlrpc_ni_init(void)
+{
+       int           rc;
+       lnet_pid_t       pid;
+
+       pid = ptl_get_pid();
+       CDEBUG(D_NET, "My pid is: %x\n", pid);
+
+       /* We're not passing any limits yet... */
+       rc = LNetNIInit(pid);
+       if (rc < 0) {
+               CDEBUG(D_NET, "Can't init network interface: %d\n", rc);
+               return (-ENOENT);
+       }
+
+       /* CAVEAT EMPTOR: how we process portals events is _radically_
+        * different depending on... */
+       /* kernel LNet calls our master callback when there is a new event,
+        * because we are guaranteed to get every event via the callback,
+        * so we just set the EQ size to 0 to avoid the overhead of serializing
+        * enqueue/dequeue operations in LNet. */
+       rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h);
+       if (rc == 0)
+               return 0;
+
+       CERROR("Failed to allocate event queue: %d\n", rc);
+       LNetNIFini();
+
+       return (-ENOMEM);
+}
+
+
+int ptlrpc_init_portals(void)
+{
+       int   rc = ptlrpc_ni_init();
+
+       if (rc != 0) {
+               CERROR("network initialisation failed\n");
+               return -EIO;
+       }
+       rc = ptlrpcd_addref();
+       if (rc == 0)
+               return 0;
+
+       CERROR("rpcd initialisation failed\n");
+       ptlrpc_ni_fini();
+       return rc;
+}
+
+void ptlrpc_exit_portals(void)
+{
+       ptlrpcd_decref();
+       ptlrpc_ni_fini();
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile b/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile
new file mode 100644 (file)
index 0000000..8cdfbee
--- /dev/null
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LUSTRE_FS) := ptlrpc_gss.o
+
+ptlrpc_gss-y := sec_gss.o gss_bulk.o gss_cli_upcall.o gss_svc_upcall.o \
+               gss_rawobj.o lproc_gss.o gss_generic_token.o            \
+               gss_mech_switch.o gss_krb5_mech.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h
new file mode 100644 (file)
index 0000000..feac604
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Somewhat simplified version of the gss api.
+ *
+ * Dug Song <dugsong@monkey.org>
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ * Copyright (c) 2000 The Regents of the University of Michigan
+ *
+ */
+
+#ifndef __PTLRPC_GSS_GSS_API_H_
+#define __PTLRPC_GSS_GSS_API_H_
+
+struct gss_api_mech;
+
+/* The mechanism-independent gss-api context: */
+struct gss_ctx {
+       struct gss_api_mech    *mech_type;
+       void               *internal_ctx_id;
+};
+
+#define GSS_C_NO_BUFFER         ((rawobj_t) 0)
+#define GSS_C_NO_CONTEXT       ((struct gss_ctx *) 0)
+#define GSS_C_NULL_OID   ((rawobj_t) 0)
+
+/*
+ * gss-api prototypes; note that these are somewhat simplified versions of
+ * the prototypes specified in RFC 2744.
+ */
+__u32 lgss_import_sec_context(
+               rawobj_t                *input_token,
+               struct gss_api_mech     *mech,
+               struct gss_ctx   **ctx);
+__u32 lgss_copy_reverse_context(
+               struct gss_ctx    *ctx,
+               struct gss_ctx   **ctx_new);
+__u32 lgss_inquire_context(
+               struct gss_ctx    *ctx,
+               unsigned long      *endtime);
+__u32 lgss_get_mic(
+               struct gss_ctx    *ctx,
+               int                   msgcnt,
+               rawobj_t                *msgs,
+               int                   iovcnt,
+               lnet_kiov_t          *iovs,
+               rawobj_t                *mic_token);
+__u32 lgss_verify_mic(
+               struct gss_ctx    *ctx,
+               int                   msgcnt,
+               rawobj_t                *msgs,
+               int                   iovcnt,
+               lnet_kiov_t          *iovs,
+               rawobj_t                *mic_token);
+__u32 lgss_wrap(
+               struct gss_ctx    *ctx,
+               rawobj_t                *gsshdr,
+               rawobj_t                *msg,
+               int                   msg_buflen,
+               rawobj_t                *out_token);
+__u32 lgss_unwrap(
+               struct gss_ctx    *ctx,
+               rawobj_t                *gsshdr,
+               rawobj_t                *token,
+               rawobj_t                *out_msg);
+__u32 lgss_prep_bulk(
+               struct gss_ctx    *gctx,
+               struct ptlrpc_bulk_desc *desc);
+__u32 lgss_wrap_bulk(
+               struct gss_ctx    *gctx,
+               struct ptlrpc_bulk_desc *desc,
+               rawobj_t                *token,
+               int                   adj_nob);
+__u32 lgss_unwrap_bulk(
+               struct gss_ctx    *gctx,
+               struct ptlrpc_bulk_desc *desc,
+               rawobj_t                *token,
+               int                   adj_nob);
+__u32 lgss_delete_sec_context(
+               struct gss_ctx   **ctx);
+int lgss_display(
+               struct gss_ctx    *ctx,
+               char                *buf,
+               int                   bufsize);
+
+struct subflavor_desc {
+       __u32      sf_subflavor;
+       __u32      sf_qop;
+       __u32      sf_service;
+       char       *sf_name;
+};
+
+/* Each mechanism is described by the following struct: */
+struct gss_api_mech {
+       struct list_head              gm_list;
+       module_t           *gm_owner;
+       char               *gm_name;
+       rawobj_t                gm_oid;
+       atomic_t            gm_count;
+       struct gss_api_ops     *gm_ops;
+       int                  gm_sf_num;
+       struct subflavor_desc  *gm_sfs;
+};
+
+/* and must provide the following operations: */
+struct gss_api_ops {
+       __u32 (*gss_import_sec_context)(
+                       rawobj_t               *input_token,
+                       struct gss_ctx   *ctx);
+       __u32 (*gss_copy_reverse_context)(
+                       struct gss_ctx   *ctx,
+                       struct gss_ctx   *ctx_new);
+       __u32 (*gss_inquire_context)(
+                       struct gss_ctx   *ctx,
+                       unsigned long     *endtime);
+       __u32 (*gss_get_mic)(
+                       struct gss_ctx   *ctx,
+                       int                  msgcnt,
+                       rawobj_t               *msgs,
+                       int                  iovcnt,
+                       lnet_kiov_t         *iovs,
+                       rawobj_t               *mic_token);
+       __u32 (*gss_verify_mic)(
+                       struct gss_ctx   *ctx,
+                       int                  msgcnt,
+                       rawobj_t               *msgs,
+                       int                  iovcnt,
+                       lnet_kiov_t         *iovs,
+                       rawobj_t               *mic_token);
+       __u32 (*gss_wrap)(
+                       struct gss_ctx   *ctx,
+                       rawobj_t               *gsshdr,
+                       rawobj_t               *msg,
+                       int                  msg_buflen,
+                       rawobj_t               *out_token);
+       __u32 (*gss_unwrap)(
+                       struct gss_ctx   *ctx,
+                       rawobj_t               *gsshdr,
+                       rawobj_t               *token,
+                       rawobj_t               *out_msg);
+       __u32 (*gss_prep_bulk)(
+                       struct gss_ctx   *gctx,
+                       struct ptlrpc_bulk_desc *desc);
+       __u32 (*gss_wrap_bulk)(
+                       struct gss_ctx   *gctx,
+                       struct ptlrpc_bulk_desc *desc,
+                       rawobj_t               *token,
+                       int                  adj_nob);
+       __u32 (*gss_unwrap_bulk)(
+                       struct gss_ctx   *gctx,
+                       struct ptlrpc_bulk_desc *desc,
+                       rawobj_t               *token,
+                       int                  adj_nob);
+       void (*gss_delete_sec_context)(
+                       void               *ctx);
+       int  (*gss_display)(
+                       struct gss_ctx   *ctx,
+                       char               *buf,
+                       int                  bufsize);
+};
+
+int lgss_mech_register(struct gss_api_mech *mech);
+void lgss_mech_unregister(struct gss_api_mech *mech);
+
+struct gss_api_mech * lgss_OID_to_mech(rawobj_t *oid);
+struct gss_api_mech * lgss_name_to_mech(char *name);
+struct gss_api_mech * lgss_subflavor_to_mech(__u32 subflavor);
+
+struct gss_api_mech * lgss_mech_get(struct gss_api_mech *mech);
+void lgss_mech_put(struct gss_api_mech *mech);
+
+#endif /* __PTLRPC_GSS_GSS_API_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h
new file mode 100644 (file)
index 0000000..c70eb00
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  minimal asn1 for generic encoding/decoding of gss tokens
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#define SIZEOF_INT 4
+
+/* from gssapi_err_generic.h */
+#define G_BAD_SERVICE_NAME                    (-2045022976L)
+#define G_BAD_STRING_UID                        (-2045022975L)
+#define G_NOUSER                                (-2045022974L)
+#define G_VALIDATE_FAILED                      (-2045022973L)
+#define G_BUFFER_ALLOC                    (-2045022972L)
+#define G_BAD_MSG_CTX                      (-2045022971L)
+#define G_WRONG_SIZE                        (-2045022970L)
+#define G_BAD_USAGE                          (-2045022969L)
+#define G_UNKNOWN_QOP                      (-2045022968L)
+#define G_NO_HOSTNAME                      (-2045022967L)
+#define G_BAD_HOSTNAME                    (-2045022966L)
+#define G_WRONG_MECH                        (-2045022965L)
+#define G_BAD_TOK_HEADER                        (-2045022964L)
+#define G_BAD_DIRECTION                          (-2045022963L)
+#define G_TOK_TRUNC                          (-2045022962L)
+#define G_REFLECT                              (-2045022961L)
+#define G_WRONG_TOKID                      (-2045022960L)
+
+#define g_OID_equal(o1,o2) \
+   (((o1)->len == (o2)->len) && \
+    (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0))
+
+__u32 g_verify_token_header(rawobj_t *mech,
+                           int *body_size,
+                           unsigned char **buf_in,
+                           int toksize);
+
+__u32 g_get_mech_oid(rawobj_t *mech,
+                    rawobj_t *in_buf);
+
+int g_token_size(rawobj_t *mech,
+                unsigned int body_size);
+
+void g_make_token_header(rawobj_t *mech,
+                        int body_size,
+                        unsigned char **buf);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c
new file mode 100644 (file)
index 0000000..ed95bbb
--- /dev/null
@@ -0,0 +1,512 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_bulk.c
+ *
+ * Author: Eric Mei <eric.mei@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                         struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_cli_ctx            *gctx;
+       struct lustre_msg              *msg;
+       struct ptlrpc_bulk_sec_desc     *bsd;
+       rawobj_t                         token;
+       __u32                       maj;
+       int                           offset;
+       int                           rc;
+       ENTRY;
+
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+       LASSERT(gctx->gc_mechctx);
+
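+       /*
+        * Locate the bulk security descriptor: it is the last lustre_msg
+        * segment of the buffer (rq_reqbuf for NULL, rq_clrbuf for PRIV),
+        * or the second-to-last segment for AUTH/INTG flavors, mirroring
+        * the offsets used in gss_cli_ctx_unwrap_bulk() below.
+        */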
+       switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+       case SPTLRPC_SVC_NULL:
+               LASSERT(req->rq_reqbuf->lm_bufcount >= 3);
+               msg = req->rq_reqbuf;
+               offset = msg->lm_bufcount - 1;
+               break;
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               LASSERT(req->rq_reqbuf->lm_bufcount >= 4);
+               msg = req->rq_reqbuf;
+               offset = msg->lm_bufcount - 2;
+               break;
+       case SPTLRPC_SVC_PRIV:
+               LASSERT(req->rq_clrbuf->lm_bufcount >= 2);
+               msg = req->rq_clrbuf;
+               offset = msg->lm_bufcount - 1;
+               break;
+       default:
+               LBUG();
+       }
+
+       bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+       bsd->bsd_version = 0;
+       bsd->bsd_flags = 0;
+       bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+       if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+               RETURN(0);
+
+       LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+               bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+       if (req->rq_bulk_read) {
+               /*
+                * bulk read: prepare receiving pages only for privacy mode.
+                */
+               if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+                       return gss_cli_prep_bulk(req, desc);
+       } else {
+               /*
+                * bulk write: sign or encrypt bulk pages.
+                */
+               bsd->bsd_nob = desc->bd_nob;
+
+               if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+                       /* integrity mode */
+                       token.data = bsd->bsd_data;
+                       token.len = lustre_msg_buflen(msg, offset) -
+                                   sizeof(*bsd);
+
+                       maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL,
+                                          desc->bd_iov_count, desc->bd_iov,
+                                          &token);
+                       if (maj != GSS_S_COMPLETE) {
+                               CWARN("failed to sign bulk data: %x\n", maj);
+                               RETURN(-EACCES);
+                       }
+               } else {
+                       /* privacy mode */
+                       if (desc->bd_iov_count == 0)
+                               RETURN(0);
+
+                       rc = sptlrpc_enc_pool_get_pages(desc);
+                       if (rc) {
+                               CERROR("bulk write: failed to allocate "
+                                      "encryption pages: %d\n", rc);
+                               RETURN(rc);
+                       }
+
+                       token.data = bsd->bsd_data;
+                       token.len = lustre_msg_buflen(msg, offset) -
+                                   sizeof(*bsd);
+
+                       maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0);
+                       if (maj != GSS_S_COMPLETE) {
+                               CWARN("failed to encrypt bulk data: %x\n", maj);
+                               RETURN(-EACCES);
+                       }
+               }
+       }
+
+       RETURN(0);
+}
+
+int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                           struct ptlrpc_request *req,
+                           struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_cli_ctx            *gctx;
+       struct lustre_msg              *rmsg, *vmsg;
+       struct ptlrpc_bulk_sec_desc     *bsdr, *bsdv;
+       rawobj_t                         token;
+       __u32                       maj;
+       int                           roff, voff;
+       ENTRY;
+
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+       switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+       case SPTLRPC_SVC_NULL:
+               vmsg = req->rq_repdata;
+               LASSERT(vmsg && vmsg->lm_bufcount >= 3);
+               voff = vmsg->lm_bufcount - 1;
+
+               rmsg = req->rq_reqbuf;
+               LASSERT(rmsg && rmsg->lm_bufcount >= 3);
+               roff = rmsg->lm_bufcount - 1; /* last segment */
+               break;
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               vmsg = req->rq_repdata;
+               LASSERT(vmsg && vmsg->lm_bufcount >= 4);
+               voff = vmsg->lm_bufcount - 2;
+
+               rmsg = req->rq_reqbuf;
+               LASSERT(rmsg && rmsg->lm_bufcount >= 4);
+               roff = rmsg->lm_bufcount - 2; /* second last segment */
+               break;
+       case SPTLRPC_SVC_PRIV:
+               vmsg = req->rq_repdata;
+               LASSERT(vmsg && vmsg->lm_bufcount >= 2);
+               voff = vmsg->lm_bufcount - 1;
+
+               rmsg = req->rq_clrbuf;
+               LASSERT(rmsg && rmsg->lm_bufcount >= 2);
+               roff = rmsg->lm_bufcount - 1; /* last segment */
+               break;
+       default:
+               LBUG();
+       }
+
+       bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr));
+       bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv));
+       LASSERT(bsdr && bsdv);
+
+       if (bsdr->bsd_version != bsdv->bsd_version ||
+           bsdr->bsd_type != bsdv->bsd_type ||
+           bsdr->bsd_svc != bsdv->bsd_svc) {
+               CERROR("bulk security descriptor mismatch: "
+                      "(%u,%u,%u) != (%u,%u,%u)\n",
+                      bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc,
+                      bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc);
+               RETURN(-EPROTO);
+       }
+
+       LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL ||
+               bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+               bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+       /*
+        * in privacy mode, when returning success make sure bd_nob_transferred
+        * is the actual size of the clear text; otherwise the upper layer
+        * may be surprised.
+        */
+       if (req->rq_bulk_write) {
+               if (bsdv->bsd_flags & BSD_FL_ERR) {
+                       CERROR("server reported bulk i/o failure\n");
+                       RETURN(-EIO);
+               }
+
+               if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+                       desc->bd_nob_transferred = desc->bd_nob;
+       } else {
+               /*
+                * bulk read: upon successful return, bd_nob_transferred is
+                * the size of the plain text actually received.
+                */
+               gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+               LASSERT(gctx->gc_mechctx);
+
+               if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+                       int i, nob;
+
+                       /* fix the actual data size */
+                       for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+                               if (desc->bd_iov[i].kiov_len + nob >
+                                   desc->bd_nob_transferred) {
+                                       desc->bd_iov[i].kiov_len =
+                                               desc->bd_nob_transferred - nob;
+                               }
+                               nob += desc->bd_iov[i].kiov_len;
+                       }
+
+                       token.data = bsdv->bsd_data;
+                       token.len = lustre_msg_buflen(vmsg, voff) -
+                                   sizeof(*bsdv);
+
+                       maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL,
+                                             desc->bd_iov_count, desc->bd_iov,
+                                             &token);
+                       if (maj != GSS_S_COMPLETE) {
+                               CERROR("failed to verify bulk read: %x\n", maj);
+                               RETURN(-EACCES);
+                       }
+               } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) {
+                       desc->bd_nob = bsdv->bsd_nob;
+                       if (desc->bd_nob == 0)
+                               RETURN(0);
+
+                       token.data = bsdv->bsd_data;
+                       token.len = lustre_msg_buflen(vmsg, voff) -
+                                   sizeof(*bsdr);
+
+                       maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc,
+                                              &token, 1);
+                       if (maj != GSS_S_COMPLETE) {
+                               CERROR("failed to decrypt bulk read: %x\n",
+                                      maj);
+                               RETURN(-EACCES);
+                       }
+
+                       desc->bd_nob_transferred = desc->bd_nob;
+               }
+       }
+
+       RETURN(0);
+}
+
+static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc,
+                        struct gss_ctx *mechctx)
+{
+       int     rc;
+
+       if (desc->bd_iov_count == 0)
+               return 0;
+
+       rc = sptlrpc_enc_pool_get_pages(desc);
+       if (rc)
+               return rc;
+
+       if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE)
+               return -EACCES;
+
+       return 0;
+}
+
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc)
+{
+       int          rc;
+       ENTRY;
+
+       LASSERT(req->rq_cli_ctx);
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_read);
+
+       if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV)
+               RETURN(0);
+
+       rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx);
+       if (rc)
+               CERROR("bulk read: failed to prepare encryption "
+                      "pages: %d\n", rc);
+
+       RETURN(rc);
+}
+
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_svc_reqctx   *grctx;
+       struct ptlrpc_bulk_sec_desc  *bsd;
+       int                        rc;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_write);
+
+       grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       LASSERT(grctx->src_reqbsd);
+       LASSERT(grctx->src_repbsd);
+       LASSERT(grctx->src_ctx);
+       LASSERT(grctx->src_ctx->gsc_mechctx);
+
+       bsd = grctx->src_reqbsd;
+       if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)
+               RETURN(0);
+
+       rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx);
+       if (rc)
+               CERROR("bulk write: failed to prepare encryption "
+                      "pages: %d\n", rc);
+
+       RETURN(rc);
+}
+
+int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
+                       struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_svc_reqctx   *grctx;
+       struct ptlrpc_bulk_sec_desc  *bsdr, *bsdv;
+       rawobj_t                      token;
+       __u32                    maj;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_write);
+
+       grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+
+       LASSERT(grctx->src_reqbsd);
+       LASSERT(grctx->src_repbsd);
+       LASSERT(grctx->src_ctx);
+       LASSERT(grctx->src_ctx->gsc_mechctx);
+
+       bsdr = grctx->src_reqbsd;
+       bsdv = grctx->src_repbsd;
+
+       /* bsdr has been sanity checked during unpacking */
+       bsdv->bsd_version = 0;
+       bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsdv->bsd_svc = bsdr->bsd_svc;
+       bsdv->bsd_flags = 0;
+
+       switch (bsdv->bsd_svc) {
+       case SPTLRPC_BULK_SVC_INTG:
+               token.data = bsdr->bsd_data;
+               token.len = grctx->src_reqbsd_size - sizeof(*bsdr);
+
+               maj = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+                                     desc->bd_iov_count, desc->bd_iov, &token);
+               if (maj != GSS_S_COMPLETE) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("failed to verify bulk signature: %x\n", maj);
+                       RETURN(-EACCES);
+               }
+               break;
+       case SPTLRPC_BULK_SVC_PRIV:
+               if (bsdr->bsd_nob != desc->bd_nob) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("prepared nob %d doesn't match the actual "
+                              "nob %d\n", desc->bd_nob, bsdr->bsd_nob);
+                       RETURN(-EPROTO);
+               }
+
+               if (desc->bd_iov_count == 0) {
+                       LASSERT(desc->bd_nob == 0);
+                       break;
+               }
+
+               token.data = bsdr->bsd_data;
+               token.len = grctx->src_reqbsd_size - sizeof(*bsdr);
+
+               maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx,
+                                      desc, &token, 0);
+               if (maj != GSS_S_COMPLETE) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("failed to decrypt bulk data: %x\n", maj);
+                       RETURN(-EACCES);
+               }
+               break;
+       }
+
+       RETURN(0);
+}
+
+int gss_svc_wrap_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_svc_reqctx   *grctx;
+       struct ptlrpc_bulk_sec_desc  *bsdr, *bsdv;
+       rawobj_t                      token;
+       __u32                    maj;
+       int                        rc;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_read);
+
+       grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+
+       LASSERT(grctx->src_reqbsd);
+       LASSERT(grctx->src_repbsd);
+       LASSERT(grctx->src_ctx);
+       LASSERT(grctx->src_ctx->gsc_mechctx);
+
+       bsdr = grctx->src_reqbsd;
+       bsdv = grctx->src_repbsd;
+
+       /* bsdr has been sanity checked during unpacking */
+       bsdv->bsd_version = 0;
+       bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsdv->bsd_svc = bsdr->bsd_svc;
+       bsdv->bsd_flags = 0;
+
+       switch (bsdv->bsd_svc) {
+       case SPTLRPC_BULK_SVC_INTG:
+               token.data = bsdv->bsd_data;
+               token.len = grctx->src_repbsd_size - sizeof(*bsdv);
+
+               maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+                                  desc->bd_iov_count, desc->bd_iov, &token);
+               if (maj != GSS_S_COMPLETE) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("failed to sign bulk data: %x\n", maj);
+                       RETURN(-EACCES);
+               }
+               break;
+       case SPTLRPC_BULK_SVC_PRIV:
+               bsdv->bsd_nob = desc->bd_nob;
+
+               if (desc->bd_iov_count == 0) {
+                       LASSERT(desc->bd_nob == 0);
+                       break;
+               }
+
+               rc = sptlrpc_enc_pool_get_pages(desc);
+               if (rc) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("bulk read: failed to allocate encryption "
+                              "pages: %d\n", rc);
+                       RETURN(rc);
+               }
+
+               token.data = bsdv->bsd_data;
+               token.len = grctx->src_repbsd_size - sizeof(*bsdv);
+
+               maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx,
+                                    desc, &token, 1);
+               if (maj != GSS_S_COMPLETE) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("failed to encrypt bulk data: %x\n", maj);
+                       RETURN(-EACCES);
+               }
+               break;
+       }
+
+       RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c
new file mode 100644 (file)
index 0000000..142c789
--- /dev/null
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_cli_upcall.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/**********************************************
+ * gss context init/fini helper               *
+ **********************************************/
+
+static
+int ctx_init_pack_request(struct obd_import *imp,
+                         struct ptlrpc_request *req,
+                         int lustre_srv,
+                         uid_t uid, gid_t gid,
+                         long token_size,
+                         char __user *token)
+{
+       struct lustre_msg       *msg = req->rq_reqbuf;
+       struct gss_sec    *gsec;
+       struct gss_header       *ghdr;
+       struct ptlrpc_user_desc *pud;
+       __u32              *p, size, offset = 2;
+       rawobj_t                 obj;
+
+       LASSERT(msg->lm_bufcount <= 4);
+       LASSERT(req->rq_cli_ctx);
+       LASSERT(req->rq_cli_ctx->cc_sec);
+
+       /* gss hdr */
+       ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr));
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_sp = (__u8) imp->imp_sec->ps_part;
+       ghdr->gh_flags = 0;
+       ghdr->gh_proc = PTLRPC_GSS_PROC_INIT;
+       ghdr->gh_seq = 0;
+       ghdr->gh_svc = SPTLRPC_SVC_NULL;
+       ghdr->gh_handle.len = 0;
+
+       /* fix the user desc */
+       if (req->rq_pack_udesc) {
+               ghdr->gh_flags |= LUSTRE_GSS_PACK_USER;
+
+               pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+               LASSERT(pud);
+               pud->pud_uid = pud->pud_fsuid = uid;
+               pud->pud_gid = pud->pud_fsgid = gid;
+               pud->pud_cap = 0;
+               pud->pud_ngroups = 0;
+               offset++;
+       }
+
+       /* security payload */
+       p = lustre_msg_buf(msg, offset, 0);
+       size = msg->lm_buflens[offset];
+       LASSERT(p);
+
+       /* 1. lustre svc type */
+       LASSERT(size > 4);
+       *p++ = cpu_to_le32(lustre_srv);
+       size -= 4;
+
+       /* 2. target uuid */
+       obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1;
+       obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid;
+       if (rawobj_serialize(&obj, &p, &size))
+               LBUG();
+
+       /* 3. reverse context handle. actually only needed by root user,
+        *    but we send it anyway. */
+       gsec = sec2gsec(req->rq_cli_ctx->cc_sec);
+       obj.len = sizeof(gsec->gs_rvs_hdl);
+       obj.data = (__u8 *) &gsec->gs_rvs_hdl;
+       if (rawobj_serialize(&obj, &p, &size))
+               LBUG();
+
+       /* 4. now the token */
+       LASSERT(size >= (sizeof(__u32) + token_size));
+       *p++ = cpu_to_le32(((__u32) token_size));
+       if (copy_from_user(p, token, token_size)) {
+               CERROR("can't copy token\n");
+               return -EFAULT;
+       }
+       size -= sizeof(__u32) + cfs_size_round4(token_size);
+
+       req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset,
+                                               msg->lm_buflens[offset] - size, 0);
+       return 0;
+}
+
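+/*
+ * Copy the context negotiation result back to the lgssd reply buffer.
+ * The layout written is: status, gss major, gss minor and sequence window
+ * (4 bytes each), then the context handle and the output token, each as a
+ * 4-byte length followed by the data rounded up to a 4-byte boundary.
+ * Returns the number of bytes written, or a negative error code.
+ */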
+static
+int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed,
+                        char __user *outbuf, long outlen)
+{
+       struct gss_rep_header   *ghdr;
+       __u32               obj_len, round_len;
+       __u32               status, effective = 0;
+
+       if (msg->lm_bufcount != 3) {
+               CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+               return -EPROTO;
+       }
+
+       ghdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed);
+       if (ghdr == NULL) {
+               CERROR("unable to extract gss reply header\n");
+               return -EPROTO;
+       }
+
+       if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+               CERROR("invalid gss version %u\n", ghdr->gh_version);
+               return -EPROTO;
+       }
+
+       if (outlen < (4 + 2) * 4 + cfs_size_round4(ghdr->gh_handle.len) +
+                    cfs_size_round4(msg->lm_buflens[2])) {
+               CERROR("output buffer size %ld too small\n", outlen);
+               return -EFAULT;
+       }
+
+       status = 0;
+       effective = 0;
+
+       if (copy_to_user(outbuf, &status, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, &ghdr->gh_major, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, &ghdr->gh_minor, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, &ghdr->gh_seqwin, 4))
+               return -EFAULT;
+       outbuf += 4;
+       effective += 4 * 4;
+
+       /* handle */
+       obj_len = ghdr->gh_handle.len;
+       round_len = (obj_len + 3) & ~ 3;
+       if (copy_to_user(outbuf, &obj_len, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, (char *) ghdr->gh_handle.data, round_len))
+               return -EFAULT;
+       outbuf += round_len;
+       effective += 4 + round_len;
+
+       /* out token */
+       obj_len = msg->lm_buflens[2];
+       round_len = (obj_len + 3) & ~ 3;
+       if (copy_to_user(outbuf, &obj_len, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), round_len))
+               return -EFAULT;
+       outbuf += round_len;
+       effective += 4 + round_len;
+
+       return effective;
+}
+
+/* XXX move to somewhere lgssd can also see this */
+struct lgssd_ioctl_param {
+       int             version;         /* in  */
+       int             secid;           /* in  */
+       char           *uuid;            /* in  */
+       int             lustre_svc;      /* in  */
+       uid_t           uid;             /* in  */
+       gid_t           gid;             /* in  */
+       long            send_token_size; /* in  */
+       char           *send_token;      /* in  */
+       long            reply_buf_size;  /* in  */
+       char           *reply_buf;       /* in  */
+       long            status;          /* out */
+       long            reply_length;    /* out */
+};
+
+int gss_do_ctx_init_rpc(__user char *buffer, unsigned long count)
+{
+       struct obd_import       *imp;
+       struct ptlrpc_request    *req;
+       struct lgssd_ioctl_param  param;
+       struct obd_device       *obd;
+       char                  obdname[64];
+       long                  lsize;
+       int                    rc;
+
+       if (count != sizeof(param)) {
+               CERROR("ioctl size %lu, expect %lu, please check lgss_keyring "
+                      "version\n", count, (unsigned long) sizeof(param));
+               RETURN(-EINVAL);
+       }
+       if (copy_from_user(&param, buffer, sizeof(param))) {
+               CERROR("failed copy data from lgssd\n");
+               RETURN(-EFAULT);
+       }
+
+       if (param.version != GSSD_INTERFACE_VERSION) {
+               CERROR("gssd interface version %d (expect %d)\n",
+                       param.version, GSSD_INTERFACE_VERSION);
+               RETURN(-EINVAL);
+       }
+
+       /* take name */
+       if (strncpy_from_user(obdname, param.uuid, sizeof(obdname)) <= 0) {
+               CERROR("Invalid obdname pointer\n");
+               RETURN(-EFAULT);
+       }
+
+       obd = class_name2obd(obdname);
+       if (!obd) {
+               CERROR("no such obd %s\n", obdname);
+               RETURN(-EINVAL);
+       }
+
+       if (unlikely(!obd->obd_set_up)) {
+               CERROR("obd %s not setup\n", obdname);
+               RETURN(-EINVAL);
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_stopping) {
+               CERROR("obd %s has stopped\n", obdname);
+               spin_unlock(&obd->obd_dev_lock);
+               RETURN(-EINVAL);
+       }
+
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
+               CERROR("obd %s is not a client device\n", obdname);
+               spin_unlock(&obd->obd_dev_lock);
+               RETURN(-EINVAL);
+       }
+       spin_unlock(&obd->obd_dev_lock);
+
+       down_read(&obd->u.cli.cl_sem);
+       if (obd->u.cli.cl_import == NULL) {
+               CERROR("obd %s: import has gone\n", obd->obd_name);
+               up_read(&obd->u.cli.cl_sem);
+               RETURN(-EINVAL);
+       }
+       imp = class_import_get(obd->u.cli.cl_import);
+       up_read(&obd->u.cli.cl_sem);
+
+       if (imp->imp_deactive) {
+               CERROR("import has been deactivated\n");
+               class_import_put(imp);
+               RETURN(-EINVAL);
+       }
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION,
+                                       SEC_CTX_INIT);
+       if (req == NULL) {
+               param.status = -ENOMEM;
+               goto out_copy;
+       }
+
+       if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) {
+               CWARN("original secid %d, now has changed to %d, "
+                     "cancel this negotiation\n", param.secid,
+                     req->rq_cli_ctx->cc_sec->ps_id);
+               param.status = -EINVAL;
+               goto out_copy;
+       }
+
+       /* get token */
+       rc = ctx_init_pack_request(imp, req,
+                                  param.lustre_svc,
+                                  param.uid, param.gid,
+                                  param.send_token_size,
+                                  param.send_token);
+       if (rc) {
+               param.status = rc;
+               goto out_copy;
+       }
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc) {
+               /* If a _real_ denial is made, we expect the server to return
+                * an -EACCES reply, or to return success but indicate a gss
+                * error inside the reply message. All other errors are treated
+                * as a timeout: the caller may retry the negotiation repeatedly
+                * and recovery decisions are left to the general ptlrpc layer.
+                *
+                * FIXME maybe some other error codes shouldn't be treated
+                * as timeout. */
+               param.status = rc;
+               if (rc != -EACCES)
+                       param.status = -ETIMEDOUT;
+               goto out_copy;
+       }
+
+       LASSERT(req->rq_repdata);
+       lsize = ctx_init_parse_reply(req->rq_repdata,
+                                    ptlrpc_rep_need_swab(req),
+                                    param.reply_buf, param.reply_buf_size);
+       if (lsize < 0) {
+               param.status = (int) lsize;
+               goto out_copy;
+       }
+
+       param.status = 0;
+       param.reply_length = lsize;
+
+out_copy:
+       if (copy_to_user(buffer, &param, sizeof(param)))
+               rc = -EFAULT;
+       else
+               rc = 0;
+
+       class_import_put(imp);
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx)
+{
+       struct ptlrpc_cli_ctx   *ctx = &gctx->gc_base;
+       struct obd_import       *imp = ctx->cc_sec->ps_import;
+       struct ptlrpc_request   *req;
+       struct ptlrpc_user_desc *pud;
+       int                   rc;
+       ENTRY;
+
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+       if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) {
+               CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, "
+                      "don't send destroy rpc\n", ctx,
+                      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+               RETURN(0);
+       }
+
+       might_sleep();
+
+       CWARN("%s ctx %p idx "LPX64" (%u->%s)\n",
+             sec_is_reverse(ctx->cc_sec) ?
+             "server finishing reverse" : "client finishing forward",
+             ctx, gss_handle_to_u64(&gctx->gc_handle),
+             ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+       gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX);
+       if (req == NULL) {
+               CWARN("ctx %p(%u): failed to prepare rpc, destroy locally\n",
+                     ctx, ctx->cc_vcred.vc_uid);
+               GOTO(out, rc = -ENOMEM);
+       }
+
+       rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI,
+                                     NULL, ctx);
+       if (rc) {
+               ptlrpc_request_free(req);
+               GOTO(out, rc);
+       }
+
+       /* fix the user desc */
+       if (req->rq_pack_udesc) {
+               /* we rely on the fact that this request is in AUTH mode,
+                * so the user_desc is at offset 2. */
+               pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud));
+               LASSERT(pud);
+               pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid;
+               pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid;
+               pud->pud_cap = 0;
+               pud->pud_ngroups = 0;
+       }
+
+       req->rq_phase = RQ_PHASE_RPC;
+       rc = ptl_send_rpc(req, 1);
+       if (rc)
+               CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx,
+                     ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc);
+
+out_ref:
+       ptlrpc_req_finished(req);
+out:
+       RETURN(rc);
+}
+
+int __init gss_init_cli_upcall(void)
+{
+       return 0;
+}
+
+void __exit gss_exit_cli_upcall(void)
+{
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h
new file mode 100644 (file)
index 0000000..1342579
--- /dev/null
@@ -0,0 +1,193 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __PTLRPC_GSS_GSS_ERR_H_
+#define __PTLRPC_GSS_GSS_ERR_H_
+
+typedef unsigned int OM_uint32;
+
+/*
+ * Flag bits for context-level services.
+ */
+#define GSS_C_DELEG_FLAG       (1)
+#define GSS_C_MUTUAL_FLAG       (2)
+#define GSS_C_REPLAY_FLAG       (4)
+#define GSS_C_SEQUENCE_FLAG     (8)
+#define GSS_C_CONF_FLAG         (16)
+#define GSS_C_INTEG_FLAG       (32)
+#define GSS_C_ANON_FLAG         (64)
+#define GSS_C_PROT_READY_FLAG   (128)
+#define GSS_C_TRANS_FLAG       (256)
+
+/*
+ * Credential usage options
+ */
+#define GSS_C_BOTH           (0)
+#define GSS_C_INITIATE   (1)
+#define GSS_C_ACCEPT       (2)
+
+/*
+ * Status code types for gss_display_status
+ */
+#define GSS_C_GSS_CODE   (1)
+#define GSS_C_MECH_CODE         (2)
+
+
+/*
+ * Define the default Quality of Protection for per-message services.  Note
+ * that an implementation that offers multiple levels of QOP may either reserve
+ * a value (for example zero, as assumed here) to mean "default protection", or
+ * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit
+ * QOP value.  However a value of 0 should always be interpreted by a GSSAPI
+ * implementation as a request for the default protection level.
+ */
+#define GSS_C_QOP_DEFAULT       (0)
+
+/*
+ * Expiration time of 2^32-1 seconds means infinite lifetime for a
+ * credential or security context
+ */
+#define GSS_C_INDEFINITE       ((OM_uint32) 0xfffffffful)
+
+
+/* Major status codes */
+
+#define GSS_S_COMPLETE   (0)
+
+/*
+ * Some "helper" definitions to make the status code macros obvious.
+ */
+#define GSS_C_CALLING_ERROR_OFFSET      (24)
+#define GSS_C_ROUTINE_ERROR_OFFSET      (16)
+#define GSS_C_SUPPLEMENTARY_OFFSET      (0)
+#define GSS_C_CALLING_ERROR_MASK       ((OM_uint32) 0377ul)
+#define GSS_C_ROUTINE_ERROR_MASK       ((OM_uint32) 0377ul)
+#define GSS_C_SUPPLEMENTARY_MASK       ((OM_uint32) 0177777ul)
+
+/*
+ * The macros that test status codes for error conditions.  Note that the
+ * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now
+ * evaluates its argument only once.
+ */
+#define GSS_CALLING_ERROR(x) \
+  ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET))
+#define GSS_ROUTINE_ERROR(x) \
+  ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))
+#define GSS_SUPPLEMENTARY_INFO(x) \
+  ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET))
+#define GSS_ERROR(x) \
+  ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \
+         (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)))
+
+/*
+ * Now the actual status code definitions
+ */
+
+/*
+ * Calling errors:
+ */
+#define GSS_S_CALL_INACCESSIBLE_READ \
+       (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_INACCESSIBLE_WRITE \
+       (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_BAD_STRUCTURE \
+       (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET)
+
+/*
+ * Routine errors:
+ */
+#define GSS_S_BAD_MECH \
+       (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAME \
+       (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAMETYPE \
+       (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_BINDINGS \
+       (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_STATUS \
+       (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_SIG \
+       (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CRED \
+       (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CONTEXT \
+       (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_TOKEN \
+       (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_CREDENTIAL \
+       (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CREDENTIALS_EXPIRED \
+       (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CONTEXT_EXPIRED \
+       (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_FAILURE \
+       (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_QOP \
+       (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAUTHORIZED \
+       (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAVAILABLE \
+       (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DUPLICATE_ELEMENT \
+       (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NAME_NOT_MN \
+       (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+
+/*
+ * Supplementary info bits:
+ */
+#define GSS_S_CONTINUE_NEEDED   (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0))
+#define GSS_S_DUPLICATE_TOKEN   (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1))
+#define GSS_S_OLD_TOKEN         (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2))
+#define GSS_S_UNSEQ_TOKEN       (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3))
+#define GSS_S_GAP_TOKEN         (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4))
+
+/* XXXX these are not part of the GSSAPI C bindings!  (but should be) */
+
+#define GSS_CALLING_ERROR_FIELD(x) \
+       (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK)
+#define GSS_ROUTINE_ERROR_FIELD(x) \
+       (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK)
+#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \
+       (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK)
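+
+/*
+ * For example, GSS_S_FAILURE is ((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET,
+ * i.e. 0x000d0000: GSS_ERROR() is nonzero for it and GSS_ROUTINE_ERROR_FIELD()
+ * yields 13, while a purely supplementary status such as GSS_S_CONTINUE_NEEDED
+ * (0x00000001) is not considered an error by GSS_ERROR().
+ */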
+
+/* XXXX This is a necessary evil until the spec is fixed */
+#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE
+
+#endif /* __PTLRPC_GSS_GSS_ERR_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c
new file mode 100644 (file)
index 0000000..20b1638
--- /dev/null
@@ -0,0 +1,285 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_generic_token.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+#include "gss_asn1.h"
+
+
+/* TWRITE_STR from gssapiP_generic.h */
+#define TWRITE_STR(ptr, str, len)                              \
+       do {                                                    \
+               memcpy((ptr), (char *) (str), (len));           \
+               (ptr) += (len);                                 \
+       } while (0)
+
+/* XXXX this code currently makes the assumption that a mech oid will
+   never be longer than 127 bytes.  This assumption is not inherent in
+   the interfaces, so the code can be fixed if the OSI namespace
+   balloons unexpectedly. */
+
+/* Each token looks like this:
+
+0x60                           tag for APPLICATION 0, SEQUENCE
+                                       (constructed, definite-length)
+       <length>                possibly multiple bytes, needs to be parsed/generated
+       0x06                    tag for OBJECT IDENTIFIER
+               <moid_length>   compile-time constant string (assume 1 byte)
+               <moid_bytes>    compile-time constant string
+       <inner_bytes>           the ANY containing the application token
+                                       bytes 0,1 are the token type
+                                       bytes 2,n are the token data
+
+For the purposes of this abstraction, the token "header" consists of
+the sequence tag and length octets, the mech OID DER encoding, and the
+first two inner bytes, which indicate the token type.  The token
+"body" consists of everything else.
+
+*/
+
+static
+int der_length_size(int length)
+{
+       if (length < (1 << 7))
+               return 1;
+       else if (length < (1 << 8))
+               return 2;
+#if (SIZEOF_INT == 2)
+       else
+               return 3;
+#else
+       else if (length < (1 << 16))
+               return 3;
+       else if (length < (1 << 24))
+               return 4;
+       else
+               return 5;
+#endif
+}
+
+static
+void der_write_length(unsigned char **buf, int length)
+{
+       if (length < (1 << 7)) {
+               *(*buf)++ = (unsigned char) length;
+       } else {
+               *(*buf)++ = (unsigned char) (der_length_size(length) + 127);
+#if (SIZEOF_INT > 2)
+               if (length >= (1 << 24))
+                       *(*buf)++ = (unsigned char) (length >> 24);
+               if (length >= (1 << 16))
+                       *(*buf)++ = (unsigned char) ((length >> 16) & 0xff);
+#endif
+               if (length >= (1 << 8))
+                       *(*buf)++ = (unsigned char) ((length >> 8) & 0xff);
+               *(*buf)++ = (unsigned char) (length & 0xff);
+       }
+}
+
+/*
+ * returns decoded length, or < 0 on failure.  Advances buf and
+ * decrements bufsize
+ */
+static
+int der_read_length(unsigned char **buf, int *bufsize)
+{
+       unsigned char sf;
+       int ret;
+
+       if (*bufsize < 1)
+               return -1;
+       sf = *(*buf)++;
+       (*bufsize)--;
+       if (sf & 0x80) {
+               if ((sf &= 0x7f) > ((*bufsize) - 1))
+                       return -1;
+               if (sf > SIZEOF_INT)
+                       return -1;
+               ret = 0;
+               for (; sf; sf--) {
+                       ret = (ret << 8) + (*(*buf)++);
+                       (*bufsize)--;
+               }
+       } else {
+               ret = sf;
+       }
+
+       return ret;
+}
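+
+/*
+ * For example, a length of 100 fits in 7 bits and is written by
+ * der_write_length() as the single octet 0x64, while 300 is written in
+ * long form as 0x82 0x01 0x2c (0x82 meaning "two length octets follow");
+ * der_read_length() decodes either form back to the original value.
+ */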
+
+/*
+ * returns the length of a token, given the mech oid and the body size
+ */
+int g_token_size(rawobj_t *mech, unsigned int body_size)
+{
+       /* set body_size to sequence contents size */
+       body_size += 4 + (int) mech->len; /* NEED overflow check */
+       return (1 + der_length_size(body_size) + body_size);
+}
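+
+/*
+ * For example, with a mech OID whose DER encoding is 9 bytes and a body of
+ * 100 bytes, g_token_size() returns 1 + 1 + (4 + 9 + 100) = 115: the 0x60
+ * tag, one length octet (since 113 < 128), and the 113 bytes of sequence
+ * contents.
+ */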
+
+/*
+ * fills in a buffer with the token header.  The buffer is assumed to
+ * be the right size.  buf is advanced past the token header
+ */
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf)
+{
+       *(*buf)++ = 0x60;
+       der_write_length(buf, 4 + mech->len + body_size);
+       *(*buf)++ = 0x06;
+       *(*buf)++ = (unsigned char) mech->len;
+       TWRITE_STR(*buf, mech->data, ((int) mech->len));
+}
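+
+/*
+ * Continuing the example above, g_make_token_header() called with a 9-byte
+ * mech OID and a body_size of 100 emits 0x60 0x71 0x06 0x09 followed by the
+ * 9 OID bytes (0x71 == 113 == 4 + 9 + 100); the caller then appends the two
+ * token-type bytes and the body itself.
+ */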
+
+/*
+ * Given a buffer containing a token, reads and verifies the token,
+ * leaving buf advanced past the token header, and setting body_size
+ * to the number of remaining bytes.  Returns 0 on success,
+ * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the
+ * mechanism in the token does not match the mech argument.  buf and
+ * *body_size are left unmodified on error.
+ */
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+                           unsigned char **buf_in, int toksize)
+{
+       unsigned char *buf = *buf_in;
+       int seqsize;
+       rawobj_t toid;
+       int ret = 0;
+
+       if ((toksize -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       if (*buf++ != 0x60)
+               return (G_BAD_TOK_HEADER);
+
+       if ((seqsize = der_read_length(&buf, &toksize)) < 0)
+               return(G_BAD_TOK_HEADER);
+
+       if (seqsize != toksize)
+               return (G_BAD_TOK_HEADER);
+
+       if ((toksize -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       if (*buf++ != 0x06)
+               return (G_BAD_TOK_HEADER);
+
+       if ((toksize -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       toid.len = *buf++;
+
+       if ((toksize -= toid.len) < 0)
+               return (G_BAD_TOK_HEADER);
+       toid.data = buf;
+       buf += toid.len;
+
+       if (!g_OID_equal(&toid, mech))
+               ret = G_WRONG_MECH;
+
+       /* G_WRONG_MECH is not returned immediately because it's more
+        * important to return G_BAD_TOK_HEADER if the token header is
+        * in fact bad
+        */
+       if ((toksize -= 2) < 0)
+               return (G_BAD_TOK_HEADER);
+
+       if (ret)
+               return (ret);
+
+       *buf_in = buf;
+       *body_size = toksize;
+
+       return (ret);
+}
+
+/*
+ * Given a buffer containing a token, returns a copy of the mech oid in
+ * the parameter mech.
+ */
+__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf)
+{
+       unsigned char *buf = in_buf->data;
+       int len = in_buf->len;
+       int ret = 0;
+       int seqsize;
+
+       if ((len -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       if (*buf++ != 0x60)
+               return (G_BAD_TOK_HEADER);
+
+       if ((seqsize = der_read_length(&buf, &len)) < 0)
+               return (G_BAD_TOK_HEADER);
+
+       if ((len -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       if (*buf++ != 0x06)
+               return (G_BAD_TOK_HEADER);
+
+       if ((len -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       mech->len = *buf++;
+
+       if ((len -= mech->len) < 0)
+               return (G_BAD_TOK_HEADER);
+       OBD_ALLOC_LARGE(mech->data, mech->len);
+       if (!mech->data)
+               return (G_BUFFER_ALLOC);
+       memcpy(mech->data, buf, mech->len);
+
+       return ret;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h
new file mode 100644 (file)
index 0000000..cbfc47c
--- /dev/null
@@ -0,0 +1,526 @@
+/*
+ * Modified from NFSv4 project for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_
+#define __PTLRPC_GSS_GSS_INTERNAL_H_
+
+#include <lustre_sec.h>
+
+/*
+ * rawobj stuff
+ */
+typedef struct netobj_s {
+       __u32      len;
+       __u8        data[0];
+} netobj_t;
+
+#define NETOBJ_EMPTY    ((netobj_t) { 0 })
+
+typedef struct rawobj_s {
+       __u32      len;
+       __u8       *data;
+} rawobj_t;
+
+#define RAWOBJ_EMPTY    ((rawobj_t) { 0, NULL })
+
+typedef struct rawobj_buf_s {
+       __u32      dataoff;
+       __u32      datalen;
+       __u32      buflen;
+       __u8       *buf;
+} rawobj_buf_t;
+
+int rawobj_empty(rawobj_t *obj);
+int rawobj_alloc(rawobj_t *obj, char *buf, int len);
+void rawobj_free(rawobj_t *obj);
+int rawobj_equal(rawobj_t *a, rawobj_t *b);
+int rawobj_dup(rawobj_t *dest, rawobj_t *src);
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj);
+int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj);
+
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+                        void *res, __u32 reslen);
+
+/*
+ * several timeout values. for the client refresh upcall timeout we use
+ * the default from the pipefs implementation.
+ */
+#define __TIMEOUT_DELTA                 (10)
+
+#define GSS_SECINIT_RPC_TIMEOUT                                         \
+       (obd_timeout < __TIMEOUT_DELTA ?                                \
+        __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA)
+
+#define GSS_SECFINI_RPC_TIMEOUT         (__TIMEOUT_DELTA)
+#define GSS_SECSVC_UPCALL_TIMEOUT       (GSS_SECINIT_RPC_TIMEOUT)
+
+/*
+ * default gc interval
+ */
+#define GSS_GC_INTERVAL                 (60 * 60) /* 60 minutes */
+
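+/*
+ * Shorten a forward context expiry by __TIMEOUT_DELTA so a refresh can be
+ * started before the context actually becomes unusable: e.g. a context
+ * expiring 3600 seconds from now is reported as expiring after 3590 seconds.
+ * Reverse contexts are returned unmodified.
+ */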
+static inline
+unsigned long gss_round_ctx_expiry(unsigned long expiry,
+                                  unsigned long sec_flags)
+{
+       if (sec_flags & PTLRPC_SEC_FL_REVERSE)
+               return expiry;
+
+       if (get_seconds() + __TIMEOUT_DELTA <= expiry)
+               return expiry - __TIMEOUT_DELTA;
+
+       return expiry;
+}
+
+/*
+ * Max encryption element in block cipher algorithms.
+ */
+#define GSS_MAX_CIPHER_BLOCK          (16)
+
+/*
+ * XXX make it visible to both the kernel and lgssd/lsvcgssd
+ */
+#define GSSD_INTERFACE_VERSION   (1)
+
+#define PTLRPC_GSS_VERSION           (1)
+
+
+enum ptlrpc_gss_proc {
+       PTLRPC_GSS_PROC_DATA        = 0,
+       PTLRPC_GSS_PROC_INIT        = 1,
+       PTLRPC_GSS_PROC_CONTINUE_INIT   = 2,
+       PTLRPC_GSS_PROC_DESTROY  = 3,
+       PTLRPC_GSS_PROC_ERR          = 4,
+};
+
+enum ptlrpc_gss_tgt {
+       LUSTRE_GSS_TGT_MGS            = 0,
+       LUSTRE_GSS_TGT_MDS            = 1,
+       LUSTRE_GSS_TGT_OSS            = 2,
+};
+
+enum ptlrpc_gss_header_flags {
+       LUSTRE_GSS_PACK_BULK        = 1,
+       LUSTRE_GSS_PACK_USER        = 2,
+};
+
+static inline
+__u32 import_to_gss_svc(struct obd_import *imp)
+{
+       const char *name = imp->imp_obd->obd_type->typ_name;
+
+       if (!strcmp(name, LUSTRE_MGC_NAME))
+               return LUSTRE_GSS_TGT_MGS;
+       if (!strcmp(name, LUSTRE_MDC_NAME))
+               return LUSTRE_GSS_TGT_MDS;
+       if (!strcmp(name, LUSTRE_OSC_NAME))
+               return LUSTRE_GSS_TGT_OSS;
+       LBUG();
+       return 0;
+}
+
+/*
+ * the following 3 headers must have the same size and field offsets
+ */
+struct gss_header {
+       __u8            gh_version;     /* gss version */
+       __u8            gh_sp;          /* sec part */
+       __u16           gh_pad0;
+       __u32           gh_flags;       /* wrap flags */
+       __u32           gh_proc;        /* proc */
+       __u32           gh_seq;         /* sequence */
+       __u32           gh_svc;         /* service */
+       __u32           gh_pad1;
+       __u32           gh_pad2;
+       __u32           gh_pad3;
+       netobj_t        gh_handle;      /* context handle */
+};
+
+struct gss_rep_header {
+       __u8                gh_version;
+       __u8                gh_sp;
+       __u16              gh_pad0;
+       __u32              gh_flags;
+       __u32              gh_proc;
+       __u32              gh_major;
+       __u32              gh_minor;
+       __u32              gh_seqwin;
+       __u32              gh_pad2;
+       __u32              gh_pad3;
+       netobj_t                gh_handle;
+};
+
+struct gss_err_header {
+       __u8                gh_version;
+       __u8                gh_sp;
+       __u16              gh_pad0;
+       __u32              gh_flags;
+       __u32              gh_proc;
+       __u32              gh_major;
+       __u32              gh_minor;
+       __u32              gh_pad1;
+       __u32              gh_pad2;
+       __u32              gh_pad3;
+       netobj_t                gh_handle;
+};
+
+/*
+ * part of the wire context information sent from the client, which is
+ * saved and used later by the server.
+ */
+struct gss_wire_ctx {
+       __u32              gw_flags;
+       __u32              gw_proc;
+       __u32              gw_seq;
+       __u32              gw_svc;
+       rawobj_t                gw_handle;
+};
+
+#define PTLRPC_GSS_MAX_HANDLE_SIZE      (8)
+#define PTLRPC_GSS_HEADER_SIZE   (sizeof(struct gss_header) + \
+                                        PTLRPC_GSS_MAX_HANDLE_SIZE)
+
+
+static inline __u64 gss_handle_to_u64(rawobj_t *handle)
+{
+       if (handle->len != PTLRPC_GSS_MAX_HANDLE_SIZE)
+               return -1;
+       return *((__u64 *) handle->data);
+}
+
+#define GSS_SEQ_WIN                 (2048)
+#define GSS_SEQ_WIN_MAIN               GSS_SEQ_WIN
+#define GSS_SEQ_WIN_BACK               (128)
+#define GSS_SEQ_REPACK_THRESHOLD       (GSS_SEQ_WIN_MAIN / 2 + \
+                                        GSS_SEQ_WIN_MAIN / 4)
+
+struct gss_svc_seq_data {
+       spinlock_t              ssd_lock;
+       /*
+        * highest sequence number seen so far, for main and back window
+        */
+       __u32              ssd_max_main;
+       __u32              ssd_max_back;
+       /*
+        * main and back window
+        * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit
+        * of ssd_win is nonzero iff sequence number i has been seen already.
+        */
+       unsigned long      ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG];
+       unsigned long      ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG];
+};
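+
+/*
+ * With GSS_SEQ_WIN_MAIN of 2048 and GSS_SEQ_WIN_BACK of 128, the two bitmaps
+ * above occupy 2048/BITS_PER_LONG and 128/BITS_PER_LONG unsigned longs
+ * (32 and 2 on a 64-bit kernel).
+ */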
+
+struct gss_svc_ctx {
+       struct gss_ctx   *gsc_mechctx;
+       struct gss_svc_seq_data gsc_seqdata;
+       rawobj_t                gsc_rvs_hdl;
+       __u32              gsc_rvs_seq;
+       uid_t              gsc_uid;
+       gid_t              gsc_gid;
+       uid_t              gsc_mapped_uid;
+       unsigned int        gsc_usr_root:1,
+                               gsc_usr_mds:1,
+                               gsc_usr_oss:1,
+                               gsc_remote:1,
+                               gsc_reverse:1;
+};
+
+struct gss_svc_reqctx {
+       struct ptlrpc_svc_ctx      src_base;
+       /*
+        * context
+        */
+       struct gss_wire_ctx          src_wirectx;
+       struct gss_svc_ctx           *src_ctx;
+       /*
+        * record place of bulk_sec_desc in request/reply buffer
+        */
+       struct ptlrpc_bulk_sec_desc    *src_reqbsd;
+       int                          src_reqbsd_size;
+       struct ptlrpc_bulk_sec_desc    *src_repbsd;
+       int                          src_repbsd_size;
+       /*
+        * flags
+        */
+       unsigned int                src_init:1,
+                                       src_init_continue:1,
+                                       src_err_notify:1;
+       int                          src_reserve_len;
+};
+
+struct gss_cli_ctx {
+       struct ptlrpc_cli_ctx   gc_base;
+       __u32              gc_flavor;
+       __u32              gc_proc;
+       __u32              gc_win;
+       atomic_t            gc_seq;
+       rawobj_t                gc_handle;
+       struct gss_ctx   *gc_mechctx;
+       /* handle for the buddy svc ctx */
+       rawobj_t                gc_svc_handle;
+};
+
+struct gss_cli_ctx_keyring {
+       struct gss_cli_ctx      gck_base;
+       struct key           *gck_key;
+       struct timer_list      *gck_timer;
+};
+
+struct gss_sec {
+       struct ptlrpc_sec       gs_base;
+       struct gss_api_mech     *gs_mech;
+       spinlock_t              gs_lock;
+       __u64                   gs_rvs_hdl;
+};
+
+struct gss_sec_pipefs {
+       struct gss_sec    gsp_base;
+       int                  gsp_chash_size;  /* must be 2^n */
+       struct hlist_head       gsp_chash[0];
+};
+
+/*
+ * FIXME cleanup the keyring upcall mutexes
+ */
+#define HAVE_KEYRING_UPCALL_SERIALIZED  1
+
+struct gss_sec_keyring {
+       struct gss_sec    gsk_base;
+       /*
+        * all contexts listed here. access is protected by sec spinlock.
+        */
+       struct hlist_head       gsk_clist;
+       /*
+        * special pointer to the root ctx (only one at a time). access is
+        * protected by sec spinlock.
+        */
+       struct ptlrpc_cli_ctx  *gsk_root_ctx;
+       /*
+        * specially serialize upcalls for root context.
+        */
+       struct mutex                    gsk_root_uc_lock;
+
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+       struct mutex            gsk_uc_lock;    /* serialize upcalls */
+#endif
+};
+
+static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx)
+{
+       return container_of(ctx, struct gss_cli_ctx, gc_base);
+}
+
+static inline
+struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx)
+{
+       return container_of(ctx2gctx(ctx),
+                           struct gss_cli_ctx_keyring, gck_base);
+}
+
+static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec)
+{
+       return container_of(sec, struct gss_sec, gs_base);
+}
+
+static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec)
+{
+       return container_of(sec2gsec(sec), struct gss_sec_pipefs, gsp_base);
+}
+
+static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec)
+{
+       return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base);
+}
+
+
+#define GSS_CTX_INIT_MAX_LEN       (1024)
+
+/*
+ * This is only guaranteed to be enough for the current krb5 des-cbc-crc.
+ * We may need to adjust it when a new enc type or mech is added.
+ */
+#define GSS_PRIVBUF_PREFIX_LEN  (32)
+#define GSS_PRIVBUF_SUFFIX_LEN  (32)
+
+static inline
+struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx)
+{
+       LASSERT(ctx);
+       return container_of(ctx, struct gss_svc_reqctx, src_base);
+}
+
+static inline
+struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx)
+{
+       LASSERT(ctx);
+       return gss_svc_ctx2reqctx(ctx)->src_ctx;
+}
+
+/* sec_gss.c */
+int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred);
+int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+
+int  gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec,
+                         struct ptlrpc_cli_ctx *ctx);
+int  gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+                     int msgsize);
+void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+int  gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+                     int msgsize);
+void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+int  gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+                       int segment, int newsize);
+
+int  gss_svc_accept(struct ptlrpc_sec_policy *policy,
+                   struct ptlrpc_request *req);
+void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx);
+int  gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int  gss_svc_authorize(struct ptlrpc_request *req);
+void gss_svc_free_rs(struct ptlrpc_reply_state *rs);
+void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx);
+
+int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx);
+
+int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx,
+                        struct ptlrpc_svc_ctx *svc_ctx);
+
+struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment,
+                                  int swabbed);
+netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment);
+
+void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx);
+int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor);
+int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set);
+
+int gss_sec_create_common(struct gss_sec *gsec,
+                         struct ptlrpc_sec_policy *policy,
+                         struct obd_import *imp,
+                         struct ptlrpc_svc_ctx *ctx,
+                         struct sptlrpc_flavor *sf);
+void gss_sec_destroy_common(struct gss_sec *gsec);
+void gss_sec_kill(struct ptlrpc_sec *sec);
+
+int gss_cli_ctx_init_common(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx,
+                           struct ptlrpc_ctx_ops *ctxops,
+                           struct vfs_cred *vcred);
+int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx);
+
+void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize);
+
+/* gss_keyring.c */
+int  __init gss_init_keyring(void);
+void __exit gss_exit_keyring(void);
+
+/* gss_pipefs.c */
+int  __init gss_init_pipefs(void);
+void __exit gss_exit_pipefs(void);
+
+/* gss_bulk.c */
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc);
+int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                         struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc);
+int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                           struct ptlrpc_request *req,
+                           struct ptlrpc_bulk_desc *desc);
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc);
+int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
+                       struct ptlrpc_bulk_desc *desc);
+int gss_svc_wrap_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc);
+
+/* gss_mech_switch.c */
+int init_kerberos_module(void);
+void cleanup_kerberos_module(void);
+
+/* gss_generic_token.c */
+int g_token_size(rawobj_t *mech, unsigned int body_size);
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf);
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+                           unsigned char **buf_in, int toksize);
+
+
+/* gss_cli_upcall.c */
+int gss_do_ctx_init_rpc(char *buffer, unsigned long count);
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx);
+
+int  __init gss_init_cli_upcall(void);
+void __exit gss_exit_cli_upcall(void);
+
+/* gss_svc_upcall.c */
+__u64 gss_get_next_ctx_index(void);
+int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp,
+                                  struct gss_sec *gsec,
+                                  struct gss_cli_ctx *gctx);
+int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle);
+int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx);
+int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq);
+int gss_svc_upcall_handle_init(struct ptlrpc_request *req,
+                              struct gss_svc_reqctx *grctx,
+                              struct gss_wire_ctx *gw,
+                              struct obd_device *target,
+                              __u32 lustre_svc,
+                              rawobj_t *rvs_hdl,
+                              rawobj_t *in_token);
+struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req,
+                                          struct gss_wire_ctx *gw);
+void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx);
+void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx);
+
+int  __init gss_init_svc_upcall(void);
+void __exit gss_exit_svc_upcall(void);
+
+/* lproc_gss.c */
+void gss_stat_oos_record_cli(int behind);
+void gss_stat_oos_record_svc(int phase, int replay);
+
+int  __init gss_init_lproc(void);
+void __exit gss_exit_lproc(void);
+
+/* gss_krb5_mech.c */
+int __init init_kerberos_module(void);
+void __exit cleanup_kerberos_module(void);
+
+
+/* debug */
+static inline
+void __dbg_memdump(char *name, void *ptr, int size)
+{
+       char *buf, *p = (char *) ptr;
+       int bufsize = size * 2 + 1, i;
+
+       OBD_ALLOC(buf, bufsize);
+       if (!buf) {
+               CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize);
+               return;
+       }
+
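+       /* render each byte as two hex digits */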
+       for (i = 0; i < size; i++)
+               sprintf(&buf[i+i], "%02x", (__u8) p[i]);
+       buf[size + size] = '\0';
+       LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, buf);
+       OBD_FREE(buf, bufsize);
+}
+
+#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c
new file mode 100644 (file)
index 0000000..bb571ae
--- /dev/null
@@ -0,0 +1,1424 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_keyring.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_sec.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct ptlrpc_sec_policy gss_policy_keyring;
+static struct ptlrpc_ctx_ops gss_keyring_ctxops;
+static struct key_type gss_key_type;
+
+static int sec_install_rctx_kr(struct ptlrpc_sec *sec,
+                              struct ptlrpc_svc_ctx *svc_ctx);
+
+/*
+ * the timeout is only for the case that the upcall child process dies
+ * abnormally. in any other case it should eventually update the kernel key.
+ *
+ * FIXME we had better incorporate the client & server side upcall timeouts
+ * into the framework of Adaptive Timeouts, but we need to figure out how to
+ * make sure the kernel knows whether an upcall process is still in progress
+ * or died unexpectedly.
+ */
+#define KEYRING_UPCALL_TIMEOUT  (obd_timeout + obd_timeout)
+
+/****************************************
+ * internal helpers                 *
+ ****************************************/
+
+#define DUMP_PROCESS_KEYRINGS(tsk)                                     \
+{                                                                      \
+       CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): "                 \
+             "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n",           \
+             tsk->comm, tsk->pid, tsk->uid, tsk->fsuid,                \
+             tsk->parent->comm, tsk->parent->pid,                      \
+             tsk->parent->uid, tsk->parent->fsuid,                     \
+             tsk->request_key_auth ?                                   \
+             tsk->request_key_auth->serial : 0,                        \
+             key_cred(tsk)->thread_keyring ?                           \
+             key_cred(tsk)->thread_keyring->serial : 0,                \
+             key_tgcred(tsk)->process_keyring ?                        \
+             key_tgcred(tsk)->process_keyring->serial : 0,             \
+             key_tgcred(tsk)->session_keyring ?                        \
+             key_tgcred(tsk)->session_keyring->serial : 0,             \
+             key_cred(tsk)->user->uid_keyring ?                        \
+             key_cred(tsk)->user->uid_keyring->serial : 0,             \
+             key_cred(tsk)->user->session_keyring ?                    \
+             key_cred(tsk)->user->session_keyring->serial : 0,         \
+             key_cred(tsk)->jit_keyring                                \
+            );                                                         \
+}
+
+#define DUMP_KEY(key)                                             \
+{                                                                     \
+       CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n",            \
+             key, key->serial, atomic_read(&key->usage),              \
+             key->uid, key->gid,                                      \
+             key->description ? key->description : "n/a"              \
+            );                                                  \
+}
+
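+/* shorthand accessors for a task's credentials and thread-group credentials */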
+#define key_cred(tsk)   ((tsk)->cred)
+#define key_tgcred(tsk) ((tsk)->cred->tgcred)
+
+static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr)
+{
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+       mutex_lock(&gsec_kr->gsk_uc_lock);
+#endif
+}
+
+static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr)
+{
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+       mutex_unlock(&gsec_kr->gsk_uc_lock);
+#endif
+}
+
+static inline void key_revoke_locked(struct key *key)
+{
+       set_bit(KEY_FLAG_REVOKED, &key->flags);
+}
+
+static void ctx_upcall_timeout_kr(unsigned long data)
+{
+       struct ptlrpc_cli_ctx *ctx = (struct ptlrpc_cli_ctx *) data;
+       struct key          *key = ctx2gctx_keyring(ctx)->gck_key;
+
+       CWARN("ctx %p, key %p\n", ctx, key);
+
+       LASSERT(key);
+
+       cli_ctx_expire(ctx);
+       key_revoke_locked(key);
+}
+
+static
+void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout)
+{
+       struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+       struct timer_list         *timer = gctx_kr->gck_timer;
+
+       LASSERT(timer);
+
+       CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout);
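+       /* convert the timeout in seconds to an absolute jiffies value */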
+       timeout = timeout * HZ + cfs_time_current();
+
+       init_timer(timer);
+       timer->expires = timeout;
+       timer->data = (unsigned long ) ctx;
+       timer->function = ctx_upcall_timeout_kr;
+
+       add_timer(timer);
+}
+
+/*
+ * the caller should make sure there is no race with other threads
+ */
+static
+void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+       struct timer_list         *timer = gctx_kr->gck_timer;
+
+       if (timer == NULL)
+               return;
+
+       CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key);
+
+       gctx_kr->gck_timer = NULL;
+
+       del_singleshot_timer_sync(timer);
+
+       OBD_FREE_PTR(timer);
+}
+
+static
+struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec,
+                                    struct vfs_cred *vcred)
+{
+       struct ptlrpc_cli_ctx      *ctx;
+       struct gss_cli_ctx_keyring *gctx_kr;
+
+       OBD_ALLOC_PTR(gctx_kr);
+       if (gctx_kr == NULL)
+               return NULL;
+
+       OBD_ALLOC_PTR(gctx_kr->gck_timer);
+       if (gctx_kr->gck_timer == NULL) {
+               OBD_FREE_PTR(gctx_kr);
+               return NULL;
+       }
+       init_timer(gctx_kr->gck_timer);
+
+       ctx = &gctx_kr->gck_base.gc_base;
+
+       if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) {
+               OBD_FREE_PTR(gctx_kr->gck_timer);
+               OBD_FREE_PTR(gctx_kr);
+               return NULL;
+       }
+
+       ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT;
+       clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags);
+       atomic_inc(&ctx->cc_refcount); /* for the caller */
+
+       return ctx;
+}
+
+static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       struct ptlrpc_sec         *sec = ctx->cc_sec;
+       struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+
+       CDEBUG(D_SEC, "destroying ctx %p\n", ctx);
+
+       /* at this time the association with key has been broken. */
+       LASSERT(sec);
+       LASSERT(atomic_read(&sec->ps_refcount) > 0);
+       LASSERT(atomic_read(&sec->ps_nctx) > 0);
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+       LASSERT(gctx_kr->gck_key == NULL);
+
+       ctx_clear_timer_kr(ctx);
+       LASSERT(gctx_kr->gck_timer == NULL);
+
+       if (gss_cli_ctx_fini_common(sec, ctx))
+               return;
+
+       OBD_FREE_PTR(gctx_kr);
+
+       atomic_dec(&sec->ps_nctx);
+       sptlrpc_sec_put(sec);
+}
+
+static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+       if (sync) {
+               ctx_destroy_kr(ctx);
+       } else {
+               atomic_inc(&ctx->cc_refcount);
+               sptlrpc_gc_add_ctx(ctx);
+       }
+}
+
+static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+       if (atomic_dec_and_test(&ctx->cc_refcount))
+               ctx_release_kr(ctx, sync);
+}
+
+/*
+ * key <-> ctx association and rules:
+ * - ctx might not bind with any key
+ * - key/ctx binding is protected by the key semaphore (if the key is present)
+ * - key and ctx each take a reference of the other
+ * - ctx enlist/unlist is protected by the ctx spinlock
+ * - never enlist a ctx after it's been unlisted
+ * - whoever does the enlist should also do the bind, and lock the key before
+ *   enlisting:
+ *   - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key
+ * - whoever does the unlist should also do the unbind:
+ *   - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key
+ *   - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key
+ */
+
+static inline void spin_lock_if(spinlock_t *lock, int condition)
+{
+       if (condition)
+               spin_lock(lock);
+}
+
+static inline void spin_unlock_if(spinlock_t *lock, int condition)
+{
+       if (condition)
+               spin_unlock(lock);
+}
+
+static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked)
+{
+       struct ptlrpc_sec      *sec = ctx->cc_sec;
+       struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+
+       LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+       spin_lock_if(&sec->ps_lock, !locked);
+
+       atomic_inc(&ctx->cc_refcount);
+       set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+       hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist);
+       if (is_root)
+               gsec_kr->gsk_root_ctx = ctx;
+
+       spin_unlock_if(&sec->ps_lock, !locked);
+}
+
+/*
+ * Note that after this gets called, the caller should not access the ctx
+ * again because it might have been freed, unless the caller holds at least
+ * one refcount of the ctx.
+ *
+ * return non-zero if we indeed unlisted this ctx.
+ */
+static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked)
+{
+       struct ptlrpc_sec       *sec = ctx->cc_sec;
+       struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+
+       /* if hashed bit has gone, leave the job to somebody who is doing it */
+       if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0)
+               return 0;
+
+       /* drop ref inside spin lock to prevent race with other operations */
+       spin_lock_if(&sec->ps_lock, !locked);
+
+       if (gsec_kr->gsk_root_ctx == ctx)
+               gsec_kr->gsk_root_ctx = NULL;
+       hlist_del_init(&ctx->cc_cache);
+       atomic_dec(&ctx->cc_refcount);
+
+       spin_unlock_if(&sec->ps_lock, !locked);
+
+       return 1;
+}
+
+/*
+ * bind a key with a ctx together.
+ * caller must hold write lock of the key, as well as ref on key & ctx.
+ */
+static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(atomic_read(&key->usage) > 0);
+       LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL);
+       LASSERT(key->payload.data == NULL);
+
+       /* at this time the context may or may not be in the list. */
+       key_get(key);
+       atomic_inc(&ctx->cc_refcount);
+       ctx2gctx_keyring(ctx)->gck_key = key;
+       key->payload.data = ctx;
+}
+
+/*
+ * unbind a key and a ctx.
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(key->payload.data == ctx);
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+
+       /* must revoke the key, or others may treat it as newly created */
+       key_revoke_locked(key);
+
+       key->payload.data = NULL;
+       ctx2gctx_keyring(ctx)->gck_key = NULL;
+
+       /* once ctx get split from key, the timer is meaningless */
+       ctx_clear_timer_kr(ctx);
+
+       ctx_put_kr(ctx, 1);
+       key_put(key);
+}
+
+/*
+ * given a ctx, unbind with its coupled key, if any.
+ * unbind can only be called once, so we don't need to worry about the key
+ * being released by someone else.
+ */
+static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       struct key      *key = ctx2gctx_keyring(ctx)->gck_key;
+
+       if (key) {
+               LASSERT(key->payload.data == ctx);
+
+               key_get(key);
+               down_write(&key->sem);
+               unbind_key_ctx(key, ctx);
+               up_write(&key->sem);
+               key_put(key);
+       }
+}
+
+/*
+ * given a key, unbind with its coupled ctx, if any.
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void unbind_key_locked(struct key *key)
+{
+       struct ptlrpc_cli_ctx   *ctx = key->payload.data;
+
+       if (ctx)
+               unbind_key_ctx(key, ctx);
+}
+
+/*
+ * unlist a ctx, and unbind from coupled key
+ */
+static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       if (ctx_unlist_kr(ctx, 0))
+               unbind_ctx_kr(ctx);
+}
+
+/*
+ * given a key, unlist and unbind with the coupled ctx (if any).
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void kill_key_locked(struct key *key)
+{
+       struct ptlrpc_cli_ctx *ctx = key->payload.data;
+
+       if (ctx && ctx_unlist_kr(ctx, 0))
+               unbind_key_locked(key);
+}
+
+/*
+ * caller should hold one ref on contexts in freelist.
+ */
+static void dispose_ctx_list_kr(struct hlist_head *freelist)
+{
+       struct hlist_node      *next;
+       struct ptlrpc_cli_ctx  *ctx;
+       struct gss_cli_ctx     *gctx;
+
+       hlist_for_each_entry_safe(ctx, next, freelist, cc_cache) {
+               hlist_del_init(&ctx->cc_cache);
+
+               /* reverse ctx: update the current seq to the buddy svcctx if
+                * it exists. ideally this should be done in
+                * gss_cli_ctx_finalize(), but the ctx destroy could be delayed
+                * because:
+                *  1) the ctx still has references;
+                *  2) ctx destroy is asynchronous;
+                * and the reverse import calling inval_all_ctx() requires this
+                * be done _immediately_, otherwise a newly created reverse ctx
+                * might copy a very old sequence number from the svcctx. */
+               gctx = ctx2gctx(ctx);
+               if (!rawobj_empty(&gctx->gc_svc_handle) &&
+                   sec_is_reverse(gctx->gc_base.cc_sec)) {
+                       gss_svc_upcall_update_sequence(&gctx->gc_svc_handle,
+                                       (__u32) atomic_read(&gctx->gc_seq));
+               }
+
+               /* we need to wake up waiting reqs here. the context might
+                * be force-released before the upcall finishes, in which case
+                * the late-arriving downcall cannot even find the ctx. */
+               sptlrpc_cli_ctx_wakeup(ctx);
+
+               unbind_ctx_kr(ctx);
+               ctx_put_kr(ctx, 0);
+       }
+}
+
+/*
+ * lookup a root context directly in a sec, return root ctx with a
+ * reference taken or NULL.
+ */
+static
+struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec)
+{
+       struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+       struct ptlrpc_cli_ctx   *ctx = NULL;
+
+       spin_lock(&sec->ps_lock);
+
+       ctx = gsec_kr->gsk_root_ctx;
+
+       if (ctx == NULL && unlikely(sec_is_reverse(sec))) {
+               struct ptlrpc_cli_ctx  *tmp;
+
+               /* reverse ctx: search for the root ctx in the list, choosing
+                * the one with the shortest expiry time, which most likely
+                * has an established peer ctx on the client side. */
+               hlist_for_each_entry(tmp, &gsec_kr->gsk_clist, cc_cache) {
+                       if (ctx == NULL || ctx->cc_expire == 0 ||
+                           ctx->cc_expire > tmp->cc_expire) {
+                               ctx = tmp;
+                               /* promote to be root_ctx */
+                               gsec_kr->gsk_root_ctx = ctx;
+                       }
+               }
+       }
+
+       if (ctx) {
+               LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+               LASSERT(!hlist_empty(&gsec_kr->gsk_clist));
+               atomic_inc(&ctx->cc_refcount);
+       }
+
+       spin_unlock(&sec->ps_lock);
+
+       return ctx;
+}
+
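+/* when a new reverse root ctx is installed, existing ctxs are left at most
+ * this many seconds before they expire */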
+#define RVS_CTX_EXPIRE_NICE    (10)
+
+static
+void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec,
+                                struct ptlrpc_cli_ctx *new_ctx,
+                                struct key *key)
+{
+       struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+       struct ptlrpc_cli_ctx  *ctx;
+       cfs_time_t            now;
+       ENTRY;
+
+       LASSERT(sec_is_reverse(sec));
+
+       spin_lock(&sec->ps_lock);
+
+       now = cfs_time_current_sec();
+
+       /* set all existing ctxs short expiry */
+       hlist_for_each_entry(ctx, &gsec_kr->gsk_clist, cc_cache) {
+               if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) {
+                       ctx->cc_early_expire = 1;
+                       ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE;
+               }
+       }
+
+       /* if there is already a root_ctx, instead of obsoleting the current
+        * one immediately, we let it keep operating for a little while.
+        * hopefully by the time the first backward rpc with the newest ctx is
+        * sent out, the client side already has the peer ctx well
+        * established. */
+       ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 0 : 1, 1);
+
+       if (key)
+               bind_key_ctx(key, new_ctx);
+
+       spin_unlock(&sec->ps_lock);
+}
+
+static void construct_key_desc(void *buf, int bufsize,
+                              struct ptlrpc_sec *sec, uid_t uid)
+{
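+       /* the key description is "<uid>@<sec id in hex>", e.g. "0@1f"
+        * (illustrative) for root on the sec whose ps_id is 0x1f */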
+       snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id);
+       ((char *)buf)[bufsize - 1] = '\0';
+}
+
+/****************************************
+ * sec apis                         *
+ ****************************************/
+
+static
+struct ptlrpc_sec * gss_sec_create_kr(struct obd_import *imp,
+                                     struct ptlrpc_svc_ctx *svcctx,
+                                     struct sptlrpc_flavor *sf)
+{
+       struct gss_sec_keyring  *gsec_kr;
+       ENTRY;
+
+       OBD_ALLOC(gsec_kr, sizeof(*gsec_kr));
+       if (gsec_kr == NULL)
+               RETURN(NULL);
+
+       INIT_HLIST_HEAD(&gsec_kr->gsk_clist);
+       gsec_kr->gsk_root_ctx = NULL;
+       mutex_init(&gsec_kr->gsk_root_uc_lock);
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+       mutex_init(&gsec_kr->gsk_uc_lock);
+#endif
+
+       if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring,
+                                 imp, svcctx, sf))
+               goto err_free;
+
+       if (svcctx != NULL &&
+           sec_install_rctx_kr(&gsec_kr->gsk_base.gs_base, svcctx)) {
+               gss_sec_destroy_common(&gsec_kr->gsk_base);
+               goto err_free;
+       }
+
+       RETURN(&gsec_kr->gsk_base.gs_base);
+
+err_free:
+       OBD_FREE(gsec_kr, sizeof(*gsec_kr));
+       RETURN(NULL);
+}
+
+static
+void gss_sec_destroy_kr(struct ptlrpc_sec *sec)
+{
+       struct gss_sec    *gsec = sec2gsec(sec);
+       struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+
+       CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec);
+
+       LASSERT(hlist_empty(&gsec_kr->gsk_clist));
+       LASSERT(gsec_kr->gsk_root_ctx == NULL);
+
+       gss_sec_destroy_common(gsec);
+
+       OBD_FREE(gsec_kr, sizeof(*gsec_kr));
+}
+
+static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred)
+{
+       /* apart from the ROOTONLY flag, treat it as the root user only if the
+        * real uid is 0; euid/fsuid being 0 is handled as a setuid scenario */
+       if (sec_is_rootonly(sec) || (vcred->vc_uid == 0))
+               return 1;
+       else
+               return 0;
+}
+
+/*
+ * unlink the request key from its ring, which was linked during request_key().
+ * sadly, we have to 'guess' which keyring it's linked to.
+ *
+ * FIXME this code is fragile; it depends on how request_key_link() is
+ * implemented.
+ */
+static void request_key_unlink(struct key *key)
+{
+       struct task_struct *tsk = current;
+       struct key *ring;
+
+       switch (key_cred(tsk)->jit_keyring) {
+       case KEY_REQKEY_DEFL_DEFAULT:
+       case KEY_REQKEY_DEFL_THREAD_KEYRING:
+               ring = key_get(key_cred(tsk)->thread_keyring);
+               if (ring)
+                       break;
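+               /* fall through: no thread keyring, try the process keyring */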
+       case KEY_REQKEY_DEFL_PROCESS_KEYRING:
+               ring = key_get(key_tgcred(tsk)->process_keyring);
+               if (ring)
+                       break;
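+               /* fall through: no process keyring, try the session keyring */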
+       case KEY_REQKEY_DEFL_SESSION_KEYRING:
+               rcu_read_lock();
+               ring = key_get(rcu_dereference(key_tgcred(tsk)
+                                              ->session_keyring));
+               rcu_read_unlock();
+               if (ring)
+                       break;
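+               /* fall through: no session keyring, use the user session keyring */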
+       case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
+               ring = key_get(key_cred(tsk)->user->session_keyring);
+               break;
+       case KEY_REQKEY_DEFL_USER_KEYRING:
+               ring = key_get(key_cred(tsk)->user->uid_keyring);
+               break;
+       case KEY_REQKEY_DEFL_GROUP_KEYRING:
+       default:
+               LBUG();
+       }
+
+       LASSERT(ring);
+       key_unlink(ring, key);
+       key_put(ring);
+}
+
+static
+struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec,
+                                             struct vfs_cred *vcred,
+                                             int create, int remove_dead)
+{
+       struct obd_import       *imp = sec->ps_import;
+       struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+       struct ptlrpc_cli_ctx   *ctx = NULL;
+       unsigned int         is_root = 0, create_new = 0;
+       struct key            *key;
+       char                 desc[24];
+       char                *coinfo;
+       int                   coinfo_size;
+       char                *co_flags = "";
+       ENTRY;
+
+       LASSERT(imp != NULL);
+
+       is_root = user_is_root(sec, vcred);
+
+       /* a small optimization for the root context */
+       if (is_root) {
+               ctx = sec_lookup_root_ctx_kr(sec);
+               /*
+                * Only lookup directly for REVERSE sec, which should
+                * always succeed.
+                */
+               if (ctx || sec_is_reverse(sec))
+                       RETURN(ctx);
+       }
+
+       LASSERT(create != 0);
+
+       /* for the root context, obtain the lock and check again, this time
+        * holding the root upcall lock, to make sure nobody else populated a
+        * new root context after the last check. */
+       if (is_root) {
+               mutex_lock(&gsec_kr->gsk_root_uc_lock);
+
+               ctx = sec_lookup_root_ctx_kr(sec);
+               if (ctx)
+                       goto out;
+
+               /* update reverse handle for root user */
+               sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index();
+
+               switch (sec->ps_part) {
+               case LUSTRE_SP_MDT:
+                       co_flags = "m";
+                       break;
+               case LUSTRE_SP_OST:
+                       co_flags = "o";
+                       break;
+               case LUSTRE_SP_MGC:
+                       co_flags = "rmo";
+                       break;
+               case LUSTRE_SP_CLI:
+                       co_flags = "r";
+                       break;
+               case LUSTRE_SP_MGS:
+               default:
+                       LBUG();
+               }
+       }
+
+       /* in the setuid case, the key will be constructed with fsuid/fsgid as
+        * the owner, but we do authentication based on the real uid/gid. the
+        * key permission bits will be exactly POS_ALL, so only processes that
+        * subscribed to this key can access it, although the quota might be
+        * counted against others (fsuid/fsgid).
+        *
+        * the keyring will use fsuid/fsgid as upcall parameters, so we have to
+        * encode the real uid/gid into the callout info.
+        */
+
+       construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid);
+
+       /* callout info format:
+        * secid:mech:uid:gid:flags:svc_type:peer_nid:target_uuid
+        */
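+       /* e.g. "5:krb5:1000:1000:r:<svc>:<peer nid>:<target uuid>"
+        * (illustrative only; see the snprintf() below for the exact fields) */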
+       coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64;
+       OBD_ALLOC(coinfo, coinfo_size);
+       if (coinfo == NULL)
+               goto out;
+
+       snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%d:"LPX64":%s",
+                sec->ps_id, sec2gsec(sec)->gs_mech->gm_name,
+                vcred->vc_uid, vcred->vc_gid,
+                co_flags, import_to_gss_svc(imp),
+                imp->imp_connection->c_peer.nid, imp->imp_obd->obd_name);
+
+       CDEBUG(D_SEC, "requesting key for %s\n", desc);
+
+       keyring_upcall_lock(gsec_kr);
+       key = request_key(&gss_key_type, desc, coinfo);
+       keyring_upcall_unlock(gsec_kr);
+
+       OBD_FREE(coinfo, coinfo_size);
+
+       if (IS_ERR(key)) {
+               CERROR("failed request key: %ld\n", PTR_ERR(key));
+               goto out;
+       }
+       CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc);
+
+       /* once payload.data has been pointed to a ctx, it never changes until
+        * we de-associate them; but a parallel request_key() may return
+        * a key with payload.data == NULL at the same time. so we still
+        * need the write lock of key->sem to serialize them. */
+       down_write(&key->sem);
+
+       if (likely(key->payload.data != NULL)) {
+               ctx = key->payload.data;
+
+               LASSERT(atomic_read(&ctx->cc_refcount) >= 1);
+               LASSERT(ctx2gctx_keyring(ctx)->gck_key == key);
+               LASSERT(atomic_read(&key->usage) >= 2);
+
+               /* simply take a ref and return. it's the upper layer's
+                * responsibility to detect & replace a dead ctx. */
+               atomic_inc(&ctx->cc_refcount);
+       } else {
+               /* pre-initialize with a cli_ctx. this can't be done in
+                * key_instantiate() because we don't have enough information
+                * there. */
+               ctx = ctx_create_kr(sec, vcred);
+               if (ctx != NULL) {
+                       ctx_enlist_kr(ctx, is_root, 0);
+                       bind_key_ctx(key, ctx);
+
+                       ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT);
+
+                       CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n",
+                              key, ctx, sec);
+               } else {
+                       /* we'd prefer to call key_revoke(), but we'd rather
+                        * revoke it within this key->sem locked period. */
+                       key_revoke_locked(key);
+               }
+
+               create_new = 1;
+       }
+
+       up_write(&key->sem);
+
+       if (is_root && create_new)
+               request_key_unlink(key);
+
+       key_put(key);
+out:
+       if (is_root)
+               mutex_unlock(&gsec_kr->gsk_root_uc_lock);
+       RETURN(ctx);
+}
+
+static
+void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx,
+                           int sync)
+{
+       LASSERT(atomic_read(&sec->ps_refcount) > 0);
+       LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+       ctx_release_kr(ctx, sync);
+}
+
+/*
+ * flush the context of a normal user; we must resort to the keyring itself
+ * to find the contexts which belong to me.
+ *
+ * Note that here we only expect to flush _my_ context; the "uid" will
+ * be ignored in the search.
+ */
+static
+void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec,
+                            uid_t uid,
+                            int grace, int force)
+{
+       struct key            *key;
+       char                 desc[24];
+
+       /* nothing to do for reverse or rootonly sec */
+       if (sec_is_reverse(sec) || sec_is_rootonly(sec))
+               return;
+
+       construct_key_desc(desc, sizeof(desc), sec, uid);
+
+       /* there should be only one valid key, but we loop here to
+        * handle any weird cases */
+       for (;;) {
+               key = request_key(&gss_key_type, desc, NULL);
+               if (IS_ERR(key)) {
+                       CDEBUG(D_SEC, "No more key found for current user\n");
+                       break;
+               }
+
+               down_write(&key->sem);
+
+               kill_key_locked(key);
+
+               /* kill_key_locked() should usually revoke the key, but we
+                * revoke it again to make sure, e.g. in some cases the key
+                * may not be well coupled with a context. */
+               key_revoke_locked(key);
+
+               up_write(&key->sem);
+
+               key_put(key);
+       }
+}
+
+/*
+ * flush context of root or all, we iterate through the list.
+ */
+static
+void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec,
+                            uid_t uid,
+                            int grace, int force)
+{
+       struct gss_sec_keyring *gsec_kr;
+       struct hlist_head       freelist = HLIST_HEAD_INIT;
+       struct hlist_node      *next;
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       gsec_kr = sec2gsec_keyring(sec);
+
+       spin_lock(&sec->ps_lock);
+       hlist_for_each_entry_safe(ctx, next,
+                                     &gsec_kr->gsk_clist, cc_cache) {
+               LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
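+               /* uid == -1 means flush every context; otherwise only this
+                * user's contexts */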
+               if (uid != -1 && uid != ctx->cc_vcred.vc_uid)
+                       continue;
+
+               /* at this moment there are at least 2 base references:
+                * the key association and being in the list. */
+               if (atomic_read(&ctx->cc_refcount) > 2) {
+                       if (!force)
+                               continue;
+                       CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n",
+                             ctx, ctx->cc_vcred.vc_uid,
+                             sec2target_str(ctx->cc_sec),
+                             atomic_read(&ctx->cc_refcount) - 2);
+               }
+
+               set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags);
+               if (!grace)
+                       clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+               atomic_inc(&ctx->cc_refcount);
+
+               if (ctx_unlist_kr(ctx, 1)) {
+                       hlist_add_head(&ctx->cc_cache, &freelist);
+               } else {
+                       LASSERT(atomic_read(&ctx->cc_refcount) >= 2);
+                       atomic_dec(&ctx->cc_refcount);
+               }
+       }
+       spin_unlock(&sec->ps_lock);
+
+       dispose_ctx_list_kr(&freelist);
+       EXIT;
+}
+
+static
+int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec,
+                              uid_t uid, int grace, int force)
+{
+       ENTRY;
+
+       CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n",
+              sec, atomic_read(&sec->ps_refcount),
+              atomic_read(&sec->ps_nctx),
+              uid, grace, force);
+
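+       /* normal users are flushed via the keyring; root (0) or -1 (all)
+        * walk the context list directly */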
+       if (uid != -1 && uid != 0)
+               flush_user_ctx_cache_kr(sec, uid, grace, force);
+       else
+               flush_spec_ctx_cache_kr(sec, uid, grace, force);
+
+       RETURN(0);
+}
+
+static
+void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec)
+{
+       struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+       struct hlist_head       freelist = HLIST_HEAD_INIT;
+       struct hlist_node      *next;
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       CWARN("running gc\n");
+
+       spin_lock(&sec->ps_lock);
+       hlist_for_each_entry_safe(ctx, next,
+                                     &gsec_kr->gsk_clist, cc_cache) {
+               LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+               atomic_inc(&ctx->cc_refcount);
+
+               if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) {
+                       hlist_add_head(&ctx->cc_cache, &freelist);
+                       CWARN("unhashed ctx %p\n", ctx);
+               } else {
+                       LASSERT(atomic_read(&ctx->cc_refcount) >= 2);
+                       atomic_dec(&ctx->cc_refcount);
+               }
+       }
+       spin_unlock(&sec->ps_lock);
+
+       dispose_ctx_list_kr(&freelist);
+       EXIT;
+       return;
+}
+
+static
+int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq)
+{
+       struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+       struct hlist_node      *next;
+       struct ptlrpc_cli_ctx  *ctx;
+       struct gss_cli_ctx     *gctx;
+       time_t            now = cfs_time_current_sec();
+       ENTRY;
+
+       spin_lock(&sec->ps_lock);
+       hlist_for_each_entry_safe(ctx, next,
+                                 &gsec_kr->gsk_clist, cc_cache) {
+               struct key           *key;
+               char                flags_str[40];
+               char                mech[40];
+
+               gctx = ctx2gctx(ctx);
+               key = ctx2gctx_keyring(ctx)->gck_key;
+
+               gss_cli_ctx_flags2str(ctx->cc_flags,
+                                     flags_str, sizeof(flags_str));
+
+               if (gctx->gc_mechctx)
+                       lgss_display(gctx->gc_mechctx, mech, sizeof(mech));
+               else
+                       snprintf(mech, sizeof(mech), "N/A");
+               mech[sizeof(mech) - 1] = '\0';
+
+               seq_printf(seq, "%p: uid %u, ref %d, expire %ld(%+ld), fl %s, "
+                          "seq %d, win %u, key %08x(ref %d), "
+                          "hdl "LPX64":"LPX64", mech: %s\n",
+                          ctx, ctx->cc_vcred.vc_uid,
+                          atomic_read(&ctx->cc_refcount),
+                          ctx->cc_expire,
+                          ctx->cc_expire ?  ctx->cc_expire - now : 0,
+                          flags_str,
+                          atomic_read(&gctx->gc_seq),
+                          gctx->gc_win,
+                          key ? key->serial : 0,
+                          key ? atomic_read(&key->usage) : 0,
+                          gss_handle_to_u64(&gctx->gc_handle),
+                          gss_handle_to_u64(&gctx->gc_svc_handle),
+                          mech);
+       }
+       spin_unlock(&sec->ps_lock);
+
+       RETURN(0);
+}
+
+/****************************************
+ * cli_ctx apis                         *
+ ****************************************/
+
+static
+int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       /* upcall is already on the way */
+       return 0;
+}
+
+static
+int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(ctx->cc_sec);
+
+       if (cli_ctx_check_death(ctx)) {
+               kill_ctx_kr(ctx);
+               return 1;
+       }
+
+       if (cli_ctx_is_ready(ctx))
+               return 0;
+       return 1;
+}
+
+static
+void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(ctx->cc_sec);
+
+       cli_ctx_expire(ctx);
+       kill_ctx_kr(ctx);
+}
+
+/****************************************
+ * (reverse) service               *
+ ****************************************/
+
+/*
+ * a reverse context need not have anything to do with keyrings. here we still
+ * keep the version which binds to a key, for future reference.
+ */
+#define HAVE_REVERSE_CTX_NOKEY
+
+
+static
+int sec_install_rctx_kr(struct ptlrpc_sec *sec,
+                       struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct ptlrpc_cli_ctx   *cli_ctx;
+       struct vfs_cred   vcred = { 0, 0 };
+       int                   rc;
+
+       LASSERT(sec);
+       LASSERT(svc_ctx);
+
+       cli_ctx = ctx_create_kr(sec, &vcred);
+       if (cli_ctx == NULL)
+               return -ENOMEM;
+
+       rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx);
+       if (rc) {
+               CERROR("failed copy reverse cli ctx: %d\n", rc);
+
+               ctx_put_kr(cli_ctx, 1);
+               return rc;
+       }
+
+       rvs_sec_install_root_ctx_kr(sec, cli_ctx, NULL);
+
+       ctx_put_kr(cli_ctx, 1);
+
+       return 0;
+}
+
+
+/****************************************
+ * service apis                         *
+ ****************************************/
+
+static
+int gss_svc_accept_kr(struct ptlrpc_request *req)
+{
+       return gss_svc_accept(&gss_policy_keyring, req);
+}
+
+static
+int gss_svc_install_rctx_kr(struct obd_import *imp,
+                           struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct ptlrpc_sec *sec;
+       int             rc;
+
+       sec = sptlrpc_import_sec_ref(imp);
+       LASSERT(sec);
+
+       rc = sec_install_rctx_kr(sec, svc_ctx);
+       sptlrpc_sec_put(sec);
+
+       return rc;
+}
+
+/****************************************
+ * key apis                         *
+ ****************************************/
+
+static
+int gss_kt_instantiate(struct key *key, const void *data, size_t datalen)
+{
+       int          rc;
+       ENTRY;
+
+       if (data != NULL || datalen != 0) {
+               CERROR("invalid: data %p, len %lu\n", data, (long)datalen);
+               RETURN(-EINVAL);
+       }
+
+       if (key->payload.data != NULL) {
+               CERROR("key already has payload\n");
+               RETURN(-EINVAL);
+       }
+
+       /* link the key to the session keyring, so the subsequent context
+        * negotiation rpc fired from user space can find this key. It will be
+        * unlinked automatically when the upcall process dies.
+        *
+        * we can't do this through keyctl from userspace, because the upcall
+        * might be neither possessor nor owner of the key (setuid).
+        *
+        * the session keyring is created upon upcall, and doesn't change
+        * until the upcall finishes, so the rcu lock is not needed here.
+        */
+       LASSERT(key_tgcred(current)->session_keyring);
+
+       lockdep_off();
+       rc = key_link(key_tgcred(current)->session_keyring, key);
+       lockdep_on();
+       if (unlikely(rc)) {
+               CERROR("failed to link key %08x to keyring %08x: %d\n",
+                      key->serial,
+                      key_tgcred(current)->session_keyring->serial, rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key, key->payload.data);
+       RETURN(0);
+}
+
+/*
+ * called with the key semaphore write-locked. it means we can operate
+ * on the context without fear of losing the refcount.
+ */
+static
+int gss_kt_update(struct key *key, const void *data, size_t datalen)
+{
+       struct ptlrpc_cli_ctx   *ctx = key->payload.data;
+       struct gss_cli_ctx      *gctx;
+       rawobj_t                 tmpobj = RAWOBJ_EMPTY;
+       __u32               datalen32 = (__u32) datalen;
+       int                   rc;
+       ENTRY;
+
+       if (data == NULL || datalen == 0) {
+               CWARN("invalid: data %p, len %lu\n", data, (long)datalen);
+               RETURN(-EINVAL);
+       }
+
+       /* if the upcall finished negotiation too fast (most likely because
+        * a local error happened) and called kt_update(), the ctx
+        * might still be NULL. but the key will finally be associated
+        * with a context, or be revoked. if the key status is fine, return
+        * -EAGAIN to allow userspace to sleep a while and call again. */
+       if (ctx == NULL) {
+               CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n",
+                     key, key->serial, key->flags);
+
+               rc = key_validate(key);
+               if (rc == 0)
+                       RETURN(-EAGAIN);
+               else
+                       RETURN(rc);
+       }
+
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(ctx->cc_sec);
+
+       ctx_clear_timer_kr(ctx);
+
+       /* don't proceed if already refreshed */
+       if (cli_ctx_is_refreshed(ctx)) {
+               CWARN("ctx already done refresh\n");
+               RETURN(0);
+       }
+
+       sptlrpc_cli_ctx_get(ctx);
+       gctx = ctx2gctx(ctx);
+
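+       /* downcall data layout: the sequence window comes first; if it is 0
+        * the negotiation failed and (rpc err, gss err) follow, otherwise the
+        * context handle and the raw mech context follow */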
+       rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win,
+                                 sizeof(gctx->gc_win));
+       if (rc) {
+               CERROR("failed extract seq_win\n");
+               goto out;
+       }
+
+       if (gctx->gc_win == 0) {
+               __u32   nego_rpc_err, nego_gss_err;
+
+               rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err,
+                                         sizeof(nego_rpc_err));
+               if (rc) {
+                       CERROR("failed to extract rpc rc\n");
+                       goto out;
+               }
+
+               rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err,
+                                         sizeof(nego_gss_err));
+               if (rc) {
+                       CERROR("failed to extract gss rc\n");
+                       goto out;
+               }
+
+               CERROR("negotiation: rpc err %d, gss err %x\n",
+                      nego_rpc_err, nego_gss_err);
+
+               rc = nego_rpc_err ? nego_rpc_err : -EACCES;
+       } else {
+               rc = rawobj_extract_local_alloc(&gctx->gc_handle,
+                                               (__u32 **) &data, &datalen32);
+               if (rc) {
+                       CERROR("failed extract handle\n");
+                       goto out;
+               }
+
+               rc = rawobj_extract_local(&tmpobj, (__u32 **) &data,
+                                         &datalen32);
+               if (rc) {
+                       CERROR("failed extract mech\n");
+                       goto out;
+               }
+
+               rc = lgss_import_sec_context(&tmpobj,
+                                            sec2gsec(ctx->cc_sec)->gs_mech,
+                                            &gctx->gc_mechctx);
+               if (rc != GSS_S_COMPLETE)
+                       CERROR("failed import context\n");
+               else
+                       rc = 0;
+       }
+out:
+       /* we don't care about the current status of this ctx, even if someone
+        * else is operating on the ctx at the same time. we just add our own
+        * opinion here. */
+       if (rc == 0) {
+               gss_cli_ctx_uptodate(gctx);
+       } else {
+               /* this will also revoke the key. it has to be done before
+                * waking up the waiters, otherwise they could find the stale
+                * key */
+               kill_key_locked(key);
+
+               cli_ctx_expire(ctx);
+
+               if (rc != -ERESTART)
+                       set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+       }
+
+       /* let user space think it's a success */
+       sptlrpc_cli_ctx_put(ctx, 1);
+       RETURN(0);
+}
+
+static
+int gss_kt_match(const struct key *key, const void *desc)
+{
+       return (strcmp(key->description, (const char *) desc) == 0);
+}
+
+static
+void gss_kt_destroy(struct key *key)
+{
+       ENTRY;
+       LASSERT(key->payload.data == NULL);
+       CDEBUG(D_SEC, "destroy key %p\n", key);
+       EXIT;
+}
+
+static
+void gss_kt_describe(const struct key *key, struct seq_file *s)
+{
+       if (key->description == NULL)
+               seq_puts(s, "[null]");
+       else
+               seq_puts(s, key->description);
+}
+
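+/* the "lgssc" key type: request_key() triggers the userspace upcall, which
+ * instantiates the key with an empty payload and later feeds the negotiation
+ * result back through gss_kt_update() */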
+static struct key_type gss_key_type =
+{
+       .name      = "lgssc",
+       .def_datalen    = 0,
+       .instantiate    = gss_kt_instantiate,
+       .update  = gss_kt_update,
+       .match    = gss_kt_match,
+       .destroy        = gss_kt_destroy,
+       .describe       = gss_kt_describe,
+};
+
+/****************************************
+ * lustre gss keyring policy       *
+ ****************************************/
+
+static struct ptlrpc_ctx_ops gss_keyring_ctxops = {
+       .match            = gss_cli_ctx_match,
+       .refresh                = gss_cli_ctx_refresh_kr,
+       .validate              = gss_cli_ctx_validate_kr,
+       .die                = gss_cli_ctx_die_kr,
+       .sign              = gss_cli_ctx_sign,
+       .verify          = gss_cli_ctx_verify,
+       .seal              = gss_cli_ctx_seal,
+       .unseal          = gss_cli_ctx_unseal,
+       .wrap_bulk            = gss_cli_ctx_wrap_bulk,
+       .unwrap_bulk        = gss_cli_ctx_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops gss_sec_keyring_cops = {
+       .create_sec          = gss_sec_create_kr,
+       .destroy_sec        = gss_sec_destroy_kr,
+       .kill_sec              = gss_sec_kill,
+       .lookup_ctx          = gss_sec_lookup_ctx_kr,
+       .release_ctx        = gss_sec_release_ctx_kr,
+       .flush_ctx_cache        = gss_sec_flush_ctx_cache_kr,
+       .gc_ctx          = gss_sec_gc_ctx_kr,
+       .install_rctx      = gss_sec_install_rctx,
+       .alloc_reqbuf      = gss_alloc_reqbuf,
+       .free_reqbuf        = gss_free_reqbuf,
+       .alloc_repbuf      = gss_alloc_repbuf,
+       .free_repbuf        = gss_free_repbuf,
+       .enlarge_reqbuf  = gss_enlarge_reqbuf,
+       .display                = gss_sec_display_kr,
+};
+
+static struct ptlrpc_sec_sops gss_sec_keyring_sops = {
+       .accept          = gss_svc_accept_kr,
+       .invalidate_ctx  = gss_svc_invalidate_ctx,
+       .alloc_rs              = gss_svc_alloc_rs,
+       .authorize            = gss_svc_authorize,
+       .free_rs                = gss_svc_free_rs,
+       .free_ctx              = gss_svc_free_ctx,
+       .prep_bulk            = gss_svc_prep_bulk,
+       .unwrap_bulk        = gss_svc_unwrap_bulk,
+       .wrap_bulk            = gss_svc_wrap_bulk,
+       .install_rctx      = gss_svc_install_rctx_kr,
+};
+
+static struct ptlrpc_sec_policy gss_policy_keyring = {
+       .sp_owner              = THIS_MODULE,
+       .sp_name                = "gss.keyring",
+       .sp_policy            = SPTLRPC_POLICY_GSS,
+       .sp_cops                = &gss_sec_keyring_cops,
+       .sp_sops                = &gss_sec_keyring_sops,
+};
+
+
+int __init gss_init_keyring(void)
+{
+       int rc;
+
+       rc = register_key_type(&gss_key_type);
+       if (rc) {
+               CERROR("failed to register keyring type: %d\n", rc);
+               return rc;
+       }
+
+       rc = sptlrpc_register_policy(&gss_policy_keyring);
+       if (rc) {
+               unregister_key_type(&gss_key_type);
+               return rc;
+       }
+
+       return 0;
+}
+
+void __exit gss_exit_keyring(void)
+{
+       unregister_key_type(&gss_key_type);
+       sptlrpc_unregister_policy(&gss_policy_keyring);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h
new file mode 100644 (file)
index 0000000..676d4b9
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/include/linux/sunrpc/gss_krb5_types.h
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ *  Bruce Fields   <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#ifndef PTLRPC_GSS_KRB5_H
+#define PTLRPC_GSS_KRB5_H
+
+/*
+ * RFC 4121
+ */
+
+#define KG_USAGE_ACCEPTOR_SEAL   22
+#define KG_USAGE_ACCEPTOR_SIGN   23
+#define KG_USAGE_INITIATOR_SEAL         24
+#define KG_USAGE_INITIATOR_SIGN         25
+
+#define KG_TOK_MIC_MSG           0x0404
+#define KG_TOK_WRAP_MSG                 0x0504
+
+#define FLAG_SENDER_IS_ACCEPTOR         0x01
+#define FLAG_WRAP_CONFIDENTIAL   0x02
+#define FLAG_ACCEPTOR_SUBKEY       0x04
+
+struct krb5_header {
+       __u16      kh_tok_id;      /* token id */
+       __u8        kh_flags;       /* acceptor flags */
+       __u8        kh_filler;      /* 0xff */
+       __u16      kh_ec;         /* extra count */
+       __u16      kh_rrc;       /* right rotation count */
+       __u64      kh_seq;       /* sequence number */
+       __u8        kh_cksum[0];    /* checksum */
+};
+
+struct krb5_keyblock {
+       rawobj_t                 kb_key;
+       struct ll_crypto_cipher *kb_tfm;
+};
+
+struct krb5_ctx {
+       unsigned int        kc_initiate:1,
+                               kc_cfx:1,
+                               kc_seed_init:1,
+                               kc_have_acceptor_subkey:1;
+       __s32              kc_endtime;
+       __u8                kc_seed[16];
+       __u64              kc_seq_send;
+       __u64              kc_seq_recv;
+       __u32              kc_enctype;
+       struct krb5_keyblock    kc_keye;        /* encryption */
+       struct krb5_keyblock    kc_keyi;        /* integrity */
+       struct krb5_keyblock    kc_keyc;        /* checksum */
+       rawobj_t                kc_mech_used;
+};
+
+enum sgn_alg {
+       SGN_ALG_DES_MAC_MD5        = 0x0000,
+       SGN_ALG_MD2_5            = 0x0001,
+       SGN_ALG_DES_MAC        = 0x0002,
+       SGN_ALG_3                    = 0x0003, /* not published */
+       SGN_ALG_HMAC_MD5              = 0x0011, /* microsoft w2k; no support */
+       SGN_ALG_HMAC_SHA1_DES3_KD     = 0x0004
+};
+
+enum seal_alg {
+       SEAL_ALG_NONE            = 0xffff,
+       SEAL_ALG_DES              = 0x0000,
+       SEAL_ALG_1                  = 0x0001, /* not published */
+       SEAL_ALG_MICROSOFT_RC4  = 0x0010, /* microsoft w2k; no support */
+       SEAL_ALG_DES3KD        = 0x0002
+};
+
+#define CKSUMTYPE_CRC32                 0x0001
+#define CKSUMTYPE_RSA_MD4             0x0002
+#define CKSUMTYPE_RSA_MD4_DES     0x0003
+#define CKSUMTYPE_DESCBC               0x0004
+/* des-mac-k */
+/* rsa-md4-des-k */
+#define CKSUMTYPE_RSA_MD5             0x0007
+#define CKSUMTYPE_RSA_MD5_DES     0x0008
+#define CKSUMTYPE_NIST_SHA           0x0009
+#define CKSUMTYPE_HMAC_SHA1_DES3       0x000c
+#define CKSUMTYPE_HMAC_SHA1_96_AES128   0x000f
+#define CKSUMTYPE_HMAC_SHA1_96_AES256   0x0010
+#define CKSUMTYPE_HMAC_MD5_ARCFOUR      -138
+
+/* from gssapi_err_krb5.h */
+#define KG_CCACHE_NOMATCH                      (39756032L)
+#define KG_KEYTAB_NOMATCH                      (39756033L)
+#define KG_TGT_MISSING                    (39756034L)
+#define KG_NO_SUBKEY                        (39756035L)
+#define KG_CONTEXT_ESTABLISHED            (39756036L)
+#define KG_BAD_SIGN_TYPE                        (39756037L)
+#define KG_BAD_LENGTH                      (39756038L)
+#define KG_CTX_INCOMPLETE                      (39756039L)
+#define KG_CONTEXT                            (39756040L)
+#define KG_CRED                                  (39756041L)
+#define KG_ENC_DESC                          (39756042L)
+#define KG_BAD_SEQ                            (39756043L)
+#define KG_EMPTY_CCACHE                          (39756044L)
+#define KG_NO_CTYPES                        (39756045L)
+
+/* per Kerberos v5 protocol spec crypto types from the wire.
+ * these get mapped to linux kernel crypto routines.
+ */
+#define ENCTYPE_NULL       0x0000
+#define ENCTYPE_DES_CBC_CRC     0x0001 /* DES cbc mode with CRC-32 */
+#define ENCTYPE_DES_CBC_MD4     0x0002 /* DES cbc mode with RSA-MD4 */
+#define ENCTYPE_DES_CBC_MD5     0x0003 /* DES cbc mode with RSA-MD5 */
+#define ENCTYPE_DES_CBC_RAW     0x0004 /* DES cbc mode raw */
+/* XXX deprecated? */
+#define ENCTYPE_DES3_CBC_SHA    0x0005 /* DES-3 cbc mode with NIST-SHA */
+#define ENCTYPE_DES3_CBC_RAW    0x0006 /* DES-3 cbc mode raw */
+#define ENCTYPE_DES_HMAC_SHA1   0x0008
+#define ENCTYPE_DES3_CBC_SHA1   0x0010
+#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011
+#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012
+#define ENCTYPE_ARCFOUR_HMAC    0x0017
+#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018
+#define ENCTYPE_UNKNOWN         0x01ff
+
+#endif /* PTLRPC_GSS_KRB5_H */
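The token header defined above is the fixed 16-byte, big-endian per-message header of RFC 4121 (tok_id, flags, filler, ec, rrc, seq). A minimal standalone sketch of packing one into a byte buffer; pack_krb5_header() is illustrative only, not a Lustre function, and it assumes struct krb5_header has the natural 16-byte layout the code below relies on.

#include <stdint.h>

/* Illustrative only: pack an RFC 4121 token header the way struct
 * krb5_header above lays it out, all multi-byte fields big-endian. */
static void pack_krb5_header(uint8_t out[16], uint16_t tok_id, uint8_t flags,
                             uint16_t ec, uint16_t rrc, uint64_t seq)
{
        int i;

        out[0] = tok_id >> 8;  out[1] = tok_id & 0xff;   /* kh_tok_id */
        out[2] = flags;                                   /* kh_flags  */
        out[3] = 0xff;                                    /* kh_filler */
        out[4] = ec >> 8;      out[5] = ec & 0xff;        /* kh_ec     */
        out[6] = rrc >> 8;     out[7] = rrc & 0xff;       /* kh_rrc    */
        for (i = 0; i < 8; i++)                           /* kh_seq    */
                out[8 + i] = (uint8_t)(seq >> (8 * (7 - i)));
}

A MIC token, for instance, would use tok_id 0x0404 with ec = rrc = 0xffff, matching what fill_krb5_header() below produces for the non-privacy case.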
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c
new file mode 100644 (file)
index 0000000..4b28931
--- /dev/null
@@ -0,0 +1,1786 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_mech.c
+ *  linux/net/sunrpc/gss_krb5_crypto.c
+ *  linux/net/sunrpc/gss_krb5_seal.c
+ *  linux/net/sunrpc/gss_krb5_seqnum.c
+ *  linux/net/sunrpc/gss_krb5_unseal.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@umich.edu>
+ *  J. Bruce Fields <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_asn1.h"
+#include "gss_krb5.h"
+
+static spinlock_t krb5_seq_lock;
+
+struct krb5_enctype {
+       char       *ke_dispname;
+       char       *ke_enc_name;            /* linux tfm name */
+       char       *ke_hash_name;          /* linux tfm name */
+       int          ke_enc_mode;           /* linux tfm mode */
+       int          ke_hash_size;         /* checksum size */
+       int          ke_conf_size;         /* confounder size */
+       unsigned int    ke_hash_hmac:1;  /* is hmac? */
+};
+
+/*
+ * NOTE: for aes128-cts and aes256-cts, the MIT implementation uses CTS
+ * encryption, but for now we simply do CBC with padding, because Linux
+ * doesn't support CTS yet. This needs to be fixed in the future.
+ */
+static struct krb5_enctype enctypes[] = {
+       [ENCTYPE_DES_CBC_RAW] = {              /* des-cbc-md5 */
+               "des-cbc-md5",
+               "cbc(des)",
+               "md5",
+               0,
+               16,
+               8,
+               0,
+       },
+       [ENCTYPE_DES3_CBC_RAW] = {            /* des3-hmac-sha1 */
+               "des3-hmac-sha1",
+               "cbc(des3_ede)",
+               "hmac(sha1)",
+               0,
+               20,
+               8,
+               1,
+       },
+       [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = {   /* aes128-cts */
+               "aes128-cts-hmac-sha1-96",
+               "cbc(aes)",
+               "hmac(sha1)",
+               0,
+               12,
+               16,
+               1,
+       },
+       [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = {   /* aes256-cts */
+               "aes256-cts-hmac-sha1-96",
+               "cbc(aes)",
+               "hmac(sha1)",
+               0,
+               12,
+               16,
+               1,
+       },
+       [ENCTYPE_ARCFOUR_HMAC] = {            /* arcfour-hmac-md5 */
+               "arcfour-hmac-md5",
+               "ecb(arc4)",
+               "hmac(md5)",
+               0,
+               16,
+               8,
+               1,
+       },
+};
+
+#define MAX_ENCTYPES    sizeof(enctypes)/sizeof(struct krb5_enctype)
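If the running kernel does provide ciphertext stealing through the crypto "cts" template, the aes rows above could request it instead of plain CBC. The sketch below is purely illustrative (the wrap-side padding would also have to change, since CTS copes with non-block-multiple lengths by itself); struct enctype_row simply mirrors struct krb5_enctype.

/* Illustrative only: aes rows requesting the kernel "cts" template instead
 * of "cbc(aes)"; the field order mirrors struct krb5_enctype above. */
struct enctype_row {
        const char *dispname, *enc_name, *hash_name;
        int enc_mode, hash_size, conf_size;
        unsigned int hash_hmac:1;
};

static const struct enctype_row aes_cts_rows[] = {
        { "aes128-cts-hmac-sha1-96", "cts(cbc(aes))", "hmac(sha1)", 0, 12, 16, 1 },
        { "aes256-cts-hmac-sha1-96", "cts(cbc(aes))", "hmac(sha1)", 0, 12, 16, 1 },
};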
+
+static const char * enctype2str(__u32 enctype)
+{
+       if (enctype < MAX_ENCTYPES && enctypes[enctype].ke_dispname)
+               return enctypes[enctype].ke_dispname;
+
+       return "unknown";
+}
+
+static
+int keyblock_init(struct krb5_keyblock *kb, char *alg_name, int alg_mode)
+{
+       kb->kb_tfm = ll_crypto_alloc_blkcipher(alg_name, alg_mode, 0);
+       if (IS_ERR(kb->kb_tfm)) {
+               CERROR("failed to alloc tfm: %s, mode %d\n",
+                      alg_name, alg_mode);
+               return -1;
+       }
+
+       if (ll_crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data, kb->kb_key.len)) {
+               CERROR("failed to set %s key, len %d\n",
+                      alg_name, kb->kb_key.len);
+               return -1;
+       }
+
+       return 0;
+}
+
+static
+int krb5_init_keys(struct krb5_ctx *kctx)
+{
+       struct krb5_enctype *ke;
+
+       if (kctx->kc_enctype >= MAX_ENCTYPES ||
+           enctypes[kctx->kc_enctype].ke_hash_size == 0) {
+               CERROR("unsupported enctype %x\n", kctx->kc_enctype);
+               return -1;
+       }
+
+       ke = &enctypes[kctx->kc_enctype];
+
+       /* the arc4 tfm is stateful; the user must allocate, use and free it on their own */
+       if (kctx->kc_enctype != ENCTYPE_ARCFOUR_HMAC &&
+           keyblock_init(&kctx->kc_keye, ke->ke_enc_name, ke->ke_enc_mode))
+               return -1;
+
+       /* the hmac tfm is stateful; the user must allocate, use and free it on their own */
+       if (ke->ke_hash_hmac == 0 &&
+           keyblock_init(&kctx->kc_keyi, ke->ke_enc_name, ke->ke_enc_mode))
+               return -1;
+       if (ke->ke_hash_hmac == 0 &&
+           keyblock_init(&kctx->kc_keyc, ke->ke_enc_name, ke->ke_enc_mode))
+               return -1;
+
+       return 0;
+}
+
+static
+void keyblock_free(struct krb5_keyblock *kb)
+{
+       rawobj_free(&kb->kb_key);
+       if (kb->kb_tfm)
+               ll_crypto_free_blkcipher(kb->kb_tfm);
+}
+
+static
+int keyblock_dup(struct krb5_keyblock *new, struct krb5_keyblock *kb)
+{
+       return rawobj_dup(&new->kb_key, &kb->kb_key);
+}
+
+static
+int get_bytes(char **ptr, const char *end, void *res, int len)
+{
+       char *p, *q;
+       p = *ptr;
+       q = p + len;
+       if (q > end || q < p)
+               return -1;
+       memcpy(res, p, len);
+       *ptr = q;
+       return 0;
+}
+
+static
+int get_rawobj(char **ptr, const char *end, rawobj_t *res)
+{
+       char   *p, *q;
+       __u32   len;
+
+       p = *ptr;
+       if (get_bytes(&p, end, &len, sizeof(len)))
+               return -1;
+
+       q = p + len;
+       if (q > end || q < p)
+               return -1;
+
+       OBD_ALLOC_LARGE(res->data, len);
+       if (!res->data)
+               return -1;
+
+       res->len = len;
+       memcpy(res->data, p, len);
+       *ptr = q;
+       return 0;
+}
+
+static
+int get_keyblock(char **ptr, const char *end,
+                struct krb5_keyblock *kb, __u32 keysize)
+{
+       char *buf;
+
+       OBD_ALLOC_LARGE(buf, keysize);
+       if (buf == NULL)
+               return -1;
+
+       if (get_bytes(ptr, end, buf, keysize)) {
+               OBD_FREE_LARGE(buf, keysize);
+               return -1;
+       }
+
+       kb->kb_key.len = keysize;
+       kb->kb_key.data = buf;
+       return 0;
+}
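get_bytes(), get_rawobj() and get_keyblock() above implement a bounded-cursor parser: every read advances *ptr and refuses to run past end, including the q < p wrap-around case. A self-contained userspace sketch of the same pattern; take() is a hypothetical stand-in for get_bytes(), and plain malloc replaces OBD_ALLOC_LARGE.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Bounded-cursor read, mirroring get_bytes() above. Returns 0 on success. */
static int take(const char **p, const char *end, void *dst, size_t len)
{
        const char *q = *p + len;

        if (q > end || q < *p)          /* out of bounds or wrapped */
                return -1;
        memcpy(dst, *p, len);
        *p = q;
        return 0;
}

int main(void)
{
        char buf[sizeof(uint32_t) + 3];
        const char *p = buf, *end = buf + sizeof(buf);
        uint32_t len = 3;
        char *data;

        /* length-prefixed blob in host byte order, as the importer expects */
        memcpy(buf, &len, sizeof(len));
        memcpy(buf + sizeof(len), "key", 3);

        len = 0;
        if (take(&p, end, &len, sizeof(len)) || len > (uint32_t)(end - p))
                return 1;
        data = malloc(len);
        if (!data || take(&p, end, data, len))
                return 1;
        /* data now holds "key" and p == end */
        free(data);
        return 0;
}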
+
+static
+void delete_context_kerberos(struct krb5_ctx *kctx)
+{
+       rawobj_free(&kctx->kc_mech_used);
+
+       keyblock_free(&kctx->kc_keye);
+       keyblock_free(&kctx->kc_keyi);
+       keyblock_free(&kctx->kc_keyc);
+}
+
+static
+__u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end)
+{
+       unsigned int    tmp_uint, keysize;
+
+       /* seed_init flag */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+       kctx->kc_seed_init = (tmp_uint != 0);
+
+       /* seed */
+       if (get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed)))
+               goto out_err;
+
+       /* sign/seal algorithm, not really used now */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+           get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+
+       /* end time */
+       if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime)))
+               goto out_err;
+
+       /* seq send */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+       kctx->kc_seq_send = tmp_uint;
+
+       /* mech oid */
+       if (get_rawobj(&p, end, &kctx->kc_mech_used))
+               goto out_err;
+
+       /* old style enc/seq keys in format:
+        *   - enctype (u32)
+        *   - keysize (u32)
+        *   - keydata
+        * we decompose them to fit into the new context
+        */
+
+       /* enc key */
+       if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+               goto out_err;
+
+       if (get_bytes(&p, end, &keysize, sizeof(keysize)))
+               goto out_err;
+
+       if (get_keyblock(&p, end, &kctx->kc_keye, keysize))
+               goto out_err;
+
+       /* seq key */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+           tmp_uint != kctx->kc_enctype)
+               goto out_err;
+
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+           tmp_uint != keysize)
+               goto out_err;
+
+       if (get_keyblock(&p, end, &kctx->kc_keyc, keysize))
+               goto out_err;
+
+       /* old style fallback */
+       if (keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc))
+               goto out_err;
+
+       if (p != end)
+               goto out_err;
+
+       CDEBUG(D_SEC, "successfully imported rfc1964 context\n");
+       return 0;
+out_err:
+       return GSS_S_FAILURE;
+}
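import_context_rfc1964() expects the fields above in a fixed order and requires the blob to end exactly where the second key ends. A hedged sketch of a writer producing that layout; build_rfc1964_blob() and put() are illustrative helpers, not part of lgssd or Lustre, and the leading version word consumed by gss_import_sec_context_kerberos() is not included.

#include <stdint.h>
#include <string.h>

/* Illustrative helpers only. The caller must provide a buffer of at least
 * 56 + mech_len + 2 * keysize bytes. */
static char *put(char *p, const void *src, size_t len)
{
        memcpy(p, src, len);
        return p + len;
}

static size_t build_rfc1964_blob(char *p, uint32_t enctype,
                                 const void *mech, uint32_t mech_len,
                                 const void *key, uint32_t keysize,
                                 int32_t endtime, uint32_t seq_send)
{
        char *start = p;
        uint32_t zero = 0, seed_init = 0;
        char seed[16] = { 0 };

        p = put(p, &seed_init, 4);      /* seed_init flag            */
        p = put(p, seed, sizeof(seed)); /* seed                      */
        p = put(p, &zero, 4);           /* sign alg (ignored)        */
        p = put(p, &zero, 4);           /* seal alg (ignored)        */
        p = put(p, &endtime, 4);        /* end time                  */
        p = put(p, &seq_send, 4);       /* seq send                  */
        p = put(p, &mech_len, 4);       /* mech oid rawobj: len+data */
        p = put(p, mech, mech_len);
        p = put(p, &enctype, 4);        /* enc key                   */
        p = put(p, &keysize, 4);
        p = put(p, key, keysize);
        p = put(p, &enctype, 4);        /* seq key, must match       */
        p = put(p, &keysize, 4);
        p = put(p, key, keysize);
        return (size_t)(p - start);
}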
+
+/* Flags for version 2 context flags */
+#define KRB5_CTX_FLAG_INITIATOR                0x00000001
+#define KRB5_CTX_FLAG_CFX              0x00000002
+#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY  0x00000004
+
+static
+__u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end)
+{
+       unsigned int    tmp_uint, keysize;
+
+       /* end time */
+       if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime)))
+               goto out_err;
+
+       /* flags */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+
+       if (tmp_uint & KRB5_CTX_FLAG_INITIATOR)
+               kctx->kc_initiate = 1;
+       if (tmp_uint & KRB5_CTX_FLAG_CFX)
+               kctx->kc_cfx = 1;
+       if (tmp_uint & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY)
+               kctx->kc_have_acceptor_subkey = 1;
+
+       /* seq send */
+       if (get_bytes(&p, end, &kctx->kc_seq_send, sizeof(kctx->kc_seq_send)))
+               goto out_err;
+
+       /* enctype */
+       if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+               goto out_err;
+
+       /* size of each key */
+       if (get_bytes(&p, end, &keysize, sizeof(keysize)))
+               goto out_err;
+
+       /* number of keys - should always be 3 */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+
+       if (tmp_uint != 3) {
+               CERROR("Invalid number of keys: %u\n", tmp_uint);
+               goto out_err;
+       }
+
+       /* ke */
+       if (get_keyblock(&p, end, &kctx->kc_keye, keysize))
+               goto out_err;
+       /* ki */
+       if (get_keyblock(&p, end, &kctx->kc_keyi, keysize))
+               goto out_err;
+       /* kc */
+       if (get_keyblock(&p, end, &kctx->kc_keyc, keysize))
+               goto out_err;
+
+       CDEBUG(D_SEC, "successfully imported v2 context\n");
+       return 0;
+out_err:
+       return GSS_S_FAILURE;
+}
+
+/*
+ * The whole purpose here is to keep the user-level gss context parsing from
+ * nfs-utils as unchanged as possible; it is not quite mature yet and many
+ * things are still unclear, heimdal support among them.
+ */
+static
+__u32 gss_import_sec_context_kerberos(rawobj_t *inbuf,
+                                     struct gss_ctx *gctx)
+{
+       struct krb5_ctx *kctx;
+       char        *p = (char *) inbuf->data;
+       char        *end = (char *) (inbuf->data + inbuf->len);
+       unsigned int     tmp_uint, rc;
+
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) {
+               CERROR("Fail to read version\n");
+               return GSS_S_FAILURE;
+       }
+
+       /* only versions 0, 1 and 2 are supported for the moment */
+       if (tmp_uint > 2) {
+               CERROR("Invalid version %u\n", tmp_uint);
+               return GSS_S_FAILURE;
+       }
+
+       OBD_ALLOC_PTR(kctx);
+       if (!kctx)
+               return GSS_S_FAILURE;
+
+       if (tmp_uint == 0 || tmp_uint == 1) {
+               kctx->kc_initiate = tmp_uint;
+               rc = import_context_rfc1964(kctx, p, end);
+       } else {
+               rc = import_context_rfc4121(kctx, p, end);
+       }
+
+       if (rc == 0)
+               rc = krb5_init_keys(kctx);
+
+       if (rc) {
+               delete_context_kerberos(kctx);
+               OBD_FREE_PTR(kctx);
+
+               return GSS_S_FAILURE;
+       }
+
+       gctx->internal_ctx_id = kctx;
+       return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx,
+                                       struct gss_ctx *gctx_new)
+{
+       struct krb5_ctx *kctx = gctx->internal_ctx_id;
+       struct krb5_ctx *knew;
+
+       OBD_ALLOC_PTR(knew);
+       if (!knew)
+               return GSS_S_FAILURE;
+
+       knew->kc_initiate = kctx->kc_initiate ? 0 : 1;
+       knew->kc_cfx = kctx->kc_cfx;
+       knew->kc_seed_init = kctx->kc_seed_init;
+       knew->kc_have_acceptor_subkey = kctx->kc_have_acceptor_subkey;
+       knew->kc_endtime = kctx->kc_endtime;
+
+       memcpy(knew->kc_seed, kctx->kc_seed, sizeof(kctx->kc_seed));
+       knew->kc_seq_send = kctx->kc_seq_recv;
+       knew->kc_seq_recv = kctx->kc_seq_send;
+       knew->kc_enctype = kctx->kc_enctype;
+
+       if (rawobj_dup(&knew->kc_mech_used, &kctx->kc_mech_used))
+               goto out_err;
+
+       if (keyblock_dup(&knew->kc_keye, &kctx->kc_keye))
+               goto out_err;
+       if (keyblock_dup(&knew->kc_keyi, &kctx->kc_keyi))
+               goto out_err;
+       if (keyblock_dup(&knew->kc_keyc, &kctx->kc_keyc))
+               goto out_err;
+       if (krb5_init_keys(knew))
+               goto out_err;
+
+       gctx_new->internal_ctx_id = knew;
+       CDEBUG(D_SEC, "successfully copied reverse context\n");
+       return GSS_S_COMPLETE;
+
+out_err:
+       delete_context_kerberos(knew);
+       OBD_FREE_PTR(knew);
+       return GSS_S_FAILURE;
+}
+
+static
+__u32 gss_inquire_context_kerberos(struct gss_ctx *gctx,
+                                  unsigned long  *endtime)
+{
+       struct krb5_ctx *kctx = gctx->internal_ctx_id;
+
+       *endtime = (unsigned long) ((__u32) kctx->kc_endtime);
+       return GSS_S_COMPLETE;
+}
+
+static
+void gss_delete_sec_context_kerberos(void *internal_ctx)
+{
+       struct krb5_ctx *kctx = internal_ctx;
+
+       delete_context_kerberos(kctx);
+       OBD_FREE_PTR(kctx);
+}
+
+static
+void buf_to_sg(struct scatterlist *sg, void *ptr, int len)
+{
+       sg_set_buf(sg, ptr, len);
+}
+
+static
+__u32 krb5_encrypt(struct ll_crypto_cipher *tfm,
+                  int decrypt,
+                  void * iv,
+                  void * in,
+                  void * out,
+                  int length)
+{
+       struct blkcipher_desc desc;
+       struct scatterlist    sg;
+       __u8 local_iv[16] = {0};
+       __u32 ret = -EINVAL;
+
+       LASSERT(tfm);
+       desc.tfm  = tfm;
+       desc.info = local_iv;
+       desc.flags= 0;
+
+       if (length % ll_crypto_blkcipher_blocksize(tfm) != 0) {
+               CERROR("output length %d mismatch blocksize %d\n",
+                      length, ll_crypto_blkcipher_blocksize(tfm));
+               goto out;
+       }
+
+       if (ll_crypto_blkcipher_ivsize(tfm) > 16) {
+               CERROR("iv size too large %d\n", ll_crypto_blkcipher_ivsize(tfm));
+               goto out;
+       }
+
+       if (iv)
+               memcpy(local_iv, iv, ll_crypto_blkcipher_ivsize(tfm));
+
+       memcpy(out, in, length);
+       buf_to_sg(&sg, out, length);
+
+       if (decrypt)
+               ret = ll_crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length);
+       else
+               ret = ll_crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length);
+
+out:
+       return(ret);
+}
+
+
+static inline
+int krb5_digest_hmac(struct ll_crypto_hash *tfm,
+                    rawobj_t *key,
+                    struct krb5_header *khdr,
+                    int msgcnt, rawobj_t *msgs,
+                    int iovcnt, lnet_kiov_t *iovs,
+                    rawobj_t *cksum)
+{
+       struct hash_desc   desc;
+       struct scatterlist sg[1];
+       int             i;
+
+       ll_crypto_hash_setkey(tfm, key->data, key->len);
+       desc.tfm  = tfm;
+       desc.flags= 0;
+
+       ll_crypto_hash_init(&desc);
+
+       for (i = 0; i < msgcnt; i++) {
+               if (msgs[i].len == 0)
+                       continue;
+               buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len);
+               ll_crypto_hash_update(&desc, sg, msgs[i].len);
+       }
+
+       for (i = 0; i < iovcnt; i++) {
+               if (iovs[i].kiov_len == 0)
+                       continue;
+
+               sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len,
+                           iovs[i].kiov_offset);
+               ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+       }
+
+       if (khdr) {
+               buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
+               ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
+       }
+
+       return ll_crypto_hash_final(&desc, cksum->data);
+}
+
+
+static inline
+int krb5_digest_norm(struct ll_crypto_hash *tfm,
+                    struct krb5_keyblock *kb,
+                    struct krb5_header *khdr,
+                    int msgcnt, rawobj_t *msgs,
+                    int iovcnt, lnet_kiov_t *iovs,
+                    rawobj_t *cksum)
+{
+       struct hash_desc   desc;
+       struct scatterlist sg[1];
+       int             i;
+
+       LASSERT(kb->kb_tfm);
+       desc.tfm  = tfm;
+       desc.flags= 0;
+
+       ll_crypto_hash_init(&desc);
+
+       for (i = 0; i < msgcnt; i++) {
+               if (msgs[i].len == 0)
+                       continue;
+               buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len);
+               ll_crypto_hash_update(&desc, sg, msgs[i].len);
+       }
+
+       for (i = 0; i < iovcnt; i++) {
+               if (iovs[i].kiov_len == 0)
+                       continue;
+
+               sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len,
+                           iovs[i].kiov_offset);
+               ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+       }
+
+       if (khdr) {
+               buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
+               ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
+       }
+
+       ll_crypto_hash_final(&desc, cksum->data);
+
+       return krb5_encrypt(kb->kb_tfm, 0, NULL, cksum->data,
+                           cksum->data, cksum->len);
+}
+
+/*
+ * compute (keyed/keyless) checksum against the plain text which appended
+ * with krb5 wire token header.
+ */
+static
+__s32 krb5_make_checksum(__u32 enctype,
+                        struct krb5_keyblock *kb,
+                        struct krb5_header *khdr,
+                        int msgcnt, rawobj_t *msgs,
+                        int iovcnt, lnet_kiov_t *iovs,
+                        rawobj_t *cksum)
+{
+       struct krb5_enctype   *ke = &enctypes[enctype];
+       struct ll_crypto_hash *tfm;
+       __u32             code = GSS_S_FAILURE;
+       int                 rc;
+
+       if (!(tfm = ll_crypto_alloc_hash(ke->ke_hash_name, 0, 0))) {
+               CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name);
+               return GSS_S_FAILURE;
+       }
+
+       cksum->len = ll_crypto_hash_digestsize(tfm);
+       OBD_ALLOC_LARGE(cksum->data, cksum->len);
+       if (!cksum->data) {
+               cksum->len = 0;
+               goto out_tfm;
+       }
+
+       if (ke->ke_hash_hmac)
+               rc = krb5_digest_hmac(tfm, &kb->kb_key,
+                                     khdr, msgcnt, msgs, iovcnt, iovs, cksum);
+       else
+               rc = krb5_digest_norm(tfm, kb,
+                                     khdr, msgcnt, msgs, iovcnt, iovs, cksum);
+
+       if (rc == 0)
+               code = GSS_S_COMPLETE;
+out_tfm:
+       ll_crypto_free_hash(tfm);
+       return code;
+}
+
+static void fill_krb5_header(struct krb5_ctx *kctx,
+                            struct krb5_header *khdr,
+                            int privacy)
+{
+       unsigned char acceptor_flag;
+
+       acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR;
+
+       if (privacy) {
+               khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG);
+               khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL;
+               khdr->kh_ec = cpu_to_be16(0);
+               khdr->kh_rrc = cpu_to_be16(0);
+       } else {
+               khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG);
+               khdr->kh_flags = acceptor_flag;
+               khdr->kh_ec = cpu_to_be16(0xffff);
+               khdr->kh_rrc = cpu_to_be16(0xffff);
+       }
+
+       khdr->kh_filler = 0xff;
+       spin_lock(&krb5_seq_lock);
+       khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++);
+       spin_unlock(&krb5_seq_lock);
+}
+
+static __u32 verify_krb5_header(struct krb5_ctx *kctx,
+                               struct krb5_header *khdr,
+                               int privacy)
+{
+       unsigned char acceptor_flag;
+       __u16    tok_id, ec_rrc;
+
+       acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0;
+
+       if (privacy) {
+               tok_id = KG_TOK_WRAP_MSG;
+               ec_rrc = 0x0;
+       } else {
+               tok_id = KG_TOK_MIC_MSG;
+               ec_rrc = 0xffff;
+       }
+
+       /* sanity checks */
+       if (be16_to_cpu(khdr->kh_tok_id) != tok_id) {
+               CERROR("bad token id\n");
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+       if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) {
+               CERROR("bad direction flag\n");
+               return GSS_S_BAD_SIG;
+       }
+       if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) {
+               CERROR("missing confidential flag\n");
+               return GSS_S_BAD_SIG;
+       }
+       if (khdr->kh_filler != 0xff) {
+               CERROR("bad filler\n");
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+       if (be16_to_cpu(khdr->kh_ec) != ec_rrc ||
+           be16_to_cpu(khdr->kh_rrc) != ec_rrc) {
+               CERROR("bad EC or RRC\n");
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+       return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_get_mic_kerberos(struct gss_ctx *gctx,
+                          int msgcnt,
+                          rawobj_t *msgs,
+                          int iovcnt,
+                          lnet_kiov_t *iovs,
+                          rawobj_t *token)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+
+       /* fill krb5 header */
+       LASSERT(token->len >= sizeof(*khdr));
+       khdr = (struct krb5_header *) token->data;
+       fill_krb5_header(kctx, khdr, 0);
+
+       /* checksum */
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
+                              khdr, msgcnt, msgs, iovcnt, iovs, &cksum))
+               return GSS_S_FAILURE;
+
+       LASSERT(cksum.len >= ke->ke_hash_size);
+       LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size);
+       memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size,
+              ke->ke_hash_size);
+
+       token->len = sizeof(*khdr) + ke->ke_hash_size;
+       rawobj_free(&cksum);
+       return GSS_S_COMPLETE;
+}
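The MIC token built above is just the 16-byte header followed by the truncated checksum, so its size is fixed per enctype. A quick worked check using the ke_hash_size values from the enctypes[] table:

#include <stdio.h>

#define KRB5_HDR_SIZE 16   /* sizeof(struct krb5_header) above */

int main(void)
{
        /* ke_hash_size from enctypes[]: sha1-96 truncates to 12 bytes,
         * des3-hmac-sha1 keeps all 20. */
        int hash_size_aes = 12, hash_size_des3 = 20;

        printf("MIC token, aes128-cts: %d bytes\n", KRB5_HDR_SIZE + hash_size_aes);  /* 28 */
        printf("MIC token, des3-hmac:  %d bytes\n", KRB5_HDR_SIZE + hash_size_des3); /* 36 */
        return 0;
}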
+
+static
+__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx,
+                             int msgcnt,
+                             rawobj_t *msgs,
+                             int iovcnt,
+                             lnet_kiov_t *iovs,
+                             rawobj_t *token)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       __u32           major;
+
+       if (token->len < sizeof(*khdr)) {
+               CERROR("short signature: %u\n", token->len);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       khdr = (struct krb5_header *) token->data;
+
+       major = verify_krb5_header(kctx, khdr, 0);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("bad krb5 header\n");
+               return major;
+       }
+
+       if (token->len < sizeof(*khdr) + ke->ke_hash_size) {
+               CERROR("short signature: %u, require %d\n",
+                      token->len, (int) sizeof(*khdr) + ke->ke_hash_size);
+               return GSS_S_FAILURE;
+       }
+
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
+                              khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) {
+               CERROR("failed to make checksum\n");
+               return GSS_S_FAILURE;
+       }
+
+       LASSERT(cksum.len >= ke->ke_hash_size);
+       if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size,
+                  ke->ke_hash_size)) {
+               CERROR("checksum mismatch\n");
+               rawobj_free(&cksum);
+               return GSS_S_BAD_SIG;
+       }
+
+       rawobj_free(&cksum);
+       return GSS_S_COMPLETE;
+}
+
+static
+int add_padding(rawobj_t *msg, int msg_buflen, int blocksize)
+{
+       int padding;
+
+       padding = (blocksize - (msg->len & (blocksize - 1))) &
+                 (blocksize - 1);
+       if (!padding)
+               return 0;
+
+       if (msg->len + padding > msg_buflen) {
+               CERROR("bufsize %u too small: datalen %u, padding %u\n",
+                       msg_buflen, msg->len, padding);
+               return -EINVAL;
+       }
+
+       memset(msg->data + msg->len, padding, padding);
+       msg->len += padding;
+       return 0;
+}
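add_padding() relies on the blocksize being a power of two: (blocksize - (len & (blocksize - 1))) & (blocksize - 1) is the distance to the next multiple and collapses to 0 when len is already aligned, and the pad bytes are filled with the pad length itself (nothing is added when already aligned). A standalone check of that arithmetic:

#include <assert.h>

static int pad_needed(unsigned int len, unsigned int blocksize)
{
        /* assumes blocksize is a power of two, as the cipher blocksizes here are */
        return (blocksize - (len & (blocksize - 1))) & (blocksize - 1);
}

int main(void)
{
        assert(pad_needed(16, 16) == 0);
        assert(pad_needed(17, 16) == 15);
        assert(pad_needed(31, 16) == 1);
        assert(pad_needed(5, 8) == 3);
        return 0;
}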
+
+static
+int krb5_encrypt_rawobjs(struct ll_crypto_cipher *tfm,
+                        int mode_ecb,
+                        int inobj_cnt,
+                        rawobj_t *inobjs,
+                        rawobj_t *outobj,
+                        int enc)
+{
+       struct blkcipher_desc desc;
+       struct scatterlist    src, dst;
+       __u8              local_iv[16] = {0}, *buf;
+       __u32            datalen = 0;
+       int                i, rc;
+       ENTRY;
+
+       buf = outobj->data;
+       desc.tfm  = tfm;
+       desc.info = local_iv;
+       desc.flags = 0;
+
+       for (i = 0; i < inobj_cnt; i++) {
+               LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len);
+
+               buf_to_sg(&src, inobjs[i].data, inobjs[i].len);
+               buf_to_sg(&dst, buf, outobj->len - datalen);
+
+               if (mode_ecb) {
+                       if (enc)
+                               rc = ll_crypto_blkcipher_encrypt(
+                                       &desc, &dst, &src, src.length);
+                       else
+                               rc = ll_crypto_blkcipher_decrypt(
+                                       &desc, &dst, &src, src.length);
+               } else {
+                       if (enc)
+                               rc = ll_crypto_blkcipher_encrypt_iv(
+                                       &desc, &dst, &src, src.length);
+                       else
+                               rc = ll_crypto_blkcipher_decrypt_iv(
+                                       &desc, &dst, &src, src.length);
+               }
+
+               if (rc) {
+                       CERROR("encrypt error %d\n", rc);
+                       RETURN(rc);
+               }
+
+               datalen += inobjs[i].len;
+               buf += inobjs[i].len;
+       }
+
+       outobj->len = datalen;
+       RETURN(0);
+}
+
+/*
+ * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size.
+ */
+static
+int krb5_encrypt_bulk(struct ll_crypto_cipher *tfm,
+                     struct krb5_header *khdr,
+                     char *confounder,
+                     struct ptlrpc_bulk_desc *desc,
+                     rawobj_t *cipher,
+                     int adj_nob)
+{
+       struct blkcipher_desc   ciph_desc;
+       __u8                local_iv[16] = {0};
+       struct scatterlist      src, dst;
+       int                  blocksize, i, rc, nob = 0;
+
+       LASSERT(desc->bd_iov_count);
+       LASSERT(desc->bd_enc_iov);
+
+       blocksize = ll_crypto_blkcipher_blocksize(tfm);
+       LASSERT(blocksize > 1);
+       LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+       ciph_desc.tfm  = tfm;
+       ciph_desc.info = local_iv;
+       ciph_desc.flags = 0;
+
+       /* encrypt confounder */
+       buf_to_sg(&src, confounder, blocksize);
+       buf_to_sg(&dst, cipher->data, blocksize);
+
+       rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, blocksize);
+       if (rc) {
+               CERROR("error to encrypt confounder: %d\n", rc);
+               return rc;
+       }
+
+       /* encrypt clear pages */
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               sg_set_page(&src, desc->bd_iov[i].kiov_page,
+                           (desc->bd_iov[i].kiov_len + blocksize - 1) &
+                           (~(blocksize - 1)),
+                           desc->bd_iov[i].kiov_offset);
+               if (adj_nob)
+                       nob += src.length;
+               sg_set_page(&dst, desc->bd_enc_iov[i].kiov_page, src.length,
+                           src.offset);
+
+               desc->bd_enc_iov[i].kiov_offset = dst.offset;
+               desc->bd_enc_iov[i].kiov_len = dst.length;
+
+               rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src,
+                                                   src.length);
+               if (rc) {
+                       CERROR("error to encrypt page: %d\n", rc);
+                       return rc;
+               }
+       }
+
+       /* encrypt krb5 header */
+       buf_to_sg(&src, khdr, sizeof(*khdr));
+       buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+       rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc,
+                                           &dst, &src, sizeof(*khdr));
+       if (rc) {
+               CERROR("error to encrypt krb5 header: %d\n", rc);
+               return rc;
+       }
+
+       if (adj_nob)
+               desc->bd_nob = nob;
+
+       return 0;
+}
+
+/*
+ * desc->bd_nob_transferred is the size of the cipher text received.
+ * desc->bd_nob is the target size of the plain text.
+ *
+ * if adj_nob != 0, we adjust each page's kiov_len to the actual
+ * plain text size.
+ * - for client read: we don't know the data size for each page, so
+ *   bd_iov[]->kiov_len is set to PAGE_SIZE, but the data actually received
+ *   might be smaller, so we need to adjust it according to
+ *   bd_enc_iov[]->kiov_len. this means we DO NOT support the case where
+ *   the server sends odd-sized data in a page which is not the last one.
+ * - for server write: we know exactly the data size expected for each page,
+ *   so kiov_len is already accurate and we should not adjust it at all.
+ *   also, bd_enc_iov[]->kiov_len should be round_up(bd_iov[]->kiov_len),
+ *   which should have been done by prep_bulk().
+ */
+static
+int krb5_decrypt_bulk(struct ll_crypto_cipher *tfm,
+                     struct krb5_header *khdr,
+                     struct ptlrpc_bulk_desc *desc,
+                     rawobj_t *cipher,
+                     rawobj_t *plain,
+                     int adj_nob)
+{
+       struct blkcipher_desc   ciph_desc;
+       __u8                local_iv[16] = {0};
+       struct scatterlist      src, dst;
+       int                  ct_nob = 0, pt_nob = 0;
+       int                  blocksize, i, rc;
+
+       LASSERT(desc->bd_iov_count);
+       LASSERT(desc->bd_enc_iov);
+       LASSERT(desc->bd_nob_transferred);
+
+       blocksize = ll_crypto_blkcipher_blocksize(tfm);
+       LASSERT(blocksize > 1);
+       LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+       ciph_desc.tfm  = tfm;
+       ciph_desc.info = local_iv;
+       ciph_desc.flags = 0;
+
+       if (desc->bd_nob_transferred % blocksize) {
+               CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred);
+               return -EPROTO;
+       }
+
+       /* decrypt head (confounder) */
+       buf_to_sg(&src, cipher->data, blocksize);
+       buf_to_sg(&dst, plain->data, blocksize);
+
+       rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, blocksize);
+       if (rc) {
+               CERROR("error to decrypt confounder: %d\n", rc);
+               return rc;
+       }
+
+       for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred;
+            i++) {
+               if (desc->bd_enc_iov[i].kiov_offset % blocksize != 0 ||
+                   desc->bd_enc_iov[i].kiov_len % blocksize != 0) {
+                       CERROR("page %d: odd offset %u len %u, blocksize %d\n",
+                              i, desc->bd_enc_iov[i].kiov_offset,
+                              desc->bd_enc_iov[i].kiov_len, blocksize);
+                       return -EFAULT;
+               }
+
+               if (adj_nob) {
+                       if (ct_nob + desc->bd_enc_iov[i].kiov_len >
+                           desc->bd_nob_transferred)
+                               desc->bd_enc_iov[i].kiov_len =
+                                       desc->bd_nob_transferred - ct_nob;
+
+                       desc->bd_iov[i].kiov_len = desc->bd_enc_iov[i].kiov_len;
+                       if (pt_nob + desc->bd_enc_iov[i].kiov_len > desc->bd_nob)
+                               desc->bd_iov[i].kiov_len = desc->bd_nob - pt_nob;
+               } else {
+                       /* this should be guaranteed by LNET */
+                       LASSERT(ct_nob + desc->bd_enc_iov[i].kiov_len <=
+                               desc->bd_nob_transferred);
+                       LASSERT(desc->bd_iov[i].kiov_len <=
+                               desc->bd_enc_iov[i].kiov_len);
+               }
+
+               if (desc->bd_enc_iov[i].kiov_len == 0)
+                       continue;
+
+               sg_set_page(&src, desc->bd_enc_iov[i].kiov_page,
+                           desc->bd_enc_iov[i].kiov_len,
+                           desc->bd_enc_iov[i].kiov_offset);
+               dst = src;
+               if (desc->bd_iov[i].kiov_len % blocksize == 0)
+                       sg_assign_page(&dst, desc->bd_iov[i].kiov_page);
+
+               rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src,
+                                                   src.length);
+               if (rc) {
+                       CERROR("error to decrypt page: %d\n", rc);
+                       return rc;
+               }
+
+               if (desc->bd_iov[i].kiov_len % blocksize != 0) {
+                       memcpy(page_address(desc->bd_iov[i].kiov_page) +
+                              desc->bd_iov[i].kiov_offset,
+                              page_address(desc->bd_enc_iov[i].kiov_page) +
+                              desc->bd_iov[i].kiov_offset,
+                              desc->bd_iov[i].kiov_len);
+               }
+
+               ct_nob += desc->bd_enc_iov[i].kiov_len;
+               pt_nob += desc->bd_iov[i].kiov_len;
+       }
+
+       if (unlikely(ct_nob != desc->bd_nob_transferred)) {
+               CERROR("%d cipher text transferred but only %d decrypted\n",
+                      desc->bd_nob_transferred, ct_nob);
+               return -EFAULT;
+       }
+
+       if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) {
+               CERROR("%d plain text expected but only %d received\n",
+                      desc->bd_nob, pt_nob);
+               return -EFAULT;
+       }
+
+       /* if needed, clear up the rest unused iovs */
+       if (adj_nob)
+               while (i < desc->bd_iov_count)
+                       desc->bd_iov[i++].kiov_len = 0;
+
+       /* decrypt tail (krb5 header) */
+       buf_to_sg(&src, cipher->data + blocksize, sizeof(*khdr));
+       buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+       rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc,
+                                           &dst, &src, sizeof(*khdr));
+       if (rc) {
+               CERROR("error to decrypt tail: %d\n", rc);
+               return rc;
+       }
+
+       if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) {
+               CERROR("krb5 header doesn't match\n");
+               return -EACCES;
+       }
+
+       return 0;
+}
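For the client-read case described in the comment before krb5_decrypt_bulk(), the adj_nob branch clips each bd_enc_iov length to what was actually transferred and each bd_iov length to the target bd_nob. A standalone model of just that fixup over plain integer arrays; adjust_lengths(), enc_len and out_len are hypothetical names.

#include <stdio.h>

/* Model of the adj_nob fixup in krb5_decrypt_bulk(): enc_len[] stands in
 * for bd_enc_iov[].kiov_len, out_len[] for bd_iov[].kiov_len. */
static void adjust_lengths(int npages, int enc_len[], int out_len[],
                           int nob_transferred, int nob)
{
        int ct_nob = 0, pt_nob = 0, i;

        for (i = 0; i < npages && ct_nob < nob_transferred; i++) {
                if (ct_nob + enc_len[i] > nob_transferred)
                        enc_len[i] = nob_transferred - ct_nob;

                out_len[i] = enc_len[i];
                if (pt_nob + enc_len[i] > nob)
                        out_len[i] = nob - pt_nob;

                ct_nob += enc_len[i];
                pt_nob += out_len[i];
        }
        while (i < npages)              /* clear the unused pages */
                out_len[i++] = 0;
}

int main(void)
{
        int enc_len[3] = { 4096, 4096, 4096 };
        int out_len[3] = { 4096, 4096, 4096 };

        /* server sent 6144 cipher bytes for a 6000-byte read */
        adjust_lengths(3, enc_len, out_len, 6144, 6000);
        printf("%d %d %d\n", out_len[0], out_len[1], out_len[2]); /* 4096 1904 0 */
        return 0;
}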
+
+static
+__u32 gss_wrap_kerberos(struct gss_ctx *gctx,
+                       rawobj_t *gsshdr,
+                       rawobj_t *msg,
+                       int msg_buflen,
+                       rawobj_t *token)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       int               blocksize;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       rawobj_t             data_desc[3], cipher;
+       __u8             conf[GSS_MAX_CIPHER_BLOCK];
+       int               rc = 0;
+
+       LASSERT(ke);
+       LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+       LASSERT(kctx->kc_keye.kb_tfm == NULL ||
+               ke->ke_conf_size >=
+               ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm));
+
+       /*
+        * final token format:
+        * ---------------------------------------------------
+        * | krb5 header | cipher text | checksum (16 bytes) |
+        * ---------------------------------------------------
+        */
+
+       /* fill krb5 header */
+       LASSERT(token->len >= sizeof(*khdr));
+       khdr = (struct krb5_header *) token->data;
+       fill_krb5_header(kctx, khdr, 1);
+
+       /* generate confounder */
+       cfs_get_random_bytes(conf, ke->ke_conf_size);
+
+       /* get the encryption blocksize. note kc_keye might not be associated
+        * with a tfm; currently that is only the case for arcfour-hmac */
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LASSERT(kctx->kc_keye.kb_tfm == NULL);
+               blocksize = 1;
+       } else {
+               LASSERT(kctx->kc_keye.kb_tfm);
+               blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+       }
+       LASSERT(blocksize <= ke->ke_conf_size);
+
+       /* padding the message */
+       if (add_padding(msg, msg_buflen, blocksize))
+               return GSS_S_FAILURE;
+
+       /*
+        * clear text layout for checksum:
+        * ------------------------------------------------------
+        * | confounder | gss header | clear msgs | krb5 header |
+        * ------------------------------------------------------
+        */
+       data_desc[0].data = conf;
+       data_desc[0].len = ke->ke_conf_size;
+       data_desc[1].data = gsshdr->data;
+       data_desc[1].len = gsshdr->len;
+       data_desc[2].data = msg->data;
+       data_desc[2].len = msg->len;
+
+       /* compute checksum */
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                              khdr, 3, data_desc, 0, NULL, &cksum))
+               return GSS_S_FAILURE;
+       LASSERT(cksum.len >= ke->ke_hash_size);
+
+       /*
+        * clear text layout for encryption:
+        * -----------------------------------------
+        * | confounder | clear msgs | krb5 header |
+        * -----------------------------------------
+        */
+       data_desc[0].data = conf;
+       data_desc[0].len = ke->ke_conf_size;
+       data_desc[1].data = msg->data;
+       data_desc[1].len = msg->len;
+       data_desc[2].data = (__u8 *) khdr;
+       data_desc[2].len = sizeof(*khdr);
+
+       /* cipher text will be directly inplace */
+       cipher.data = (__u8 *) (khdr + 1);
+       cipher.len = token->len - sizeof(*khdr);
+       LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr));
+
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               rawobj_t                 arc4_keye;
+               struct ll_crypto_cipher *arc4_tfm;
+
+               if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
+                                      NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
+                       CERROR("failed to obtain arc4 enc key\n");
+                       GOTO(arc4_out, rc = -EACCES);
+               }
+
+               arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
+               if (IS_ERR(arc4_tfm)) {
+                       CERROR("failed to alloc tfm arc4 in ECB mode\n");
+                       GOTO(arc4_out_key, rc = -EACCES);
+               }
+
+               if (ll_crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data,
+                                              arc4_keye.len)) {
+                       CERROR("failed to set arc4 key, len %d\n",
+                              arc4_keye.len);
+                       GOTO(arc4_out_tfm, rc = -EACCES);
+               }
+
+               rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+                                         3, data_desc, &cipher, 1);
+arc4_out_tfm:
+               ll_crypto_free_blkcipher(arc4_tfm);
+arc4_out_key:
+               rawobj_free(&arc4_keye);
+arc4_out:
+               do {} while(0); /* just to avoid compile warning */
+       } else {
+               rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+                                         3, data_desc, &cipher, 1);
+       }
+
+       if (rc != 0) {
+               rawobj_free(&cksum);
+               return GSS_S_FAILURE;
+       }
+
+       /* fill in checksum */
+       LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+       memcpy((char *)(khdr + 1) + cipher.len,
+              cksum.data + cksum.len - ke->ke_hash_size,
+              ke->ke_hash_size);
+       rawobj_free(&cksum);
+
+       /* final token length */
+       token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+       return GSS_S_COMPLETE;
+}
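The token produced above is | 16-byte krb5 header | cipher(confounder | padded msg | krb5 header) | truncated checksum |, so the required buffer can be sized up front. A worked example for the aes128-cts row of enctypes[] (16-byte confounder and blocksize, 12-byte hash):

#include <stdio.h>

#define KRB5_HDR_SIZE 16

int main(void)
{
        int conf_size = 16, blocksize = 16, hash_size = 12; /* aes128-cts row */
        int msg_len = 100;

        int padded = (msg_len + blocksize - 1) & ~(blocksize - 1);   /* 112 */
        int cipher = conf_size + padded + KRB5_HDR_SIZE;             /* 144 */
        int token  = KRB5_HDR_SIZE + cipher + hash_size;             /* 172 */

        printf("padded msg %d, cipher %d, token %d\n", padded, cipher, token);
        return 0;
}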
+
+static
+__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx,
+                            struct ptlrpc_bulk_desc *desc)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       int               blocksize, i;
+
+       LASSERT(desc->bd_iov_count);
+       LASSERT(desc->bd_enc_iov);
+       LASSERT(kctx->kc_keye.kb_tfm);
+
+       blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               LASSERT(desc->bd_enc_iov[i].kiov_page);
+               /*
+                * the offset should always start at a page boundary, on both
+                * the client and the server side.
+                */
+               if (desc->bd_iov[i].kiov_offset & blocksize) {
+                       CERROR("odd offset %d in page %d\n",
+                              desc->bd_iov[i].kiov_offset, i);
+                       return GSS_S_FAILURE;
+               }
+
+               desc->bd_enc_iov[i].kiov_offset = desc->bd_iov[i].kiov_offset;
+               desc->bd_enc_iov[i].kiov_len = (desc->bd_iov[i].kiov_len +
+                                               blocksize - 1) & (~(blocksize - 1));
+       }
+
+       return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx,
+                            struct ptlrpc_bulk_desc *desc,
+                            rawobj_t *token, int adj_nob)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       int               blocksize;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       rawobj_t             data_desc[1], cipher;
+       __u8             conf[GSS_MAX_CIPHER_BLOCK];
+       int               rc = 0;
+
+       LASSERT(ke);
+       LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+
+       /*
+        * final token format:
+        * --------------------------------------------------
+        * | krb5 header | head/tail cipher text | checksum |
+        * --------------------------------------------------
+        */
+
+       /* fill krb5 header */
+       LASSERT(token->len >= sizeof(*khdr));
+       khdr = (struct krb5_header *) token->data;
+       fill_krb5_header(kctx, khdr, 1);
+
+       /* generate confounder */
+       cfs_get_random_bytes(conf, ke->ke_conf_size);
+
+       /* get the encryption blocksize. note kc_keye might not be associated
+        * with a tfm; currently that is only the case for arcfour-hmac */
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LASSERT(kctx->kc_keye.kb_tfm == NULL);
+               blocksize = 1;
+       } else {
+               LASSERT(kctx->kc_keye.kb_tfm);
+               blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+       }
+
+       /*
+        * we assume the size of krb5_header (16 bytes) is a multiple of
+        * blocksize. the bulk token size is then exactly (sizeof(krb5_header) +
+        * blocksize + sizeof(krb5_header) + hashsize).
+        */
+       LASSERT(blocksize <= ke->ke_conf_size);
+       LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+       LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16);
+
+       /*
+        * clear text layout for checksum:
+        * ------------------------------------------
+        * | confounder | clear pages | krb5 header |
+        * ------------------------------------------
+        */
+       data_desc[0].data = conf;
+       data_desc[0].len = ke->ke_conf_size;
+
+       /* compute checksum */
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                              khdr, 1, data_desc,
+                              desc->bd_iov_count, desc->bd_iov,
+                              &cksum))
+               return GSS_S_FAILURE;
+       LASSERT(cksum.len >= ke->ke_hash_size);
+
+       /*
+        * clear text layout for encryption:
+        * ------------------------------------------
+        * | confounder | clear pages | krb5 header |
+        * ------------------------------------------
+        *        |              |             |
+        *        ----------  (cipher pages)   |
+        * result token:   |                   |
+        * -------------------------------------------
+        * | krb5 header | cipher text | cipher text |
+        * -------------------------------------------
+        */
+       data_desc[0].data = conf;
+       data_desc[0].len = ke->ke_conf_size;
+
+       cipher.data = (__u8 *) (khdr + 1);
+       cipher.len = blocksize + sizeof(*khdr);
+
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LBUG();
+               rc = 0;
+       } else {
+               rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+                                      conf, desc, &cipher, adj_nob);
+       }
+
+       if (rc != 0) {
+               rawobj_free(&cksum);
+               return GSS_S_FAILURE;
+       }
+
+       /* fill in checksum */
+       LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+       memcpy((char *)(khdr + 1) + cipher.len,
+              cksum.data + cksum.len - ke->ke_hash_size,
+              ke->ke_hash_size);
+       rawobj_free(&cksum);
+
+       /* final token length */
+       token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+       return GSS_S_COMPLETE;
+}
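Per the comment above, the inline part of a bulk wrap token for the aes entries works out to sizeof(krb5_header) + blocksize + sizeof(krb5_header) + hash size = 16 + 16 + 16 + 12 = 60 bytes; the bulk data itself is encrypted in place into bd_enc_iov by krb5_encrypt_bulk() and never enters the token buffer.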
+
+static
+__u32 gss_unwrap_kerberos(struct gss_ctx  *gctx,
+                         rawobj_t      *gsshdr,
+                         rawobj_t      *token,
+                         rawobj_t      *msg)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       unsigned char       *tmpbuf;
+       int               blocksize, bodysize;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       rawobj_t             cipher_in, plain_out;
+       rawobj_t             hash_objs[3];
+       int               rc = 0;
+       __u32           major;
+
+       LASSERT(ke);
+
+       if (token->len < sizeof(*khdr)) {
+               CERROR("short signature: %u\n", token->len);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       khdr = (struct krb5_header *) token->data;
+
+       major = verify_krb5_header(kctx, khdr, 1);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("bad krb5 header\n");
+               return major;
+       }
+
+       /* block size */
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LASSERT(kctx->kc_keye.kb_tfm == NULL);
+               blocksize = 1;
+       } else {
+               LASSERT(kctx->kc_keye.kb_tfm);
+               blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+       }
+
+       /* expected token layout:
+        * ----------------------------------------
+        * | krb5 header | cipher text | checksum |
+        * ----------------------------------------
+        */
+       bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size;
+
+       if (bodysize % blocksize) {
+               CERROR("odd bodysize %d\n", bodysize);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) {
+               CERROR("incomplete token: bodysize %d\n", bodysize);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) {
+               CERROR("buffer too small: %u, require %d\n", msg->len,
+                      (int)(bodysize - ke->ke_conf_size - sizeof(*khdr)));
+               return GSS_S_FAILURE;
+       }
+
+       /* decrypting */
+       OBD_ALLOC_LARGE(tmpbuf, bodysize);
+       if (!tmpbuf)
+               return GSS_S_FAILURE;
+
+       major = GSS_S_FAILURE;
+
+       cipher_in.data = (__u8 *) (khdr + 1);
+       cipher_in.len = bodysize;
+       plain_out.data = tmpbuf;
+       plain_out.len = bodysize;
+
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               rawobj_t                 arc4_keye;
+               struct ll_crypto_cipher *arc4_tfm;
+
+               cksum.data = token->data + token->len - ke->ke_hash_size;
+               cksum.len = ke->ke_hash_size;
+
+               if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
+                                      NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
+                       CERROR("failed to obtain arc4 enc key\n");
+                       GOTO(arc4_out, rc = -EACCES);
+               }
+
+               arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
+               if (IS_ERR(arc4_tfm)) {
+                       CERROR("failed to alloc tfm arc4 in ECB mode\n");
+                       GOTO(arc4_out_key, rc = -EACCES);
+               }
+
+               if (ll_crypto_blkcipher_setkey(arc4_tfm,
+                                        arc4_keye.data, arc4_keye.len)) {
+                       CERROR("failed to set arc4 key, len %d\n",
+                              arc4_keye.len);
+                       GOTO(arc4_out_tfm, rc = -EACCES);
+               }
+
+               rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+                                         1, &cipher_in, &plain_out, 0);
+arc4_out_tfm:
+               ll_crypto_free_blkcipher(arc4_tfm);
+arc4_out_key:
+               rawobj_free(&arc4_keye);
+arc4_out:
+               cksum = RAWOBJ_EMPTY;
+       } else {
+               rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+                                         1, &cipher_in, &plain_out, 0);
+       }
+
+       if (rc != 0) {
+               CERROR("error during decryption\n");
+               goto out_free;
+       }
+       LASSERT(plain_out.len == bodysize);
+
+       /* expected clear text layout:
+        * -----------------------------------------
+        * | confounder | clear msgs | krb5 header |
+        * -----------------------------------------
+        */
+
+       /* verify krb5 header in token is not modified */
+       if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr),
+                  sizeof(*khdr))) {
+               CERROR("decrypted krb5 header mismatch\n");
+               goto out_free;
+       }
+
+       /* verify checksum, compose clear text as layout:
+        * ------------------------------------------------------
+        * | confounder | gss header | clear msgs | krb5 header |
+        * ------------------------------------------------------
+        */
+       hash_objs[0].len = ke->ke_conf_size;
+       hash_objs[0].data = plain_out.data;
+       hash_objs[1].len = gsshdr->len;
+       hash_objs[1].data = gsshdr->data;
+       hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr);
+       hash_objs[2].data = plain_out.data + ke->ke_conf_size;
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                              khdr, 3, hash_objs, 0, NULL, &cksum))
+               goto out_free;
+
+       LASSERT(cksum.len >= ke->ke_hash_size);
+       if (memcmp((char *)(khdr + 1) + bodysize,
+                  cksum.data + cksum.len - ke->ke_hash_size,
+                  ke->ke_hash_size)) {
+               CERROR("checksum mismatch\n");
+               goto out_free;
+       }
+
+       msg->len =  bodysize - ke->ke_conf_size - sizeof(*khdr);
+       memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len);
+
+       major = GSS_S_COMPLETE;
+out_free:
+       OBD_FREE_LARGE(tmpbuf, bodysize);
+       rawobj_free(&cksum);
+       return major;
+}
+
+static
+__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx,
+                              struct ptlrpc_bulk_desc *desc,
+                              rawobj_t *token, int adj_nob)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       int               blocksize;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       rawobj_t             cipher, plain;
+       rawobj_t             data_desc[1];
+       int               rc;
+       __u32           major;
+
+       LASSERT(ke);
+
+       if (token->len < sizeof(*khdr)) {
+               CERROR("short signature: %u\n", token->len);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       khdr = (struct krb5_header *) token->data;
+
+       major = verify_krb5_header(kctx, khdr, 1);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("bad krb5 header\n");
+               return major;
+       }
+
+       /* block size */
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LASSERT(kctx->kc_keye.kb_tfm == NULL);
+               blocksize = 1;
+               LBUG();
+       } else {
+               LASSERT(kctx->kc_keye.kb_tfm);
+               blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+       }
+       LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+
+       /*
+        * token format is expected as:
+        * -----------------------------------------------
+        * | krb5 header | head/tail cipher text | cksum |
+        * -----------------------------------------------
+        */
+       if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) +
+                        ke->ke_hash_size) {
+               CERROR("short token size: %u\n", token->len);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       cipher.data = (__u8 *) (khdr + 1);
+       cipher.len = blocksize + sizeof(*khdr);
+       plain.data = cipher.data;
+       plain.len = cipher.len;
+
+       rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+                              desc, &cipher, &plain, adj_nob);
+       if (rc)
+               return GSS_S_DEFECTIVE_TOKEN;
+
+       /*
+        * verify checksum, compose clear text as layout:
+        * ------------------------------------------
+        * | confounder | clear pages | krb5 header |
+        * ------------------------------------------
+        */
+       data_desc[0].data = plain.data;
+       data_desc[0].len = blocksize;
+
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                              khdr, 1, data_desc,
+                              desc->bd_iov_count, desc->bd_iov,
+                              &cksum))
+               return GSS_S_FAILURE;
+       LASSERT(cksum.len >= ke->ke_hash_size);
+
+       if (memcmp(plain.data + blocksize + sizeof(*khdr),
+                  cksum.data + cksum.len - ke->ke_hash_size,
+                  ke->ke_hash_size)) {
+               CERROR("checksum mismatch\n");
+               rawobj_free(&cksum);
+               return GSS_S_BAD_SIG;
+       }
+
+       rawobj_free(&cksum);
+       return GSS_S_COMPLETE;
+}
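+/*
+ * Note that only the head (confounder) and tail (embedded krb5 header)
+ * cipher text travel inside the token; the bulk pages referenced by @desc
+ * are decrypted in place by krb5_decrypt_bulk() above.
+ */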
+
+int gss_display_kerberos(struct gss_ctx        *ctx,
+                        char             *buf,
+                        int                bufsize)
+{
+       struct krb5_ctx    *kctx = ctx->internal_ctx_id;
+       int              written;
+
+       written = snprintf(buf, bufsize, "krb5 (%s)",
+                          enctype2str(kctx->kc_enctype));
+       return written;
+}
+
+static struct gss_api_ops gss_kerberos_ops = {
+       .gss_import_sec_context     = gss_import_sec_context_kerberos,
+       .gss_copy_reverse_context   = gss_copy_reverse_context_kerberos,
+       .gss_inquire_context    = gss_inquire_context_kerberos,
+       .gss_get_mic            = gss_get_mic_kerberos,
+       .gss_verify_mic      = gss_verify_mic_kerberos,
+       .gss_wrap                  = gss_wrap_kerberos,
+       .gss_unwrap              = gss_unwrap_kerberos,
+       .gss_prep_bulk        = gss_prep_bulk_kerberos,
+       .gss_wrap_bulk        = gss_wrap_bulk_kerberos,
+       .gss_unwrap_bulk            = gss_unwrap_bulk_kerberos,
+       .gss_delete_sec_context     = gss_delete_sec_context_kerberos,
+       .gss_display            = gss_display_kerberos,
+};
+
+static struct subflavor_desc gss_kerberos_sfs[] = {
+       {
+               .sf_subflavor   = SPTLRPC_SUBFLVR_KRB5N,
+               .sf_qop  = 0,
+               .sf_service     = SPTLRPC_SVC_NULL,
+               .sf_name        = "krb5n"
+       },
+       {
+               .sf_subflavor   = SPTLRPC_SUBFLVR_KRB5A,
+               .sf_qop  = 0,
+               .sf_service     = SPTLRPC_SVC_AUTH,
+               .sf_name        = "krb5a"
+       },
+       {
+               .sf_subflavor   = SPTLRPC_SUBFLVR_KRB5I,
+               .sf_qop  = 0,
+               .sf_service     = SPTLRPC_SVC_INTG,
+               .sf_name        = "krb5i"
+       },
+       {
+               .sf_subflavor   = SPTLRPC_SUBFLVR_KRB5P,
+               .sf_qop  = 0,
+               .sf_service     = SPTLRPC_SVC_PRIV,
+               .sf_name        = "krb5p"
+       },
+};
+
+/*
+ * currently we leave module owner NULL
+ */
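+/*
+ * gm_oid below is the 9-byte DER body of the standard Kerberos 5 GSS-API
+ * mechanism OID 1.2.840.113554.1.2.2 (RFC 1964).
+ */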
+static struct gss_api_mech gss_kerberos_mech = {
+       .gm_owner       = NULL, /*THIS_MODULE, */
+       .gm_name        = "krb5",
+       .gm_oid  = (rawobj_t)
+                               {9, "\052\206\110\206\367\022\001\002\002"},
+       .gm_ops  = &gss_kerberos_ops,
+       .gm_sf_num      = 4,
+       .gm_sfs  = gss_kerberos_sfs,
+};
+
+int __init init_kerberos_module(void)
+{
+       int status;
+
+       spin_lock_init(&krb5_seq_lock);
+
+       status = lgss_mech_register(&gss_kerberos_mech);
+       if (status)
+               CERROR("Failed to register kerberos gss mechanism!\n");
+       return status;
+}
+
+void __exit cleanup_kerberos_module(void)
+{
+       lgss_mech_unregister(&gss_kerberos_mech);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c
new file mode 100644 (file)
index 0000000..8cdad80
--- /dev/null
@@ -0,0 +1,359 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_mech_switch.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  J. Bruce Fields   <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static LIST_HEAD(registered_mechs);
+static DEFINE_SPINLOCK(registered_mechs_lock);
+
+int lgss_mech_register(struct gss_api_mech *gm)
+{
+       spin_lock(&registered_mechs_lock);
+       list_add(&gm->gm_list, &registered_mechs);
+       spin_unlock(&registered_mechs_lock);
+       CWARN("Register %s mechanism\n", gm->gm_name);
+       return 0;
+}
+
+void lgss_mech_unregister(struct gss_api_mech *gm)
+{
+       spin_lock(&registered_mechs_lock);
+       list_del(&gm->gm_list);
+       spin_unlock(&registered_mechs_lock);
+       CWARN("Unregister %s mechanism\n", gm->gm_name);
+}
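+/*
+ * Typical usage by a mechanism module (cf. init_kerberos_module() in
+ * gss_krb5_mech.c); the "foo" names below are illustrative only:
+ *
+ *      static int __init init_foo_module(void)
+ *      {
+ *              int rc = lgss_mech_register(&gss_foo_mech);
+ *
+ *              if (rc)
+ *                      CERROR("Failed to register foo gss mechanism!\n");
+ *              return rc;
+ *      }
+ *
+ *      static void __exit cleanup_foo_module(void)
+ *      {
+ *              lgss_mech_unregister(&gss_foo_mech);
+ *      }
+ */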
+
+
+struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm)
+{
+       __module_get(gm->gm_owner);
+       return gm;
+}
+
+struct gss_api_mech *lgss_name_to_mech(char *name)
+{
+       struct gss_api_mech *pos, *gm = NULL;
+
+       spin_lock(&registered_mechs_lock);
+       list_for_each_entry(pos, &registered_mechs, gm_list) {
+               if (0 == strcmp(name, pos->gm_name)) {
+                       if (!try_module_get(pos->gm_owner))
+                               continue;
+                       gm = pos;
+                       break;
+               }
+       }
+       spin_unlock(&registered_mechs_lock);
+       return gm;
+}
+
+static inline
+int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor)
+{
+       int i;
+
+       for (i = 0; i < gm->gm_sf_num; i++) {
+               if (gm->gm_sfs[i].sf_subflavor == subflavor)
+                       return 1;
+       }
+       return 0;
+}
+
+struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor)
+{
+       struct gss_api_mech *pos, *gm = NULL;
+
+       spin_lock(&registered_mechs_lock);
+       list_for_each_entry(pos, &registered_mechs, gm_list) {
+               if (!try_module_get(pos->gm_owner))
+                       continue;
+               if (!mech_supports_subflavor(pos, subflavor)) {
+                       module_put(pos->gm_owner);
+                       continue;
+               }
+               gm = pos;
+               break;
+       }
+       spin_unlock(&registered_mechs_lock);
+       return gm;
+}
+
+void lgss_mech_put(struct gss_api_mech *gm)
+{
+       module_put(gm->gm_owner);
+}
+
+/* The mech could probably be determined from the token instead, but it's just
+ * as easy for now to pass it in. */
+__u32 lgss_import_sec_context(rawobj_t *input_token,
+                             struct gss_api_mech *mech,
+                             struct gss_ctx **ctx_id)
+{
+       OBD_ALLOC_PTR(*ctx_id);
+       if (*ctx_id == NULL)
+               return GSS_S_FAILURE;
+
+       (*ctx_id)->mech_type = lgss_mech_get(mech);
+
+       LASSERT(mech);
+       LASSERT(mech->gm_ops);
+       LASSERT(mech->gm_ops->gss_import_sec_context);
+       return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id);
+}
+
+__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id,
+                               struct gss_ctx **ctx_id_new)
+{
+       struct gss_api_mech *mech = ctx_id->mech_type;
+       __u32           major;
+
+       LASSERT(mech);
+
+       OBD_ALLOC_PTR(*ctx_id_new);
+       if (*ctx_id_new == NULL)
+               return GSS_S_FAILURE;
+
+       (*ctx_id_new)->mech_type = lgss_mech_get(mech);
+
+       LASSERT(mech);
+       LASSERT(mech->gm_ops);
+       LASSERT(mech->gm_ops->gss_copy_reverse_context);
+
+       major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new);
+       if (major != GSS_S_COMPLETE) {
+               lgss_mech_put(mech);
+               OBD_FREE_PTR(*ctx_id_new);
+               *ctx_id_new = NULL;
+       }
+       return major;
+}
+
+/*
+ * this interface is much simplified, currently we only need endtime.
+ */
+__u32 lgss_inquire_context(struct gss_ctx *context_handle,
+                          unsigned long  *endtime)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_inquire_context(context_handle,
+                                     endtime);
+}
+
+/* gss_get_mic: compute a mic over message and return mic_token. */
+__u32 lgss_get_mic(struct gss_ctx *context_handle,
+                  int msgcnt,
+                  rawobj_t *msg,
+                  int iovcnt,
+                  lnet_kiov_t *iovs,
+                  rawobj_t *mic_token)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_get_mic);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_get_mic(context_handle,
+                             msgcnt,
+                             msg,
+                             iovcnt,
+                             iovs,
+                             mic_token);
+}
+
+/* gss_verify_mic: check whether the provided mic_token verifies message. */
+__u32 lgss_verify_mic(struct gss_ctx *context_handle,
+                     int msgcnt,
+                     rawobj_t *msg,
+                     int iovcnt,
+                     lnet_kiov_t *iovs,
+                     rawobj_t *mic_token)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_verify_mic(context_handle,
+                                msgcnt,
+                                msg,
+                                iovcnt,
+                                iovs,
+                                mic_token);
+}
+
+__u32 lgss_wrap(struct gss_ctx *context_handle,
+               rawobj_t *gsshdr,
+               rawobj_t *msg,
+               int msg_buflen,
+               rawobj_t *out_token)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_wrap);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_wrap(context_handle, gsshdr, msg, msg_buflen, out_token);
+}
+
+__u32 lgss_unwrap(struct gss_ctx *context_handle,
+                 rawobj_t *gsshdr,
+                 rawobj_t *token,
+                 rawobj_t *out_msg)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_unwrap);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_unwrap(context_handle, gsshdr, token, out_msg);
+}
+
+
+__u32 lgss_prep_bulk(struct gss_ctx *context_handle,
+                    struct ptlrpc_bulk_desc *desc)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_prep_bulk(context_handle, desc);
+}
+
+__u32 lgss_wrap_bulk(struct gss_ctx *context_handle,
+                    struct ptlrpc_bulk_desc *desc,
+                    rawobj_t *token,
+                    int adj_nob)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_wrap_bulk(context_handle, desc, token, adj_nob);
+}
+
+__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle,
+                      struct ptlrpc_bulk_desc *desc,
+                      rawobj_t *token,
+                      int adj_nob)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_unwrap_bulk(context_handle, desc, token, adj_nob);
+}
+
+/* gss_delete_sec_context: free all resources associated with context_handle.
+ * Note this differs from the RFC 2744-specified prototype in that we don't
+ * bother returning an output token, since it would never be used anyway. */
+
+__u32 lgss_delete_sec_context(struct gss_ctx **context_handle)
+{
+       struct gss_api_mech *mech;
+
+       CDEBUG(D_SEC, "deleting %p\n", *context_handle);
+
+       if (!*context_handle)
+               return GSS_S_NO_CONTEXT;
+
+       mech = (*context_handle)->mech_type;
+       if ((*context_handle)->internal_ctx_id != 0) {
+               LASSERT(mech);
+               LASSERT(mech->gm_ops);
+               LASSERT(mech->gm_ops->gss_delete_sec_context);
+               mech->gm_ops->gss_delete_sec_context(
+                                       (*context_handle)->internal_ctx_id);
+       }
+       if (mech)
+               lgss_mech_put(mech);
+
+       OBD_FREE_PTR(*context_handle);
+       *context_handle = NULL;
+       return GSS_S_COMPLETE;
+}
+
+int lgss_display(struct gss_ctx *ctx,
+                char      *buf,
+                int         bufsize)
+{
+       LASSERT(ctx);
+       LASSERT(ctx->mech_type);
+       LASSERT(ctx->mech_type->gm_ops);
+       LASSERT(ctx->mech_type->gm_ops->gss_display);
+
+       return ctx->mech_type->gm_ops->gss_display(ctx, buf, bufsize);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c
new file mode 100644 (file)
index 0000000..3df7257
--- /dev/null
@@ -0,0 +1,1252 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/crypto.h>
+#include <asm/atomic.h>
+struct rpc_clnt; /* for rpc_pipefs */
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_sec.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct ptlrpc_sec_policy gss_policy_pipefs;
+static struct ptlrpc_ctx_ops gss_pipefs_ctxops;
+
+static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx);
+
+static int gss_sec_pipe_upcall_init(struct gss_sec *gsec)
+{
+       return 0;
+}
+
+static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec)
+{
+}
+
+/****************************************
+ * internal context helpers          *
+ ****************************************/
+
+static
+struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec,
+                                    struct vfs_cred *vcred)
+{
+       struct gss_cli_ctx *gctx;
+       int              rc;
+
+       OBD_ALLOC_PTR(gctx);
+       if (gctx == NULL)
+               return NULL;
+
+       rc = gss_cli_ctx_init_common(sec, &gctx->gc_base,
+                                    &gss_pipefs_ctxops, vcred);
+       if (rc) {
+               OBD_FREE_PTR(gctx);
+               return NULL;
+       }
+
+       return &gctx->gc_base;
+}
+
+static
+void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx)
+{
+       struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+       if (gss_cli_ctx_fini_common(sec, ctx))
+               return;
+
+       OBD_FREE_PTR(gctx);
+
+       atomic_dec(&sec->ps_nctx);
+       sptlrpc_sec_put(sec);
+}
+
+static
+void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash)
+{
+       set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+       atomic_inc(&ctx->cc_refcount);
+       hlist_add_head(&ctx->cc_cache, hash);
+}
+
+/*
+ * caller must hold spinlock
+ */
+static
+void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist)
+{
+       LASSERT(spin_is_locked(&ctx->cc_sec->ps_lock));
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+       LASSERT(!hlist_unhashed(&ctx->cc_cache));
+
+       clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+
+       if (atomic_dec_and_test(&ctx->cc_refcount)) {
+               __hlist_del(&ctx->cc_cache);
+               hlist_add_head(&ctx->cc_cache, freelist);
+       } else {
+               hlist_del_init(&ctx->cc_cache);
+       }
+}
+
+/*
+ * return 1 if the context is dead.
+ */
+static
+int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx,
+                      struct hlist_head *freelist)
+{
+       if (cli_ctx_check_death(ctx)) {
+               if (freelist)
+                       ctx_unhash_pf(ctx, freelist);
+               return 1;
+       }
+
+       return 0;
+}
+
+static inline
+int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx,
+                             struct hlist_head *freelist)
+{
+       LASSERT(ctx->cc_sec);
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+
+       return ctx_check_death_pf(ctx, freelist);
+}
+
+static inline
+int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
+{
+       /* a small optimization for the null policy */
+       if (!ctx->cc_ops->match)
+               return 1;
+
+       return ctx->cc_ops->match(ctx, vcred);
+}
+
+static
+void ctx_list_destroy_pf(struct hlist_head *head)
+{
+       struct ptlrpc_cli_ctx *ctx;
+
+       while (!hlist_empty(head)) {
+               ctx = hlist_entry(head->first, struct ptlrpc_cli_ctx,
+                                     cc_cache);
+
+               LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+               LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT,
+                                    &ctx->cc_flags) == 0);
+
+               hlist_del_init(&ctx->cc_cache);
+               ctx_destroy_pf(ctx->cc_sec, ctx);
+       }
+}
+
+/****************************************
+ * context apis                         *
+ ****************************************/
+
+static
+int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx)
+{
+       if (ctx_check_death_pf(ctx, NULL))
+               return 1;
+       if (cli_ctx_is_ready(ctx))
+               return 0;
+       return 1;
+}
+
+static
+void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace)
+{
+       LASSERT(ctx->cc_sec);
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+       cli_ctx_expire(ctx);
+
+       spin_lock(&ctx->cc_sec->ps_lock);
+
+       if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) {
+               LASSERT(!hlist_unhashed(&ctx->cc_cache));
+               LASSERT(atomic_read(&ctx->cc_refcount) > 1);
+
+               hlist_del_init(&ctx->cc_cache);
+               if (atomic_dec_and_test(&ctx->cc_refcount))
+                       LBUG();
+       }
+
+       spin_unlock(&ctx->cc_sec->ps_lock);
+}
+
+/****************************************
+ * reverse context installation         *
+ ****************************************/
+
+static inline
+unsigned int ctx_hash_index(int hashsize, __u64 key)
+{
+       return (unsigned int) (key & ((__u64) hashsize - 1));
+}
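+/* the masking above assumes hashsize is a power of two; the pipefs policy
+ * only ever uses 1 or GSS_SEC_PIPEFS_CTX_HASH_SIZE (32). */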
+
+static
+void gss_sec_ctx_replace_pf(struct gss_sec *gsec,
+                           struct ptlrpc_cli_ctx *new)
+{
+       struct gss_sec_pipefs *gsec_pf;
+       struct ptlrpc_cli_ctx *ctx;
+       struct hlist_node     *next;
+       HLIST_HEAD(freelist);
+       unsigned int hash;
+       ENTRY;
+
+       gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+       hash = ctx_hash_index(gsec_pf->gsp_chash_size,
+                             (__u64) new->cc_vcred.vc_uid);
+       LASSERT(hash < gsec_pf->gsp_chash_size);
+
+       spin_lock(&gsec->gs_base.ps_lock);
+
+       hlist_for_each_entry_safe(ctx, next,
+                                     &gsec_pf->gsp_chash[hash], cc_cache) {
+               if (!ctx_match_pf(ctx, &new->cc_vcred))
+                       continue;
+
+               cli_ctx_expire(ctx);
+               ctx_unhash_pf(ctx, &freelist);
+               break;
+       }
+
+       ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]);
+
+       spin_unlock(&gsec->gs_base.ps_lock);
+
+       ctx_list_destroy_pf(&freelist);
+       EXIT;
+}
+
+static
+int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec,
+                              struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct vfs_cred   vcred;
+       struct ptlrpc_cli_ctx   *cli_ctx;
+       int                   rc;
+       ENTRY;
+
+       vcred.vc_uid = 0;
+       vcred.vc_gid = 0;
+
+       cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred);
+       if (!cli_ctx)
+               RETURN(-ENOMEM);
+
+       rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx);
+       if (rc) {
+               ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx);
+               RETURN(rc);
+       }
+
+       gss_sec_ctx_replace_pf(gsec, cli_ctx);
+       RETURN(0);
+}
+
+static
+void gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf,
+                        struct hlist_head *freelist)
+{
+       struct ptlrpc_sec       *sec;
+       struct ptlrpc_cli_ctx   *ctx;
+       struct hlist_node       *next;
+       int i;
+       ENTRY;
+
+       sec = &gsec_pf->gsp_base.gs_base;
+
+       CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec);
+
+       for (i = 0; i < gsec_pf->gsp_chash_size; i++) {
+               hlist_for_each_entry_safe(ctx, next,
+                                             &gsec_pf->gsp_chash[i], cc_cache)
+                       ctx_check_death_locked_pf(ctx, freelist);
+       }
+
+       sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+       EXIT;
+}
+
+static
+struct ptlrpc_sec* gss_sec_create_pf(struct obd_import *imp,
+                                    struct ptlrpc_svc_ctx *ctx,
+                                    struct sptlrpc_flavor *sf)
+{
+       struct gss_sec_pipefs   *gsec_pf;
+       int                   alloc_size, hash_size, i;
+       ENTRY;
+
+#define GSS_SEC_PIPEFS_CTX_HASH_SIZE    (32)
+
+       if (ctx ||
+           sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE))
+               hash_size = 1;
+       else
+               hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE;
+
+       alloc_size = sizeof(*gsec_pf) +
+                    sizeof(struct hlist_head) * hash_size;
+
+       OBD_ALLOC(gsec_pf, alloc_size);
+       if (!gsec_pf)
+               RETURN(NULL);
+
+       gsec_pf->gsp_chash_size = hash_size;
+       for (i = 0; i < hash_size; i++)
+               INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]);
+
+       if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs,
+                                 imp, ctx, sf))
+               goto err_free;
+
+       if (ctx == NULL) {
+               if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base))
+                       goto err_destroy;
+       } else {
+               if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx))
+                       goto err_destroy;
+       }
+
+       RETURN(&gsec_pf->gsp_base.gs_base);
+
+err_destroy:
+       gss_sec_destroy_common(&gsec_pf->gsp_base);
+err_free:
+       OBD_FREE(gsec_pf, alloc_size);
+       RETURN(NULL);
+}
+
+static
+void gss_sec_destroy_pf(struct ptlrpc_sec *sec)
+{
+       struct gss_sec_pipefs   *gsec_pf;
+       struct gss_sec    *gsec;
+
+       CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec);
+
+       gsec = container_of(sec, struct gss_sec, gs_base);
+       gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+       LASSERT(gsec_pf->gsp_chash);
+       LASSERT(gsec_pf->gsp_chash_size);
+
+       gss_sec_pipe_upcall_fini(gsec);
+
+       gss_sec_destroy_common(gsec);
+
+       OBD_FREE(gsec, sizeof(*gsec_pf) +
+                      sizeof(struct hlist_head) * gsec_pf->gsp_chash_size);
+}
+
+static
+struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec,
+                                             struct vfs_cred *vcred,
+                                             int create, int remove_dead)
+{
+       struct gss_sec   *gsec;
+       struct gss_sec_pipefs  *gsec_pf;
+       struct ptlrpc_cli_ctx  *ctx = NULL, *new = NULL;
+       struct hlist_head       *hash_head;
+       struct hlist_node       *next;
+       HLIST_HEAD(freelist);
+       unsigned int        hash, gc = 0, found = 0;
+       ENTRY;
+
+       might_sleep();
+
+       gsec = container_of(sec, struct gss_sec, gs_base);
+       gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+       hash = ctx_hash_index(gsec_pf->gsp_chash_size,
+                             (__u64) vcred->vc_uid);
+       hash_head = &gsec_pf->gsp_chash[hash];
+       LASSERT(hash < gsec_pf->gsp_chash_size);
+
+retry:
+       spin_lock(&sec->ps_lock);
+
+       /* gc_next == 0 means never do gc */
+       if (remove_dead && sec->ps_gc_next &&
+           cfs_time_after(cfs_time_current_sec(), sec->ps_gc_next)) {
+               gss_ctx_cache_gc_pf(gsec_pf, &freelist);
+               gc = 1;
+       }
+
+       hlist_for_each_entry_safe(ctx, next, hash_head, cc_cache) {
+               if (gc == 0 &&
+                   ctx_check_death_locked_pf(ctx,
+                                             remove_dead ? &freelist : NULL))
+                       continue;
+
+               if (ctx_match_pf(ctx, vcred)) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (found) {
+               if (new && new != ctx) {
+                       /* lost the race, just free it */
+                       hlist_add_head(&new->cc_cache, &freelist);
+                       new = NULL;
+               }
+
+               /* hot node, move to head */
+               if (hash_head->first != &ctx->cc_cache) {
+                       __hlist_del(&ctx->cc_cache);
+                       hlist_add_head(&ctx->cc_cache, hash_head);
+               }
+       } else {
+               /* don't allocate for reverse sec */
+               if (sec_is_reverse(sec)) {
+                       spin_unlock(&sec->ps_lock);
+                       RETURN(NULL);
+               }
+
+               if (new) {
+                       ctx_enhash_pf(new, hash_head);
+                       ctx = new;
+               } else if (create) {
+                       spin_unlock(&sec->ps_lock);
+                       new = ctx_create_pf(sec, vcred);
+                       if (new) {
+                               clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags);
+                               goto retry;
+                       }
+               } else {
+                       ctx = NULL;
+               }
+       }
+
+       /* hold a ref */
+       if (ctx)
+               atomic_inc(&ctx->cc_refcount);
+
+       spin_unlock(&sec->ps_lock);
+
+       /* the allocator of the context must trigger the first refresh */
+       if (new) {
+               LASSERT(new == ctx);
+               gss_cli_ctx_refresh_pf(new);
+       }
+
+       ctx_list_destroy_pf(&freelist);
+       RETURN(ctx);
+}
+
+static
+void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx,
+                           int sync)
+{
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+       LASSERT(hlist_unhashed(&ctx->cc_cache));
+
+       /* if async destroy is required, clear the UPTODATE bit to prevent
+        * extra rpcs during the destroy procedure. */
+       if (!sync)
+               clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+       /* destroy this context */
+       ctx_destroy_pf(sec, ctx);
+}
+
+/*
+ * @uid: which user. "-1" means flush all.
+ * @grace: mark contexts DEAD, allowing graceful destroy such as notifying
+ *      the server side, etc.
+ * @force: also flush busy entries.
+ *
+ * Return the number of busy contexts encountered.
+ *
+ * In any case, never touch "eternal" contexts.
+ */
+static
+int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec,
+                              uid_t uid,
+                              int grace, int force)
+{
+       struct gss_sec    *gsec;
+       struct gss_sec_pipefs   *gsec_pf;
+       struct ptlrpc_cli_ctx   *ctx;
+       struct hlist_node       *next;
+       HLIST_HEAD(freelist);
+       int i, busy = 0;
+       ENTRY;
+
+       might_sleep_if(grace);
+
+       gsec = container_of(sec, struct gss_sec, gs_base);
+       gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+       spin_lock(&sec->ps_lock);
+       for (i = 0; i < gsec_pf->gsp_chash_size; i++) {
+               hlist_for_each_entry_safe(ctx, next,
+                                             &gsec_pf->gsp_chash[i],
+                                             cc_cache) {
+                       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+                       if (uid != -1 && uid != ctx->cc_vcred.vc_uid)
+                               continue;
+
+                       if (atomic_read(&ctx->cc_refcount) > 1) {
+                               busy++;
+                               if (!force)
+                                       continue;
+
+                               CWARN("flush busy(%d) ctx %p(%u->%s) by force, "
+                                     "grace %d\n",
+                                     atomic_read(&ctx->cc_refcount),
+                                     ctx, ctx->cc_vcred.vc_uid,
+                                     sec2target_str(ctx->cc_sec), grace);
+                       }
+                       ctx_unhash_pf(ctx, &freelist);
+
+                       set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags);
+                       if (!grace)
+                               clear_bit(PTLRPC_CTX_UPTODATE_BIT,
+                                         &ctx->cc_flags);
+               }
+       }
+       spin_unlock(&sec->ps_lock);
+
+       ctx_list_destroy_pf(&freelist);
+       RETURN(busy);
+}
+
+/****************************************
+ * service apis                         *
+ ****************************************/
+
+static
+int gss_svc_accept_pf(struct ptlrpc_request *req)
+{
+       return gss_svc_accept(&gss_policy_pipefs, req);
+}
+
+static
+int gss_svc_install_rctx_pf(struct obd_import *imp,
+                           struct ptlrpc_svc_ctx *ctx)
+{
+       struct ptlrpc_sec *sec;
+       int             rc;
+
+       sec = sptlrpc_import_sec_ref(imp);
+       LASSERT(sec);
+       rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx);
+
+       sptlrpc_sec_put(sec);
+       return rc;
+}
+
+/****************************************
+ * rpc_pipefs definitions             *
+ ****************************************/
+
+#define LUSTRE_PIPE_ROOT       "/lustre"
+#define LUSTRE_PIPE_KRB5       LUSTRE_PIPE_ROOT"/krb5"
+
+struct gss_upcall_msg_data {
+       __u32                      gum_seq;
+       __u32                      gum_uid;
+       __u32                      gum_gid;
+       __u32                      gum_svc;     /* MDS/OSS... */
+       __u64                      gum_nid;     /* peer NID */
+       __u8                        gum_obd[64];    /* client obd name */
+};
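+/* this fixed-size record is the payload handed to lgssd: gss_ctx_refresh_pf()
+ * points gum_base.data/len at it and gss_pipe_upcall() copies it to user
+ * space verbatim. */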
+
+struct gss_upcall_msg {
+       struct rpc_pipe_msg          gum_base;
+       atomic_t                    gum_refcount;
+       struct list_head                      gum_list;
+       __u32                      gum_mechidx;
+       struct gss_sec           *gum_gsec;
+       struct gss_cli_ctx           *gum_gctx;
+       struct gss_upcall_msg_data      gum_data;
+};
+
+static atomic_t upcall_seq = ATOMIC_INIT(0);
+
+static inline
+__u32 upcall_get_sequence(void)
+{
+       return (__u32) atomic_inc_return(&upcall_seq);
+}
+
+enum mech_idx_t {
+       MECH_KRB5   = 0,
+       MECH_MAX
+};
+
+static inline
+__u32 mech_name2idx(const char *name)
+{
+       LASSERT(!strcmp(name, "krb5"));
+       return MECH_KRB5;
+}
+
+/* pipefs dentries for each mechanism */
+static struct dentry *de_pipes[MECH_MAX] = { NULL, };
+/* all upcall messages linked here */
+static struct list_head upcall_lists[MECH_MAX];
+/* and protected by this */
+static spinlock_t upcall_locks[MECH_MAX];
+
+static inline
+void upcall_list_lock(int idx)
+{
+       spin_lock(&upcall_locks[idx]);
+}
+
+static inline
+void upcall_list_unlock(int idx)
+{
+       spin_unlock(&upcall_locks[idx]);
+}
+
+static
+void upcall_msg_enlist(struct gss_upcall_msg *msg)
+{
+       __u32 idx = msg->gum_mechidx;
+
+       upcall_list_lock(idx);
+       list_add(&msg->gum_list, &upcall_lists[idx]);
+       upcall_list_unlock(idx);
+}
+
+static
+void upcall_msg_delist(struct gss_upcall_msg *msg)
+{
+       __u32 idx = msg->gum_mechidx;
+
+       upcall_list_lock(idx);
+       list_del_init(&msg->gum_list);
+       upcall_list_unlock(idx);
+}
+
+/****************************************
+ * rpc_pipefs upcall helpers       *
+ ****************************************/
+
+static
+void gss_release_msg(struct gss_upcall_msg *gmsg)
+{
+       ENTRY;
+       LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+       if (!atomic_dec_and_test(&gmsg->gum_refcount)) {
+               EXIT;
+               return;
+       }
+
+       if (gmsg->gum_gctx) {
+               sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base);
+               sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1);
+               gmsg->gum_gctx = NULL;
+       }
+
+       LASSERT(list_empty(&gmsg->gum_list));
+       LASSERT(list_empty(&gmsg->gum_base.list));
+       OBD_FREE_PTR(gmsg);
+       EXIT;
+}
+
+static
+void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg)
+{
+       __u32 idx = gmsg->gum_mechidx;
+
+       LASSERT(idx < MECH_MAX);
+       LASSERT(spin_is_locked(&upcall_locks[idx]));
+
+       if (list_empty(&gmsg->gum_list))
+               return;
+
+       list_del_init(&gmsg->gum_list);
+       LASSERT(atomic_read(&gmsg->gum_refcount) > 1);
+       atomic_dec(&gmsg->gum_refcount);
+}
+
+static
+void gss_unhash_msg(struct gss_upcall_msg *gmsg)
+{
+       __u32 idx = gmsg->gum_mechidx;
+
+       LASSERT(idx < MECH_MAX);
+       upcall_list_lock(idx);
+       gss_unhash_msg_nolock(gmsg);
+       upcall_list_unlock(idx);
+}
+
+static
+void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg)
+{
+       if (gmsg->gum_gctx) {
+               struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base;
+
+               LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+               sptlrpc_cli_ctx_expire(ctx);
+               set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+       }
+}
+
+static
+struct gss_upcall_msg * gss_find_upcall(__u32 mechidx, __u32 seq)
+{
+       struct gss_upcall_msg *gmsg;
+
+       upcall_list_lock(mechidx);
+       list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) {
+               if (gmsg->gum_data.gum_seq != seq)
+                       continue;
+
+               LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+               LASSERT(gmsg->gum_mechidx == mechidx);
+
+               atomic_inc(&gmsg->gum_refcount);
+               upcall_list_unlock(mechidx);
+               return gmsg;
+       }
+       upcall_list_unlock(mechidx);
+       return NULL;
+}
+
+static
+int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen)
+{
+       if (*buflen < reslen) {
+               CERROR("buflen %u < %u\n", *buflen, reslen);
+               return -EINVAL;
+       }
+
+       memcpy(res, *buf, reslen);
+       *buf += reslen;
+       *buflen -= reslen;
+       return 0;
+}
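+/* simple_get_bytes() consumes fixed-size fields from the downcall buffer in
+ * the order documented in gss_pipe_downcall() below. */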
+
+/****************************************
+ * rpc_pipefs apis                   *
+ ****************************************/
+
+static
+ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+                       char *dst, size_t buflen)
+{
+       char *data = (char *)msg->data + msg->copied;
+       ssize_t mlen = msg->len;
+       ssize_t left;
+       ENTRY;
+
+       if (mlen > buflen)
+               mlen = buflen;
+       left = copy_to_user(dst, data, mlen);
+       if (left == mlen) {
+               /* copy_to_user() returns the number of uncopied bytes and is
+                * never negative; a complete failure means -EFAULT, a partial
+                * copy is accounted for below. */
+               msg->errno = -EFAULT;
+               RETURN(-EFAULT);
+       }
+       mlen -= left;
+       msg->copied += mlen;
+       msg->errno = 0;
+       RETURN(mlen);
+}
+
+static
+ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen)
+{
+       struct rpc_inode        *rpci = RPC_I(filp->f_dentry->d_inode);
+       struct gss_upcall_msg   *gss_msg;
+       struct ptlrpc_cli_ctx   *ctx;
+       struct gss_cli_ctx      *gctx = NULL;
+       char                *buf, *data;
+       int                   datalen;
+       int                   timeout, rc;
+       __u32               mechidx, seq, gss_err;
+       ENTRY;
+
+       mechidx = (__u32) (long) rpci->private;
+       LASSERT(mechidx < MECH_MAX);
+
+       OBD_ALLOC(buf, mlen);
+       if (!buf)
+               RETURN(-ENOMEM);
+
+       if (copy_from_user(buf, src, mlen)) {
+               CERROR("failed to copy user space data\n");
+               GOTO(out_free, rc = -EFAULT);
+       }
+       data = buf;
+       datalen = mlen;
+
+       /* data passed down format:
+        *  - seq
+        *  - timeout
+        *  - gc_win / error
+        *  - wire_ctx (rawobj)
+        *  - mech_ctx (rawobj)
+        */
+       if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) {
+               CERROR("fail to get seq\n");
+               GOTO(out_free, rc = -EFAULT);
+       }
+
+       gss_msg = gss_find_upcall(mechidx, seq);
+       if (!gss_msg) {
+               CERROR("upcall %u has aborted earlier\n", seq);
+               GOTO(out_free, rc = -EINVAL);
+       }
+
+       gss_unhash_msg(gss_msg);
+       gctx = gss_msg->gum_gctx;
+       LASSERT(gctx);
+       LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0);
+
+       /* timeout is not in use for now */
+       if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout)))
+               GOTO(out_msg, rc = -EFAULT);
+
+       /* lgssd signals an error by setting gc_win == 0 */
+       if (simple_get_bytes(&data, &datalen, &gctx->gc_win,
+                            sizeof(gctx->gc_win)))
+               GOTO(out_msg, rc = -EFAULT);
+
+       if (gctx->gc_win == 0) {
+               /* followed by:
+                * - rpc error
+                * - gss error
+                */
+               if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc)))
+                       GOTO(out_msg, rc = -EFAULT);
+               if (simple_get_bytes(&data, &datalen, &gss_err, sizeof(gss_err)))
+                       GOTO(out_msg, rc = -EFAULT);
+
+               if (rc == 0 && gss_err == GSS_S_COMPLETE) {
+                       CWARN("neither rpc nor gss error code is set\n");
+                       rc = -EPERM;
+               }
+       } else {
+               rawobj_t tmpobj;
+
+               /* handle */
+               if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen))
+                       GOTO(out_msg, rc = -EFAULT);
+               if (rawobj_dup(&gctx->gc_handle, &tmpobj))
+                       GOTO(out_msg, rc = -ENOMEM);
+
+               /* mechctx */
+               if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen))
+                       GOTO(out_msg, rc = -EFAULT);
+               gss_err = lgss_import_sec_context(&tmpobj,
+                                                 gss_msg->gum_gsec->gs_mech,
+                                                 &gctx->gc_mechctx);
+               rc = 0;
+       }
+
+       if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) {
+               gss_cli_ctx_uptodate(gctx);
+       } else {
+               ctx = &gctx->gc_base;
+               sptlrpc_cli_ctx_expire(ctx);
+               if (rc != -ERESTART || gss_err != GSS_S_COMPLETE)
+                       set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+
+               CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n",
+                      ctx, ctx->cc_vcred.vc_uid, rc, gss_err,
+                      test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ?
+                      "fatal error" : "non-fatal");
+       }
+
+       rc = mlen;
+
+out_msg:
+       gss_release_msg(gss_msg);
+
+out_free:
+       OBD_FREE(buf, mlen);
+       /* FIXME
+        * hack pipefs: always return the asked length, otherwise all
+        * following downcalls might be messed up. */
+       rc = mlen;
+       RETURN(rc);
+}
+
+static
+void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+       struct gss_upcall_msg     *gmsg;
+       struct gss_upcall_msg_data     *gumd;
+       static cfs_time_t              ratelimit = 0;
+       ENTRY;
+
+       LASSERT(list_empty(&msg->list));
+
+       /* normally errno is >= 0 */
+       if (msg->errno >= 0) {
+               EXIT;
+               return;
+       }
+
+       gmsg = container_of(msg, struct gss_upcall_msg, gum_base);
+       gumd = &gmsg->gum_data;
+       LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+       CERROR("failed msg %p (seq %u, uid %u, svc %u, nid "LPX64", obd %.*s): "
+              "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc,
+              gumd->gum_nid, (int) sizeof(gumd->gum_obd),
+              gumd->gum_obd, msg->errno);
+
+       atomic_inc(&gmsg->gum_refcount);
+       gss_unhash_msg(gmsg);
+       if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) {
+               cfs_time_t now = cfs_time_current_sec();
+
+               if (cfs_time_after(now, ratelimit)) {
+                       CWARN("upcall timed out, is lgssd running?\n");
+                       ratelimit = now + 15;
+               }
+       }
+       gss_msg_fail_ctx(gmsg);
+       gss_release_msg(gmsg);
+       EXIT;
+}
+
+static
+void gss_pipe_release(struct inode *inode)
+{
+       struct rpc_inode *rpci = RPC_I(inode);
+       __u32        idx;
+       ENTRY;
+
+       idx = (__u32) (long) rpci->private;
+       LASSERT(idx < MECH_MAX);
+
+       upcall_list_lock(idx);
+       while (!list_empty(&upcall_lists[idx])) {
+               struct gss_upcall_msg      *gmsg;
+               struct gss_upcall_msg_data *gumd;
+
+               gmsg = list_entry(upcall_lists[idx].next,
+                                     struct gss_upcall_msg, gum_list);
+               gumd = &gmsg->gum_data;
+               LASSERT(list_empty(&gmsg->gum_base.list));
+
+               CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, "
+                      "nid "LPX64", obd %.*s\n", gmsg,
+                      gumd->gum_seq, gumd->gum_uid, gumd->gum_svc,
+                      gumd->gum_nid, (int) sizeof(gumd->gum_obd),
+                      gumd->gum_obd);
+
+               gmsg->gum_base.errno = -EPIPE;
+               atomic_inc(&gmsg->gum_refcount);
+               gss_unhash_msg_nolock(gmsg);
+
+               gss_msg_fail_ctx(gmsg);
+
+               upcall_list_unlock(idx);
+               gss_release_msg(gmsg);
+               upcall_list_lock(idx);
+       }
+       upcall_list_unlock(idx);
+       EXIT;
+}
+
+static struct rpc_pipe_ops gss_upcall_ops = {
+       .upcall  = gss_pipe_upcall,
+       .downcall       = gss_pipe_downcall,
+       .destroy_msg    = gss_pipe_destroy_msg,
+       .release_pipe   = gss_pipe_release,
+};
+
+/****************************************
+ * upcall helper functions           *
+ ****************************************/
+
+static
+int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx)
+{
+       struct obd_import         *imp;
+       struct gss_sec       *gsec;
+       struct gss_upcall_msg      *gmsg;
+       int                      rc = 0;
+       ENTRY;
+
+       might_sleep();
+
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_import);
+       LASSERT(ctx->cc_sec->ps_import->imp_obd);
+
+       imp = ctx->cc_sec->ps_import;
+       if (!imp->imp_connection) {
+               CERROR("import has no connection set\n");
+               RETURN(-EINVAL);
+       }
+
+       gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base);
+
+       OBD_ALLOC_PTR(gmsg);
+       if (!gmsg)
+               RETURN(-ENOMEM);
+
+       /* initialize pipefs base msg */
+       INIT_LIST_HEAD(&gmsg->gum_base.list);
+       gmsg->gum_base.data = &gmsg->gum_data;
+       gmsg->gum_base.len = sizeof(gmsg->gum_data);
+       gmsg->gum_base.copied = 0;
+       gmsg->gum_base.errno = 0;
+
+       /* init upcall msg */
+       atomic_set(&gmsg->gum_refcount, 1);
+       gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name);
+       gmsg->gum_gsec = gsec;
+       gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx),
+                                     struct gss_cli_ctx, gc_base);
+       gmsg->gum_data.gum_seq = upcall_get_sequence();
+       gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid;
+       gmsg->gum_data.gum_gid = 0; /* not used for now */
+       gmsg->gum_data.gum_svc = import_to_gss_svc(imp);
+       gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid;
+       strncpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name,
+               sizeof(gmsg->gum_data.gum_obd));
+
+       /* This could only happen when the sysadmin has forcibly set the
+        * context dead/expired using lctl. */
+       if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) {
+               CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n",
+                     ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+                     ctx->cc_flags);
+
+               LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE));
+               ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR;
+
+               rc = -EIO;
+               goto err_free;
+       }
+
+       upcall_msg_enlist(gmsg);
+
+       rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode,
+                             &gmsg->gum_base);
+       if (rc) {
+               CERROR("rpc_queue_upcall failed: %d\n", rc);
+
+               upcall_msg_delist(gmsg);
+               goto err_free;
+       }
+
+       RETURN(0);
+err_free:
+       OBD_FREE_PTR(gmsg);
+       RETURN(rc);
+}
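+/* the message queued above reaches lgssd via gss_pipe_upcall(); the reply
+ * comes back through gss_pipe_downcall(), which looks the message up again
+ * by gum_seq and installs the resulting GSS context. */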
+
+static
+int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx)
+{
+       /* if we are refreshing for root, also update the reverse
+        * handle index, so as not to confuse reverse contexts. */
+       if (ctx->cc_vcred.vc_uid == 0) {
+               struct gss_sec *gsec;
+
+               gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base);
+               gsec->gs_rvs_hdl = gss_get_next_ctx_index();
+       }
+
+       return gss_ctx_refresh_pf(ctx);
+}
+
+/****************************************
+ * lustre gss pipefs policy         *
+ ****************************************/
+
+static struct ptlrpc_ctx_ops gss_pipefs_ctxops = {
+       .match            = gss_cli_ctx_match,
+       .refresh                = gss_cli_ctx_refresh_pf,
+       .validate              = gss_cli_ctx_validate_pf,
+       .die                = gss_cli_ctx_die_pf,
+       .sign              = gss_cli_ctx_sign,
+       .verify          = gss_cli_ctx_verify,
+       .seal              = gss_cli_ctx_seal,
+       .unseal          = gss_cli_ctx_unseal,
+       .wrap_bulk            = gss_cli_ctx_wrap_bulk,
+       .unwrap_bulk        = gss_cli_ctx_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops gss_sec_pipefs_cops = {
+       .create_sec             = gss_sec_create_pf,
+       .destroy_sec            = gss_sec_destroy_pf,
+       .kill_sec               = gss_sec_kill,
+       .lookup_ctx             = gss_sec_lookup_ctx_pf,
+       .release_ctx            = gss_sec_release_ctx_pf,
+       .flush_ctx_cache        = gss_sec_flush_ctx_cache_pf,
+       .install_rctx           = gss_sec_install_rctx,
+       .alloc_reqbuf           = gss_alloc_reqbuf,
+       .free_reqbuf            = gss_free_reqbuf,
+       .alloc_repbuf           = gss_alloc_repbuf,
+       .free_repbuf            = gss_free_repbuf,
+       .enlarge_reqbuf         = gss_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops gss_sec_pipefs_sops = {
+       .accept                 = gss_svc_accept_pf,
+       .invalidate_ctx         = gss_svc_invalidate_ctx,
+       .alloc_rs               = gss_svc_alloc_rs,
+       .authorize              = gss_svc_authorize,
+       .free_rs                = gss_svc_free_rs,
+       .free_ctx               = gss_svc_free_ctx,
+       .unwrap_bulk            = gss_svc_unwrap_bulk,
+       .wrap_bulk              = gss_svc_wrap_bulk,
+       .install_rctx           = gss_svc_install_rctx_pf,
+};
+
+static struct ptlrpc_sec_policy gss_policy_pipefs = {
+       .sp_owner       = THIS_MODULE,
+       .sp_name        = "gss.pipefs",
+       .sp_policy      = SPTLRPC_POLICY_GSS_PIPEFS,
+       .sp_cops        = &gss_sec_pipefs_cops,
+       .sp_sops        = &gss_sec_pipefs_sops,
+};
+
+static
+int __init gss_init_pipefs_upcall(void)
+{
+       struct dentry   *de;
+
+       /* pipe dir */
+       de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL);
+       if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) {
+               CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de));
+               return PTR_ERR(de);
+       }
+
+       /* FIXME hack pipefs: dput will sometimes cause an oops during module
+        * unload and when lgssd closes the pipe fds. */
+
+       /* krb5 mechanism */
+       de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops,
+                       RPC_PIPE_WAIT_FOR_OPEN);
+       if (!de || IS_ERR(de)) {
+               CERROR("failed to make rpc_pipe %s: %ld\n",
+                      LUSTRE_PIPE_KRB5, PTR_ERR(de));
+               rpc_rmdir(LUSTRE_PIPE_ROOT);
+               return PTR_ERR(de);
+       }
+
+       de_pipes[MECH_KRB5] = de;
+       INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]);
+       spin_lock_init(&upcall_locks[MECH_KRB5]);
+
+       return 0;
+}
+
+static
+void __exit gss_exit_pipefs_upcall(void)
+{
+       __u32   i;
+
+       for (i = 0; i < MECH_MAX; i++) {
+               LASSERT(list_empty(&upcall_lists[i]));
+
+               /* dput pipe dentry here might cause lgssd oops. */
+               de_pipes[i] = NULL;
+       }
+
+       rpc_unlink(LUSTRE_PIPE_KRB5);
+       rpc_rmdir(LUSTRE_PIPE_ROOT);
+}
+
+int __init gss_init_pipefs(void)
+{
+       int rc;
+
+       rc = gss_init_pipefs_upcall();
+       if (rc)
+               return rc;
+
+       rc = sptlrpc_register_policy(&gss_policy_pipefs);
+       if (rc) {
+               gss_exit_pipefs_upcall();
+               return rc;
+       }
+
+       return 0;
+}
+
+void __exit gss_exit_pipefs(void)
+{
+       gss_exit_pipefs_upcall();
+       sptlrpc_unregister_policy(&gss_policy_pipefs);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c
new file mode 100644 (file)
index 0000000..474ecf8
--- /dev/null
@@ -0,0 +1,242 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_rawobj.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_sec.h>
+
+#include "gss_internal.h"
+
+int rawobj_empty(rawobj_t *obj)
+{
+       LASSERT(equi(obj->len, obj->data));
+       return (obj->len == 0);
+}
+
+int rawobj_alloc(rawobj_t *obj, char *buf, int len)
+{
+       LASSERT(obj);
+       LASSERT(len >= 0);
+
+       obj->len = len;
+       if (len) {
+               OBD_ALLOC_LARGE(obj->data, len);
+               if (!obj->data) {
+                       obj->len = 0;
+                       RETURN(-ENOMEM);
+               }
+               memcpy(obj->data, buf, len);
+       } else
+               obj->data = NULL;
+       return 0;
+}
+
+void rawobj_free(rawobj_t *obj)
+{
+       LASSERT(obj);
+
+       if (obj->len) {
+               LASSERT(obj->data);
+               OBD_FREE_LARGE(obj->data, obj->len);
+               obj->len = 0;
+               obj->data = NULL;
+       } else
+               LASSERT(!obj->data);
+}
+
+int rawobj_equal(rawobj_t *a, rawobj_t *b)
+{
+       LASSERT(a && b);
+
+       return (a->len == b->len &&
+               (!a->len || !memcmp(a->data, b->data, a->len)));
+}
+
+int rawobj_dup(rawobj_t *dest, rawobj_t *src)
+{
+       LASSERT(src && dest);
+
+       dest->len = src->len;
+       if (dest->len) {
+               OBD_ALLOC_LARGE(dest->data, dest->len);
+               if (!dest->data) {
+                       dest->len = 0;
+                       return -ENOMEM;
+               }
+               memcpy(dest->data, src->data, dest->len);
+       } else
+               dest->data = NULL;
+       return 0;
+}
+
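+/*
+ * Wire layout implemented by rawobj_serialize()/__rawobj_extract() below:
+ * each rawobj is encoded as a 32-bit little-endian length word followed by
+ * the data bytes, with the cursor then advanced to the next 4-byte boundary.
+ * A zero length encodes an empty object with no data following.  The _local
+ * variants use host byte order and no padding.
+ *
+ * For illustration, serializing rawobjs of length 5 and 0 into one buffer
+ * yields: 05 00 00 00 <5 data bytes> pp pp pp 00 00 00 00, where pp marks
+ * padding bytes up to the 4-byte boundary.
+ */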
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       __u32 len;
+
+       LASSERT(obj);
+       LASSERT(buf);
+       LASSERT(buflen);
+
+       len = cfs_size_round4(obj->len);
+
+       if (*buflen < 4 + len) {
+               CERROR("buflen %u <  %u\n", *buflen, 4 + len);
+               return -EINVAL;
+       }
+
+       *(*buf)++ = cpu_to_le32(obj->len);
+       memcpy(*buf, obj->data, obj->len);
+       *buf += (len >> 2);
+       *buflen -= (4 + len);
+
+       return 0;
+}
+
+static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen,
+                           int alloc, int local)
+{
+       __u32 len;
+
+       if (*buflen < sizeof(__u32)) {
+               CERROR("buflen %u\n", *buflen);
+               return -EINVAL;
+       }
+
+       obj->len = *(*buf)++;
+       if (!local)
+               obj->len = le32_to_cpu(obj->len);
+       *buflen -= sizeof(__u32);
+
+       if (!obj->len) {
+               obj->data = NULL;
+               return 0;
+       }
+
+       len = local ? obj->len : cfs_size_round4(obj->len);
+       if (*buflen < len) {
+               CERROR("buflen %u < %u\n", *buflen, len);
+               obj->len = 0;
+               return -EINVAL;
+       }
+
+       if (!alloc)
+               obj->data = (__u8 *) *buf;
+       else {
+               OBD_ALLOC_LARGE(obj->data, obj->len);
+               if (!obj->data) {
+                       CERROR("fail to alloc %u bytes\n", obj->len);
+                       obj->len = 0;
+                       return -ENOMEM;
+               }
+               memcpy(obj->data, *buf, obj->len);
+       }
+
+       *((char **)buf) += len;
+       *buflen -= len;
+
+       return 0;
+}
+
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       return __rawobj_extract(obj, buf, buflen, 0, 0);
+}
+
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       return __rawobj_extract(obj, buf, buflen, 1, 0);
+}
+
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       return __rawobj_extract(obj, buf, buflen, 0, 1);
+}
+
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       return __rawobj_extract(obj, buf, buflen, 1, 1);
+}
+
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj)
+{
+       rawobj->len = netobj->len;
+       rawobj->data = netobj->data;
+       return 0;
+}
+
+int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj)
+{
+       rawobj->len = 0;
+       rawobj->data = NULL;
+
+       if (netobj->len == 0)
+               return 0;
+
+       OBD_ALLOC_LARGE(rawobj->data, netobj->len);
+       if (rawobj->data == NULL)
+               return -ENOMEM;
+
+       rawobj->len = netobj->len;
+       memcpy(rawobj->data, netobj->data, netobj->len);
+       return 0;
+}
+
+/****************************************
+ * misc more                       *
+ ****************************************/
+
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+                        void *res, __u32 reslen)
+{
+       if (*buflen < reslen) {
+               CERROR("buflen %u < %u\n", *buflen, reslen);
+               return -EINVAL;
+       }
+
+       memcpy(res, *buf, reslen);
+       *buf += reslen;
+       *buflen -= reslen;
+       return 0;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c
new file mode 100644 (file)
index 0000000..31b50ea
--- /dev/null
@@ -0,0 +1,1099 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Neil Brown <neilb@cse.unsw.edu.au>
+ * J. Bruce Fields <bfields@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ * Dug Song <dugsong@monkey.org>
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * RPCSEC_GSS involves three stages:
+ *  1/ context creation
+ *  2/ data exchange
+ *  3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ *  In particular, GSS_Accept_sec_context is handled by an upcall.
+ * Data exchange is handled entirely within the kernel.
+ *  In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal and GSS_Unseal are in-kernel.
+ * Context destruction is handled in-kernel.
+ *  GSS_Delete_sec_context is in-kernel.
+ *
+ * Context creation is initiated by an RPCSEC_GSS_INIT request arriving.
+ * The context handle and gss_token are used as a key into the rpcsec_init cache.
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context,
+ * namely major_status, minor_status, context_handle and reply_token.
+ * These are sent back to the client.
+ * Sequence window management is handled by the kernel.  The window size is
+ * currently a compile-time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ *   uid/gidlist - for determining access rights
+ *   mechanism type
+ *   mechanism specific information, such as a key
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/mutex.h>
+#include <linux/sunrpc/cache.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#define GSS_SVC_UPCALL_TIMEOUT  (20)
+
+static spinlock_t __ctx_index_lock;
+static __u64 __ctx_index;
+
+__u64 gss_get_next_ctx_index(void)
+{
+       __u64 idx;
+
+       spin_lock(&__ctx_index_lock);
+       idx = __ctx_index++;
+       spin_unlock(&__ctx_index_lock);
+
+       return idx;
+}
+
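+/*
+ * Hash an arbitrary byte buffer down to @bits bits: bytes are packed into
+ * a long, BITS_PER_LONG/8 at a time, and folded into the running hash with
+ * cfs_hash_long(); the buffer length (mod 256) is appended as a terminator
+ * byte, which also forces the final partial word to be folded in, and the
+ * top @bits bits of the result are returned.
+ */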
+static inline unsigned long hash_mem(char *buf, int length, int bits)
+{
+       unsigned long hash = 0;
+       unsigned long l = 0;
+       int len = 0;
+       unsigned char c;
+
+       do {
+               if (len == length) {
+                       c = (char) len;
+                       len = -1;
+               } else
+                       c = *buf++;
+
+               l = (l << 8) | c;
+               len++;
+
+               if ((len & (BITS_PER_LONG/8-1)) == 0)
+                       hash = cfs_hash_long(hash^l, BITS_PER_LONG);
+       } while (len);
+
+       return hash >> (BITS_PER_LONG - bits);
+}
+
+/****************************************
+ * rsi cache                       *
+ ****************************************/
+
+#define RSI_HASHBITS    (6)
+#define RSI_HASHMAX     (1 << RSI_HASHBITS)
+#define RSI_HASHMASK    (RSI_HASHMAX - 1)
+
+struct rsi {
+       struct cache_head       h;
+       __u32              lustre_svc;
+       __u64              nid;
+       wait_queue_head_t            waitq;
+       rawobj_t                in_handle, in_token;
+       rawobj_t                out_handle, out_token;
+       int                  major_status, minor_status;
+};
+
+static struct cache_head *rsi_table[RSI_HASHMAX];
+static struct cache_detail rsi_cache;
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old);
+static struct rsi *rsi_lookup(struct rsi *item);
+
+static inline int rsi_hash(struct rsi *item)
+{
+       return hash_mem((char *)item->in_handle.data, item->in_handle.len,
+                       RSI_HASHBITS) ^
+              hash_mem((char *)item->in_token.data, item->in_token.len,
+                       RSI_HASHBITS);
+}
+
+static inline int __rsi_match(struct rsi *item, struct rsi *tmp)
+{
+       return (rawobj_equal(&item->in_handle, &tmp->in_handle) &&
+               rawobj_equal(&item->in_token, &tmp->in_token));
+}
+
+static void rsi_free(struct rsi *rsi)
+{
+       rawobj_free(&rsi->in_handle);
+       rawobj_free(&rsi->in_token);
+       rawobj_free(&rsi->out_handle);
+       rawobj_free(&rsi->out_token);
+}
+
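+/*
+ * Write the init upcall request line for user-space (lsvcgssd): five
+ * hex-encoded words -- lustre_svc, peer nid, a kernel-suggested context
+ * index (only filled in when in_handle is empty), in_handle and in_token --
+ * terminated by a newline.
+ */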
+static void rsi_request(struct cache_detail *cd,
+                       struct cache_head *h,
+                       char **bpp, int *blen)
+{
+       struct rsi *rsi = container_of(h, struct rsi, h);
+       __u64 index = 0;
+
+       /* if in_handle is null, provide kernel suggestion */
+       if (rsi->in_handle.len == 0)
+               index = gss_get_next_ctx_index();
+
+       qword_addhex(bpp, blen, (char *) &rsi->lustre_svc,
+                    sizeof(rsi->lustre_svc));
+       qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid));
+       qword_addhex(bpp, blen, (char *) &index, sizeof(index));
+       qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len);
+       qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len);
+       (*bpp)[-1] = '\n';
+}
+
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+       return sunrpc_cache_pipe_upcall(cd, h, rsi_request);
+}
+
+static inline void __rsi_init(struct rsi *new, struct rsi *item)
+{
+       new->out_handle = RAWOBJ_EMPTY;
+       new->out_token = RAWOBJ_EMPTY;
+
+       new->in_handle = item->in_handle;
+       item->in_handle = RAWOBJ_EMPTY;
+       new->in_token = item->in_token;
+       item->in_token = RAWOBJ_EMPTY;
+
+       new->lustre_svc = item->lustre_svc;
+       new->nid = item->nid;
+       init_waitqueue_head(&new->waitq);
+}
+
+static inline void __rsi_update(struct rsi *new, struct rsi *item)
+{
+       LASSERT(new->out_handle.len == 0);
+       LASSERT(new->out_token.len == 0);
+
+       new->out_handle = item->out_handle;
+       item->out_handle = RAWOBJ_EMPTY;
+       new->out_token = item->out_token;
+       item->out_token = RAWOBJ_EMPTY;
+
+       new->major_status = item->major_status;
+       new->minor_status = item->minor_status;
+}
+
+static void rsi_put(struct kref *ref)
+{
+       struct rsi *rsi = container_of(ref, struct rsi, h.ref);
+
+       LASSERT(rsi->h.next == NULL);
+       rsi_free(rsi);
+       OBD_FREE_PTR(rsi);
+}
+
+static int rsi_match(struct cache_head *a, struct cache_head *b)
+{
+       struct rsi *item = container_of(a, struct rsi, h);
+       struct rsi *tmp = container_of(b, struct rsi, h);
+
+       return __rsi_match(item, tmp);
+}
+
+static void rsi_init(struct cache_head *cnew, struct cache_head *citem)
+{
+       struct rsi *new = container_of(cnew, struct rsi, h);
+       struct rsi *item = container_of(citem, struct rsi, h);
+
+       __rsi_init(new, item);
+}
+
+static void update_rsi(struct cache_head *cnew, struct cache_head *citem)
+{
+       struct rsi *new = container_of(cnew, struct rsi, h);
+       struct rsi *item = container_of(citem, struct rsi, h);
+
+       __rsi_update(new, item);
+}
+
+static struct cache_head *rsi_alloc(void)
+{
+       struct rsi *rsi;
+
+       OBD_ALLOC_PTR(rsi);
+       if (rsi)
+               return &rsi->h;
+       else
+               return NULL;
+}
+
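+/*
+ * Parse the init downcall from user-space: in_handle and in_token identify
+ * the pending rsi entry, followed by the expiry time, major status, minor
+ * status, out_handle and out_token that make up the negotiation reply.
+ */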
+static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+       char       *buf = mesg;
+       char       *ep;
+       int          len;
+       struct rsi      rsii, *rsip = NULL;
+       time_t    expiry;
+       int          status = -EINVAL;
+       ENTRY;
+
+
+       memset(&rsii, 0, sizeof(rsii));
+
+       /* handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       if (rawobj_alloc(&rsii.in_handle, buf, len)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       /* token */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       if (rawobj_alloc(&rsii.in_token, buf, len)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       rsip = rsi_lookup(&rsii);
+       if (!rsip)
+               goto out;
+
+       rsii.h.flags = 0;
+       /* expiry */
+       expiry = get_expiry(&mesg);
+       if (expiry == 0)
+               goto out;
+
+       len = qword_get(&mesg, buf, mlen);
+       if (len <= 0)
+               goto out;
+
+       /* major */
+       rsii.major_status = simple_strtol(buf, &ep, 10);
+       if (*ep)
+               goto out;
+
+       /* minor */
+       len = qword_get(&mesg, buf, mlen);
+       if (len <= 0)
+               goto out;
+       rsii.minor_status = simple_strtol(buf, &ep, 10);
+       if (*ep)
+               goto out;
+
+       /* out_handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       if (rawobj_alloc(&rsii.out_handle, buf, len)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       /* out_token */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       if (rawobj_alloc(&rsii.out_token, buf, len)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       rsii.h.expiry_time = expiry;
+       rsip = rsi_update(&rsii, rsip);
+       status = 0;
+out:
+       rsi_free(&rsii);
+       if (rsip) {
+               wake_up_all(&rsip->waitq);
+               cache_put(&rsip->h, &rsi_cache);
+       } else {
+               status = -ENOMEM;
+       }
+
+       if (status)
+               CERROR("rsi parse error %d\n", status);
+       RETURN(status);
+}
+
+static struct cache_detail rsi_cache = {
+       .hash_size      = RSI_HASHMAX,
+       .hash_table     = rsi_table,
+       .name           = "auth.sptlrpc.init",
+       .cache_put      = rsi_put,
+       .cache_upcall   = rsi_upcall,
+       .cache_parse    = rsi_parse,
+       .match          = rsi_match,
+       .init           = rsi_init,
+       .update         = update_rsi,
+       .alloc          = rsi_alloc,
+};
+
+static struct rsi *rsi_lookup(struct rsi *item)
+{
+       struct cache_head *ch;
+       int hash = rsi_hash(item);
+
+       ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash);
+       if (ch)
+               return container_of(ch, struct rsi, h);
+       else
+               return NULL;
+}
+
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old)
+{
+       struct cache_head *ch;
+       int hash = rsi_hash(new);
+
+       ch = sunrpc_cache_update(&rsi_cache, &new->h, &old->h, hash);
+       if (ch)
+               return container_of(ch, struct rsi, h);
+       else
+               return NULL;
+}
+
+/****************************************
+ * rsc cache                       *
+ ****************************************/
+
+#define RSC_HASHBITS    (10)
+#define RSC_HASHMAX     (1 << RSC_HASHBITS)
+#define RSC_HASHMASK    (RSC_HASHMAX - 1)
+
+struct rsc {
+       struct cache_head       h;
+       struct obd_device      *target;
+       rawobj_t                handle;
+       struct gss_svc_ctx      ctx;
+};
+
+static struct cache_head *rsc_table[RSC_HASHMAX];
+static struct cache_detail rsc_cache;
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old);
+static struct rsc *rsc_lookup(struct rsc *item);
+
+static void rsc_free(struct rsc *rsci)
+{
+       rawobj_free(&rsci->handle);
+       rawobj_free(&rsci->ctx.gsc_rvs_hdl);
+       lgss_delete_sec_context(&rsci->ctx.gsc_mechctx);
+}
+
+static inline int rsc_hash(struct rsc *rsci)
+{
+       return hash_mem((char *)rsci->handle.data,
+                       rsci->handle.len, RSC_HASHBITS);
+}
+
+static inline int __rsc_match(struct rsc *new, struct rsc *tmp)
+{
+       return rawobj_equal(&new->handle, &tmp->handle);
+}
+
+static inline void __rsc_init(struct rsc *new, struct rsc *tmp)
+{
+       new->handle = tmp->handle;
+       tmp->handle = RAWOBJ_EMPTY;
+
+       new->target = NULL;
+       memset(&new->ctx, 0, sizeof(new->ctx));
+       new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY;
+}
+
+static inline void __rsc_update(struct rsc *new, struct rsc *tmp)
+{
+       new->ctx = tmp->ctx;
+       tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY;
+       tmp->ctx.gsc_mechctx = NULL;
+
+       memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata));
+       spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock);
+}
+
+static void rsc_put(struct kref *ref)
+{
+       struct rsc *rsci = container_of(ref, struct rsc, h.ref);
+
+       LASSERT(rsci->h.next == NULL);
+       rsc_free(rsci);
+       OBD_FREE_PTR(rsci);
+}
+
+static int rsc_match(struct cache_head *a, struct cache_head *b)
+{
+       struct rsc *new = container_of(a, struct rsc, h);
+       struct rsc *tmp = container_of(b, struct rsc, h);
+
+       return __rsc_match(new, tmp);
+}
+
+static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
+{
+       struct rsc *new = container_of(cnew, struct rsc, h);
+       struct rsc *tmp = container_of(ctmp, struct rsc, h);
+
+       __rsc_init(new, tmp);
+}
+
+static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
+{
+       struct rsc *new = container_of(cnew, struct rsc, h);
+       struct rsc *tmp = container_of(ctmp, struct rsc, h);
+
+       __rsc_update(new, tmp);
+}
+
+static struct cache_head *rsc_alloc(void)
+{
+       struct rsc *rsc;
+
+       OBD_ALLOC_PTR(rsc);
+       if (rsc)
+               return &rsc->h;
+       else
+               return NULL;
+}
+
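+/*
+ * Parse the context downcall from user-space: the context handle, expiry
+ * time, remote/root/mds/oss flags and mapped uid, then either a negative
+ * marker or uid, gid, mechanism name and the mechanism-specific context
+ * data imported via lgss_import_sec_context().
+ */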
+static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+       char            *buf = mesg;
+       int               len, rv, tmp_int;
+       struct rsc         rsci, *rscp = NULL;
+       time_t         expiry;
+       int               status = -EINVAL;
+       struct gss_api_mech *gm = NULL;
+
+       memset(&rsci, 0, sizeof(rsci));
+
+       /* context handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0) goto out;
+       status = -ENOMEM;
+       if (rawobj_alloc(&rsci.handle, buf, len))
+               goto out;
+
+       rsci.h.flags = 0;
+       /* expiry */
+       expiry = get_expiry(&mesg);
+       status = -EINVAL;
+       if (expiry == 0)
+               goto out;
+
+       /* remote flag */
+       rv = get_int(&mesg, &tmp_int);
+       if (rv) {
+               CERROR("fail to get remote flag\n");
+               goto out;
+       }
+       rsci.ctx.gsc_remote = (tmp_int != 0);
+
+       /* root user flag */
+       rv = get_int(&mesg, &tmp_int);
+       if (rv) {
+               CERROR("fail to get root user flag\n");
+               goto out;
+       }
+       rsci.ctx.gsc_usr_root = (tmp_int != 0);
+
+       /* mds user flag */
+       rv = get_int(&mesg, &tmp_int);
+       if (rv) {
+               CERROR("fail to get mds user flag\n");
+               goto out;
+       }
+       rsci.ctx.gsc_usr_mds = (tmp_int != 0);
+
+       /* oss user flag */
+       rv = get_int(&mesg, &tmp_int);
+       if (rv) {
+               CERROR("fail to get oss user flag\n");
+               goto out;
+       }
+       rsci.ctx.gsc_usr_oss = (tmp_int != 0);
+
+       /* mapped uid */
+       rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid);
+       if (rv) {
+               CERROR("fail to get mapped uid\n");
+               goto out;
+       }
+
+       rscp = rsc_lookup(&rsci);
+       if (!rscp)
+               goto out;
+
+       /* uid, or NEGATIVE */
+       rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid);
+       if (rv == -EINVAL)
+               goto out;
+       if (rv == -ENOENT) {
+               CERROR("NOENT? set rsc entry negative\n");
+               set_bit(CACHE_NEGATIVE, &rsci.h.flags);
+       } else {
+               rawobj_t tmp_buf;
+               unsigned long ctx_expiry;
+
+               /* gid */
+               if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid))
+                       goto out;
+
+               /* mech name */
+               len = qword_get(&mesg, buf, mlen);
+               if (len < 0)
+                       goto out;
+               gm = lgss_name_to_mech(buf);
+               status = -EOPNOTSUPP;
+               if (!gm)
+                       goto out;
+
+               status = -EINVAL;
+               /* mech-specific data: */
+               len = qword_get(&mesg, buf, mlen);
+               if (len < 0)
+                       goto out;
+
+               tmp_buf.len = len;
+               tmp_buf.data = (unsigned char *)buf;
+               if (lgss_import_sec_context(&tmp_buf, gm,
+                                           &rsci.ctx.gsc_mechctx))
+                       goto out;
+
+               /* currently the expiry time passed down from user-space
+                * is invalid, so we retrieve it from the mech instead. */
+               if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) {
+                       CERROR("unable to get expire time, drop it\n");
+                       goto out;
+               }
+               expiry = (time_t) ctx_expiry;
+       }
+
+       rsci.h.expiry_time = expiry;
+       rscp = rsc_update(&rsci, rscp);
+       status = 0;
+out:
+       if (gm)
+               lgss_mech_put(gm);
+       rsc_free(&rsci);
+       if (rscp)
+               cache_put(&rscp->h, &rsc_cache);
+       else
+               status = -ENOMEM;
+
+       if (status)
+               CERROR("parse rsc error %d\n", status);
+       return status;
+}
+
+static struct cache_detail rsc_cache = {
+       .hash_size      = RSC_HASHMAX,
+       .hash_table     = rsc_table,
+       .name           = "auth.sptlrpc.context",
+       .cache_put      = rsc_put,
+       .cache_parse    = rsc_parse,
+       .match          = rsc_match,
+       .init           = rsc_init,
+       .update         = update_rsc,
+       .alloc          = rsc_alloc,
+};
+
+static struct rsc *rsc_lookup(struct rsc *item)
+{
+       struct cache_head *ch;
+       int             hash = rsc_hash(item);
+
+       ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash);
+       if (ch)
+               return container_of(ch, struct rsc, h);
+       else
+               return NULL;
+}
+
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old)
+{
+       struct cache_head *ch;
+       int             hash = rsc_hash(new);
+
+       ch = sunrpc_cache_update(&rsc_cache, &new->h, &old->h, hash);
+       if (ch)
+               return container_of(ch, struct rsc, h);
+       else
+               return NULL;
+}
+
+#define COMPAT_RSC_PUT(item, cd)       cache_put((item), (cd))
+
+/****************************************
+ * rsc cache flush                   *
+ ****************************************/
+
+typedef int rsc_entry_match(struct rsc *rscp, long data);
+
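+/*
+ * Walk the rsc hash and drop every entry for which @match returns true;
+ * matching entries are unhashed and marked CACHE_NEGATIVE so they cannot
+ * be looked up again.
+ */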
+static void rsc_flush(rsc_entry_match *match, long data)
+{
+       struct cache_head **ch;
+       struct rsc *rscp;
+       int n;
+       ENTRY;
+
+       write_lock(&rsc_cache.hash_lock);
+       for (n = 0; n < RSC_HASHMAX; n++) {
+               for (ch = &rsc_cache.hash_table[n]; *ch;) {
+                       rscp = container_of(*ch, struct rsc, h);
+
+                       if (!match(rscp, data)) {
+                               ch = &((*ch)->next);
+                               continue;
+                       }
+
+                       /* it seems simply setting NEGATIVE doesn't work */
+                       *ch = (*ch)->next;
+                       rscp->h.next = NULL;
+                       cache_get(&rscp->h);
+                       set_bit(CACHE_NEGATIVE, &rscp->h.flags);
+                       COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+                       rsc_cache.entries--;
+               }
+       }
+       write_unlock(&rsc_cache.hash_lock);
+       EXIT;
+}
+
+static int match_uid(struct rsc *rscp, long uid)
+{
+       if ((int) uid == -1)
+               return 1;
+       return ((int) rscp->ctx.gsc_uid == (int) uid);
+}
+
+static int match_target(struct rsc *rscp, long target)
+{
+       return (rscp->target == (struct obd_device *) target);
+}
+
+static inline void rsc_flush_uid(int uid)
+{
+       if (uid == -1)
+               CWARN("flush all gss contexts...\n");
+
+       rsc_flush(match_uid, (long) uid);
+}
+
+static inline void rsc_flush_target(struct obd_device *target)
+{
+       rsc_flush(match_target, (long) target);
+}
+
+void gss_secsvc_flush(struct obd_device *target)
+{
+       rsc_flush_target(target);
+}
+EXPORT_SYMBOL(gss_secsvc_flush);
+
+static struct rsc *gss_svc_searchbyctx(rawobj_t *handle)
+{
+       struct rsc  rsci;
+       struct rsc *found;
+
+       memset(&rsci, 0, sizeof(rsci));
+       if (rawobj_dup(&rsci.handle, handle))
+               return NULL;
+
+       found = rsc_lookup(&rsci);
+       rsc_free(&rsci);
+       if (!found)
+               return NULL;
+       if (cache_check(&rsc_cache, &found->h, NULL))
+               return NULL;
+       return found;
+}
+
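+/*
+ * Install a reverse context in the rsc cache, keyed by the import's
+ * gs_rvs_hdl and derived from the client context @gctx, so that requests
+ * coming back from that target (typically callbacks) can be handled with
+ * the existing GSS context instead of a new handshake.
+ */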
+int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp,
+                                  struct gss_sec *gsec,
+                                  struct gss_cli_ctx *gctx)
+{
+       struct rsc      rsci, *rscp = NULL;
+       unsigned long   ctx_expiry;
+       __u32      major;
+       int          rc;
+       ENTRY;
+
+       memset(&rsci, 0, sizeof(rsci));
+
+       if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl,
+                        sizeof(gsec->gs_rvs_hdl)))
+               GOTO(out, rc = -ENOMEM);
+
+       rscp = rsc_lookup(&rsci);
+       if (rscp == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       major = lgss_copy_reverse_context(gctx->gc_mechctx,
+                                         &rsci.ctx.gsc_mechctx);
+       if (major != GSS_S_COMPLETE)
+               GOTO(out, rc = -ENOMEM);
+
+       if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) {
+               CERROR("unable to get expire time, drop it\n");
+               GOTO(out, rc = -EINVAL);
+       }
+       rsci.h.expiry_time = (time_t) ctx_expiry;
+
+       if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)
+               rsci.ctx.gsc_usr_mds = 1;
+       else if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0)
+               rsci.ctx.gsc_usr_oss = 1;
+       else
+               rsci.ctx.gsc_usr_root = 1;
+
+       rscp = rsc_update(&rsci, rscp);
+       if (rscp == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rscp->target = imp->imp_obd;
+       rawobj_dup(&gctx->gc_svc_handle, &rscp->handle);
+
+       CWARN("create reverse svc ctx %p to %s: idx "LPX64"\n",
+             &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl);
+       rc = 0;
+out:
+       if (rscp)
+               cache_put(&rscp->h, &rsc_cache);
+       rsc_free(&rsci);
+
+       if (rc)
+               CERROR("create reverse svc ctx: idx "LPX64", rc %d\n",
+                      gsec->gs_rvs_hdl, rc);
+       RETURN(rc);
+}
+
+int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle)
+{
+       const cfs_time_t        expire = 20;
+       struct rsc           *rscp;
+
+       rscp = gss_svc_searchbyctx(handle);
+       if (rscp) {
+               CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n",
+                      &rscp->ctx, rscp);
+
+               rscp->h.expiry_time = cfs_time_current_sec() + expire;
+               COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+       }
+       return 0;
+}
+
+int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx)
+{
+       struct rsc *rscp = container_of(ctx, struct rsc, ctx);
+
+       return rawobj_dup(handle, &rscp->handle);
+}
+
+int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq)
+{
+       struct rsc           *rscp;
+
+       rscp = gss_svc_searchbyctx(handle);
+       if (rscp) {
+               CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n",
+                      &rscp->ctx, rscp, seq + 1);
+
+               rscp->ctx.gsc_rvs_seq = seq + 1;
+               COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+       }
+       return 0;
+}
+
+static struct cache_deferred_req* cache_upcall_defer(struct cache_req *req)
+{
+       return NULL;
+}
+static struct cache_req cache_upcall_chandle = { cache_upcall_defer };
+
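+/*
+ * Handle an incoming security context INIT request: stash the client's
+ * handle and token in the rsi cache, trigger the upcall to lsvcgssd via
+ * cache_check() and wait up to GSS_SVC_UPCALL_TIMEOUT seconds for the
+ * downcall, then look up the freshly installed server context by
+ * out_handle and pack the reply (major/minor status, sequence window,
+ * out_handle and out_token).  Returns SECSVC_OK, SECSVC_COMPLETE or
+ * SECSVC_DROP.
+ */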
+int gss_svc_upcall_handle_init(struct ptlrpc_request *req,
+                              struct gss_svc_reqctx *grctx,
+                              struct gss_wire_ctx *gw,
+                              struct obd_device *target,
+                              __u32 lustre_svc,
+                              rawobj_t *rvs_hdl,
+                              rawobj_t *in_token)
+{
+       struct ptlrpc_reply_state *rs;
+       struct rsc              *rsci = NULL;
+       struct rsi              *rsip = NULL, rsikey;
+       wait_queue_t         wait;
+       int                     replen = sizeof(struct ptlrpc_body);
+       struct gss_rep_header     *rephdr;
+       int                     first_check = 1;
+       int                     rc = SECSVC_DROP;
+       ENTRY;
+
+       memset(&rsikey, 0, sizeof(rsikey));
+       rsikey.lustre_svc = lustre_svc;
+       rsikey.nid = (__u64) req->rq_peer.nid;
+
+       /* duplicate the context handle; for INIT it is always 0 */
+       if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) {
+               CERROR("fail to dup context handle\n");
+               GOTO(out, rc);
+       }
+
+       if (rawobj_dup(&rsikey.in_token, in_token)) {
+               CERROR("can't duplicate token\n");
+               rawobj_free(&rsikey.in_handle);
+               GOTO(out, rc);
+       }
+
+       rsip = rsi_lookup(&rsikey);
+       rsi_free(&rsikey);
+       if (!rsip) {
+               CERROR("error in rsi_lookup.\n");
+
+               if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0))
+                       rc = SECSVC_COMPLETE;
+
+               GOTO(out, rc);
+       }
+
+       cache_get(&rsip->h); /* take an extra ref */
+       init_waitqueue_head(&rsip->waitq);
+       init_waitqueue_entry_current(&wait);
+       add_wait_queue(&rsip->waitq, &wait);
+
+cache_check:
+       /* Note that each call to cache_check() drops a reference when it
+        * returns non-zero. We hold an extra reference on the initial rsip,
+        * but must take care of the following calls. */
+       rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle);
+       switch (rc) {
+       case -EAGAIN: {
+               int valid;
+
+               if (first_check) {
+                       first_check = 0;
+
+                       read_lock(&rsi_cache.hash_lock);
+                       valid = test_bit(CACHE_VALID, &rsip->h.flags);
+                       if (valid == 0)
+                               set_current_state(TASK_INTERRUPTIBLE);
+                       read_unlock(&rsi_cache.hash_lock);
+
+                       if (valid == 0)
+                               schedule_timeout(GSS_SVC_UPCALL_TIMEOUT *
+                                                    HZ);
+
+                       cache_get(&rsip->h);
+                       goto cache_check;
+               }
+               CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT);
+               break;
+       }
+       case -ENOENT:
+               CWARN("cache_check return ENOENT, drop\n");
+               break;
+       case 0:
+               /* if not the first check, we have to release the extra
+                * reference we just added on it. */
+               if (!first_check)
+                       cache_put(&rsip->h, &rsi_cache);
+               CDEBUG(D_SEC, "cache_check is good\n");
+               break;
+       }
+
+       remove_wait_queue(&rsip->waitq, &wait);
+       cache_put(&rsip->h, &rsi_cache);
+
+       if (rc)
+               GOTO(out, rc = SECSVC_DROP);
+
+       rc = SECSVC_DROP;
+       rsci = gss_svc_searchbyctx(&rsip->out_handle);
+       if (!rsci) {
+               CERROR("authentication failed\n");
+
+               if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0))
+                       rc = SECSVC_COMPLETE;
+
+               GOTO(out, rc);
+       } else {
+               cache_get(&rsci->h);
+               grctx->src_ctx = &rsci->ctx;
+       }
+
+       if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) {
+               CERROR("failed duplicate reverse handle\n");
+               GOTO(out, rc);
+       }
+
+       rsci->target = target;
+
+       CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n",
+              rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+       if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) {
+               CERROR("handle size %u too large\n", rsip->out_handle.len);
+               GOTO(out, rc = SECSVC_DROP);
+       }
+
+       grctx->src_init = 1;
+       grctx->src_reserve_len = cfs_size_round4(rsip->out_token.len);
+
+       rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
+       if (rc) {
+               CERROR("failed to pack reply: %d\n", rc);
+               GOTO(out, rc = SECSVC_DROP);
+       }
+
+       rs = req->rq_reply_state;
+       LASSERT(rs->rs_repbuf->lm_bufcount == 3);
+       LASSERT(rs->rs_repbuf->lm_buflens[0] >=
+               sizeof(*rephdr) + rsip->out_handle.len);
+       LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len);
+
+       rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0);
+       rephdr->gh_version = PTLRPC_GSS_VERSION;
+       rephdr->gh_flags = 0;
+       rephdr->gh_proc = PTLRPC_GSS_PROC_ERR;
+       rephdr->gh_major = rsip->major_status;
+       rephdr->gh_minor = rsip->minor_status;
+       rephdr->gh_seqwin = GSS_SEQ_WIN;
+       rephdr->gh_handle.len = rsip->out_handle.len;
+       memcpy(rephdr->gh_handle.data, rsip->out_handle.data,
+              rsip->out_handle.len);
+
+       memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data,
+              rsip->out_token.len);
+
+       rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2,
+                                              rsip->out_token.len, 0);
+
+       rc = SECSVC_OK;
+
+out:
+       /* it looks like we should put rsip here as well, but that messes
+        * up the NFS cache management code... FIXME */
+#if 0
+       if (rsip)
+               rsi_put(&rsip->h, &rsi_cache);
+#endif
+
+       if (rsci) {
+               /* if anything went wrong, we don't keep the context too */
+               if (rc != SECSVC_OK)
+                       set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+               else
+                       CDEBUG(D_SEC, "create rsc with idx "LPX64"\n",
+                              gss_handle_to_u64(&rsci->handle));
+
+               COMPAT_RSC_PUT(&rsci->h, &rsc_cache);
+       }
+       RETURN(rc);
+}
+
+struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req,
+                                          struct gss_wire_ctx *gw)
+{
+       struct rsc *rsc;
+
+       rsc = gss_svc_searchbyctx(&gw->gw_handle);
+       if (!rsc) {
+               CWARN("Invalid gss ctx idx "LPX64" from %s\n",
+                     gss_handle_to_u64(&gw->gw_handle),
+                     libcfs_nid2str(req->rq_peer.nid));
+               return NULL;
+       }
+
+       return &rsc->ctx;
+}
+
+void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx)
+{
+       struct rsc *rsc = container_of(ctx, struct rsc, ctx);
+
+       COMPAT_RSC_PUT(&rsc->h, &rsc_cache);
+}
+
+void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx)
+{
+       struct rsc *rsc = container_of(ctx, struct rsc, ctx);
+
+       /* can't be found */
+       set_bit(CACHE_NEGATIVE, &rsc->h.flags);
+       /* to be removed at next scan */
+       rsc->h.expiry_time = 1;
+}
+
+int __init gss_init_svc_upcall(void)
+{
+       int     i;
+
+       spin_lock_init(&__ctx_index_lock);
+       /*
+        * this helps reduce context index collisions. after a server reboot,
+        * conflicting requests from clients might be filtered out by the
+        * initial sequence number check, leaving no chance to send an error
+        * notification back to the clients.
+        */
+       cfs_get_random_bytes(&__ctx_index, sizeof(__ctx_index));
+
+
+       cache_register(&rsi_cache);
+       cache_register(&rsc_cache);
+
+       /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open
+        * the init upcall channel, otherwise there is a big chance that the
+        * first upcall is issued before the channel has been opened, in which
+        * case the nfsv4 cache code will drop the request directly, leading to
+        * unnecessary recovery time. here we wait at most 1.5 seconds. */
+       for (i = 0; i < 6; i++) {
+               if (atomic_read(&rsi_cache.readers) > 0)
+                       break;
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               LASSERT(HZ >= 4);
+               schedule_timeout(HZ / 4);
+       }
+
+       if (atomic_read(&rsi_cache.readers) == 0)
+               CWARN("Init channel is not opened by lsvcgssd, following "
+                     "requests might be dropped until lsvcgssd is active\n");
+
+       return 0;
+}
+
+void __exit gss_exit_svc_upcall(void)
+{
+       cache_purge(&rsi_cache);
+       cache_unregister(&rsi_cache);
+
+       cache_purge(&rsc_cache);
+       cache_unregister(&rsc_cache);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c
new file mode 100644 (file)
index 0000000..2522e05
--- /dev/null
@@ -0,0 +1,219 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct proc_dir_entry *gss_proc_root = NULL;
+static struct proc_dir_entry *gss_proc_lk = NULL;
+
+/*
+ * statistic of "out-of-sequence-window"
+ */
+static struct {
+       spinlock_t  oos_lock;
+       atomic_t    oos_cli_count;       /* client occurrence */
+       int          oos_cli_behind;      /* client max seqs behind */
+       atomic_t    oos_svc_replay[3];   /* server replay detected */
+       atomic_t    oos_svc_pass[3];     /* server verified ok */
+} gss_stat_oos = {
+       .oos_cli_count  = ATOMIC_INIT(0),
+       .oos_cli_behind = 0,
+       .oos_svc_replay = { ATOMIC_INIT(0), },
+       .oos_svc_pass   = { ATOMIC_INIT(0), },
+};
+
+void gss_stat_oos_record_cli(int behind)
+{
+       atomic_inc(&gss_stat_oos.oos_cli_count);
+
+       spin_lock(&gss_stat_oos.oos_lock);
+       if (behind > gss_stat_oos.oos_cli_behind)
+               gss_stat_oos.oos_cli_behind = behind;
+       spin_unlock(&gss_stat_oos.oos_lock);
+}
+
+void gss_stat_oos_record_svc(int phase, int replay)
+{
+       LASSERT(phase >= 0 && phase <= 2);
+
+       if (replay)
+               atomic_inc(&gss_stat_oos.oos_svc_replay[phase]);
+       else
+               atomic_inc(&gss_stat_oos.oos_svc_pass[phase]);
+}
+
+static int gss_proc_read_oos(char *page, char **start, off_t off, int count,
+                            int *eof, void *data)
+{
+       int written;
+
+       written = snprintf(page, count,
+                       "seqwin:                %u\n"
+                       "backwin:              %u\n"
+                       "client fall behind seqwin\n"
+                       "  occurrence:    %d\n"
+                       "  max seq behind:      %d\n"
+                       "server replay detected:\n"
+                       "  phase 0:          %d\n"
+                       "  phase 1:          %d\n"
+                       "  phase 2:          %d\n"
+                       "server verify ok:\n"
+                       "  phase 2:          %d\n",
+                       GSS_SEQ_WIN_MAIN,
+                       GSS_SEQ_WIN_BACK,
+                       atomic_read(&gss_stat_oos.oos_cli_count),
+                       gss_stat_oos.oos_cli_behind,
+                       atomic_read(&gss_stat_oos.oos_svc_replay[0]),
+                       atomic_read(&gss_stat_oos.oos_svc_replay[1]),
+                       atomic_read(&gss_stat_oos.oos_svc_replay[2]),
+                       atomic_read(&gss_stat_oos.oos_svc_pass[2]));
+
+       return written;
+}
+
+static int gss_proc_write_secinit(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{
+       int rc;
+
+       rc = gss_do_ctx_init_rpc((char *) buffer, count);
+       if (rc) {
+               LASSERT(rc < 0);
+               return rc;
+       }
+
+       return ((int) count);
+}
+
+static struct lprocfs_vars gss_lprocfs_vars[] = {
+       { "replays", gss_proc_read_oos, NULL },
+       { "init_channel", NULL, gss_proc_write_secinit, NULL, NULL, 0222 },
+       { NULL }
+};
+
+/*
+ * for userspace helper lgss_keyring.
+ *
+ * debug_level: [0, 4], defined in utils/gss/lgss_utils.h
+ */
+static int gss_lk_debug_level = 1;
+
+static int gss_lk_proc_read_dl(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+       return snprintf(page, count, "%u\n", gss_lk_debug_level);
+}
+
+static int gss_lk_proc_write_dl(struct file *file, const char *buffer,
+                               unsigned long count, void *data)
+{
+       int     val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val < 0 || val > 4)
+               return -ERANGE;
+
+       gss_lk_debug_level = val;
+       return count;
+}
+
+static struct lprocfs_vars gss_lk_lprocfs_vars[] = {
+       { "debug_level", gss_lk_proc_read_dl, gss_lk_proc_write_dl, NULL },
+       { NULL }
+};
+
+void gss_exit_lproc(void)
+{
+       if (gss_proc_lk) {
+               lprocfs_remove(&gss_proc_lk);
+               gss_proc_lk = NULL;
+       }
+
+       if (gss_proc_root) {
+               lprocfs_remove(&gss_proc_root);
+               gss_proc_root = NULL;
+       }
+}
+
+int gss_init_lproc(void)
+{
+       int     rc;
+
+       spin_lock_init(&gss_stat_oos.oos_lock);
+
+       gss_proc_root = lprocfs_register("gss", sptlrpc_proc_root,
+                                        gss_lprocfs_vars, NULL);
+       if (IS_ERR(gss_proc_root)) {
+               rc = PTR_ERR(gss_proc_root);
+               gss_proc_root = NULL;
+               GOTO(err_out, rc);
+       }
+
+       gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root,
+                                      gss_lk_lprocfs_vars, NULL);
+       if (IS_ERR(gss_proc_lk)) {
+               rc = PTR_ERR(gss_proc_lk);
+               gss_proc_lk = NULL;
+               GOTO(err_out, rc);
+       }
+
+       return 0;
+
+err_out:
+       CERROR("failed to initialize gss lproc entries: %d\n", rc);
+       gss_exit_lproc();
+       return rc;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c
new file mode 100644 (file)
index 0000000..ebca858
--- /dev/null
@@ -0,0 +1,2916 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#include <linux/crypto.h>
+#include <linux/crc32.h>
+
+/*
+ * early replies have a fixed size in privacy and integrity modes
+ * respectively, so we calculate them only once.
+ */
+static int gss_at_reply_off_integ;
+static int gss_at_reply_off_priv;
+
+
+static inline int msg_last_segidx(struct lustre_msg *msg)
+{
+       LASSERT(msg->lm_bufcount > 0);
+       return msg->lm_bufcount - 1;
+}
+static inline int msg_last_seglen(struct lustre_msg *msg)
+{
+       return msg->lm_buflens[msg_last_segidx(msg)];
+}
+
+/********************************************
+ * wire data swabber                   *
+ ********************************************/
+
+static
+void gss_header_swabber(struct gss_header *ghdr)
+{
+       __swab32s(&ghdr->gh_flags);
+       __swab32s(&ghdr->gh_proc);
+       __swab32s(&ghdr->gh_seq);
+       __swab32s(&ghdr->gh_svc);
+       __swab32s(&ghdr->gh_pad1);
+       __swab32s(&ghdr->gh_handle.len);
+}
+
+struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment,
+                                  int swabbed)
+{
+       struct gss_header *ghdr;
+
+       ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr));
+       if (ghdr == NULL)
+               return NULL;
+
+       if (swabbed)
+               gss_header_swabber(ghdr);
+
+       if (sizeof(*ghdr) + ghdr->gh_handle.len > msg->lm_buflens[segment]) {
+               CERROR("gss header has length %d, now %u received\n",
+                      (int) sizeof(*ghdr) + ghdr->gh_handle.len,
+                      msg->lm_buflens[segment]);
+               return NULL;
+       }
+
+       return ghdr;
+}
+
+#if 0
+static
+void gss_netobj_swabber(netobj_t *obj)
+{
+       __swab32s(&obj->len);
+}
+
+netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment)
+{
+       netobj_t  *obj;
+
+       obj = lustre_swab_buf(msg, segment, sizeof(*obj), gss_netobj_swabber);
+       if (obj && sizeof(*obj) + obj->len > msg->lm_buflens[segment]) {
+               CERROR("netobj require length %u but only %u received\n",
+                      (unsigned int) sizeof(*obj) + obj->len,
+                      msg->lm_buflens[segment]);
+               return NULL;
+       }
+
+       return obj;
+}
+#endif
+
+/*
+ * The payload size should be obtained from the mechanism, but since we
+ * currently only support kerberos we can simply use fixed values.
+ * krb5 "meta" data:
+ *  - krb5 header:      16
+ *  - krb5 checksum:    20
+ *
+ * for privacy mode the payload also includes the cipher text, which has the
+ * same size as the plain text, plus a possible confounder and padding, each
+ * at most one cipher block in size.
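+ *
+ * for example, a 1024-byte message in privacy mode reserves
+ * 40 + 16 + 16 + 16 + 1024 = 1112 bytes (see gss_mech_payload() below).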
+ */
+#define GSS_KRB5_INTEG_MAX_PAYLOAD      (40)
+
+static inline
+int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy)
+{
+       if (privacy)
+               return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize;
+       else
+               return GSS_KRB5_INTEG_MAX_PAYLOAD;
+}
+
+/*
+ * return signature size, otherwise < 0 to indicate error
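+ *
+ * the MIC is computed over the first buffer only for SPTLRPC_SVC_AUTH, or
+ * over every buffer except the last one for SPTLRPC_SVC_INTG, and is stored
+ * in the last buffer of the message; SPTLRPC_SVC_NULL carries no signature.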
+ */
+static int gss_sign_msg(struct lustre_msg *msg,
+                       struct gss_ctx *mechctx,
+                       enum lustre_sec_part sp,
+                       __u32 flags, __u32 proc, __u32 seq, __u32 svc,
+                       rawobj_t *handle)
+{
+       struct gss_header      *ghdr;
+       rawobj_t                text[4], mic;
+       int                  textcnt, max_textcnt, mic_idx;
+       __u32              major;
+
+       LASSERT(msg->lm_bufcount >= 2);
+
+       /* gss hdr */
+       LASSERT(msg->lm_buflens[0] >=
+               sizeof(*ghdr) + (handle ? handle->len : 0));
+       ghdr = lustre_msg_buf(msg, 0, 0);
+
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_sp = (__u8) sp;
+       ghdr->gh_flags = flags;
+       ghdr->gh_proc = proc;
+       ghdr->gh_seq = seq;
+       ghdr->gh_svc = svc;
+       if (!handle) {
+               /* fill in a fake one */
+               ghdr->gh_handle.len = 0;
+       } else {
+               ghdr->gh_handle.len = handle->len;
+               memcpy(ghdr->gh_handle.data, handle->data, handle->len);
+       }
+
+       /* no actual signature for null mode */
+       if (svc == SPTLRPC_SVC_NULL)
+               return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+       /* MIC */
+       mic_idx = msg_last_segidx(msg);
+       max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx;
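+       /* AUTH signs only the gss header (segment 0); INTG signs every
+        * segment before the MIC segment */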
+
+       for (textcnt = 0; textcnt < max_textcnt; textcnt++) {
+               text[textcnt].len = msg->lm_buflens[textcnt];
+               text[textcnt].data = lustre_msg_buf(msg, textcnt, 0);
+       }
+
+       mic.len = msg->lm_buflens[mic_idx];
+       mic.data = lustre_msg_buf(msg, mic_idx, 0);
+
+       major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("fail to generate MIC: %08x\n", major);
+               return -EPERM;
+       }
+       LASSERT(mic.len <= msg->lm_buflens[mic_idx]);
+
+       return lustre_shrink_msg(msg, mic_idx, mic.len, 0);
+}
+
+/*
+ * return gss error
+ */
+static
+__u32 gss_verify_msg(struct lustre_msg *msg,
+                    struct gss_ctx *mechctx,
+                    __u32 svc)
+{
+       rawobj_t        text[4], mic;
+       int          textcnt, max_textcnt;
+       int          mic_idx;
+       __u32      major;
+
+       LASSERT(msg->lm_bufcount >= 2);
+
+       if (svc == SPTLRPC_SVC_NULL)
+               return GSS_S_COMPLETE;
+
+       mic_idx = msg_last_segidx(msg);
+       max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx;
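+       /* same text segment selection as in gss_sign_msg() */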
+
+       for (textcnt = 0; textcnt < max_textcnt; textcnt++) {
+               text[textcnt].len = msg->lm_buflens[textcnt];
+               text[textcnt].data = lustre_msg_buf(msg, textcnt, 0);
+       }
+
+       mic.len = msg->lm_buflens[mic_idx];
+       mic.data = lustre_msg_buf(msg, mic_idx, 0);
+
+       major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic);
+       if (major != GSS_S_COMPLETE)
+               CERROR("mic verify error: %08x\n", major);
+
+       return major;
+}
+
+/*
+ * return gss error code
+ */
+static
+__u32 gss_unseal_msg(struct gss_ctx *mechctx,
+                  struct lustre_msg *msgbuf,
+                  int *msg_len, int msgbuf_len)
+{
+       rawobj_t                 clear_obj, hdrobj, token;
+       __u8                *clear_buf;
+       int                   clear_buflen;
+       __u32               major;
+       ENTRY;
+
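+       /* privacy-mode messages carry exactly two segments on the wire:
+        * the gss header and the cipher text */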
+       if (msgbuf->lm_bufcount != 2) {
+               CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount);
+               RETURN(GSS_S_FAILURE);
+       }
+
+       /* allocate a temporary clear-text buffer the same size as the token;
+        * we assume the final clear text size <= token size */
+       clear_buflen = lustre_msg_buflen(msgbuf, 1);
+       OBD_ALLOC_LARGE(clear_buf, clear_buflen);
+       if (!clear_buf)
+               RETURN(GSS_S_FAILURE);
+
+       /* buffer objects */
+       hdrobj.len = lustre_msg_buflen(msgbuf, 0);
+       hdrobj.data = lustre_msg_buf(msgbuf, 0, 0);
+       token.len = lustre_msg_buflen(msgbuf, 1);
+       token.data = lustre_msg_buf(msgbuf, 1, 0);
+       clear_obj.len = clear_buflen;
+       clear_obj.data = clear_buf;
+
+       major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("unwrap message error: %08x\n", major);
+               GOTO(out_free, major = GSS_S_FAILURE);
+       }
+       LASSERT(clear_obj.len <= clear_buflen);
+       LASSERT(clear_obj.len <= msgbuf_len);
+
+       /* now the decrypted message */
+       memcpy(msgbuf, clear_obj.data, clear_obj.len);
+       *msg_len = clear_obj.len;
+
+       major = GSS_S_COMPLETE;
+out_free:
+       OBD_FREE_LARGE(clear_buf, clear_buflen);
+       RETURN(major);
+}
+
+/********************************************
+ * gss client context manipulation helpers  *
+ ********************************************/
+
+int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount));
+
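+       /* only the first caller to mark the ctx DEAD logs the expiry and
+        * wakes any waiters; later callers just return 0 */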
+       if (!test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags)) {
+               if (!ctx->cc_early_expire)
+                       clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+               CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n",
+                     ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+                     ctx->cc_expire,
+                     ctx->cc_expire == 0 ? 0 :
+                     cfs_time_sub(ctx->cc_expire, cfs_time_current_sec()));
+
+               sptlrpc_cli_ctx_wakeup(ctx);
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * return 1 if the context is dead.
+ */
+int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx)
+{
+       if (unlikely(cli_ctx_is_dead(ctx)))
+               return 1;
+
+       /* cc_expire == 0 means never expire. a newly created gss context
+        * may have 0 expiration while its upcall is still in progress */
+       if (ctx->cc_expire == 0)
+               return 0;
+
+       /* check real expiration */
+       if (cfs_time_after(ctx->cc_expire, cfs_time_current_sec()))
+               return 0;
+
+       cli_ctx_expire(ctx);
+       return 1;
+}
+
+void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx)
+{
+       struct ptlrpc_cli_ctx  *ctx = &gctx->gc_base;
+       unsigned long      ctx_expiry;
+
+       if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) {
+               CERROR("ctx %p(%u): unable to inquire, expire it now\n",
+                      gctx, ctx->cc_vcred.vc_uid);
+               ctx_expiry = 1; /* make it expired now */
+       }
+
+       ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry,
+                                             ctx->cc_sec->ps_flvr.sf_flags);
+
+       /* At this point this ctx might have been marked as dead by
+        * someone else, in which case nobody will make further use
+        * of it. we don't care; marking it UPTODATE will help destroy
+        * the server side context when this ctx is destroyed. */
+       set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+       if (sec_is_reverse(ctx->cc_sec)) {
+               CWARN("server installed reverse ctx %p idx "LPX64", "
+                     "expiry %lu(%+lds)\n", ctx,
+                     gss_handle_to_u64(&gctx->gc_handle),
+                     ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec());
+       } else {
+               CWARN("client refreshed ctx %p idx "LPX64" (%u->%s), "
+                     "expiry %lu(%+lds)\n", ctx,
+                     gss_handle_to_u64(&gctx->gc_handle),
+                     ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+                     ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec());
+
+               /* install reverse svc ctx for root context */
+               if (ctx->cc_vcred.vc_uid == 0)
+                       gss_sec_install_rctx(ctx->cc_sec->ps_import,
+                                            ctx->cc_sec, ctx);
+       }
+
+       sptlrpc_cli_ctx_wakeup(ctx);
+}
+
+static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx)
+{
+       LASSERT(gctx->gc_base.cc_sec);
+
+       if (gctx->gc_mechctx) {
+               lgss_delete_sec_context(&gctx->gc_mechctx);
+               gctx->gc_mechctx = NULL;
+       }
+
+       if (!rawobj_empty(&gctx->gc_svc_handle)) {
+               /* forward ctx: mark buddy reverse svc ctx soon-to-expire. */
+               if (!sec_is_reverse(gctx->gc_base.cc_sec) &&
+                   !rawobj_empty(&gctx->gc_svc_handle))
+                       gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle);
+
+               rawobj_free(&gctx->gc_svc_handle);
+       }
+
+       rawobj_free(&gctx->gc_handle);
+}
+
+/*
+ * Based on the sequence number algorithm specified in RFC 2203.
+ *
+ * Modified for our own problem: an arriving request has a valid sequence
+ * number, but unwrapping the request might take a long time, after which
+ * its sequence number is no longer valid (it has fallen behind the window).
+ * This rarely happens, mostly under extreme load.
+ *
+ * Note we should not check the sequence number before verifying the
+ * integrity of the incoming request, because a single forged request with
+ * a high sequence number could cause all following requests to be dropped.
+ *
+ * So here we use a multi-phase approach: prepare 2 sequence windows, a
+ * "main window" for normal sequence numbers and a "back window" for those
+ * that have fallen behind, plus a 3-phase checking mechanism:
+ *  0 - before integrity verification, perform an initial sequence check in
+ *      the main window, which only tests and doesn't actually set any bits.
+ *      if the sequence is above the window, or fits in the window and the
+ *      bit is 0, then accept and proceed to integrity verification;
+ *      otherwise reject this sequence.
+ *  1 - after integrity verification, check the main window again. if this
+ *      sequence is above the window, or fits in the window and the bit is
+ *      0, then set the bit and accept; if it fits in the window but the bit
+ *      is already set, then reject; if it falls behind the window, then
+ *      proceed to phase 2.
+ *  2 - check the back window. if it is above the window, or fits in the
+ *      window and the bit is 0, then set the bit and accept; otherwise
+ *      reject.
+ *
+ * return value:
+ *   1: looks like a replay
+ *   0: is ok
+ *  -1: is a replay
+ *
+ * Note phase 0 is necessary, because otherwise a replayed request whose
+ * sequence number lies between the 2 windows couldn't be detected.
+ *
+ * This mechanism can't totally solve the problem, but it helps reduce the
+ * number of valid requests that get dropped.
+ */
+static
+int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq,
+                    __u32 seq_num, int phase)
+{
+       LASSERT(phase >= 0 && phase <= 2);
+
+       if (seq_num > *max_seq) {
+               /*
+                * 1. high above the window
+                */
+               if (phase == 0)
+                       return 0;
+
+               if (seq_num >= *max_seq + win_size) {
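+                       /* jumped at least a full window ahead: every bit of
+                        * the old window is now stale, so simply reset it */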
+                       memset(window, 0, win_size / 8);
+                       *max_seq = seq_num;
+               } else {
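+                       /* advance the window one slot at a time, clearing
+                        * the bits that slide out of range */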
+                       while (*max_seq < seq_num) {
+                               (*max_seq)++;
+                               __clear_bit((*max_seq) % win_size, window);
+                       }
+               }
+               __set_bit(seq_num % win_size, window);
+       } else if (seq_num + win_size <= *max_seq) {
+               /*
+                * 2. low behind the window
+                */
+               if (phase == 0 || phase == 2)
+                       goto replay;
+
+               CWARN("seq %u is %u behind (size %d), check backup window\n",
+                     seq_num, *max_seq - win_size - seq_num, win_size);
+               return 1;
+       } else {
+               /*
+                * 3. fit into the window
+                */
+               switch (phase) {
+               case 0:
+                       if (test_bit(seq_num % win_size, window))
+                               goto replay;
+                       break;
+               case 1:
+               case 2:
+                       if (__test_and_set_bit(seq_num % win_size, window))
+                               goto replay;
+                       break;
+               }
+       }
+
+       return 0;
+
+replay:
+       CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n",
+              seq_num,
+              seq_num + win_size > *max_seq ? "in" : "behind",
+              phase == 2 ? "backup" : "main",
+              *max_seq, win_size);
+       return -1;
+}
+
+/*
+ * Based on sequence number algorithm as specified in RFC 2203.
+ *
+ * if @set == 0: initial check, don't set any bit in window
+ * if @set == 1: final check, set bit in window
+ */
+int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set)
+{
+       int rc = 0;
+
+       spin_lock(&ssd->ssd_lock);
+
+       if (set == 0) {
+               /*
+                * phase 0 testing
+                */
+               rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN,
+                                     &ssd->ssd_max_main, seq_num, 0);
+               if (unlikely(rc))
+                       gss_stat_oos_record_svc(0, 1);
+       } else {
+               /*
+                * phase 1 checking main window
+                */
+               rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN,
+                                     &ssd->ssd_max_main, seq_num, 1);
+               switch (rc) {
+               case -1:
+                       gss_stat_oos_record_svc(1, 1);
+                       /* fall through */
+               case 0:
+                       goto exit;
+               }
+               /*
+                * phase 2 checking back window
+                */
+               rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK,
+                                     &ssd->ssd_max_back, seq_num, 2);
+               if (rc)
+                       gss_stat_oos_record_svc(2, 1);
+               else
+                       gss_stat_oos_record_svc(2, 0);
+       }
+exit:
+       spin_unlock(&ssd->ssd_lock);
+       return rc;
+}
+
+/****************************************
+ * cred APIs                            *
+ ****************************************/
+
+static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx,
+                                 int msgsize, int privacy)
+{
+       return gss_mech_payload(NULL, msgsize, privacy);
+}
+
+static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx,
+                               struct sptlrpc_flavor *flvr,
+                               int reply, int read)
+{
+       int     payload = sizeof(struct ptlrpc_bulk_sec_desc);
+
+       LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT);
+
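+       /* presumably only the message flowing in the same direction as the
+        * bulk data needs the extra payload: the request for a bulk write,
+        * the reply for a bulk read */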
+       if ((!reply && !read) || (reply && read)) {
+               switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+               case SPTLRPC_BULK_SVC_NULL:
+                       break;
+               case SPTLRPC_BULK_SVC_INTG:
+                       payload += gss_cli_payload(ctx, 0, 0);
+                       break;
+               case SPTLRPC_BULK_SVC_PRIV:
+                       payload += gss_cli_payload(ctx, 0, 1);
+                       break;
+               case SPTLRPC_BULK_SVC_AUTH:
+               default:
+                       LBUG();
+               }
+       }
+
+       return payload;
+}
+
+int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
+{
+       return (ctx->cc_vcred.vc_uid == vcred->vc_uid);
+}
+
+void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+       buf[0] = '\0';
+
+       if (flags & PTLRPC_CTX_NEW)
+               strncat(buf, "new,", bufsize);
+       if (flags & PTLRPC_CTX_UPTODATE)
+               strncat(buf, "uptodate,", bufsize);
+       if (flags & PTLRPC_CTX_DEAD)
+               strncat(buf, "dead,", bufsize);
+       if (flags & PTLRPC_CTX_ERROR)
+               strncat(buf, "error,", bufsize);
+       if (flags & PTLRPC_CTX_CACHED)
+               strncat(buf, "cached,", bufsize);
+       if (flags & PTLRPC_CTX_ETERNAL)
+               strncat(buf, "eternal,", bufsize);
+       if (buf[0] == '\0')
+               strncat(buf, "-,", bufsize);
+
+       buf[strlen(buf) - 1] = '\0';
+}
+
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx,
+                    struct ptlrpc_request *req)
+{
+       struct gss_cli_ctx      *gctx = ctx2gctx(ctx);
+       __u32               flags = 0, seq, svc;
+       int                   rc;
+       ENTRY;
+
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+       LASSERT(req->rq_cli_ctx == ctx);
+
+       /* nothing to do for context negotiation RPCs */
+       if (req->rq_ctx_init)
+               RETURN(0);
+
+       svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+       if (req->rq_pack_bulk)
+               flags |= LUSTRE_GSS_PACK_BULK;
+       if (req->rq_pack_udesc)
+               flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+       seq = atomic_inc_return(&gctx->gc_seq);
+
+       rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx,
+                         ctx->cc_sec->ps_part,
+                         flags, gctx->gc_proc, seq, svc,
+                         &gctx->gc_handle);
+       if (rc < 0)
+               RETURN(rc);
+
+       /* gss_sign_msg() might take a long time to finish, during which
+        * more rpcs could be wrapped up and sent out. if we find too many
+        * of them we should repack this rpc, because sending it too late
+        * might cause its sequence number to fall behind the window on the
+        * server and the rpc to be dropped. the same applies to
+        * gss_cli_ctx_seal().
+        *
+        * Note: null mode doesn't check sequence numbers. */
+       if (svc != SPTLRPC_SVC_NULL &&
+           atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) {
+               int behind = atomic_read(&gctx->gc_seq) - seq;
+
+               gss_stat_oos_record_cli(behind);
+               CWARN("req %p: %u behind, retry signing\n", req, behind);
+               goto redo;
+       }
+
+       req->rq_reqdata_len = rc;
+       RETURN(0);
+}
+
+static
+int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx,
+                                 struct ptlrpc_request *req,
+                                 struct gss_header *ghdr)
+{
+       struct gss_err_header *errhdr;
+       int rc;
+
+       LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR);
+
+       errhdr = (struct gss_err_header *) ghdr;
+
+       CWARN("req x"LPU64"/t"LPU64", ctx %p idx "LPX64"(%u->%s): "
+             "%sserver responded (%08x/%08x)\n",
+             req->rq_xid, req->rq_transno, ctx,
+             gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle),
+             ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+             sec_is_reverse(ctx->cc_sec) ? "reverse " : "",
+             errhdr->gh_major, errhdr->gh_minor);
+
+       /* context fini rpc, let it fail */
+       if (req->rq_ctx_fini) {
+               CWARN("context fini rpc failed\n");
+               return -EINVAL;
+       }
+
+       /* reverse sec: just return the error, don't expire this ctx because
+        * it's crucial for callback rpcs. note that if a callback rpc fails
+        * because of a bit flip during network transfer, the client will be
+        * evicted directly. so, more gracefully, we probably want to let it
+        * retry a number of times. */
+       if (sec_is_reverse(ctx->cc_sec))
+               return -EINVAL;
+
+       if (errhdr->gh_major != GSS_S_NO_CONTEXT &&
+           errhdr->gh_major != GSS_S_BAD_SIG)
+               return -EACCES;
+
+       /* the server returning NO_CONTEXT might be caused by context
+        * expiration or server reboot/failover. we try to refresh a new ctx,
+        * which should be transparent to the upper layer.
+        *
+        * In some cases our gss handle may incidentally be identical to
+        * another handle, since the handle itself is not fully random. In
+        * the krb5 case GSS_S_BAD_SIG will be returned; other mechanisms may
+        * return other gss errors.
+        *
+        * if we add a new mechanism, make sure the correct error is
+        * returned in this case. */
+       CWARN("%s: server might lost the context, retrying\n",
+             errhdr->gh_major == GSS_S_NO_CONTEXT ?  "NO_CONTEXT" : "BAD_SIG");
+
+       sptlrpc_cli_ctx_expire(ctx);
+
+       /* we need to replace the ctx right here, otherwise during
+        * resend we'll hit the logic in sptlrpc_req_refresh_ctx()
+        * which keeps the ctx with the RESEND flag and thus we'll
+        * never get rid of this ctx. */
+       rc = sptlrpc_req_replace_dead_ctx(req);
+       if (rc == 0)
+               req->rq_resend = 1;
+
+       return rc;
+}
+
+int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx,
+                      struct ptlrpc_request *req)
+{
+       struct gss_cli_ctx     *gctx;
+       struct gss_header      *ghdr, *reqhdr;
+       struct lustre_msg      *msg = req->rq_repdata;
+       __u32              major;
+       int                  pack_bulk, swabbed, rc = 0;
+       ENTRY;
+
+       LASSERT(req->rq_cli_ctx == ctx);
+       LASSERT(msg);
+
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+       /* special case for context negotiation: rq_repmsg/rq_replen are not
+        * actually used currently, but an early reply is always treated
+        * normally */
+       if (req->rq_ctx_init && !req->rq_early) {
+               req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+               req->rq_replen = msg->lm_buflens[1];
+               RETURN(0);
+       }
+
+       if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) {
+               CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+               RETURN(-EPROTO);
+       }
+
+       swabbed = ptlrpc_rep_need_swab(req);
+
+       ghdr = gss_swab_header(msg, 0, swabbed);
+       if (ghdr == NULL) {
+               CERROR("can't decode gss header\n");
+               RETURN(-EPROTO);
+       }
+
+       /* sanity checks */
+       reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr));
+       LASSERT(reqhdr);
+
+       if (ghdr->gh_version != reqhdr->gh_version) {
+               CERROR("gss version %u mismatch, expect %u\n",
+                      ghdr->gh_version, reqhdr->gh_version);
+               RETURN(-EPROTO);
+       }
+
+       switch (ghdr->gh_proc) {
+       case PTLRPC_GSS_PROC_DATA:
+               pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+               if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){
+                       CERROR("%s bulk flag in reply\n",
+                              req->rq_pack_bulk ? "missing" : "unexpected");
+                       RETURN(-EPROTO);
+               }
+
+               if (ghdr->gh_seq != reqhdr->gh_seq) {
+                       CERROR("seqnum %u mismatch, expect %u\n",
+                              ghdr->gh_seq, reqhdr->gh_seq);
+                       RETURN(-EPROTO);
+               }
+
+               if (ghdr->gh_svc != reqhdr->gh_svc) {
+                       CERROR("svc %u mismatch, expect %u\n",
+                              ghdr->gh_svc, reqhdr->gh_svc);
+                       RETURN(-EPROTO);
+               }
+
+               if (swabbed)
+                       gss_header_swabber(ghdr);
+
+               major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc);
+               if (major != GSS_S_COMPLETE) {
+                       CERROR("failed to verify reply: %x\n", major);
+                       RETURN(-EPERM);
+               }
+
+               if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) {
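+                       /* NULL service provides no MIC; early replies carry
+                        * a plain crc32 in lm_cksum, so verify that instead */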
+                       __u32 cksum;
+
+                       cksum = crc32_le(!(__u32) 0,
+                                        lustre_msg_buf(msg, 1, 0),
+                                        lustre_msg_buflen(msg, 1));
+                       if (cksum != msg->lm_cksum) {
+                               CWARN("early reply checksum mismatch: "
+                                     "%08x != %08x\n", cksum, msg->lm_cksum);
+                               RETURN(-EPROTO);
+                       }
+               }
+
+               if (pack_bulk) {
+                       /* bulk checksum is right after the lustre msg */
+                       if (msg->lm_bufcount < 3) {
+                               CERROR("Invalid reply bufcount %u\n",
+                                      msg->lm_bufcount);
+                               RETURN(-EPROTO);
+                       }
+
+                       rc = bulk_sec_desc_unpack(msg, 2, swabbed);
+                       if (rc) {
+                               CERROR("unpack bulk desc: %d\n", rc);
+                               RETURN(rc);
+                       }
+               }
+
+               req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+               req->rq_replen = msg->lm_buflens[1];
+               break;
+       case PTLRPC_GSS_PROC_ERR:
+               if (req->rq_early) {
+                       CERROR("server return error with early reply\n");
+                       rc = -EPROTO;
+               } else {
+                       rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+               }
+               break;
+       default:
+               CERROR("unknown gss proc %d\n", ghdr->gh_proc);
+               rc = -EPROTO;
+       }
+
+       RETURN(rc);
+}
+
+int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx,
+                    struct ptlrpc_request *req)
+{
+       struct gss_cli_ctx      *gctx;
+       rawobj_t                 hdrobj, msgobj, token;
+       struct gss_header       *ghdr;
+       __u32               buflens[2], major;
+       int                   wiresize, rc;
+       ENTRY;
+
+       LASSERT(req->rq_clrbuf);
+       LASSERT(req->rq_cli_ctx == ctx);
+       LASSERT(req->rq_reqlen);
+
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+       /* final clear data length */
+       req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount,
+                                                req->rq_clrbuf->lm_buflens);
+
+       /* calculate wire data length */
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1);
+       wiresize = lustre_msg_size_v2(2, buflens);
+
+       /* allocate wire buffer */
+       if (req->rq_pool) {
+               /* pre-allocated */
+               LASSERT(req->rq_reqbuf);
+               LASSERT(req->rq_reqbuf != req->rq_clrbuf);
+               LASSERT(req->rq_reqbuf_len >= wiresize);
+       } else {
+               OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize);
+               if (!req->rq_reqbuf)
+                       RETURN(-ENOMEM);
+               req->rq_reqbuf_len = wiresize;
+       }
+
+       lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL);
+       req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       /* gss header */
+       ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0);
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part;
+       ghdr->gh_flags = 0;
+       ghdr->gh_proc = gctx->gc_proc;
+       ghdr->gh_svc = SPTLRPC_SVC_PRIV;
+       ghdr->gh_handle.len = gctx->gc_handle.len;
+       memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len);
+       if (req->rq_pack_bulk)
+               ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK;
+       if (req->rq_pack_udesc)
+               ghdr->gh_flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+       ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq);
+
+       /* buffer objects */
+       hdrobj.len = PTLRPC_GSS_HEADER_SIZE;
+       hdrobj.data = (__u8 *) ghdr;
+       msgobj.len = req->rq_clrdata_len;
+       msgobj.data = (__u8 *) req->rq_clrbuf;
+       token.len = lustre_msg_buflen(req->rq_reqbuf, 1);
+       token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0);
+
+       major = lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj,
+                         req->rq_clrbuf_len, &token);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("priv: wrap message error: %08x\n", major);
+               GOTO(err_free, rc = -EPERM);
+       }
+       LASSERT(token.len <= buflens[1]);
+
+       /* see the explanation in gss_cli_ctx_sign() */
+       if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq >
+                    GSS_SEQ_REPACK_THRESHOLD)) {
+               int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq;
+
+               gss_stat_oos_record_cli(behind);
+               CWARN("req %p: %u behind, retry sealing\n", req, behind);
+
+               ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq);
+               goto redo;
+       }
+
+       /* now set the final wire data length */
+       req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1, token.len,0);
+       RETURN(0);
+
+err_free:
+       if (!req->rq_pool) {
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = NULL;
+               req->rq_reqbuf_len = 0;
+       }
+       RETURN(rc);
+}
+
+int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx,
+                      struct ptlrpc_request *req)
+{
+       struct gss_cli_ctx      *gctx;
+       struct gss_header       *ghdr;
+       struct lustre_msg       *msg = req->rq_repdata;
+       int                   msglen, pack_bulk, swabbed, rc;
+       __u32               major;
+       ENTRY;
+
+       LASSERT(req->rq_cli_ctx == ctx);
+       LASSERT(req->rq_ctx_init == 0);
+       LASSERT(msg);
+
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+       swabbed = ptlrpc_rep_need_swab(req);
+
+       ghdr = gss_swab_header(msg, 0, swabbed);
+       if (ghdr == NULL) {
+               CERROR("can't decode gss header\n");
+               RETURN(-EPROTO);
+       }
+
+       /* sanity checks */
+       if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+               CERROR("gss version %u mismatch, expect %u\n",
+                      ghdr->gh_version, PTLRPC_GSS_VERSION);
+               RETURN(-EPROTO);
+       }
+
+       switch (ghdr->gh_proc) {
+       case PTLRPC_GSS_PROC_DATA:
+               pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+               if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){
+                       CERROR("%s bulk flag in reply\n",
+                              req->rq_pack_bulk ? "missing" : "unexpected");
+                       RETURN(-EPROTO);
+               }
+
+               if (swabbed)
+                       gss_header_swabber(ghdr);
+
+               /* use rq_repdata_len as the buffer size, which assumes
+                * unseal doesn't need extra memory space. for precise
+                * control, we'd better calculate the actual buffer size as
+                * (repbuf_len - offset - repdata_len) */
+               major = gss_unseal_msg(gctx->gc_mechctx, msg,
+                                      &msglen, req->rq_repdata_len);
+               if (major != GSS_S_COMPLETE) {
+                       CERROR("failed to unwrap reply: %x\n", major);
+                       rc = -EPERM;
+                       break;
+               }
+
+               swabbed = __lustre_unpack_msg(msg, msglen);
+               if (swabbed < 0) {
+                       CERROR("Failed to unpack after decryption\n");
+                       RETURN(-EPROTO);
+               }
+
+               if (msg->lm_bufcount < 1) {
+                       CERROR("Invalid reply buffer: empty\n");
+                       RETURN(-EPROTO);
+               }
+
+               if (pack_bulk) {
+                       if (msg->lm_bufcount < 2) {
+                               CERROR("bufcount %u: missing bulk sec desc\n",
+                                      msg->lm_bufcount);
+                               RETURN(-EPROTO);
+                       }
+
+                       /* bulk checksum is the last segment */
+                       if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1,
+                                                swabbed))
+                               RETURN(-EPROTO);
+               }
+
+               req->rq_repmsg = lustre_msg_buf(msg, 0, 0);
+               req->rq_replen = msg->lm_buflens[0];
+
+               rc = 0;
+               break;
+       case PTLRPC_GSS_PROC_ERR:
+               if (req->rq_early) {
+                       CERROR("server return error with early reply\n");
+                       rc = -EPROTO;
+               } else {
+                       rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+               }
+               break;
+       default:
+               CERROR("unexpected proc %d\n", ghdr->gh_proc);
+               rc = -EPERM;
+       }
+
+       RETURN(rc);
+}
+
+/*********************************************
+ * reverse context installation              *
+ *********************************************/
+
+static inline
+int gss_install_rvs_svc_ctx(struct obd_import *imp,
+                           struct gss_sec *gsec,
+                           struct gss_cli_ctx *gctx)
+{
+       return gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx);
+}
+
+/*********************************************
+ * GSS security APIs                         *
+ *********************************************/
+
+int gss_sec_create_common(struct gss_sec *gsec,
+                         struct ptlrpc_sec_policy *policy,
+                         struct obd_import *imp,
+                         struct ptlrpc_svc_ctx *svcctx,
+                         struct sptlrpc_flavor *sf)
+{
+       struct ptlrpc_sec   *sec;
+
+       LASSERT(imp);
+       LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS);
+
+       gsec->gs_mech = lgss_subflavor_to_mech(
+                               SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
+       if (!gsec->gs_mech) {
+               CERROR("gss backend 0x%x not found\n",
+                      SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
+               return -EOPNOTSUPP;
+       }
+
+       spin_lock_init(&gsec->gs_lock);
+       gsec->gs_rvs_hdl = 0ULL;
+
+       /* initialize upper ptlrpc_sec */
+       sec = &gsec->gs_base;
+       sec->ps_policy = policy;
+       atomic_set(&sec->ps_refcount, 0);
+       atomic_set(&sec->ps_nctx, 0);
+       sec->ps_id = sptlrpc_get_next_secid();
+       sec->ps_flvr = *sf;
+       sec->ps_import = class_import_get(imp);
+       spin_lock_init(&sec->ps_lock);
+       INIT_LIST_HEAD(&sec->ps_gc_list);
+
+       if (!svcctx) {
+               sec->ps_gc_interval = GSS_GC_INTERVAL;
+       } else {
+               LASSERT(sec_is_reverse(sec));
+
+               /* never do gc on reverse sec */
+               sec->ps_gc_interval = 0;
+       }
+
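+       /* bulk privacy presumably draws pages from the shared encryption
+        * page pool, so register this sec as a pool user */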
+       if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV)
+               sptlrpc_enc_pool_add_user();
+
+       CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? "reverse " : ""),
+              policy->sp_name, gsec);
+       return 0;
+}
+
+void gss_sec_destroy_common(struct gss_sec *gsec)
+{
+       struct ptlrpc_sec      *sec = &gsec->gs_base;
+       ENTRY;
+
+       LASSERT(sec->ps_import);
+       LASSERT(atomic_read(&sec->ps_refcount) == 0);
+       LASSERT(atomic_read(&sec->ps_nctx) == 0);
+
+       if (gsec->gs_mech) {
+               lgss_mech_put(gsec->gs_mech);
+               gsec->gs_mech = NULL;
+       }
+
+       class_import_put(sec->ps_import);
+
+       if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV)
+               sptlrpc_enc_pool_del_user();
+
+       EXIT;
+}
+
+void gss_sec_kill(struct ptlrpc_sec *sec)
+{
+       sec->ps_dying = 1;
+}
+
+int gss_cli_ctx_init_common(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx,
+                           struct ptlrpc_ctx_ops *ctxops,
+                           struct vfs_cred *vcred)
+{
+       struct gss_cli_ctx    *gctx = ctx2gctx(ctx);
+
+       gctx->gc_win = 0;
+       atomic_set(&gctx->gc_seq, 0);
+
+       INIT_HLIST_NODE(&ctx->cc_cache);
+       atomic_set(&ctx->cc_refcount, 0);
+       ctx->cc_sec = sec;
+       ctx->cc_ops = ctxops;
+       ctx->cc_expire = 0;
+       ctx->cc_flags = PTLRPC_CTX_NEW;
+       ctx->cc_vcred = *vcred;
+       spin_lock_init(&ctx->cc_lock);
+       INIT_LIST_HEAD(&ctx->cc_req_list);
+       INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+       /* take a ref on belonging sec, balanced in ctx destroying */
+       atomic_inc(&sec->ps_refcount);
+       /* statistic only */
+       atomic_inc(&sec->ps_nctx);
+
+       CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n",
+              sec->ps_policy->sp_name, ctx->cc_sec,
+              ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+       return 0;
+}
+
+/*
+ * return value:
+ *   1: the context has been taken care of by someone else
+ *   0: proceed to really destroy the context locally
+ */
+int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx)
+{
+       struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+       LASSERT(atomic_read(&sec->ps_nctx) > 0);
+       LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+       LASSERT(ctx->cc_sec == sec);
+
+       /*
+        * remove the UPTODATE flag of a reverse ctx so we won't send a fini
+        * rpc; this avoids potential problems of the client-side reverse svc
+        * ctx being mis-destroyed in various recovery scenarios. anyway, the
+        * client can manage its reverse ctx well by associating it with its
+        * buddy ctx.
+        */
+       if (sec_is_reverse(sec))
+               ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE;
+
+       if (gctx->gc_mechctx) {
+               /* the final context fini rpc will use this ctx too, and it's
+                * asynchronous, completed by request_out_callback(). so we
+                * take a refcount here; whoever finally drops the refcount
+                * to 0 is responsible for the rest of the destruction. */
+               atomic_inc(&ctx->cc_refcount);
+
+               gss_do_ctx_fini_rpc(gctx);
+               gss_cli_ctx_finalize(gctx);
+
+               if (!atomic_dec_and_test(&ctx->cc_refcount))
+                       return 1;
+       }
+
+       if (sec_is_reverse(sec))
+               CWARN("reverse sec %p: destroy ctx %p\n",
+                     ctx->cc_sec, ctx);
+       else
+               CWARN("%s@%p: destroy ctx %p(%u->%s)\n",
+                     sec->ps_policy->sp_name, ctx->cc_sec,
+                     ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+       return 0;
+}
+
+static
+int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec,
+                         struct ptlrpc_request *req,
+                         int svc, int msgsize)
+{
+       int                    bufsize, txtsize;
+       int                    bufcnt = 2;
+       __u32                buflens[5];
+       ENTRY;
+
+       /*
+        * on-wire data layout:
+        * - gss header
+        * - lustre message
+        * - user descriptor (optional)
+        * - bulk sec descriptor (optional)
+        * - signature (optional)
+        *   - svc == NULL: NULL
+        *   - svc == AUTH: signature of gss header
+        *   - svc == INTG: signature of all above
+        *
+        * if this is context negotiation, reserve fixed space
+        * at the last (signature) segment regardless of svc mode.
+        */
+
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       txtsize = buflens[0];
+
+       buflens[1] = msgsize;
+       if (svc == SPTLRPC_SVC_INTG)
+               txtsize += buflens[1];
+
+       if (req->rq_pack_udesc) {
+               buflens[bufcnt] = sptlrpc_current_user_desc_size();
+               if (svc == SPTLRPC_SVC_INTG)
+                       txtsize += buflens[bufcnt];
+               bufcnt++;
+       }
+
+       if (req->rq_pack_bulk) {
+               buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                      &req->rq_flvr,
+                                                      0, req->rq_bulk_read);
+               if (svc == SPTLRPC_SVC_INTG)
+                       txtsize += buflens[bufcnt];
+               bufcnt++;
+       }
+
+       if (req->rq_ctx_init)
+               buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN;
+       else if (svc != SPTLRPC_SVC_NULL)
+               buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0);
+
+       bufsize = lustre_msg_size_v2(bufcnt, buflens);
+
+       if (!req->rq_reqbuf) {
+               bufsize = size_roundup_power2(bufsize);
+
+               OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize);
+               if (!req->rq_reqbuf)
+                       RETURN(-ENOMEM);
+
+               req->rq_reqbuf_len = bufsize;
+       } else {
+               LASSERT(req->rq_pool);
+               LASSERT(req->rq_reqbuf_len >= bufsize);
+               memset(req->rq_reqbuf, 0, bufsize);
+       }
+
+       lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL);
+       req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize);
+       LASSERT(req->rq_reqmsg);
+
+       /* pack the user desc here, since later we might no longer be running
+        * in the current user's process context */
+       if (req->rq_pack_udesc)
+               sptlrpc_pack_user_desc(req->rq_reqbuf, 2);
+
+       RETURN(0);
+}
+
+static
+int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec,
+                         struct ptlrpc_request *req,
+                         int msgsize)
+{
+       __u32                ibuflens[3], wbuflens[2];
+       int                    ibufcnt;
+       int                    clearsize, wiresize;
+       ENTRY;
+
+       LASSERT(req->rq_clrbuf == NULL);
+       LASSERT(req->rq_clrbuf_len == 0);
+
+       /* Inner (clear) buffers
+        *  - lustre message
+        *  - user descriptor (optional)
+        *  - bulk checksum (optional)
+        */
+       ibufcnt = 1;
+       ibuflens[0] = msgsize;
+
+       if (req->rq_pack_udesc)
+               ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size();
+       if (req->rq_pack_bulk)
+               ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                          &req->rq_flvr, 0,
+                                                          req->rq_bulk_read);
+
+       clearsize = lustre_msg_size_v2(ibufcnt, ibuflens);
+       /* to allow append padding during encryption */
+       clearsize += GSS_MAX_CIPHER_BLOCK;
+
+       /* Wrapper (wire) buffers
+        *  - gss header
+        *  - cipher text
+        */
+       wbuflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1);
+       wiresize = lustre_msg_size_v2(2, wbuflens);
+
+       if (req->rq_pool) {
+               /* rq_reqbuf is preallocated */
+               LASSERT(req->rq_reqbuf);
+               LASSERT(req->rq_reqbuf_len >= wiresize);
+
+               memset(req->rq_reqbuf, 0, req->rq_reqbuf_len);
+
+               /* if the pre-allocated buffer is big enough, pack both the
+                * clear buf and the request buf in it to avoid another
+                * allocation. */
+               if (clearsize + wiresize <= req->rq_reqbuf_len) {
+                       req->rq_clrbuf =
+                               (void *) (((char *) req->rq_reqbuf) + wiresize);
+               } else {
+                       CWARN("pre-allocated buf size %d is not enough for "
+                             "both clear (%d) and cipher (%d) text, proceed "
+                             "with extra allocation\n", req->rq_reqbuf_len,
+                             clearsize, wiresize);
+               }
+       }
+
+       if (!req->rq_clrbuf) {
+               clearsize = size_roundup_power2(clearsize);
+
+               OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize);
+               if (!req->rq_clrbuf)
+                       RETURN(-ENOMEM);
+       }
+       req->rq_clrbuf_len = clearsize;
+
+       lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL);
+       req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize);
+
+       if (req->rq_pack_udesc)
+               sptlrpc_pack_user_desc(req->rq_clrbuf, 1);
+
+       RETURN(0);
+}
+
+/*
+ * NOTE: any change to request buffer allocation should also be reflected
+ * in the enlarge_reqbuf() series of functions.
+ */
+int gss_alloc_reqbuf(struct ptlrpc_sec *sec,
+                    struct ptlrpc_request *req,
+                    int msgsize)
+{
+       int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+
+       LASSERT(!req->rq_pack_bulk ||
+               (req->rq_bulk_read || req->rq_bulk_write));
+
+       switch (svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               return gss_alloc_reqbuf_intg(sec, req, svc, msgsize);
+       case SPTLRPC_SVC_PRIV:
+               return gss_alloc_reqbuf_priv(sec, req, msgsize);
+       default:
+               LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               return 0;
+       }
+}
+
+void gss_free_reqbuf(struct ptlrpc_sec *sec,
+                    struct ptlrpc_request *req)
+{
+       int     privacy;
+       ENTRY;
+
+       LASSERT(!req->rq_pool || req->rq_reqbuf);
+       privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV;
+
+       if (!req->rq_clrbuf)
+               goto release_reqbuf;
+
+       /* release clear buffer */
+       LASSERT(privacy);
+       LASSERT(req->rq_clrbuf_len);
+
+       if (req->rq_pool == NULL ||
+           req->rq_clrbuf < req->rq_reqbuf ||
+           (char *) req->rq_clrbuf >=
+           (char *) req->rq_reqbuf + req->rq_reqbuf_len)
+               OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len);
+
+       req->rq_clrbuf = NULL;
+       req->rq_clrbuf_len = 0;
+
+release_reqbuf:
+       if (!req->rq_pool && req->rq_reqbuf) {
+               LASSERT(req->rq_reqbuf_len);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = NULL;
+               req->rq_reqbuf_len = 0;
+       }
+
+       EXIT;
+}
+
+static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize)
+{
+       bufsize = size_roundup_power2(bufsize);
+
+       OBD_ALLOC_LARGE(req->rq_repbuf, bufsize);
+       if (!req->rq_repbuf)
+               return -ENOMEM;
+
+       req->rq_repbuf_len = bufsize;
+       return 0;
+}
+
+static
+int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec,
+                         struct ptlrpc_request *req,
+                         int svc, int msgsize)
+{
+       int          txtsize;
+       __u32      buflens[4];
+       int          bufcnt = 2;
+       int          alloc_size;
+
+       /*
+        * on-wire data layout:
+        * - gss header
+        * - lustre message
+        * - bulk sec descriptor (optional)
+        * - signature (optional)
+        *   - svc == NULL: NULL
+        *   - svc == AUTH: signature of gss header
+        *   - svc == INTG: signature of all above
+        *
+        * if this is context negotiation, reserve fixed space
+        * at the last (signature) segment regardless of svc mode.
+        */
+
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       txtsize = buflens[0];
+
+       buflens[1] = msgsize;
+       if (svc == SPTLRPC_SVC_INTG)
+               txtsize += buflens[1];
+
+       if (req->rq_pack_bulk) {
+               buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                      &req->rq_flvr,
+                                                      1, req->rq_bulk_read);
+               if (svc == SPTLRPC_SVC_INTG)
+                       txtsize += buflens[bufcnt];
+               bufcnt++;
+       }
+
+       if (req->rq_ctx_init)
+               buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN;
+       else if (svc != SPTLRPC_SVC_NULL)
+               buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0);
+
+       alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+
+       /* add space for early reply */
+       alloc_size += gss_at_reply_off_integ;
+
+       return do_alloc_repbuf(req, alloc_size);
+}
+
+static
+int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec,
+                         struct ptlrpc_request *req,
+                         int msgsize)
+{
+       int          txtsize;
+       __u32      buflens[2];
+       int          bufcnt;
+       int          alloc_size;
+
+       /* inner buffers */
+       bufcnt = 1;
+       buflens[0] = msgsize;
+
+       if (req->rq_pack_bulk)
+               buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                        &req->rq_flvr,
+                                                        1, req->rq_bulk_read);
+       txtsize = lustre_msg_size_v2(bufcnt, buflens);
+       txtsize += GSS_MAX_CIPHER_BLOCK;
+
+       /* wrapper buffers */
+       bufcnt = 2;
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1);
+
+       alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+       /* add space for early reply */
+       alloc_size += gss_at_reply_off_priv;
+
+       return do_alloc_repbuf(req, alloc_size);
+}
+
+int gss_alloc_repbuf(struct ptlrpc_sec *sec,
+                    struct ptlrpc_request *req,
+                    int msgsize)
+{
+       int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+       ENTRY;
+
+       LASSERT(!req->rq_pack_bulk ||
+               (req->rq_bulk_read || req->rq_bulk_write));
+
+       switch (svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               return gss_alloc_repbuf_intg(sec, req, svc, msgsize);
+       case SPTLRPC_SVC_PRIV:
+               return gss_alloc_repbuf_priv(sec, req, msgsize);
+       default:
+               LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               return 0;
+       }
+}
+
+void gss_free_repbuf(struct ptlrpc_sec *sec,
+                    struct ptlrpc_request *req)
+{
+       OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+       req->rq_repbuf = NULL;
+       req->rq_repbuf_len = 0;
+       req->rq_repdata = NULL;
+       req->rq_repdata_len = 0;
+}
+
+static int get_enlarged_msgsize(struct lustre_msg *msg,
+                               int segment, int newsize)
+{
+       int save, newmsg_size;
+
+       LASSERT(newsize >= msg->lm_buflens[segment]);
+
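+       /* temporarily plug in the new segment length to compute the
+        * resulting overall message size, then restore the original */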
+       save = msg->lm_buflens[segment];
+       msg->lm_buflens[segment] = newsize;
+       newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+       msg->lm_buflens[segment] = save;
+
+       return newmsg_size;
+}
+
+static int get_enlarged_msgsize2(struct lustre_msg *msg,
+                                int segment1, int newsize1,
+                                int segment2, int newsize2)
+{
+       int save1, save2, newmsg_size;
+
+       LASSERT(newsize1 >= msg->lm_buflens[segment1]);
+       LASSERT(newsize2 >= msg->lm_buflens[segment2]);
+
+       save1 = msg->lm_buflens[segment1];
+       save2 = msg->lm_buflens[segment2];
+       msg->lm_buflens[segment1] = newsize1;
+       msg->lm_buflens[segment2] = newsize2;
+       newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+       msg->lm_buflens[segment1] = save1;
+       msg->lm_buflens[segment2] = save2;
+
+       return newmsg_size;
+}
+
+static
+int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec,
+                           struct ptlrpc_request *req,
+                           int svc,
+                           int segment, int newsize)
+{
+       struct lustre_msg      *newbuf;
+       int                  txtsize, sigsize = 0, i;
+       int                  newmsg_size, newbuf_size;
+
+       /*
+        * gss header is at seg 0;
+        * embedded msg is at seg 1;
+        * signature (if any) is at the last seg
+        */
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_reqbuf_len > req->rq_reqlen);
+       LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+       LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg);
+
+       /* 1. compute new embedded msg size */
+       newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize);
+       LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]);
+
+       /* 2. compute new wrapper msg size */
+       if (svc == SPTLRPC_SVC_NULL) {
+               /* no signature, get size directly */
+               newbuf_size = get_enlarged_msgsize(req->rq_reqbuf,
+                                                  1, newmsg_size);
+       } else {
+               txtsize = req->rq_reqbuf->lm_buflens[0];
+
+               if (svc == SPTLRPC_SVC_INTG) {
+                       for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++)
+                               txtsize += req->rq_reqbuf->lm_buflens[i];
+                       txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1];
+               }
+
+               sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0);
+               LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf));
+
+               newbuf_size = get_enlarged_msgsize2(
+                                       req->rq_reqbuf,
+                                       1, newmsg_size,
+                                       msg_last_segidx(req->rq_reqbuf),
+                                       sigsize);
+       }
+
+       /* request from pool should always have enough buffer */
+       LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+       if (req->rq_reqbuf_len < newbuf_size) {
+               newbuf_size = size_roundup_power2(newbuf_size);
+
+               OBD_ALLOC_LARGE(newbuf, newbuf_size);
+               if (newbuf == NULL)
+                       RETURN(-ENOMEM);
+
+               memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = newbuf;
+               req->rq_reqbuf_len = newbuf_size;
+               req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0);
+       }
+
+       /* do enlargement, from wrapper to embedded, from end to begin */
+       if (svc != SPTLRPC_SVC_NULL)
+               _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf,
+                                            msg_last_segidx(req->rq_reqbuf),
+                                            sigsize);
+
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size);
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+       req->rq_reqlen = newmsg_size;
+       RETURN(0);
+}
+
+static
+int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec,
+                           struct ptlrpc_request *req,
+                           int segment, int newsize)
+{
+       struct lustre_msg      *newclrbuf;
+       int                  newmsg_size, newclrbuf_size, newcipbuf_size;
+       __u32              buflens[3];
+
+       /*
+        * embedded msg is at seg 0 of clear buffer;
+        * cipher text is at seg 2 of cipher buffer;
+        */
+       LASSERT(req->rq_pool ||
+               (req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0));
+       LASSERT(req->rq_reqbuf == NULL ||
+               (req->rq_pool && req->rq_reqbuf->lm_bufcount == 3));
+       LASSERT(req->rq_clrbuf);
+       LASSERT(req->rq_clrbuf_len > req->rq_reqlen);
+       LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg);
+
+       /* compute new embedded msg size */
+       newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize);
+
+       /* compute new clear buffer size */
+       newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size);
+       newclrbuf_size += GSS_MAX_CIPHER_BLOCK;
+
+       /* compute new cipher buffer size */
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0);
+       buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1);
+       newcipbuf_size = lustre_msg_size_v2(3, buflens);
+
+       /* handle the case where both the clear buf and the cipher buf are
+        * packed into a single pre-allocated buffer. */
+       if (unlikely(req->rq_pool) &&
+           req->rq_clrbuf >= req->rq_reqbuf &&
+           (char *) req->rq_clrbuf <
+           (char *) req->rq_reqbuf + req->rq_reqbuf_len) {
+               /* best case: we still fit into the pre-allocated buffer. */
+               if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) {
+                       void *src, *dst;
+
+                       /* move the clear text backward, leaving room for
+                        * the enlarged cipher buffer in front. */
+                       src = req->rq_clrbuf;
+                       dst = (char *) req->rq_reqbuf + newcipbuf_size;
+
+                       memmove(dst, src, req->rq_clrbuf_len);
+
+                       req->rq_clrbuf = (struct lustre_msg *) dst;
+                       req->rq_clrbuf_len = newclrbuf_size;
+                       req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0);
+               } else {
+                       /* sadly we have to split out the clear buffer */
+                       LASSERT(req->rq_reqbuf_len >= newcipbuf_size);
+                       LASSERT(req->rq_clrbuf_len < newclrbuf_size);
+               }
+       }
+
+       if (req->rq_clrbuf_len < newclrbuf_size) {
+               newclrbuf_size = size_roundup_power2(newclrbuf_size);
+
+               OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size);
+               if (newclrbuf == NULL)
+                       RETURN(-ENOMEM);
+
+               memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len);
+
+               if (req->rq_reqbuf == NULL ||
+                   req->rq_clrbuf < req->rq_reqbuf ||
+                   (char *) req->rq_clrbuf >=
+                   (char *) req->rq_reqbuf + req->rq_reqbuf_len) {
+                       OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len);
+               }
+
+               req->rq_clrbuf = newclrbuf;
+               req->rq_clrbuf_len = newclrbuf_size;
+               req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0);
+       }
+
+       _sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size);
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+       req->rq_reqlen = newmsg_size;
+
+       RETURN(0);
+}
+
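+/*
+ * Enlarge a request buffer segment, dispatching to the integrity or
+ * privacy variant according to the service part of the rpc flavor.
+ */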
+int gss_enlarge_reqbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req,
+                      int segment, int newsize)
+{
+       int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+
+       LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini);
+
+       switch (svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               return gss_enlarge_reqbuf_intg(sec, req, svc, segment, newsize);
+       case SPTLRPC_SVC_PRIV:
+               return gss_enlarge_reqbuf_priv(sec, req, segment, newsize);
+       default:
+               LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               return 0;
+       }
+}
+
+int gss_sec_install_rctx(struct obd_import *imp,
+                        struct ptlrpc_sec *sec,
+                        struct ptlrpc_cli_ctx *ctx)
+{
+       struct gss_sec     *gsec;
+       struct gss_cli_ctx *gctx;
+       int              rc;
+
+       gsec = container_of(sec, struct gss_sec, gs_base);
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+       rc = gss_install_rvs_svc_ctx(imp, gsec, gctx);
+       return rc;
+}
+
+/********************************************
+ * server side API
+ ********************************************/
+
+static inline
+int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx)
+{
+       LASSERT(grctx);
+       return (grctx->src_init || grctx->src_init_continue ||
+               grctx->src_err_notify);
+}
+
+static
+void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx)
+{
+       if (grctx->src_ctx)
+               gss_svc_upcall_put_ctx(grctx->src_ctx);
+
+       sptlrpc_policy_put(grctx->src_base.sc_policy);
+       OBD_FREE_PTR(grctx);
+}
+
+static inline
+void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx)
+{
+       LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0);
+       atomic_inc(&grctx->src_base.sc_refcount);
+}
+
+static inline
+void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx)
+{
+       LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0);
+
+       if (atomic_dec_and_test(&grctx->src_base.sc_refcount))
+               gss_svc_reqctx_free(grctx);
+}
+
+static
+int gss_svc_sign(struct ptlrpc_request *req,
+                struct ptlrpc_reply_state *rs,
+                struct gss_svc_reqctx *grctx,
+                __u32 svc)
+{
+       __u32   flags = 0;
+       int     rc;
+       ENTRY;
+
+       LASSERT(rs->rs_msg == lustre_msg_buf(rs->rs_repbuf, 1, 0));
+
+       /* embedded lustre_msg might have been shrunk */
+       if (req->rq_replen != rs->rs_repbuf->lm_buflens[1])
+               lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1);
+
+       if (req->rq_pack_bulk)
+               flags |= LUSTRE_GSS_PACK_BULK;
+
+       rc = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx,
+                         LUSTRE_SP_ANY, flags, PTLRPC_GSS_PROC_DATA,
+                         grctx->src_wirectx.gw_seq, svc, NULL);
+       if (rc < 0)
+               RETURN(rc);
+
+       rs->rs_repdata_len = rc;
+
+       if (likely(req->rq_packed_final)) {
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+                       req->rq_reply_off = gss_at_reply_off_integ;
+               else
+                       req->rq_reply_off = 0;
+       } else {
+               if (svc == SPTLRPC_SVC_NULL)
+                       rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0,
+                                       lustre_msg_buf(rs->rs_repbuf, 1, 0),
+                                       lustre_msg_buflen(rs->rs_repbuf, 1));
+               req->rq_reply_off = 0;
+       }
+
+       RETURN(0);
+}
+
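+/*
+ * Pack a minimal error reply carrying a gss error header (PTLRPC_GSS_PROC_ERR
+ * with the given major/minor status), so the client can recover.
+ */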
+int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor)
+{
+       struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       struct ptlrpc_reply_state *rs;
+       struct gss_err_header     *ghdr;
+       int                     replen = sizeof(struct ptlrpc_body);
+       int                     rc;
+       ENTRY;
+
+       //if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_SVCGSS_ERR_NOTIFY, OBD_FAIL_ONCE))
+       //      RETURN(-EINVAL);
+
+       grctx->src_err_notify = 1;
+       grctx->src_reserve_len = 0;
+
+       rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
+       if (rc) {
+               CERROR("could not pack reply, err %d\n", rc);
+               RETURN(rc);
+       }
+
+       /* gss hdr */
+       rs = req->rq_reply_state;
+       LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*ghdr));
+       ghdr = lustre_msg_buf(rs->rs_repbuf, 0, 0);
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_flags = 0;
+       ghdr->gh_proc = PTLRPC_GSS_PROC_ERR;
+       ghdr->gh_major = major;
+       ghdr->gh_minor = minor;
+       ghdr->gh_handle.len = 0; /* fake context handle */
+
+       rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount,
+                                               rs->rs_repbuf->lm_buflens);
+
+       CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n",
+              major, minor, libcfs_nid2str(req->rq_peer.nid));
+       RETURN(0);
+}
+
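+/*
+ * Handle a context-init (or continue-init) request: extract the lustre
+ * service type, target uuid, reverse handle and gss token from the last
+ * segment and hand them to the svc upcall to establish the context.
+ */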
+static
+int gss_svc_handle_init(struct ptlrpc_request *req,
+                       struct gss_wire_ctx *gw)
+{
+       struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       struct lustre_msg        *reqbuf = req->rq_reqbuf;
+       struct obd_uuid    *uuid;
+       struct obd_device        *target;
+       rawobj_t                   uuid_obj, rvs_hdl, in_token;
+       __u32                 lustre_svc;
+       __u32                *secdata, seclen;
+       int                     swabbed, rc;
+       ENTRY;
+
+       CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc,
+              libcfs_nid2str(req->rq_peer.nid));
+
+       req->rq_ctx_init = 1;
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+               CERROR("unexpected bulk flag\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) {
+               CERROR("proc %u: invalid handle length %u\n",
+                      gw->gw_proc, gw->gw_handle.len);
+               RETURN(SECSVC_DROP);
+       }
+
+       if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4) {
+               CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount);
+               RETURN(SECSVC_DROP);
+       }
+
+       swabbed = ptlrpc_req_need_swab(req);
+
+       /* ctx initiate payload is in last segment */
+       secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0);
+       seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1];
+
+       if (seclen < 4 + 4) {
+               CERROR("sec size %d too small\n", seclen);
+               RETURN(SECSVC_DROP);
+       }
+
+       /* lustre svc type */
+       lustre_svc = le32_to_cpu(*secdata++);
+       seclen -= 4;
+
+       /* extract target uuid; note this code is somewhat fragile
+        * because it touches the internal structure of obd_uuid */
+       if (rawobj_extract(&uuid_obj, &secdata, &seclen)) {
+               CERROR("failed to extract target uuid\n");
+               RETURN(SECSVC_DROP);
+       }
+       uuid_obj.data[uuid_obj.len - 1] = '\0';
+
+       uuid = (struct obd_uuid *) uuid_obj.data;
+       target = class_uuid2obd(uuid);
+       if (!target || target->obd_stopping || !target->obd_set_up) {
+               CERROR("target '%s' is not available for context init (%s)\n",
+                      uuid->uuid, target == NULL ? "no target" :
+                      (target->obd_stopping ? "stopping" : "not set up"));
+               RETURN(SECSVC_DROP);
+       }
+
+       /* extract reverse handle */
+       if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) {
+               CERROR("failed to extract reverse handle\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       /* extract token */
+       if (rawobj_extract(&in_token, &secdata, &seclen)) {
+               CERROR("can't extract token\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc,
+                                       &rvs_hdl, &in_token);
+       if (rc != SECSVC_OK)
+               RETURN(rc);
+
+       if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss ||
+           grctx->src_ctx->gsc_usr_root)
+               CWARN("create svc ctx %p: user from %s authenticated as %s\n",
+                     grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid),
+                     grctx->src_ctx->gsc_usr_mds ? "mds" :
+                       (grctx->src_ctx->gsc_usr_oss ? "oss" : "root"));
+       else
+               CWARN("create svc ctx %p: accept user %u from %s\n",
+                     grctx->src_ctx, grctx->src_ctx->gsc_uid,
+                     libcfs_nid2str(req->rq_peer.nid));
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+               if (reqbuf->lm_bufcount < 4) {
+                       CERROR("missing user descriptor\n");
+                       RETURN(SECSVC_DROP);
+               }
+               if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) {
+                       CERROR("Mal-formed user descriptor\n");
+                       RETURN(SECSVC_DROP);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0);
+       }
+
+       req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0);
+       req->rq_reqlen = lustre_msg_buflen(reqbuf, 1);
+
+       RETURN(rc);
+}
+
+/*
+ * last segment must be the gss signature.
+ */
+static
+int gss_svc_verify_request(struct ptlrpc_request *req,
+                          struct gss_svc_reqctx *grctx,
+                          struct gss_wire_ctx *gw,
+                          __u32 *major)
+{
+       struct gss_svc_ctx *gctx = grctx->src_ctx;
+       struct lustre_msg  *msg = req->rq_reqbuf;
+       int              offset = 2;
+       int              swabbed;
+       ENTRY;
+
+       *major = GSS_S_COMPLETE;
+
+       if (msg->lm_bufcount < 2) {
+               CERROR("Too few segments (%u) in request\n", msg->lm_bufcount);
+               RETURN(-EINVAL);
+       }
+
+       if (gw->gw_svc == SPTLRPC_SVC_NULL)
+               goto verified;
+
+       if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) {
+               CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq);
+               *major = GSS_S_DUPLICATE_TOKEN;
+               RETURN(-EACCES);
+       }
+
+       *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc);
+       if (*major != GSS_S_COMPLETE) {
+               CERROR("failed to verify request: %x\n", *major);
+               RETURN(-EACCES);
+       }
+
+       if (gctx->gsc_reverse == 0 &&
+           gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
+               CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq);
+               *major = GSS_S_DUPLICATE_TOKEN;
+               RETURN(-EACCES);
+       }
+
+verified:
+       swabbed = ptlrpc_req_need_swab(req);
+
+       /* user descriptor */
+       if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+               if (msg->lm_bufcount < (offset + 1)) {
+                       CERROR("no user desc included\n");
+                       RETURN(-EINVAL);
+               }
+
+               if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) {
+                       CERROR("Mal-formed user descriptor\n");
+                       RETURN(-EINVAL);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(msg, offset, 0);
+               offset++;
+       }
+
+       /* check bulk_sec_desc data */
+       if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+               if (msg->lm_bufcount < (offset + 1)) {
+                       CERROR("missing bulk sec descriptor\n");
+                       RETURN(-EINVAL);
+               }
+
+               if (bulk_sec_desc_unpack(msg, offset, swabbed))
+                       RETURN(-EINVAL);
+
+               req->rq_pack_bulk = 1;
+               grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0);
+               grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset);
+       }
+
+       req->rq_reqmsg = lustre_msg_buf(msg, 1, 0);
+       req->rq_reqlen = msg->lm_buflens[1];
+       RETURN(0);
+}
+
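+/*
+ * Decrypt a privacy-service request in place, checking the sequence number
+ * both before and after unsealing, then unpack the optional user and bulk
+ * security descriptors from the recovered clear message.
+ */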
+static
+int gss_svc_unseal_request(struct ptlrpc_request *req,
+                          struct gss_svc_reqctx *grctx,
+                          struct gss_wire_ctx *gw,
+                          __u32 *major)
+{
+       struct gss_svc_ctx *gctx = grctx->src_ctx;
+       struct lustre_msg  *msg = req->rq_reqbuf;
+       int              swabbed, msglen, offset = 1;
+       ENTRY;
+
+       if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) {
+               CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq);
+               *major = GSS_S_DUPLICATE_TOKEN;
+               RETURN(-EACCES);
+       }
+
+       *major = gss_unseal_msg(gctx->gsc_mechctx, msg,
+                              &msglen, req->rq_reqdata_len);
+       if (*major != GSS_S_COMPLETE) {
+               CERROR("failed to unwrap request: %x\n", *major);
+               RETURN(-EACCES);
+       }
+
+       if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
+               CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq);
+               *major = GSS_S_DUPLICATE_TOKEN;
+               RETURN(-EACCES);
+       }
+
+       swabbed = __lustre_unpack_msg(msg, msglen);
+       if (swabbed < 0) {
+               CERROR("Failed to unpack after decryption\n");
+               RETURN(-EINVAL);
+       }
+       req->rq_reqdata_len = msglen;
+
+       if (msg->lm_bufcount < 1) {
+               CERROR("Invalid buffer: message is empty\n");
+               RETURN(-EINVAL);
+       }
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+               if (msg->lm_bufcount < offset + 1) {
+                       CERROR("no user descriptor included\n");
+                       RETURN(-EINVAL);
+               }
+
+               if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) {
+                       CERROR("Mal-formed user descriptor\n");
+                       RETURN(-EINVAL);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(msg, offset, 0);
+               offset++;
+       }
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+               if (msg->lm_bufcount < offset + 1) {
+                       CERROR("no bulk checksum included\n");
+                       RETURN(-EINVAL);
+               }
+
+               if (bulk_sec_desc_unpack(msg, offset, swabbed))
+                       RETURN(-EINVAL);
+
+               req->rq_pack_bulk = 1;
+               grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0);
+               grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset);
+       }
+
+       req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0);
+       req->rq_reqlen = req->rq_reqbuf->lm_buflens[0];
+       RETURN(0);
+}
+
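+/*
+ * Handle a regular data request: look up the server context referenced by
+ * the wire handle, then verify (null/auth/integrity) or unseal (privacy)
+ * the request.  On NO_CONTEXT/BAD_SIG an error notify is sent so the
+ * client can re-establish the context.
+ */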
+static
+int gss_svc_handle_data(struct ptlrpc_request *req,
+                       struct gss_wire_ctx *gw)
+{
+       struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       __u32             major = 0;
+       int                 rc = 0;
+       ENTRY;
+
+       grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw);
+       if (!grctx->src_ctx) {
+               major = GSS_S_NO_CONTEXT;
+               goto error;
+       }
+
+       switch (gw->gw_svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               rc = gss_svc_verify_request(req, grctx, gw, &major);
+               break;
+       case SPTLRPC_SVC_PRIV:
+               rc = gss_svc_unseal_request(req, grctx, gw, &major);
+               break;
+       default:
+               CERROR("unsupported gss service %d\n", gw->gw_svc);
+               rc = -EINVAL;
+       }
+
+       if (rc == 0)
+               RETURN(SECSVC_OK);
+
+       CERROR("svc %u failed: major 0x%08x: req xid "LPU64" ctx %p idx "
+              LPX64"(%u->%s)\n", gw->gw_svc, major, req->rq_xid,
+              grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle),
+              grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+error:
+       /* we only notify the client in case of NO_CONTEXT/BAD_SIG, which
+        * might happen after a server reboot, to allow recovery. */
+       if ((major == GSS_S_NO_CONTEXT || major == GSS_S_BAD_SIG) &&
+           gss_pack_err_notify(req, major, 0) == 0)
+               RETURN(SECSVC_COMPLETE);
+
+       RETURN(SECSVC_DROP);
+}
+
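+/*
+ * Handle a context-destroy request: verify it with the integrity service,
+ * destroy the server context, and send no reply.
+ */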
+static
+int gss_svc_handle_destroy(struct ptlrpc_request *req,
+                          struct gss_wire_ctx *gw)
+{
+       struct gss_svc_reqctx  *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       __u32              major;
+       ENTRY;
+
+       req->rq_ctx_fini = 1;
+       req->rq_no_reply = 1;
+
+       grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw);
+       if (!grctx->src_ctx) {
+               CDEBUG(D_SEC, "invalid gss context handle for destroy.\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       if (gw->gw_svc != SPTLRPC_SVC_INTG) {
+               CERROR("svc %u is not supported in destroy.\n", gw->gw_svc);
+               RETURN(SECSVC_DROP);
+       }
+
+       if (gss_svc_verify_request(req, grctx, gw, &major))
+               RETURN(SECSVC_DROP);
+
+       CWARN("destroy svc ctx %p idx "LPX64" (%u->%s)\n",
+             grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle),
+             grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+       gss_svc_upcall_destroy_ctx(grctx->src_ctx);
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+               if (req->rq_reqbuf->lm_bufcount < 4) {
+                       CERROR("missing user descriptor, ignore it\n");
+                       RETURN(SECSVC_OK);
+               }
+               if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2,
+                                            ptlrpc_req_need_swab(req))) {
+                       CERROR("Mal-formed user descriptor, ignore it\n");
+                       RETURN(SECSVC_OK);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0);
+       }
+
+       RETURN(SECSVC_OK);
+}
+
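+/*
+ * Entry point for incoming requests under the gss policy: decode and sanity
+ * check the gss wire header, allocate the per-request context and dispatch
+ * on the gss proc (init/data/destroy).
+ */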
+int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req)
+{
+       struct gss_header      *ghdr;
+       struct gss_svc_reqctx  *grctx;
+       struct gss_wire_ctx    *gw;
+       int                  swabbed, rc;
+       ENTRY;
+
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_svc_ctx == NULL);
+
+       if (req->rq_reqbuf->lm_bufcount < 2) {
+               CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount);
+               RETURN(SECSVC_DROP);
+       }
+
+       swabbed = ptlrpc_req_need_swab(req);
+
+       ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed);
+       if (ghdr == NULL) {
+               CERROR("can't decode gss header\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       /* sanity checks */
+       if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+               CERROR("gss version %u, expect %u\n", ghdr->gh_version,
+                      PTLRPC_GSS_VERSION);
+               RETURN(SECSVC_DROP);
+       }
+
+       req->rq_sp_from = ghdr->gh_sp;
+
+       /* alloc grctx data */
+       OBD_ALLOC_PTR(grctx);
+       if (!grctx)
+               RETURN(SECSVC_DROP);
+
+       grctx->src_base.sc_policy = sptlrpc_policy_get(policy);
+       atomic_set(&grctx->src_base.sc_refcount, 1);
+       req->rq_svc_ctx = &grctx->src_base;
+       gw = &grctx->src_wirectx;
+
+       /* save wire context */
+       gw->gw_flags = ghdr->gh_flags;
+       gw->gw_proc = ghdr->gh_proc;
+       gw->gw_seq = ghdr->gh_seq;
+       gw->gw_svc = ghdr->gh_svc;
+       rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle);
+
+       /* keep the original wire header, which is subject to checksum
+        * verification */
+       if (swabbed)
+               gss_header_swabber(ghdr);
+
+       switch (ghdr->gh_proc) {
+       case PTLRPC_GSS_PROC_INIT:
+       case PTLRPC_GSS_PROC_CONTINUE_INIT:
+               rc = gss_svc_handle_init(req, gw);
+               break;
+       case PTLRPC_GSS_PROC_DATA:
+               rc = gss_svc_handle_data(req, gw);
+               break;
+       case PTLRPC_GSS_PROC_DESTROY:
+               rc = gss_svc_handle_destroy(req, gw);
+               break;
+       default:
+               CERROR("unknown proc %u\n", gw->gw_proc);
+               rc = SECSVC_DROP;
+               break;
+       }
+
+       switch (rc) {
+       case SECSVC_OK:
+               LASSERT(grctx->src_ctx);
+
+               req->rq_auth_gss = 1;
+               req->rq_auth_remote = grctx->src_ctx->gsc_remote;
+               req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds;
+               req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss;
+               req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root;
+               req->rq_auth_uid = grctx->src_ctx->gsc_uid;
+               req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid;
+               break;
+       case SECSVC_COMPLETE:
+               break;
+       case SECSVC_DROP:
+               gss_svc_reqctx_free(grctx);
+               req->rq_svc_ctx = NULL;
+               break;
+       }
+
+       RETURN(rc);
+}
+
+void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct gss_svc_reqctx  *grctx;
+       ENTRY;
+
+       if (svc_ctx == NULL) {
+               EXIT;
+               return;
+       }
+
+       grctx = gss_svc_ctx2reqctx(svc_ctx);
+
+       CWARN("gss svc invalidate ctx %p(%u)\n",
+             grctx->src_ctx, grctx->src_ctx->gsc_uid);
+       gss_svc_upcall_destroy_ctx(grctx->src_ctx);
+
+       EXIT;
+}
+
+static inline
+int gss_svc_payload(struct gss_svc_reqctx *grctx, int early,
+                   int msgsize, int privacy)
+{
+       /* an early reply should be treated normally, but since it shares the
+        * same ctx with the original request, ignore the special ctx's
+        * special flags in that case */
+       if (early == 0 && gss_svc_reqctx_is_special(grctx))
+               return grctx->src_reserve_len;
+
+       return gss_mech_payload(NULL, msgsize, privacy);
+}
+
+static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx,
+                               struct sptlrpc_flavor *flvr,
+                               int read)
+{
+       int     payload = sizeof(struct ptlrpc_bulk_sec_desc);
+
+       if (read) {
+               switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+               case SPTLRPC_BULK_SVC_NULL:
+                       break;
+               case SPTLRPC_BULK_SVC_INTG:
+                       payload += gss_mech_payload(NULL, 0, 0);
+                       break;
+               case SPTLRPC_BULK_SVC_PRIV:
+                       payload += gss_mech_payload(NULL, 0, 1);
+                       break;
+               case SPTLRPC_BULK_SVC_AUTH:
+               default:
+                       LBUG();
+               }
+       }
+
+       return payload;
+}
+
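+/*
+ * Allocate (or reuse a pre-allocated) reply state sized for the negotiated
+ * flavor: for privacy the inner clear buffers are sized first and wrapped
+ * by a gss header plus cipher payload, otherwise the reply carries header,
+ * message, optional bulk descriptor and signature segments.
+ */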
+int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+       struct gss_svc_reqctx       *grctx;
+       struct ptlrpc_reply_state   *rs;
+       int                       early, privacy, svc, bsd_off = 0;
+       __u32                   ibuflens[2], buflens[4];
+       int                       ibufcnt = 0, bufcnt;
+       int                       txtsize, wmsg_size, rs_size;
+       ENTRY;
+
+       LASSERT(msglen % 8 == 0);
+
+       if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) {
+               CERROR("client requested bulk sec on non-bulk rpc\n");
+               RETURN(-EPROTO);
+       }
+
+       svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+       early = (req->rq_packed_final == 0);
+
+       grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       if (!early && gss_svc_reqctx_is_special(grctx))
+               privacy = 0;
+       else
+               privacy = (svc == SPTLRPC_SVC_PRIV);
+
+       if (privacy) {
+               /* inner clear buffers */
+               ibufcnt = 1;
+               ibuflens[0] = msglen;
+
+               if (req->rq_pack_bulk) {
+                       LASSERT(grctx->src_reqbsd);
+
+                       bsd_off = ibufcnt;
+                       ibuflens[ibufcnt++] = gss_svc_bulk_payload(
+                                                       grctx->src_ctx,
+                                                       &req->rq_flvr,
+                                                       req->rq_bulk_read);
+               }
+
+               txtsize = lustre_msg_size_v2(ibufcnt, ibuflens);
+               txtsize += GSS_MAX_CIPHER_BLOCK;
+
+               /* wrapper buffer */
+               bufcnt = 2;
+               buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+               buflens[1] = gss_svc_payload(grctx, early, txtsize, 1);
+       } else {
+               bufcnt = 2;
+               buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+               buflens[1] = msglen;
+
+               txtsize = buflens[0];
+               if (svc == SPTLRPC_SVC_INTG)
+                       txtsize += buflens[1];
+
+               if (req->rq_pack_bulk) {
+                       LASSERT(grctx->src_reqbsd);
+
+                       bsd_off = bufcnt;
+                       buflens[bufcnt] = gss_svc_bulk_payload(
+                                                       grctx->src_ctx,
+                                                       &req->rq_flvr,
+                                                       req->rq_bulk_read);
+                       if (svc == SPTLRPC_SVC_INTG)
+                               txtsize += buflens[bufcnt];
+                       bufcnt++;
+               }
+
+               if ((!early && gss_svc_reqctx_is_special(grctx)) ||
+                   svc != SPTLRPC_SVC_NULL)
+                       buflens[bufcnt++] = gss_svc_payload(grctx, early,
+                                                           txtsize, 0);
+       }
+
+       wmsg_size = lustre_msg_size_v2(bufcnt, buflens);
+
+       rs_size = sizeof(*rs) + wmsg_size;
+       rs = req->rq_reply_state;
+
+       if (rs) {
+               /* pre-allocated */
+               LASSERT(rs->rs_size >= rs_size);
+       } else {
+               OBD_ALLOC_LARGE(rs, rs_size);
+               if (rs == NULL)
+                       RETURN(-ENOMEM);
+
+               rs->rs_size = rs_size;
+       }
+
+       rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+       rs->rs_repbuf_len = wmsg_size;
+
+       /* initialize the buffer */
+       if (privacy) {
+               lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL);
+               rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen);
+       } else {
+               lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL);
+               rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+               rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0);
+       }
+
+       if (bsd_off) {
+               grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0);
+               grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf,
+                                                          bsd_off);
+       }
+
+       gss_svc_reqctx_addref(grctx);
+       rs->rs_svc_ctx = req->rq_svc_ctx;
+
+       LASSERT(rs->rs_msg);
+       req->rq_reply_state = rs;
+       RETURN(0);
+}
+
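+/*
+ * Encrypt the reply: build a gss header at the tail of the reply buffer,
+ * wrap the clear reply message into a token, then rebuild rs_repbuf as
+ * a two-segment message holding the header and the token.
+ */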
+static int gss_svc_seal(struct ptlrpc_request *req,
+                       struct ptlrpc_reply_state *rs,
+                       struct gss_svc_reqctx *grctx)
+{
+       struct gss_svc_ctx      *gctx = grctx->src_ctx;
+       rawobj_t                 hdrobj, msgobj, token;
+       struct gss_header       *ghdr;
+       __u8                *token_buf;
+       int                   token_buflen;
+       __u32               buflens[2], major;
+       int                   msglen, rc;
+       ENTRY;
+
+       /* get clear data length. note the embedded lustre_msg might
+        * have been shrunk */
+       if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0))
+               msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1);
+       else
+               msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount,
+                                           rs->rs_repbuf->lm_buflens);
+
+       /* temporarily use tail of buffer to hold gss header data */
+       LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len);
+       ghdr = (struct gss_header *) ((char *) rs->rs_repbuf +
+                               rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE);
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_sp = LUSTRE_SP_ANY;
+       ghdr->gh_flags = 0;
+       ghdr->gh_proc = PTLRPC_GSS_PROC_DATA;
+       ghdr->gh_seq = grctx->src_wirectx.gw_seq;
+       ghdr->gh_svc = SPTLRPC_SVC_PRIV;
+       ghdr->gh_handle.len = 0;
+       if (req->rq_pack_bulk)
+               ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK;
+
+       /* allocate temporary cipher buffer */
+       token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1);
+       OBD_ALLOC_LARGE(token_buf, token_buflen);
+       if (token_buf == NULL)
+               RETURN(-ENOMEM);
+
+       hdrobj.len = PTLRPC_GSS_HEADER_SIZE;
+       hdrobj.data = (__u8 *) ghdr;
+       msgobj.len = msglen;
+       msgobj.data = (__u8 *) rs->rs_repbuf;
+       token.len = token_buflen;
+       token.data = token_buf;
+
+       major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj,
+                         rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("wrap message error: %08x\n", major);
+               GOTO(out_free, rc = -EPERM);
+       }
+       LASSERT(token.len <= token_buflen);
+
+       /* we are about to overwrite data at rs->rs_repbuf; nullify pointers
+        * to it to catch further illegal usage. */
+       if (req->rq_pack_bulk) {
+               grctx->src_repbsd = NULL;
+               grctx->src_repbsd_size = 0;
+       }
+
+       /* now fill the actual wire data
+        * - gss header
+        * - gss token
+        */
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = token.len;
+
+       rs->rs_repdata_len = lustre_msg_size_v2(2, buflens);
+       LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len);
+
+       lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL);
+       rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr,
+              PTLRPC_GSS_HEADER_SIZE);
+       memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len);
+
+       /* reply offset */
+       if (req->rq_packed_final &&
+           (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))
+               req->rq_reply_off = gss_at_reply_off_priv;
+       else
+               req->rq_reply_off = 0;
+
+       /* to catch upper layer's further access */
+       rs->rs_msg = NULL;
+       req->rq_repmsg = NULL;
+       req->rq_replen = 0;
+
+       rc = 0;
+out_free:
+       OBD_FREE_LARGE(token_buf, token_buflen);
+       RETURN(rc);
+}
+
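+/*
+ * Finalize the reply according to the service type: sign it for the
+ * null/auth/integrity services, or seal it for the privacy service.
+ */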
+int gss_svc_authorize(struct ptlrpc_request *req)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       struct gss_wire_ctx       *gw = &grctx->src_wirectx;
+       int                     early, rc;
+       ENTRY;
+
+       early = (req->rq_packed_final == 0);
+
+       if (!early && gss_svc_reqctx_is_special(grctx)) {
+               LASSERT(rs->rs_repdata_len != 0);
+
+               req->rq_reply_off = gss_at_reply_off_integ;
+               RETURN(0);
+       }
+
+       /* early reply could happen in many cases */
+       if (!early &&
+           gw->gw_proc != PTLRPC_GSS_PROC_DATA &&
+           gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) {
+               CERROR("proc %d not supported\n", gw->gw_proc);
+               RETURN(-EINVAL);
+       }
+
+       LASSERT(grctx->src_ctx);
+
+       switch (gw->gw_svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               rc = gss_svc_sign(req, rs, grctx, gw->gw_svc);
+               break;
+       case SPTLRPC_SVC_PRIV:
+               rc = gss_svc_seal(req, rs, grctx);
+               break;
+       default:
+               CERROR("Unknown service %d\n", gw->gw_svc);
+               GOTO(out, rc = -EINVAL);
+       }
+       rc = 0;
+
+out:
+       RETURN(rc);
+}
+
+void gss_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+       struct gss_svc_reqctx *grctx;
+
+       LASSERT(rs->rs_svc_ctx);
+       grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base);
+
+       gss_svc_reqctx_decref(grctx);
+       rs->rs_svc_ctx = NULL;
+
+       if (!rs->rs_prealloc)
+               OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx)
+{
+       LASSERT(atomic_read(&ctx->sc_refcount) == 0);
+       gss_svc_reqctx_free(gss_svc_ctx2reqctx(ctx));
+}
+
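+/*
+ * Set up a reverse client context from an established server context:
+ * duplicate the svc handle, copy the mech context and resume the sequence
+ * number recorded on the buddy svc ctx.
+ */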
+int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx,
+                        struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct gss_cli_ctx     *cli_gctx = ctx2gctx(cli_ctx);
+       struct gss_svc_ctx     *svc_gctx = gss_svc_ctx2gssctx(svc_ctx);
+       struct gss_ctx   *mechctx = NULL;
+
+       LASSERT(cli_gctx);
+       LASSERT(svc_gctx && svc_gctx->gsc_mechctx);
+
+       cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA;
+       cli_gctx->gc_win = GSS_SEQ_WIN;
+
+       /* The problem is that the reverse ctx might get lost in some recovery
+        * situations, and the same svc_ctx will be used to re-create it.
+        * If a callback was sent out before that, a new reverse ctx starting
+        * with sequence 0 will cause future callback rpcs to be treated as
+        * replays.
+        *
+        * Each reverse root ctx records its latest sequence number on its
+        * buddy svcctx before being destroyed, so we continue to use it here.
+        */
+       atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq);
+
+       if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) {
+               CERROR("failed to dup svc handle\n");
+               goto err_out;
+       }
+
+       if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &mechctx) !=
+           GSS_S_COMPLETE) {
+               CERROR("failed to copy mech context\n");
+               goto err_svc_handle;
+       }
+
+       if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) {
+               CERROR("failed to dup reverse handle\n");
+               goto err_ctx;
+       }
+
+       cli_gctx->gc_mechctx = mechctx;
+       gss_cli_ctx_uptodate(cli_gctx);
+
+       return 0;
+
+err_ctx:
+       lgss_delete_sec_context(&mechctx);
+err_svc_handle:
+       rawobj_free(&cli_gctx->gc_svc_handle);
+err_out:
+       return -ENOMEM;
+}
+
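+/*
+ * Precompute the early-reply offsets (adaptive timeout support) for the
+ * integrity and privacy flavors.
+ */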
+static void gss_init_at_reply_offset(void)
+{
+       __u32 buflens[3];
+       int clearsize;
+
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = lustre_msg_early_size();
+       buflens[2] = gss_cli_payload(NULL, buflens[1], 0);
+       gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens);
+
+       buflens[0] = lustre_msg_early_size();
+       clearsize = lustre_msg_size_v2(1, buflens);
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = gss_cli_payload(NULL, clearsize, 0);
+       buflens[2] = gss_cli_payload(NULL, clearsize, 1);
+       gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens);
+}
+
+int __init sptlrpc_gss_init(void)
+{
+       int rc;
+
+       rc = gss_init_lproc();
+       if (rc)
+               return rc;
+
+       rc = gss_init_cli_upcall();
+       if (rc)
+               goto out_lproc;
+
+       rc = gss_init_svc_upcall();
+       if (rc)
+               goto out_cli_upcall;
+
+       rc = init_kerberos_module();
+       if (rc)
+               goto out_svc_upcall;
+
+       /* register the policy after everything else is initialized, because
+        * it might be in use immediately after registration. */
+
+       rc = gss_init_keyring();
+       if (rc)
+               goto out_kerberos;
+
+#ifdef HAVE_GSS_PIPEFS
+       rc = gss_init_pipefs();
+       if (rc)
+               goto out_keyring;
+#endif
+
+       gss_init_at_reply_offset();
+
+       return 0;
+
+#ifdef HAVE_GSS_PIPEFS
+out_keyring:
+       gss_exit_keyring();
+#endif
+
+out_kerberos:
+       cleanup_kerberos_module();
+out_svc_upcall:
+       gss_exit_svc_upcall();
+out_cli_upcall:
+       gss_exit_cli_upcall();
+out_lproc:
+       gss_exit_lproc();
+       return rc;
+}
+
+static void __exit sptlrpc_gss_exit(void)
+{
+       gss_exit_keyring();
+#ifdef HAVE_GSS_PIPEFS
+       gss_exit_pipefs();
+#endif
+       cleanup_kerberos_module();
+       gss_exit_svc_upcall();
+       gss_exit_cli_upcall();
+       gss_exit_lproc();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("GSS security policy for Lustre");
+MODULE_LICENSE("GPL");
+
+module_init(sptlrpc_gss_init);
+module_exit(sptlrpc_gss_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c
new file mode 100644 (file)
index 0000000..47a3c05
--- /dev/null
@@ -0,0 +1,1613 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/import.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpc_connect_async_args {
+       __u64 pcaa_peer_committed;
+       int pcaa_initial_connect;
+};
+
+/**
+ * Update the current state of import \a imp to the provided \a state value.
+ * Helper function; must be called under imp_lock.
+ */
+static void __import_set_state(struct obd_import *imp,
+                              enum lustre_imp_state state)
+{
+       imp->imp_state = state;
+       imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state;
+       imp->imp_state_hist[imp->imp_state_hist_idx].ish_time =
+               cfs_time_current_sec();
+       imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) %
+               IMP_STATE_HIST_LEN;
+}
+
+/* A CLOSED import should remain so. */
+#define IMPORT_SET_STATE_NOLOCK(imp, state)                                \
+do {                                                                      \
+       if (imp->imp_state != LUSTRE_IMP_CLOSED) {                           \
+              CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n",    \
+                     imp, obd2cli_tgt(imp->imp_obd),                     \
+                     ptlrpc_import_state_name(imp->imp_state),         \
+                     ptlrpc_import_state_name(state));                 \
+              __import_set_state(imp, state);                           \
+       }                                                                     \
+} while(0)
+
+#define IMPORT_SET_STATE(imp, state)                                   \
+do {                                                                   \
+       spin_lock(&imp->imp_lock);                                      \
+       IMPORT_SET_STATE_NOLOCK(imp, state);                            \
+       spin_unlock(&imp->imp_lock);                                    \
+} while(0)
+
+
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+                                   struct ptlrpc_request *request,
+                                   void * data, int rc);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+
+/* Only this function is allowed to change the import state when it is
+ * CLOSED. I would rather refcount the import and free it after
+ * disconnection like we do with exports. To do that, the client_obd
+ * will need to save the peer info somewhere other than in the import,
+ * though. */
+int ptlrpc_init_import(struct obd_import *imp)
+{
+       spin_lock(&imp->imp_lock);
+
+       imp->imp_generation++;
+       imp->imp_state = LUSTRE_IMP_NEW;
+
+       spin_unlock(&imp->imp_lock);
+
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_init_import);
+
+#define UUID_STR "_UUID"
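+/* Strip an optional prefix and a trailing "_UUID" suffix from a uuid
+ * string, returning the start and length of the bare target name. */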
+void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len)
+{
+       *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix))
+               ? uuid : uuid + strlen(prefix);
+
+       *uuid_len = strlen(*uuid_start);
+
+       if (*uuid_len < strlen(UUID_STR))
+               return;
+
+       if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
+                   UUID_STR, strlen(UUID_STR)))
+               *uuid_len -= strlen(UUID_STR);
+}
+EXPORT_SYMBOL(deuuidify);
+
+/**
+ * Returns true if import was FULL, false if import was already not
+ * connected.
+ * @imp - import to be disconnected
+ * @conn_cnt - connection count (epoch) of the request that timed out
+ *          and caused the disconnection.  In some cases, multiple
+ *          inflight requests can fail to a single target (e.g. OST
+ *          bulk requests) and if one has already caused a reconnection
+ *          (increasing the import->conn_cnt) the older failure should
+ *          not also cause a reconnection.  If zero it forces a reconnect.
+ */
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
+{
+       int rc = 0;
+
+       spin_lock(&imp->imp_lock);
+
+       if (imp->imp_state == LUSTRE_IMP_FULL &&
+           (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
+               char *target_start;
+               int   target_len;
+
+               deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+                         &target_start, &target_len);
+
+               if (imp->imp_replayable) {
+                       LCONSOLE_WARN("%s: Connection to %.*s (at %s) was "
+                              "lost; in progress operations using this "
+                              "service will wait for recovery to complete\n",
+                              imp->imp_obd->obd_name, target_len, target_start,
+                              libcfs_nid2str(imp->imp_connection->c_peer.nid));
+               } else {
+                       LCONSOLE_ERROR_MSG(0x166, "%s: Connection to "
+                              "%.*s (at %s) was lost; in progress "
+                              "operations using this service will fail\n",
+                              imp->imp_obd->obd_name,
+                              target_len, target_start,
+                              libcfs_nid2str(imp->imp_connection->c_peer.nid));
+               }
+               ptlrpc_deactivate_timeouts(imp);
+               IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+               spin_unlock(&imp->imp_lock);
+
+               if (obd_dump_on_timeout)
+                       libcfs_debug_dumplog();
+
+               obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
+               rc = 1;
+       } else {
+               spin_unlock(&imp->imp_lock);
+               CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
+                      imp->imp_client->cli_name, imp,
+                      (imp->imp_state == LUSTRE_IMP_FULL &&
+                       imp->imp_conn_cnt > conn_cnt) ?
+                      "reconnected" : "not connected", imp->imp_conn_cnt,
+                      conn_cnt, ptlrpc_import_state_name(imp->imp_state));
+       }
+
+       return rc;
+}
+
+/* Must be called with imp_lock held! */
+static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp)
+{
+       ENTRY;
+       LASSERT(spin_is_locked(&imp->imp_lock));
+
+       CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
+       imp->imp_invalid = 1;
+       imp->imp_generation++;
+       spin_unlock(&imp->imp_lock);
+
+       ptlrpc_abort_inflight(imp);
+       obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
+
+       EXIT;
+}
+
+/*
+ * This acts as a barrier; all existing requests are rejected, and
+ * no new requests will be accepted until the import is valid again.
+ */
+void ptlrpc_deactivate_import(struct obd_import *imp)
+{
+       spin_lock(&imp->imp_lock);
+       ptlrpc_deactivate_and_unlock_import(imp);
+}
+EXPORT_SYMBOL(ptlrpc_deactivate_import);
+
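+/* Return the time remaining before \a req reaches its deadline, or 0 if it
+ * has already timed out or is not in an active phase. */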
+static unsigned int
+ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now)
+{
+       long dl;
+
+       if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+             (req->rq_phase == RQ_PHASE_BULK) ||
+             (req->rq_phase == RQ_PHASE_NEW)))
+               return 0;
+
+       if (req->rq_timedout)
+               return 0;
+
+       if (req->rq_phase == RQ_PHASE_NEW)
+               dl = req->rq_sent;
+       else
+               dl = req->rq_deadline;
+
+       if (dl <= now)
+               return 0;
+
+       return dl - now;
+}
+
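+/* Return the longest remaining deadline among requests on the sending
+ * list of \a imp. */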
+static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
+{
+       time_t now = cfs_time_current_sec();
+       struct list_head *tmp, *n;
+       struct ptlrpc_request *req;
+       unsigned int timeout = 0;
+
+       spin_lock(&imp->imp_lock);
+       list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+               req = list_entry(tmp, struct ptlrpc_request, rq_list);
+               timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
+       }
+       spin_unlock(&imp->imp_lock);
+       return timeout;
+}
+
+/**
+ * This function will invalidate the import, if necessary, then block
+ * for all the RPC completions, and finally notify the obd to
+ * invalidate its state (ie cancel locks, clear pending requests,
+ * etc).
+ */
+void ptlrpc_invalidate_import(struct obd_import *imp)
+{
+       struct list_head *tmp, *n;
+       struct ptlrpc_request *req;
+       struct l_wait_info lwi;
+       unsigned int timeout;
+       int rc;
+
+       atomic_inc(&imp->imp_inval_count);
+
+       if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
+               ptlrpc_deactivate_import(imp);
+
+       LASSERT(imp->imp_invalid);
+
+       /* Wait forever until inflight == 0. We really can't do it another
+        * way because in some cases we need to wait for a very long reply
+        * unlink. We can't do anything before that because there is really
+        * no guarantee that an rdma transfer is not in progress right now. */
+       do {
+               /* Calculate the max timeout for waiting on rpcs to error
+                * out; fall back to obd_timeout if no rpc deadline was
+                * found. */
+               if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+                       timeout = ptlrpc_inflight_timeout(imp);
+                       timeout += timeout / 3;
+
+                       if (timeout == 0)
+                               timeout = obd_timeout;
+               } else {
+                       /* decrease the interval to make the race more likely */
+                       timeout = 1;
+               }
+
+               CDEBUG(D_RPCTRACE, "Sleeping %d sec for inflight to error out\n",
+                      timeout);
+
+               /* Wait for all requests to error out and call completion
+                * callbacks. Cap it at obd_timeout -- these should all
+                * have been locally cancelled by ptlrpc_abort_inflight. */
+               lwi = LWI_TIMEOUT_INTERVAL(
+                       cfs_timeout_cap(cfs_time_seconds(timeout)),
+                       (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2,
+                       NULL, NULL);
+               rc = l_wait_event(imp->imp_recovery_waitq,
+                                 (atomic_read(&imp->imp_inflight) == 0),
+                                 &lwi);
+               if (rc) {
+                       const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
+
+                       CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
+                              cli_tgt, rc,
+                              atomic_read(&imp->imp_inflight));
+
+                       spin_lock(&imp->imp_lock);
+                       if (atomic_read(&imp->imp_inflight) == 0) {
+                               int count = atomic_read(&imp->imp_unregistering);
+
+                               /* We know that "unregistering" rpcs can only
+                                * survive on the sending or delayed lists
+                                * (they may be waiting for a long reply
+                                * unlink on sluggish nets). Let's check this.
+                                * If there are no inflight rpcs and
+                                * unregistering != 0, this is a bug. */
+                               LASSERTF(count == 0, "Some RPCs are still "
+                                        "unregistering: %d\n", count);
+
+                               /* Save one loop as soon as inflight has
+                                * dropped to zero. No new inflight rpcs are
+                                * possible at this point. */
+                               rc = 0;
+                       } else {
+                               list_for_each_safe(tmp, n,
+                                                      &imp->imp_sending_list) {
+                                       req = list_entry(tmp,
+                                                            struct ptlrpc_request,
+                                                            rq_list);
+                                       DEBUG_REQ(D_ERROR, req,
+                                                 "still on sending list");
+                               }
+                               list_for_each_safe(tmp, n,
+                                                      &imp->imp_delayed_list) {
+                                       req = list_entry(tmp,
+                                                            struct ptlrpc_request,
+                                                            rq_list);
+                                       DEBUG_REQ(D_ERROR, req,
+                                                 "still on delayed list");
+                               }
+
+                               CERROR("%s: RPCs in \"%s\" phase found (%d). "
+                                      "Network is sluggish? Waiting for them "
+                                      "to error out.\n", cli_tgt,
+                                      ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
+                                      atomic_read(&imp->
+                                                      imp_unregistering));
+                       }
+                       spin_unlock(&imp->imp_lock);
+                 }
+       } while (rc != 0);
+
+       /*
+        * Additionally check that no new rpcs were added to the import while
+        * it was in the "invalidate" state.
+        */
+       LASSERT(atomic_read(&imp->imp_inflight) == 0);
+       obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
+       sptlrpc_import_flush_all_ctx(imp);
+
+       atomic_dec(&imp->imp_inval_count);
+       wake_up_all(&imp->imp_recovery_waitq);
+}
+EXPORT_SYMBOL(ptlrpc_invalidate_import);
+
+/* unset imp_invalid */
+void ptlrpc_activate_import(struct obd_import *imp)
+{
+       struct obd_device *obd = imp->imp_obd;
+
+       spin_lock(&imp->imp_lock);
+       imp->imp_invalid = 0;
+       ptlrpc_activate_timeouts(imp);
+       spin_unlock(&imp->imp_lock);
+       obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
+}
+EXPORT_SYMBOL(ptlrpc_activate_import);
+
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
+{
+       ENTRY;
+
+       LASSERT(!imp->imp_dlm_fake);
+
+       if (ptlrpc_set_import_discon(imp, conn_cnt)) {
+               if (!imp->imp_replayable) {
+                       CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+                              "auto-deactivating\n",
+                              obd2cli_tgt(imp->imp_obd),
+                              imp->imp_connection->c_remote_uuid.uuid,
+                              imp->imp_obd->obd_name);
+                       ptlrpc_deactivate_import(imp);
+               }
+
+               CDEBUG(D_HA, "%s: waking up pinger\n",
+                      obd2cli_tgt(imp->imp_obd));
+
+               spin_lock(&imp->imp_lock);
+               imp->imp_force_verify = 1;
+               spin_unlock(&imp->imp_lock);
+
+               ptlrpc_pinger_wake_up();
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_fail_import);
+
+int ptlrpc_reconnect_import(struct obd_import *imp)
+{
+       ptlrpc_set_import_discon(imp, 0);
+       /* Force a new connect attempt */
+       ptlrpc_invalidate_import(imp);
+       /* Do a fresh connect next time by zeroing the handle */
+       ptlrpc_disconnect_import(imp, 1);
+       /* Wait for all invalidate calls to finish */
+       if (atomic_read(&imp->imp_inval_count) > 0) {
+               int rc;
+               struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+               rc = l_wait_event(imp->imp_recovery_waitq,
+                                 (atomic_read(&imp->imp_inval_count) == 0),
+                                 &lwi);
+               if (rc)
+                       CERROR("Interrupted, inval=%d\n",
+                              atomic_read(&imp->imp_inval_count));
+       }
+
+       /* Allow reconnect attempts */
+       imp->imp_obd->obd_no_recov = 0;
+       /* Remove 'invalid' flag */
+       ptlrpc_activate_import(imp);
+       /* Attempt a new connect */
+       ptlrpc_recover_import(imp, NULL, 0);
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_reconnect_import);
+
+/**
+ * Switch the connection of import \a imp to another one (if more than one is
+ * present). We typically choose the connection that we have not tried to
+ * connect to for the longest time.
+ */
+static int import_select_connection(struct obd_import *imp)
+{
+       struct obd_import_conn *imp_conn = NULL, *conn;
+       struct obd_export *dlmexp;
+       char *target_start;
+       int target_len, tried_all = 1;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+
+       if (list_empty(&imp->imp_conn_list)) {
+               CERROR("%s: no connections available\n",
+                      imp->imp_obd->obd_name);
+               spin_unlock(&imp->imp_lock);
+               RETURN(-EINVAL);
+       }
+
+       list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+               CDEBUG(D_HA, "%s: connect to NID %s last attempt "LPU64"\n",
+                      imp->imp_obd->obd_name,
+                      libcfs_nid2str(conn->oic_conn->c_peer.nid),
+                      conn->oic_last_attempt);
+
+               /* If we have not tried this connection since
+                  the last successful attempt, go with this one */
+               if ((conn->oic_last_attempt == 0) ||
+                   cfs_time_beforeq_64(conn->oic_last_attempt,
+                                      imp->imp_last_success_conn)) {
+                       imp_conn = conn;
+                       tried_all = 0;
+                       break;
+               }
+
+               /* If all of the connections have already been tried
+                  since the last successful connection, just choose the
+                  least recently used */
+               if (!imp_conn)
+                       imp_conn = conn;
+               else if (cfs_time_before_64(conn->oic_last_attempt,
+                                           imp_conn->oic_last_attempt))
+                       imp_conn = conn;
+       }
+
+       /* if not found, simply choose the current one */
+       if (!imp_conn || imp->imp_force_reconnect) {
+               LASSERT(imp->imp_conn_current);
+               imp_conn = imp->imp_conn_current;
+               tried_all = 0;
+       }
+       LASSERT(imp_conn->oic_conn);
+
+       /* If we've tried everything, and we're back to the beginning of the
+          list, increase our timeout and try again. It will be reset when
+          we do finally connect. (FIXME: really we should wait for all network
+          state associated with the last connection attempt to drain before
+          trying to reconnect on it.) */
+       if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+               struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+               if (at_get(at) < CONNECTION_SWITCH_MAX) {
+                       at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
+                       if (at_get(at) > CONNECTION_SWITCH_MAX)
+                               at_reset(at, CONNECTION_SWITCH_MAX);
+               }
+               LASSERT(imp_conn->oic_last_attempt);
+               CDEBUG(D_HA, "%s: tried all connections, increasing latency "
+                       "to %ds\n", imp->imp_obd->obd_name, at_get(at));
+       }
+
+       imp_conn->oic_last_attempt = cfs_time_current_64();
+
+       /* switch connection, don't mind if it's same as the current one */
+       if (imp->imp_connection)
+               ptlrpc_connection_put(imp->imp_connection);
+       imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+
+       dlmexp = class_conn2export(&imp->imp_dlm_handle);
+       LASSERT(dlmexp != NULL);
+       if (dlmexp->exp_connection)
+               ptlrpc_connection_put(dlmexp->exp_connection);
+       dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+       class_export_put(dlmexp);
+
+       if (imp->imp_conn_current != imp_conn) {
+               if (imp->imp_conn_current) {
+                       deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+                                 &target_start, &target_len);
+
+                       CDEBUG(D_HA, "%s: Connection changing to"
+                              " %.*s (at %s)\n",
+                              imp->imp_obd->obd_name,
+                              target_len, target_start,
+                              libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+               }
+
+               imp->imp_conn_current = imp_conn;
+       }
+
+       CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
+              imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
+              libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+
+       spin_unlock(&imp->imp_lock);
+
+       RETURN(0);
+}
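
The policy above is simple: prefer any connection that has not been attempted since the last successful connect, otherwise fall back to the least recently attempted one. Below is a minimal standalone C sketch of that policy; the names and plain time_t bookkeeping are hypothetical stand-ins for the cfs_time_* helpers, not Lustre code.

/* Illustrative userspace sketch of the selection policy used by
 * import_select_connection() above; not Lustre code. */
#include <stdio.h>
#include <time.h>

struct conn {
	const char *nid;	/* hypothetical peer name */
	time_t last_attempt;	/* 0 means never tried */
};

static struct conn *select_conn(struct conn *conns, int n, time_t last_success)
{
	struct conn *pick = NULL;
	int i;

	for (i = 0; i < n; i++) {
		/* Not tried since the last successful connect: use it. */
		if (conns[i].last_attempt == 0 ||
		    conns[i].last_attempt <= last_success)
			return &conns[i];
		/* Otherwise remember the least recently attempted one. */
		if (!pick || conns[i].last_attempt < pick->last_attempt)
			pick = &conns[i];
	}
	return pick;
}

int main(void)
{
	struct conn conns[] = {
		{ "nid-a", 100 }, { "nid-b", 90 }, { "nid-c", 120 },
	};

	/* Last successful connect happened at t=95, so nid-b qualifies. */
	printf("picked %s\n", select_conn(conns, 3, 95)->nid);
	return 0;
}
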
+
+/*
+ * must be called under imp_lock
+ */
+static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
+{
+       struct ptlrpc_request *req;
+       struct list_head *tmp;
+
+       if (list_empty(&imp->imp_replay_list))
+               return 0;
+       tmp = imp->imp_replay_list.next;
+       req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+       *transno = req->rq_transno;
+       if (req->rq_transno == 0) {
+               DEBUG_REQ(D_ERROR, req, "zero transno in replay");
+               LBUG();
+       }
+
+       return 1;
+}
+
+/**
+ * Attempt to (re)connect import \a imp. This includes all preparations,
+ * initializing CONNECT RPC request and passing it to ptlrpcd for
+ * actual sending.
+ * Returns 0 on success or error code.
+ */
+int ptlrpc_connect_import(struct obd_import *imp)
+{
+       struct obd_device *obd = imp->imp_obd;
+       int initial_connect = 0;
+       int set_transno = 0;
+       __u64 committed_before_reconnect = 0;
+       struct ptlrpc_request *request;
+       char *bufs[] = { NULL,
+                        obd2cli_tgt(imp->imp_obd),
+                        obd->obd_uuid.uuid,
+                        (char *)&imp->imp_dlm_handle,
+                        (char *)&imp->imp_connect_data };
+       struct ptlrpc_connect_async_args *aa;
+       int rc;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+               spin_unlock(&imp->imp_lock);
+               CERROR("can't connect to a closed import\n");
+               RETURN(-EINVAL);
+       } else if (imp->imp_state == LUSTRE_IMP_FULL) {
+               spin_unlock(&imp->imp_lock);
+               CERROR("already connected\n");
+               RETURN(0);
+       } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
+               spin_unlock(&imp->imp_lock);
+               CERROR("already connecting\n");
+               RETURN(-EALREADY);
+       }
+
+       IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
+
+       imp->imp_conn_cnt++;
+       imp->imp_resend_replay = 0;
+
+       if (!lustre_handle_is_used(&imp->imp_remote_handle))
+               initial_connect = 1;
+       else
+               committed_before_reconnect = imp->imp_peer_committed_transno;
+
+       set_transno = ptlrpc_first_transno(imp,
+                                          &imp->imp_connect_data.ocd_transno);
+       spin_unlock(&imp->imp_lock);
+
+       rc = import_select_connection(imp);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = sptlrpc_import_sec_adapt(imp, NULL, 0);
+       if (rc)
+               GOTO(out, rc);
+
+       /* Reset connect flags to the originally requested flags, in case
+        * the server is updated on-the-fly we will get the new features. */
+       imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
+       /* Reset ocd_version each time so the server knows the exact versions */
+       imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
+       imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+       imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+       rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
+                          &obd->obd_uuid, &imp->imp_connect_data, NULL);
+       if (rc)
+               GOTO(out, rc);
+
+       request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
+       if (request == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
+                                     imp->imp_connect_op, bufs, NULL);
+       if (rc) {
+               ptlrpc_request_free(request);
+               GOTO(out, rc);
+       }
+
+       /* Report the rpc service time to the server so that it knows how long
+        * to wait for clients to join recovery */
+       lustre_msg_set_service_time(request->rq_reqmsg,
+                                   at_timeout2est(request->rq_timeout));
+
+       /* The amount of time we give the server to process the connect req.
+        * import_select_connection will increase the net latency on
+        * repeated reconnect attempts to cover slow networks.
+        * We override/ignore the server rpc completion estimate here,
+        * which may be large if this is a reconnect attempt */
+       request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+       lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
+       lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
+
+       request->rq_no_resend = request->rq_no_delay = 1;
+       request->rq_send_state = LUSTRE_IMP_CONNECTING;
+       /* Allow a slightly larger reply for future growth compatibility */
+       req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
+                            sizeof(struct obd_connect_data)+16*sizeof(__u64));
+       ptlrpc_request_set_replen(request);
+       request->rq_interpret_reply = ptlrpc_connect_interpret;
+
+       CLASSERT(sizeof (*aa) <= sizeof (request->rq_async_args));
+       aa = ptlrpc_req_async_args(request);
+       memset(aa, 0, sizeof *aa);
+
+       aa->pcaa_peer_committed = committed_before_reconnect;
+       aa->pcaa_initial_connect = initial_connect;
+
+       if (aa->pcaa_initial_connect) {
+               spin_lock(&imp->imp_lock);
+               imp->imp_replayable = 1;
+               spin_unlock(&imp->imp_lock);
+               lustre_msg_add_op_flags(request->rq_reqmsg,
+                                       MSG_CONNECT_INITIAL);
+       }
+
+       if (set_transno)
+               lustre_msg_add_op_flags(request->rq_reqmsg,
+                                       MSG_CONNECT_TRANSNO);
+
+       DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
+                 request->rq_timeout);
+       ptlrpcd_add_req(request, PDL_POLICY_ROUND, -1);
+       rc = 0;
+out:
+       if (rc != 0) {
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_connect_import);
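
ptlrpc_connect_import() above refuses to start a connect attempt unless the import is in a state where that makes sense: a closed import fails with -EINVAL, a fully connected one returns 0, and a connect already in flight returns -EALREADY. A minimal sketch of just that entry check, with a hypothetical enum standing in for the LUSTRE_IMP_* states:

/* Minimal sketch of the entry-state check in ptlrpc_connect_import();
 * the enum is hypothetical and only mirrors the LUSTRE_IMP_* states. */
#include <errno.h>
#include <stdio.h>

enum imp_state { IMP_CLOSED, IMP_DISCON, IMP_CONNECTING, IMP_FULL };

static int connect_allowed(enum imp_state state)
{
	switch (state) {
	case IMP_CLOSED:
		return -EINVAL;		/* cannot connect a closed import */
	case IMP_FULL:
		return 0;		/* already connected, nothing to do */
	case IMP_CONNECTING:
		return -EALREADY;	/* a connect attempt is in flight */
	default:
		return 1;		/* go ahead and start a new attempt */
	}
}

int main(void)
{
	printf("closed=%d full=%d connecting=%d discon=%d\n",
	       connect_allowed(IMP_CLOSED), connect_allowed(IMP_FULL),
	       connect_allowed(IMP_CONNECTING), connect_allowed(IMP_DISCON));
	return 0;
}
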
+
+static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
+{
+       int force_verify;
+
+       spin_lock(&imp->imp_lock);
+       force_verify = imp->imp_force_verify != 0;
+       spin_unlock(&imp->imp_lock);
+
+       if (force_verify)
+               ptlrpc_pinger_wake_up();
+}
+
+static int ptlrpc_busy_reconnect(int rc)
+{
+       return (rc == -EBUSY) || (rc == -EAGAIN);
+}
+
+/**
+ * interpret_reply callback for connect RPCs.
+ * Looks into returned status of connect operation and decides
+ * what to do with the import - i.e. enter recovery, promote it to
+ * full state for normal operations, or disconnect it due to an error.
+ */
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+                                   struct ptlrpc_request *request,
+                                   void *data, int rc)
+{
+       struct ptlrpc_connect_async_args *aa = data;
+       struct obd_import *imp = request->rq_import;
+       struct client_obd *cli = &imp->imp_obd->u.cli;
+       struct lustre_handle old_hdl;
+       __u64 old_connect_flags;
+       int msg_flags;
+       struct obd_connect_data *ocd;
+       struct obd_export *exp;
+       int ret;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+               imp->imp_connect_tried = 1;
+               spin_unlock(&imp->imp_lock);
+               RETURN(0);
+       }
+
+       if (rc) {
+               /* If this is a reconnect to a busy export, there is no need
+                * to select a new target for connecting. */
+               imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
+               spin_unlock(&imp->imp_lock);
+               ptlrpc_maybe_ping_import_soon(imp);
+               GOTO(out, rc);
+       }
+       spin_unlock(&imp->imp_lock);
+
+       LASSERT(imp->imp_conn_current);
+
+       msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
+
+       ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
+                                  RCL_SERVER);
+       /* The server-replied obd_connect_data is always bigger */
+       ocd = req_capsule_server_sized_get(&request->rq_pill,
+                                          &RMF_CONNECT_DATA, ret);
+
+       if (ocd == NULL) {
+               CERROR("%s: no connect data from server\n",
+                      imp->imp_obd->obd_name);
+               rc = -EPROTO;
+               GOTO(out, rc);
+       }
+
+       spin_lock(&imp->imp_lock);
+
+       /* All imports are pingable */
+       imp->imp_pingable = 1;
+       imp->imp_force_reconnect = 0;
+       imp->imp_force_verify = 0;
+
+       imp->imp_connect_data = *ocd;
+
+       CDEBUG(D_HA, "%s: connect to target with instance %u\n",
+              imp->imp_obd->obd_name, ocd->ocd_instance);
+       exp = class_conn2export(&imp->imp_dlm_handle);
+
+       spin_unlock(&imp->imp_lock);
+
+       /* Check that the server granted a subset of the flags we asked for */
+       if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
+           ocd->ocd_connect_flags) {
+               CERROR("%s: Server did not grant requested subset of flags: "
+                      "asked="LPX64" granted="LPX64"\n",
+                      imp->imp_obd->obd_name, imp->imp_connect_flags_orig,
+                      ocd->ocd_connect_flags);
+               GOTO(out, rc = -EPROTO);
+       }
+
+       if (!exp) {
+               /* This could happen if export is cleaned during the
+                  connect attempt */
+               CERROR("%s: missing export after connect\n",
+                      imp->imp_obd->obd_name);
+               GOTO(out, rc = -ENODEV);
+       }
+       old_connect_flags = exp_connect_flags(exp);
+       exp->exp_connect_data = *ocd;
+       imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
+       class_export_put(exp);
+
+       obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
+
+       if (aa->pcaa_initial_connect) {
+               spin_lock(&imp->imp_lock);
+               if (msg_flags & MSG_CONNECT_REPLAYABLE) {
+                       imp->imp_replayable = 1;
+                       spin_unlock(&imp->imp_lock);
+                       CDEBUG(D_HA, "connected to replayable target: %s\n",
+                              obd2cli_tgt(imp->imp_obd));
+               } else {
+                       imp->imp_replayable = 0;
+                       spin_unlock(&imp->imp_lock);
+               }
+
+               /* if applicable, adjust the imp->imp_msg_magic here
+                * according to reply flags */
+
+               imp->imp_remote_handle =
+                               *lustre_msg_get_handle(request->rq_repmsg);
+
+               /* Initial connects are allowed for clients with non-random
+                * uuids when servers are in recovery.  Simply signal the
+                * servers that replay is complete and wait in REPLAY_WAIT. */
+               if (msg_flags & MSG_CONNECT_RECOVERING) {
+                       CDEBUG(D_HA, "connect to %s during recovery\n",
+                              obd2cli_tgt(imp->imp_obd));
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+               } else {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+                       ptlrpc_activate_import(imp);
+               }
+
+               GOTO(finish, rc = 0);
+       }
+
+       /* Determine what recovery state to move the import to. */
+       if (MSG_CONNECT_RECONNECT & msg_flags) {
+               memset(&old_hdl, 0, sizeof(old_hdl));
+               if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
+                           sizeof (old_hdl))) {
+                       LCONSOLE_WARN("Reconnect to %s (at %s) failed due to "
+                                     "bad handle "LPX64"\n",
+                                     obd2cli_tgt(imp->imp_obd),
+                                     imp->imp_connection->c_remote_uuid.uuid,
+                                     imp->imp_dlm_handle.cookie);
+                       GOTO(out, rc = -ENOTCONN);
+               }
+
+               if (memcmp(&imp->imp_remote_handle,
+                          lustre_msg_get_handle(request->rq_repmsg),
+                          sizeof(imp->imp_remote_handle))) {
+                       int level = msg_flags & MSG_CONNECT_RECOVERING ?
+                               D_HA : D_WARNING;
+
+                       /* Bug 16611/14775: if the server handle has changed,
+                        * that means some sort of disconnection happened.
+                        * If the server is not in recovery, that also means it
+                        * already erased all of our state because of a previous
+                        * eviction. If it is in recovery, we are safe to
+                        * participate since we can reestablish all of our state
+                        * with the server again */
+                       if ((MSG_CONNECT_RECOVERING & msg_flags)) {
+                               CDEBUG(level,"%s@%s changed server handle from "
+                                      LPX64" to "LPX64
+                                      " but is still in recovery\n",
+                                      obd2cli_tgt(imp->imp_obd),
+                                      imp->imp_connection->c_remote_uuid.uuid,
+                                      imp->imp_remote_handle.cookie,
+                                      lustre_msg_get_handle(
+                                      request->rq_repmsg)->cookie);
+                       } else {
+                               LCONSOLE_WARN("Evicted from %s (at %s) "
+                                             "after server handle changed from "
+                                             LPX64" to "LPX64"\n",
+                                             obd2cli_tgt(imp->imp_obd),
+                                             imp->imp_connection->
+                                             c_remote_uuid.uuid,
+                                             imp->imp_remote_handle.cookie,
+                                             lustre_msg_get_handle(
+                                             request->rq_repmsg)->cookie);
+                       }
+
+
+                       imp->imp_remote_handle =
+                                    *lustre_msg_get_handle(request->rq_repmsg);
+
+                       if (!(MSG_CONNECT_RECOVERING & msg_flags)) {
+                               IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+                               GOTO(finish, rc = 0);
+                       }
+
+               } else {
+                       CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
+                              obd2cli_tgt(imp->imp_obd),
+                              imp->imp_connection->c_remote_uuid.uuid);
+               }
+
+               if (imp->imp_invalid) {
+                       CDEBUG(D_HA, "%s: reconnected but import is invalid; "
+                              "marking evicted\n", imp->imp_obd->obd_name);
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+               } else if (MSG_CONNECT_RECOVERING & msg_flags) {
+                       CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+                              imp->imp_obd->obd_name,
+                              obd2cli_tgt(imp->imp_obd));
+
+                       spin_lock(&imp->imp_lock);
+                       imp->imp_resend_replay = 1;
+                       spin_unlock(&imp->imp_lock);
+
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+               } else {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+               }
+       } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
+               LASSERT(imp->imp_replayable);
+               imp->imp_remote_handle =
+                               *lustre_msg_get_handle(request->rq_repmsg);
+               imp->imp_last_replay_transno = 0;
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+       } else {
+               DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags"
+                         " not set: %x)", imp->imp_obd->obd_name, msg_flags);
+               imp->imp_remote_handle =
+                               *lustre_msg_get_handle(request->rq_repmsg);
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+       }
+
+       /* Sanity checks for a reconnected import. */
+       if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
+               CERROR("imp_replayable flag does not match server "
+                      "after reconnect. We should LBUG right here.\n");
+       }
+
+       if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
+           lustre_msg_get_last_committed(request->rq_repmsg) <
+           aa->pcaa_peer_committed) {
+               CERROR("%s went back in time (transno "LPD64
+                      " was previously committed, server now claims "LPD64
+                      ")!  See https://bugzilla.lustre.org/show_bug.cgi?"
+                      "id=9646\n",
+                      obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
+                      lustre_msg_get_last_committed(request->rq_repmsg));
+       }
+
+finish:
+       rc = ptlrpc_import_recovery_state_machine(imp);
+       if (rc != 0) {
+               if (rc == -ENOTCONN) {
+                       CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; "
+                              "invalidating and reconnecting\n",
+                              obd2cli_tgt(imp->imp_obd),
+                              imp->imp_connection->c_remote_uuid.uuid);
+                       ptlrpc_connect_import(imp);
+                       imp->imp_connect_tried = 1;
+                       RETURN(0);
+               }
+       } else {
+
+               spin_lock(&imp->imp_lock);
+               list_del(&imp->imp_conn_current->oic_item);
+               list_add(&imp->imp_conn_current->oic_item,
+                            &imp->imp_conn_list);
+               imp->imp_last_success_conn =
+                       imp->imp_conn_current->oic_last_attempt;
+
+               spin_unlock(&imp->imp_lock);
+
+               if (!ocd->ocd_ibits_known &&
+                   ocd->ocd_connect_flags & OBD_CONNECT_IBITS)
+                       CERROR("Inodebits aware server returned zero compatible"
+                              " bits?\n");
+
+               if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+                   (ocd->ocd_version > LUSTRE_VERSION_CODE +
+                                       LUSTRE_VERSION_OFFSET_WARN ||
+                    ocd->ocd_version < LUSTRE_VERSION_CODE -
+                                       LUSTRE_VERSION_OFFSET_WARN)) {
+                       /* Sigh, some compilers do not like #ifdef in the middle
+                          of macro arguments */
+                       const char *older = "older. Consider upgrading server "
+                                           "or downgrading client";
+                       const char *newer = "newer than client version. "
+                                           "Consider upgrading client";
+
+                       LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) "
+                                     "is much %s (%s)\n",
+                                     obd2cli_tgt(imp->imp_obd),
+                                     OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+                                     OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+                                     OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+                                     OBD_OCD_VERSION_FIX(ocd->ocd_version),
+                                     ocd->ocd_version > LUSTRE_VERSION_CODE ?
+                                     newer : older, LUSTRE_VERSION_STRING);
+               }
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+               /* Check if server has LU-1252 fix applied to not always swab
+                * the IR MNE entries. Do this only once per connection.  This
+                * fixup is version-limited, because we don't want to carry the
+                * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we
+                * need interop with unpatched 2.2 servers.  For newer servers,
+                * the client will do MNE swabbing only as needed.  LU-1644 */
+               if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+                            !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) &&
+                            OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 &&
+                            OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 &&
+                            OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 &&
+                            strcmp(imp->imp_obd->obd_type->typ_name,
+                                   LUSTRE_MGC_NAME) == 0))
+                       imp->imp_need_mne_swab = 1;
+               else /* clear if server was upgraded since last connect */
+                       imp->imp_need_mne_swab = 0;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+               if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
+                       /* We sent to the server ocd_cksum_types with bits set
+                        * for algorithms we understand. The server masked off
+                        * the checksum types it doesn't support */
+                       if ((ocd->ocd_cksum_types &
+                            cksum_types_supported_client()) == 0) {
+                               LCONSOLE_WARN("The negotiation of the checksum "
+                                             "algorithm to use with server %s "
+                                             "failed (%x/%x), disabling "
+                                             "checksums\n",
+                                             obd2cli_tgt(imp->imp_obd),
+                                             ocd->ocd_cksum_types,
+                                             cksum_types_supported_client());
+                               cli->cl_checksum = 0;
+                               cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+                       } else {
+                               cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
+                       }
+               } else {
+                       /* The server does not support OBD_CONNECT_CKSUM.
+                        * Enforce ADLER for backward compatibility */
+                       cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+               }
+               cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types);
+
+               if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+                       cli->cl_max_pages_per_rpc =
+                               min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT,
+                                   cli->cl_max_pages_per_rpc);
+               else if (imp->imp_connect_op == MDS_CONNECT ||
+                        imp->imp_connect_op == MGS_CONNECT)
+                       cli->cl_max_pages_per_rpc = 1;
+
+               /* Reset ns_connect_flags only for the initial connect. It may
+                * be changed while the filesystem is in use, and resetting it
+                * on reconnect would lose user settings made earlier, such as
+                * disabling lru_resize, etc. */
+               if (old_connect_flags != exp_connect_flags(exp) ||
+                   aa->pcaa_initial_connect) {
+                       CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server "
+                              "flags: "LPX64"\n", imp->imp_obd->obd_name,
+                             ocd->ocd_connect_flags);
+                       imp->imp_obd->obd_namespace->ns_connect_flags =
+                               ocd->ocd_connect_flags;
+                       imp->imp_obd->obd_namespace->ns_orig_connect_flags =
+                               ocd->ocd_connect_flags;
+               }
+
+               if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
+                   (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+                       /* We need a per-message support flag, because
+                          a. we don't know if the incoming connect reply
+                             supports AT or not (in reply_in_callback)
+                             until we unpack it.
+                          b. a failed-over server means export and flags are gone
+                             (in ptlrpc_send_reply).
+                          Can only be set when we know AT is supported at
+                          both ends */
+                       imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+               else
+                       imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
+               if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) &&
+                   (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+                       imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
+               else
+                       imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+               LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
+                       (cli->cl_max_pages_per_rpc > 0));
+       }
+
+out:
+       imp->imp_connect_tried = 1;
+
+       if (rc != 0) {
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+               if (rc == -EACCES) {
+                       /*
+                        * Give up trying to reconnect
+                        * EACCES means client has no permission for connection
+                        */
+                       imp->imp_obd->obd_no_recov = 1;
+                       ptlrpc_deactivate_import(imp);
+               }
+
+               if (rc == -EPROTO) {
+                       struct obd_connect_data *ocd;
+
+                       /* reply message might not be ready */
+                       if (request->rq_repmsg == NULL)
+                               RETURN(-EPROTO);
+
+                       ocd = req_capsule_server_get(&request->rq_pill,
+                                                    &RMF_CONNECT_DATA);
+                       if (ocd &&
+                           (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+                           (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
+                          /* Actually servers are only supposed to refuse
+                             connection from liblustre clients, so we should
+                             never see this from VFS context */
+                               LCONSOLE_ERROR_MSG(0x16a, "Server %s version "
+                                       "(%d.%d.%d.%d)"
+                                       " refused connection from this client "
+                                       "with an incompatible version (%s).  "
+                                       "Client must be recompiled\n",
+                                       obd2cli_tgt(imp->imp_obd),
+                                       OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+                                       OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+                                       OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+                                       OBD_OCD_VERSION_FIX(ocd->ocd_version),
+                                       LUSTRE_VERSION_STRING);
+                               ptlrpc_deactivate_import(imp);
+                               IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
+                       }
+                       RETURN(-EPROTO);
+               }
+
+               ptlrpc_maybe_ping_import_soon(imp);
+
+               CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
+                      obd2cli_tgt(imp->imp_obd),
+                      (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
+       }
+
+       wake_up_all(&imp->imp_recovery_waitq);
+       RETURN(rc);
+}
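
One of the checks near the top of ptlrpc_connect_interpret() above is that the server must not grant connect flags the client never requested; it reduces to a plain bitmask subset test. A standalone illustration with hypothetical flag values:

/* Minimal illustration of the connect-flag subset check performed in
 * ptlrpc_connect_interpret() above; the flag values are hypothetical. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t asked   = 0x0f;	/* flags the client requested */
	uint64_t granted = 0x13;	/* flags the server replied with */

	/* Anything granted that was not asked for is a protocol error. */
	if ((granted & asked) != granted)
		printf("server granted unrequested flags: %#llx\n",
		       (unsigned long long)(granted & ~asked));
	return 0;
}
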
+
+/**
+ * interpret callback for "completed replay" RPCs.
+ * \see signal_completed_replay
+ */
+static int completed_replay_interpret(const struct lu_env *env,
+                                     struct ptlrpc_request *req,
+                                     void * data, int rc)
+{
+       ENTRY;
+       atomic_dec(&req->rq_import->imp_replay_inflight);
+       if (req->rq_status == 0 &&
+           !req->rq_import->imp_vbr_failed) {
+               ptlrpc_import_recovery_state_machine(req->rq_import);
+       } else {
+               if (req->rq_import->imp_vbr_failed) {
+                       CDEBUG(D_WARNING,
+                              "%s: version recovery fails, reconnecting\n",
+                              req->rq_import->imp_obd->obd_name);
+               } else {
+                       CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
+                                    "reconnecting\n",
+                              req->rq_import->imp_obd->obd_name,
+                              req->rq_status);
+               }
+               ptlrpc_connect_import(req->rq_import);
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Let server know that we have no requests to replay anymore.
+ * Achieved by just sending a PING request
+ */
+static int signal_completed_replay(struct obd_import *imp)
+{
+       struct ptlrpc_request *req;
+       ENTRY;
+
+       if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
+               RETURN(0);
+
+       LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+       atomic_inc(&imp->imp_replay_inflight);
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
+                                       OBD_PING);
+       if (req == NULL) {
+               atomic_dec(&imp->imp_replay_inflight);
+               RETURN(-ENOMEM);
+       }
+
+       ptlrpc_request_set_replen(req);
+       req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
+       lustre_msg_add_flags(req->rq_reqmsg,
+                            MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
+       if (AT_OFF)
+               req->rq_timeout *= 3;
+       req->rq_interpret_reply = completed_replay_interpret;
+
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       RETURN(0);
+}
+
+/**
+ * In kernel code, all import invalidation happens in its own
+ * separate thread, so that whatever application happened to encounter
+ * a problem can still be killed or otherwise continue
+ */
+static int ptlrpc_invalidate_import_thread(void *data)
+{
+       struct obd_import *imp = data;
+
+       ENTRY;
+
+       unshare_fs_struct();
+
+       CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
+              imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+              imp->imp_connection->c_remote_uuid.uuid);
+
+       ptlrpc_invalidate_import(imp);
+
+       if (obd_dump_on_eviction) {
+               CERROR("dump the log upon eviction\n");
+               libcfs_debug_dumplog();
+       }
+
+       IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+       ptlrpc_import_recovery_state_machine(imp);
+
+       class_import_put(imp);
+       RETURN(0);
+}
+
+/**
+ * This is the state machine for client-side recovery on import.
+ *
+ * Typically we have two possible paths. If we come to a server that is not
+ * in recovery, we just enter the IMP_EVICTED state, invalidate our import
+ * state and reconnect from scratch.
+ * If we come to a server that is in recovery, we enter the IMP_REPLAY import
+ * state. We go through our list of requests to replay and send them to the
+ * server one by one.
+ * After sending all requests from the list, we change the import state to
+ * IMP_REPLAY_LOCKS and re-request from the server all the locks we believe we
+ * hold, as well as those we do not yet have, and wait for the server to grant
+ * them.
+ * After that we send a special "replay completed" request and change the
+ * import state to IMP_REPLAY_WAIT.
+ * Upon receiving the reply to that "replay completed" RPC, we enter the
+ * IMP_RECOVER state and resend all requests from the sending list.
+ * After that we promote the import to FULL state and send all delayed
+ * requests; the import is fully operational after that.
+ *
+ */
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
+{
+       int rc = 0;
+       int inflight;
+       char *target_start;
+       int target_len;
+
+       ENTRY;
+       if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+               deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+                         &target_start, &target_len);
+               /* Don't care about MGC eviction */
+               if (strcmp(imp->imp_obd->obd_type->typ_name,
+                          LUSTRE_MGC_NAME) != 0) {
+                       LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted "
+                                          "by %.*s; in progress operations "
+                                          "using this service will fail.\n",
+                                          imp->imp_obd->obd_name, target_len,
+                                          target_start);
+               }
+               CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
+                      obd2cli_tgt(imp->imp_obd),
+                      imp->imp_connection->c_remote_uuid.uuid);
+               /* reset vbr_failed flag upon eviction */
+               spin_lock(&imp->imp_lock);
+               imp->imp_vbr_failed = 0;
+               spin_unlock(&imp->imp_lock);
+
+               {
+               task_t *task;
+               /* bug 17802:  XXX client_disconnect_export vs connect request
+                * race. If the client is evicted at this time, we would start
+                * the invalidate thread without a reference to the import, and
+                * the import could be freed at the same time. */
+               class_import_get(imp);
+               task = kthread_run(ptlrpc_invalidate_import_thread, imp,
+                                    "ll_imp_inval");
+               if (IS_ERR(task)) {
+                       class_import_put(imp);
+                       rc = PTR_ERR(task);
+                       CERROR("error starting invalidate thread: %d\n", rc);
+               } else {
+                       rc = 0;
+               }
+               RETURN(rc);
+               }
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_REPLAY) {
+               CDEBUG(D_HA, "replay requested by %s\n",
+                      obd2cli_tgt(imp->imp_obd));
+               rc = ptlrpc_replay_next(imp, &inflight);
+               if (inflight == 0 &&
+                   atomic_read(&imp->imp_replay_inflight) == 0) {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+                       rc = ldlm_replay_locks(imp);
+                       if (rc)
+                               GOTO(out, rc);
+               }
+               rc = 0;
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
+               if (atomic_read(&imp->imp_replay_inflight) == 0) {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
+                       rc = signal_completed_replay(imp);
+                       if (rc)
+                               GOTO(out, rc);
+               }
+
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
+               if (atomic_read(&imp->imp_replay_inflight) == 0) {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+               }
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_RECOVER) {
+               CDEBUG(D_HA, "reconnected to %s@%s\n",
+                      obd2cli_tgt(imp->imp_obd),
+                      imp->imp_connection->c_remote_uuid.uuid);
+
+               rc = ptlrpc_resend(imp);
+               if (rc)
+                       GOTO(out, rc);
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+               ptlrpc_activate_import(imp);
+
+               deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+                         &target_start, &target_len);
+               LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n",
+                             imp->imp_obd->obd_name,
+                             target_len, target_start,
+                             libcfs_nid2str(imp->imp_connection->c_peer.nid));
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_FULL) {
+               wake_up_all(&imp->imp_recovery_waitq);
+               ptlrpc_wake_delayed(imp);
+       }
+
+out:
+       RETURN(rc);
+}
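
The comment above ptlrpc_import_recovery_state_machine() describes a fixed progression once replay is required. A standalone sketch of that progression, with an enum mirroring the LUSTRE_IMP_* names (illustrative only; the real function also handles the EVICTED case by spawning the invalidate thread):

/* Sketch of the recovery state progression described above; not Lustre
 * code, just the order in which the states are visited. */
#include <stdio.h>

enum state { REPLAY, REPLAY_LOCKS, REPLAY_WAIT, RECOVER, FULL };

static const char *name[] = {
	"REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", "RECOVER", "FULL"
};

/* Each step assumes the work of the previous state (request replay, lock
 * replay, the "replay completed" ping, resends) has drained. */
static enum state next(enum state s)
{
	switch (s) {
	case REPLAY:       return REPLAY_LOCKS;
	case REPLAY_LOCKS: return REPLAY_WAIT;
	case REPLAY_WAIT:  return RECOVER;
	case RECOVER:      return FULL;
	default:           return FULL;
	}
}

int main(void)
{
	enum state s = REPLAY;

	while (s != FULL) {
		printf("%s -> ", name[s]);
		s = next(s);
	}
	printf("%s\n", name[s]);
	return 0;
}
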
+
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
+{
+       struct ptlrpc_request *req;
+       int rq_opc, rc = 0;
+       int nowait = imp->imp_obd->obd_force;
+       ENTRY;
+
+       if (nowait)
+               GOTO(set_state, rc);
+
+       switch (imp->imp_connect_op) {
+       case OST_CONNECT: rq_opc = OST_DISCONNECT; break;
+       case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
+       case MGS_CONNECT: rq_opc = MGS_DISCONNECT; break;
+       default:
+               CERROR("don't know how to disconnect from %s (connect_op %d)\n",
+                      obd2cli_tgt(imp->imp_obd), imp->imp_connect_op);
+               RETURN(-EINVAL);
+       }
+
+       if (ptlrpc_import_in_recovery(imp)) {
+               struct l_wait_info lwi;
+               cfs_duration_t timeout;
+
+
+               if (AT_OFF) {
+                       if (imp->imp_server_timeout)
+                               timeout = cfs_time_seconds(obd_timeout / 2);
+                       else
+                               timeout = cfs_time_seconds(obd_timeout);
+               } else {
+                       int idx = import_at_get_index(imp,
+                               imp->imp_client->cli_request_portal);
+                       timeout = cfs_time_seconds(
+                               at_get(&imp->imp_at.iat_service_estimate[idx]));
+               }
+
+               lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
+                                      back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
+               rc = l_wait_event(imp->imp_recovery_waitq,
+                                 !ptlrpc_import_in_recovery(imp), &lwi);
+
+       }
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state != LUSTRE_IMP_FULL)
+               GOTO(out, 0);
+
+       spin_unlock(&imp->imp_lock);
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
+                                       LUSTRE_OBD_VERSION, rq_opc);
+       if (req) {
+               /* We are disconnecting, do not retry the DISCONNECT rpc if
+                * it fails.  We can get through the above with a down server
+                * if the client doesn't know the server is gone yet. */
+               req->rq_no_resend = 1;
+
+               /* We want client umounts to happen quickly, no matter the
+                  server state... */
+               req->rq_timeout = min_t(int, req->rq_timeout,
+                                       INITIAL_CONNECT_TIMEOUT);
+
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
+               req->rq_send_state =  LUSTRE_IMP_CONNECTING;
+               ptlrpc_request_set_replen(req);
+               rc = ptlrpc_queue_wait(req);
+               ptlrpc_req_finished(req);
+       }
+
+set_state:
+       spin_lock(&imp->imp_lock);
+out:
+       if (noclose)
+               IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+       else
+               IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+       memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
+       spin_unlock(&imp->imp_lock);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_disconnect_import);
+
+void ptlrpc_cleanup_imp(struct obd_import *imp)
+{
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+       imp->imp_generation++;
+       spin_unlock(&imp->imp_lock);
+       ptlrpc_abort_inflight(imp);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_imp);
+
+/* Adaptive Timeout utils */
+extern unsigned int at_min, at_max, at_history;
+
+/* Bin into timeslices using AT_BINS bins.
+   This gives us the max over the last binlimit*AT_BINS secs without storing
+   every sample, while still smoothing out a return to normalcy after a slow
+   response.
+   (E.g. remember the maximum latency in each minute of the last 4 minutes.) */
+int at_measured(struct adaptive_timeout *at, unsigned int val)
+{
+       unsigned int old = at->at_current;
+       time_t now = cfs_time_current_sec();
+       time_t binlimit = max_t(time_t, at_history / AT_BINS, 1);
+
+       LASSERT(at);
+       CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
+              val, at, now - at->at_binstart, at->at_current,
+              at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
+
+       if (val == 0)
+               /* 0's don't count, because we never want our timeout to
+                  drop to 0, and because 0 could mean an error */
+               return 0;
+
+       spin_lock(&at->at_lock);
+
+       if (unlikely(at->at_binstart == 0)) {
+               /* Special case to remove default from history */
+               at->at_current = val;
+               at->at_worst_ever = val;
+               at->at_worst_time = now;
+               at->at_hist[0] = val;
+               at->at_binstart = now;
+       } else if (now - at->at_binstart < binlimit) {
+               /* in bin 0 */
+               at->at_hist[0] = max(val, at->at_hist[0]);
+               at->at_current = max(val, at->at_current);
+       } else {
+               int i, shift;
+               unsigned int maxv = val;
+               /* move bins over */
+               shift = (now - at->at_binstart) / binlimit;
+               LASSERT(shift > 0);
+               for (i = AT_BINS - 1; i >= 0; i--) {
+                       if (i >= shift) {
+                               at->at_hist[i] = at->at_hist[i - shift];
+                               maxv = max(maxv, at->at_hist[i]);
+                       } else {
+                               at->at_hist[i] = 0;
+                       }
+               }
+               at->at_hist[0] = val;
+               at->at_current = maxv;
+               at->at_binstart += shift * binlimit;
+       }
+
+       if (at->at_current > at->at_worst_ever) {
+               at->at_worst_ever = at->at_current;
+               at->at_worst_time = now;
+       }
+
+       if (at->at_flags & AT_FLG_NOHIST)
+               /* Only keep last reported val; keeping the rest of the history
+                  for proc only */
+               at->at_current = val;
+
+       if (at_max > 0)
+               at->at_current = min(at->at_current, at_max);
+       at->at_current = max(at->at_current, at_min);
+
+       if (at->at_current != old)
+               CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d "
+                      "(val=%u) hist %u %u %u %u\n", at,
+                      old, at->at_current, at->at_current - old, val,
+                      at->at_hist[0], at->at_hist[1], at->at_hist[2],
+                      at->at_hist[3]);
+
+       /* if we changed, report the old value */
+       old = (at->at_current != old) ? old : 0;
+
+       spin_unlock(&at->at_lock);
+       return old;
+}
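
at_measured() above keeps a windowed maximum: AT_BINS bins of binlimit seconds each, with the reported estimate being the max over the window. A userspace sketch of the same binning idea follows; the names are hypothetical, and the locking, at_min/at_max clamping and AT_FLG_NOHIST handling are omitted.

/* Userspace sketch of the time-binned maximum kept by at_measured();
 * illustrative only, not the kernel code. */
#include <stdio.h>
#include <string.h>
#include <time.h>

#define AT_BINS 4

struct at_sketch {
	unsigned int hist[AT_BINS];
	time_t binstart;
	unsigned int current;
};

static void at_add(struct at_sketch *at, unsigned int val,
		   time_t now, time_t binlimit)
{
	int i;

	if (at->binstart == 0) {
		/* first sample: start the window here */
		memset(at->hist, 0, sizeof(at->hist));
		at->hist[0] = val;
		at->binstart = now;
	} else if (now - at->binstart < binlimit) {
		/* still inside bin 0: just track the maximum */
		if (val > at->hist[0])
			at->hist[0] = val;
	} else {
		/* shift older bins down and open a fresh bin 0 */
		int shift = (now - at->binstart) / binlimit;

		for (i = AT_BINS - 1; i >= 0; i--)
			at->hist[i] = (i >= shift) ? at->hist[i - shift] : 0;
		at->hist[0] = val;
		at->binstart += shift * binlimit;
	}

	at->current = 0;
	for (i = 0; i < AT_BINS; i++)
		if (at->hist[i] > at->current)
			at->current = at->hist[i];
}

int main(void)
{
	struct at_sketch at = { { 0 }, 0, 0 };

	/* binlimit of 60s: remember the max of roughly the last 4 minutes */
	at_add(&at, 10, 1000, 60);
	at_add(&at, 30, 1060, 60);
	at_add(&at, 5, 1300, 60);	/* the 30 has aged out of the window */
	printf("current estimate: %u\n", at.current);	/* prints 5 */
	return 0;
}
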
+
+/* Find the imp_at index for a given portal; assign if space available */
+int import_at_get_index(struct obd_import *imp, int portal)
+{
+       struct imp_at *at = &imp->imp_at;
+       int i;
+
+       for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+               if (at->iat_portal[i] == portal)
+                       return i;
+               if (at->iat_portal[i] == 0)
+                       /* unused */
+                       break;
+       }
+
+       /* Not found in list, add it under a lock */
+       spin_lock(&imp->imp_lock);
+
+       /* Check unused under lock */
+       for (; i < IMP_AT_MAX_PORTALS; i++) {
+               if (at->iat_portal[i] == portal)
+                       goto out;
+               if (at->iat_portal[i] == 0)
+                       /* unused */
+                       break;
+       }
+
+       /* Not enough portals? */
+       LASSERT(i < IMP_AT_MAX_PORTALS);
+
+       at->iat_portal[i] = portal;
+out:
+       spin_unlock(&imp->imp_lock);
+       return i;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/layout.c b/drivers/staging/lustre/lustre/ptlrpc/layout.c
new file mode 100644 (file)
index 0000000..2f55ce2
--- /dev/null
@@ -0,0 +1,2396 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/layout.c
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+/*
+ * This file contains the "capsule/pill" abstraction layered above PTLRPC.
+ *
+ * Every struct ptlrpc_request contains a "pill", which points to a description
+ * of the format that the request conforms to.
+ */
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/module.h>
+
+/* LUSTRE_VERSION_CODE */
+#include <lustre_ver.h>
+
+#include <obd_support.h>
+/* lustre_swab_mdt_body */
+#include <lustre/lustre_idl.h>
+/* obd2cli_tgt() (required by DEBUG_REQ()) */
+#include <obd.h>
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_req_layout.h>
+#include <lustre_update.h>
+#include <lustre_acl.h>
+#include <lustre_debug.h>
+
+/*
+ * RQFs (see below) refer to two struct req_msg_field arrays describing the
+ * client request and server reply, respectively.
+ */
+/* empty set of fields... for suitable definition of emptiness. */
+static const struct req_msg_field *empty[] = {
+       &RMF_PTLRPC_BODY
+};
+
+static const struct req_msg_field *mgs_target_info_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MGS_TARGET_INFO
+};
+
+static const struct req_msg_field *mgs_set_info[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MGS_SEND_PARAM
+};
+
+static const struct req_msg_field *mgs_config_read_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MGS_CONFIG_BODY
+};
+
+static const struct req_msg_field *mgs_config_read_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MGS_CONFIG_RES
+};
+
+static const struct req_msg_field *log_cancel_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LOGCOOKIES
+};
+
+static const struct req_msg_field *mdt_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY
+};
+
+static const struct req_msg_field *mdt_body_capa[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *quotactl_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OBD_QUOTACTL
+};
+
+static const struct req_msg_field *quota_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_DLM_LVB,
+       &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *mdt_close_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_EPOCH,
+       &RMF_REC_REINT,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *obd_statfs_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OBD_STATFS
+};
+
+static const struct req_msg_field *seq_query_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SEQ_OPC,
+       &RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *seq_query_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *fld_query_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FLD_OPC,
+       &RMF_FLD_MDFLD
+};
+
+static const struct req_msg_field *fld_query_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FLD_MDFLD
+};
+
+static const struct req_msg_field *mds_getattr_name_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_CAPA1,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *mds_reint_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT
+};
+
+static const struct req_msg_field *mds_reint_create_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *mds_reint_create_slave_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_create_rmt_acl_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_create_sym_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_SYMTGT,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_open_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_reint_open_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_unlink_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_link_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_NAME,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_rename_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_NAME,
+       &RMF_SYMTGT,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_last_unlink_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_LOGCOOKIES,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_setattr_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_MDT_EPOCH,
+       &RMF_EADATA,
+       &RMF_LOGCOOKIES,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_setxattr_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *mdt_swap_layouts[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_SWAP_LAYOUTS,
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *obd_connect_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_TGTUUID,
+       &RMF_CLUUID,
+       &RMF_CONN,
+       &RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_connect_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_CONNECT_DATA
+};
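
Each request format defined later in this file pairs one client field array with one server field array; obd_connect_client/obd_connect_server above, for instance, describe an OBD_CONNECT request and its reply, both starting with RMF_PTLRPC_BODY. A standalone sketch of that pairing idea, with hypothetical types (the real req_format machinery in this file carries much more per-field information):

/* Illustrative sketch of pairing client/server message-field arrays into a
 * request format, in the spirit of the RQF_* definitions later in this
 * file; all types and names here are hypothetical, not the Lustre API. */
#include <stdio.h>

struct msg_field {
	const char *name;	/* e.g. "ptlrpc_body", "connect_data" */
};

struct req_format {
	const char *name;
	const struct msg_field **client;	/* request-side fields */
	const struct msg_field **server;	/* reply-side fields */
	int nclient, nserver;
};

static const struct msg_field F_BODY = { "ptlrpc_body" };
static const struct msg_field F_TGT  = { "target_uuid" };
static const struct msg_field F_OCD  = { "connect_data" };

static const struct msg_field *connect_client[] = { &F_BODY, &F_TGT, &F_OCD };
static const struct msg_field *connect_server[] = { &F_BODY, &F_OCD };

static const struct req_format RQF_CONNECT_SKETCH = {
	"connect", connect_client, connect_server, 3, 2,
};

int main(void)
{
	int i;

	printf("%s request:", RQF_CONNECT_SKETCH.name);
	for (i = 0; i < RQF_CONNECT_SKETCH.nclient; i++)
		printf(" %s", RQF_CONNECT_SKETCH.client[i]->name);
	printf("\n%s reply:", RQF_CONNECT_SKETCH.name);
	for (i = 0; i < RQF_CONNECT_SKETCH.nserver; i++)
		printf(" %s", RQF_CONNECT_SKETCH.server[i]->name);
	printf("\n");
	return 0;
}
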
+
+static const struct req_msg_field *obd_set_info_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SETINFO_KEY,
+       &RMF_SETINFO_VAL
+};
+
+static const struct req_msg_field *ost_grant_shrink_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SETINFO_KEY,
+       &RMF_OST_BODY
+};
+
+static const struct req_msg_field *mds_getinfo_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_GETINFO_KEY,
+       &RMF_GETINFO_VALLEN
+};
+
+static const struct req_msg_field *mds_getinfo_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_GETINFO_VAL,
+};
+
+static const struct req_msg_field *ldlm_enqueue_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *ldlm_enqueue_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP
+};
+
+static const struct req_msg_field *ldlm_enqueue_lvb_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_cp_callback_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_gl_callback_desc_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_DLM_GL_DESC
+};
+
+static const struct req_msg_field *ldlm_gl_callback_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_intent_basic_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+};
+
+static const struct req_msg_field *ldlm_intent_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_REC_REINT
+};
+
+static const struct req_msg_field *ldlm_intent_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL
+};
+
+static const struct req_msg_field *ldlm_intent_layout_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_LAYOUT_INTENT,
+       &RMF_EADATA /* for new layout to be set up */
+};
+
+static const struct req_msg_field *ldlm_intent_open_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *ldlm_intent_getattr_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_MDT_BODY,     /* coincides with mds_getattr_name_client[] */
+       &RMF_CAPA1,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *ldlm_intent_getattr_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *ldlm_intent_create_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_REC_REINT,    /* coincides with mds_reint_create_client[] */
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *ldlm_intent_open_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_REC_REINT,    /* coincides with mds_reint_open_client[] */
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *ldlm_intent_unlink_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_REC_REINT,    /* coincides with mds_reint_unlink_client[] */
+       &RMF_CAPA1,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *mds_getxattr_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getxattr_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getattr_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_setattr_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_update_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_UPDATE,
+};
+
+static const struct req_msg_field *mds_update_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_UPDATE_REPLY,
+};
+
+static const struct req_msg_field *llog_origin_handle_create_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOGD_BODY,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *llogd_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOGD_BODY
+};
+
+static const struct req_msg_field *llog_log_hdr_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOG_LOG_HDR
+};
+
+static const struct req_msg_field *llogd_conn_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOGD_CONN_BODY
+};
+
+static const struct req_msg_field *llog_origin_handle_next_block_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOGD_BODY,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *obd_idx_read_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_IDX_INFO
+};
+
+static const struct req_msg_field *obd_idx_read_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_IDX_INFO
+};
+
+static const struct req_msg_field *ost_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY
+};
+
+static const struct req_msg_field *ost_body_capa[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_destroy_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY,
+       &RMF_DLM_REQ,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_brw_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY,
+       &RMF_OBD_IOOBJ,
+       &RMF_NIOBUF_REMOTE,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_brw_read_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY
+};
+
+static const struct req_msg_field *ost_brw_write_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY,
+       &RMF_RCS
+};
+
+static const struct req_msg_field *ost_get_info_generic_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_GENERIC_DATA,
+};
+
+static const struct req_msg_field *ost_get_info_generic_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SETINFO_KEY
+};
+
+static const struct req_msg_field *ost_get_last_id_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OBD_ID
+};
+
+static const struct req_msg_field *ost_get_last_fid_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FID,
+};
+
+static const struct req_msg_field *ost_get_fiemap_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FIEMAP_KEY,
+       &RMF_FIEMAP_VAL
+};
+
+static const struct req_msg_field *ost_get_fiemap_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FIEMAP_VAL
+};
+
+static const struct req_msg_field *mdt_hsm_progress[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDS_HSM_PROGRESS,
+};
+
+static const struct req_msg_field *mdt_hsm_ct_register[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDS_HSM_ARCHIVE,
+};
+
+static const struct req_msg_field *mdt_hsm_ct_unregister[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+};
+
+static const struct req_msg_field *mdt_hsm_action_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDS_HSM_CURRENT_ACTION,
+};
+
+static const struct req_msg_field *mdt_hsm_state_get_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_HSM_USER_STATE,
+};
+
+static const struct req_msg_field *mdt_hsm_state_set[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_CAPA1,
+       &RMF_HSM_STATE_SET,
+};
+
+static const struct req_msg_field *mdt_hsm_request[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDS_HSM_REQUEST,
+       &RMF_MDS_HSM_USER_ITEM,
+       &RMF_GENERIC_DATA,
+};
+
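+/*
+ * Every request format known to the layout code.  req_layout_init() records
+ * each format's index in this array as rf_idx, and rmf_offset[] in struct
+ * req_msg_field below is dimensioned by the size of this array.
+ */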
+static struct req_format *req_formats[] = {
+       &RQF_OBD_PING,
+       &RQF_OBD_SET_INFO,
+       &RQF_OBD_IDX_READ,
+       &RQF_SEC_CTX,
+       &RQF_MGS_TARGET_REG,
+       &RQF_MGS_SET_INFO,
+       &RQF_MGS_CONFIG_READ,
+       &RQF_SEQ_QUERY,
+       &RQF_FLD_QUERY,
+       &RQF_MDS_CONNECT,
+       &RQF_MDS_DISCONNECT,
+       &RQF_MDS_GET_INFO,
+       &RQF_MDS_GETSTATUS,
+       &RQF_MDS_STATFS,
+       &RQF_MDS_GETATTR,
+       &RQF_MDS_GETATTR_NAME,
+       &RQF_MDS_GETXATTR,
+       &RQF_MDS_SYNC,
+       &RQF_MDS_CLOSE,
+       &RQF_MDS_PIN,
+       &RQF_MDS_UNPIN,
+       &RQF_MDS_READPAGE,
+       &RQF_MDS_WRITEPAGE,
+       &RQF_MDS_IS_SUBDIR,
+       &RQF_MDS_DONE_WRITING,
+       &RQF_MDS_REINT,
+       &RQF_MDS_REINT_CREATE,
+       &RQF_MDS_REINT_CREATE_RMT_ACL,
+       &RQF_MDS_REINT_CREATE_SLAVE,
+       &RQF_MDS_REINT_CREATE_SYM,
+       &RQF_MDS_REINT_OPEN,
+       &RQF_MDS_REINT_UNLINK,
+       &RQF_MDS_REINT_LINK,
+       &RQF_MDS_REINT_RENAME,
+       &RQF_MDS_REINT_SETATTR,
+       &RQF_MDS_REINT_SETXATTR,
+       &RQF_MDS_QUOTACHECK,
+       &RQF_MDS_QUOTACTL,
+       &RQF_MDS_HSM_PROGRESS,
+       &RQF_MDS_HSM_CT_REGISTER,
+       &RQF_MDS_HSM_CT_UNREGISTER,
+       &RQF_MDS_HSM_STATE_GET,
+       &RQF_MDS_HSM_STATE_SET,
+       &RQF_MDS_HSM_ACTION,
+       &RQF_MDS_HSM_REQUEST,
+       &RQF_MDS_SWAP_LAYOUTS,
+       &RQF_UPDATE_OBJ,
+       &RQF_QC_CALLBACK,
+       &RQF_OST_CONNECT,
+       &RQF_OST_DISCONNECT,
+       &RQF_OST_QUOTACHECK,
+       &RQF_OST_QUOTACTL,
+       &RQF_OST_GETATTR,
+       &RQF_OST_SETATTR,
+       &RQF_OST_CREATE,
+       &RQF_OST_PUNCH,
+       &RQF_OST_SYNC,
+       &RQF_OST_DESTROY,
+       &RQF_OST_BRW_READ,
+       &RQF_OST_BRW_WRITE,
+       &RQF_OST_STATFS,
+       &RQF_OST_SET_GRANT_INFO,
+       &RQF_OST_GET_INFO_GENERIC,
+       &RQF_OST_GET_INFO_LAST_ID,
+       &RQF_OST_GET_INFO_LAST_FID,
+       &RQF_OST_SET_INFO_LAST_FID,
+       &RQF_OST_GET_INFO_FIEMAP,
+       &RQF_LDLM_ENQUEUE,
+       &RQF_LDLM_ENQUEUE_LVB,
+       &RQF_LDLM_CONVERT,
+       &RQF_LDLM_CANCEL,
+       &RQF_LDLM_CALLBACK,
+       &RQF_LDLM_CP_CALLBACK,
+       &RQF_LDLM_BL_CALLBACK,
+       &RQF_LDLM_GL_CALLBACK,
+       &RQF_LDLM_GL_DESC_CALLBACK,
+       &RQF_LDLM_INTENT,
+       &RQF_LDLM_INTENT_BASIC,
+       &RQF_LDLM_INTENT_LAYOUT,
+       &RQF_LDLM_INTENT_GETATTR,
+       &RQF_LDLM_INTENT_OPEN,
+       &RQF_LDLM_INTENT_CREATE,
+       &RQF_LDLM_INTENT_UNLINK,
+       &RQF_LDLM_INTENT_QUOTA,
+       &RQF_QUOTA_DQACQ,
+       &RQF_LOG_CANCEL,
+       &RQF_LLOG_ORIGIN_HANDLE_CREATE,
+       &RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+       &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+       &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+       &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+       &RQF_LLOG_ORIGIN_CONNECT
+};
+
+struct req_msg_field {
+       const __u32 rmf_flags;
+       const char  *rmf_name;
+       /**
+        * Field length. (-1) means "variable length".  If the
+        * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length,
+        * but the actual size must be a whole multiple of \a rmf_size.
+        */
+       const int   rmf_size;
+       void    (*rmf_swabber)(void *);
+       void    (*rmf_dumper)(void *);
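+       /*
+        * Per-format, per-location offset of this field, filled in by
+        * req_layout_init().  The stored value is offset + 1, so that zero
+        * marks format/field combinations that are never used (see
+        * __req_capsule_offset()).
+        */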
+       int      rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR];
+};
+
+enum rmf_flags {
+       /**
+        * The field is a string; it must be NUL-terminated.
+        */
+       RMF_F_STRING = 1 << 0,
+       /**
+        * The field's buffer size need not match the declared \a rmf_size.
+        */
+       RMF_F_NO_SIZE_CHECK = 1 << 1,
+       /**
+        * The field's buffer size must be a whole multiple of the declared \a
+        * rmf_size and the \a rmf_swabber function must work on the declared \a
+        * rmf_size worth of bytes.
+        */
+       RMF_F_STRUCT_ARRAY = 1 << 2
+};
+
+struct req_capsule;
+
+/*
+ * Request fields.
+ */
+#define DEFINE_MSGF(name, flags, size, swabber, dumper) {       \
+       .rmf_name    = (name),                            \
+       .rmf_flags   = (flags),                          \
+       .rmf_size    = (size),                            \
+       .rmf_swabber = (void (*)(void*))(swabber),            \
+       .rmf_dumper  = (void (*)(void*))(dumper)                \
+}
+
+struct req_msg_field RMF_GENERIC_DATA =
+       DEFINE_MSGF("generic_data", 0,
+                   -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GENERIC_DATA);
+
+struct req_msg_field RMF_MGS_TARGET_INFO =
+       DEFINE_MSGF("mgs_target_info", 0,
+                   sizeof(struct mgs_target_info),
+                   lustre_swab_mgs_target_info, NULL);
+EXPORT_SYMBOL(RMF_MGS_TARGET_INFO);
+
+struct req_msg_field RMF_MGS_SEND_PARAM =
+       DEFINE_MSGF("mgs_send_param", 0,
+                   sizeof(struct mgs_send_param),
+                   NULL, NULL);
+EXPORT_SYMBOL(RMF_MGS_SEND_PARAM);
+
+struct req_msg_field RMF_MGS_CONFIG_BODY =
+       DEFINE_MSGF("mgs_config_read request", 0,
+                   sizeof(struct mgs_config_body),
+                   lustre_swab_mgs_config_body, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY);
+
+struct req_msg_field RMF_MGS_CONFIG_RES =
+       DEFINE_MSGF("mgs_config_read reply ", 0,
+                   sizeof(struct mgs_config_res),
+                   lustre_swab_mgs_config_res, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_RES);
+
+struct req_msg_field RMF_U32 =
+       DEFINE_MSGF("generic u32", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_U32);
+
+struct req_msg_field RMF_SETINFO_VAL =
+       DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SETINFO_VAL);
+
+struct req_msg_field RMF_GETINFO_KEY =
+       DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_KEY);
+
+struct req_msg_field RMF_GETINFO_VALLEN =
+       DEFINE_MSGF("getinfo_vallen", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VALLEN);
+
+struct req_msg_field RMF_GETINFO_VAL =
+       DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VAL);
+
+struct req_msg_field RMF_SEQ_OPC =
+       DEFINE_MSGF("seq_query_opc", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_SEQ_OPC);
+
+struct req_msg_field RMF_SEQ_RANGE =
+       DEFINE_MSGF("seq_query_range", 0,
+                   sizeof(struct lu_seq_range),
+                   lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_SEQ_RANGE);
+
+struct req_msg_field RMF_FLD_OPC =
+       DEFINE_MSGF("fld_query_opc", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_FLD_OPC);
+
+struct req_msg_field RMF_FLD_MDFLD =
+       DEFINE_MSGF("fld_query_mdfld", 0,
+                   sizeof(struct lu_seq_range),
+                   lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_FLD_MDFLD);
+
+struct req_msg_field RMF_MDT_BODY =
+       DEFINE_MSGF("mdt_body", 0,
+                   sizeof(struct mdt_body), lustre_swab_mdt_body, NULL);
+EXPORT_SYMBOL(RMF_MDT_BODY);
+
+struct req_msg_field RMF_OBD_QUOTACTL =
+       DEFINE_MSGF("obd_quotactl", 0,
+                   sizeof(struct obd_quotactl),
+                   lustre_swab_obd_quotactl, NULL);
+EXPORT_SYMBOL(RMF_OBD_QUOTACTL);
+
+struct req_msg_field RMF_QUOTA_BODY =
+       DEFINE_MSGF("quota_body", 0,
+                   sizeof(struct quota_body), lustre_swab_quota_body, NULL);
+EXPORT_SYMBOL(RMF_QUOTA_BODY);
+
+struct req_msg_field RMF_MDT_EPOCH =
+       DEFINE_MSGF("mdt_ioepoch", 0,
+                   sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL);
+EXPORT_SYMBOL(RMF_MDT_EPOCH);
+
+struct req_msg_field RMF_PTLRPC_BODY =
+       DEFINE_MSGF("ptlrpc_body", 0,
+                   sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL);
+EXPORT_SYMBOL(RMF_PTLRPC_BODY);
+
+struct req_msg_field RMF_OBD_STATFS =
+       DEFINE_MSGF("obd_statfs", 0,
+                   sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL);
+EXPORT_SYMBOL(RMF_OBD_STATFS);
+
+struct req_msg_field RMF_SETINFO_KEY =
+       DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SETINFO_KEY);
+
+struct req_msg_field RMF_NAME =
+       DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_NAME);
+
+struct req_msg_field RMF_SYMTGT =
+       DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SYMTGT);
+
+struct req_msg_field RMF_TGTUUID =
+       DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+       NULL);
+EXPORT_SYMBOL(RMF_TGTUUID);
+
+struct req_msg_field RMF_CLUUID =
+       DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+       NULL);
+EXPORT_SYMBOL(RMF_CLUUID);
+
+struct req_msg_field RMF_STRING =
+       DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_STRING);
+
+struct req_msg_field RMF_LLOGD_BODY =
+       DEFINE_MSGF("llogd_body", 0,
+                   sizeof(struct llogd_body), lustre_swab_llogd_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_BODY);
+
+struct req_msg_field RMF_LLOG_LOG_HDR =
+       DEFINE_MSGF("llog_log_hdr", 0,
+                   sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL);
+EXPORT_SYMBOL(RMF_LLOG_LOG_HDR);
+
+struct req_msg_field RMF_LLOGD_CONN_BODY =
+       DEFINE_MSGF("llogd_conn_body", 0,
+                   sizeof(struct llogd_conn_body),
+                   lustre_swab_llogd_conn_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY);
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ *
+ * No swabbing needed because struct lustre_handle contains only a 64-bit cookie
+ * that the client does not interpret at all.
+ */
+struct req_msg_field RMF_CONN =
+       DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL);
+EXPORT_SYMBOL(RMF_CONN);
+
+struct req_msg_field RMF_CONNECT_DATA =
+       DEFINE_MSGF("cdata",
+                   RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */,
+#if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(2, 7, 50, 0)
+                   sizeof(struct obd_connect_data),
+#else
+/* For interoperability with 1.8 and 2.0 clients/servers.
+ * The RPC verification code allows larger RPC buffers, but not
+ * smaller buffers.  Until we no longer need to keep compatibility
+ * with older servers/clients we can only check that the buffer
+ * size is at least as large as obd_connect_data_v1.  That is not
+ * in itself harmful, since the chance of just corrupting this
+ * field is low.  See JIRA LU-16 for details. */
+                   sizeof(struct obd_connect_data_v1),
+#endif
+                   lustre_swab_connect, NULL);
+EXPORT_SYMBOL(RMF_CONNECT_DATA);
+
+struct req_msg_field RMF_DLM_REQ =
+       DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */,
+                   sizeof(struct ldlm_request),
+                   lustre_swab_ldlm_request, NULL);
+EXPORT_SYMBOL(RMF_DLM_REQ);
+
+struct req_msg_field RMF_DLM_REP =
+       DEFINE_MSGF("dlm_rep", 0,
+                   sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL);
+EXPORT_SYMBOL(RMF_DLM_REP);
+
+struct req_msg_field RMF_LDLM_INTENT =
+       DEFINE_MSGF("ldlm_intent", 0,
+                   sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL);
+EXPORT_SYMBOL(RMF_LDLM_INTENT);
+
+struct req_msg_field RMF_DLM_LVB =
+       DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_DLM_LVB);
+
+struct req_msg_field RMF_DLM_GL_DESC =
+       DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc),
+                   lustre_swab_gl_desc, NULL);
+EXPORT_SYMBOL(RMF_DLM_GL_DESC);
+
+struct req_msg_field RMF_MDT_MD =
+       DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_MDT_MD);
+
+struct req_msg_field RMF_REC_REINT =
+       DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint),
+                   lustre_swab_mdt_rec_reint, NULL);
+EXPORT_SYMBOL(RMF_REC_REINT);
+
+/* FIXME: this length should be defined as a macro */
+struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1,
+                                                   NULL, NULL);
+EXPORT_SYMBOL(RMF_EADATA);
+
+struct req_msg_field RMF_ACL =
+       DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK,
+                   LUSTRE_POSIX_ACL_MAX_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_ACL);
+
+/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */
+struct req_msg_field RMF_LOGCOOKIES =
+       DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */,
+                   sizeof(struct llog_cookie), NULL, NULL);
+EXPORT_SYMBOL(RMF_LOGCOOKIES);
+
+struct req_msg_field RMF_CAPA1 =
+       DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+                   lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA1);
+
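+/*
+ * A second capability field with the same layout as RMF_CAPA1, used by
+ * operations that reference two objects at once (link, rename, open, ...);
+ * see the mds_reint_*_client formats above.
+ */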
+struct req_msg_field RMF_CAPA2 =
+       DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+                   lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA2);
+
+struct req_msg_field RMF_LAYOUT_INTENT =
+       DEFINE_MSGF("layout_intent", 0,
+                   sizeof(struct layout_intent), lustre_swab_layout_intent,
+                   NULL);
+EXPORT_SYMBOL(RMF_LAYOUT_INTENT);
+
+/*
+ * OST request field.
+ */
+struct req_msg_field RMF_OST_BODY =
+       DEFINE_MSGF("ost_body", 0,
+                   sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body);
+EXPORT_SYMBOL(RMF_OST_BODY);
+
+struct req_msg_field RMF_OBD_IOOBJ =
+       DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY,
+                   sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo);
+EXPORT_SYMBOL(RMF_OBD_IOOBJ);
+
+struct req_msg_field RMF_NIOBUF_REMOTE =
+       DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY,
+                   sizeof(struct niobuf_remote), lustre_swab_niobuf_remote,
+                   dump_rniobuf);
+EXPORT_SYMBOL(RMF_NIOBUF_REMOTE);
+
+struct req_msg_field RMF_RCS =
+       DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32),
+                   lustre_swab_generic_32s, dump_rcs);
+EXPORT_SYMBOL(RMF_RCS);
+
+struct req_msg_field RMF_OBD_ID =
+       DEFINE_MSGF("obd_id", 0,
+                   sizeof(obd_id), lustre_swab_ost_last_id, NULL);
+EXPORT_SYMBOL(RMF_OBD_ID);
+
+struct req_msg_field RMF_FID =
+       DEFINE_MSGF("fid", 0,
+                   sizeof(struct lu_fid), lustre_swab_lu_fid, NULL);
+EXPORT_SYMBOL(RMF_FID);
+
+struct req_msg_field RMF_OST_ID =
+       DEFINE_MSGF("ost_id", 0,
+                   sizeof(struct ost_id), lustre_swab_ost_id, NULL);
+EXPORT_SYMBOL(RMF_OST_ID);
+
+struct req_msg_field RMF_FIEMAP_KEY =
+       DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key),
+                   lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_KEY);
+
+struct req_msg_field RMF_FIEMAP_VAL =
+       DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_VAL);
+
+struct req_msg_field RMF_IDX_INFO =
+       DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info),
+                   lustre_swab_idx_info, NULL);
+EXPORT_SYMBOL(RMF_IDX_INFO);
+
+struct req_msg_field RMF_HSM_USER_STATE =
+       DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state),
+                   lustre_swab_hsm_user_state, NULL);
+EXPORT_SYMBOL(RMF_HSM_USER_STATE);
+
+struct req_msg_field RMF_HSM_STATE_SET =
+       DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set),
+                   lustre_swab_hsm_state_set, NULL);
+EXPORT_SYMBOL(RMF_HSM_STATE_SET);
+
+struct req_msg_field RMF_MDS_HSM_PROGRESS =
+       DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel),
+                   lustre_swab_hsm_progress_kernel, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS);
+
+struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION =
+       DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action),
+                   lustre_swab_hsm_current_action, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION);
+
+struct req_msg_field RMF_MDS_HSM_USER_ITEM =
+       DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY,
+                   sizeof(struct hsm_user_item), lustre_swab_hsm_user_item,
+                   NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM);
+
+struct req_msg_field RMF_MDS_HSM_ARCHIVE =
+       DEFINE_MSGF("hsm_archive", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE);
+
+struct req_msg_field RMF_MDS_HSM_REQUEST =
+       DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request),
+                   lustre_swab_hsm_request, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST);
+
+struct req_msg_field RMF_UPDATE = DEFINE_MSGF("update", 0, -1,
+                                             lustre_swab_update_buf, NULL);
+EXPORT_SYMBOL(RMF_UPDATE);
+
+struct req_msg_field RMF_UPDATE_REPLY = DEFINE_MSGF("update_reply", 0, -1,
+                                               lustre_swab_update_reply_buf,
+                                                   NULL);
+EXPORT_SYMBOL(RMF_UPDATE_REPLY);
+
+struct req_msg_field RMF_SWAP_LAYOUTS =
+       DEFINE_MSGF("swap_layouts", 0, sizeof(struct  mdc_swap_layouts),
+                   lustre_swab_swap_layouts, NULL);
+EXPORT_SYMBOL(RMF_SWAP_LAYOUTS);
+
+/*
+ * Request formats.
+ */
+
+struct req_format {
+       const char *rf_name;
+       int      rf_idx;
+       struct {
+               int                       nr;
+               const struct req_msg_field **d;
+       } rf_fields[RCL_NR];
+};
+
+#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) {    \
+       .rf_name   = name,                                            \
+       .rf_fields = {                                            \
+               [RCL_CLIENT] = {                                        \
+                       .nr = client_nr,                                \
+                       .d  = client                                \
+               },                                                    \
+               [RCL_SERVER] = {                                        \
+                       .nr = server_nr,                                \
+                       .d  = server                                \
+               }                                                      \
+       }                                                              \
+}
+
+#define DEFINE_REQ_FMT0(name, client, server)                            \
+DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server))
+
+struct req_format RQF_OBD_PING =
+       DEFINE_REQ_FMT0("OBD_PING", empty, empty);
+EXPORT_SYMBOL(RQF_OBD_PING);
+
+struct req_format RQF_OBD_SET_INFO =
+       DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty);
+EXPORT_SYMBOL(RQF_OBD_SET_INFO);
+
+/* Read index file through the network */
+struct req_format RQF_OBD_IDX_READ =
+       DEFINE_REQ_FMT0("OBD_IDX_READ",
+                       obd_idx_read_client, obd_idx_read_server);
+EXPORT_SYMBOL(RQF_OBD_IDX_READ);
+
+struct req_format RQF_SEC_CTX =
+       DEFINE_REQ_FMT0("SEC_CTX", empty, empty);
+EXPORT_SYMBOL(RQF_SEC_CTX);
+
+struct req_format RQF_MGS_TARGET_REG =
+       DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only,
+                        mgs_target_info_only);
+EXPORT_SYMBOL(RQF_MGS_TARGET_REG);
+
+struct req_format RQF_MGS_SET_INFO =
+       DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info,
+                        mgs_set_info);
+EXPORT_SYMBOL(RQF_MGS_SET_INFO);
+
+struct req_format RQF_MGS_CONFIG_READ =
+       DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client,
+                        mgs_config_read_server);
+EXPORT_SYMBOL(RQF_MGS_CONFIG_READ);
+
+struct req_format RQF_SEQ_QUERY =
+       DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server);
+EXPORT_SYMBOL(RQF_SEQ_QUERY);
+
+struct req_format RQF_FLD_QUERY =
+       DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server);
+EXPORT_SYMBOL(RQF_FLD_QUERY);
+
+struct req_format RQF_LOG_CANCEL =
+       DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty);
+EXPORT_SYMBOL(RQF_LOG_CANCEL);
+
+struct req_format RQF_MDS_QUOTACHECK =
+       DEFINE_REQ_FMT0("MDS_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_MDS_QUOTACHECK);
+
+struct req_format RQF_OST_QUOTACHECK =
+       DEFINE_REQ_FMT0("OST_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_OST_QUOTACHECK);
+
+struct req_format RQF_MDS_QUOTACTL =
+       DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_MDS_QUOTACTL);
+
+struct req_format RQF_OST_QUOTACTL =
+       DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_OST_QUOTACTL);
+
+struct req_format RQF_QC_CALLBACK =
+       DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_QC_CALLBACK);
+
+struct req_format RQF_QUOTA_DQACQ =
+       DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only);
+EXPORT_SYMBOL(RQF_QUOTA_DQACQ);
+
+struct req_format RQF_LDLM_INTENT_QUOTA =
+       DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA",
+                       ldlm_intent_quota_client,
+                       ldlm_intent_quota_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA);
+
+struct req_format RQF_MDS_GETSTATUS =
+       DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_GETSTATUS);
+
+struct req_format RQF_MDS_STATFS =
+       DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS);
+
+struct req_format RQF_MDS_SYNC =
+       DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_SYNC);
+
+struct req_format RQF_MDS_GETATTR =
+       DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR);
+
+struct req_format RQF_MDS_GETXATTR =
+       DEFINE_REQ_FMT0("MDS_GETXATTR",
+                       mds_getxattr_client, mds_getxattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETXATTR);
+
+struct req_format RQF_MDS_GETATTR_NAME =
+       DEFINE_REQ_FMT0("MDS_GETATTR_NAME",
+                       mds_getattr_name_client, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME);
+
+struct req_format RQF_MDS_REINT =
+       DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT);
+
+struct req_format RQF_MDS_REINT_CREATE =
+       DEFINE_REQ_FMT0("MDS_REINT_CREATE",
+                       mds_reint_create_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE);
+
+struct req_format RQF_MDS_REINT_CREATE_RMT_ACL =
+       DEFINE_REQ_FMT0("MDS_REINT_CREATE_RMT_ACL",
+                       mds_reint_create_rmt_acl_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_RMT_ACL);
+
+struct req_format RQF_MDS_REINT_CREATE_SLAVE =
+       DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA",
+                       mds_reint_create_slave_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE);
+
+struct req_format RQF_MDS_REINT_CREATE_SYM =
+       DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM",
+                       mds_reint_create_sym_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM);
+
+struct req_format RQF_MDS_REINT_OPEN =
+       DEFINE_REQ_FMT0("MDS_REINT_OPEN",
+                       mds_reint_open_client, mds_reint_open_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_OPEN);
+
+struct req_format RQF_MDS_REINT_UNLINK =
+       DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client,
+                       mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK);
+
+struct req_format RQF_MDS_REINT_LINK =
+       DEFINE_REQ_FMT0("MDS_REINT_LINK",
+                       mds_reint_link_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_LINK);
+
+struct req_format RQF_MDS_REINT_RENAME =
+       DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client,
+                       mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_RENAME);
+
+struct req_format RQF_MDS_REINT_SETATTR =
+       DEFINE_REQ_FMT0("MDS_REINT_SETATTR",
+                       mds_reint_setattr_client, mds_setattr_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR);
+
+struct req_format RQF_MDS_REINT_SETXATTR =
+       DEFINE_REQ_FMT0("MDS_REINT_SETXATTR",
+                       mds_reint_setxattr_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR);
+
+struct req_format RQF_MDS_CONNECT =
+       DEFINE_REQ_FMT0("MDS_CONNECT",
+                       obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_MDS_CONNECT);
+
+struct req_format RQF_MDS_DISCONNECT =
+       DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_MDS_DISCONNECT);
+
+struct req_format RQF_MDS_GET_INFO =
+       DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client,
+                       mds_getinfo_server);
+EXPORT_SYMBOL(RQF_MDS_GET_INFO);
+
+struct req_format RQF_UPDATE_OBJ =
+       DEFINE_REQ_FMT0("OBJECT_UPDATE_OBJ", mds_update_client,
+                       mds_update_server);
+EXPORT_SYMBOL(RQF_UPDATE_OBJ);
+
+struct req_format RQF_LDLM_ENQUEUE =
+       DEFINE_REQ_FMT0("LDLM_ENQUEUE",
+                       ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE);
+
+struct req_format RQF_LDLM_ENQUEUE_LVB =
+       DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB",
+                       ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB);
+
+struct req_format RQF_LDLM_CONVERT =
+       DEFINE_REQ_FMT0("LDLM_CONVERT",
+                       ldlm_enqueue_client, ldlm_enqueue_server);
+EXPORT_SYMBOL(RQF_LDLM_CONVERT);
+
+struct req_format RQF_LDLM_CANCEL =
+       DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CANCEL);
+
+struct req_format RQF_LDLM_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CALLBACK);
+
+struct req_format RQF_LDLM_CP_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK);
+
+struct req_format RQF_LDLM_BL_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client,
+                       ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_DESC_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client,
+                       ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK);
+
+struct req_format RQF_LDLM_INTENT_BASIC =
+       DEFINE_REQ_FMT0("LDLM_INTENT_BASIC",
+                       ldlm_intent_basic_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC);
+
+struct req_format RQF_LDLM_INTENT =
+       DEFINE_REQ_FMT0("LDLM_INTENT",
+                       ldlm_intent_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT);
+
+struct req_format RQF_LDLM_INTENT_LAYOUT =
+       DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ",
+                       ldlm_intent_layout_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT);
+
+struct req_format RQF_LDLM_INTENT_GETATTR =
+       DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR",
+                       ldlm_intent_getattr_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR);
+
+struct req_format RQF_LDLM_INTENT_OPEN =
+       DEFINE_REQ_FMT0("LDLM_INTENT_OPEN",
+                       ldlm_intent_open_client, ldlm_intent_open_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN);
+
+struct req_format RQF_LDLM_INTENT_CREATE =
+       DEFINE_REQ_FMT0("LDLM_INTENT_CREATE",
+                       ldlm_intent_create_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE);
+
+struct req_format RQF_LDLM_INTENT_UNLINK =
+       DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK",
+                       ldlm_intent_unlink_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK);
+
+struct req_format RQF_MDS_CLOSE =
+       DEFINE_REQ_FMT0("MDS_CLOSE",
+                       mdt_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_CLOSE);
+
+struct req_format RQF_MDS_PIN =
+       DEFINE_REQ_FMT0("MDS_PIN",
+                       mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_PIN);
+
+struct req_format RQF_MDS_UNPIN =
+       DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty);
+EXPORT_SYMBOL(RQF_MDS_UNPIN);
+
+struct req_format RQF_MDS_DONE_WRITING =
+       DEFINE_REQ_FMT0("MDS_DONE_WRITING",
+                       mdt_close_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_DONE_WRITING);
+
+struct req_format RQF_MDS_READPAGE =
+       DEFINE_REQ_FMT0("MDS_READPAGE",
+                       mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_READPAGE);
+
+struct req_format RQF_MDS_HSM_ACTION =
+       DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_ACTION);
+
+struct req_format RQF_MDS_HSM_PROGRESS =
+       DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS);
+
+struct req_format RQF_MDS_HSM_CT_REGISTER =
+       DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER);
+
+struct req_format RQF_MDS_HSM_CT_UNREGISTER =
+       DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER);
+
+struct req_format RQF_MDS_HSM_STATE_GET =
+       DEFINE_REQ_FMT0("MDS_HSM_STATE_GET",
+                       mdt_body_capa, mdt_hsm_state_get_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET);
+
+struct req_format RQF_MDS_HSM_STATE_SET =
+       DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET);
+
+struct req_format RQF_MDS_HSM_REQUEST =
+       DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST);
+
+struct req_format RQF_MDS_SWAP_LAYOUTS =
+       DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS",
+                       mdt_swap_layouts, empty);
+EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS);
+
+/* This is for split */
+struct req_format RQF_MDS_WRITEPAGE =
+       DEFINE_REQ_FMT0("MDS_WRITEPAGE",
+                       mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_WRITEPAGE);
+
+struct req_format RQF_MDS_IS_SUBDIR =
+       DEFINE_REQ_FMT0("MDS_IS_SUBDIR",
+                       mdt_body_only, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE",
+                       llog_origin_handle_create_client, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY",
+                       llogd_body_only, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK",
+                       llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK",
+                       llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER",
+                       llogd_body_only, llog_log_hdr_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+
+struct req_format RQF_LLOG_ORIGIN_CONNECT =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT);
+
+struct req_format RQF_OST_CONNECT =
+       DEFINE_REQ_FMT0("OST_CONNECT",
+                       obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_OST_CONNECT);
+
+struct req_format RQF_OST_DISCONNECT =
+       DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_OST_DISCONNECT);
+
+struct req_format RQF_OST_GETATTR =
+       DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_GETATTR);
+
+struct req_format RQF_OST_SETATTR =
+       DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SETATTR);
+
+struct req_format RQF_OST_CREATE =
+       DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_CREATE);
+
+struct req_format RQF_OST_PUNCH =
+       DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_PUNCH);
+
+struct req_format RQF_OST_SYNC =
+       DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SYNC);
+
+struct req_format RQF_OST_DESTROY =
+       DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_DESTROY);
+
+struct req_format RQF_OST_BRW_READ =
+       DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server);
+EXPORT_SYMBOL(RQF_OST_BRW_READ);
+
+struct req_format RQF_OST_BRW_WRITE =
+       DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server);
+EXPORT_SYMBOL(RQF_OST_BRW_WRITE);
+
+struct req_format RQF_OST_STATFS =
+       DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_OST_STATFS);
+
+struct req_format RQF_OST_SET_GRANT_INFO =
+       DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client,
+                        ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO);
+
+struct req_format RQF_OST_GET_INFO_GENERIC =
+       DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client,
+                                       ost_get_info_generic_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_GENERIC);
+
+struct req_format RQF_OST_GET_INFO_LAST_ID =
+       DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client,
+                                               ost_get_last_id_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID);
+
+struct req_format RQF_OST_GET_INFO_LAST_FID =
+       DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", obd_set_info_client,
+                                                ost_get_last_fid_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID);
+
+struct req_format RQF_OST_SET_INFO_LAST_FID =
+       DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client,
+                                                empty);
+EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID);
+
+struct req_format RQF_OST_GET_INFO_FIEMAP =
+       DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client,
+                                              ost_get_fiemap_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP);
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* Convenience macro: field descriptor j for location i of format fmt */
+#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)]
+
+/**
+ * Initializes the capsule abstraction by computing and setting the \a rf_idx
+ * field of RQFs and the \a rmf_offset field of RMFs.
+ */
+int req_layout_init(void)
+{
+       int i;
+       int j;
+       int k;
+       struct req_format *rf = NULL;
+
+       for (i = 0; i < ARRAY_SIZE(req_formats); ++i) {
+               rf = req_formats[i];
+               rf->rf_idx = i;
+               for (j = 0; j < RCL_NR; ++j) {
+                       LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR);
+                       for (k = 0; k < rf->rf_fields[j].nr; ++k) {
+                               struct req_msg_field *field;
+
+                               field = (typeof(field))rf->rf_fields[j].d[k];
+                               LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY)
+                                       || field->rmf_size > 0);
+                               LASSERT(field->rmf_offset[i][j] == 0);
+                               /*
+                                * k + 1 to detect unused format/field
+                                * combinations.
+                                */
+                               field->rmf_offset[i][j] = k + 1;
+                       }
+               }
+       }
+       return 0;
+}
+EXPORT_SYMBOL(req_layout_init);
+
+void req_layout_fini(void)
+{
+}
+EXPORT_SYMBOL(req_layout_fini);
+
+/**
+ * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1.
+ *
+ * Actual/expected field sizes are set elsewhere in functions in this file:
+ * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and
+ * req_capsule_msg_size().  The \a rc_area information is used by
+ * ptlrpc_request_set_replen().
+ */
+void req_capsule_init_area(struct req_capsule *pill)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) {
+               pill->rc_area[RCL_CLIENT][i] = -1;
+               pill->rc_area[RCL_SERVER][i] = -1;
+       }
+}
+EXPORT_SYMBOL(req_capsule_init_area);
+
+/**
+ * Initialize a pill.
+ *
+ * The \a location indicates whether the caller is executing on the client side
+ * (RCL_CLIENT) or server side (RCL_SERVER).
+ */
+void req_capsule_init(struct req_capsule *pill,
+                     struct ptlrpc_request *req,
+                     enum req_location location)
+{
+       LASSERT(location == RCL_SERVER || location == RCL_CLIENT);
+
+       /*
+        * Today all capsules are embedded in ptlrpc_request structs,
+        * but just in case that ever isn't the case, we don't reach
+        * into req unless req != NULL and pill is the one embedded in
+        * the req.
+        *
+        * The req->rq_pill_init flag makes it safe to initialize a pill
+        * twice, which might happen in the OST paths as a result of the
+        * high-priority RPC queue getting peeked at before ost_handle()
+        * handles an OST RPC.
+        */
+       if (req != NULL && pill == &req->rq_pill && req->rq_pill_init)
+               return;
+
+       memset(pill, 0, sizeof(*pill));
+       pill->rc_req = req;
+       pill->rc_loc = location;
+       req_capsule_init_area(pill);
+
+       if (req != NULL && pill == &req->rq_pill)
+               req->rq_pill_init = 1;
+}
+EXPORT_SYMBOL(req_capsule_init);
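+
+/*
+ * Illustrative client-side flow (a sketch, not code from this file): after a
+ * request has been allocated, its embedded pill is initialized, given a
+ * format, and request buffers are then fetched per field:
+ *
+ *     req_capsule_init(&req->rq_pill, req, RCL_CLIENT);
+ *     req_capsule_set(&req->rq_pill, &RQF_MDS_GETATTR);
+ *     body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+ */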
+
+void req_capsule_fini(struct req_capsule *pill)
+{
+}
+EXPORT_SYMBOL(req_capsule_fini);
+
+static int __req_format_is_sane(const struct req_format *fmt)
+{
+       return
+               0 <= fmt->rf_idx && fmt->rf_idx < ARRAY_SIZE(req_formats) &&
+               req_formats[fmt->rf_idx] == fmt;
+}
+
+static struct lustre_msg *__req_msg(const struct req_capsule *pill,
+                                   enum req_location loc)
+{
+       struct ptlrpc_request *req;
+
+       req = pill->rc_req;
+       return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg;
+}
+
+/**
+ * Set the format (\a fmt) of a \a pill; format changes are not allowed here
+ * (see req_capsule_extend()).
+ */
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt)
+{
+       LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt);
+       LASSERT(__req_format_is_sane(fmt));
+
+       pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_set);
+
+/**
+ * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in
+ * yet.
+ *
+ * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of
+ * variable-sized fields.  The field sizes come from the declared \a rmf_size
+ * field of a \a pill's \a rc_fmt's RMFs.
+ */
+int req_capsule_filled_sizes(struct req_capsule *pill,
+                          enum req_location loc)
+{
+       const struct req_format *fmt = pill->rc_fmt;
+       int                   i;
+
+       LASSERT(fmt != NULL);
+
+       for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+               if (pill->rc_area[loc][i] == -1) {
+                       pill->rc_area[loc][i] =
+                                           fmt->rf_fields[loc].d[i]->rmf_size;
+                       if (pill->rc_area[loc][i] == -1) {
+                               /*
+                                * Skip the following fields.
+                                *
+                                * If this LASSERT() trips then you're missing a
+                                * call to req_capsule_set_size().
+                                */
+                               LASSERT(loc != RCL_SERVER);
+                               break;
+                       }
+               }
+       }
+       return i;
+}
+EXPORT_SYMBOL(req_capsule_filled_sizes);
+
+/**
+ * Capsule equivalent of lustre_pack_request() and lustre_pack_reply().
+ *
+ * This function uses the \a pill's \a rc_area as filled in by
+ * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by
+ * this function).
+ */
+int req_capsule_server_pack(struct req_capsule *pill)
+{
+       const struct req_format *fmt;
+       int                   count;
+       int                   rc;
+
+       LASSERT(pill->rc_loc == RCL_SERVER);
+       fmt = pill->rc_fmt;
+       LASSERT(fmt != NULL);
+
+       count = req_capsule_filled_sizes(pill, RCL_SERVER);
+       rc = lustre_pack_reply(pill->rc_req, count,
+                              pill->rc_area[RCL_SERVER], NULL);
+       if (rc != 0) {
+               DEBUG_REQ(D_ERROR, pill->rc_req,
+                      "Cannot pack %d fields in format `%s': ",
+                      count, fmt->rf_name);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(req_capsule_server_pack);
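+
+/*
+ * Sketch of the server-side pattern this enables (illustrative only; md_size
+ * is a hypothetical length computed by the caller):
+ *
+ *     req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER, md_size);
+ *     rc = req_capsule_server_pack(pill);
+ *     if (rc == 0)
+ *             md = req_capsule_server_get(pill, &RMF_MDT_MD);
+ */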
+
+/**
+ * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill
+ * corresponding to the given RMF (\a field).
+ */
+static int __req_capsule_offset(const struct req_capsule *pill,
+                               const struct req_msg_field *field,
+                               enum req_location loc)
+{
+       int offset;
+
+       offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+       LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n",
+                           pill->rc_fmt->rf_name,
+                           field->rmf_name, offset, loc);
+       offset--;
+
+       LASSERT(0 <= offset && offset < REQ_MAX_FIELD_NR);
+       return offset;
+}
+
+/**
+ * Helper for __req_capsule_get(); swabs value / array of values and/or dumps
+ * them if desired.
+ */
+static void
+swabber_dumper_helper(struct req_capsule *pill,
+                     const struct req_msg_field *field,
+                     enum req_location loc,
+                     int offset,
+                     void *value, int len, int dump, void (*swabber)( void *))
+{
+       void    *p;
+       int     i;
+       int     n;
+       int     do_swab;
+       int     inout = loc == RCL_CLIENT;
+
+       swabber = swabber ?: field->rmf_swabber;
+
+       if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) &&
+           swabber != NULL && value != NULL)
+               do_swab = 1;
+       else
+               do_swab = 0;
+
+       if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) {
+               if (dump && field->rmf_dumper) {
+                       CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n",
+                              do_swab ? "unswabbed " : "", field->rmf_name);
+                       field->rmf_dumper(value);
+               }
+               if (!do_swab)
+                       return;
+               swabber(value);
+               ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+               if (dump) {
+                       CDEBUG(D_RPCTRACE, "Dump of swabbed field %s "
+                              "follows\n", field->rmf_name);
+                       field->rmf_dumper(value);
+               }
+
+               return;
+       }
+
+       /*
+        * We're swabbing an array; swabber() swabs a single array element, so
+        * swab every element.
+        */
+       LASSERT((len % field->rmf_size) == 0);
+       for (p = value, i = 0, n = len / field->rmf_size;
+            i < n;
+            i++, p += field->rmf_size) {
+               if (dump && field->rmf_dumper) {
+                       CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, "
+                              "element %d follows\n",
+                              do_swab ? "unswabbed " : "", field->rmf_name, i);
+                       field->rmf_dumper(p);
+               }
+               if (!do_swab)
+                       continue;
+               swabber(p);
+               if (dump && field->rmf_dumper) {
+                       CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, "
+                              "element %d follows\n", field->rmf_name, i);
+                       field->rmf_dumper(p);
+               }
+       }
+       if (do_swab)
+               ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+}
+
+/**
+ * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill
+ * corresponding to the given RMF (\a field).
+ *
+ * The buffer will be swabbed using the given \a swabber.  If \a swabber == NULL
+ * then the \a rmf_swabber from the RMF will be used.  Soon there will be no
+ * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then
+ * be removed.  Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each
+ * element of the array swabbed.
+ */
+static void *__req_capsule_get(struct req_capsule *pill,
+                              const struct req_msg_field *field,
+                              enum req_location loc,
+                              void (*swabber)( void *),
+                              int dump)
+{
+       const struct req_format *fmt;
+       struct lustre_msg       *msg;
+       void                *value;
+       int                   len;
+       int                   offset;
+
+       void *(*getter)(struct lustre_msg *m, int n, int minlen);
+
+       static const char *rcl_names[RCL_NR] = {
+               [RCL_CLIENT] = "client",
+               [RCL_SERVER] = "server"
+       };
+
+       LASSERT(pill != NULL);
+       LASSERT(pill != LP_POISON);
+       fmt = pill->rc_fmt;
+       LASSERT(fmt != NULL);
+       LASSERT(fmt != LP_POISON);
+       LASSERT(__req_format_is_sane(fmt));
+
+       offset = __req_capsule_offset(pill, field, loc);
+
+       msg = __req_msg(pill, loc);
+       LASSERT(msg != NULL);
+
+       getter = (field->rmf_flags & RMF_F_STRING) ?
+               (typeof(getter))lustre_msg_string : lustre_msg_buf;
+
+       if (field->rmf_flags & RMF_F_STRUCT_ARRAY) {
+               /*
+                * We've already asserted that field->rmf_size > 0 in
+                * req_layout_init().
+                */
+               len = lustre_msg_buflen(msg, offset);
+               if ((len % field->rmf_size) != 0) {
+                       CERROR("%s: array field size mismatch "
+                              "%d modulo %d != 0 (%d)\n",
+                              field->rmf_name, len, field->rmf_size, loc);
+                       return NULL;
+               }
+       } else if (pill->rc_area[loc][offset] != -1) {
+               len = pill->rc_area[loc][offset];
+       } else {
+               len = max(field->rmf_size, 0);
+       }
+       value = getter(msg, offset, len);
+
+       if (value == NULL) {
+               DEBUG_REQ(D_ERROR, pill->rc_req,
+                         "Wrong buffer for field `%s' (%d of %d) "
+                         "in format `%s': %d vs. %d (%s)\n",
+                         field->rmf_name, offset, lustre_msg_bufcount(msg),
+                         fmt->rf_name, lustre_msg_buflen(msg, offset), len,
+                         rcl_names[loc]);
+       } else {
+               swabber_dumper_helper(pill, field, loc, offset, value, len,
+                                     dump, swabber);
+       }
+
+       return value;
+}
+
+/**
+ * Dump a request and/or reply
+ */
+void __req_capsule_dump(struct req_capsule *pill, enum req_location loc)
+{
+       const struct    req_format *fmt;
+       const struct    req_msg_field *field;
+       int          len;
+       int          i;
+
+       fmt = pill->rc_fmt;
+
+       DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n");
+       for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+               field = FMT_FIELD(fmt, loc, i);
+               if (field->rmf_dumper == NULL) {
+                       /*
+                        * FIXME Add a default hex dumper for fields that don't
+                        * have a specific dumper
+                        */
+                       len = req_capsule_get_size(pill, field, loc);
+                       CDEBUG(D_RPCTRACE, "Field %s has no dumper function;"
+                              "field size is %d\n", field->rmf_name, len);
+               } else {
+                       /* It's the dumping side-effect that we're interested in */
+                       (void) __req_capsule_get(pill, field, loc, NULL, 1);
+               }
+       }
+       CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n");
+}
+
+/**
+ * Dump a request.
+ */
+void req_capsule_client_dump(struct req_capsule *pill)
+{
+       __req_capsule_dump(pill, RCL_CLIENT);
+}
+EXPORT_SYMBOL(req_capsule_client_dump);
+
+/**
+ * Dump a reply
+ */
+void req_capsule_server_dump(struct req_capsule *pill)
+{
+       __req_capsule_dump(pill, RCL_SERVER);
+}
+EXPORT_SYMBOL(req_capsule_server_dump);
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ */
+void *req_capsule_client_get(struct req_capsule *pill,
+                            const struct req_msg_field *field)
+{
+       return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_get);
+
+/**
+ * Same as req_capsule_client_get(), but with a \a swabber argument.
+ *
+ * Currently unused; will be removed when req_capsule_server_swab_get() is
+ * unused too.
+ */
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 void *swabber)
+{
+       return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_client_get().
+ *
+ * First the \a pill's request \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len.  Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field,
+                                  int len)
+{
+       req_capsule_set_size(pill, field, RCL_CLIENT, len);
+       return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_sized_get);
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ */
+void *req_capsule_server_get(struct req_capsule *pill,
+                            const struct req_msg_field *field)
+{
+       return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_get);
+
+/**
+ * Same as req_capsule_server_get(), but with a \a swabber argument.
+ *
+ * Ideally all swabbing should be done pursuant to RMF definitions, with no
+ * swabbing done outside this capsule abstraction.
+ */
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 void *swabber)
+{
+       return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_server_get().
+ *
+ * First the \a pill's reply \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len.  Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field,
+                                  int len)
+{
+       req_capsule_set_size(pill, field, RCL_SERVER, len);
+       return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_get);
+
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+                                       const struct req_msg_field *field,
+                                       int len, void *swabber)
+{
+       req_capsule_set_size(pill, field, RCL_SERVER, len);
+       return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_swab_get);
+
+/**
+ * Returns the buffer of a \a pill corresponding to the given \a field from the
+ * request (if the caller is executing on the server-side) or reply (if the
+ * caller is executing on the client-side).
+ *
+ * This function is convenient for use in code that could be executed on the
+ * client and server alike.
+ */
+const void *req_capsule_other_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field)
+{
+       return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_other_get);
+
+/**
+ * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a
+ * field of the given \a pill.
+ *
+ * This function must be used when constructing variable sized fields of a
+ * request or reply.
+ */
+void req_capsule_set_size(struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc, int size)
+{
+       LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+       if ((size != field->rmf_size) &&
+           (field->rmf_size != -1) &&
+           !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) &&
+           (size > 0)) {
+               if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+                   (size % field->rmf_size != 0)) {
+                       CERROR("%s: array field size mismatch "
+                              "%d %% %d != 0 (%d)\n",
+                              field->rmf_name, size, field->rmf_size, loc);
+                       LBUG();
+               } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+                   size < field->rmf_size) {
+                       CERROR("%s: field size mismatch %d != %d (%d)\n",
+                              field->rmf_name, size, field->rmf_size, loc);
+                       LBUG();
+               }
+       }
+
+       pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
+}
+EXPORT_SYMBOL(req_capsule_set_size);
+
+/**
+ * Return the actual PTLRPC buffer length of a request or reply (\a loc)
+ * for the given \a pill's given \a field.
+ *
+ * NB: this function is not the counterpart of req_capsule_set_size(), which
+ * sets the size in pill.rc_area[loc][offset]; this function instead returns
+ * the message buflen[offset], so perhaps it should have a different name.
+ */
+int req_capsule_get_size(const struct req_capsule *pill,
+                        const struct req_msg_field *field,
+                        enum req_location loc)
+{
+       LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+       return lustre_msg_buflen(__req_msg(pill, loc),
+                                __req_capsule_offset(pill, field, loc));
+}
+EXPORT_SYMBOL(req_capsule_get_size);
+
+/**
+ * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the
+ * given \a pill's request or reply (\a loc) given the field size recorded in
+ * the \a pill's rc_area.
+ *
+ * See also req_capsule_set_size().
+ */
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc)
+{
+       return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic,
+                              pill->rc_fmt->rf_fields[loc].nr,
+                              pill->rc_area[loc]);
+}
+
+/**
+ * While req_capsule_msg_size() computes the size of a PTLRPC request or reply
+ * (\a loc) given a \a pill's \a rc_area, this function computes the size of a
+ * PTLRPC request or reply given only an RQF (\a fmt).
+ *
+ * This function should not be used for formats which contain variable size
+ * fields.
+ */
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+                        enum req_location loc)
+{
+       int size, i = 0;
+
+       /*
+        * This function should probably LASSERT() that fmt has no fields with
+        * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many
+        * elements in the array there will ultimately be, but then, we could
+        * assume that there will be at least one element, and that's just what
+        * we do.
+        */
+       size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr);
+       if (size < 0)
+               return size;
+
+       for (; i < fmt->rf_fields[loc].nr; ++i)
+               if (fmt->rf_fields[loc].d[i]->rmf_size != -1)
+                       size += cfs_size_round(fmt->rf_fields[loc].d[i]->
+                                              rmf_size);
+       return size;
+}
+
+/**
+ * Changes the format of an RPC.
+ *
+ * The pill must already have been initialized, which means that it already has
+ * a request format.  The new format \a fmt must be an extension of the pill's
+ * old format.  Specifically: the new format must have at least as many
+ * request and reply fields as the old one, and all fields shared by the old
+ * and new format must be at least as large in the new format.
+ *
+ * The new format's fields may be of different "type" than the old format, but
+ * only for fields that are "opaque" blobs: fields which a) have no
+ * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a
+ * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK.  For example,
+ * OBD_SET_INFO has a key field and an opaque value field that gets interpreted
+ * according to the key field.  When the value, according to the key, contains a
+ * structure (or array thereof) to be swabbed, the format should be changed to
+ * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set
+ * accordingly.
+ */
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt)
+{
+       int i;
+       int j;
+
+       const struct req_format *old;
+
+       LASSERT(pill->rc_fmt != NULL);
+       LASSERT(__req_format_is_sane(fmt));
+
+       old = pill->rc_fmt;
+       /*
+        * Sanity checking...
+        */
+       for (i = 0; i < RCL_NR; ++i) {
+               LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr);
+               for (j = 0; j < old->rf_fields[i].nr - 1; ++j) {
+                       const struct req_msg_field *ofield = FMT_FIELD(old, i, j);
+
+                       /* "opaque" fields can be transmogrified */
+                       if (ofield->rmf_swabber == NULL &&
+                           (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 &&
+                           (ofield->rmf_size == -1 ||
+                           ofield->rmf_flags == RMF_F_NO_SIZE_CHECK))
+                               continue;
+                       LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j));
+               }
+               /*
+                * Last field in old format can be shorter than in new.
+                */
+               LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >=
+                       FMT_FIELD(old, i, j)->rmf_size);
+       }
+
+       pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_extend);
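The OBD_SET_INFO case described above is easier to see in a short sketch.  The
format names RQF_EXAMPLE_PLAIN and RQF_EXAMPLE_SWABBED below are hypothetical,
chosen only to illustrate the call; they are not formats defined by this patch,
and error handling is omitted.

/* Hedged illustration of req_capsule_extend(); RQF_EXAMPLE_PLAIN and
 * RQF_EXAMPLE_SWABBED are made-up formats, not part of this patch.  The pill
 * is assumed to have been initialized with RQF_EXAMPLE_PLAIN, whose value
 * field is an opaque blob (rmf_size == -1, no rmf_swabber). */
static void example_extend_for_key(struct req_capsule *pill, const char *key)
{
        if (strcmp(key, "structured-key") == 0)
                /* The new format must be a superset of the old one; its value
                 * field now carries rmf_size and rmf_swabber, so later
                 * req_capsule_*_get() calls swab the structure correctly. */
                req_capsule_extend(pill, &RQF_EXAMPLE_SWABBED);
}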
+
+/**
+ * This function returns a non-zero value if the given \a field is present in
+ * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it
+ * returns 0.
+ */
+int req_capsule_has_field(const struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc)
+{
+       LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+       return field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+}
+EXPORT_SYMBOL(req_capsule_has_field);
+
+/**
+ * Returns a non-zero value if the given \a field is present in the given \a
+ * pill's PTLRPC request or reply (\a loc), else it returns 0.
+ */
+int req_capsule_field_present(const struct req_capsule *pill,
+                             const struct req_msg_field *field,
+                             enum req_location loc)
+{
+       int offset;
+
+       LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+       LASSERT(req_capsule_has_field(pill, field, loc));
+
+       offset = __req_capsule_offset(pill, field, loc);
+       return lustre_msg_bufcount(__req_msg(pill, loc)) > offset;
+}
+EXPORT_SYMBOL(req_capsule_field_present);
+
+/**
+ * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC
+ * request or reply (\a loc).
+ *
+ * This is not the opposite of req_capsule_extend().
+ */
+void req_capsule_shrink(struct req_capsule *pill,
+                       const struct req_msg_field *field,
+                       unsigned int newlen,
+                       enum req_location loc)
+{
+       const struct req_format *fmt;
+       struct lustre_msg       *msg;
+       int                   len;
+       int                   offset;
+
+       fmt = pill->rc_fmt;
+       LASSERT(fmt != NULL);
+       LASSERT(__req_format_is_sane(fmt));
+       LASSERT(req_capsule_has_field(pill, field, loc));
+       LASSERT(req_capsule_field_present(pill, field, loc));
+
+       offset = __req_capsule_offset(pill, field, loc);
+
+       msg = __req_msg(pill, loc);
+       len = lustre_msg_buflen(msg, offset);
+       LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n",
+                               fmt->rf_name, field->rmf_name, len, newlen);
+
+       if (loc == RCL_CLIENT)
+               pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen,
+                                                           1);
+       else
+               pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen,
+                                                           1);
+}
+EXPORT_SYMBOL(req_capsule_shrink);
+
+int req_capsule_server_grow(struct req_capsule *pill,
+                           const struct req_msg_field *field,
+                           unsigned int newlen)
+{
+       struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs;
+       char *from, *to;
+       int offset, len, rc;
+
+       LASSERT(pill->rc_fmt != NULL);
+       LASSERT(__req_format_is_sane(pill->rc_fmt));
+       LASSERT(req_capsule_has_field(pill, field, RCL_SERVER));
+       LASSERT(req_capsule_field_present(pill, field, RCL_SERVER));
+
+       len = req_capsule_get_size(pill, field, RCL_SERVER);
+       offset = __req_capsule_offset(pill, field, RCL_SERVER);
+       if (pill->rc_req->rq_repbuf_len >=
+           lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen)
+               CERROR("Inplace repack might be done\n");
+
+       pill->rc_req->rq_reply_state = NULL;
+       req_capsule_set_size(pill, field, RCL_SERVER, newlen);
+       rc = req_capsule_server_pack(pill);
+       if (rc) {
+               /* put old rs back, the caller will decide what to do */
+               pill->rc_req->rq_reply_state = rs;
+               return rc;
+       }
+       nrs = pill->rc_req->rq_reply_state;
+       /* Now we need only buffers, copy first chunk */
+       to = lustre_msg_buf(nrs->rs_msg, 0, 0);
+       from = lustre_msg_buf(rs->rs_msg, 0, 0);
+       len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from;
+       memcpy(to, from, len);
+       /* check if we have tail and copy it too */
+       if (rs->rs_msg->lm_bufcount > offset + 1) {
+               to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0);
+               from = lustre_msg_buf(rs->rs_msg, offset + 1, 0);
+               offset = rs->rs_msg->lm_bufcount - 1;
+               len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) +
+                     cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from;
+               memcpy(to, from, len);
+       }
+       /* drop old reply if everything is fine */
+       if (rs->rs_difficult) {
+               /* copy rs data */
+               int i;
+
+               nrs->rs_difficult = 1;
+               nrs->rs_no_ack = rs->rs_no_ack;
+               for (i = 0; i < rs->rs_nlocks; i++) {
+                       nrs->rs_locks[i] = rs->rs_locks[i];
+                       nrs->rs_modes[i] = rs->rs_modes[i];
+                       nrs->rs_nlocks++;
+               }
+               rs->rs_nlocks = 0;
+               rs->rs_difficult = 0;
+               rs->rs_no_ack = 0;
+       }
+       ptlrpc_rs_decref(rs);
+       return 0;
+}
+EXPORT_SYMBOL(req_capsule_server_grow);
+/* __REQ_LAYOUT_USER__ */
+#endif
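Taken together, the accessors above form a fixed client-side calling pattern.
A condensed sketch of that pattern, based on llog_client_open() in
llog_client.c further down in this patch, follows; it is illustrative only,
with error handling abbreviated.

/* Hedged sketch of the client-side req_capsule pattern (condensed from
 * llog_client_open() later in this patch; not new code added here). */
static int example_capsule_pattern(struct obd_import *imp, const char *name)
{
        struct ptlrpc_request *req;
        struct llogd_body *body;
        char *tmp;
        int rc;

        req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
        if (req == NULL)
                return -ENOMEM;

        /* variable-sized field: record its length before packing */
        req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
                             strlen(name) + 1);
        rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION,
                                 LLOG_ORIGIN_HANDLE_CREATE);
        if (rc) {
                ptlrpc_request_free(req);
                return rc;
        }
        ptlrpc_request_set_replen(req);

        /* fixed-size field: plain client get */
        body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
        body->lgd_ctxt_idx = 0;    /* the real code uses ctxt->loc_idx - 1 */

        /* sized get returns the buffer whose length was declared above */
        tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME,
                                           strlen(name) + 1);
        strcpy(tmp, name);

        rc = ptlrpc_queue_wait(req);
        if (rc == 0)
                /* after the RPC, reply fields use the RCL_SERVER accessors
                 * (NULL check omitted in this sketch) */
                body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);

        ptlrpc_req_finished(req);
        return rc;
}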
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c
new file mode 100644 (file)
index 0000000..367ca8e
--- /dev/null
@@ -0,0 +1,354 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_client.c
+ *
+ * remote api for llog - client side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lustre_net.h>
+#include <linux/list.h>
+
+#define LLOG_CLIENT_ENTRY(ctxt, imp) do {                           \
+       mutex_lock(&ctxt->loc_mutex);                        \
+       if (ctxt->loc_imp) {                                      \
+               imp = class_import_get(ctxt->loc_imp);          \
+       } else {                                                      \
+               CERROR("ctxt->loc_imp == NULL for context idx %d. "   \
+                      "Unable to complete MDS/OSS recovery, "        \
+                      "but I'll try again next time.  Not fatal.\n", \
+                      ctxt->loc_idx);                          \
+               imp = NULL;                                        \
+               mutex_unlock(&ctxt->loc_mutex);            \
+               return (-EINVAL);                                    \
+       }                                                            \
+       mutex_unlock(&ctxt->loc_mutex);                    \
+} while (0)
+
+#define LLOG_CLIENT_EXIT(ctxt, imp) do {                             \
+       mutex_lock(&ctxt->loc_mutex);                        \
+       if (ctxt->loc_imp != imp)                                    \
+               CWARN("loc_imp has changed from %p to %p\n",      \
+                      ctxt->loc_imp, imp);                        \
+       class_import_put(imp);                                  \
+       mutex_unlock(&ctxt->loc_mutex);                    \
+} while (0)
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_client_open(const struct lu_env *env,
+                           struct llog_handle *lgh, struct llog_logid *logid,
+                           char *name, enum llog_open_param open_param)
+{
+       struct obd_import     *imp;
+       struct llogd_body     *body;
+       struct llog_ctxt      *ctxt = lgh->lgh_ctxt;
+       struct ptlrpc_request *req = NULL;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(ctxt, imp);
+
+       /* client cannot create llog */
+       LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param);
+       LASSERT(lgh);
+
+       req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       if (name)
+               req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                                    strlen(name) + 1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION,
+                                LLOG_ORIGIN_HANDLE_CREATE);
+       if (rc) {
+               ptlrpc_request_free(req);
+               req = NULL;
+               GOTO(out, rc);
+       }
+       ptlrpc_request_set_replen(req);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (logid)
+               body->lgd_logid = *logid;
+       body->lgd_ctxt_idx = ctxt->loc_idx - 1;
+
+       if (name) {
+               char *tmp;
+               tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME,
+                                                  strlen(name) + 1);
+               LASSERT(tmp);
+               strcpy(tmp, name);
+       }
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       lgh->lgh_id = body->lgd_logid;
+       lgh->lgh_ctxt = ctxt;
+       EXIT;
+out:
+       LLOG_CLIENT_EXIT(ctxt, imp);
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int llog_client_destroy(const struct lu_env *env,
+                              struct llog_handle *loghandle)
+{
+       struct obd_import     *imp;
+       struct ptlrpc_request *req = NULL;
+       struct llogd_body     *body;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+       req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+                                       LUSTRE_LOG_VERSION,
+                                       LLOG_ORIGIN_HANDLE_DESTROY);
+       if (req == NULL)
+               GOTO(err_exit, rc = -ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = loghandle->lgh_id;
+       body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+
+       if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN))
+               CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name,
+                      body->lgd_llh_flags);
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+
+       ptlrpc_req_finished(req);
+err_exit:
+       LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+       RETURN(rc);
+}
+
+
+static int llog_client_next_block(const struct lu_env *env,
+                                 struct llog_handle *loghandle,
+                                 int *cur_idx, int next_idx,
+                                 __u64 *cur_offset, void *buf, int len)
+{
+       struct obd_import     *imp;
+       struct ptlrpc_request *req = NULL;
+       struct llogd_body     *body;
+       void              *ptr;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+       req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+                                       LUSTRE_LOG_VERSION,
+                                       LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+       if (req == NULL)
+               GOTO(err_exit, rc = -ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = loghandle->lgh_id;
+       body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+       body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+       body->lgd_index = next_idx;
+       body->lgd_saved_index = *cur_idx;
+       body->lgd_len = len;
+       body->lgd_cur_offset = *cur_offset;
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       /* The log records are swabbed as they are processed */
+       ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+       if (ptr == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       *cur_idx = body->lgd_saved_index;
+       *cur_offset = body->lgd_cur_offset;
+
+       memcpy(buf, ptr, len);
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+err_exit:
+       LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+       return rc;
+}
+
+static int llog_client_prev_block(const struct lu_env *env,
+                                 struct llog_handle *loghandle,
+                                 int prev_idx, void *buf, int len)
+{
+       struct obd_import     *imp;
+       struct ptlrpc_request *req = NULL;
+       struct llogd_body     *body;
+       void              *ptr;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+       req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+                                       LUSTRE_LOG_VERSION,
+                                       LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+       if (req == NULL)
+               GOTO(err_exit, rc = -ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = loghandle->lgh_id;
+       body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+       body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+       body->lgd_index = prev_idx;
+       body->lgd_len = len;
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+       if (ptr == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       memcpy(buf, ptr, len);
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+err_exit:
+       LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+       return rc;
+}
+
+static int llog_client_read_header(const struct lu_env *env,
+                                  struct llog_handle *handle)
+{
+       struct obd_import     *imp;
+       struct ptlrpc_request *req = NULL;
+       struct llogd_body     *body;
+       struct llog_log_hdr   *hdr;
+       struct llog_rec_hdr   *llh_hdr;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp);
+       req = ptlrpc_request_alloc_pack(imp,
+                                       &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+                                       LUSTRE_LOG_VERSION,
+                                       LLOG_ORIGIN_HANDLE_READ_HEADER);
+       if (req == NULL)
+               GOTO(err_exit, rc = -ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = handle->lgh_id;
+       body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1;
+       body->lgd_llh_flags = handle->lgh_hdr->llh_flags;
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+       if (hdr == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       memcpy(handle->lgh_hdr, hdr, sizeof(*hdr));
+       handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+       /* sanity checks */
+       llh_hdr = &handle->lgh_hdr->llh_hdr;
+       if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+               CERROR("bad log header magic: %#x (expecting %#x)\n",
+                      llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+               rc = -EIO;
+       } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+               CERROR("incorrectly sized log header: %#x "
+                      "(expecting %#x)\n",
+                      llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+               CERROR("you may need to re-run lconf --write_conf.\n");
+               rc = -EIO;
+       }
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+err_exit:
+       LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp);
+       return rc;
+}
+
+static int llog_client_close(const struct lu_env *env,
+                            struct llog_handle *handle)
+{
+       /* This doesn't call LLOG_ORIGIN_HANDLE_CLOSE because the servers
+        * all close the file at the end of every other LLOG_ RPC. */
+       return 0;
+}
+
+struct llog_operations llog_client_ops = {
+       .lop_next_block         = llog_client_next_block,
+       .lop_prev_block         = llog_client_prev_block,
+       .lop_read_header        = llog_client_read_header,
+       .lop_open               = llog_client_open,
+       .lop_destroy            = llog_client_destroy,
+       .lop_close              = llog_client_close,
+};
+EXPORT_SYMBOL(llog_client_ops);
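Every RPC-backed op in this table brackets its request with
LLOG_CLIENT_ENTRY()/LLOG_CLIENT_EXIT().  Written out without the macros, the
import reference discipline they implement amounts to roughly the sketch
below; this is an illustration of the macro bodies above, not code added by
the patch.

/* Hedged, macro-free rendering of LLOG_CLIENT_ENTRY()/LLOG_CLIENT_EXIT(). */
static struct obd_import *example_llog_pin_import(struct llog_ctxt *ctxt)
{
        struct obd_import *imp = NULL;

        mutex_lock(&ctxt->loc_mutex);
        if (ctxt->loc_imp)
                imp = class_import_get(ctxt->loc_imp); /* ref held for the RPC */
        mutex_unlock(&ctxt->loc_mutex);
        return imp;     /* NULL: no import yet, the op bails out with -EINVAL */
}

static void example_llog_unpin_import(struct llog_ctxt *ctxt,
                                      struct obd_import *imp)
{
        mutex_lock(&ctxt->loc_mutex);
        if (ctxt->loc_imp != imp)
                CWARN("loc_imp has changed from %p to %p\n",
                      ctxt->loc_imp, imp);
        class_import_put(imp); /* drop the reference taken at entry */
        mutex_unlock(&ctxt->loc_mutex);
}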
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c
new file mode 100644 (file)
index 0000000..a81f557
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_net.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+
+int llog_initiator_connect(struct llog_ctxt *ctxt)
+{
+       struct obd_import *new_imp;
+       ENTRY;
+
+       LASSERT(ctxt);
+       new_imp = ctxt->loc_obd->u.cli.cl_import;
+       LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp,
+                "%p - %p\n", ctxt->loc_imp, new_imp);
+       mutex_lock(&ctxt->loc_mutex);
+       if (ctxt->loc_imp != new_imp) {
+               if (ctxt->loc_imp)
+                       class_import_put(ctxt->loc_imp);
+               ctxt->loc_imp = class_import_get(new_imp);
+       }
+       mutex_unlock(&ctxt->loc_mutex);
+       RETURN(0);
+}
+EXPORT_SYMBOL(llog_initiator_connect);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_server.c b/drivers/staging/lustre/lustre/ptlrpc/llog_server.c
new file mode 100644 (file)
index 0000000..bc1fcd8
--- /dev/null
@@ -0,0 +1,466 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_server.c
+ *
+ * remote api for llog - server side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lustre_net.h>
+#include <lustre_fsfilt.h>
+
+#if  defined(LUSTRE_LOG_SERVER)
+static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh)
+{
+       if (lgh->lgh_hdr != NULL && lgh->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+               return llog_cat_close(env, lgh);
+       else
+               return llog_close(env, lgh);
+}
+
+/* Only open is supported, no new llog can be created remotely */
+int llog_origin_handle_open(struct ptlrpc_request *req)
+{
+       struct obd_export       *exp = req->rq_export;
+       struct obd_device       *obd = exp->exp_obd;
+       struct obd_device       *disk_obd;
+       struct lvfs_run_ctxt     saved;
+       struct llog_handle      *loghandle;
+       struct llogd_body       *body;
+       struct llog_logid       *logid = NULL;
+       struct llog_ctxt        *ctxt;
+       char                    *name = NULL;
+       int                      rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       if (ostid_id(&body->lgd_logid.lgl_oi) > 0)
+               logid = &body->lgd_logid;
+
+       if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) {
+               name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               if (name == NULL)
+                       RETURN(-EFAULT);
+               CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name);
+       }
+
+       ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL) {
+               CDEBUG(D_WARNING, "%s: no ctxt. group=%p idx=%d name=%s\n",
+                      obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name);
+               RETURN(-ENODEV);
+       }
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid,
+                      name, LLOG_OPEN_EXISTS);
+       if (rc)
+               GOTO(out_pop, rc);
+
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out_close, rc = -ENOMEM);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = loghandle->lgh_id;
+
+       EXIT;
+out_close:
+       llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_open);
+
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+       struct obd_device       *disk_obd;
+       struct lvfs_run_ctxt     saved;
+       struct llogd_body       *body;
+       struct llog_logid       *logid = NULL;
+       struct llog_ctxt        *ctxt;
+       int                      rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       if (ostid_id(&body->lgd_logid.lgl_oi) > 0)
+               logid = &body->lgd_logid;
+
+       if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN))
+               CERROR("%s: wrong llog flags %x\n",
+                      req->rq_export->exp_obd->obd_name, body->lgd_llh_flags);
+
+       ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = req_capsule_server_pack(&req->rq_pill);
+       /* erase only if no error and logid is valid */
+       if (rc == 0)
+               rc = llog_erase(req->rq_svc_thread->t_env, ctxt, logid, NULL);
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_origin_handle_destroy);
+
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+       struct obd_device   *disk_obd;
+       struct llog_handle  *loghandle;
+       struct llogd_body   *body;
+       struct llogd_body   *repbody;
+       struct lvfs_run_ctxt saved;
+       struct llog_ctxt    *ctxt;
+       __u32           flags;
+       void            *ptr;
+       int               rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+                      &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc)
+               GOTO(out_pop, rc);
+
+       flags = body->lgd_llh_flags;
+       rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+                             NULL);
+       if (rc)
+               GOTO(out_close, rc);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
+                            LLOG_CHUNK_SIZE);
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out_close, rc = -ENOMEM);
+
+       repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       *repbody = *body;
+
+       ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+       rc = llog_next_block(req->rq_svc_thread->t_env, loghandle,
+                            &repbody->lgd_saved_index, repbody->lgd_index,
+                            &repbody->lgd_cur_offset, ptr, LLOG_CHUNK_SIZE);
+       if (rc)
+               GOTO(out_close, rc);
+       EXIT;
+out_close:
+       llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_next_block);
+
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+       struct llog_handle   *loghandle;
+       struct llogd_body    *body;
+       struct llogd_body    *repbody;
+       struct obd_device    *disk_obd;
+       struct lvfs_run_ctxt  saved;
+       struct llog_ctxt     *ctxt;
+       __u32            flags;
+       void             *ptr;
+       int                rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+                        &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc)
+               GOTO(out_pop, rc);
+
+       flags = body->lgd_llh_flags;
+       rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+                             NULL);
+       if (rc)
+               GOTO(out_close, rc);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
+                            LLOG_CHUNK_SIZE);
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out_close, rc = -ENOMEM);
+
+       repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       *repbody = *body;
+
+       ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+       rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle,
+                            body->lgd_index, ptr, LLOG_CHUNK_SIZE);
+       if (rc)
+               GOTO(out_close, rc);
+
+       EXIT;
+out_close:
+       llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_prev_block);
+
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+       struct obd_device    *disk_obd;
+       struct llog_handle   *loghandle;
+       struct llogd_body    *body;
+       struct llog_log_hdr  *hdr;
+       struct lvfs_run_ctxt  saved;
+       struct llog_ctxt     *ctxt;
+       __u32            flags;
+       int                rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+                      &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc)
+               GOTO(out_pop, rc);
+
+       /*
+        * llog_init_handle() reads the llog header
+        */
+       flags = body->lgd_llh_flags;
+       rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+                             NULL);
+       if (rc)
+               GOTO(out_close, rc);
+       flags = loghandle->lgh_hdr->llh_flags;
+
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out_close, rc = -ENOMEM);
+
+       hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+       *hdr = *loghandle->lgh_hdr;
+       EXIT;
+out_close:
+       llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_read_header);
+
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+       ENTRY;
+       /* Nothing to do */
+       RETURN(0);
+}
+EXPORT_SYMBOL(llog_origin_handle_close);
+
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+       int num_cookies, rc = 0, err, i, failed = 0;
+       struct obd_device *disk_obd;
+       struct llog_cookie *logcookies;
+       struct llog_ctxt *ctxt = NULL;
+       struct lvfs_run_ctxt saved;
+       struct llog_handle *cathandle;
+       struct inode *inode;
+       void *handle;
+       ENTRY;
+
+       logcookies = req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES);
+       num_cookies = req_capsule_get_size(&req->rq_pill, &RMF_LOGCOOKIES,
+                                          RCL_CLIENT) / sizeof(*logcookies);
+       if (logcookies == NULL || num_cookies == 0) {
+               DEBUG_REQ(D_HA, req, "No llog cookies sent");
+               RETURN(-EFAULT);
+       }
+
+       ctxt = llog_get_context(req->rq_export->exp_obd,
+                               logcookies->lgc_subsys);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       for (i = 0; i < num_cookies; i++, logcookies++) {
+               cathandle = ctxt->loc_handle;
+               LASSERT(cathandle != NULL);
+               inode = cathandle->lgh_file->f_dentry->d_inode;
+
+               handle = fsfilt_start_log(disk_obd, inode,
+                                         FSFILT_OP_CANCEL_UNLINK, NULL, 1);
+               if (IS_ERR(handle)) {
+                       CERROR("fsfilt_start_log() failed: %ld\n",
+                              PTR_ERR(handle));
+                       GOTO(pop_ctxt, rc = PTR_ERR(handle));
+               }
+
+               rc = llog_cat_cancel_records(req->rq_svc_thread->t_env,
+                                            cathandle, 1, logcookies);
+
+               /*
+                * Do not raise -ENOENT errors for resent RPCs; this record
+                * might already have been cancelled.
+                */
+               if (rc == -ENOENT &&
+                   (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) {
+                       /*
+                        * Do not change this message, replay-single.sh test_59b
+                        * expects to find this in log.
+                        */
+                       CDEBUG(D_RPCTRACE, "RESENT cancel req %p - ignored\n",
+                              req);
+                       rc = 0;
+               } else if (rc == 0) {
+                       CDEBUG(D_RPCTRACE, "Canceled %d llog-records\n",
+                              num_cookies);
+               }
+
+               err = fsfilt_commit(disk_obd, inode, handle, 0);
+               if (err) {
+                       CERROR("Error committing transaction: %d\n", err);
+                       if (!rc)
+                               rc = err;
+                       failed++;
+                       GOTO(pop_ctxt, rc);
+               } else if (rc)
+                       failed++;
+       }
+       GOTO(pop_ctxt, rc);
+pop_ctxt:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       if (rc)
+               CERROR("Cancel %d of %d llog-records failed: %d\n",
+                      failed, num_cookies, rc);
+
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_cancel);
+
+#else /* !__KERNEL__ */
+int llog_origin_handle_open(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+#endif
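Each LLOG_ORIGIN_HANDLE_* handler above packs its reply in the same order, and
that order matters: the variable-sized reply field must be sized before
req_capsule_server_pack(), and the reply buffers can only be fetched
afterwards.  A trimmed sketch of that order, condensed from
llog_origin_handle_next_block() above, follows; it is illustrative only, with
the lvfs context push/pop and error paths omitted.

/* Hedged sketch of the server-side capsule order used by the handlers above
 * (condensed from llog_origin_handle_next_block(); not new code). */
static int example_llog_reply_order(struct ptlrpc_request *req,
                                    struct llog_handle *loghandle)
{
        struct llogd_body *body, *repbody;
        void *ptr;
        int rc;

        /* 1. read the client's request buffer */
        body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
        if (body == NULL)
                return -EFAULT;

        /* 2. declare the size of the variable reply field ... */
        req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
                             LLOG_CHUNK_SIZE);
        /* 3. ... before packing the reply */
        rc = req_capsule_server_pack(&req->rq_pill);
        if (rc)
                return -ENOMEM;

        /* 4. only now can the reply buffers be fetched and filled */
        repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
        *repbody = *body;
        ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);

        return llog_next_block(req->rq_svc_thread->t_env, loghandle,
                               &repbody->lgd_saved_index, repbody->lgd_index,
                               &repbody->lgd_cur_offset, ptr, LLOG_CHUNK_SIZE);
}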
diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
new file mode 100644 (file)
index 0000000..031c0f9
--- /dev/null
@@ -0,0 +1,1401 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_support.h>
+#include <obd.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+
+struct ll_rpc_opcode {
+     __u32       opcode;
+     const char *opname;
+} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = {
+       { OST_REPLY,    "ost_reply" },
+       { OST_GETATTR,      "ost_getattr" },
+       { OST_SETATTR,      "ost_setattr" },
+       { OST_READ,      "ost_read" },
+       { OST_WRITE,    "ost_write" },
+       { OST_CREATE,       "ost_create" },
+       { OST_DESTROY,      "ost_destroy" },
+       { OST_GET_INFO,     "ost_get_info" },
+       { OST_CONNECT,      "ost_connect" },
+       { OST_DISCONNECT,   "ost_disconnect" },
+       { OST_PUNCH,    "ost_punch" },
+       { OST_OPEN,      "ost_open" },
+       { OST_CLOSE,    "ost_close" },
+       { OST_STATFS,       "ost_statfs" },
+       { 14,           NULL },    /* formerly OST_SAN_READ */
+       { 15,           NULL },    /* formerly OST_SAN_WRITE */
+       { OST_SYNC,      "ost_sync" },
+       { OST_SET_INFO,     "ost_set_info" },
+       { OST_QUOTACHECK,   "ost_quotacheck" },
+       { OST_QUOTACTL,     "ost_quotactl" },
+       { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" },
+       { MDS_GETATTR,      "mds_getattr" },
+       { MDS_GETATTR_NAME, "mds_getattr_lock" },
+       { MDS_CLOSE,    "mds_close" },
+       { MDS_REINT,    "mds_reint" },
+       { MDS_READPAGE,     "mds_readpage" },
+       { MDS_CONNECT,      "mds_connect" },
+       { MDS_DISCONNECT,   "mds_disconnect" },
+       { MDS_GETSTATUS,    "mds_getstatus" },
+       { MDS_STATFS,       "mds_statfs" },
+       { MDS_PIN,        "mds_pin" },
+       { MDS_UNPIN,    "mds_unpin" },
+       { MDS_SYNC,      "mds_sync" },
+       { MDS_DONE_WRITING, "mds_done_writing" },
+       { MDS_SET_INFO,     "mds_set_info" },
+       { MDS_QUOTACHECK,   "mds_quotacheck" },
+       { MDS_QUOTACTL,     "mds_quotactl" },
+       { MDS_GETXATTR,     "mds_getxattr" },
+       { MDS_SETXATTR,     "mds_setxattr" },
+       { MDS_WRITEPAGE,    "mds_writepage" },
+       { MDS_IS_SUBDIR,    "mds_is_subdir" },
+       { MDS_GET_INFO,     "mds_get_info" },
+       { MDS_HSM_STATE_GET, "mds_hsm_state_get" },
+       { MDS_HSM_STATE_SET, "mds_hsm_state_set" },
+       { MDS_HSM_ACTION,   "mds_hsm_action" },
+       { MDS_HSM_PROGRESS, "mds_hsm_progress" },
+       { MDS_HSM_REQUEST,  "mds_hsm_request" },
+       { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" },
+       { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" },
+       { MDS_SWAP_LAYOUTS,     "mds_swap_layouts" },
+       { LDLM_ENQUEUE,     "ldlm_enqueue" },
+       { LDLM_CONVERT,     "ldlm_convert" },
+       { LDLM_CANCEL,      "ldlm_cancel" },
+       { LDLM_BL_CALLBACK, "ldlm_bl_callback" },
+       { LDLM_CP_CALLBACK, "ldlm_cp_callback" },
+       { LDLM_GL_CALLBACK, "ldlm_gl_callback" },
+       { LDLM_SET_INFO,    "ldlm_set_info" },
+       { MGS_CONNECT,      "mgs_connect" },
+       { MGS_DISCONNECT,   "mgs_disconnect" },
+       { MGS_EXCEPTION,    "mgs_exception" },
+       { MGS_TARGET_REG,   "mgs_target_reg" },
+       { MGS_TARGET_DEL,   "mgs_target_del" },
+       { MGS_SET_INFO,     "mgs_set_info" },
+       { MGS_CONFIG_READ,  "mgs_config_read" },
+       { OBD_PING,      "obd_ping" },
+       { OBD_LOG_CANCEL,   "llog_origin_handle_cancel" },
+       { OBD_QC_CALLBACK,  "obd_quota_callback" },
+       { OBD_IDX_READ,     "dt_index_read" },
+       { LLOG_ORIGIN_HANDLE_CREATE,     "llog_origin_handle_create" },
+       { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" },
+       { LLOG_ORIGIN_HANDLE_READ_HEADER, "llog_origin_handle_read_header" },
+       { LLOG_ORIGIN_HANDLE_WRITE_REC,  "llog_origin_handle_write_rec" },
+       { LLOG_ORIGIN_HANDLE_CLOSE,      "llog_origin_handle_close" },
+       { LLOG_ORIGIN_CONNECT,     "llog_origin_connect" },
+       { LLOG_CATINFO,           "llog_catinfo" },
+       { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" },
+       { LLOG_ORIGIN_HANDLE_DESTROY,    "llog_origin_handle_destroy" },
+       { QUOTA_DQACQ,      "quota_acquire" },
+       { QUOTA_DQREL,      "quota_release" },
+       { SEQ_QUERY,    "seq_query" },
+       { SEC_CTX_INIT,     "sec_ctx_init" },
+       { SEC_CTX_INIT_CONT, "sec_ctx_init_cont" },
+       { SEC_CTX_FINI,     "sec_ctx_fini" },
+       { FLD_QUERY,    "fld_query" },
+       { UPDATE_OBJ,       "update_obj" },
+};
+
+struct ll_eopcode {
+     __u32       opcode;
+     const char *opname;
+} ll_eopcode_table[EXTRA_LAST_OPC] = {
+       { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+       { LDLM_PLAIN_ENQUEUE,   "ldlm_plain_enqueue" },
+       { LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
+       { LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
+       { LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
+       { MDS_REINT_SETATTR,    "mds_reint_setattr" },
+       { MDS_REINT_CREATE,     "mds_reint_create" },
+       { MDS_REINT_LINK,       "mds_reint_link" },
+       { MDS_REINT_UNLINK,     "mds_reint_unlink" },
+       { MDS_REINT_RENAME,     "mds_reint_rename" },
+       { MDS_REINT_OPEN,       "mds_reint_open" },
+       { MDS_REINT_SETXATTR,   "mds_reint_setxattr" },
+       { BRW_READ_BYTES,       "read_bytes" },
+       { BRW_WRITE_BYTES,      "write_bytes" },
+};
+
+const char *ll_opcode2str(__u32 opcode)
+{
+       /* When one of the assertions below fail, chances are that:
+        *     1) A new opcode was added in include/lustre/lustre_idl.h,
+        *      but is missing from the table above.
+        * or  2) The opcode space was renumbered or rearranged,
+        *      and the opcode_offset() function in
+        *      ptlrpc_internal.h needs to be modified.
+        */
+       __u32 offset = opcode_offset(opcode);
+       LASSERTF(offset < LUSTRE_MAX_OPCODES,
+                "offset %u >= LUSTRE_MAX_OPCODES %u\n",
+                offset, LUSTRE_MAX_OPCODES);
+       LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode,
+                "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n",
+                offset, ll_rpc_opcode_table[offset].opcode, opcode);
+       return ll_rpc_opcode_table[offset].opname;
+}
+
+const char *ll_eopcode2str(__u32 opcode)
+{
+       LASSERT(ll_eopcode_table[opcode].opcode == opcode);
+       return ll_eopcode_table[opcode].opname;
+}
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
+                            char *name, struct proc_dir_entry **procroot_ret,
+                            struct lprocfs_stats **stats_ret)
+{
+       struct proc_dir_entry *svc_procroot;
+       struct lprocfs_stats *svc_stats;
+       int i, rc;
+       unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX |
+                                         LPROCFS_CNTR_STDDEV;
+
+       LASSERT(*procroot_ret == NULL);
+       LASSERT(*stats_ret == NULL);
+
+       svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES,
+                                       0);
+       if (svc_stats == NULL)
+               return;
+
+       if (dir) {
+               svc_procroot = lprocfs_register(dir, root, NULL, NULL);
+               if (IS_ERR(svc_procroot)) {
+                       lprocfs_free_stats(&svc_stats);
+                       return;
+               }
+       } else {
+               svc_procroot = root;
+       }
+
+       lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR,
+                            svc_counter_config, "req_waittime", "usec");
+       lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR,
+                            svc_counter_config, "req_qdepth", "reqs");
+       lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
+                            svc_counter_config, "req_active", "reqs");
+       lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
+                            svc_counter_config, "req_timeout", "sec");
+       lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
+                            svc_counter_config, "reqbuf_avail", "bufs");
+       for (i = 0; i < EXTRA_LAST_OPC; i++) {
+               char *units;
+
+               switch (i) {
+               case BRW_WRITE_BYTES:
+               case BRW_READ_BYTES:
+                       units = "bytes";
+                       break;
+               default:
+                       units = "reqs";
+                       break;
+               }
+               lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
+                                    svc_counter_config,
+                                    ll_eopcode2str(i), units);
+       }
+       for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
+               __u32 opcode = ll_rpc_opcode_table[i].opcode;
+               lprocfs_counter_init(svc_stats,
+                                    EXTRA_MAX_OPCODES + i, svc_counter_config,
+                                    ll_opcode2str(opcode), "usec");
+       }
+
+       rc = lprocfs_register_stats(svc_procroot, name, svc_stats);
+       if (rc < 0) {
+               if (dir)
+                       lprocfs_remove(&svc_procroot);
+               lprocfs_free_stats(&svc_stats);
+       } else {
+               if (dir)
+                       *procroot_ret = svc_procroot;
+               *stats_ret = svc_stats;
+       }
+}
+
+static int
+ptlrpc_lprocfs_read_req_history_len(char *page, char **start, off_t off,
+                                   int count, int *eof, void *data)
+{
+       struct ptlrpc_service *svc = data;
+       struct ptlrpc_service_part *svcpt;
+       int     total = 0;
+       int     i;
+
+       *eof = 1;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               total += svcpt->scp_hist_nrqbds;
+
+       return snprintf(page, count, "%d\n", total);
+}
+
+static int
+ptlrpc_lprocfs_read_req_history_max(char *page, char **start, off_t off,
+                                   int count, int *eof, void *data)
+{
+       struct ptlrpc_service *svc = data;
+       struct ptlrpc_service_part *svcpt;
+       int     total = 0;
+       int     i;
+
+       *eof = 1;
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               total += svc->srv_hist_nrqbds_cpt_max;
+
+       return snprintf(page, count, "%d\n", total);
+}
+
+static int
+ptlrpc_lprocfs_write_req_history_max(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
+{
+       struct ptlrpc_service      *svc = data;
+       int                         bufpages;
+       int                         val;
+       int                         rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val < 0)
+               return -ERANGE;
+
+       /* This sanity check is more of an insanity check; we can still
+        * hose a kernel by allowing the request history to grow too
+        * far. */
+       bufpages = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       if (val > num_physpages/(2 * bufpages))
+               return -ERANGE;
+
+       spin_lock(&svc->srv_lock);
+
+       if (val == 0)
+               svc->srv_hist_nrqbds_cpt_max = 0;
+       else
+               svc->srv_hist_nrqbds_cpt_max = max(1, (val / svc->srv_ncpts));
+
+       spin_unlock(&svc->srv_lock);
+
+       return count;
+}
+
+static int
+ptlrpc_lprocfs_rd_threads_min(char *page, char **start, off_t off,
+                             int count, int *eof, void *data)
+{
+       struct ptlrpc_service *svc = data;
+
+       return snprintf(page, count, "%d\n",
+                       svc->srv_nthrs_cpt_init * svc->srv_ncpts);
+}
+
+static int
+ptlrpc_lprocfs_wr_threads_min(struct file *file, const char *buffer,
+                             unsigned long count, void *data)
+{
+       struct ptlrpc_service      *svc = data;
+       int     val;
+       int     rc = lprocfs_write_helper(buffer, count, &val);
+
+       if (rc < 0)
+               return rc;
+
+       if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+               return -ERANGE;
+
+       spin_lock(&svc->srv_lock);
+       if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) {
+               spin_unlock(&svc->srv_lock);
+               return -ERANGE;
+       }
+
+       svc->srv_nthrs_cpt_init = val / svc->srv_ncpts;
+
+       spin_unlock(&svc->srv_lock);
+
+       return count;
+}
+
+static int
+ptlrpc_lprocfs_rd_threads_started(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+       struct ptlrpc_service *svc = data;
+       struct ptlrpc_service_part *svcpt;
+       int     total = 0;
+       int     i;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               total += svcpt->scp_nthrs_running;
+
+       return snprintf(page, count, "%d\n", total);
+}
+
+static int
+ptlrpc_lprocfs_rd_threads_max(char *page, char **start, off_t off,
+                             int count, int *eof, void *data)
+{
+       struct ptlrpc_service *svc = data;
+
+       return snprintf(page, count, "%d\n",
+                       svc->srv_nthrs_cpt_limit * svc->srv_ncpts);
+}
+
+static int
+ptlrpc_lprocfs_wr_threads_max(struct file *file, const char *buffer,
+                             unsigned long count, void *data)
+{
+       struct ptlrpc_service *svc = data;
+       int     val;
+       int     rc = lprocfs_write_helper(buffer, count, &val);
+
+       if (rc < 0)
+               return rc;
+
+       if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+               return -ERANGE;
+
+       spin_lock(&svc->srv_lock);
+       if (val < svc->srv_nthrs_cpt_init * svc->srv_ncpts) {
+               spin_unlock(&svc->srv_lock);
+               return -ERANGE;
+       }
+
+       svc->srv_nthrs_cpt_limit = val / svc->srv_ncpts;
+
+       spin_unlock(&svc->srv_lock);
+
+       return count;
+}
+
+/**
+ * \addtogroup nrs
+ * @{
+ */
+extern struct nrs_core nrs_core;
+
+/**
+ * Translates \e ptlrpc_nrs_pol_state values to human-readable strings.
+ *
+ * \param[in] state The policy state
+ */
+static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state)
+{
+       switch (state) {
+       default:
+               LBUG();
+       case NRS_POL_STATE_INVALID:
+               return "invalid";
+       case NRS_POL_STATE_STOPPED:
+               return "stopped";
+       case NRS_POL_STATE_STOPPING:
+               return "stopping";
+       case NRS_POL_STATE_STARTING:
+               return "starting";
+       case NRS_POL_STATE_STARTED:
+               return "started";
+       }
+}
+
+/**
+ * Obtains status information for \a policy.
+ *
+ * Information is copied into \a info.
+ *
+ * \param[in] policy The policy
+ * \param[out] info  Holds returned status information
+ */
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+                               struct ptlrpc_nrs_pol_info *info)
+{
+       LASSERT(policy != NULL);
+       LASSERT(info != NULL);
+       LASSERT(spin_is_locked(&policy->pol_nrs->nrs_lock));
+
+       memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX);
+
+       info->pi_fallback    = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK);
+       info->pi_state       = policy->pol_state;
+       /**
+        * XXX: These are accessed without holding
+        * ptlrpc_service_part::scp_req_lock.
+        */
+       info->pi_req_queued  = policy->pol_req_queued;
+       info->pi_req_started = policy->pol_req_started;
+}
+
+/**
+ * Reads and prints policy status information for all policies of a PTLRPC
+ * service.
+ */
+static int ptlrpc_lprocfs_rd_nrs(char *page, char **start, off_t off,
+                                int count, int *eof, void *data)
+{
+       struct ptlrpc_service          *svc = data;
+       struct ptlrpc_service_part     *svcpt;
+       struct ptlrpc_nrs              *nrs;
+       struct ptlrpc_nrs_policy       *policy;
+       struct ptlrpc_nrs_pol_info     *infos;
+       struct ptlrpc_nrs_pol_info      tmp;
+       unsigned                        num_pols;
+       unsigned                        pol_idx = 0;
+       bool                            hp = false;
+       int                             i;
+       int                             rc = 0;
+       int                             rc2 = 0;
+       ENTRY;
+
+       /**
+        * Serialize NRS core lprocfs operations with policy registration/
+        * unregistration.
+        */
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       /**
+        * Use the first service partition's regular NRS head in order to obtain
+        * the number of policies registered with NRS heads of this service. All
+        * service partitions will have the same number of policies.
+        */
+       nrs = nrs_svcpt2nrs(svc->srv_parts[0], false);
+
+       spin_lock(&nrs->nrs_lock);
+       num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols;
+       spin_unlock(&nrs->nrs_lock);
+
+       OBD_ALLOC(infos, num_pols * sizeof(*infos));
+       if (infos == NULL)
+               GOTO(out, rc = -ENOMEM);
+again:
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               nrs = nrs_svcpt2nrs(svcpt, hp);
+               spin_lock(&nrs->nrs_lock);
+
+               pol_idx = 0;
+
+               list_for_each_entry(policy, &nrs->nrs_policy_list,
+                                       pol_list) {
+                       LASSERT(pol_idx < num_pols);
+
+                       nrs_policy_get_info_locked(policy, &tmp);
+                       /**
+                        * Copy values when handling the first service
+                        * partition.
+                        */
+                       if (i == 0) {
+                               memcpy(infos[pol_idx].pi_name, tmp.pi_name,
+                                      NRS_POL_NAME_MAX);
+                               memcpy(&infos[pol_idx].pi_state, &tmp.pi_state,
+                                      sizeof(tmp.pi_state));
+                               infos[pol_idx].pi_fallback = tmp.pi_fallback;
+                               /**
+                                * For the rest of the service partitions
+                                * sanity-check the values we get.
+                                */
+                       } else {
+                               LASSERT(strncmp(infos[pol_idx].pi_name,
+                                               tmp.pi_name,
+                                               NRS_POL_NAME_MAX) == 0);
+                               /**
+                                * Not asserting ptlrpc_nrs_pol_info::pi_state,
+                                * because it may be different between
+                                * instances of the same policy in different
+                                * service partitions.
+                                */
+                               LASSERT(infos[pol_idx].pi_fallback ==
+                                       tmp.pi_fallback);
+                       }
+
+                       infos[pol_idx].pi_req_queued += tmp.pi_req_queued;
+                       infos[pol_idx].pi_req_started += tmp.pi_req_started;
+
+                       pol_idx++;
+               }
+               spin_unlock(&nrs->nrs_lock);
+       }
+
+       /**
+        * Policy status information output is in YAML format.
+        * For example:
+        *
+        *      regular_requests:
+        *        - name: fifo
+        *          state: started
+        *          fallback: yes
+        *          queued: 0
+        *          active: 0
+        *
+        *        - name: crrn
+        *          state: started
+        *          fallback: no
+        *          queued: 2015
+        *          active: 384
+        *
+        *      high_priority_requests:
+        *        - name: fifo
+        *          state: started
+        *          fallback: yes
+        *          queued: 0
+        *          active: 2
+        *
+        *        - name: crrn
+        *          state: stopped
+        *          fallback: no
+        *          queued: 0
+        *          active: 0
+        */
+       rc2 = snprintf(page + rc, count - rc,
+                      "%s\n", !hp ?
+                      "\nregular_requests:" :
+                      "high_priority_requests:");
+
+       if (rc2 >= count - rc) {
+               /** Output was truncated */
+               GOTO(out, rc = -EFBIG);
+       }
+
+       rc += rc2;
+
+       for (pol_idx = 0; pol_idx < num_pols; pol_idx++) {
+               rc2 = snprintf(page + rc, count - rc,
+                              "  - name: %s\n"
+                              "    state: %s\n"
+                              "    fallback: %s\n"
+                              "    queued: %-20d\n"
+                              "    active: %-20d\n\n",
+                              infos[pol_idx].pi_name,
+                              nrs_state2str(infos[pol_idx].pi_state),
+                              infos[pol_idx].pi_fallback ? "yes" : "no",
+                              (int)infos[pol_idx].pi_req_queued,
+                              (int)infos[pol_idx].pi_req_started);
+
+
+               if (rc2 >= count - rc) {
+                       /** Output was truncated */
+                       GOTO(out, rc = -EFBIG);
+               }
+
+               rc += rc2;
+       }
+
+       if (!hp && nrs_svc_has_hp(svc)) {
+               memset(infos, 0, num_pols * sizeof(*infos));
+
+               /**
+                * Redo the processing for the service's HP NRS heads' policies.
+                */
+               hp = true;
+               goto again;
+       }
+
+       *eof = 1;
+
+out:
+       if (infos)
+               OBD_FREE(infos, num_pols * sizeof(*infos));
+
+       mutex_unlock(&nrs_core.nrs_mutex);
+
+       RETURN(rc);
+}
+
+/**
+ * The longest valid command string is the maximum policy name size, plus the
+ * length of the " reg" substring
+ */
+#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1)
+
+/**
+ * Starts and stops a given policy on a PTLRPC service.
+ *
+ * Commands consist of the policy name, followed by an optional [reg|hp] token;
+ * if the optional token is omitted, the operation is performed on both the
+ * regular and high-priority (if the service has one) NRS heads.
+ */
+static int ptlrpc_lprocfs_wr_nrs(struct file *file, const char *buffer,
+                                unsigned long count, void *data)
+{
+       struct ptlrpc_service          *svc = data;
+       enum ptlrpc_nrs_queue_type      queue = PTLRPC_NRS_QUEUE_BOTH;
+       char                           *cmd;
+       char                           *cmd_copy = NULL;
+       char                           *token;
+       int                             rc = 0;
+       ENTRY;
+
+       if (count >= LPROCFS_NRS_WR_MAX_CMD)
+               GOTO(out, rc = -EINVAL);
+
+       OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD);
+       if (cmd == NULL)
+               GOTO(out, rc = -ENOMEM);
+       /**
+        * strsep() modifies its argument, so keep a copy
+        */
+       cmd_copy = cmd;
+
+       if (copy_from_user(cmd, buffer, count))
+               GOTO(out, rc = -EFAULT);
+
+       cmd[count] = '\0';
+
+       token = strsep(&cmd, " ");
+
+       if (strlen(token) > NRS_POL_NAME_MAX - 1)
+               GOTO(out, rc = -EINVAL);
+
+       /**
+        * No [reg|hp] token has been specified
+        */
+       if (cmd == NULL)
+               goto default_queue;
+
+       /**
+        * The second token is either NULL, or an optional [reg|hp] string
+        */
+       if (strcmp(cmd, "reg") == 0)
+               queue = PTLRPC_NRS_QUEUE_REG;
+       else if (strcmp(cmd, "hp") == 0)
+               queue = PTLRPC_NRS_QUEUE_HP;
+       else
+               GOTO(out, rc = -EINVAL);
+
+default_queue:
+
+       if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc))
+               GOTO(out, rc = -ENODEV);
+       else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc))
+               queue = PTLRPC_NRS_QUEUE_REG;
+
+       /**
+        * Serialize NRS core lprocfs operations with policy registration/
+        * unregistration.
+        */
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       rc = ptlrpc_nrs_policy_control(svc, queue, token, PTLRPC_NRS_CTL_START,
+                                      false, NULL);
+
+       mutex_unlock(&nrs_core.nrs_mutex);
+out:
+       if (cmd_copy)
+               OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD);
+
+       RETURN(rc < 0 ? rc : count);
+}
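+
+/*
+ * For example, writing "crrn hp" to a service's "nrs_policies" proc file
+ * starts the crrn policy only on its high-priority NRS head, while writing
+ * just "crrn" applies to both the regular and (if present) high-priority
+ * heads.
+ */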
+
+/** @} nrs */
+
+struct ptlrpc_srh_iterator {
+       int                     srhi_idx;
+       __u64                   srhi_seq;
+       struct ptlrpc_request   *srhi_req;
+};
+
+int
+ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt,
+                                   struct ptlrpc_srh_iterator *srhi,
+                                   __u64 seq)
+{
+       struct list_head                *e;
+       struct ptlrpc_request   *req;
+
+       if (srhi->srhi_req != NULL &&
+           srhi->srhi_seq > svcpt->scp_hist_seq_culled &&
+           srhi->srhi_seq <= seq) {
+               /* If srhi_req was set previously, hasn't been culled and
+                * we're searching for a seq on or after it (i.e. more
+                * recent), search from it onwards.
+                * Since the service history is LRU (i.e. culled reqs will
+                * be near the head), we shouldn't have to do long
+                * re-scans */
+               LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq,
+                        "%s:%d: seek seq "LPU64", request seq "LPU64"\n",
+                        svcpt->scp_service->srv_name, svcpt->scp_cpt,
+                        srhi->srhi_seq, srhi->srhi_req->rq_history_seq);
+               LASSERTF(!list_empty(&svcpt->scp_hist_reqs),
+                        "%s:%d: seek offset "LPU64", request seq "LPU64", "
+                        "last culled "LPU64"\n",
+                        svcpt->scp_service->srv_name, svcpt->scp_cpt,
+                        seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled);
+               e = &srhi->srhi_req->rq_history_list;
+       } else {
+               /* search from start */
+               e = svcpt->scp_hist_reqs.next;
+       }
+
+       while (e != &svcpt->scp_hist_reqs) {
+               req = list_entry(e, struct ptlrpc_request, rq_history_list);
+
+               if (req->rq_history_seq >= seq) {
+                       srhi->srhi_seq = req->rq_history_seq;
+                       srhi->srhi_req = req;
+                       return 0;
+               }
+               e = e->next;
+       }
+
+       return -ENOENT;
+}
+
+/*
+ * The ptlrpc history sequence is used as the "position" of the seq_file; in
+ * some cases seq_read() will increase the "position" to indicate reading the
+ * next element. However, the low bits of the history sequence are reserved
+ * for the CPT id (see the comments before ptlrpc_req_add_history), which
+ * means seq_read() might change the CPT id of the history sequence and never
+ * finish reading the requests on a CPT. To make this work, we have to shift
+ * the CPT id to the high bits and the timestamp to the low bits, so that
+ * seq_read() only increases the timestamp, which correctly indicates the
+ * next position.
+ */
+
+/* convert seq_file pos to cpt */
+#define PTLRPC_REQ_POS2CPT(svc, pos)                   \
+       ((svc)->srv_cpt_bits == 0 ? 0 :                 \
+        (__u64)(pos) >> (64 - (svc)->srv_cpt_bits))
+
+/* make up seq_file pos from cpt */
+#define PTLRPC_REQ_CPT2POS(svc, cpt)                   \
+       ((svc)->srv_cpt_bits == 0 ? 0 :                 \
+        (cpt) << (64 - (svc)->srv_cpt_bits))
+
+/* convert sequence to position */
+#define PTLRPC_REQ_SEQ2POS(svc, seq)                   \
+       ((svc)->srv_cpt_bits == 0 ? (seq) :             \
+        ((seq) >> (svc)->srv_cpt_bits) |               \
+        ((seq) << (64 - (svc)->srv_cpt_bits)))
+
+/* convert position to sequence */
+#define PTLRPC_REQ_POS2SEQ(svc, pos)                   \
+       ((svc)->srv_cpt_bits == 0 ? (pos) :             \
+        ((__u64)(pos) << (svc)->srv_cpt_bits) |        \
+        ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits)))
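+
+/*
+ * For example, with srv_cpt_bits == 2, a request on CPT 3 whose per-CPT
+ * counter is 5 has history sequence (5 << 2) | 3 = 0x17.  PTLRPC_REQ_SEQ2POS
+ * rotates this to pos = 0xc000000000000005, so PTLRPC_REQ_POS2CPT(pos) = 3
+ * and the per-CPT counter sits in the low bits that seq_read() increments;
+ * PTLRPC_REQ_POS2SEQ rotates the position back to 0x17.  Stepping to the
+ * next request on the same CPT adds 1 << srv_cpt_bits to the sequence (see
+ * ptlrpc_lprocfs_svc_req_history_next()).
+ */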
+
+static void *
+ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos)
+{
+       struct ptlrpc_service           *svc = s->private;
+       struct ptlrpc_service_part      *svcpt;
+       struct ptlrpc_srh_iterator      *srhi;
+       unsigned int                    cpt;
+       int                             rc;
+       int                             i;
+
+       if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */
+               CWARN("Failed to read request history because size of loff_t "
+                     "%d can't match size of u64\n", (int)sizeof(loff_t));
+               return NULL;
+       }
+
+       OBD_ALLOC(srhi, sizeof(*srhi));
+       if (srhi == NULL)
+               return NULL;
+
+       srhi->srhi_seq = 0;
+       srhi->srhi_req = NULL;
+
+       cpt = PTLRPC_REQ_POS2CPT(svc, *pos);
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (i < cpt) /* skip */
+                       continue;
+               if (i > cpt) /* make up the lowest position for this CPT */
+                       *pos = PTLRPC_REQ_CPT2POS(svc, i);
+
+               spin_lock(&svcpt->scp_lock);
+               rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi,
+                               PTLRPC_REQ_POS2SEQ(svc, *pos));
+               spin_unlock(&svcpt->scp_lock);
+               if (rc == 0) {
+                       *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq);
+                       srhi->srhi_idx = i;
+                       return srhi;
+               }
+       }
+
+       OBD_FREE(srhi, sizeof(*srhi));
+       return NULL;
+}
+
+static void
+ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter)
+{
+       struct ptlrpc_srh_iterator *srhi = iter;
+
+       if (srhi != NULL)
+               OBD_FREE(srhi, sizeof(*srhi));
+}
+
+static void *
+ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s,
+                                   void *iter, loff_t *pos)
+{
+       struct ptlrpc_service           *svc = s->private;
+       struct ptlrpc_srh_iterator      *srhi = iter;
+       struct ptlrpc_service_part      *svcpt;
+       __u64                           seq;
+       int                             rc;
+       int                             i;
+
+       for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) {
+               svcpt = svc->srv_parts[i];
+
+               if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */
+                       srhi->srhi_req = NULL;
+                       seq = srhi->srhi_seq = 0;
+               } else { /* the next sequence */
+                       seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits);
+               }
+
+               spin_lock(&svcpt->scp_lock);
+               rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq);
+               spin_unlock(&svcpt->scp_lock);
+               if (rc == 0) {
+                       *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq);
+                       srhi->srhi_idx = i;
+                       return srhi;
+               }
+       }
+
+       OBD_FREE(srhi, sizeof(*srhi));
+       return NULL;
+}
+
+/* common ost/mdt so_req_printer */
+void target_print_req(void *seq_file, struct ptlrpc_request *req)
+{
+       /* Called holding srv_lock with irqs disabled.
+        * Print specific req contents and a newline.
+        * CAVEAT EMPTOR: check request message length before printing!!!
+        * You might have received any old crap so you must be just as
+        * careful here as the service's request parser!!! */
+       struct seq_file *sf = seq_file;
+
+       switch (req->rq_phase) {
+       case RQ_PHASE_NEW:
+               /* still awaiting a service thread's attention, or rejected
+                * because the generic request message didn't unpack */
+               seq_printf(sf, "<not swabbed>\n");
+               break;
+       case RQ_PHASE_INTERPRET:
+               /* being handled, so basic msg swabbed, and opc is valid
+                * but racing with mds_handle() */
+       case RQ_PHASE_COMPLETE:
+               /* been handled by mds_handle(); reply state possibly still
+                * volatile */
+               seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg));
+               break;
+       default:
+               DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase);
+       }
+}
+EXPORT_SYMBOL(target_print_req);
+
+static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter)
+{
+       struct ptlrpc_service           *svc = s->private;
+       struct ptlrpc_srh_iterator      *srhi = iter;
+       struct ptlrpc_service_part      *svcpt;
+       struct ptlrpc_request           *req;
+       int                             rc;
+
+       LASSERT(srhi->srhi_idx < svc->srv_ncpts);
+
+       svcpt = svc->srv_parts[srhi->srhi_idx];
+
+       spin_lock(&svcpt->scp_lock);
+
+       rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq);
+
+       if (rc == 0) {
+               req = srhi->srhi_req;
+
+               /* Print common req fields.
+                * CAVEAT EMPTOR: we're racing with the service handler
+                * here.  The request could contain any old crap, so you
+                * must be just as careful as the service's request
+                * parser. Currently I only print stuff here I know is OK
+                * to look at coz it was set up in request_in_callback()!!! */
+               seq_printf(s, LPD64":%s:%s:x"LPU64":%d:%s:%ld:%lds(%+lds) ",
+                          req->rq_history_seq, libcfs_nid2str(req->rq_self),
+                          libcfs_id2str(req->rq_peer), req->rq_xid,
+                          req->rq_reqlen, ptlrpc_rqphase2str(req),
+                          req->rq_arrival_time.tv_sec,
+                          req->rq_sent - req->rq_arrival_time.tv_sec,
+                          req->rq_sent - req->rq_deadline);
+               if (svc->srv_ops.so_req_printer == NULL)
+                       seq_printf(s, "\n");
+               else
+                       svc->srv_ops.so_req_printer(s, srhi->srhi_req);
+       }
+
+       spin_unlock(&svcpt->scp_lock);
+       return rc;
+}
+
+static int
+ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file)
+{
+       static struct seq_operations sops = {
+               .start = ptlrpc_lprocfs_svc_req_history_start,
+               .stop  = ptlrpc_lprocfs_svc_req_history_stop,
+               .next  = ptlrpc_lprocfs_svc_req_history_next,
+               .show  = ptlrpc_lprocfs_svc_req_history_show,
+       };
+       struct proc_dir_entry *dp = PDE(inode);
+       struct seq_file       *seqf;
+       int                 rc;
+
+       LPROCFS_ENTRY_AND_CHECK(dp);
+       rc = seq_open(file, &sops);
+       if (rc) {
+               LPROCFS_EXIT();
+               return rc;
+       }
+
+       seqf = file->private_data;
+       seqf->private = dp->data;
+       return 0;
+}
+
+/* See also lprocfs_rd_timeouts */
+static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data)
+{
+       struct ptlrpc_service           *svc = data;
+       struct ptlrpc_service_part      *svcpt;
+       struct dhms                     ts;
+       time_t                          worstt;
+       unsigned int                    cur;
+       unsigned int                    worst;
+       int                             nob = 0;
+       int                             rc = 0;
+       int                             i;
+
+       if (AT_OFF) {
+               rc += snprintf(page + rc, count - rc,
+                              "adaptive timeouts off, using obd_timeout %u\n",
+                              obd_timeout);
+               return rc;
+       }
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               cur     = at_get(&svcpt->scp_at_estimate);
+               worst   = svcpt->scp_at_estimate.at_worst_ever;
+               worstt  = svcpt->scp_at_estimate.at_worst_time;
+               s2dhms(&ts, cfs_time_current_sec() - worstt);
+
+               nob = snprintf(page, count,
+                              "%10s : cur %3u  worst %3u (at %ld, "
+                              DHMS_FMT" ago) ", "service",
+                              cur, worst, worstt, DHMS_VARS(&ts));
+
+               nob = lprocfs_at_hist_helper(page, count, nob,
+                                            &svcpt->scp_at_estimate);
+               rc += nob;
+               page += nob;
+               count -= nob;
+
+               /*
+                * NB: for lustre proc read, the read count must be less
+                * than PAGE_SIZE, please see details in lprocfs_fops_read.
+                * It's unlikely that we exceed PAGE_SIZE here because
+                * it means the service has more than 50 partitions.
+                */
+               if (count <= 0) {
+                       CWARN("Can't fit AT information of %s in one page, "
+                             "please contact with developer to fix this.\n",
+                             svc->srv_name);
+                       break;
+               }
+       }
+
+       return rc;
+}
+
+static int ptlrpc_lprocfs_rd_hp_ratio(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data)
+{
+       struct ptlrpc_service *svc = data;
+       int rc = snprintf(page, count, "%d", svc->srv_hpreq_ratio);
+       return rc;
+}
+
+static int ptlrpc_lprocfs_wr_hp_ratio(struct file *file, const char *buffer,
+                                     unsigned long count, void *data)
+{
+       struct ptlrpc_service           *svc = data;
+       int     rc;
+       int     val;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val < 0)
+               return -ERANGE;
+
+       spin_lock(&svc->srv_lock);
+       svc->srv_hpreq_ratio = val;
+       spin_unlock(&svc->srv_lock);
+
+       return count;
+}
+
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
+                                    struct ptlrpc_service *svc)
+{
+       struct lprocfs_vars lproc_vars[] = {
+               {.name       = "high_priority_ratio",
+                .read_fptr  = ptlrpc_lprocfs_rd_hp_ratio,
+                .write_fptr = ptlrpc_lprocfs_wr_hp_ratio,
+                .data       = svc},
+               {.name       = "req_buffer_history_len",
+                .read_fptr  = ptlrpc_lprocfs_read_req_history_len,
+                .data       = svc},
+               {.name       = "req_buffer_history_max",
+                .write_fptr = ptlrpc_lprocfs_write_req_history_max,
+                .read_fptr  = ptlrpc_lprocfs_read_req_history_max,
+                .data       = svc},
+               {.name       = "threads_min",
+                .read_fptr  = ptlrpc_lprocfs_rd_threads_min,
+                .write_fptr = ptlrpc_lprocfs_wr_threads_min,
+                .data       = svc},
+               {.name       = "threads_max",
+                .read_fptr  = ptlrpc_lprocfs_rd_threads_max,
+                .write_fptr = ptlrpc_lprocfs_wr_threads_max,
+                .data       = svc},
+               {.name       = "threads_started",
+                .read_fptr  = ptlrpc_lprocfs_rd_threads_started,
+                .data       = svc},
+               {.name       = "timeouts",
+                .read_fptr  = ptlrpc_lprocfs_rd_timeouts,
+                .data       = svc},
+               {.name       = "nrs_policies",
+                .read_fptr  = ptlrpc_lprocfs_rd_nrs,
+                .write_fptr = ptlrpc_lprocfs_wr_nrs,
+                .data       = svc},
+               {NULL}
+       };
+       static struct file_operations req_history_fops = {
+               .owner   = THIS_MODULE,
+               .open    = ptlrpc_lprocfs_svc_req_history_open,
+               .read    = seq_read,
+               .llseek  = seq_lseek,
+               .release = lprocfs_seq_release,
+       };
+
+       int rc;
+
+       ptlrpc_lprocfs_register(entry, svc->srv_name,
+                               "stats", &svc->srv_procroot,
+                               &svc->srv_stats);
+
+       if (svc->srv_procroot == NULL)
+               return;
+
+       lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL);
+
+       rc = lprocfs_seq_create(svc->srv_procroot, "req_history",
+                               0400, &req_history_fops, svc);
+       if (rc)
+               CWARN("Error adding the req_history file\n");
+}
+
+void ptlrpc_lprocfs_register_obd(struct obd_device *obddev)
+{
+       ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats",
+                               &obddev->obd_svc_procroot,
+                               &obddev->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd);
+
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount)
+{
+       struct lprocfs_stats *svc_stats;
+       __u32 op = lustre_msg_get_opc(req->rq_reqmsg);
+       int opc = opcode_offset(op);
+
+       svc_stats = req->rq_import->imp_obd->obd_svc_stats;
+       if (svc_stats == NULL || opc <= 0)
+               return;
+       LASSERT(opc < LUSTRE_MAX_OPCODES);
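+       /* The per-opcode slots were registered starting at EXTRA_MAX_OPCODES
+        * in ptlrpc_lprocfs_register(), hence the offset below. */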
+       if (!(op == LDLM_ENQUEUE || op == MDS_REINT))
+               lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount);
+}
+
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes)
+{
+       struct lprocfs_stats *svc_stats;
+       int idx;
+
+       if (!req->rq_import)
+               return;
+       svc_stats = req->rq_import->imp_obd->obd_svc_stats;
+       if (!svc_stats)
+               return;
+       idx = lustre_msg_get_opc(req->rq_reqmsg);
+       switch (idx) {
+       case OST_READ:
+               idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR;
+               break;
+       case OST_WRITE:
+               idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR;
+               break;
+       default:
+               LASSERTF(0, "unsupported opcode %u\n", idx);
+               break;
+       }
+
+       lprocfs_counter_add(svc_stats, idx, bytes);
+}
+
+EXPORT_SYMBOL(ptlrpc_lprocfs_brw);
+
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc)
+{
+       if (svc->srv_procroot != NULL)
+               lprocfs_remove(&svc->srv_procroot);
+
+       if (svc->srv_stats)
+               lprocfs_free_stats(&svc->srv_stats);
+}
+
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd)
+{
+       if (obd->obd_svc_procroot)
+               lprocfs_remove(&obd->obd_svc_procroot);
+
+       if (obd->obd_svc_stats)
+               lprocfs_free_stats(&obd->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd);
+
+
+#define BUFLEN (UUID_MAX + 5)
+
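+/*
+ * Evict a client export by writing "nid:<nid>", "uuid:<uuid>" or a bare
+ * UUID to the obd's "evict_client" proc entry.
+ */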
+int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       char          *kbuf;
+       char          *tmpbuf;
+
+       OBD_ALLOC(kbuf, BUFLEN);
+       if (kbuf == NULL)
+               return -ENOMEM;
+
+       /*
+        * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1
+        * bytes into kbuf, to ensure that the string is NUL-terminated.
+        * UUID_MAX should include a trailing NUL already.
+        */
+       if (copy_from_user(kbuf, buffer,
+                              min_t(unsigned long, BUFLEN - 1, count))) {
+               count = -EFAULT;
+               goto out;
+       }
+       tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count));
+       /* Kludge code (deadlock situation): the lprocfs lock is already held
+        * here, because the client is evicted by writing the client's
+        * uuid/nid to the procfs "evict_client" entry. However,
+        * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy
+        * the proc entries under the export{} being destroyed, so I have
+        * to drop the lock first here.
+        * - jay, jxiong@clusterfs.com */
+       LPROCFS_EXIT();
+       class_incref(obd, __FUNCTION__, current);
+
+       if (strncmp(tmpbuf, "nid:", 4) == 0)
+               obd_export_evict_by_nid(obd, tmpbuf + 4);
+       else if (strncmp(tmpbuf, "uuid:", 5) == 0)
+               obd_export_evict_by_uuid(obd, tmpbuf + 5);
+       else
+               obd_export_evict_by_uuid(obd, tmpbuf);
+
+       class_decref(obd, __FUNCTION__, current);
+       LPROCFS_ENTRY();
+
+out:
+       OBD_FREE(kbuf, BUFLEN);
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_evict_client);
+
+#undef BUFLEN
+
+int lprocfs_wr_ping(struct file *file, const char *buffer,
+                   unsigned long count, void *data)
+{
+       struct obd_device     *obd = data;
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       LPROCFS_CLIMP_CHECK(obd);
+       req = ptlrpc_prep_ping(obd->u.cli.cl_import);
+       LPROCFS_CLIMP_EXIT(obd);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req->rq_send_state = LUSTRE_IMP_FULL;
+
+       rc = ptlrpc_queue_wait(req);
+
+       ptlrpc_req_finished(req);
+       if (rc >= 0)
+               RETURN(count);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lprocfs_wr_ping);
+
+/* Write the connection UUID to this file to attempt to connect to that node.
+ * The connection UUID is a node's primary NID. For example,
+ * "echo connection=192.168.0.1@tcp0::instance > .../import".
+ */
+int lprocfs_wr_import(struct file *file, const char *buffer,
+                     unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_import *imp = obd->u.cli.cl_import;
+       char *kbuf = NULL;
+       char *uuid;
+       char *ptr;
+       int do_reconn = 1;
+       const char prefix[] = "connection=";
+       const int prefix_len = sizeof(prefix) - 1;
+
+       if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len)
+               return -EINVAL;
+
+       OBD_ALLOC(kbuf, count + 1);
+       if (kbuf == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(kbuf, buffer, count))
+               GOTO(out, count = -EFAULT);
+
+       kbuf[count] = 0;
+
+       /* only support connection=uuid::instance now */
+       if (strncmp(prefix, kbuf, prefix_len) != 0)
+               GOTO(out, count = -EINVAL);
+
+       uuid = kbuf + prefix_len;
+       ptr = strstr(uuid, "::");
+       if (ptr) {
+               __u32 inst;
+               char *endptr;
+
+               *ptr = 0;
+               do_reconn = 0;
+               ptr += strlen("::");
+               inst = simple_strtol(ptr, &endptr, 10);
+               if (*endptr) {
+                       CERROR("config: wrong instance # %s\n", ptr);
+               } else if (inst != imp->imp_connect_data.ocd_instance) {
+                       CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted "
+                              "target(%u/%u), reconnecting...\n",
+                              imp->imp_obd->obd_name,
+                              imp->imp_connect_data.ocd_instance, inst);
+                       do_reconn = 1;
+               } else {
+                       CDEBUG(D_INFO, "IR: %s has already been connecting to "
+                              "new target(%u)\n",
+                              imp->imp_obd->obd_name, inst);
+               }
+       }
+
+       if (do_reconn)
+               ptlrpc_recover_import(imp, uuid, 1);
+
+out:
+       OBD_FREE(kbuf, count + 1);
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_import);
+
+int lprocfs_rd_pinger_recov(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_import *imp = obd->u.cli.cl_import;
+       int rc;
+
+       LPROCFS_CLIMP_CHECK(obd);
+       rc = snprintf(page, count, "%d\n", !imp->imp_no_pinger_recover);
+       LPROCFS_CLIMP_EXIT(obd);
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_pinger_recov);
+
+int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+                     unsigned long count, void *data)
+{
+       struct obd_device *obd = data;
+       struct client_obd *cli = &obd->u.cli;
+       struct obd_import *imp = cli->cl_import;
+       int rc, val;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val != 0 && val != 1)
+               return -ERANGE;
+
+       LPROCFS_CLIMP_CHECK(obd);
+       spin_lock(&imp->imp_lock);
+       imp->imp_no_pinger_recover = !val;
+       spin_unlock(&imp->imp_lock);
+       LPROCFS_CLIMP_EXIT(obd);
+
+       return count;
+
+}
+EXPORT_SYMBOL(lprocfs_wr_pinger_recov);
+
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c
new file mode 100644 (file)
index 0000000..de3f0db
--- /dev/null
@@ -0,0 +1,728 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_lib.h>
+#include <obd.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * Helper function. Sends \a len bytes from \a base at offset \a offset
+ * over \a conn connection to portal \a portal.
+ * Returns 0 on success or error code.
+ */
+static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
+                        lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+                        struct ptlrpc_connection *conn, int portal, __u64 xid,
+                        unsigned int offset)
+{
+       int           rc;
+       lnet_md_t        md;
+       ENTRY;
+
+       LASSERT (portal != 0);
+       LASSERT (conn != NULL);
+       CDEBUG (D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer));
+       md.start     = base;
+       md.length    = len;
+       md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
+       md.options   = PTLRPC_MD_OPTIONS;
+       md.user_ptr  = cbid;
+       md.eq_handle = ptlrpc_eq_h;
+
+       if (unlikely(ack == LNET_ACK_REQ &&
+                    OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
+               /* don't ask for the ack to simulate failing client */
+               ack = LNET_NOACK_REQ;
+       }
+
+       rc = LNetMDBind (md, LNET_UNLINK, mdh);
+       if (unlikely(rc != 0)) {
+               CERROR ("LNetMDBind failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+               RETURN (-ENOMEM);
+       }
+
+       CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n",
+              len, portal, xid, offset);
+
+       rc = LNetPut (conn->c_self, *mdh, ack,
+                     conn->c_peer, portal, xid, offset, 0);
+       if (unlikely(rc != 0)) {
+               int rc2;
+               /* We're going to get an UNLINK event when I unlink below,
+                * which will complete just like any other failed send, so
+                * I fall through and return success here! */
+               CERROR("LNetPut(%s, %d, "LPD64") failed: %d\n",
+                      libcfs_id2str(conn->c_peer), portal, xid, rc);
+               rc2 = LNetMDUnlink(*mdh);
+               LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
+       }
+
+       RETURN (0);
+}
+
+static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count)
+{
+       int i;
+
+       for (i = 0; i < count; i++)
+               LNetMDUnlink(bd_mds[i]);
+}
+
+
+/**
+ * Register bulk at the sender for later transfer.
+ * Returns 0 on success or error code.
+ */
+int ptlrpc_register_bulk(struct ptlrpc_request *req)
+{
+       struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+       lnet_process_id_t peer;
+       int rc = 0;
+       int rc2;
+       int posted_md;
+       int total_md;
+       __u64 xid;
+       lnet_handle_me_t  me_h;
+       lnet_md_t        md;
+       ENTRY;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET))
+               RETURN(0);
+
+       /* NB no locking required until desc is on the network */
+       LASSERT(desc->bd_nob > 0);
+       LASSERT(desc->bd_md_count == 0);
+       LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT);
+       LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+       LASSERT(desc->bd_req != NULL);
+       LASSERT(desc->bd_type == BULK_PUT_SINK ||
+               desc->bd_type == BULK_GET_SOURCE);
+
+       /* clean up the state of the bulk because it will be reused */
+       if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY)
+               desc->bd_nob_transferred = 0;
+       else
+               LASSERT(desc->bd_nob_transferred == 0);
+
+       desc->bd_failure = 0;
+
+       peer = desc->bd_import->imp_connection->c_peer;
+
+       LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback);
+       LASSERT(desc->bd_cbid.cbid_arg == desc);
+
+       /* An XID is only used for a single request from the client.
+        * For retried bulk transfers, a new XID will be allocated
+        * in ptlrpc_check_set() if it needs to be resent, so it is not
+        * using the same RDMA match bits after an error.
+        *
+        * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The
+        * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */
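+       /*
+        * For example, with bd_md_max_brw == 4 and rq_xid == 0x1007, the
+        * first bulk XID computed below is 0x1004; if three MDs are needed
+        * they use XIDs 0x1004..0x1006 and rq_xid is then set to 0x1006,
+        * the matchbits of the final bulk.
+        */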
+       xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1);
+       LASSERTF(!(desc->bd_registered &&
+                  req->rq_send_state != LUSTRE_IMP_REPLAY) ||
+                xid != desc->bd_last_xid,
+                "registered: %d  rq_xid: "LPU64" bd_last_xid: "LPU64"\n",
+                desc->bd_registered, xid, desc->bd_last_xid);
+
+       total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV;
+       desc->bd_registered = 1;
+       desc->bd_last_xid = xid;
+       desc->bd_md_count = total_md;
+       md.user_ptr = &desc->bd_cbid;
+       md.eq_handle = ptlrpc_eq_h;
+       md.threshold = 1;                      /* PUT or GET */
+
+       for (posted_md = 0; posted_md < total_md; posted_md++, xid++) {
+               md.options = PTLRPC_MD_OPTIONS |
+                            ((desc->bd_type == BULK_GET_SOURCE) ?
+                             LNET_MD_OP_GET : LNET_MD_OP_PUT);
+               ptlrpc_fill_bulk_md(&md, desc, posted_md);
+
+               rc = LNetMEAttach(desc->bd_portal, peer, xid, 0,
+                                 LNET_UNLINK, LNET_INS_AFTER, &me_h);
+               if (rc != 0) {
+                       CERROR("%s: LNetMEAttach failed x"LPU64"/%d: rc = %d\n",
+                              desc->bd_export->exp_obd->obd_name, xid,
+                              posted_md, rc);
+                       break;
+               }
+
+               /* About to let the network at it... */
+               rc = LNetMDAttach(me_h, md, LNET_UNLINK,
+                                 &desc->bd_mds[posted_md]);
+               if (rc != 0) {
+                       CERROR("%s: LNetMDAttach failed x"LPU64"/%d: rc = %d\n",
+                              desc->bd_export->exp_obd->obd_name, xid,
+                              posted_md, rc);
+                       rc2 = LNetMEUnlink(me_h);
+                       LASSERT(rc2 == 0);
+                       break;
+               }
+       }
+
+       if (rc != 0) {
+               LASSERT(rc == -ENOMEM);
+               spin_lock(&desc->bd_lock);
+               desc->bd_md_count -= total_md - posted_md;
+               spin_unlock(&desc->bd_lock);
+               LASSERT(desc->bd_md_count >= 0);
+               mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+               req->rq_status = -ENOMEM;
+               RETURN(-ENOMEM);
+       }
+
+       /* Set rq_xid to matchbits of the final bulk so that server can
+        * infer the number of bulks that were prepared */
+       req->rq_xid = --xid;
+       LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK),
+                "bd_last_xid = x"LPU64", rq_xid = x"LPU64"\n",
+                desc->bd_last_xid, req->rq_xid);
+
+       spin_lock(&desc->bd_lock);
+       /* Holler if peer manages to touch buffers before he knows the xid */
+       if (desc->bd_md_count != total_md)
+               CWARN("%s: Peer %s touched %d buffers while I registered\n",
+                     desc->bd_export->exp_obd->obd_name, libcfs_id2str(peer),
+                     total_md - desc->bd_md_count);
+       spin_unlock(&desc->bd_lock);
+
+       CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, "
+              "xid x"LPX64"-"LPX64", portal %u\n", desc->bd_md_count,
+              desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
+              desc->bd_iov_count, desc->bd_nob,
+              desc->bd_last_xid, req->rq_xid, desc->bd_portal);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_register_bulk);
+
+/**
+ * Disconnect a bulk desc from the network. Idempotent. Not
+ * thread-safe (i.e. only interlocks with completion callback).
+ * Returns 1 on success or 0 if network unregistration failed for whatever
+ * reason.
+ */
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async)
+{
+       struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+       wait_queue_head_t            *wq;
+       struct l_wait_info       lwi;
+       int                   rc;
+       ENTRY;
+
+       LASSERT(!in_interrupt());     /* might sleep */
+
+       /* Let's set up the deadline for bulk unlink. */
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+           async && req->rq_bulk_deadline == 0)
+               req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK;
+
+       if (ptlrpc_client_bulk_active(req) == 0)        /* completed or */
+               RETURN(1);                              /* never registered */
+
+       LASSERT(desc->bd_req == req);  /* bd_req NULL until registered */
+
+       /* the unlink ensures the callback happens ASAP and is the last
+        * one.  If it fails, it must be because completion just happened,
+        * but we must still l_wait_event() in this case to give liblustre
+        * a chance to run client_bulk_callback() */
+       mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+
+       if (ptlrpc_client_bulk_active(req) == 0)        /* completed or */
+               RETURN(1);                              /* never registered */
+
+       /* Move to "Unregistering" phase as bulk was not unlinked yet. */
+       ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
+
+       /* Do not wait for unlink to finish. */
+       if (async)
+               RETURN(0);
+
+       if (req->rq_set != NULL)
+               wq = &req->rq_set->set_waitq;
+       else
+               wq = &req->rq_reply_waitq;
+
+       for (;;) {
+               /* Network access will complete in finite time but the HUGE
+                * timeout lets us CWARN for visibility of sluggish NALs */
+               lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+                                          cfs_time_seconds(1), NULL, NULL);
+               rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi);
+               if (rc == 0) {
+                       ptlrpc_rqphase_move(req, req->rq_next_phase);
+                       RETURN(1);
+               }
+
+               LASSERT(rc == -ETIMEDOUT);
+               DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p",
+                         desc);
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_bulk);
+
+static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
+{
+       struct ptlrpc_service_part      *svcpt = req->rq_rqbd->rqbd_svcpt;
+       struct ptlrpc_service           *svc = svcpt->scp_service;
+       int service_time = max_t(int, cfs_time_current_sec() -
+                                req->rq_arrival_time.tv_sec, 1);
+
+       if (!(flags & PTLRPC_REPLY_EARLY) &&
+           (req->rq_type != PTL_RPC_MSG_ERR) &&
+           (req->rq_reqmsg != NULL) &&
+           !(lustre_msg_get_flags(req->rq_reqmsg) &
+             (MSG_RESENT | MSG_REPLAY |
+              MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) {
+               /* early replies, errors and recovery requests don't count
+                * toward our service time estimate */
+               int oldse = at_measured(&svcpt->scp_at_estimate, service_time);
+
+               if (oldse != 0) {
+                       DEBUG_REQ(D_ADAPTTO, req,
+                                 "svc %s changed estimate from %d to %d",
+                                 svc->srv_name, oldse,
+                                 at_get(&svcpt->scp_at_estimate));
+               }
+       }
+       /* Report actual service time for client latency calc */
+       lustre_msg_set_service_time(req->rq_repmsg, service_time);
+       /* Report service time estimate for future client reqs, but report 0
+        * (to be ignored by client) if it's an error reply during recovery.
+        * (bz15815) */
+       if (req->rq_type == PTL_RPC_MSG_ERR &&
+           (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering))
+               lustre_msg_set_timeout(req->rq_repmsg, 0);
+       else
+               lustre_msg_set_timeout(req->rq_repmsg,
+                                      at_get(&svcpt->scp_at_estimate));
+
+       if (req->rq_reqmsg &&
+           !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+               CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
+                      "req_flags=%#x magic=%d:%x/%x len=%d\n",
+                      flags, lustre_msg_get_flags(req->rq_reqmsg),
+                      lustre_msg_is_v1(req->rq_reqmsg),
+                      lustre_msg_get_magic(req->rq_reqmsg),
+                      lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
+       }
+}
+
+/**
+ * Send request reply from request \a req reply buffer.
+ * \a flags defines reply types
+ * Returns 0 on success or error code
+ */
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       struct ptlrpc_connection  *conn;
+       int                     rc;
+
+       /* We must already have a reply buffer (only ptlrpc_error() may be
+        * called without one). The reply generated by sptlrpc layer (e.g.
+        * error notify, etc.) might have a NULL req->rq_reqmsg; otherwise we
+        * must have a request buffer which is either the actual (swabbed)
+        * incoming request, or a saved copy if this is a req saved in
+        * target_queue_final_reply().
+        */
+       LASSERT (req->rq_no_reply == 0);
+       LASSERT (req->rq_reqbuf != NULL);
+       LASSERT (rs != NULL);
+       LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
+       LASSERT (req->rq_repmsg != NULL);
+       LASSERT (req->rq_repmsg == rs->rs_msg);
+       LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
+       LASSERT (rs->rs_cb_id.cbid_arg == rs);
+
+       /* There may be no rq_export during failover */
+
+       if (unlikely(req->rq_export && req->rq_export->exp_obd &&
+                    req->rq_export->exp_obd->obd_fail)) {
+               /* Failed obd's only send ENODEV */
+               req->rq_type = PTL_RPC_MSG_ERR;
+               req->rq_status = -ENODEV;
+               CDEBUG(D_HA, "sending ENODEV from failed obd %d\n",
+                      req->rq_export->exp_obd->obd_minor);
+       }
+
+       /* In order to keep interoperability with clients (< 2.3) that do not
+        * have pb_jobid in ptlrpc_body, we have to shrink the ptlrpc_body in
+        * the reply buffer to ptlrpc_body_v2; otherwise the reply buffer on
+        * the client will overflow.
+        *
+        * XXX Remove this whenever we drop interoperability with such clients.
+        */
+       req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0,
+                                          sizeof(struct ptlrpc_body_v2), 1);
+
+       if (req->rq_type != PTL_RPC_MSG_ERR)
+               req->rq_type = PTL_RPC_MSG_REPLY;
+
+       lustre_msg_set_type(req->rq_repmsg, req->rq_type);
+       lustre_msg_set_status(req->rq_repmsg, req->rq_status);
+       lustre_msg_set_opc(req->rq_repmsg,
+               req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0);
+
+       target_pack_pool_reply(req);
+
+       ptlrpc_at_set_reply(req, flags);
+
+       if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
+               conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL);
+       else
+               conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
+
+       if (unlikely(conn == NULL)) {
+               CERROR("not replying on NULL connection\n"); /* bug 9635 */
+               return -ENOTCONN;
+       }
+       ptlrpc_rs_addref(rs);              /* +1 ref for the network */
+
+       rc = sptlrpc_svc_wrap_reply(req);
+       if (unlikely(rc))
+               goto out;
+
+       req->rq_sent = cfs_time_current_sec();
+
+       rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
+                          (rs->rs_difficult && !rs->rs_no_ack) ?
+                          LNET_ACK_REQ : LNET_NOACK_REQ,
+                          &rs->rs_cb_id, conn,
+                          ptlrpc_req2svc(req)->srv_rep_portal,
+                          req->rq_xid, req->rq_reply_off);
+out:
+       if (unlikely(rc != 0))
+               ptlrpc_req_drop_rs(req);
+       ptlrpc_connection_put(conn);
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_send_reply);
+
+int ptlrpc_reply(struct ptlrpc_request *req)
+{
+       if (req->rq_no_reply)
+               return 0;
+       else
+               return ptlrpc_send_reply(req, 0);
+}
+EXPORT_SYMBOL(ptlrpc_reply);
+
+/**
+ * For request \a req send an error reply back. Create empty
+ * reply buffers if necessary.
+ */
+int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult)
+{
+       int rc;
+       ENTRY;
+
+       if (req->rq_no_reply)
+               RETURN(0);
+
+       if (!req->rq_repmsg) {
+               rc = lustre_pack_reply(req, 1, NULL, NULL);
+               if (rc)
+                       RETURN(rc);
+       }
+
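+       /* The statuses below are expected failures that clients handle via
+        * rq_status in an ordinary reply; anything else becomes an error
+        * message. */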
+       if (req->rq_status != -ENOSPC && req->rq_status != -EACCES &&
+           req->rq_status != -EPERM && req->rq_status != -ENOENT &&
+           req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT)
+               req->rq_type = PTL_RPC_MSG_ERR;
+
+       rc = ptlrpc_send_reply(req, may_be_difficult);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_send_error);
+
+int ptlrpc_error(struct ptlrpc_request *req)
+{
+       return ptlrpc_send_error(req, 0);
+}
+EXPORT_SYMBOL(ptlrpc_error);
+
+/**
+ * Send request \a request.
+ * If \a noreply is set, don't expect any reply back and don't set up
+ * reply buffers.
+ * Returns 0 on success or an error code.
+ */
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
+{
+       int rc;
+       int rc2;
+       int mpflag = 0;
+       struct ptlrpc_connection *connection;
+       lnet_handle_me_t  reply_me_h;
+       lnet_md_t        reply_md;
+       struct obd_device *obd = request->rq_import->imp_obd;
+       ENTRY;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
+               RETURN(0);
+
+       LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
+       LASSERT(request->rq_wait_ctx == 0);
+
+       /* If this is a re-transmit, we're required to have disengaged
+        * cleanly from the previous attempt */
+       LASSERT(!request->rq_receiving_reply);
+
+       if (request->rq_import->imp_obd &&
+           request->rq_import->imp_obd->obd_fail) {
+               CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
+                      request->rq_import->imp_obd->obd_name);
+               /* this prevents us from waiting in ptlrpc_queue_wait */
+               request->rq_err = 1;
+               request->rq_status = -ENODEV;
+               RETURN(-ENODEV);
+       }
+
+       connection = request->rq_import->imp_connection;
+
+       lustre_msg_set_handle(request->rq_reqmsg,
+                             &request->rq_import->imp_remote_handle);
+       lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
+       lustre_msg_set_conn_cnt(request->rq_reqmsg,
+                               request->rq_import->imp_conn_cnt);
+       lustre_msghdr_set_flags(request->rq_reqmsg,
+                               request->rq_import->imp_msghdr_flags);
+
+       if (request->rq_resend)
+               lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
+
+       if (request->rq_memalloc)
+               mpflag = cfs_memory_pressure_get_and_set();
+
+       rc = sptlrpc_cli_wrap_request(request);
+       if (rc)
+               GOTO(out, rc);
+
+       /* bulk register should be done after wrap_request() */
+       if (request->rq_bulk != NULL) {
+               rc = ptlrpc_register_bulk(request);
+               if (rc != 0)
+                       GOTO(out, rc);
+       }
+
+       if (!noreply) {
+               LASSERT(request->rq_replen != 0);
+               if (request->rq_repbuf == NULL) {
+                       LASSERT(request->rq_repdata == NULL);
+                       LASSERT(request->rq_repmsg == NULL);
+                       rc = sptlrpc_cli_alloc_repbuf(request,
+                                                     request->rq_replen);
+                       if (rc) {
+                               /* this prevents us from looping in
+                                * ptlrpc_queue_wait */
+                               request->rq_err = 1;
+                               request->rq_status = rc;
+                               GOTO(cleanup_bulk, rc);
+                       }
+               } else {
+                       request->rq_repdata = NULL;
+                       request->rq_repmsg = NULL;
+               }
+
+               rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
+                                 connection->c_peer, request->rq_xid, 0,
+                                 LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
+               if (rc != 0) {
+                       CERROR("LNetMEAttach failed: %d\n", rc);
+                       LASSERT(rc == -ENOMEM);
+                       GOTO(cleanup_bulk, rc = -ENOMEM);
+               }
+       }
+
+       spin_lock(&request->rq_lock);
+       /* If the MD attach succeeds, there _will_ be a reply_in callback */
+       request->rq_receiving_reply = !noreply;
+       /* We are responsible for unlinking the reply buffer */
+       request->rq_must_unlink = !noreply;
+       /* Clear any flags that may be present from previous sends. */
+       request->rq_replied = 0;
+       request->rq_err = 0;
+       request->rq_timedout = 0;
+       request->rq_net_err = 0;
+       request->rq_resend = 0;
+       request->rq_restart = 0;
+       request->rq_reply_truncate = 0;
+       spin_unlock(&request->rq_lock);
+
+       if (!noreply) {
+               reply_md.start     = request->rq_repbuf;
+               reply_md.length    = request->rq_repbuf_len;
+               /* Allow multiple early replies */
+               reply_md.threshold = LNET_MD_THRESH_INF;
+               /* Manage remote for early replies */
+               reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
+                       LNET_MD_MANAGE_REMOTE |
+                       LNET_MD_TRUNCATE; /* allow truncation to produce an EOVERFLOW error */
+               reply_md.user_ptr  = &request->rq_reply_cbid;
+               reply_md.eq_handle = ptlrpc_eq_h;
+
+               /* We must see the unlink callback to unset rq_must_unlink,
+                  so we can't auto-unlink */
+               rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
+                                 &request->rq_reply_md_h);
+               if (rc != 0) {
+                       CERROR("LNetMDAttach failed: %d\n", rc);
+                       LASSERT(rc == -ENOMEM);
+                       spin_lock(&request->rq_lock);
+                       /* ...but the MD attach didn't succeed... */
+                       request->rq_receiving_reply = 0;
+                       spin_unlock(&request->rq_lock);
+                       GOTO(cleanup_me, rc = -ENOMEM);
+               }
+
+               CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
+                      ", portal %u\n",
+                      request->rq_repbuf_len, request->rq_xid,
+                      request->rq_reply_portal);
+       }
+
+       /* add references on request for request_out_callback */
+       ptlrpc_request_addref(request);
+       if (obd->obd_svc_stats != NULL)
+               lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
+                       atomic_read(&request->rq_import->imp_inflight));
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
+
+       do_gettimeofday(&request->rq_arrival_time);
+       request->rq_sent = cfs_time_current_sec();
+       /* We give the server rq_timeout secs to process the req, and
+          add the network latency for our local timeout. */
+       request->rq_deadline = request->rq_sent + request->rq_timeout +
+               ptlrpc_at_get_net_latency(request);
+
+       ptlrpc_pinger_sending_on_import(request->rq_import);
+
+       DEBUG_REQ(D_INFO, request, "send flg=%x",
+                 lustre_msg_get_flags(request->rq_reqmsg));
+       rc = ptl_send_buf(&request->rq_req_md_h,
+                         request->rq_reqbuf, request->rq_reqdata_len,
+                         LNET_NOACK_REQ, &request->rq_req_cbid,
+                         connection,
+                         request->rq_request_portal,
+                         request->rq_xid, 0);
+       if (rc == 0)
+               GOTO(out, rc);
+
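+       /* The send failed: drop the reference taken above for
+        * request_out_callback. */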
+       ptlrpc_req_finished(request);
+       if (noreply)
+               GOTO(out, rc);
+
+ cleanup_me:
+       /* MEUnlink is safe; the PUT didn't even get off the ground, and
+        * nobody apart from the PUT's target has the right nid+XID to
+        * access the reply buffer. */
+       rc2 = LNetMEUnlink(reply_me_h);
+       LASSERT(rc2 == 0);
+       /* UNLINKED callback called synchronously */
+       LASSERT(!request->rq_receiving_reply);
+
+ cleanup_bulk:
+       /* We do a synchronous unlink here as there was no real transfer, so
+        * the chance of a long unlink due to a sluggish network is smaller. */
+       ptlrpc_unregister_bulk(request, 0);
+ out:
+       if (request->rq_memalloc)
+               cfs_memory_pressure_restore(mpflag);
+       return rc;
+}
+EXPORT_SYMBOL(ptl_send_rpc);
+
+/**
+ * Register request buffer descriptor for request receiving.
+ */
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+       struct ptlrpc_service     *service = rqbd->rqbd_svcpt->scp_service;
+       static lnet_process_id_t  match_id = {LNET_NID_ANY, LNET_PID_ANY};
+       int                       rc;
+       lnet_md_t                md;
+       lnet_handle_me_t          me_h;
+
+       CDEBUG(D_NET, "LNetMEAttach: portal %d\n",
+              service->srv_req_portal);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD))
+               return (-ENOMEM);
+
+       /* NB: a CPT-affinity service should use the new LNet flag
+        * LNET_INS_LOCAL, which means the buffer can only be attached on the
+        * local CPT, and LND threads can find it by grabbing a local lock */
+       rc = LNetMEAttach(service->srv_req_portal,
+                         match_id, 0, ~0, LNET_UNLINK,
+                         rqbd->rqbd_svcpt->scp_cpt >= 0 ?
+                         LNET_INS_LOCAL : LNET_INS_AFTER, &me_h);
+       if (rc != 0) {
+               CERROR("LNetMEAttach failed: %d\n", rc);
+               return (-ENOMEM);
+       }
+
+       LASSERT(rqbd->rqbd_refcount == 0);
+       rqbd->rqbd_refcount = 1;
+
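+       /* Describe the request buffer to LNet; the infinite threshold lets
+        * it receive multiple incoming requests. */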
+       md.start     = rqbd->rqbd_buffer;
+       md.length    = service->srv_buf_size;
+       md.max_size  = service->srv_max_req_size;
+       md.threshold = LNET_MD_THRESH_INF;
+       md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE;
+       md.user_ptr  = &rqbd->rqbd_cbid;
+       md.eq_handle = ptlrpc_eq_h;
+
+       rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h);
+       if (rc == 0)
+               return (0);
+
+       CERROR("LNetMDAttach failed: %d\n", rc);
+       LASSERT(rc == -ENOMEM);
+       rc = LNetMEUnlink(me_h);
+       LASSERT(rc == 0);
+       rqbd->rqbd_refcount = 0;
+
+       return (-ENOMEM);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/drivers/staging/lustre/lustre/ptlrpc/nrs.c
new file mode 100644 (file)
index 0000000..1996431
--- /dev/null
@@ -0,0 +1,1790 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs.c
+ *
+ * Network Request Scheduler (NRS)
+ *
+ * Allows the handling of RPCs at servers to be reordered.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+#include <linux/libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/* XXX: This is just for liblustre. Remove the #if defined directive when the
+ * "cfs_" prefix is dropped from cfs_list_head. */
+extern struct list_head ptlrpc_all_services;
+
+/**
+ * NRS core object.
+ */
+struct nrs_core nrs_core;
+
+static int nrs_policy_init(struct ptlrpc_nrs_policy *policy)
+{
+       return policy->pol_desc->pd_ops->op_policy_init != NULL ?
+              policy->pol_desc->pd_ops->op_policy_init(policy) : 0;
+}
+
+static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy)
+{
+       LASSERT(policy->pol_ref == 0);
+       LASSERT(policy->pol_req_queued == 0);
+
+       if (policy->pol_desc->pd_ops->op_policy_fini != NULL)
+               policy->pol_desc->pd_ops->op_policy_fini(policy);
+}
+
+static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy,
+                                enum ptlrpc_nrs_ctl opc, void *arg)
+{
+       /**
+        * The policy may be stopped, but the lprocfs files and
+        * ptlrpc_nrs_policy instances remain present until unregistration time.
+        * Do not perform the ctl operation if the policy is stopped, as
+        * policy->pol_private will be NULL in such a case.
+        */
+       if (policy->pol_state == NRS_POL_STATE_STOPPED)
+               RETURN(-ENODEV);
+
+       RETURN(policy->pol_desc->pd_ops->op_policy_ctl != NULL ?
+              policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) :
+              -ENOSYS);
+}
+
+static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy)
+{
+       struct ptlrpc_nrs *nrs = policy->pol_nrs;
+       ENTRY;
+
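+       /* Drop nrs_lock across the policy's stop handler, which may block. */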
+       if (policy->pol_desc->pd_ops->op_policy_stop != NULL) {
+               spin_unlock(&nrs->nrs_lock);
+
+               policy->pol_desc->pd_ops->op_policy_stop(policy);
+
+               spin_lock(&nrs->nrs_lock);
+       }
+
+       LASSERT(list_empty(&policy->pol_list_queued));
+       LASSERT(policy->pol_req_queued == 0 &&
+               policy->pol_req_started == 0);
+
+       policy->pol_private = NULL;
+
+       policy->pol_state = NRS_POL_STATE_STOPPED;
+
+       if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+               module_put(policy->pol_desc->pd_owner);
+
+       EXIT;
+}
+
+static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy)
+{
+       struct ptlrpc_nrs *nrs = policy->pol_nrs;
+       ENTRY;
+
+       if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping)
+               RETURN(-EPERM);
+
+       if (policy->pol_state == NRS_POL_STATE_STARTING)
+               RETURN(-EAGAIN);
+
+       /* In progress or already stopped */
+       if (policy->pol_state != NRS_POL_STATE_STARTED)
+               RETURN(0);
+
+       policy->pol_state = NRS_POL_STATE_STOPPING;
+
+       /* Immediately make it invisible */
+       if (nrs->nrs_policy_primary == policy) {
+               nrs->nrs_policy_primary = NULL;
+
+       } else {
+               LASSERT(nrs->nrs_policy_fallback == policy);
+               nrs->nrs_policy_fallback = NULL;
+       }
+
+       /* I have the only refcount */
+       if (policy->pol_ref == 1)
+               nrs_policy_stop0(policy);
+
+       RETURN(0);
+}
+
+/**
+ * Transitions the \a nrs NRS head's primary policy to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no
+ * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * \param[in] nrs the NRS head to carry out this operation on
+ */
+static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs)
+{
+       struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary;
+       ENTRY;
+
+       if (tmp == NULL) {
+               /**
+                * XXX: This should really be RETURN_EXIT, but the latter does
+                * not currently print anything out, and possibly should be
+                * fixed to do so.
+                */
+               EXIT;
+               return;
+       }
+
+       nrs->nrs_policy_primary = NULL;
+
+       LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED);
+       tmp->pol_state = NRS_POL_STATE_STOPPING;
+
+       if (tmp->pol_ref == 0)
+               nrs_policy_stop0(tmp);
+       EXIT;
+}
+
+/**
+ * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in
+ * response to an lprocfs command to start a policy.
+ *
+ * If a primary policy different from the current one is specified, this
+ * function will transition the new policy to the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition
+ * the old primary policy (if there is one) to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * If the fallback policy is specified, this is taken as an instruction to stop
+ * the current primary policy, without substituting it with another primary
+ * policy, so the primary policy (if any) is transitioned to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. In
+ * this case, only the fallback policy is left active on the NRS head.
+ */
+static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy)
+{
+       struct ptlrpc_nrs      *nrs = policy->pol_nrs;
+       int                     rc = 0;
+       ENTRY;
+
+       /**
+        * Don't allow multiple policies to be starting concurrently; it is
+        * too complex and has no real benefit.
+        */
+       if (nrs->nrs_policy_starting)
+               RETURN(-EAGAIN);
+
+       LASSERT(policy->pol_state != NRS_POL_STATE_STARTING);
+
+       if (policy->pol_state == NRS_POL_STATE_STOPPING)
+               RETURN(-EAGAIN);
+
+       if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+               /**
+                * This is for cases in which the user sets the policy to the
+                * fallback policy (currently fifo for all services); i.e. the
+                * user is resetting the policy to the default; so we stop the
+                * primary policy, if any.
+                */
+               if (policy == nrs->nrs_policy_fallback) {
+                       nrs_policy_stop_primary(nrs);
+                       RETURN(0);
+               }
+
+               /**
+                * If we reach here, we must be setting up the fallback policy
+                * at service startup time, and only a single policy with the
+                * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can
+                * register with NRS core.
+                */
+               LASSERT(nrs->nrs_policy_fallback == NULL);
+       } else {
+               /**
+                * A primary policy should not be started without a fallback policy.
+                */
+               if (nrs->nrs_policy_fallback == NULL)
+                       RETURN(-EPERM);
+
+               if (policy->pol_state == NRS_POL_STATE_STARTED)
+                       RETURN(0);
+       }
+
+       /**
+        * Increase the module usage count for policies registering from other
+        * modules.
+        */
+       if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 &&
+           !try_module_get(policy->pol_desc->pd_owner)) {
+               atomic_dec(&policy->pol_desc->pd_refs);
+               CERROR("NRS: cannot get module for policy %s; is it alive?\n",
+                      policy->pol_desc->pd_name);
+               RETURN(-ENODEV);
+       }
+
+       /**
+        * Serialize policy starting across the NRS head
+        */
+       nrs->nrs_policy_starting = 1;
+
+       policy->pol_state = NRS_POL_STATE_STARTING;
+
+       if (policy->pol_desc->pd_ops->op_policy_start) {
+               spin_unlock(&nrs->nrs_lock);
+
+               rc = policy->pol_desc->pd_ops->op_policy_start(policy);
+
+               spin_lock(&nrs->nrs_lock);
+               if (rc != 0) {
+                       if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+                               module_put(policy->pol_desc->pd_owner);
+
+                       policy->pol_state = NRS_POL_STATE_STOPPED;
+                       GOTO(out, rc);
+               }
+       }
+
+       policy->pol_state = NRS_POL_STATE_STARTED;
+
+       if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+               /**
+                * This path is only used at PTLRPC service setup time.
+                */
+               nrs->nrs_policy_fallback = policy;
+       } else {
+               /*
+                * Try to stop the current primary policy if there is one.
+                */
+               nrs_policy_stop_primary(nrs);
+
+               /**
+                * And set the newly-started policy as the primary one.
+                */
+               nrs->nrs_policy_primary = policy;
+       }
+
+out:
+       nrs->nrs_policy_starting = 0;
+
+       RETURN(rc);
+}
+
+/**
+ * Increases the policy's usage reference count.
+ */
+static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy)
+{
+       policy->pol_ref++;
+}
+
+/**
+ * Decreases the policy's usage reference count, and stops the policy if it
+ * was stopping and has no more outstanding usage references (which
+ * indicates it has no more queued or started requests, and can be safely
+ * stopped).
+ */
+static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy)
+{
+       LASSERT(policy->pol_ref > 0);
+
+       policy->pol_ref--;
+       if (unlikely(policy->pol_ref == 0 &&
+           policy->pol_state == NRS_POL_STATE_STOPPING))
+               nrs_policy_stop0(policy);
+}
+
+static void nrs_policy_put(struct ptlrpc_nrs_policy *policy)
+{
+       spin_lock(&policy->pol_nrs->nrs_lock);
+       nrs_policy_put_locked(policy);
+       spin_unlock(&policy->pol_nrs->nrs_lock);
+}
+
+/**
+ * Find and return a policy by name.
+ */
+static struct ptlrpc_nrs_policy *nrs_policy_find_locked(struct ptlrpc_nrs *nrs,
+                                                        char *name)
+{
+       struct ptlrpc_nrs_policy *tmp;
+
+       list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) {
+               if (strncmp(tmp->pol_desc->pd_name, name,
+                           NRS_POL_NAME_MAX) == 0) {
+                       nrs_policy_get_locked(tmp);
+                       return tmp;
+               }
+       }
+       return NULL;
+}
+
+/**
+ * Release references for the resource hierarchy moving upwards towards the
+ * policy instance resource.
+ */
+static void nrs_resource_put(struct ptlrpc_nrs_resource *res)
+{
+       struct ptlrpc_nrs_policy *policy = res->res_policy;
+
+       if (policy->pol_desc->pd_ops->op_res_put != NULL) {
+               struct ptlrpc_nrs_resource *parent;
+
+               for (; res != NULL; res = parent) {
+                       parent = res->res_parent;
+                       policy->pol_desc->pd_ops->op_res_put(policy, res);
+               }
+       }
+}
+
+/**
+ * Obtains references for each resource in the resource hierarchy for request
+ * \a nrq if it is to be handled by \a policy.
+ *
+ * \param[in] policy     the policy
+ * \param[in] nrq        the request
+ * \param[in] moving_req  denotes whether this is a call to the function by
+ *                       ldlm_lock_reorder_req(), in order to move \a nrq to
+ *                       the high-priority NRS head; the function must not
+ *                       sleep when this is set.
+ *
+ * \retval NULL                  resource hierarchy references not obtained
+ * \retval valid-pointer  the bottom level of the resource hierarchy
+ *
+ * \see ptlrpc_nrs_pol_ops::op_res_get()
+ */
+static
+struct ptlrpc_nrs_resource *nrs_resource_get(struct ptlrpc_nrs_policy *policy,
+                                             struct ptlrpc_nrs_request *nrq,
+                                             bool moving_req)
+{
+       /**
+        * Set to NULL to traverse the resource hierarchy from the top.
+        */
+       struct ptlrpc_nrs_resource *res = NULL;
+       struct ptlrpc_nrs_resource *tmp = NULL;
+       int                         rc;
+
+       while (1) {
+               rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res,
+                                                         &tmp, moving_req);
+               if (rc < 0) {
+                       if (res != NULL)
+                               nrs_resource_put(res);
+                       return NULL;
+               }
+
+               LASSERT(tmp != NULL);
+               tmp->res_parent = res;
+               tmp->res_policy = policy;
+               res = tmp;
+               tmp = NULL;
+               /**
+                * Return once we have obtained a reference to the bottom level
+                * of the resource hierarchy.
+                */
+               if (rc > 0)
+                       return res;
+       }
+}
+
+/**
+ * Obtains resources for the resource hierarchies and policy references for
+ * the fallback and current primary policy (if any), that will later be used
+ * to handle request \a nrq.
+ *
+ * \param[in]  nrs  the NRS head instance that will be handling request \a nrq.
+ * \param[in]  nrq  the request that is being handled.
+ * \param[out] resp the array where references to the resource hierarchy are
+ *                 stored.
+ * \param[in]  moving_req  is set when obtaining resources while moving a
+ *                        request from a policy on the regular NRS head to a
+ *                        policy on the HP NRS head (via
+ *                        ldlm_lock_reorder_req()). It signifies that
+ *                        allocations to get resources should be atomic; for
+ *                        a full explanation, see comment in
+ *                        ptlrpc_nrs_pol_ops::op_res_get().
+ */
+static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs,
+                                 struct ptlrpc_nrs_request *nrq,
+                                 struct ptlrpc_nrs_resource **resp,
+                                 bool moving_req)
+{
+       struct ptlrpc_nrs_policy   *primary = NULL;
+       struct ptlrpc_nrs_policy   *fallback = NULL;
+
+       memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX);
+
+       /**
+        * Obtain policy references.
+        */
+       spin_lock(&nrs->nrs_lock);
+
+       fallback = nrs->nrs_policy_fallback;
+       nrs_policy_get_locked(fallback);
+
+       primary = nrs->nrs_policy_primary;
+       if (primary != NULL)
+               nrs_policy_get_locked(primary);
+
+       spin_unlock(&nrs->nrs_lock);
+
+       /**
+        * Obtain resource hierarchy references.
+        */
+       resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req);
+       LASSERT(resp[NRS_RES_FALLBACK] != NULL);
+
+       if (primary != NULL) {
+               resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq,
+                                                        moving_req);
+               /**
+                * A primary policy may exist which may not wish to serve a
+                * particular request for different reasons; release the
+                * reference on the policy as it will not be used for this
+                * request.
+                */
+               if (resp[NRS_RES_PRIMARY] == NULL)
+                       nrs_policy_put(primary);
+       }
+}
+
+/**
+ * Releases references to resource hierarchies and policies, because they are no
+ * longer required; used when request handling has been completed, or the
+ * request is moving to the high priority NRS head.
+ *
+ * \param resp the resource hierarchy that is being released
+ *
+ * \see ptlrpcnrs_req_hp_move()
+ * \see ptlrpc_nrs_req_finalize()
+ */
+static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp)
+{
+       struct ptlrpc_nrs_policy *pols[NRS_RES_MAX];
+       struct ptlrpc_nrs        *nrs = NULL;
+       int                       i;
+
+       for (i = 0; i < NRS_RES_MAX; i++) {
+               if (resp[i] != NULL) {
+                       pols[i] = resp[i]->res_policy;
+                       nrs_resource_put(resp[i]);
+                       resp[i] = NULL;
+               } else {
+                       pols[i] = NULL;
+               }
+       }
+
+       for (i = 0; i < NRS_RES_MAX; i++) {
+               if (pols[i] == NULL)
+                       continue;
+
+               if (nrs == NULL) {
+                       nrs = pols[i]->pol_nrs;
+                       spin_lock(&nrs->nrs_lock);
+               }
+               nrs_policy_put_locked(pols[i]);
+       }
+
+       if (nrs != NULL)
+               spin_unlock(&nrs->nrs_lock);
+}
+
+/**
+ * Obtains an NRS request from \a policy for handling or examination; the
+ * request should be removed in the 'handling' case.
+ *
+ * Calling into this function implies we already know the policy has a request
+ * waiting to be handled.
+ *
+ * \param[in] policy the policy from which a request is obtained
+ * \param[in] peek   when set, signifies that we just want to examine the
+ *                  request, and not handle it, so the request is not removed
+ *                  from the policy.
+ * \param[in] force  when set, it will force a policy to return a request if it
+ *                  has one pending
+ *
+ * \retval the NRS request to be handled
+ */
+static inline
+struct ptlrpc_nrs_request *nrs_request_get(struct ptlrpc_nrs_policy *policy,
+                                           bool peek, bool force)
+{
+       struct ptlrpc_nrs_request *nrq;
+
+       LASSERT(policy->pol_req_queued > 0);
+
+       nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force);
+
+       LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy));
+
+       return nrq;
+}
+
+/**
+ * Enqueues request \a nrq for later handling, via one of the policies for
+ * which resources were earlier obtained via nrs_resource_get_safe(). The
+ * function attempts to enqueue the request first on the primary policy
+ * (if any), since this is the preferred choice.
+ *
+ * \param nrq the request being enqueued
+ *
+ * \see nrs_resource_get_safe()
+ */
+static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq)
+{
+       struct ptlrpc_nrs_policy *policy;
+       int                       rc;
+       int                       i;
+
+       /**
+        * Try in descending order, because the primary policy (if any) is
+        * the preferred choice.
+        */
+       for (i = NRS_RES_MAX - 1; i >= 0; i--) {
+               if (nrq->nr_res_ptrs[i] == NULL)
+                       continue;
+
+               nrq->nr_res_idx = i;
+               policy = nrq->nr_res_ptrs[i]->res_policy;
+
+               rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq);
+               if (rc == 0) {
+                       policy->pol_nrs->nrs_req_queued++;
+                       policy->pol_req_queued++;
+                       return;
+               }
+       }
+       /**
+        * Should never get here, as at least the primary policy's
+        * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always
+        * succeed.
+        */
+       LBUG();
+}
+
+/**
+ * Called when a request has been handled
+ *
+ * \param[in] nrq the request that has been handled; can be used for
+ *               job/resource control.
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq)
+{
+       struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq);
+
+       if (policy->pol_desc->pd_ops->op_req_stop)
+               policy->pol_desc->pd_ops->op_req_stop(policy, nrq);
+
+       LASSERT(policy->pol_nrs->nrs_req_started > 0);
+       LASSERT(policy->pol_req_started > 0);
+
+       policy->pol_nrs->nrs_req_started--;
+       policy->pol_req_started--;
+}
+
+/**
+ * Handler for operations that can be carried out on policies.
+ *
+ * Handles opcodes that are common to all policy types within NRS core, and
+ * passes any unknown opcodes to the policy-specific control function.
+ *
+ * \param[in]    nrs  the NRS head this policy belongs to.
+ * \param[in]    name the human-readable policy name; should be the same as
+ *                    ptlrpc_nrs_pol_desc::pd_name.
+ * \param[in]    opc  the opcode of the operation being carried out.
+ * \param[in,out] arg  can be used to pass information in and out when
+ *                    carrying out an operation; usually data that is private to
+ *                    the policy at some level, or generic policy status
+ *                    information.
+ *
+ * \retval -ve error condition
+ * \retval   0 operation was carried out successfully
+ */
+static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name,
+                         enum ptlrpc_nrs_ctl opc, void *arg)
+{
+       struct ptlrpc_nrs_policy       *policy;
+       int                             rc = 0;
+       ENTRY;
+
+       spin_lock(&nrs->nrs_lock);
+
+       policy = nrs_policy_find_locked(nrs, name);
+       if (policy == NULL)
+               GOTO(out, rc = -ENOENT);
+
+       switch (opc) {
+               /**
+                * Unknown opcode, pass it down to the policy-specific control
+                * function for handling.
+                */
+       default:
+               rc = nrs_policy_ctl_locked(policy, opc, arg);
+               break;
+
+               /**
+                * Start \e policy
+                */
+       case PTLRPC_NRS_CTL_START:
+               rc = nrs_policy_start_locked(policy);
+               break;
+       }
+out:
+       if (policy != NULL)
+               nrs_policy_put_locked(policy);
+
+       spin_unlock(&nrs->nrs_lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Unregisters a policy by name.
+ *
+ * \param[in] nrs  the NRS head this policy belongs to.
+ * \param[in] name the human-readable policy name; should be the same as
+ *                ptlrpc_nrs_pol_desc::pd_name
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name)
+{
+       struct ptlrpc_nrs_policy *policy = NULL;
+       ENTRY;
+
+       spin_lock(&nrs->nrs_lock);
+
+       policy = nrs_policy_find_locked(nrs, name);
+       if (policy == NULL) {
+               spin_unlock(&nrs->nrs_lock);
+
+               CERROR("Can't find NRS policy %s\n", name);
+               RETURN(-ENOENT);
+       }
+
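+       /* nrs_policy_find_locked() took a reference above, so a count greater
+        * than one means the policy is still in use. */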
+       if (policy->pol_ref > 1) {
+               CERROR("Policy %s is busy with %d references\n", name,
+                      (int)policy->pol_ref);
+               nrs_policy_put_locked(policy);
+
+               spin_unlock(&nrs->nrs_lock);
+               RETURN(-EBUSY);
+       }
+
+       LASSERT(policy->pol_req_queued == 0);
+       LASSERT(policy->pol_req_started == 0);
+
+       if (policy->pol_state != NRS_POL_STATE_STOPPED) {
+               nrs_policy_stop_locked(policy);
+               LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED);
+       }
+
+       list_del(&policy->pol_list);
+       nrs->nrs_num_pols--;
+
+       nrs_policy_put_locked(policy);
+
+       spin_unlock(&nrs->nrs_lock);
+
+       nrs_policy_fini(policy);
+
+       LASSERT(policy->pol_private == NULL);
+       OBD_FREE_PTR(policy);
+
+       RETURN(0);
+}
+
+/**
+ * Registers a policy from policy descriptor \a desc with NRS head \a nrs.
+ *
+ * \param[in] nrs   the NRS head on which the policy will be registered.
+ * \param[in] desc  the policy descriptor from which the information will be
+ *                 obtained to register the policy.
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+static int nrs_policy_register(struct ptlrpc_nrs *nrs,
+                              struct ptlrpc_nrs_pol_desc *desc)
+{
+       struct ptlrpc_nrs_policy       *policy;
+       struct ptlrpc_nrs_policy       *tmp;
+       struct ptlrpc_service_part     *svcpt = nrs->nrs_svcpt;
+       int                             rc;
+       ENTRY;
+
+       LASSERT(svcpt != NULL);
+       LASSERT(desc->pd_ops != NULL);
+       LASSERT(desc->pd_ops->op_res_get != NULL);
+       LASSERT(desc->pd_ops->op_req_get != NULL);
+       LASSERT(desc->pd_ops->op_req_enqueue != NULL);
+       LASSERT(desc->pd_ops->op_req_dequeue != NULL);
+       LASSERT(desc->pd_compat != NULL);
+
+       OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable,
+                         svcpt->scp_cpt, sizeof(*policy), __GFP_IO);
+       if (policy == NULL)
+               RETURN(-ENOMEM);
+
+       policy->pol_nrs     = nrs;
+       policy->pol_desc    = desc;
+       policy->pol_state   = NRS_POL_STATE_STOPPED;
+       policy->pol_flags   = desc->pd_flags;
+
+       INIT_LIST_HEAD(&policy->pol_list);
+       INIT_LIST_HEAD(&policy->pol_list_queued);
+
+       rc = nrs_policy_init(policy);
+       if (rc != 0) {
+               OBD_FREE_PTR(policy);
+               RETURN(rc);
+       }
+
+       spin_lock(&nrs->nrs_lock);
+
+       tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name);
+       if (tmp != NULL) {
+               CERROR("NRS policy %s has been registered, can't register it "
+                      "for %s\n", policy->pol_desc->pd_name,
+                      svcpt->scp_service->srv_name);
+               nrs_policy_put_locked(tmp);
+
+               spin_unlock(&nrs->nrs_lock);
+               nrs_policy_fini(policy);
+               OBD_FREE_PTR(policy);
+
+               RETURN(-EEXIST);
+       }
+
+       list_add_tail(&policy->pol_list, &nrs->nrs_policy_list);
+       nrs->nrs_num_pols++;
+
+       if (policy->pol_flags & PTLRPC_NRS_FL_REG_START)
+               rc = nrs_policy_start_locked(policy);
+
+       spin_unlock(&nrs->nrs_lock);
+
+       if (rc != 0)
+               (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+
+       RETURN(rc);
+}
+
+/**
+ * Enqueue request \a req using one of the policies its resources are referring
+ * to.
+ *
+ * \param[in] req the request to enqueue.
+ */
+static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req)
+{
+       struct ptlrpc_nrs_policy       *policy;
+
+       LASSERT(req->rq_nrq.nr_initialized);
+       LASSERT(!req->rq_nrq.nr_enqueued);
+
+       nrs_request_enqueue(&req->rq_nrq);
+       req->rq_nrq.nr_enqueued = 1;
+
+       policy = nrs_request_policy(&req->rq_nrq);
+       /**
+        * Add the policy to the NRS head's list of policies with enqueued
+        * requests, if it has not been added there.
+        */
+       if (unlikely(list_empty(&policy->pol_list_queued)))
+               list_add_tail(&policy->pol_list_queued,
+                                 &policy->pol_nrs->nrs_policy_queued);
+}
+
+/**
+ * Enqueue a request on the high priority NRS head.
+ *
+ * \param req the request to enqueue.
+ */
+static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req)
+{
+       int     opc = lustre_msg_get_opc(req->rq_reqmsg);
+       ENTRY;
+
+       spin_lock(&req->rq_lock);
+       req->rq_hp = 1;
+       ptlrpc_nrs_req_add_nolock(req);
+       if (opc != OBD_PING)
+               DEBUG_REQ(D_NET, req, "high priority req");
+       spin_unlock(&req->rq_lock);
+       EXIT;
+}
+
+/**
+ * Returns a boolean predicate indicating whether the policy described by
+ * \a desc is adequate for use with service \a svc.
+ *
+ * \param[in] svc  the service
+ * \param[in] desc the policy descriptor
+ *
+ * \retval false the policy is not compatible with the service
+ * \retval true         the policy is compatible with the service
+ */
+static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc,
+                                        const struct ptlrpc_nrs_pol_desc *desc)
+{
+       return desc->pd_compat(svc, desc);
+}
+
+/**
+ * Registers all compatible policies in nrs_core.nrs_policies, for NRS head
+ * \a nrs.
+ *
+ * \param[in] nrs the NRS head
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ *
+ * \see ptlrpc_service_nrs_setup()
+ */
+static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs)
+{
+       struct ptlrpc_nrs_pol_desc *desc;
+       /* for convenience */
+       struct ptlrpc_service_part       *svcpt = nrs->nrs_svcpt;
+       struct ptlrpc_service            *svc = svcpt->scp_service;
+       int                               rc = -EINVAL;
+       ENTRY;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
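+       /* rc stays -EINVAL, and registration fails, if no compatible policy
+        * is found for this service. */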
+       list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+               if (nrs_policy_compatible(svc, desc)) {
+                       rc = nrs_policy_register(nrs, desc);
+                       if (rc != 0) {
+                               CERROR("Failed to register NRS policy %s for "
+                                      "partition %d of service %s: %d\n",
+                                      desc->pd_name, svcpt->scp_cpt,
+                                      svc->srv_name, rc);
+                               /**
+                                * Fail registration if any of the policies'
+                                * registration fails.
+                                */
+                               break;
+                       }
+               }
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Initializes NRS head \a nrs of service partition \a svcpt, and registers all
+ * compatible policies in NRS core, with the NRS head.
+ *
+ * \param[in] nrs   the NRS head
+ * \param[in] svcpt the PTLRPC service partition to setup
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs,
+                                  struct ptlrpc_service_part *svcpt)
+{
+       int                             rc;
+       enum ptlrpc_nrs_queue_type      queue;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+       if (nrs == &svcpt->scp_nrs_reg)
+               queue = PTLRPC_NRS_QUEUE_REG;
+       else if (nrs == svcpt->scp_nrs_hp)
+               queue = PTLRPC_NRS_QUEUE_HP;
+       else
+               LBUG();
+
+       nrs->nrs_svcpt = svcpt;
+       nrs->nrs_queue_type = queue;
+       spin_lock_init(&nrs->nrs_lock);
+       INIT_LIST_HEAD(&nrs->nrs_policy_list);
+       INIT_LIST_HEAD(&nrs->nrs_policy_queued);
+
+       rc = nrs_register_policies_locked(nrs);
+
+       RETURN(rc);
+}
+
+/**
+ * Allocates a regular and optionally a high-priority NRS head (if the service
+ * handles high-priority RPCs), and then registers all available compatible
+ * policies on those NRS heads.
+ *
+ * \param[in,out] svcpt the PTLRPC service partition to setup
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_nrs              *nrs;
+       int                             rc;
+       ENTRY;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+       /**
+        * Initialize the regular NRS head.
+        */
+       nrs = nrs_svcpt2nrs(svcpt, false);
+       rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /**
+        * Optionally allocate a high-priority NRS head.
+        */
+       if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL)
+               GOTO(out, rc);
+
+       OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp,
+                         svcpt->scp_service->srv_cptable,
+                         svcpt->scp_cpt);
+       if (svcpt->scp_nrs_hp == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       nrs = nrs_svcpt2nrs(svcpt, true);
+       rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+
+out:
+       RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all available NRS heads in a service partition;
+ * called at PTLRPC service unregistration time.
+ *
+ * \param[in] svcpt the PTLRPC service partition
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_nrs              *nrs;
+       struct ptlrpc_nrs_policy       *policy;
+       struct ptlrpc_nrs_policy       *tmp;
+       int                             rc;
+       bool                            hp = false;
+       ENTRY;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+again:
+       nrs = nrs_svcpt2nrs(svcpt, hp);
+       nrs->nrs_stopping = 1;
+
+       list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list,
+                                    pol_list) {
+               rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+               LASSERT(rc == 0);
+       }
+
+       /**
+        * If the service partition has an HP NRS head, clean that up as well.
+        */
+       if (!hp && nrs_svcpt_has_hp(svcpt)) {
+               hp = true;
+               goto again;
+       }
+
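+       /* The HP NRS head was allocated separately in nrs_svcpt_setup_locked();
+        * the regular head is embedded in the service partition. */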
+       if (hp)
+               OBD_FREE_PTR(nrs);
+
+       EXIT;
+}
+
+/**
+ * Returns the descriptor for a policy as identified by \a name.
+ *
+ * \param[in] name the policy name
+ *
+ * \retval the policy descriptor
+ * \retval NULL if no matching policy descriptor is found
+ */
+static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name)
+{
+       struct ptlrpc_nrs_pol_desc     *tmp;
+       ENTRY;
+
+       list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) {
+               if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0)
+                       RETURN(tmp);
+       }
+       RETURN(NULL);
+}
+
+/**
+ * Removes the policy from all supported NRS heads of all partitions of all
+ * PTLRPC services.
+ *
+ * \param[in] desc the policy descriptor to unregister
+ *
+ * \retval -ve error
+ * \retval  0  successfully unregistered policy on all supported NRS heads
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ * \pre mutex_is_locked(&ptlrpc_all_services_mutex)
+ */
+static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc)
+{
+       struct ptlrpc_nrs              *nrs;
+       struct ptlrpc_service          *svc;
+       struct ptlrpc_service_part     *svcpt;
+       int                             i;
+       int                             rc = 0;
+       ENTRY;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+       LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex));
+
+       list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+
+               if (!nrs_policy_compatible(svc, desc) ||
+                   unlikely(svc->srv_is_stopping))
+                       continue;
+
+               ptlrpc_service_for_each_part(svcpt, i, svc) {
+                       bool hp = false;
+
+again:
+                       nrs = nrs_svcpt2nrs(svcpt, hp);
+                       rc = nrs_policy_unregister(nrs, desc->pd_name);
+                       /**
+                        * Ignore -ENOENT as the policy may not have registered
+                        * successfully on all service partitions.
+                        */
+                       if (rc == -ENOENT) {
+                               rc = 0;
+                       } else if (rc != 0) {
+                               CERROR("Failed to unregister NRS policy %s for "
+                                      "partition %d of service %s: %d\n",
+                                      desc->pd_name, svcpt->scp_cpt,
+                                      svcpt->scp_service->srv_name, rc);
+                               RETURN(rc);
+                       }
+
+                       if (!hp && nrs_svc_has_hp(svc)) {
+                               hp = true;
+                               goto again;
+                       }
+               }
+
+               if (desc->pd_ops->op_lprocfs_fini != NULL)
+                       desc->pd_ops->op_lprocfs_fini(svc);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Registers a new policy with NRS core.
+ *
+ * The function will only succeed if policy registration with all compatible
+ * service partitions (if any) is successful.
+ *
+ * N.B. This function should be called either at ptlrpc module initialization
+ *     time when registering a policy that ships with NRS core, or in a
+ *     module's init() function for policies registering from other modules.
+ *
+ * \param[in] conf configuration information for the new policy to register
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf)
+{
+       struct ptlrpc_service          *svc;
+       struct ptlrpc_nrs_pol_desc     *desc;
+       int                             rc = 0;
+       ENTRY;
+
+       LASSERT(conf != NULL);
+       LASSERT(conf->nc_ops != NULL);
+       LASSERT(conf->nc_compat != NULL);
+       LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one,
+               conf->nc_compat_svc_name != NULL));
+       LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0,
+                    conf->nc_owner != NULL));
+
+       conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+       /**
+        * External policies are not allowed to start immediately upon
+        * registration, as there is a relatively higher chance that their
+        * registration might fail. In such a case, some policy instances may
+        * already have requests queued when unregistration needs to happen as
+        * part of cleanup; since there is currently no way to drain requests
+        * from a policy unless the service is unregistering, we just disallow
+        * this.
+        */
+       if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) &&
+           (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK |
+                              PTLRPC_NRS_FL_REG_START))) {
+               CERROR("NRS: failing to register policy %s. Please check "
+                      "policy flags; external policies cannot act as fallback "
+                      "policies, or be started immediately upon registration "
+                      "without interaction with lprocfs\n", conf->nc_name);
+               RETURN(-EINVAL);
+       }
+
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) {
+               CERROR("NRS: failing to register policy %s which has already "
+                      "been registered with NRS core!\n",
+                      conf->nc_name);
+               GOTO(fail, rc = -EEXIST);
+       }
+
+       OBD_ALLOC_PTR(desc);
+       if (desc == NULL)
+               GOTO(fail, rc = -ENOMEM);
+
+       strncpy(desc->pd_name, conf->nc_name, NRS_POL_NAME_MAX);
+       desc->pd_ops             = conf->nc_ops;
+       desc->pd_compat          = conf->nc_compat;
+       desc->pd_compat_svc_name = conf->nc_compat_svc_name;
+       if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0)
+               desc->pd_owner   = conf->nc_owner;
+       desc->pd_flags           = conf->nc_flags;
+       atomic_set(&desc->pd_refs, 0);
+
+       /**
+        * For policies that are held in the same module as NRS (currently
+        * ptlrpc), do not register the policy with all compatible services,
+        * as the services will not have started at this point, since we are
+        * calling from ptlrpc module initialization code. In such cases each
+        * service will register all compatible policies later, via
+        * ptlrpc_service_nrs_setup().
+        */
+       if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0)
+               goto internal;
+
+       /**
+        * Register the new policy on all compatible services
+        */
+       mutex_lock(&ptlrpc_all_services_mutex);
+
+       list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+               struct ptlrpc_service_part     *svcpt;
+               int                             i;
+               int                             rc2;
+
+               if (!nrs_policy_compatible(svc, desc) ||
+                   unlikely(svc->srv_is_stopping))
+                       continue;
+
+               ptlrpc_service_for_each_part(svcpt, i, svc) {
+                       struct ptlrpc_nrs      *nrs;
+                       bool                    hp = false;
+again:
+                       nrs = nrs_svcpt2nrs(svcpt, hp);
+                       rc = nrs_policy_register(nrs, desc);
+                       if (rc != 0) {
+                               CERROR("Failed to register NRS policy %s for "
+                                      "partition %d of service %s: %d\n",
+                                      desc->pd_name, svcpt->scp_cpt,
+                                      svcpt->scp_service->srv_name, rc);
+
+                               rc2 = nrs_policy_unregister_locked(desc);
+                               /**
+                                * Should not fail at this point
+                                */
+                               LASSERT(rc2 == 0);
+                               mutex_unlock(&ptlrpc_all_services_mutex);
+                               OBD_FREE_PTR(desc);
+                               GOTO(fail, rc);
+                       }
+
+                       if (!hp && nrs_svc_has_hp(svc)) {
+                               hp = true;
+                               goto again;
+                       }
+               }
+
+               /**
+                * No need to take a reference to other modules here, as we
+                * will be calling from the module's init() function.
+                */
+               if (desc->pd_ops->op_lprocfs_init != NULL) {
+                       rc = desc->pd_ops->op_lprocfs_init(svc);
+                       if (rc != 0) {
+                               rc2 = nrs_policy_unregister_locked(desc);
+                               /**
+                                * Should not fail at this point
+                                */
+                               LASSERT(rc2 == 0);
+                               mutex_unlock(&ptlrpc_all_services_mutex);
+                               OBD_FREE_PTR(desc);
+                               GOTO(fail, rc);
+                       }
+               }
+       }
+
+       mutex_unlock(&ptlrpc_all_services_mutex);
+internal:
+       list_add_tail(&desc->pd_list, &nrs_core.nrs_policies);
+fail:
+       mutex_unlock(&nrs_core.nrs_mutex);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_register);
+
+/**
+ * Unregisters a previously registered policy with NRS core. All instances of
+ * the policy on all NRS heads of all supported services are removed.
+ *
+ * N.B. This function should only be called from a module's exit() function.
+ *     Although it can be used for policies that ship alongside NRS core, the
+ *     function is primarily intended for policies that register externally,
+ *     from other modules.
+ *
+ * \param[in] conf configuration information for the policy to unregister
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf)
+{
+       struct ptlrpc_nrs_pol_desc      *desc;
+       int                              rc;
+       ENTRY;
+
+       LASSERT(conf != NULL);
+
+       if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) {
+               CERROR("Unable to unregister a fallback policy, unless the "
+                      "PTLRPC service is stopping.\n");
+               RETURN(-EPERM);
+       }
+
+       conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       desc = nrs_policy_find_desc_locked(conf->nc_name);
+       if (desc == NULL) {
+               CERROR("Failing to unregister NRS policy %s which has "
+                      "not been registered with NRS core!\n",
+                      conf->nc_name);
+               GOTO(not_exist, rc = -ENOENT);
+       }
+
+       mutex_lock(&ptlrpc_all_services_mutex);
+
+       rc = nrs_policy_unregister_locked(desc);
+       if (rc < 0) {
+               if (rc == -EBUSY)
+                       CERROR("Please first stop policy %s on all service "
+                              "partitions and then retry to unregister the "
+                              "policy.\n", conf->nc_name);
+               GOTO(fail, rc);
+       }
+
+       CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n",
+              conf->nc_name);
+
+       list_del(&desc->pd_list);
+       OBD_FREE_PTR(desc);
+
+fail:
+       mutex_unlock(&ptlrpc_all_services_mutex);
+
+not_exist:
+       mutex_unlock(&nrs_core.nrs_mutex);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister);
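+
+/*
+ * Example (illustrative sketch only, not part of the exported interface):
+ * an external policy module would normally pair the register/unregister
+ * calls above in its module init/exit handlers; the policy name and the
+ * configuration variable nrs_conf_example used below are hypothetical.
+ *
+ *	static int __init nrs_example_init(void)
+ *	{
+ *		return ptlrpc_nrs_policy_register(&nrs_conf_example);
+ *	}
+ *
+ *	static void __exit nrs_example_exit(void)
+ *	{
+ *		ptlrpc_nrs_policy_unregister(&nrs_conf_example);
+ *	}
+ */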
+
+/**
+ * Setup NRS heads on all service partitions of service \a svc, and register
+ * all compatible policies on those NRS heads.
+ *
+ * To be called from within ptlrpc_register_service().
+ *
+ * \param[in] svc the service to setup
+ *
+ * \retval -ve error, the calling logic should eventually call
+ *                   ptlrpc_service_nrs_cleanup() to undo any work performed
+ *                   by this function.
+ *
+ * \see ptlrpc_register_service()
+ * \see ptlrpc_service_nrs_cleanup()
+ */
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part             *svcpt;
+       const struct ptlrpc_nrs_pol_desc       *desc;
+       int                                     i;
+       int                                     rc = 0;
+
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       /**
+        * Initialize NRS heads on all service CPTs.
+        */
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               rc = nrs_svcpt_setup_locked(svcpt);
+               if (rc != 0)
+                       GOTO(failed, rc);
+       }
+
+       /**
+        * Set up lprocfs interfaces for all supported policies for the
+        * service.
+        */
+       list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+               if (!nrs_policy_compatible(svc, desc))
+                       continue;
+
+               if (desc->pd_ops->op_lprocfs_init != NULL) {
+                       rc = desc->pd_ops->op_lprocfs_init(svc);
+                       if (rc != 0)
+                               GOTO(failed, rc);
+               }
+       }
+
+failed:
+
+       mutex_unlock(&nrs_core.nrs_mutex);
+
+       RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all service partitions of service \a svc.
+ *
+ * \param[in] svc the PTLRPC service to unregister
+ */
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part           *svcpt;
+       const struct ptlrpc_nrs_pol_desc     *desc;
+       int                                   i;
+
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       /**
+        * Clean up NRS heads on all service partitions
+        */
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               nrs_svcpt_cleanup_locked(svcpt);
+
+       /**
+        * Clean up lprocfs interfaces for all supported policies for the
+        * service.
+        */
+       list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+               if (!nrs_policy_compatible(svc, desc))
+                       continue;
+
+               if (desc->pd_ops->op_lprocfs_fini != NULL)
+                       desc->pd_ops->op_lprocfs_fini(svc);
+       }
+
+       mutex_unlock(&nrs_core.nrs_mutex);
+}
+
+/**
+ * Obtains NRS head resources for request \a req.
+ *
+ * These could be either on the regular or HP NRS head of \a svcpt; resources
+ * taken on the regular head can later be swapped for HP head resources by
+ * ldlm_lock_reorder_req().
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req   the request
+ * \param[in] hp    which NRS head of \a svcpt to use
+ */
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+                              struct ptlrpc_request *req, bool hp)
+{
+       struct ptlrpc_nrs       *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+       memset(&req->rq_nrq, 0, sizeof(req->rq_nrq));
+       nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs,
+                             false);
+
+       /**
+        * It is fine to access \e nr_initialized without locking as there is
+        * no contention at this early stage.
+        */
+       req->rq_nrq.nr_initialized = 1;
+}
+
+/**
+ * Releases resources for a request; is called after the request has been
+ * handled.
+ *
+ * \param[in] req the request
+ *
+ * \see ptlrpc_server_finish_request()
+ */
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req)
+{
+       if (req->rq_nrq.nr_initialized) {
+               nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs);
+               /* no protection on bit nr_initialized because no
+                * contention at this late stage */
+               req->rq_nrq.nr_finalized = 1;
+       }
+}
+
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req)
+{
+       if (req->rq_nrq.nr_started)
+               nrs_request_stop(&req->rq_nrq);
+}
+
+/**
+ * Enqueues request \a req on either the regular or high-priority NRS head
+ * of service partition \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req   the request to be enqueued
+ * \param[in] hp    whether to enqueue the request on the regular or
+ *                 high-priority NRS head.
+ */
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+                       struct ptlrpc_request *req, bool hp)
+{
+       spin_lock(&svcpt->scp_req_lock);
+
+       if (hp)
+               ptlrpc_nrs_hpreq_add_nolock(req);
+       else
+               ptlrpc_nrs_req_add_nolock(req);
+
+       spin_unlock(&svcpt->scp_req_lock);
+}
+
+static void nrs_request_removed(struct ptlrpc_nrs_policy *policy)
+{
+       LASSERT(policy->pol_nrs->nrs_req_queued > 0);
+       LASSERT(policy->pol_req_queued > 0);
+
+       policy->pol_nrs->nrs_req_queued--;
+       policy->pol_req_queued--;
+
+       /**
+        * If the policy has no more requests queued, remove it from
+        * ptlrpc_nrs::nrs_policy_queued.
+        */
+       if (unlikely(policy->pol_req_queued == 0)) {
+               list_del_init(&policy->pol_list_queued);
+
+               /**
+                * If there are other policies with queued requests, move the
+                * current policy to the end so that we can round robin over
+                * all policies and drain the requests.
+                */
+       } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) {
+               LASSERT(policy->pol_req_queued <
+                       policy->pol_nrs->nrs_req_queued);
+
+               list_move_tail(&policy->pol_list_queued,
+                                  &policy->pol_nrs->nrs_policy_queued);
+       }
+}
+
+/**
+ * Obtains a request for handling from an NRS head of service partition
+ * \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] hp    whether to obtain a request from the regular or
+ *                 high-priority NRS head.
+ * \param[in] peek  when set, signifies that we just want to examine the
+ *                 request, and not handle it, so the request is not removed
+ *                 from the policy.
+ * \param[in] force when set, it will force a policy to return a request if it
+ *                 has one pending
+ *
+ * \retval the request to be handled
+ * \retval NULL the head has no requests to serve
+ */
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+                          bool peek, bool force)
+{
+       struct ptlrpc_nrs         *nrs = nrs_svcpt2nrs(svcpt, hp);
+       struct ptlrpc_nrs_policy  *policy;
+       struct ptlrpc_nrs_request *nrq;
+
+       /**
+        * Always try to drain requests from all NRS policies even if they are
+        * inactive, because the user can change policy status at runtime.
+        */
+       list_for_each_entry(policy, &nrs->nrs_policy_queued,
+                               pol_list_queued) {
+               nrq = nrs_request_get(policy, peek, force);
+               if (nrq != NULL) {
+                       if (likely(!peek)) {
+                               nrq->nr_started = 1;
+
+                               policy->pol_req_started++;
+                               policy->pol_nrs->nrs_req_started++;
+
+                               nrs_request_removed(policy);
+                       }
+
+                       return container_of(nrq, struct ptlrpc_request, rq_nrq);
+               }
+       }
+
+       return NULL;
+}
+
+/**
+ * Dequeues request \a req from the policy it has been enqueued on.
+ *
+ * \param[in] req the request
+ */
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req)
+{
+       struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq);
+
+       policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq);
+
+       req->rq_nrq.nr_enqueued = 0;
+
+       nrs_request_removed(policy);
+}
+
+/**
+ * Returns whether there are any requests currently enqueued on any of the
+ * policies of the NRS head of service partition \a svcpt specified by \a hp.
+ * Should be called while holding ptlrpc_service_part::scp_req_lock to get a
+ * reliable result.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp    whether the regular or high-priority NRS head is to be
+ *                 enquired.
+ *
+ * \retval false the indicated NRS head has no enqueued requests.
+ * \retval true         the indicated NRS head has some enqueued requests.
+ */
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+       struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+       return nrs->nrs_req_queued > 0;
+}
+
+/**
+ * Moves request \a req from the regular to the high-priority NRS head.
+ *
+ * \param[in] req the request to move
+ */
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req)
+{
+       struct ptlrpc_service_part      *svcpt = req->rq_rqbd->rqbd_svcpt;
+       struct ptlrpc_nrs_request       *nrq = &req->rq_nrq;
+       struct ptlrpc_nrs_resource      *res1[NRS_RES_MAX];
+       struct ptlrpc_nrs_resource      *res2[NRS_RES_MAX];
+       ENTRY;
+
+       /**
+        * Obtain the high-priority NRS head resources.
+        */
+       nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true);
+
+       spin_lock(&svcpt->scp_req_lock);
+
+       if (!ptlrpc_nrs_req_can_move(req))
+               goto out;
+
+       ptlrpc_nrs_req_del_nolock(req);
+
+       memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0]));
+       memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0]));
+
+       ptlrpc_nrs_hpreq_add_nolock(req);
+
+       memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0]));
+out:
+       spin_unlock(&svcpt->scp_req_lock);
+
+       /**
+        * Release either the regular NRS head resources if we moved the
+        * request, or the high-priority NRS head resources if we took a
+        * reference earlier in this function and ptlrpc_nrs_req_can_move()
+        * returned false.
+        */
+       nrs_resource_put_safe(res1);
+       EXIT;
+}
+
+/**
+ * Carries out a control operation \a opc on the policy identified by the
+ * human-readable \a name, on either all partitions, or only on the first
+ * partition of service \a svc.
+ *
+ * \param[in]    svc    the service the policy belongs to.
+ * \param[in]    queue  whether to carry out the command on the policy which
+ *                      belongs to the regular, high-priority, or both NRS
+ *                      heads of service partitions of \a svc.
+ * \param[in]    name   the policy to act upon, by human-readable name
+ * \param[in]    opc    the opcode of the operation to carry out
+ * \param[in]    single when set, the operation will only be carried out on the
+ *                      NRS heads of the first service partition of \a svc.
+ *                      This is useful for policies which, e.g., share
+ *                      identical parameter values across their instances on
+ *                      different service partitions; when reading such
+ *                      parameters via lprocfs, these policies only need to
+ *                      obtain and print the values from the first service
+ *                      partition. Storing the values centrally elsewhere
+ *                      would be an alternative solution.
+ * \param[in,out] arg   can be used as a generic in/out buffer between control
+ *                      operations and the user environment.
+ *
+ * \retval -ve error condition
+ * \retval   0 operation was carried out successfully
+ */
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+                             enum ptlrpc_nrs_queue_type queue, char *name,
+                             enum ptlrpc_nrs_ctl opc, bool single, void *arg)
+{
+       struct ptlrpc_service_part     *svcpt;
+       int                             i;
+       int                             rc = 0;
+       ENTRY;
+
+       LASSERT(opc != PTLRPC_NRS_CTL_INVALID);
+
+       if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0)
+               return -EINVAL;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+                       rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name,
+                                           opc, arg);
+                       if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG &&
+                                       single))
+                               GOTO(out, rc);
+               }
+
+               if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+                       /**
+                        * XXX: We could optionally check for
+                        * nrs_svc_has_hp(svc) here, and return an error if it
+                        * is false. Right now we rely on the policies' lprocfs
+                        * handlers that call the present function to make this
+                        * check; if they fail to do so, they might hit the
+                        * assertion inside nrs_svcpt2nrs() below.
+                        */
+                       rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name,
+                                           opc, arg);
+                       if (rc != 0 || single)
+                               GOTO(out, rc);
+               }
+       }
+out:
+       RETURN(rc);
+}
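+
+/*
+ * Example (illustrative sketch, not taken from this patch): a policy's
+ * lprocfs read handler exposing a parameter that is shared by all service
+ * partitions might query only the first partition of both NRS heads; the
+ * opcode SOME_POLICY_CTL_READ and the argument "info" are placeholders.
+ *
+ *	char name[NRS_POL_NAME_MAX] = "fifo";
+ *
+ *	rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_BOTH, name,
+ *				       SOME_POLICY_CTL_READ, true, &info);
+ */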
+
+/* ptlrpc/nrs_fifo.c */
+extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo;
+
+/**
+ * Adds all policies that ship with the ptlrpc module, to NRS core's list of
+ * policies \e nrs_core.nrs_policies.
+ *
+ * \retval 0 all policies have been registered successfully
+ * \retval -ve error
+ */
+int ptlrpc_nrs_init(void)
+{
+       int     rc;
+       ENTRY;
+
+       mutex_init(&nrs_core.nrs_mutex);
+       INIT_LIST_HEAD(&nrs_core.nrs_policies);
+
+       rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo);
+       if (rc != 0)
+               GOTO(fail, rc);
+
+       RETURN(rc);
+fail:
+       /**
+        * Since no PTLRPC services have been started at this point, all we need
+        * to do for cleanup is to free the descriptors.
+        */
+       ptlrpc_nrs_fini();
+
+       RETURN(rc);
+}
+
+/**
+ * Removes all policy descriptors from nrs_core::nrs_policies, and frees the
+ * policy descriptors.
+ *
+ * Since all PTLRPC services are stopped at this point, there are no more
+ * instances of any policies, because each service will have stopped its policy
+ * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the
+ * descriptors here.
+ */
+void ptlrpc_nrs_fini(void)
+{
+       struct ptlrpc_nrs_pol_desc *desc;
+       struct ptlrpc_nrs_pol_desc *tmp;
+
+       list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies,
+                                    pd_list) {
+               list_del_init(&desc->pd_list);
+               OBD_FREE_PTR(desc);
+       }
+}
+
+/** @} nrs */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c
new file mode 100644 (file)
index 0000000..ddfb510
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_crr.c
+ *
+ * Network Request Scheduler (NRS) CRR-N policy
+ *
+ * Request ordering in a batched Round-Robin manner over client NIDs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c
new file mode 100644 (file)
index 0000000..7d3ee97
--- /dev/null
@@ -0,0 +1,270 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_fifo.c
+ *
+ * Network Request Scheduler (NRS) FIFO policy
+ *
+ * Handles RPCs in a FIFO manner, as received from the network. This policy is
+ * a logical wrapper around previous, non-NRS functionality. It is used as the
+ * default and fallback policy for all types of RPCs on all PTLRPC service
+ * partitions, for both regular and high-priority NRS heads. Default here means
+ * the policy is the one enabled at PTLRPC service partition startup time, and
+ * fallback means the policy is used to handle RPCs that are not handled
+ * successfully or are not handled at all by any primary policy that may be
+ * enabled on a given NRS head.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name fifo
+ *
+ * The FIFO policy is a logical wrapper around previous, non-NRS functionality.
+ * It schedules RPCs in the same order as they are queued from LNet.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_FIFO      "fifo"
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a
+ * policy-specific private data structure.
+ *
+ * \param[in] policy The policy to start
+ *
+ * \retval -ENOMEM OOM error
+ * \retval  0     success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy)
+{
+       struct nrs_fifo_head *head;
+
+       OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+       if (head == NULL)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&head->fh_list);
+       policy->pol_private = head;
+       return 0;
+}
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific
+ * private data structure.
+ *
+ * \param[in] policy The policy to stop
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy)
+{
+       struct nrs_fifo_head *head = policy->pol_private;
+
+       LASSERT(head != NULL);
+       LASSERT(list_empty(&head->fh_list));
+
+       OBD_FREE_PTR(head);
+}
+
+/**
+ * Is called for obtaining a FIFO policy resource.
+ *
+ * \param[in]  policy    The policy on which the request is being asked for
+ * \param[in]  nrq       The request for which resources are being taken
+ * \param[in]  parent    Parent resource, unused in this policy
+ * \param[out] resp      Resource references are placed in this array
+ * \param[in]  moving_req Signifies limited caller context; unused in this
+ *                       policy
+ *
+ * \retval 1 The FIFO policy only has a one-level resource hierarchy; since
+ *          it implements a simple scheduling algorithm in which request
+ *          priority is determined by request arrival order, it does not
+ *          need to maintain a set of resources that would otherwise be used
+ *          to calculate a request's priority.
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy,
+                           struct ptlrpc_nrs_request *nrq,
+                           const struct ptlrpc_nrs_resource *parent,
+                           struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+       /**
+        * Just return the resource embedded inside nrs_fifo_head, and end this
+        * resource hierarchy reference request.
+        */
+       *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res;
+       return 1;
+}
+
+/**
+ * Called when getting a request from the FIFO policy for handling, or just
+ * peeking; removes the request from the policy when it is to be handled.
+ *
+ * \param[in] policy The policy
+ * \param[in] peek   When set, signifies that we just want to examine the
+ *                  request, and not handle it, so the request is not removed
+ *                  from the policy.
+ * \param[in] force  Force the policy to return a request; unused in this
+ *                  policy
+ *
+ * \retval The request to be handled; this is the next request in the FIFO
+ *        queue
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request * nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy,
+                                            bool peek, bool force)
+{
+       struct nrs_fifo_head      *head = policy->pol_private;
+       struct ptlrpc_nrs_request *nrq;
+
+       nrq = unlikely(list_empty(&head->fh_list)) ? NULL :
+             list_entry(head->fh_list.next, struct ptlrpc_nrs_request,
+                            nr_u.fifo.fr_list);
+
+       if (likely(!peek && nrq != NULL)) {
+               struct ptlrpc_request *req = container_of(nrq,
+                                                         struct ptlrpc_request,
+                                                         rq_nrq);
+
+               list_del_init(&nrq->nr_u.fifo.fr_list);
+
+               CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: "LPU64
+                      "\n", policy->pol_desc->pd_name,
+                      libcfs_id2str(req->rq_peer), nrq->nr_u.fifo.fr_sequence);
+       }
+
+       return nrq;
+}
+
+/**
+ * Adds request \a nrq to \a policy's list of queued requests
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to add
+ *
+ * \retval 0 success; nrs_request_enqueue() assumes this function will always
+ *                   succeed
+ */
+static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy,
+                           struct ptlrpc_nrs_request *nrq)
+{
+       struct nrs_fifo_head *head;
+
+       head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head,
+                           fh_res);
+       /**
+        * Only used for debugging
+        */
+       nrq->nr_u.fifo.fr_sequence = head->fh_sequence++;
+       list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list);
+
+       return 0;
+}
+
+/**
+ * Removes request \a nrq from \a policy's list of queued requests.
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to remove
+ */
+static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy,
+                            struct ptlrpc_nrs_request *nrq)
+{
+       LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list));
+       list_del_init(&nrq->nr_u.fifo.fr_list);
+}
+
+/**
+ * Prints a debug statement right before the request \a nrq stops being
+ * handled.
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq    The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy,
+                             struct ptlrpc_nrs_request *nrq)
+{
+       struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+                                                 rq_nrq);
+
+       CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: "LPU64"\n",
+              policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer),
+              nrq->nr_u.fifo.fr_sequence);
+}
+
+/**
+ * FIFO policy operations
+ */
+static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = {
+       .op_policy_start        = nrs_fifo_start,
+       .op_policy_stop         = nrs_fifo_stop,
+       .op_res_get             = nrs_fifo_res_get,
+       .op_req_get             = nrs_fifo_req_get,
+       .op_req_enqueue         = nrs_fifo_req_add,
+       .op_req_dequeue         = nrs_fifo_req_del,
+       .op_req_stop            = nrs_fifo_req_stop,
+};
+
+/**
+ * FIFO policy configuration
+ */
+struct ptlrpc_nrs_pol_conf nrs_conf_fifo = {
+       .nc_name                = NRS_POL_NAME_FIFO,
+       .nc_ops                 = &nrs_fifo_ops,
+       .nc_compat              = nrs_policy_compat_all,
+       .nc_flags               = PTLRPC_NRS_FL_FALLBACK |
+                                 PTLRPC_NRS_FL_REG_START
+};
+
+/** @} fifo */
+
+/** @} nrs */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c
new file mode 100644 (file)
index 0000000..a88c519
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_orr.c
+ *
+ * Network Request Scheduler (NRS) ORR and TRR policies
+ *
+ * Request scheduling in a Round-Robin manner over backend-fs objects and OSTs
+ * respectively
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c
new file mode 100644 (file)
index 0000000..1437636
--- /dev/null
@@ -0,0 +1,2575 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pack_generic.c
+ *
+ * (Un)packing of OST requests
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <obd_cksum.h>
+#include <lustre/ll_fiemap.h>
+
+static inline int lustre_msg_hdr_size_v2(int count)
+{
+       return cfs_size_round(offsetof(struct lustre_msg_v2,
+                                      lm_buflens[count]));
+}
+
+int lustre_msg_hdr_size(__u32 magic, int count)
+{
+       switch (magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_hdr_size_v2(count);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_hdr_size);
+
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+                           int index)
+{
+       if (inout)
+               lustre_set_req_swabbed(req, index);
+       else
+               lustre_set_rep_swabbed(req, index);
+}
+EXPORT_SYMBOL(ptlrpc_buf_set_swabbed);
+
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+                        int index)
+{
+       if (inout)
+               return (ptlrpc_req_need_swab(req) &&
+                       !lustre_req_swabbed(req, index));
+       else
+               return (ptlrpc_rep_need_swab(req) &&
+                       !lustre_rep_swabbed(req, index));
+}
+EXPORT_SYMBOL(ptlrpc_buf_need_swab);
+
+static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg,
+                                             __u32 version)
+{
+       __u32 ver = lustre_msg_get_version(msg);
+       return (ver & LUSTRE_VERSION_MASK) != version;
+}
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               CERROR("msg v1 not supported - please upgrade you system\n");
+               return -EINVAL;
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_check_version_v2(msg, version);
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_check_version);
+
+/* early reply size */
+int lustre_msg_early_size(void)
+{
+       static int size;
+       if (!size) {
+               /* Always reply with the old ptlrpc_body_v2 to keep
+                * interoperability with old clients (< 2.3), which don't
+                * have pb_jobid in the ptlrpc_body.
+                *
+                * XXX Remove this whenever we drop interoperability with
+                *     such clients.
+                */
+               __u32 pblen = sizeof(struct ptlrpc_body_v2);
+               size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen);
+       }
+       return size;
+}
+EXPORT_SYMBOL(lustre_msg_early_size);
+
+int lustre_msg_size_v2(int count, __u32 *lengths)
+{
+       int size;
+       int i;
+
+       size = lustre_msg_hdr_size_v2(count);
+       for (i = 0; i < count; i++)
+               size += cfs_size_round(lengths[i]);
+
+       return size;
+}
+EXPORT_SYMBOL(lustre_msg_size_v2);
+
+/* This returns the size of the buffer that is required to hold a lustre_msg
+ * with the given sub-buffer lengths.
+ * NOTE: this should only be used for NEW requests, and should always be
+ *       in the form of a v2 request.  If this is a connection to a v1
+ *       target then the first buffer will be stripped because the ptlrpc
+ *       data is part of the lustre_msg_v1 header. b=14043 */
+int lustre_msg_size(__u32 magic, int count, __u32 *lens)
+{
+       __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+       if (!lens) {
+               LASSERT(count == 1);
+               lens = size;
+       }
+
+       LASSERT(count > 0);
+       LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2));
+
+       switch (magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_size_v2(count, lens);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_size);
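+
+/*
+ * Worked example (illustrative; actual values depend on the structure
+ * definitions): for a request with two sub-buffers,
+ * lens = { sizeof(struct ptlrpc_body), 40 }, the total is
+ *
+ *	  lustre_msg_hdr_size_v2(2)
+ *	+ cfs_size_round(sizeof(struct ptlrpc_body))
+ *	+ cfs_size_round(40)
+ *
+ * i.e. the v2 header (including two lm_buflens entries) plus each sub-buffer
+ * individually rounded up by cfs_size_round() before being summed.
+ */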
+
+/* This is used to determine the size of a buffer that was already packed
+ * and will correctly handle the different message formats. */
+int lustre_packed_msg_size(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_packed_msg_size);
+
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+                       char **bufs)
+{
+       char *ptr;
+       int i;
+
+       msg->lm_bufcount = count;
+       /* XXX: lm_secflvr uninitialized here */
+       msg->lm_magic = LUSTRE_MSG_MAGIC_V2;
+
+       for (i = 0; i < count; i++)
+               msg->lm_buflens[i] = lens[i];
+
+       if (bufs == NULL)
+               return;
+
+       ptr = (char *)msg + lustre_msg_hdr_size_v2(count);
+       for (i = 0; i < count; i++) {
+               char *tmp = bufs[i];
+               LOGL(tmp, lens[i], ptr);
+       }
+}
+EXPORT_SYMBOL(lustre_init_msg_v2);
+
+static int lustre_pack_request_v2(struct ptlrpc_request *req,
+                                 int count, __u32 *lens, char **bufs)
+{
+       int reqlen, rc;
+
+       reqlen = lustre_msg_size_v2(count, lens);
+
+       rc = sptlrpc_cli_alloc_reqbuf(req, reqlen);
+       if (rc)
+               return rc;
+
+       req->rq_reqlen = reqlen;
+
+       lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs);
+       lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION);
+       return 0;
+}
+
+int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count,
+                       __u32 *lens, char **bufs)
+{
+       __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+       if (!lens) {
+               LASSERT(count == 1);
+               lens = size;
+       }
+
+       LASSERT(count > 0);
+       LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+       /* only use new format, we don't need to be compatible with 1.4 */
+       magic = LUSTRE_MSG_MAGIC_V2;
+
+       switch (magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_pack_request_v2(req, count, lens, bufs);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_pack_request);
+
+#if RS_DEBUG
+LIST_HEAD(ptlrpc_rs_debug_lru);
+spinlock_t ptlrpc_rs_debug_lock;
+
+#define PTLRPC_RS_DEBUG_LRU_ADD(rs)                                    \
+do {                                                                   \
+       spin_lock(&ptlrpc_rs_debug_lock);                               \
+       list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru);      \
+       spin_unlock(&ptlrpc_rs_debug_lock);                             \
+} while (0)
+
+#define PTLRPC_RS_DEBUG_LRU_DEL(rs)                                    \
+do {                                                                   \
+       spin_lock(&ptlrpc_rs_debug_lock);                               \
+       list_del(&(rs)->rs_debug_list);                         \
+       spin_unlock(&ptlrpc_rs_debug_lock);                             \
+} while (0)
+#else
+# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while (0)
+# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while (0)
+#endif
+
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_reply_state *rs = NULL;
+
+       spin_lock(&svcpt->scp_rep_lock);
+
+       /* See if we have anything in a pool, and wait if nothing */
+       while (list_empty(&svcpt->scp_rep_idle)) {
+               struct l_wait_info      lwi;
+               int                     rc;
+
+               spin_unlock(&svcpt->scp_rep_lock);
+               /* If we cannot get anything for a long time, we had better
+                * bail out instead of waiting indefinitely */
+               lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL);
+               rc = l_wait_event(svcpt->scp_rep_waitq,
+                                 !list_empty(&svcpt->scp_rep_idle), &lwi);
+               if (rc != 0)
+                       goto out;
+               spin_lock(&svcpt->scp_rep_lock);
+       }
+
+       rs = list_entry(svcpt->scp_rep_idle.next,
+                           struct ptlrpc_reply_state, rs_list);
+       list_del(&rs->rs_list);
+
+       spin_unlock(&svcpt->scp_rep_lock);
+
+       memset(rs, 0, svcpt->scp_service->srv_max_reply_size);
+       rs->rs_svcpt = svcpt;
+       rs->rs_prealloc = 1;
+out:
+       return rs;
+}
+
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+       spin_lock(&svcpt->scp_rep_lock);
+       list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+       spin_unlock(&svcpt->scp_rep_lock);
+       wake_up(&svcpt->scp_rep_waitq);
+}
+
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+                        __u32 *lens, char **bufs, int flags)
+{
+       struct ptlrpc_reply_state *rs;
+       int                     msg_len, rc;
+       ENTRY;
+
+       LASSERT(req->rq_reply_state == NULL);
+
+       if ((flags & LPRFL_EARLY_REPLY) == 0) {
+               spin_lock(&req->rq_lock);
+               req->rq_packed_final = 1;
+               spin_unlock(&req->rq_lock);
+       }
+
+       msg_len = lustre_msg_size_v2(count, lens);
+       rc = sptlrpc_svc_alloc_rs(req, msg_len);
+       if (rc)
+               RETURN(rc);
+
+       rs = req->rq_reply_state;
+       atomic_set(&rs->rs_refcount, 1);    /* 1 ref for rq_reply_state */
+       rs->rs_cb_id.cbid_fn = reply_out_callback;
+       rs->rs_cb_id.cbid_arg = rs;
+       rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt;
+       INIT_LIST_HEAD(&rs->rs_exp_list);
+       INIT_LIST_HEAD(&rs->rs_obd_list);
+       INIT_LIST_HEAD(&rs->rs_list);
+       spin_lock_init(&rs->rs_lock);
+
+       req->rq_replen = msg_len;
+       req->rq_reply_state = rs;
+       req->rq_repmsg = rs->rs_msg;
+
+       lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
+       lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
+
+       PTLRPC_RS_DEBUG_LRU_ADD(rs);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lustre_pack_reply_v2);
+
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens,
+                           char **bufs, int flags)
+{
+       int rc = 0;
+       __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+       if (!lens) {
+               LASSERT(count == 1);
+               lens = size;
+       }
+
+       LASSERT(count > 0);
+       LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+       switch (req->rq_reqmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               rc = lustre_pack_reply_v2(req, count, lens, bufs, flags);
+               break;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n",
+                        req->rq_reqmsg->lm_magic);
+               rc = -EINVAL;
+       }
+       if (rc != 0)
+               CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc,
+                      lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens));
+       return rc;
+}
+EXPORT_SYMBOL(lustre_pack_reply_flags);
+
+int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens,
+                     char **bufs)
+{
+       return lustre_pack_reply_flags(req, count, lens, bufs, 0);
+}
+EXPORT_SYMBOL(lustre_pack_reply);
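+
+/*
+ * Example usage (illustrative sketch): a request handler packing a reply that
+ * consists of the mandatory ptlrpc_body plus one payload buffer could do
+ * something like the following, where "body_len" is a placeholder:
+ *
+ *	__u32 lens[] = { sizeof(struct ptlrpc_body), body_len };
+ *
+ *	rc = lustre_pack_reply(req, 2, lens, NULL);
+ *	if (rc != 0)
+ *		return rc;
+ */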
+
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size)
+{
+       int i, offset, buflen, bufcount;
+
+       LASSERT(m != NULL);
+       LASSERT(n >= 0);
+
+       bufcount = m->lm_bufcount;
+       if (unlikely(n >= bufcount)) {
+               CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n",
+                      m, n, bufcount);
+               return NULL;
+       }
+
+       buflen = m->lm_buflens[n];
+       if (unlikely(buflen < min_size)) {
+               CERROR("msg %p buffer[%d] size %d too small "
+                      "(required %d, opc=%d)\n", m, n, buflen, min_size,
+                      n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m));
+               return NULL;
+       }
+
+       offset = lustre_msg_hdr_size_v2(bufcount);
+       for (i = 0; i < n; i++)
+               offset += cfs_size_round(m->lm_buflens[i]);
+
+       return (char *)m + offset;
+}
+
+void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size)
+{
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_buf_v2(m, n, min_size);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x(msg:%p)\n", m->lm_magic, m);
+               return NULL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_buf);
+
+int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment,
+                        unsigned int newlen, int move_data)
+{
+       char   *tail = NULL, *newpos;
+       int     tail_len = 0, n;
+
+       LASSERT(msg);
+       LASSERT(msg->lm_bufcount > segment);
+       LASSERT(msg->lm_buflens[segment] >= newlen);
+
+       if (msg->lm_buflens[segment] == newlen)
+               goto out;
+
+       if (move_data && msg->lm_bufcount > segment + 1) {
+               tail = lustre_msg_buf_v2(msg, segment + 1, 0);
+               for (n = segment + 1; n < msg->lm_bufcount; n++)
+                       tail_len += cfs_size_round(msg->lm_buflens[n]);
+       }
+
+       msg->lm_buflens[segment] = newlen;
+
+       if (tail && tail_len) {
+               newpos = lustre_msg_buf_v2(msg, segment + 1, 0);
+               LASSERT(newpos <= tail);
+               if (newpos != tail)
+                       memmove(newpos, tail, tail_len);
+       }
+out:
+       return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+
+/*
+ * For @msg, shrink @segment to size @newlen. If @move_data is non-zero,
+ * we also move data forward from @segment + 1.
+ *
+ * If @newlen == 0, we remove the segment completely, but we still keep the
+ * total bufcount the same to avoid unnecessary data moving. This will leave
+ * an unused segment with size 0 at the tail, but that's ok.
+ *
+ * Return the new msg size after shrinking.
+ *
+ * CAUTION:
+ * + if any buffers higher than @segment have been filled in, shrink must be
+ *   called with non-zero @move_data.
+ * + the caller should NOT keep pointers to msg buffers higher than @segment
+ *   after calling shrink.
+ */
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+                     unsigned int newlen, int move_data)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_shrink_msg_v2(msg, segment, newlen, move_data);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_shrink_msg);
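+
+/*
+ * Example usage (illustrative sketch): a server handler that packed a reply
+ * segment at its maximum size and later found the actual payload to be
+ * smaller could trim it and update the reply length; "segment" and
+ * "actual_len" below are placeholders:
+ *
+ *	req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment,
+ *					   actual_len, 1);
+ *
+ * move_data is non-zero here because segments after "segment" may already
+ * have been filled in.
+ */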
+
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs)
+{
+       PTLRPC_RS_DEBUG_LRU_DEL(rs);
+
+       LASSERT (atomic_read(&rs->rs_refcount) == 0);
+       LASSERT (!rs->rs_difficult || rs->rs_handled);
+       LASSERT (!rs->rs_on_net);
+       LASSERT (!rs->rs_scheduled);
+       LASSERT (rs->rs_export == NULL);
+       LASSERT (rs->rs_nlocks == 0);
+       LASSERT (list_empty(&rs->rs_exp_list));
+       LASSERT (list_empty(&rs->rs_obd_list));
+
+       sptlrpc_svc_free_rs(rs);
+}
+EXPORT_SYMBOL(lustre_free_reply_state);
+
+static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len)
+{
+       int swabbed, required_len, i;
+
+       /* Now we know the sender speaks my language. */
+       required_len = lustre_msg_hdr_size_v2(0);
+       if (len < required_len) {
+               /* can't even look inside the message */
+               CERROR("message length %d too small for lustre_msg\n", len);
+               return -EINVAL;
+       }
+
+       swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED);
+
+       if (swabbed) {
+               __swab32s(&m->lm_magic);
+               __swab32s(&m->lm_bufcount);
+               __swab32s(&m->lm_secflvr);
+               __swab32s(&m->lm_repsize);
+               __swab32s(&m->lm_cksum);
+               __swab32s(&m->lm_flags);
+               CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0);
+               CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0);
+       }
+
+       required_len = lustre_msg_hdr_size_v2(m->lm_bufcount);
+       if (len < required_len) {
+               /* didn't receive all the buffer lengths */
+               CERROR ("message length %d too small for %d buflens\n",
+                       len, m->lm_bufcount);
+               return -EINVAL;
+       }
+
+       for (i = 0; i < m->lm_bufcount; i++) {
+               if (swabbed)
+                       __swab32s(&m->lm_buflens[i]);
+               required_len += cfs_size_round(m->lm_buflens[i]);
+       }
+
+       if (len < required_len) {
+               CERROR("len: %d, required_len %d\n", len, required_len);
+               CERROR("bufcount: %d\n", m->lm_bufcount);
+               for (i = 0; i < m->lm_bufcount; i++)
+                       CERROR("buffer %d length %d\n", i, m->lm_buflens[i]);
+               return -EINVAL;
+       }
+
+       return swabbed;
+}
+
+int __lustre_unpack_msg(struct lustre_msg *m, int len)
+{
+       int required_len, rc;
+       ENTRY;
+
+       /* We can provide a slightly better error log if we check the
+        * message magic and version first. In the future, struct
+        * lustre_msg may grow, and we'd like to log a version mismatch
+        * rather than a short message.
+        */
+       required_len = offsetof(struct lustre_msg, lm_magic) +
+                      sizeof(m->lm_magic);
+       if (len < required_len) {
+               /* can't even look inside the message */
+               CERROR("message length %d too small for magic/version check\n",
+                      len);
+               RETURN(-EINVAL);
+       }
+
+       rc = lustre_unpack_msg_v2(m, len);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(__lustre_unpack_msg);
+
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len)
+{
+       int rc;
+       rc = __lustre_unpack_msg(req->rq_reqmsg, len);
+       if (rc == 1) {
+               lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+               rc = 0;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_req_msg);
+
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len)
+{
+       int rc;
+       rc = __lustre_unpack_msg(req->rq_repmsg, len);
+       if (rc == 1) {
+               lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+               rc = 0;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_rep_msg);
+
+static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req,
+                                              const int inout, int offset)
+{
+       struct ptlrpc_body *pb;
+       struct lustre_msg_v2 *m = inout ? req->rq_reqmsg : req->rq_repmsg;
+
+       pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2));
+       if (!pb) {
+               CERROR("error unpacking ptlrpc body\n");
+               return -EFAULT;
+       }
+       if (ptlrpc_buf_need_swab(req, inout, offset)) {
+               lustre_swab_ptlrpc_body(pb);
+               ptlrpc_buf_set_swabbed(req, inout, offset);
+       }
+
+       if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) {
+                CERROR("wrong lustre_msg version %08x\n", pb->pb_version);
+                return -EINVAL;
+       }
+
+       return 0;
+}
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+       switch (req->rq_reqmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_unpack_ptlrpc_body_v2(req, 1, offset);
+       default:
+               CERROR("bad lustre msg magic: %08x\n",
+                      req->rq_reqmsg->lm_magic);
+               return -EINVAL;
+       }
+}
+
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+       switch (req->rq_repmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_unpack_ptlrpc_body_v2(req, 0, offset);
+       default:
+               CERROR("bad lustre msg magic: %08x\n",
+                      req->rq_repmsg->lm_magic);
+               return -EINVAL;
+       }
+}
+
+static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n)
+{
+       if (n >= m->lm_bufcount)
+               return 0;
+
+       return m->lm_buflens[n];
+}
+
+/**
+ * lustre_msg_buflen - return the length of buffer \a n in message \a m
+ * \param m lustre_msg (request or reply) to look at
+ * \param n message index (base 0)
+ *
+ * returns zero for non-existent message indices
+ */
+int lustre_msg_buflen(struct lustre_msg *m, int n)
+{
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_buflen_v2(m, n);
+       default:
+               CERROR("incorrect message magic: %08x\n", m->lm_magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_buflen);
+
+static inline void
+lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, int n, int len)
+{
+       if (n >= m->lm_bufcount)
+               LBUG();
+
+       m->lm_buflens[n] = len;
+}
+
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len)
+{
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               lustre_msg_set_buflen_v2(m, n, len);
+               return;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+       }
+}
+
+EXPORT_SYMBOL(lustre_msg_set_buflen);
+
+/* NB: returns the bufcount for the lustre_msg_v2 format, so if the message
+ * is packed in V1 format the result is one bigger (a struct ptlrpc_body
+ * buffer is added). */
+int lustre_msg_bufcount(struct lustre_msg *m)
+{
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return m->lm_bufcount;
+       default:
+               CERROR("incorrect message magic: %08x\n", m->lm_magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_bufcount);
+
+char *lustre_msg_string(struct lustre_msg *m, int index, int max_len)
+{
+       /* max_len == 0 means the string should fill the buffer */
+       char *str;
+       int slen, blen;
+
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               str = lustre_msg_buf_v2(m, index, 0);
+               blen = lustre_msg_buflen_v2(m, index);
+               break;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+       }
+
+       if (str == NULL) {
+               CERROR ("can't unpack string in msg %p buffer[%d]\n", m, index);
+               return NULL;
+       }
+
+       slen = strnlen(str, blen);
+
+       if (slen == blen) {                  /* not NULL terminated */
+               CERROR("can't unpack non-NULL terminated string in "
+                       "msg %p buffer[%d] len %d\n", m, index, blen);
+               return NULL;
+       }
+
+       if (max_len == 0) {
+               if (slen != blen - 1) {
+                       CERROR("can't unpack short string in msg %p "
+                              "buffer[%d] len %d: strlen %d\n",
+                              m, index, blen, slen);
+                       return NULL;
+               }
+       } else if (slen > max_len) {
+               CERROR("can't unpack oversized string in msg %p "
+                      "buffer[%d] len %d strlen %d: max %d expected\n",
+                      m, index, blen, slen, max_len);
+               return NULL;
+       }
+
+       return str;
+}
+EXPORT_SYMBOL(lustre_msg_string);
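+
+/*
+ * Example (illustrative): a 6-byte buffer holding "hello\0" unpacks fine with
+ * max_len == 0, since strnlen() == 5 == buflen - 1; the same buffer holding
+ * "hello!" with no terminating NUL is rejected because strnlen() == buflen,
+ * and with max_len == 3 the "hello\0" case is rejected as oversized since
+ * strnlen() > max_len.
+ */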
+
+/* Wrap up the normal fixed length cases */
+static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index,
+                                     int min_size, void *swabber)
+{
+       void *ptr = NULL;
+
+       LASSERT(msg != NULL);
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               ptr = lustre_msg_buf_v2(msg, index, min_size);
+               break;
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+       }
+
+       if (ptr && swabber)
+               ((void (*)(void *))swabber)(ptr);
+
+       return ptr;
+}
+
+static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg)
+{
+       return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+                                sizeof(struct ptlrpc_body_v2));
+}
+
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return 0;
+       case LUSTRE_MSG_MAGIC_V2:
+               /* already in host endian */
+               return msg->lm_flags;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msghdr_get_flags);
+
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2:
+               msg->lm_flags = flags;
+               return;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+
+__u32 lustre_msg_get_flags(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_flags;
+       }
+       default:
+               /* flags might be printed in debug code while the message
+                * is still uninitialized */
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_flags);
+
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_flags |= flags;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_add_flags);
+
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_flags = flags;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_flags);
+
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags);
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_clear_flags);
+
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_op_flags;
+       }
+       default:
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_op_flags);
+
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_op_flags |= flags;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_add_op_flags);
+
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_op_flags = flags;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_op_flags);
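+
+/*
+ * Illustrative sketch, not part of the original change: the generic flag
+ * helpers above are typically applied to a request's message before it is
+ * (re)sent.  The helper name is invented for this example; MSG_RESENT is
+ * assumed to be the generic flag bit declared in lustre_idl.h.
+ */
+static inline void example_mark_request_resent(struct ptlrpc_request *req)
+{
+       /* tell the server this request has been sent before */
+       lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+}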
+
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return NULL;
+               }
+               return &pb->pb_handle;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return NULL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_handle);
+
+__u32 lustre_msg_get_type(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return PTL_RPC_MSG_ERR;
+               }
+               return pb->pb_type;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return PTL_RPC_MSG_ERR;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_type);
+
+__u32 lustre_msg_get_version(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_version;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_version);
+
+void lustre_msg_add_version(struct lustre_msg *msg, int version)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_version |= version;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_add_version);
+
+__u32 lustre_msg_get_opc(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_opc;
+       }
+       default:
+               CERROR("incorrect message magic: %08x(msg:%p)\n", msg->lm_magic, msg);
+               LBUG();
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_opc);
+
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_last_xid;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_last_xid);
+
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_last_committed;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_last_committed);
+
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return NULL;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return NULL;
+               }
+               return pb->pb_pre_versions;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return NULL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_versions);
+
+__u64 lustre_msg_get_transno(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_transno;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_transno);
+
+int lustre_msg_get_status(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return -EINVAL;
+               }
+               return pb->pb_status;
+       }
+       default:
+               /* status might be printed in debug code while the
+                * message is still uninitialized */
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_status);
+
+__u64 lustre_msg_get_slv(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return -EINVAL;
+               }
+               return pb->pb_slv;
+       }
+       default:
+               CERROR("invalid msg magic %08x\n", msg->lm_magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_slv);
+
+
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return;
+               }
+               pb->pb_slv = slv;
+               return;
+       }
+       default:
+               CERROR("invalid msg magic %08x\n", msg->lm_magic);
+               return;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_slv);
+
+__u32 lustre_msg_get_limit(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return -EINVAL;
+               }
+               return pb->pb_limit;
+       }
+       default:
+               CERROR("invalid msg magic %08x\n", msg->lm_magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_limit);
+
+
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return;
+               }
+               pb->pb_limit = limit;
+               return;
+       }
+       default:
+               CERROR("invalid msg magic %08x\n", msg->lm_magic);
+               return;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_limit);
+
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_conn_cnt;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+
+int lustre_msg_is_v1(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return 1;
+       default:
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_is_v1);
+
+__u32 lustre_msg_get_magic(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return msg->lm_magic;
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_magic);
+
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return 0;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_timeout;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return 0;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_service_time;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+
+char *lustre_msg_get_jobid(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return NULL;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb =
+                       lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+                                         sizeof(struct ptlrpc_body));
+               if (!pb)
+                       return NULL;
+
+               return pb->pb_jobid;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return NULL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_jobid);
+
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return msg->lm_cksum;
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+/*
+ * In 1.6 and 1.8 the checksum was computed only over struct ptlrpc_body as it
+ * was laid out in 1.6 (88 bytes, smaller than the full 1.8 size).  It makes
+ * more sense to compute the checksum on the full ptlrpc_body, regardless
+ * of what size it is, but in order to keep interoperability with 1.8 we
+ * can optionally also checksum only the first 88 bytes (caller decides). */
+# define ptlrpc_body_cksum_size_compat18        88
+
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18)
+#else
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
+#endif
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+               __u32 crc;
+               unsigned int hsize = 4;
+               __u32 len = compat18 ? ptlrpc_body_cksum_size_compat18 :
+                           lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb,
+                                      len, NULL, 0, (unsigned char *)&crc,
+                                      &hsize);
+               return crc;
+#else
+# warning "remove checksum compatibility support for b1_8"
+               __u32 crc;
+               unsigned int hsize = 4;
+               cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb,
+                                  lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF),
+                                  NULL, 0, (unsigned char *)&crc, &hsize);
+               return crc;
+#endif
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
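+
+/*
+ * Illustrative sketch, not part of the original change: a receiver can check
+ * the header checksum by recomputing it with lustre_msg_calc_cksum() above
+ * and comparing against lustre_msg_get_cksum().  This assumes the pre-2.7.50
+ * prototype visible above (the second argument selects the 1.8-compatible
+ * 88-byte checksum; 0 checksums the full ptlrpc_body).
+ */
+static inline int example_msg_cksum_matches(struct lustre_msg *msg)
+{
+       return lustre_msg_calc_cksum(msg, 0) == lustre_msg_get_cksum(msg);
+}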
+
+void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_handle = *handle;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_handle);
+
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_type = type;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_type);
+
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_opc = opc;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_opc);
+
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_last_xid = last_xid;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_last_xid);
+
+void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_last_committed = last_committed;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_last_committed);
+
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_pre_versions[0] = versions[0];
+               pb->pb_pre_versions[1] = versions[1];
+               pb->pb_pre_versions[2] = versions[2];
+               pb->pb_pre_versions[3] = versions[3];
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_versions);
+
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_transno = transno;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_transno);
+
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_status = status;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_status);
+
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_conn_cnt = conn_cnt;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_conn_cnt);
+
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_timeout = timeout;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_service_time = service_time;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2: {
+               __u32 opc = lustre_msg_get_opc(msg);
+               struct ptlrpc_body *pb;
+
+               /* Don't set the jobid for LDLM AST RPCs; they have been
+                * shrunk.  See the comment in ptlrpc_request_pack(). */
+               if (!opc || opc == LDLM_BL_CALLBACK ||
+                   opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK)
+                       return;
+
+               pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+                                      sizeof(struct ptlrpc_body));
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+
+               if (jobid != NULL)
+                       memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE);
+               else if (pb->pb_jobid[0] == '\0')
+                       lustre_get_jobid(pb->pb_jobid);
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_jobid);
+
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2:
+               msg->lm_cksum = cksum;
+               return;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+
+
+void ptlrpc_request_set_replen(struct ptlrpc_request *req)
+{
+       int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER);
+
+       req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count,
+                                        req->rq_pill.rc_area[RCL_SERVER]);
+       if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+               req->rq_reqmsg->lm_repsize = req->rq_replen;
+}
+EXPORT_SYMBOL(ptlrpc_request_set_replen);
+
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens)
+{
+       req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+       if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+               req->rq_reqmsg->lm_repsize = req->rq_replen;
+}
+EXPORT_SYMBOL(ptlrpc_req_set_repsize);
+
+/**
+ * Send a remote set_info_async.
+ *
+ * This may go from client to server or server to client.
+ */
+int do_set_info_async(struct obd_import *imp,
+                     int opcode, int version,
+                     obd_count keylen, void *key,
+                     obd_count vallen, void *val,
+                     struct ptlrpc_request_set *set)
+{
+       struct ptlrpc_request *req;
+       char              *tmp;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                            RCL_CLIENT, keylen);
+       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+                            RCL_CLIENT, vallen);
+       rc = ptlrpc_request_pack(req, version, opcode);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+       memcpy(tmp, key, keylen);
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+       memcpy(tmp, val, vallen);
+
+       ptlrpc_request_set_replen(req);
+
+       if (set) {
+               ptlrpc_set_add_req(set, req);
+               ptlrpc_check_set(NULL, set);
+       } else {
+               rc = ptlrpc_queue_wait(req);
+               ptlrpc_req_finished(req);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(do_set_info_async);
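+
+/*
+ * Illustrative sketch, not part of the original change: passing a NULL
+ * request set makes do_set_info_async() above wait for the reply itself.
+ * The helper and key name are invented; OST_SET_INFO and LUSTRE_OST_VERSION
+ * are assumed to be the usual opcode/version constants from lustre_idl.h.
+ */
+static inline int example_set_info_sync(struct obd_import *imp,
+                                       void *val, obd_count vallen)
+{
+       char *key = "example_key";      /* hypothetical key */
+
+       return do_set_info_async(imp, OST_SET_INFO, LUSTRE_OST_VERSION,
+                                strlen(key) + 1, key, vallen, val, NULL);
+}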
+
+/* byte flipping routines for all wire types declared in
+ * lustre_idl.h implemented here.
+ */
+void lustre_swab_ptlrpc_body(struct ptlrpc_body *b)
+{
+       __swab32s (&b->pb_type);
+       __swab32s (&b->pb_version);
+       __swab32s (&b->pb_opc);
+       __swab32s (&b->pb_status);
+       __swab64s (&b->pb_last_xid);
+       __swab64s (&b->pb_last_seen);
+       __swab64s (&b->pb_last_committed);
+       __swab64s (&b->pb_transno);
+       __swab32s (&b->pb_flags);
+       __swab32s (&b->pb_op_flags);
+       __swab32s (&b->pb_conn_cnt);
+       __swab32s (&b->pb_timeout);
+       __swab32s (&b->pb_service_time);
+       __swab32s (&b->pb_limit);
+       __swab64s (&b->pb_slv);
+       __swab64s (&b->pb_pre_versions[0]);
+       __swab64s (&b->pb_pre_versions[1]);
+       __swab64s (&b->pb_pre_versions[2]);
+       __swab64s (&b->pb_pre_versions[3]);
+       CLASSERT(offsetof(typeof(*b), pb_padding) != 0);
+       /* While we need to maintain compatibility between
+        * clients and servers without ptlrpc_body_v2 (< 2.3),
+        * do not swab any fields beyond pb_jobid, as we are
+        * using this swab function for both ptlrpc_body
+        * and ptlrpc_body_v2. */
+       CLASSERT(offsetof(typeof(*b), pb_jobid) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_ptlrpc_body);
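+
+/*
+ * Illustrative sketch, not part of the original change: the per-type swab
+ * routines in this file are usually reached through __lustre_swab_buf()
+ * defined earlier, which locates the buffer and applies the swabber in
+ * place.  The helper name is invented for this example.
+ */
+static inline struct ptlrpc_body *example_swab_body_in_place(struct lustre_msg *msg)
+{
+       return __lustre_swab_buf(msg, MSG_PTLRPC_BODY_OFF,
+                                sizeof(struct ptlrpc_body),
+                                (void *)lustre_swab_ptlrpc_body);
+}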
+
+void lustre_swab_connect(struct obd_connect_data *ocd)
+{
+       __swab64s(&ocd->ocd_connect_flags);
+       __swab32s(&ocd->ocd_version);
+       __swab32s(&ocd->ocd_grant);
+       __swab64s(&ocd->ocd_ibits_known);
+       __swab32s(&ocd->ocd_index);
+       __swab32s(&ocd->ocd_brw_size);
+       /* ocd_blocksize and ocd_inodespace don't need to be swabbed because
+        * they are single-byte (__u8) values */
+       __swab16s(&ocd->ocd_grant_extent);
+       __swab32s(&ocd->ocd_unused);
+       __swab64s(&ocd->ocd_transno);
+       __swab32s(&ocd->ocd_group);
+       __swab32s(&ocd->ocd_cksum_types);
+       __swab32s(&ocd->ocd_instance);
+       /* Fields after ocd_cksum_types are only accessible by the receiver
+        * if the corresponding flag in ocd_connect_flags is set. Accessing
+        * any field after ocd_maxbytes on the receiver without a valid flag
+        * may result in out-of-bound memory access and kernel oops. */
+       if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)
+               __swab32s(&ocd->ocd_max_easize);
+       if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES)
+               __swab64s(&ocd->ocd_maxbytes);
+       CLASSERT(offsetof(typeof(*ocd), padding1) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding2) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding3) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding4) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding5) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding6) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding7) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding8) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding9) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingA) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingB) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingC) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingD) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingE) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingF) != 0);
+}
+
+void lustre_swab_obdo (struct obdo  *o)
+{
+       __swab64s (&o->o_valid);
+       lustre_swab_ost_id(&o->o_oi);
+       __swab64s (&o->o_parent_seq);
+       __swab64s (&o->o_size);
+       __swab64s (&o->o_mtime);
+       __swab64s (&o->o_atime);
+       __swab64s (&o->o_ctime);
+       __swab64s (&o->o_blocks);
+       __swab64s (&o->o_grant);
+       __swab32s (&o->o_blksize);
+       __swab32s (&o->o_mode);
+       __swab32s (&o->o_uid);
+       __swab32s (&o->o_gid);
+       __swab32s (&o->o_flags);
+       __swab32s (&o->o_nlink);
+       __swab32s (&o->o_parent_oid);
+       __swab32s (&o->o_misc);
+       __swab64s (&o->o_ioepoch);
+       __swab32s (&o->o_stripe_idx);
+       __swab32s (&o->o_parent_ver);
+       /* o_handle is opaque */
+       /* o_lcookie is swabbed elsewhere */
+       __swab32s (&o->o_uid_h);
+       __swab32s (&o->o_gid_h);
+       __swab64s (&o->o_data_version);
+       CLASSERT(offsetof(typeof(*o), o_padding_4) != 0);
+       CLASSERT(offsetof(typeof(*o), o_padding_5) != 0);
+       CLASSERT(offsetof(typeof(*o), o_padding_6) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obdo);
+
+void lustre_swab_obd_statfs (struct obd_statfs *os)
+{
+       __swab64s (&os->os_type);
+       __swab64s (&os->os_blocks);
+       __swab64s (&os->os_bfree);
+       __swab64s (&os->os_bavail);
+       __swab64s (&os->os_files);
+       __swab64s (&os->os_ffree);
+       /* no need to swab os_fsid */
+       __swab32s (&os->os_bsize);
+       __swab32s (&os->os_namelen);
+       __swab64s (&os->os_maxbytes);
+       __swab32s (&os->os_state);
+       CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare2) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare3) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare4) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare5) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare6) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare7) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare8) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare9) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obd_statfs);
+
+void lustre_swab_obd_ioobj(struct obd_ioobj *ioo)
+{
+       lustre_swab_ost_id(&ioo->ioo_oid);
+       __swab32s(&ioo->ioo_max_brw);
+       __swab32s(&ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(lustre_swab_obd_ioobj);
+
+void lustre_swab_niobuf_remote (struct niobuf_remote *nbr)
+{
+       __swab64s (&nbr->offset);
+       __swab32s (&nbr->len);
+       __swab32s (&nbr->flags);
+}
+EXPORT_SYMBOL(lustre_swab_niobuf_remote);
+
+void lustre_swab_ost_body (struct ost_body *b)
+{
+       lustre_swab_obdo (&b->oa);
+}
+EXPORT_SYMBOL(lustre_swab_ost_body);
+
+void lustre_swab_ost_last_id(obd_id *id)
+{
+       __swab64s(id);
+}
+EXPORT_SYMBOL(lustre_swab_ost_last_id);
+
+void lustre_swab_generic_32s(__u32 *val)
+{
+       __swab32s(val);
+}
+EXPORT_SYMBOL(lustre_swab_generic_32s);
+
+void lustre_swab_gl_desc(union ldlm_gl_desc *desc)
+{
+       lustre_swab_lu_fid(&desc->lquota_desc.gl_id.qid_fid);
+       __swab64s(&desc->lquota_desc.gl_flags);
+       __swab64s(&desc->lquota_desc.gl_ver);
+       __swab64s(&desc->lquota_desc.gl_hardlimit);
+       __swab64s(&desc->lquota_desc.gl_softlimit);
+       __swab64s(&desc->lquota_desc.gl_time);
+       CLASSERT(offsetof(typeof(desc->lquota_desc), gl_pad2) != 0);
+}
+
+void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb)
+{
+       __swab64s(&lvb->lvb_size);
+       __swab64s(&lvb->lvb_mtime);
+       __swab64s(&lvb->lvb_atime);
+       __swab64s(&lvb->lvb_ctime);
+       __swab64s(&lvb->lvb_blocks);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb_v1);
+
+void lustre_swab_ost_lvb(struct ost_lvb *lvb)
+{
+       __swab64s(&lvb->lvb_size);
+       __swab64s(&lvb->lvb_mtime);
+       __swab64s(&lvb->lvb_atime);
+       __swab64s(&lvb->lvb_ctime);
+       __swab64s(&lvb->lvb_blocks);
+       __swab32s(&lvb->lvb_mtime_ns);
+       __swab32s(&lvb->lvb_atime_ns);
+       __swab32s(&lvb->lvb_ctime_ns);
+       __swab32s(&lvb->lvb_padding);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb);
+
+void lustre_swab_lquota_lvb(struct lquota_lvb *lvb)
+{
+       __swab64s(&lvb->lvb_flags);
+       __swab64s(&lvb->lvb_id_may_rel);
+       __swab64s(&lvb->lvb_id_rel);
+       __swab64s(&lvb->lvb_id_qunit);
+       __swab64s(&lvb->lvb_pad1);
+}
+EXPORT_SYMBOL(lustre_swab_lquota_lvb);
+
+void lustre_swab_mdt_body (struct mdt_body *b)
+{
+       lustre_swab_lu_fid (&b->fid1);
+       lustre_swab_lu_fid (&b->fid2);
+       /* handle is opaque */
+       __swab64s (&b->valid);
+       __swab64s (&b->size);
+       __swab64s (&b->mtime);
+       __swab64s (&b->atime);
+       __swab64s (&b->ctime);
+       __swab64s (&b->blocks);
+       __swab64s (&b->ioepoch);
+       CLASSERT(offsetof(typeof(*b), unused1) != 0);
+       __swab32s (&b->fsuid);
+       __swab32s (&b->fsgid);
+       __swab32s (&b->capability);
+       __swab32s (&b->mode);
+       __swab32s (&b->uid);
+       __swab32s (&b->gid);
+       __swab32s (&b->flags);
+       __swab32s (&b->rdev);
+       __swab32s (&b->nlink);
+       CLASSERT(offsetof(typeof(*b), unused2) != 0);
+       __swab32s (&b->suppgid);
+       __swab32s (&b->eadatasize);
+       __swab32s (&b->aclsize);
+       __swab32s (&b->max_mdsize);
+       __swab32s (&b->max_cookiesize);
+       __swab32s (&b->uid_h);
+       __swab32s (&b->gid_h);
+       CLASSERT(offsetof(typeof(*b), padding_5) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_body);
+
+void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b)
+{
+       /* handle is opaque */
+        __swab64s (&b->ioepoch);
+        __swab32s (&b->flags);
+        CLASSERT(offsetof(typeof(*b), padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_ioepoch);
+
+void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
+{
+       int i;
+       __swab32s(&mti->mti_lustre_ver);
+       __swab32s(&mti->mti_stripe_index);
+       __swab32s(&mti->mti_config_ver);
+       __swab32s(&mti->mti_flags);
+       __swab32s(&mti->mti_instance);
+       __swab32s(&mti->mti_nid_count);
+       CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+       for (i = 0; i < MTI_NIDS_MAX; i++)
+               __swab64s(&mti->mti_nids[i]);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_target_info);
+
+void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
+{
+       int i;
+
+       __swab64s(&entry->mne_version);
+       __swab32s(&entry->mne_instance);
+       __swab32s(&entry->mne_index);
+       __swab32s(&entry->mne_length);
+
+       /* mne_nid_count and mne_nid_type must be single-byte fields because
+        * they are accessed without swabbing. */
+       CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
+       CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
+
+       /* Remove this assertion once IPv6 is supported. */
+       LASSERT(entry->mne_nid_type == 0);
+       for (i = 0; i < entry->mne_nid_count; i++) {
+               CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+               __swab64s(&entry->u.nids[i]);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
+
+void lustre_swab_mgs_config_body(struct mgs_config_body *body)
+{
+       __swab64s(&body->mcb_offset);
+       __swab32s(&body->mcb_units);
+       __swab16s(&body->mcb_type);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_body);
+
+void lustre_swab_mgs_config_res(struct mgs_config_res *body)
+{
+       __swab64s(&body->mcr_offset);
+       __swab64s(&body->mcr_size);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_res);
+
+static void lustre_swab_obd_dqinfo (struct obd_dqinfo *i)
+{
+       __swab64s (&i->dqi_bgrace);
+       __swab64s (&i->dqi_igrace);
+       __swab32s (&i->dqi_flags);
+       __swab32s (&i->dqi_valid);
+}
+
+static void lustre_swab_obd_dqblk (struct obd_dqblk *b)
+{
+       __swab64s (&b->dqb_ihardlimit);
+       __swab64s (&b->dqb_isoftlimit);
+       __swab64s (&b->dqb_curinodes);
+       __swab64s (&b->dqb_bhardlimit);
+       __swab64s (&b->dqb_bsoftlimit);
+       __swab64s (&b->dqb_curspace);
+       __swab64s (&b->dqb_btime);
+       __swab64s (&b->dqb_itime);
+       __swab32s (&b->dqb_valid);
+       CLASSERT(offsetof(typeof(*b), dqb_padding) != 0);
+}
+
+void lustre_swab_obd_quotactl (struct obd_quotactl *q)
+{
+       __swab32s (&q->qc_cmd);
+       __swab32s (&q->qc_type);
+       __swab32s (&q->qc_id);
+       __swab32s (&q->qc_stat);
+       lustre_swab_obd_dqinfo (&q->qc_dqinfo);
+       lustre_swab_obd_dqblk (&q->qc_dqblk);
+}
+EXPORT_SYMBOL(lustre_swab_obd_quotactl);
+
+void lustre_swab_mdt_remote_perm (struct mdt_remote_perm *p)
+{
+       __swab32s (&p->rp_uid);
+       __swab32s (&p->rp_gid);
+       __swab32s (&p->rp_fsuid);
+       __swab32s (&p->rp_fsuid_h);
+       __swab32s (&p->rp_fsgid);
+       __swab32s (&p->rp_fsgid_h);
+       __swab32s (&p->rp_access_perm);
+       __swab32s (&p->rp_padding);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_remote_perm);
+
+void lustre_swab_fid2path(struct getinfo_fid2path *gf)
+{
+       lustre_swab_lu_fid(&gf->gf_fid);
+       __swab64s(&gf->gf_recno);
+       __swab32s(&gf->gf_linkno);
+       __swab32s(&gf->gf_pathlen);
+}
+EXPORT_SYMBOL(lustre_swab_fid2path);
+
+void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent)
+{
+       __swab64s(&fm_extent->fe_logical);
+       __swab64s(&fm_extent->fe_physical);
+       __swab64s(&fm_extent->fe_length);
+       __swab32s(&fm_extent->fe_flags);
+       __swab32s(&fm_extent->fe_device);
+}
+
+void lustre_swab_fiemap(struct ll_user_fiemap *fiemap)
+{
+       int i;
+
+       __swab64s(&fiemap->fm_start);
+       __swab64s(&fiemap->fm_length);
+       __swab32s(&fiemap->fm_flags);
+       __swab32s(&fiemap->fm_mapped_extents);
+       __swab32s(&fiemap->fm_extent_count);
+       __swab32s(&fiemap->fm_reserved);
+
+       for (i = 0; i < fiemap->fm_mapped_extents; i++)
+               lustre_swab_fiemap_extent(&fiemap->fm_extents[i]);
+}
+EXPORT_SYMBOL(lustre_swab_fiemap);
+
+void lustre_swab_idx_info(struct idx_info *ii)
+{
+       __swab32s(&ii->ii_magic);
+       __swab32s(&ii->ii_flags);
+       __swab16s(&ii->ii_count);
+       __swab32s(&ii->ii_attrs);
+       lustre_swab_lu_fid(&ii->ii_fid);
+       __swab64s(&ii->ii_version);
+       __swab64s(&ii->ii_hash_start);
+       __swab64s(&ii->ii_hash_end);
+       __swab16s(&ii->ii_keysize);
+       __swab16s(&ii->ii_recsize);
+}
+
+void lustre_swab_lip_header(struct lu_idxpage *lip)
+{
+       /* swab header */
+       __swab32s(&lip->lip_magic);
+       __swab16s(&lip->lip_flags);
+       __swab16s(&lip->lip_nr);
+}
+EXPORT_SYMBOL(lustre_swab_lip_header);
+
+void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr)
+{
+       __swab32s(&rr->rr_opcode);
+       __swab32s(&rr->rr_cap);
+       __swab32s(&rr->rr_fsuid);
+       /* rr_fsuid_h is unused */
+       __swab32s(&rr->rr_fsgid);
+       /* rr_fsgid_h is unused */
+       __swab32s(&rr->rr_suppgid1);
+       /* rr_suppgid1_h is unused */
+       __swab32s(&rr->rr_suppgid2);
+       /* rr_suppgid2_h is unused */
+       lustre_swab_lu_fid(&rr->rr_fid1);
+       lustre_swab_lu_fid(&rr->rr_fid2);
+       __swab64s(&rr->rr_mtime);
+       __swab64s(&rr->rr_atime);
+       __swab64s(&rr->rr_ctime);
+       __swab64s(&rr->rr_size);
+       __swab64s(&rr->rr_blocks);
+       __swab32s(&rr->rr_bias);
+       __swab32s(&rr->rr_mode);
+       __swab32s(&rr->rr_flags);
+       __swab32s(&rr->rr_flags_h);
+       __swab32s(&rr->rr_umask);
+
+       CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_rec_reint);
+
+void lustre_swab_lov_desc (struct lov_desc *ld)
+{
+       __swab32s (&ld->ld_tgt_count);
+       __swab32s (&ld->ld_active_tgt_count);
+       __swab32s (&ld->ld_default_stripe_count);
+       __swab32s (&ld->ld_pattern);
+       __swab64s (&ld->ld_default_stripe_size);
+       __swab64s (&ld->ld_default_stripe_offset);
+       __swab32s (&ld->ld_qos_maxage);
+       /* uuid endian insensitive */
+}
+EXPORT_SYMBOL(lustre_swab_lov_desc);
+
+void lustre_swab_lmv_desc (struct lmv_desc *ld)
+{
+       __swab32s (&ld->ld_tgt_count);
+       __swab32s (&ld->ld_active_tgt_count);
+       __swab32s (&ld->ld_default_stripe_count);
+       __swab32s (&ld->ld_pattern);
+       __swab64s (&ld->ld_default_hash_size);
+       __swab32s (&ld->ld_qos_maxage);
+       /* uuid endian insensitive */
+}
+
+void lustre_swab_lmv_stripe_md (struct lmv_stripe_md *mea)
+{
+       __swab32s(&mea->mea_magic);
+       __swab32s(&mea->mea_count);
+       __swab32s(&mea->mea_master);
+       CLASSERT(offsetof(typeof(*mea), mea_padding) != 0);
+}
+
+void lustre_swab_lmv_user_md(struct lmv_user_md *lum)
+{
+       int i;
+
+       __swab32s(&lum->lum_magic);
+       __swab32s(&lum->lum_stripe_count);
+       __swab32s(&lum->lum_stripe_offset);
+       __swab32s(&lum->lum_hash_type);
+       __swab32s(&lum->lum_type);
+       CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0);
+       CLASSERT(offsetof(typeof(*lum), lum_padding2) != 0);
+       CLASSERT(offsetof(typeof(*lum), lum_padding3) != 0);
+
+       for (i = 0; i < lum->lum_stripe_count; i++) {
+               __swab32s(&lum->lum_objects[i].lum_mds);
+               lustre_swab_lu_fid(&lum->lum_objects[i].lum_fid);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_lmv_user_md);
+
+static void print_lum (struct lov_user_md *lum)
+{
+       CDEBUG(D_OTHER, "lov_user_md %p:\n", lum);
+       CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic);
+       CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern);
+       CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lmm_oi_id(&lum->lmm_oi));
+       CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lmm_oi_seq(&lum->lmm_oi));
+       CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size);
+       CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count);
+       CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n",
+                       lum->lmm_stripe_offset);
+}
+
+static void lustre_swab_lmm_oi(struct ost_id *oi)
+{
+       __swab64s(&oi->oi.oi_id);
+       __swab64s(&oi->oi.oi_seq);
+}
+
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum)
+{
+       ENTRY;
+       __swab32s(&lum->lmm_magic);
+       __swab32s(&lum->lmm_pattern);
+       lustre_swab_lmm_oi(&lum->lmm_oi);
+       __swab32s(&lum->lmm_stripe_size);
+       __swab16s(&lum->lmm_stripe_count);
+       __swab16s(&lum->lmm_stripe_offset);
+       print_lum(lum);
+       EXIT;
+}
+
+void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum)
+{
+       ENTRY;
+       CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n");
+       lustre_swab_lov_user_md_common(lum);
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
+
+void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
+{
+       ENTRY;
+       CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n");
+       lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum);
+       /* lmm_pool_name is a char array and needs no swabbing */
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
+
+void lustre_swab_lov_mds_md(struct lov_mds_md *lmm)
+{
+       ENTRY;
+       CDEBUG(D_IOCTL, "swabbing lov_mds_md\n");
+       __swab32s(&lmm->lmm_magic);
+       __swab32s(&lmm->lmm_pattern);
+       lustre_swab_lmm_oi(&lmm->lmm_oi);
+       __swab32s(&lmm->lmm_stripe_size);
+       __swab16s(&lmm->lmm_stripe_count);
+       __swab16s(&lmm->lmm_layout_gen);
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_mds_md);
+
+void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+                                    int stripe_count)
+{
+       int i;
+       ENTRY;
+       for (i = 0; i < stripe_count; i++) {
+               lustre_swab_ost_id(&(lod[i].l_ost_oi));
+               __swab32s(&(lod[i].l_ost_gen));
+               __swab32s(&(lod[i].l_ost_idx));
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
+
+void lustre_swab_ldlm_res_id (struct ldlm_res_id *id)
+{
+       int  i;
+
+       for (i = 0; i < RES_NAME_SIZE; i++)
+               __swab64s (&id->name[i]);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
+
+void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d)
+{
+       /* the lock data is a union and the first two fields are always an
+        * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock
+        * data the same way. */
+       __swab64s(&d->l_extent.start);
+       __swab64s(&d->l_extent.end);
+       __swab64s(&d->l_extent.gid);
+       __swab64s(&d->l_flock.lfw_owner);
+       __swab32s(&d->l_flock.lfw_pid);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_policy_data);
+
+void lustre_swab_ldlm_intent (struct ldlm_intent *i)
+{
+       __swab64s (&i->opc);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_intent);
+
+void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r)
+{
+       __swab32s (&r->lr_type);
+       CLASSERT(offsetof(typeof(*r), lr_padding) != 0);
+       lustre_swab_ldlm_res_id (&r->lr_name);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc);
+
+void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l)
+{
+       lustre_swab_ldlm_resource_desc (&l->l_resource);
+       __swab32s (&l->l_req_mode);
+       __swab32s (&l->l_granted_mode);
+       lustre_swab_ldlm_policy_data (&l->l_policy_data);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc);
+
+void lustre_swab_ldlm_request (struct ldlm_request *rq)
+{
+       __swab32s (&rq->lock_flags);
+       lustre_swab_ldlm_lock_desc (&rq->lock_desc);
+       __swab32s (&rq->lock_count);
+       /* lock_handle[] opaque */
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_request);
+
+void lustre_swab_ldlm_reply (struct ldlm_reply *r)
+{
+       __swab32s (&r->lock_flags);
+       CLASSERT(offsetof(typeof(*r), lock_padding) != 0);
+       lustre_swab_ldlm_lock_desc (&r->lock_desc);
+       /* lock_handle opaque */
+       __swab64s (&r->lock_policy_res1);
+       __swab64s (&r->lock_policy_res2);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_reply);
+
+void lustre_swab_quota_body(struct quota_body *b)
+{
+       lustre_swab_lu_fid(&b->qb_fid);
+       lustre_swab_lu_fid((struct lu_fid *)&b->qb_id);
+       __swab32s(&b->qb_flags);
+       __swab64s(&b->qb_count);
+       __swab64s(&b->qb_usage);
+       __swab64s(&b->qb_slv_ver);
+}
+
+/* Dump functions */
+void dump_ioo(struct obd_ioobj *ioo)
+{
+       CDEBUG(D_RPCTRACE,
+              "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, "
+              "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw,
+              ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(dump_ioo);
+
+void dump_rniobuf(struct niobuf_remote *nb)
+{
+       CDEBUG(D_RPCTRACE, "niobuf_remote: offset="LPU64", len=%d, flags=%x\n",
+              nb->offset, nb->len, nb->flags);
+}
+EXPORT_SYMBOL(dump_rniobuf);
+
+void dump_obdo(struct obdo *oa)
+{
+       __u32 valid = oa->o_valid;
+
+       CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid);
+       if (valid & OBD_MD_FLID)
+               CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi));
+       if (valid & OBD_MD_FLFID)
+               CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = "LPX64"\n",
+                      oa->o_parent_seq);
+       if (valid & OBD_MD_FLSIZE)
+               CDEBUG(D_RPCTRACE, "obdo: o_size = "LPD64"\n", oa->o_size);
+       if (valid & OBD_MD_FLMTIME)
+               CDEBUG(D_RPCTRACE, "obdo: o_mtime = "LPD64"\n", oa->o_mtime);
+       if (valid & OBD_MD_FLATIME)
+               CDEBUG(D_RPCTRACE, "obdo: o_atime = "LPD64"\n", oa->o_atime);
+       if (valid & OBD_MD_FLCTIME)
+               CDEBUG(D_RPCTRACE, "obdo: o_ctime = "LPD64"\n", oa->o_ctime);
+       if (valid & OBD_MD_FLBLOCKS)   /* allocation of space */
+               CDEBUG(D_RPCTRACE, "obdo: o_blocks = "LPD64"\n", oa->o_blocks);
+       if (valid & OBD_MD_FLGRANT)
+               CDEBUG(D_RPCTRACE, "obdo: o_grant = "LPD64"\n", oa->o_grant);
+       if (valid & OBD_MD_FLBLKSZ)
+               CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize);
+       if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE))
+               CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n",
+                      oa->o_mode & ((valid & OBD_MD_FLTYPE ?  S_IFMT : 0) |
+                                    (valid & OBD_MD_FLMODE ? ~S_IFMT : 0)));
+       if (valid & OBD_MD_FLUID)
+               CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid);
+       if (valid & OBD_MD_FLUID)
+               CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h);
+       if (valid & OBD_MD_FLGID)
+               CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid);
+       if (valid & OBD_MD_FLGID)
+               CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h);
+       if (valid & OBD_MD_FLFLAGS)
+               CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags);
+       if (valid & OBD_MD_FLNLINK)
+               CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink);
+       else if (valid & OBD_MD_FLCKSUM)
+               CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n",
+                      oa->o_nlink);
+       if (valid & OBD_MD_FLGENER)
+               CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n",
+                      oa->o_parent_oid);
+       if (valid & OBD_MD_FLEPOCH)
+               CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = "LPD64"\n",
+                      oa->o_ioepoch);
+       if (valid & OBD_MD_FLFID) {
+               CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n",
+                      oa->o_stripe_idx);
+               CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n",
+                      oa->o_parent_ver);
+       }
+       if (valid & OBD_MD_FLHANDLE)
+               CDEBUG(D_RPCTRACE, "obdo: o_handle = "LPD64"\n",
+                      oa->o_handle.cookie);
+       if (valid & OBD_MD_FLCOOKIE)
+               CDEBUG(D_RPCTRACE, "obdo: o_lcookie = "
+                      "(llog_cookie dumping not yet implemented)\n");
+}
+EXPORT_SYMBOL(dump_obdo);
+
+void dump_ost_body(struct ost_body *ob)
+{
+       dump_obdo(&ob->oa);
+}
+EXPORT_SYMBOL(dump_ost_body);
+
+void dump_rcs(__u32 *rc)
+{
+       CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc);
+}
+EXPORT_SYMBOL(dump_rcs);
+
+static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_reqmsg);
+
+       switch (req->rq_reqmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF);
+       default:
+               CERROR("bad lustre msg magic: %#08X\n",
+                      req->rq_reqmsg->lm_magic);
+       }
+       return 0;
+}
+
+static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_repmsg);
+
+       switch (req->rq_repmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
+       default:
+               /* uninitialized yet */
+               return 0;
+       }
+}
+
+void _debug_req(struct ptlrpc_request *req,
+               struct libcfs_debug_msg_data *msgdata,
+               const char *fmt, ... )
+{
+       int req_ok = req->rq_reqmsg != NULL;
+       int rep_ok = req->rq_repmsg != NULL;
+       lnet_nid_t nid = LNET_NID_ANY;
+       va_list args;
+
+       if (ptlrpc_req_need_swab(req)) {
+               req_ok = req_ok && req_ptlrpc_body_swabbed(req);
+               rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req);
+       }
+
+       if (req->rq_import && req->rq_import->imp_connection)
+               nid = req->rq_import->imp_connection->c_peer.nid;
+       else if (req->rq_export && req->rq_export->exp_connection)
+               nid = req->rq_export->exp_connection->c_peer.nid;
+
+       va_start(args, fmt);
+       libcfs_debug_vmsg2(msgdata, fmt, args,
+                          " req@%p x"LPU64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d"
+                          " lens %d/%d e %d to %d dl "CFS_TIME_T" ref %d "
+                          "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
+                          req, req->rq_xid, req->rq_transno,
+                          req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0,
+                          req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
+                          req->rq_import ?
+                               req->rq_import->imp_obd->obd_name :
+                               req->rq_export ?
+                                    req->rq_export->exp_client_uuid.uuid :
+                                    "<?>",
+                          libcfs_nid2str(nid),
+                          req->rq_request_portal, req->rq_reply_portal,
+                          req->rq_reqlen, req->rq_replen,
+                          req->rq_early_count, req->rq_timedout,
+                          req->rq_deadline,
+                          atomic_read(&req->rq_refcount),
+                          DEBUG_REQ_FLAGS(req),
+                          req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1,
+                          rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1,
+                          req->rq_status,
+                          rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1);
+}
+EXPORT_SYMBOL(_debug_req);
+
+void lustre_swab_lustre_capa(struct lustre_capa *c)
+{
+       lustre_swab_lu_fid(&c->lc_fid);
+       __swab64s (&c->lc_opc);
+       __swab64s (&c->lc_uid);
+       __swab64s (&c->lc_gid);
+       __swab32s (&c->lc_flags);
+       __swab32s (&c->lc_keyid);
+       __swab32s (&c->lc_timeout);
+       __swab32s (&c->lc_expiry);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa);
+
+void lustre_swab_lustre_capa_key(struct lustre_capa_key *k)
+{
+       __swab64s (&k->lk_seq);
+       __swab32s (&k->lk_keyid);
+       CLASSERT(offsetof(typeof(*k), lk_padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa_key);
+
+void lustre_swab_hsm_user_state(struct hsm_user_state *state)
+{
+       __swab32s(&state->hus_states);
+       __swab32s(&state->hus_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_state);
+
+void lustre_swab_hsm_state_set(struct hsm_state_set *hss)
+{
+       __swab32s(&hss->hss_valid);
+       __swab64s(&hss->hss_setmask);
+       __swab64s(&hss->hss_clearmask);
+       __swab32s(&hss->hss_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_state_set);
+
+void lustre_swab_hsm_extent(struct hsm_extent *extent)
+{
+       __swab64s(&extent->offset);
+       __swab64s(&extent->length);
+}
+
+void lustre_swab_hsm_current_action(struct hsm_current_action *action)
+{
+       __swab32s(&action->hca_state);
+       __swab32s(&action->hca_action);
+       lustre_swab_hsm_extent(&action->hca_location);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_current_action);
+
+void lustre_swab_hsm_user_item(struct hsm_user_item *hui)
+{
+       lustre_swab_lu_fid(&hui->hui_fid);
+       lustre_swab_hsm_extent(&hui->hui_extent);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_item);
+
+void lustre_swab_layout_intent(struct layout_intent *li)
+{
+       __swab32s(&li->li_opc);
+       __swab32s(&li->li_flags);
+       __swab64s(&li->li_start);
+       __swab64s(&li->li_end);
+}
+EXPORT_SYMBOL(lustre_swab_layout_intent);
+
+void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
+{
+       lustre_swab_lu_fid(&hpk->hpk_fid);
+       __swab64s(&hpk->hpk_cookie);
+       __swab64s(&hpk->hpk_extent.offset);
+       __swab64s(&hpk->hpk_extent.length);
+       __swab16s(&hpk->hpk_flags);
+       __swab16s(&hpk->hpk_errval);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel);
+
+void lustre_swab_hsm_request(struct hsm_request *hr)
+{
+       __swab32s(&hr->hr_action);
+       __swab32s(&hr->hr_archive_id);
+       __swab64s(&hr->hr_flags);
+       __swab32s(&hr->hr_itemcount);
+       __swab32s(&hr->hr_data_len);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_request);
+
+void lustre_swab_update_buf(struct update_buf *ub)
+{
+       __swab32s(&ub->ub_magic);
+       __swab32s(&ub->ub_count);
+}
+EXPORT_SYMBOL(lustre_swab_update_buf);
+
+void lustre_swab_update_reply_buf(struct update_reply *ur)
+{
+       int i;
+
+       __swab32s(&ur->ur_version);
+       __swab32s(&ur->ur_count);
+       for (i = 0; i < ur->ur_count; i++)
+               __swab32s(&ur->ur_lens[i]);
+}
+EXPORT_SYMBOL(lustre_swab_update_reply_buf);
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl)
+{
+       __swab64s(&msl->msl_flags);
+}
+EXPORT_SYMBOL(lustre_swab_swap_layouts);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pers.c b/drivers/staging/lustre/lustre/ptlrpc/pers.c
new file mode 100644 (file)
index 0000000..d926d2b
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+
+#include "ptlrpc_internal.h"
+
+
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+                        int mdidx)
+{
+       CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON);
+
+       LASSERT(mdidx < desc->bd_md_max_brw);
+       LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+       LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV |
+                                LNET_MD_PHYS)));
+
+       md->options |= LNET_MD_KIOV;
+       md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV);
+       md->length = min_t(unsigned int, LNET_MAX_IOV, md->length);
+       if (desc->bd_enc_iov)
+               md->start = &desc->bd_enc_iov[mdidx * LNET_MAX_IOV];
+       else
+               md->start = &desc->bd_iov[mdidx * LNET_MAX_IOV];
+}
+
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+                         int pageoffset, int len)
+{
+       lnet_kiov_t *kiov = &desc->bd_iov[desc->bd_iov_count];
+
+       kiov->kiov_page = page;
+       kiov->kiov_offset = pageoffset;
+       kiov->kiov_len = len;
+
+       desc->bd_iov_count++;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
new file mode 100644 (file)
index 0000000..ef5269a
--- /dev/null
@@ -0,0 +1,763 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pinger.c
+ *
+ * Portal-RPC reconnection and replay operations, for use in recovery.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+static int suppress_pings;
+CFS_MODULE_PARM(suppress_pings, "i", int, 0644, "Suppress pings");
+
+struct mutex pinger_mutex;
+static LIST_HEAD(pinger_imports);
+static struct list_head timeout_list = LIST_HEAD_INIT(timeout_list);
+
+int ptlrpc_pinger_suppress_pings(void)
+{
+       return suppress_pings;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings);
+
+struct ptlrpc_request *
+ptlrpc_prep_ping(struct obd_import *imp)
+{
+       struct ptlrpc_request *req;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+                                       LUSTRE_OBD_VERSION, OBD_PING);
+       if (req) {
+               ptlrpc_request_set_replen(req);
+               req->rq_no_resend = req->rq_no_delay = 1;
+       }
+       return req;
+}
+
+int ptlrpc_obd_ping(struct obd_device *obd)
+{
+       int rc;
+       struct ptlrpc_request *req;
+       ENTRY;
+
+       req = ptlrpc_prep_ping(obd->u.cli.cl_import);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req->rq_send_state = LUSTRE_IMP_FULL;
+
+       rc = ptlrpc_queue_wait(req);
+
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_obd_ping);
+
+int ptlrpc_ping(struct obd_import *imp)
+{
+       struct ptlrpc_request *req;
+       ENTRY;
+
+       req = ptlrpc_prep_ping(imp);
+       if (req == NULL) {
+               CERROR("OOM trying to ping %s->%s\n",
+                      imp->imp_obd->obd_uuid.uuid,
+                      obd2cli_tgt(imp->imp_obd));
+               RETURN(-ENOMEM);
+       }
+
+       DEBUG_REQ(D_INFO, req, "pinging %s->%s",
+                 imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
+       RETURN(0);
+}
+
+void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+{
+       int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
+       if (imp->imp_state == LUSTRE_IMP_DISCON) {
+               int dtime = max_t(int, CONNECTION_SWITCH_MIN,
+                                 AT_OFF ? 0 :
+                                 at_get(&imp->imp_at.iat_net_latency));
+               time = min(time, dtime);
+       }
+       imp->imp_next_ping = cfs_time_shift(time);
+}
+
+void ptlrpc_ping_import_soon(struct obd_import *imp)
+{
+       imp->imp_next_ping = cfs_time_current();
+}
+
+static inline int imp_is_deactive(struct obd_import *imp)
+{
+       return (imp->imp_deactive ||
+               OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE));
+}
+
+static inline int ptlrpc_next_reconnect(struct obd_import *imp)
+{
+       if (imp->imp_server_timeout)
+               return cfs_time_shift(obd_timeout / 2);
+       else
+               return cfs_time_shift(obd_timeout);
+}
+
+static atomic_t suspend_timeouts = ATOMIC_INIT(0);
+static cfs_time_t suspend_wakeup_time = 0;
+
+cfs_duration_t pinger_check_timeout(cfs_time_t time)
+{
+       struct timeout_item *item;
+       cfs_time_t timeout = PING_INTERVAL;
+
+       /* The timeout list is sorted in increasing timeout order */
+       mutex_lock(&pinger_mutex);
+       list_for_each_entry(item, &timeout_list, ti_chain) {
+               int ti_timeout = item->ti_timeout;
+               if (timeout > ti_timeout)
+                       timeout = ti_timeout;
+               break;
+       }
+       mutex_unlock(&pinger_mutex);
+
+       return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)),
+                                        cfs_time_current());
+}
+
+static wait_queue_head_t suspend_timeouts_waitq;
+
+cfs_time_t ptlrpc_suspend_wakeup_time(void)
+{
+       return suspend_wakeup_time;
+}
+
+void ptlrpc_deactivate_timeouts(struct obd_import *imp)
+{
+       /* XXX: disabled for now, will be replaced by adaptive timeouts */
+#if 0
+       if (imp->imp_no_timeout)
+               return;
+       imp->imp_no_timeout = 1;
+       atomic_inc(&suspend_timeouts);
+       CDEBUG(D_HA|D_WARNING, "deactivate timeouts %u\n",
+              atomic_read(&suspend_timeouts));
+#endif
+}
+
+void ptlrpc_activate_timeouts(struct obd_import *imp)
+{
+       /* XXX: disabled for now, will be replaced by adaptive timeouts */
+#if 0
+       if (!imp->imp_no_timeout)
+               return;
+       imp->imp_no_timeout = 0;
+       LASSERT(atomic_read(&suspend_timeouts) > 0);
+       if (atomic_dec_and_test(&suspend_timeouts)) {
+               suspend_wakeup_time = cfs_time_current();
+               wake_up(&suspend_timeouts_waitq);
+       }
+       CDEBUG(D_HA|D_WARNING, "activate timeouts %u\n",
+              atomic_read(&suspend_timeouts));
+#endif
+}
+
+int ptlrpc_check_suspend(void)
+{
+       if (atomic_read(&suspend_timeouts))
+               return 1;
+       return 0;
+}
+
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req)
+{
+       struct l_wait_info lwi;
+
+       if (atomic_read(&suspend_timeouts)) {
+               DEBUG_REQ(D_NET, req, "-- suspend %d regular timeout",
+                         atomic_read(&suspend_timeouts));
+               lwi = LWI_INTR(NULL, NULL);
+               l_wait_event(suspend_timeouts_waitq,
+                            atomic_read(&suspend_timeouts) == 0, &lwi);
+               DEBUG_REQ(D_NET, req, "-- recharge regular timeout");
+               return 1;
+       }
+       return 0;
+}
+
+
+static bool ir_up;
+
+void ptlrpc_pinger_ir_up(void)
+{
+       CDEBUG(D_HA, "IR up\n");
+       ir_up = true;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_up);
+
+void ptlrpc_pinger_ir_down(void)
+{
+       CDEBUG(D_HA, "IR down\n");
+       ir_up = false;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_down);
+
+static void ptlrpc_pinger_process_import(struct obd_import *imp,
+                                        unsigned long this_ping)
+{
+       int level;
+       int force;
+       int force_next;
+       int suppress;
+
+       spin_lock(&imp->imp_lock);
+
+       level = imp->imp_state;
+       force = imp->imp_force_verify;
+       force_next = imp->imp_force_next_verify;
+       /*
+        * This will be used below only if the import is "FULL".
+        */
+       suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS);
+
+       imp->imp_force_verify = 0;
+
+       if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) &&
+           !force) {
+               spin_unlock(&imp->imp_lock);
+               return;
+       }
+
+       imp->imp_force_next_verify = 0;
+
+       spin_unlock(&imp->imp_lock);
+
+       CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u "
+              "force %u force_next %u deactive %u pingable %u suppress %u\n",
+              imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+              ptlrpc_import_state_name(level), level, force, force_next,
+              imp->imp_deactive, imp->imp_pingable, suppress);
+
+       if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
+               /* wait for a while before trying recovery again */
+               imp->imp_next_ping = ptlrpc_next_reconnect(imp);
+               if (!imp->imp_no_pinger_recover)
+                       ptlrpc_initiate_recovery(imp);
+       } else if (level != LUSTRE_IMP_FULL ||
+                  imp->imp_obd->obd_no_recov ||
+                  imp_is_deactive(imp)) {
+               CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
+                      "or recovery disabled: %s)\n",
+                      imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+                      ptlrpc_import_state_name(level));
+       } else if ((imp->imp_pingable && !suppress) || force_next || force) {
+               ptlrpc_ping(imp);
+       }
+}
+
+static int ptlrpc_pinger_main(void *arg)
+{
+       struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+       ENTRY;
+
+       /* Record that the thread is running */
+       thread_set_flags(thread, SVC_RUNNING);
+       wake_up(&thread->t_ctl_waitq);
+
+       /* And now, loop forever, pinging as needed. */
+       while (1) {
+               cfs_time_t this_ping = cfs_time_current();
+               struct l_wait_info lwi;
+               cfs_duration_t time_to_next_wake;
+               struct timeout_item *item;
+               struct list_head *iter;
+
+               mutex_lock(&pinger_mutex);
+               list_for_each_entry(item, &timeout_list, ti_chain) {
+                       item->ti_cb(item, item->ti_cb_data);
+               }
+               list_for_each(iter, &pinger_imports) {
+                       struct obd_import *imp =
+                               list_entry(iter, struct obd_import,
+                                              imp_pinger_chain);
+
+                       ptlrpc_pinger_process_import(imp, this_ping);
+                       /* obd_timeout might have changed */
+                       if (imp->imp_pingable && imp->imp_next_ping &&
+                           cfs_time_after(imp->imp_next_ping,
+                                          cfs_time_add(this_ping,
+                                                       cfs_time_seconds(PING_INTERVAL))))
+                               ptlrpc_update_next_ping(imp, 0);
+               }
+               mutex_unlock(&pinger_mutex);
+               /* update memory usage info */
+               obd_update_maxusage();
+
+               /* Wait until the next ping time, or until we're stopped. */
+               time_to_next_wake = pinger_check_timeout(this_ping);
+               /* The ping sent by ptlrpc_send_rpc may get sent out
+                  say .01 second after this.
+                  ptlrpc_pinger_sending_on_import will then set the
+                  next ping time to next_ping + .01 sec, which means
+                  we will SKIP the next ping at next_ping, and the
+                  ping will get sent 2 timeouts from now!  Beware. */
+               CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" ("
+                      CFS_TIME_T")\n", time_to_next_wake,
+                      cfs_time_add(this_ping,cfs_time_seconds(PING_INTERVAL)));
+               if (time_to_next_wake > 0) {
+                       lwi = LWI_TIMEOUT(max_t(cfs_duration_t,
+                                               time_to_next_wake,
+                                               cfs_time_seconds(1)),
+                                         NULL, NULL);
+                       l_wait_event(thread->t_ctl_waitq,
+                                    thread_is_stopping(thread) ||
+                                    thread_is_event(thread),
+                                    &lwi);
+                       if (thread_test_and_clear_flags(thread, SVC_STOPPING)) {
+                               EXIT;
+                               break;
+                       } else {
+                               /* woken after adding import to reset timer */
+                               thread_test_and_clear_flags(thread, SVC_EVENT);
+                       }
+               }
+       }
+
+       thread_set_flags(thread, SVC_STOPPED);
+       wake_up(&thread->t_ctl_waitq);
+
+       CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid());
+       return 0;
+}
+
+static struct ptlrpc_thread *pinger_thread = NULL;
+
+int ptlrpc_start_pinger(void)
+{
+       struct l_wait_info lwi = { 0 };
+       int rc;
+       ENTRY;
+
+       if (pinger_thread != NULL)
+               RETURN(-EALREADY);
+
+       OBD_ALLOC_PTR(pinger_thread);
+       if (pinger_thread == NULL)
+               RETURN(-ENOMEM);
+       init_waitqueue_head(&pinger_thread->t_ctl_waitq);
+       init_waitqueue_head(&suspend_timeouts_waitq);
+
+       strcpy(pinger_thread->t_name, "ll_ping");
+
+       /* The thread is created with kthread_run(), so the old CLONE_VM and
+        * CLONE_FILES / cfs_daemonize_ctxt() handling is no longer needed. */
+       rc = PTR_ERR(kthread_run(ptlrpc_pinger_main,
+                                pinger_thread, pinger_thread->t_name));
+       if (IS_ERR_VALUE(rc)) {
+               CERROR("cannot start thread: %d\n", rc);
+               OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+               pinger_thread = NULL;
+               RETURN(rc);
+       }
+       l_wait_event(pinger_thread->t_ctl_waitq,
+                    thread_is_running(pinger_thread), &lwi);
+
+       if (suppress_pings)
+               CWARN("Pings will be suppressed at the request of the "
+                     "administrator.  The configuration shall meet the "
+                     "additional requirements described in the manual.  "
+                     "(Search for the \"suppress_pings\" kernel module "
+                     "parameter.)\n");
+
+       RETURN(0);
+}
+
+int ptlrpc_pinger_remove_timeouts(void);
+
+int ptlrpc_stop_pinger(void)
+{
+       struct l_wait_info lwi = { 0 };
+       int rc = 0;
+       ENTRY;
+
+       if (pinger_thread == NULL)
+               RETURN(-EALREADY);
+
+       ptlrpc_pinger_remove_timeouts();
+       mutex_lock(&pinger_mutex);
+       thread_set_flags(pinger_thread, SVC_STOPPING);
+       wake_up(&pinger_thread->t_ctl_waitq);
+       mutex_unlock(&pinger_mutex);
+
+       l_wait_event(pinger_thread->t_ctl_waitq,
+                    thread_is_stopped(pinger_thread), &lwi);
+
+       OBD_FREE_PTR(pinger_thread);
+       pinger_thread = NULL;
+       RETURN(rc);
+}
+
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
+{
+       ptlrpc_update_next_ping(imp, 0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import);
+
+void ptlrpc_pinger_commit_expected(struct obd_import *imp)
+{
+       ptlrpc_update_next_ping(imp, 1);
+       LASSERT(spin_is_locked(&imp->imp_lock));
+       /*
+        * Avoid reading stale imp_connect_data.  When not sure if pings are
+        * expected or not on next connection, we assume they are not and force
+        * one anyway to guarantee the chance of updating
+        * imp_peer_committed_transno.
+        */
+       if (imp->imp_state != LUSTRE_IMP_FULL ||
+           OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS))
+               imp->imp_force_next_verify = 1;
+}
+
+int ptlrpc_pinger_add_import(struct obd_import *imp)
+{
+       ENTRY;
+       if (!list_empty(&imp->imp_pinger_chain))
+               RETURN(-EALREADY);
+
+       mutex_lock(&pinger_mutex);
+       CDEBUG(D_HA, "adding pingable import %s->%s\n",
+              imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+       /* if we add to pinger we want recovery on this import */
+       imp->imp_obd->obd_no_recov = 0;
+       ptlrpc_update_next_ping(imp, 0);
+       /* XXX sort, blah blah */
+       list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
+       class_import_get(imp);
+
+       ptlrpc_pinger_wake_up();
+       mutex_unlock(&pinger_mutex);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_add_import);
+
+int ptlrpc_pinger_del_import(struct obd_import *imp)
+{
+       ENTRY;
+       if (list_empty(&imp->imp_pinger_chain))
+               RETURN(-ENOENT);
+
+       mutex_lock(&pinger_mutex);
+       list_del_init(&imp->imp_pinger_chain);
+       CDEBUG(D_HA, "removing pingable import %s->%s\n",
+              imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+       /* if we remove from pinger we don't want recovery on this import */
+       imp->imp_obd->obd_no_recov = 1;
+       class_import_put(imp);
+       mutex_unlock(&pinger_mutex);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_del_import);
+
+/**
+ * Register a timeout callback on the pinger list; the callback will
+ * be called when the timeout expires.
+ */
+struct timeout_item *ptlrpc_new_timeout(int time, enum timeout_event event,
+                                       timeout_cb_t cb, void *data)
+{
+       struct timeout_item *ti;
+
+       OBD_ALLOC_PTR(ti);
+       if (!ti)
+               return NULL;
+
+       INIT_LIST_HEAD(&ti->ti_obd_list);
+       INIT_LIST_HEAD(&ti->ti_chain);
+       ti->ti_timeout = time;
+       ti->ti_event = event;
+       ti->ti_cb = cb;
+       ti->ti_cb_data = data;
+
+       return ti;
+}
+
+/**
+ * Register a timeout event with the pinger thread.
+ * Note: the timeout list is sorted by increasing timeout value.
+ */
+static struct timeout_item *
+ptlrpc_pinger_register_timeout(int time, enum timeout_event event,
+                              timeout_cb_t cb, void *data)
+{
+       struct timeout_item *item, *tmp;
+
+       LASSERT(mutex_is_locked(&pinger_mutex));
+
+       list_for_each_entry(item, &timeout_list, ti_chain)
+               if (item->ti_event == event)
+                       goto out;
+
+       item = ptlrpc_new_timeout(time, event, cb, data);
+       if (item) {
+               list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) {
+                       if (tmp->ti_timeout < time) {
+                               list_add(&item->ti_chain, &tmp->ti_chain);
+                               goto out;
+                       }
+               }
+               list_add(&item->ti_chain, &timeout_list);
+       }
+out:
+       return item;
+}
+
+/* Add a client_obd to the timeout event list; when the timeout (@time)
+ * expires, the callback (@cb) will be called.
+ */
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+                             timeout_cb_t cb, void *data,
+                             struct list_head *obd_list)
+{
+       struct timeout_item *ti;
+
+       mutex_lock(&pinger_mutex);
+       ti = ptlrpc_pinger_register_timeout(time, event, cb, data);
+       if (!ti) {
+               mutex_unlock(&pinger_mutex);
+               return -EINVAL;
+       }
+       list_add(obd_list, &ti->ti_obd_list);
+       mutex_unlock(&pinger_mutex);
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_add_timeout_client);
+
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+                             enum timeout_event event)
+{
+       struct timeout_item *ti = NULL, *item;
+
+       if (list_empty(obd_list))
+               return 0;
+       mutex_lock(&pinger_mutex);
+       list_del_init(obd_list);
+       /*
+        * If there are no obds attached to the timeout event
+        * list, remove this timeout event from the pinger.
+        */
+       list_for_each_entry(item, &timeout_list, ti_chain) {
+               if (item->ti_event == event) {
+                       ti = item;
+                       break;
+               }
+       }
+       LASSERTF(ti != NULL, "ti is NULL!\n");
+       if (list_empty(&ti->ti_obd_list)) {
+               list_del(&ti->ti_chain);
+               OBD_FREE_PTR(ti);
+       }
+       mutex_unlock(&pinger_mutex);
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_del_timeout_client);
+
+int ptlrpc_pinger_remove_timeouts(void)
+{
+       struct timeout_item *item, *tmp;
+
+       mutex_lock(&pinger_mutex);
+       list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) {
+               LASSERT(list_empty(&item->ti_obd_list));
+               list_del(&item->ti_chain);
+               OBD_FREE_PTR(item);
+       }
+       mutex_unlock(&pinger_mutex);
+       return 0;
+}
+
+void ptlrpc_pinger_wake_up(void)
+{
+       thread_add_flags(pinger_thread, SVC_EVENT);
+       wake_up(&pinger_thread->t_ctl_waitq);
+}
+
+/* Ping evictor thread */
+#define PET_READY     1
+#define PET_TERMINATE 2
+
+static int            pet_refcount = 0;
+static int            pet_state;
+static wait_queue_head_t       pet_waitq;
+LIST_HEAD(pet_list);
+static DEFINE_SPINLOCK(pet_lock);
+
+int ping_evictor_wake(struct obd_export *exp)
+{
+       struct obd_device *obd;
+
+       spin_lock(&pet_lock);
+       if (pet_state != PET_READY) {
+               /* eventually the new obd will call here again. */
+               spin_unlock(&pet_lock);
+               return 1;
+       }
+
+       obd = class_exp2obd(exp);
+       if (list_empty(&obd->obd_evict_list)) {
+               class_incref(obd, "evictor", obd);
+               list_add(&obd->obd_evict_list, &pet_list);
+       }
+       spin_unlock(&pet_lock);
+
+       wake_up(&pet_waitq);
+       return 0;
+}
+
+static int ping_evictor_main(void *arg)
+{
+       struct obd_device *obd;
+       struct obd_export *exp;
+       struct l_wait_info lwi = { 0 };
+       time_t expire_time;
+       ENTRY;
+
+       unshare_fs_struct();
+
+       CDEBUG(D_HA, "Starting Ping Evictor\n");
+       pet_state = PET_READY;
+       while (1) {
+               l_wait_event(pet_waitq, (!list_empty(&pet_list)) ||
+                            (pet_state == PET_TERMINATE), &lwi);
+
+               /* loop until all obds have been removed */
+               if ((pet_state == PET_TERMINATE) && list_empty(&pet_list))
+                       break;
+
+               /* we only get here if pet_list is non-empty, and the end of
+                * this loop is the only place that removes entries from it, so
+                * the lock is not strictly necessary. */
+               spin_lock(&pet_lock);
+               obd = list_entry(pet_list.next, struct obd_device,
+                                    obd_evict_list);
+               spin_unlock(&pet_lock);
+
+               expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT;
+
+               CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
+                      obd->obd_name, expire_time);
+
+               /* Exports can't be deleted out of the list while we hold
+                * the obd lock (class_unlink_export), which means we can't
+                * lose the last ref on the export.  If they've already been
+                * removed from the list, we won't find them here. */
+               spin_lock(&obd->obd_dev_lock);
+               while (!list_empty(&obd->obd_exports_timed)) {
+                       exp = list_entry(obd->obd_exports_timed.next,
+                                            struct obd_export,
+                                            exp_obd_chain_timed);
+                       if (expire_time > exp->exp_last_request_time) {
+                               class_export_get(exp);
+                               spin_unlock(&obd->obd_dev_lock);
+                               LCONSOLE_WARN("%s: haven't heard from client %s"
+                                             " (at %s) in %ld seconds. I think"
+                                             " it's dead, and I am evicting"
+                                             " it. exp %p, cur %ld expire %ld"
+                                             " last %ld\n",
+                                             obd->obd_name,
+                                             obd_uuid2str(&exp->exp_client_uuid),
+                                             obd_export_nid2str(exp),
+                                             (long)(cfs_time_current_sec() -
+                                                    exp->exp_last_request_time),
+                                             exp, (long)cfs_time_current_sec(),
+                                             (long)expire_time,
+                                             (long)exp->exp_last_request_time);
+                               CDEBUG(D_HA, "Last request was at %ld\n",
+                                      exp->exp_last_request_time);
+                               class_fail_export(exp);
+                               class_export_put(exp);
+                               spin_lock(&obd->obd_dev_lock);
+                       } else {
+                               /* List is sorted, so everyone below is ok */
+                               break;
+                       }
+               }
+               spin_unlock(&obd->obd_dev_lock);
+
+               spin_lock(&pet_lock);
+               list_del_init(&obd->obd_evict_list);
+               spin_unlock(&pet_lock);
+
+               class_decref(obd, "evictor", obd);
+       }
+       CDEBUG(D_HA, "Exiting Ping Evictor\n");
+
+       RETURN(0);
+}
+
+void ping_evictor_start(void)
+{
+       task_t *task;
+
+       if (++pet_refcount > 1)
+               return;
+
+       init_waitqueue_head(&pet_waitq);
+
+       task = kthread_run(ping_evictor_main, NULL, "ll_evictor");
+       if (IS_ERR(task)) {
+               pet_refcount--;
+               CERROR("Cannot start ping evictor thread: %ld\n",
+                       PTR_ERR(task));
+       }
+}
+EXPORT_SYMBOL(ping_evictor_start);
+
+void ping_evictor_stop(void)
+{
+       if (--pet_refcount > 0)
+               return;
+
+       pet_state = PET_TERMINATE;
+       wake_up(&pet_waitq);
+}
+EXPORT_SYMBOL(ping_evictor_stop);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h
new file mode 100644 (file)
index 0000000..9ba7600
--- /dev/null
@@ -0,0 +1,303 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/* Intramodule declarations for ptlrpc. */
+
+#ifndef PTLRPC_INTERNAL_H
+#define PTLRPC_INTERNAL_H
+
+#include "../ldlm/ldlm_internal.h"
+
+struct ldlm_namespace;
+struct obd_import;
+struct ldlm_res_id;
+struct ptlrpc_request_set;
+extern int test_req_buffer_pressure;
+extern struct mutex ptlrpc_all_services_mutex;
+
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait);
+/* ptlrpcd.c */
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc);
+
+/* client.c */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+                                        unsigned type, unsigned portal);
+void ptlrpc_init_xid(void);
+
+/* events.c */
+int ptlrpc_init_portals(void);
+void ptlrpc_exit_portals(void);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
+void lustre_assert_wire_constants(void);
+int ptlrpc_import_in_recovery(struct obd_import *imp);
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt);
+void ptlrpc_handle_failed_import(struct obd_import *imp);
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
+void ptlrpc_initiate_recovery(struct obd_import *imp);
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset);
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset);
+
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry,
+                                    struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount);
+void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req,
+                                    long q_usec, long work_usec);
+#else
+#define ptlrpc_lprocfs_register_service(params...) do{}while(0)
+#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0)
+#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0)
+#define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0)
+#endif /* LPROCFS */
+
+/* NRS */
+
+/**
+ * NRS core object.
+ *
+ * Holds NRS core fields.
+ */
+struct nrs_core {
+       /**
+        * Protects nrs_core::nrs_policies, serializes external policy
+        * registration/unregistration, and NRS core lprocfs operations.
+        */
+       struct mutex nrs_mutex;
+       /* XXX: This is just for liblustre. Remove the #if defined directive
+        * when the "cfs_" prefix is dropped from cfs_list_head. */
+       /**
+        * List of all policy descriptors registered with NRS core; protected
+        * by nrs_core::nrs_mutex.
+        */
+       struct list_head nrs_policies;
+
+};
+
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc);
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc);
+
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+                              struct ptlrpc_request *req, bool hp);
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+                       struct ptlrpc_request *req, bool hp);
+
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+                          bool peek, bool force);
+
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp,
+                         bool force)
+{
+       return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force);
+}
+
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+       return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false);
+}
+
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req);
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp);
+
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+                             enum ptlrpc_nrs_queue_type queue, char *name,
+                             enum ptlrpc_nrs_ctl opc, bool single, void *arg);
+
+int ptlrpc_nrs_init(void);
+void ptlrpc_nrs_fini(void);
+
+static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt)
+{
+       return svcpt->scp_nrs_hp != NULL;
+}
+
+static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc)
+{
+       /**
+        * If the first service partition has an HP NRS head, all service
+        * partitions will.
+        */
+       return nrs_svcpt_has_hp(svc->srv_parts[0]);
+}
+
+static inline
+struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp)
+{
+       LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt)));
+       return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg;
+}
+
+static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy)
+{
+       return policy->pol_nrs->nrs_svcpt->scp_cpt;
+}
+
+static inline
+struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy)
+{
+       return policy->pol_nrs->nrs_svcpt->scp_service;
+}
+
+static inline
+struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy)
+{
+       return policy->pol_nrs->nrs_svcpt;
+}
+
+static inline
+struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy)
+{
+       return nrs_pol2svc(policy)->srv_cptable;
+}
+
+static inline struct ptlrpc_nrs_resource *
+nrs_request_resource(struct ptlrpc_nrs_request *nrq)
+{
+       LASSERT(nrq->nr_initialized);
+       LASSERT(!nrq->nr_finalized);
+
+       return nrq->nr_res_ptrs[nrq->nr_res_idx];
+}
+
+static inline
+struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq)
+{
+       return nrs_request_resource(nrq)->res_policy;
+}
+
+#define NRS_LPROCFS_QUANTUM_NAME_REG   "reg_quantum:"
+#define NRS_LPROCFS_QUANTUM_NAME_HP    "hp_quantum:"
+
+/**
+ * The maximum value of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum.
+ */
+#define LPROCFS_NRS_QUANTUM_MAX                65535
+
+/**
+ * Max valid command string is the size of the labels, plus "65535" twice, plus
+ * a separating space character.
+ */
+#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD                                        \
+ sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " "  \
+       NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX))
+
+/* recovd_thread.c */
+
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink);
+
+/* pers.c */
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+                        int mdcnt);
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+                         int pageoffset, int len);
+
+/* pack_generic.c */
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt);
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs);
+
+/* pinger.c */
+int ptlrpc_start_pinger(void);
+int ptlrpc_stop_pinger(void);
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
+void ptlrpc_pinger_commit_expected(struct obd_import *imp);
+void ptlrpc_pinger_wake_up(void);
+void ptlrpc_ping_import_soon(struct obd_import *imp);
+int ping_evictor_wake(struct obd_export *exp);
+
+/* sec_null.c */
+int  sptlrpc_null_init(void);
+void sptlrpc_null_fini(void);
+
+/* sec_plain.c */
+int  sptlrpc_plain_init(void);
+void sptlrpc_plain_fini(void);
+
+/* sec_bulk.c */
+int  sptlrpc_enc_pool_init(void);
+void sptlrpc_enc_pool_fini(void);
+int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count,
+                              int *eof, void *data);
+
+/* sec_lproc.c */
+int  sptlrpc_lproc_init(void);
+void sptlrpc_lproc_fini(void);
+
+/* sec_gc.c */
+int sptlrpc_gc_init(void);
+void sptlrpc_gc_fini(void);
+
+/* sec_config.c */
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+                               enum lustre_sec_part to,
+                               struct obd_uuid *target,
+                               lnet_nid_t nid,
+                               struct sptlrpc_flavor *sf);
+int  sptlrpc_conf_init(void);
+void sptlrpc_conf_fini(void);
+
+/* sec.c */
+int  sptlrpc_init(void);
+void sptlrpc_fini(void);
+
+static inline int ll_rpc_recoverable_error(int rc)
+{
+       return (rc == -ENOTCONN || rc == -ENODEV);
+}
+
+static inline int tgt_mod_init(void)
+{
+       return 0;
+}
+
+static inline void tgt_mod_exit(void)
+{
+       return;
+}
+
+static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set)
+{
+       if (atomic_dec_and_test(&set->set_refcount))
+               OBD_FREE_PTR(set);
+}
+#endif /* PTLRPC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c
new file mode 100644 (file)
index 0000000..f6ea80f
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+extern spinlock_t ptlrpc_last_xid_lock;
+#if RS_DEBUG
+extern spinlock_t ptlrpc_rs_debug_lock;
+#endif
+extern struct mutex pinger_mutex;
+extern struct mutex ptlrpcd_mutex;
+
+__init int ptlrpc_init(void)
+{
+       int rc, cleanup_phase = 0;
+       ENTRY;
+
+       lustre_assert_wire_constants();
+#if RS_DEBUG
+       spin_lock_init(&ptlrpc_rs_debug_lock);
+#endif
+       mutex_init(&ptlrpc_all_services_mutex);
+       mutex_init(&pinger_mutex);
+       mutex_init(&ptlrpcd_mutex);
+       ptlrpc_init_xid();
+
+       rc = req_layout_init();
+       if (rc)
+               RETURN(rc);
+
+       rc = ptlrpc_hr_init();
+       if (rc)
+               RETURN(rc);
+
+       cleanup_phase = 1;
+
+       rc = ptlrpc_init_portals();
+       if (rc)
+               GOTO(cleanup, rc);
+       cleanup_phase = 2;
+
+       rc = ptlrpc_connection_init();
+       if (rc)
+               GOTO(cleanup, rc);
+       cleanup_phase = 3;
+
+       ptlrpc_put_connection_superhack = ptlrpc_connection_put;
+
+       rc = ptlrpc_start_pinger();
+       if (rc)
+               GOTO(cleanup, rc);
+       cleanup_phase = 4;
+
+       rc = ldlm_init();
+       if (rc)
+               GOTO(cleanup, rc);
+       cleanup_phase = 5;
+
+       rc = sptlrpc_init();
+       if (rc)
+               GOTO(cleanup, rc);
+
+       cleanup_phase = 7;
+       rc = ptlrpc_nrs_init();
+       if (rc)
+               GOTO(cleanup, rc);
+
+       cleanup_phase = 8;
+       rc = tgt_mod_init();
+       if (rc)
+               GOTO(cleanup, rc);
+       RETURN(0);
+
+cleanup:
+       switch (cleanup_phase) {
+       case 8:
+               ptlrpc_nrs_fini();
+       case 7:
+               sptlrpc_fini();
+       case 5:
+               ldlm_exit();
+       case 4:
+               ptlrpc_stop_pinger();
+       case 3:
+               ptlrpc_connection_fini();
+       case 2:
+               ptlrpc_exit_portals();
+       case 1:
+               ptlrpc_hr_fini();
+               req_layout_fini();
+       default: ;
+       }
+
+       return rc;
+}
+
+static void __exit ptlrpc_exit(void)
+{
+       tgt_mod_exit();
+       ptlrpc_nrs_fini();
+       sptlrpc_fini();
+       ldlm_exit();
+       ptlrpc_stop_pinger();
+       ptlrpc_exit_portals();
+       ptlrpc_hr_fini();
+       ptlrpc_connection_fini();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Request Processor and Lock Management");
+MODULE_LICENSE("GPL");
+
+cfs_module(ptlrpc, "1.0.0", ptlrpc_init, ptlrpc_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c
new file mode 100644 (file)
index 0000000..185841f
--- /dev/null
@@ -0,0 +1,827 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/ptlrpcd.c
+ */
+
+/** \defgroup ptlrpcd PortalRPC daemon
+ *
+ * ptlrpcd is a special thread with its own set where other users may add
+ * requests when they don't want to wait for their completion.
+ * PtlRPCD will take care of sending such requests and then processing their
+ * replies and calling completion callbacks as necessary.
+ * The callbacks are called directly from ptlrpcd context.
+ * It is important never to block significantly (esp. on RPCs!) within such
+ * a completion handler, or a deadlock might occur where ptlrpcd enters some
+ * callback that attempts to send another RPC and wait for it to return,
+ * during which time ptlrpcd is completely blocked, so e.g. if import
+ * fails, recovery cannot progress because connection requests are also
+ * sent by ptlrpcd.
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_net.h>
+# include <lustre_lib.h>
+
+#include <lustre_ha.h>
+#include <obd_class.h>   /* for obd_zombie */
+#include <obd_support.h> /* for OBD_FAIL_CHECK */
+#include <cl_object.h> /* cl_env_{get,put}() */
+#include <lprocfs_status.h>
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpcd {
+       int             pd_size;
+       int             pd_index;
+       int             pd_nthreads;
+       struct ptlrpcd_ctl pd_thread_rcv;
+       struct ptlrpcd_ctl pd_threads[0];
+};
+
+static int max_ptlrpcds;
+CFS_MODULE_PARM(max_ptlrpcds, "i", int, 0644,
+               "Max ptlrpcd thread count to be started.");
+
+static int ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+CFS_MODULE_PARM(ptlrpcd_bind_policy, "i", int, 0644,
+               "Ptlrpcd threads binding mode.");
+static struct ptlrpcd *ptlrpcds;
+
+struct mutex ptlrpcd_mutex;
+static int ptlrpcd_users = 0;
+
+void ptlrpcd_wake(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request_set *rq_set = req->rq_set;
+
+       LASSERT(rq_set != NULL);
+
+       wake_up(&rq_set->set_waitq);
+}
+EXPORT_SYMBOL(ptlrpcd_wake);
+
+static struct ptlrpcd_ctl *
+ptlrpcd_select_pc(struct ptlrpc_request *req, pdl_policy_t policy, int index)
+{
+       int idx = 0;
+
+       if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL)
+               return &ptlrpcds->pd_thread_rcv;
+
+       switch (policy) {
+       case PDL_POLICY_SAME:
+               idx = smp_processor_id() % ptlrpcds->pd_nthreads;
+               break;
+       case PDL_POLICY_LOCAL:
+               /* Until the CPU partition patches are available, process it
+                * the same way as "PDL_POLICY_ROUND". */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix this code to use new CPU partition APIs"
+# endif
+               /* Fall through to PDL_POLICY_ROUND until the CPU
+                * partition patches are available. */
+               index = -1;
+       case PDL_POLICY_PREFERRED:
+               if (index >= 0 && index < num_online_cpus()) {
+                       idx = index % ptlrpcds->pd_nthreads;
+                       break;
+               }
+               /* Fall through to PDL_POLICY_ROUND for bad index. */
+       default:
+               /* Fall through to PDL_POLICY_ROUND for unknown policy. */
+       case PDL_POLICY_ROUND:
+               /* We do not require strict load balancing here. */
+               idx = ptlrpcds->pd_index + 1;
+               if (idx == smp_processor_id())
+                       idx++;
+               idx %= ptlrpcds->pd_nthreads;
+               ptlrpcds->pd_index = idx;
+               break;
+       }
+
+       return &ptlrpcds->pd_threads[idx];
+}
+
+/**
+ * Move all request from an existing request set to the ptlrpcd queue.
+ * All requests from the set must be in phase RQ_PHASE_NEW.
+ */
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp, *pos;
+       struct ptlrpcd_ctl *pc;
+       struct ptlrpc_request_set *new;
+       int count, i;
+
+       pc = ptlrpcd_select_pc(NULL, PDL_POLICY_LOCAL, -1);
+       new = pc->pc_set;
+
+       list_for_each_safe(pos, tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(pos, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               LASSERT(req->rq_phase == RQ_PHASE_NEW);
+               req->rq_set = new;
+               req->rq_queued_time = cfs_time_current();
+       }
+
+       spin_lock(&new->set_new_req_lock);
+       list_splice_init(&set->set_requests, &new->set_new_requests);
+       i = atomic_read(&set->set_remaining);
+       count = atomic_add_return(i, &new->set_new_count);
+       atomic_set(&set->set_remaining, 0);
+       spin_unlock(&new->set_new_req_lock);
+       if (count == i) {
+               wake_up(&new->set_waitq);
+
+               /* XXX: It may be unnecessary to wake up all the partners, but
+                *      to guarantee the async RPCs are processed ASAP we have
+                *      no better choice.  This may be fixed in the future. */
+               for (i = 0; i < pc->pc_npartners; i++)
+                       wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+       }
+}
+EXPORT_SYMBOL(ptlrpcd_add_rqset);
+
+/**
+ * Return transferred RPCs count.
+ */
+static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des,
+                              struct ptlrpc_request_set *src)
+{
+       struct list_head *tmp, *pos;
+       struct ptlrpc_request *req;
+       int rc = 0;
+
+       spin_lock(&src->set_new_req_lock);
+       if (likely(!list_empty(&src->set_new_requests))) {
+               list_for_each_safe(pos, tmp, &src->set_new_requests) {
+                       req = list_entry(pos, struct ptlrpc_request,
+                                            rq_set_chain);
+                       req->rq_set = des;
+               }
+               list_splice_init(&src->set_new_requests,
+                                    &des->set_requests);
+               rc = atomic_read(&src->set_new_count);
+               atomic_add(rc, &des->set_remaining);
+               atomic_set(&src->set_new_count, 0);
+       }
+       spin_unlock(&src->set_new_req_lock);
+       return rc;
+}
+
+/**
+ * Requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set().
+ */
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx)
+{
+       struct ptlrpcd_ctl *pc;
+
+       if (req->rq_reqmsg)
+               lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+       spin_lock(&req->rq_lock);
+       if (req->rq_invalid_rqset) {
+               struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5),
+                                                    back_to_sleep, NULL);
+
+               req->rq_invalid_rqset = 0;
+               spin_unlock(&req->rq_lock);
+               l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi);
+       } else if (req->rq_set) {
+               /* If we have a valid "rq_set", just reuse it to avoid double
+                * linking. */
+               LASSERT(req->rq_phase == RQ_PHASE_NEW);
+               LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY);
+
+               /* ptlrpc_check_set will decrease the count */
+               atomic_inc(&req->rq_set->set_remaining);
+               spin_unlock(&req->rq_lock);
+               wake_up(&req->rq_set->set_waitq);
+               return;
+       } else {
+               spin_unlock(&req->rq_lock);
+       }
+
+       pc = ptlrpcd_select_pc(req, policy, idx);
+
+       DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]",
+                 req, pc->pc_name, pc->pc_index);
+
+       ptlrpc_set_add_new_req(pc, req);
+}
+EXPORT_SYMBOL(ptlrpcd_add_req);
+
+static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set)
+{
+       atomic_inc(&set->set_refcount);
+}
+
+/**
+ * Check if there is more work to do on ptlrpcd set.
+ * Returns 1 if yes.
+ */
+static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc)
+{
+       struct list_head *tmp, *pos;
+       struct ptlrpc_request *req;
+       struct ptlrpc_request_set *set = pc->pc_set;
+       int rc = 0;
+       int rc2;
+       ENTRY;
+
+       if (atomic_read(&set->set_new_count)) {
+               spin_lock(&set->set_new_req_lock);
+               if (likely(!list_empty(&set->set_new_requests))) {
+                       list_splice_init(&set->set_new_requests,
+                                            &set->set_requests);
+                       atomic_add(atomic_read(&set->set_new_count),
+                                      &set->set_remaining);
+                       atomic_set(&set->set_new_count, 0);
+                       /*
+                        * Need to calculate its timeout.
+                        */
+                       rc = 1;
+               }
+               spin_unlock(&set->set_new_req_lock);
+       }
+
+       /* We should call lu_env_refill() before handling new requests to make
+        * sure the env keys that the requests depend on really exist.
+        */
+       rc2 = lu_env_refill(env);
+       if (rc2 != 0) {
+               /*
+                * XXX This is very awkward situation, because
+                * execution can neither continue (request
+                * interpreters assume that env is set up), nor repeat
+                * the loop (as this potentially results in a tight
+                * loop of -ENOMEM's).
+                *
+                * Fortunately, refill only ever does something when
+                * new modules are loaded, i.e., early during boot up.
+                */
+               CERROR("Failure to refill session: %d\n", rc2);
+               RETURN(rc);
+       }
+
+       if (atomic_read(&set->set_remaining))
+               rc |= ptlrpc_check_set(env, set);
+
+       if (!list_empty(&set->set_requests)) {
+               /*
+                * XXX: our set never completes, so we prune the completed
+                * reqs after each iteration. boy could this be smarter.
+                */
+               list_for_each_safe(pos, tmp, &set->set_requests) {
+                       req = list_entry(pos, struct ptlrpc_request,
+                                            rq_set_chain);
+                       if (req->rq_phase != RQ_PHASE_COMPLETE)
+                               continue;
+
+                       list_del_init(&req->rq_set_chain);
+                       req->rq_set = NULL;
+                       ptlrpc_req_finished(req);
+               }
+       }
+
+       if (rc == 0) {
+               /*
+                * If new requests have been added, make sure to wake up.
+                */
+               rc = atomic_read(&set->set_new_count);
+
+               /* If we have nothing to do, check whether we can take some
+                * work from our partner threads. */
+               if (rc == 0 && pc->pc_npartners > 0) {
+                       struct ptlrpcd_ctl *partner;
+                       struct ptlrpc_request_set *ps;
+                       int first = pc->pc_cursor;
+
+                       do {
+                               partner = pc->pc_partners[pc->pc_cursor++];
+                               if (pc->pc_cursor >= pc->pc_npartners)
+                                       pc->pc_cursor = 0;
+                               if (partner == NULL)
+                                       continue;
+
+                               spin_lock(&partner->pc_lock);
+                               ps = partner->pc_set;
+                               if (ps == NULL) {
+                                       spin_unlock(&partner->pc_lock);
+                                       continue;
+                               }
+
+                               ptlrpc_reqset_get(ps);
+                               spin_unlock(&partner->pc_lock);
+
+                               if (atomic_read(&ps->set_new_count)) {
+                                       rc = ptlrpcd_steal_rqset(set, ps);
+                                       if (rc > 0)
+                                               CDEBUG(D_RPCTRACE, "transfer %d"
+                                                      " async RPCs [%d->%d]\n",
+                                                       rc, partner->pc_index,
+                                                       pc->pc_index);
+                               }
+                               ptlrpc_reqset_put(ps);
+                       } while (rc == 0 && pc->pc_cursor != first);
+               }
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Main ptlrpcd thread.
+ * ptlrpc's code paths like to execute in process context, so we have this
+ * thread which spins on a set containing the RPCs and sends them.
+ */
+static int ptlrpcd(void *arg)
+{
+       struct ptlrpcd_ctl *pc = arg;
+       struct ptlrpc_request_set *set = pc->pc_set;
+       struct lu_env env = { .le_ses = NULL };
+       int rc, exit = 0;
+       ENTRY;
+
+       unshare_fs_struct();
+#if defined(CONFIG_SMP)
+       if (test_bit(LIOD_BIND, &pc->pc_flags)) {
+               int index = pc->pc_index;
+
+               if (index >= 0 && index < num_possible_cpus()) {
+                       while (!cpu_online(index)) {
+                               if (++index >= num_possible_cpus())
+                                       index = 0;
+                       }
+                       cfs_set_cpus_allowed(current,
+                                    *cpumask_of_node(cpu_to_node(index)));
+               }
+       }
+#endif
+       /*
+        * XXX So far only "client" ptlrpcd uses an environment. In
+        * the future, a ptlrpcd thread (or a thread-set) has to be given
+        * an argument, describing its "scope".
+        */
+       rc = lu_context_init(&env.le_ctx,
+                            LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF);
+       complete(&pc->pc_starting);
+
+       if (rc != 0)
+               RETURN(rc);
+
+       /*
+        * This mainloop strongly resembles ptlrpc_set_wait() except that our
+        * set never completes.  ptlrpcd_check() calls ptlrpc_check_set() when
+        * there are requests in the set. New requests come in on the set's
+        * new_req_list and ptlrpcd_check() moves them into the set.
+        */
+       do {
+               struct l_wait_info lwi;
+               int timeout;
+
+               timeout = ptlrpc_set_next_timeout(set);
+               lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
+                                 ptlrpc_expired_set, set);
+
+               lu_context_enter(&env.le_ctx);
+               l_wait_event(set->set_waitq,
+                            ptlrpcd_check(&env, pc), &lwi);
+               lu_context_exit(&env.le_ctx);
+
+               /*
+                * Abort in-flight RPCs in the forced-stop case.
+                */
+               if (test_bit(LIOD_STOP, &pc->pc_flags)) {
+                       if (test_bit(LIOD_FORCE, &pc->pc_flags))
+                               ptlrpc_abort_set(set);
+                       exit++;
+               }
+
+               /*
+                * Let's make one more loop to make sure that ptlrpcd_check()
+                * copied any racing new RPCs into the set so we can kill them.
+                */
+       } while (exit < 2);
+
+       /*
+        * Wait for inflight requests to drain.
+        */
+       if (!list_empty(&set->set_requests))
+               ptlrpc_set_wait(set);
+       lu_context_fini(&env.le_ctx);
+
+       complete(&pc->pc_finishing);
+
+       return 0;
+}
+
+/* XXX: We want multiple CPU cores to share the async RPC load, so we start
+ *      many ptlrpcd threads. We also want to reduce the ptlrpcd overhead
+ *      caused by data transfers across CPU cores, so we bind each ptlrpcd
+ *      thread to a specific CPU core. But binding all ptlrpcd threads may
+ *      delay responses if some CPU cores stay busy with other load.
+ *
+ *      For example: "ls -l". Some async RPCs for statahead are assigned to
+ *      ptlrpcd_0, and ptlrpcd_0 is bound to CPU_0, but CPU_0 may be quite
+ *      busy with non-ptlrpcd work such as "ls -l" itself (we want the
+ *      "ls -l" thread, the statahead thread, and the ptlrpcd thread to run
+ *      in parallel). In that case the statahead async RPCs cannot be
+ *      processed in time, which is not what we want. If ptlrpcd_0 could be
+ *      re-scheduled on another CPU core it might do better, but that breaks
+ *      the data transfer policy described above.
+ *
+ *      So we should not blindly avoid the data transfer. We make a
+ *      compromise: divide the ptlrpcd thread pool into two parts. One part
+ *      is for bound mode; each ptlrpcd thread in this part is bound to some
+ *      CPU core. The other part is for free mode; the ptlrpcd threads in
+ *      this part can be scheduled on any CPU core. We specify a partnership
+ *      between bound mode ptlrpcd thread(s) and free mode ptlrpcd thread(s),
+ *      and the async RPC load is shared among the partners.
+ *
+ *      This partly avoids cross-CPU data transfers (as long as the bound
+ *      mode ptlrpcd thread can be scheduled in time) while trying to
+ *      guarantee that async RPCs are processed ASAP (since the free mode
+ *      ptlrpcd thread can be scheduled on any CPU core).
+ *
+ *      As for how to specify the partnership between bound mode ptlrpcd
+ *      thread(s) and free mode ptlrpcd thread(s), the simplest way is to use
+ *      a <free, bound> pair. In the future we can specify a more complex
+ *      partnership based on the CPU partition patches, but until those
+ *      patches are available we prefer the simplest scheme.
+ */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix ptlrpcd_bind() to use new CPU partition APIs"
+# endif
+static int ptlrpcd_bind(int index, int max)
+{
+       struct ptlrpcd_ctl *pc;
+       int rc = 0;
+#if defined(CONFIG_NUMA)
+       cpumask_t mask;
+#endif
+       ENTRY;
+
+       LASSERT(index <= max - 1);
+       pc = &ptlrpcds->pd_threads[index];
+       switch (ptlrpcd_bind_policy) {
+       case PDB_POLICY_NONE:
+               pc->pc_npartners = -1;
+               break;
+       case PDB_POLICY_FULL:
+               pc->pc_npartners = 0;
+               set_bit(LIOD_BIND, &pc->pc_flags);
+               break;
+       case PDB_POLICY_PAIR:
+               LASSERT(max % 2 == 0);
+               pc->pc_npartners = 1;
+               break;
+       case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+       {
+               int i;
+               mask = *cpumask_of_node(cpu_to_node(index));
+               for (i = max; i < num_online_cpus(); i++)
+                       cpu_clear(i, mask);
+               pc->pc_npartners = cpus_weight(mask) - 1;
+               set_bit(LIOD_BIND, &pc->pc_flags);
+       }
+#else
+               LASSERT(max >= 3);
+               pc->pc_npartners = 2;
+#endif
+               break;
+       default:
+               CERROR("unknown ptlrpcd bind policy %d\n", ptlrpcd_bind_policy);
+               rc = -EINVAL;
+       }
+
+       if (rc == 0 && pc->pc_npartners > 0) {
+               OBD_ALLOC(pc->pc_partners,
+                         sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+               if (pc->pc_partners == NULL) {
+                       pc->pc_npartners = 0;
+                       rc = -ENOMEM;
+               } else {
+                       switch (ptlrpcd_bind_policy) {
+                       case PDB_POLICY_PAIR:
+                               if (index & 0x1) {
+                                       set_bit(LIOD_BIND, &pc->pc_flags);
+                                       pc->pc_partners[0] = &ptlrpcds->
+                                               pd_threads[index - 1];
+                                       ptlrpcds->pd_threads[index - 1].
+                                               pc_partners[0] = pc;
+                               }
+                               break;
+                       case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+                       {
+                               struct ptlrpcd_ctl *ppc;
+                               int i, pidx;
+                               /* Partners are cores in the same NUMA node.
+                                * Set up partnerships only with ptlrpcd
+                                * threads that are already initialized.
+                                */
+                               for (pidx = 0, i = 0; i < index; i++) {
+                                       if (cpu_isset(i, mask)) {
+                                               ppc = &ptlrpcds->pd_threads[i];
+                                               pc->pc_partners[pidx++] = ppc;
+                                               ppc->pc_partners[ppc->
+                                                         pc_npartners++] = pc;
+                                       }
+                               }
+                               /* adjust the number of partners to the number
+                                * of partnerships actually set up */
+                               pc->pc_npartners = pidx;
+                       }
+#else
+                               if (index & 0x1)
+                                       set_bit(LIOD_BIND, &pc->pc_flags);
+                               if (index > 0) {
+                                       pc->pc_partners[0] = &ptlrpcds->
+                                               pd_threads[index - 1];
+                                       ptlrpcds->pd_threads[index - 1].
+                                               pc_partners[1] = pc;
+                                       if (index == max - 1) {
+                                               pc->pc_partners[1] =
+                                               &ptlrpcds->pd_threads[0];
+                                               ptlrpcds->pd_threads[0].
+                                               pc_partners[0] = pc;
+                                       }
+                               }
+#endif
+                               break;
+                       }
+               }
+       }
+
+       RETURN(rc);
+}
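As a rough illustration of the PDB_POLICY_PAIR case handled above, partnerships reduce to adjacent even/odd indices, with the odd member of each pair being the CPU-bound one. The helper below is only a sketch for clarity; ptlrpcd_pair_partner() is not a function in this patch:

static inline int ptlrpcd_pair_partner(int index)
{
        /* pairs are (0,1), (2,3), ...; the odd-indexed member is bound */
        return (index & 0x1) ? index - 1 : index + 1;
}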
+
+
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc)
+{
+       int rc;
+       int env = 0;
+       ENTRY;
+
+       /*
+        * Do not allow starting a second thread for the same pc.
+        */
+       if (test_and_set_bit(LIOD_START, &pc->pc_flags)) {
+               CWARN("Starting second thread (%s) for same pc %p\n",
+                     name, pc);
+               RETURN(0);
+       }
+
+       pc->pc_index = index;
+       init_completion(&pc->pc_starting);
+       init_completion(&pc->pc_finishing);
+       spin_lock_init(&pc->pc_lock);
+       strncpy(pc->pc_name, name, sizeof(pc->pc_name) - 1);
+       pc->pc_set = ptlrpc_prep_set();
+       if (pc->pc_set == NULL)
+               GOTO(out, rc = -ENOMEM);
+       /*
+        * So far only "client" ptlrpcd uses an environment. In the future,
+        * ptlrpcd thread (or a thread-set) has to be given an argument,
+        * describing its "scope".
+        */
+       rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER);
+       if (rc != 0)
+               GOTO(out, rc);
+
+       env = 1;
+       {
+               task_t *task;
+               if (index >= 0) {
+                       rc = ptlrpcd_bind(index, max);
+                       if (rc < 0)
+                               GOTO(out, rc);
+               }
+
+               task = kthread_run(ptlrpcd, pc, pc->pc_name);
+               if (IS_ERR(task))
+                       GOTO(out, rc = PTR_ERR(task));
+
+               rc = 0;
+               wait_for_completion(&pc->pc_starting);
+       }
+out:
+       if (rc) {
+               if (pc->pc_set != NULL) {
+                       struct ptlrpc_request_set *set = pc->pc_set;
+
+                       spin_lock(&pc->pc_lock);
+                       pc->pc_set = NULL;
+                       spin_unlock(&pc->pc_lock);
+                       ptlrpc_set_destroy(set);
+               }
+               if (env != 0)
+                       lu_context_fini(&pc->pc_env.le_ctx);
+               clear_bit(LIOD_BIND, &pc->pc_flags);
+               clear_bit(LIOD_START, &pc->pc_flags);
+       }
+       RETURN(rc);
+}
+
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force)
+{
+       ENTRY;
+
+       if (!test_bit(LIOD_START, &pc->pc_flags)) {
+               CWARN("Thread for pc %p was not started\n", pc);
+               goto out;
+       }
+
+       set_bit(LIOD_STOP, &pc->pc_flags);
+       if (force)
+               set_bit(LIOD_FORCE, &pc->pc_flags);
+       wake_up(&pc->pc_set->set_waitq);
+
+out:
+       EXIT;
+}
+
+void ptlrpcd_free(struct ptlrpcd_ctl *pc)
+{
+       struct ptlrpc_request_set *set = pc->pc_set;
+       ENTRY;
+
+       if (!test_bit(LIOD_START, &pc->pc_flags)) {
+               CWARN("Thread for pc %p was not started\n", pc);
+               goto out;
+       }
+
+       wait_for_completion(&pc->pc_finishing);
+       lu_context_fini(&pc->pc_env.le_ctx);
+
+       spin_lock(&pc->pc_lock);
+       pc->pc_set = NULL;
+       spin_unlock(&pc->pc_lock);
+       ptlrpc_set_destroy(set);
+
+       clear_bit(LIOD_START, &pc->pc_flags);
+       clear_bit(LIOD_STOP, &pc->pc_flags);
+       clear_bit(LIOD_FORCE, &pc->pc_flags);
+       clear_bit(LIOD_BIND, &pc->pc_flags);
+
+out:
+       if (pc->pc_npartners > 0) {
+               LASSERT(pc->pc_partners != NULL);
+
+               OBD_FREE(pc->pc_partners,
+                        sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+               pc->pc_partners = NULL;
+       }
+       pc->pc_npartners = 0;
+       EXIT;
+}
+
+static void ptlrpcd_fini(void)
+{
+       int i;
+       ENTRY;
+
+       if (ptlrpcds != NULL) {
+               for (i = 0; i < ptlrpcds->pd_nthreads; i++)
+                       ptlrpcd_stop(&ptlrpcds->pd_threads[i], 0);
+               for (i = 0; i < ptlrpcds->pd_nthreads; i++)
+                       ptlrpcd_free(&ptlrpcds->pd_threads[i]);
+               ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+               ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+               OBD_FREE(ptlrpcds, ptlrpcds->pd_size);
+               ptlrpcds = NULL;
+       }
+
+       EXIT;
+}
+
+static int ptlrpcd_init(void)
+{
+       int nthreads = num_online_cpus();
+       char name[16];
+       int size, i = -1, j, rc = 0;
+       ENTRY;
+
+       if (max_ptlrpcds > 0 && max_ptlrpcds < nthreads)
+               nthreads = max_ptlrpcds;
+       if (nthreads < 2)
+               nthreads = 2;
+       if (nthreads < 3 && ptlrpcd_bind_policy == PDB_POLICY_NEIGHBOR)
+               ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+       else if (nthreads % 2 != 0 && ptlrpcd_bind_policy == PDB_POLICY_PAIR)
+               nthreads &= ~1; /* make sure it is even */
+
+       size = offsetof(struct ptlrpcd, pd_threads[nthreads]);
+       OBD_ALLOC(ptlrpcds, size);
+       if (ptlrpcds == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       snprintf(name, 15, "ptlrpcd_rcv");
+       set_bit(LIOD_RECOVERY, &ptlrpcds->pd_thread_rcv.pc_flags);
+       rc = ptlrpcd_start(-1, nthreads, name, &ptlrpcds->pd_thread_rcv);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* XXX: We start nthreads ptlrpc daemons. Each of them can process any
+        *      non-recovery async RPC to improve overall async RPC efficiency.
+        *
+        *      But there are some issues when async I/O RPCs and async non-I/O
+        *      RPCs are processed in the same set. The ptlrpcd may be blocked
+        *      by some async I/O RPC(s), which then prevents other async
+        *      non-I/O RPC(s) from being processed in time.
+        *
+        *      Maybe we should distinguish blocking async RPCs from
+        *      non-blocking ones and process them in different ptlrpcd sets to
+        *      avoid unnecessary dependencies. But how to distribute the async
+        *      RPC load among all the ptlrpc daemons then becomes another
+        *      problem. */
+       for (i = 0; i < nthreads; i++) {
+               snprintf(name, 15, "ptlrpcd_%d", i);
+               rc = ptlrpcd_start(i, nthreads, name, &ptlrpcds->pd_threads[i]);
+               if (rc < 0)
+                       GOTO(out, rc);
+       }
+
+       ptlrpcds->pd_size = size;
+       ptlrpcds->pd_index = 0;
+       ptlrpcds->pd_nthreads = nthreads;
+
+out:
+       if (rc != 0 && ptlrpcds != NULL) {
+               for (j = 0; j <= i; j++)
+                       ptlrpcd_stop(&ptlrpcds->pd_threads[j], 0);
+               for (j = 0; j <= i; j++)
+                       ptlrpcd_free(&ptlrpcds->pd_threads[j]);
+               ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+               ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+               OBD_FREE(ptlrpcds, size);
+               ptlrpcds = NULL;
+       }
+
+       RETURN(rc);
+}
+
+int ptlrpcd_addref(void)
+{
+       int rc = 0;
+       ENTRY;
+
+       mutex_lock(&ptlrpcd_mutex);
+       if (++ptlrpcd_users == 1)
+               rc = ptlrpcd_init();
+       mutex_unlock(&ptlrpcd_mutex);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpcd_addref);
+
+void ptlrpcd_decref(void)
+{
+       mutex_lock(&ptlrpcd_mutex);
+       if (--ptlrpcd_users == 0)
+               ptlrpcd_fini();
+       mutex_unlock(&ptlrpcd_mutex);
+}
+EXPORT_SYMBOL(ptlrpcd_decref);
+/** @} ptlrpcd */
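The exported ptlrpcd_addref()/ptlrpcd_decref() pair reference-counts the shared ptlrpcd thread pool: the first user triggers ptlrpcd_init() and the last one triggers ptlrpcd_fini(). A caller would use them roughly as in the sketch below; the example_* wrappers are hypothetical, only the ptlrpcd_* calls come from this file:

/* hypothetical setup/teardown pair around async RPC usage */
static int example_client_setup(void)
{
        int rc;

        rc = ptlrpcd_addref();  /* first caller starts the ptlrpcd threads */
        if (rc != 0)
                return rc;
        /* ... issue async RPCs via ptlrpcd_add_req() ... */
        return 0;
}

static void example_client_cleanup(void)
{
        /* ... wait for outstanding async RPCs to complete ... */
        ptlrpcd_decref();       /* last caller stops and frees the threads */
}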
diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c
new file mode 100644 (file)
index 0000000..2960889
--- /dev/null
@@ -0,0 +1,357 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/recover.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+# include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */
+#include <linux/list.h>
+
+#include "ptlrpc_internal.h"
+
+/**
+ * Start recovery on disconnected import.
+ * This is done by just attempting a connect.
+ */
+void ptlrpc_initiate_recovery(struct obd_import *imp)
+{
+       ENTRY;
+
+       CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd));
+       ptlrpc_connect_import(imp);
+
+       EXIT;
+}
+
+/**
+ * Identify which request from the replay list needs to be replayed next
+ * (based on what we have already replayed) and send it to the server.
+ */
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
+{
+       int rc = 0;
+       struct list_head *tmp, *pos;
+       struct ptlrpc_request *req = NULL;
+       __u64 last_transno;
+       ENTRY;
+
+       *inflight = 0;
+
+       /* The server might have committed some requests since we last spoke,
+        * so make sure we get rid of them now.
+        */
+       spin_lock(&imp->imp_lock);
+       imp->imp_last_transno_checked = 0;
+       ptlrpc_free_committed(imp);
+       last_transno = imp->imp_last_replay_transno;
+       spin_unlock(&imp->imp_lock);
+
+       CDEBUG(D_HA, "import %p from %s committed "LPU64" last "LPU64"\n",
+              imp, obd2cli_tgt(imp->imp_obd),
+              imp->imp_peer_committed_transno, last_transno);
+
+       /* Do I need to hold a lock across this iteration?  We shouldn't be
+        * racing with any additions to the list, because we're in recovery
+        * and are therefore not processing additional requests to add.  Calls
+        * to ptlrpc_free_committed might commit requests, but nothing "newer"
+        * than the one we're replaying (it can't be committed until it's
+        * replayed, and we're doing that here).  l_f_e_safe protects against
+        * problems with the current request being committed, in the unlikely
+        * event of that race.  So, in conclusion, I think that it's safe to
+        * perform this list-walk without the imp_lock held.
+        *
+        * But, the {mdc,osc}_replay_open callbacks both iterate
+        * request lists, and have comments saying they assume the
+        * imp_lock is being held by ptlrpc_replay, but it's not; it's
+        * just a little race...
+        */
+       list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
+               req = list_entry(tmp, struct ptlrpc_request,
+                                    rq_replay_list);
+
+               /* If we need to resend the last sent transno (because a
+                * reconnect has occurred), stop on the matching req and send
+                * it again. If, however, the last sent transno has been
+                * committed, then we continue replay from the next request. */
+               if (req->rq_transno > last_transno) {
+                       if (imp->imp_resend_replay)
+                               lustre_msg_add_flags(req->rq_reqmsg,
+                                                    MSG_RESENT);
+                       break;
+               }
+               req = NULL;
+       }
+
+       spin_lock(&imp->imp_lock);
+       imp->imp_resend_replay = 0;
+       spin_unlock(&imp->imp_lock);
+
+       if (req != NULL) {
+               rc = ptlrpc_replay_req(req);
+               if (rc) {
+                       CERROR("recovery replay error %d for req "
+                              LPU64"\n", rc, req->rq_xid);
+                       RETURN(rc);
+               }
+               *inflight = 1;
+       }
+       RETURN(rc);
+}
+
+/**
+ * Schedule resending of requests on the sending_list. This is done after
+ * we have completed replaying requests and locks.
+ */
+int ptlrpc_resend(struct obd_import *imp)
+{
+       struct ptlrpc_request *req, *next;
+
+       ENTRY;
+
+       /* As long as we're in recovery, nothing should be added to the sending
+        * list, so we don't need to hold the lock during this iteration and
+        * resend process.
+        */
+       /* Well... what if lctl recover is called twice at the same time?
+        */
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state != LUSTRE_IMP_RECOVER) {
+               spin_unlock(&imp->imp_lock);
+               RETURN(-1);
+       }
+
+       list_for_each_entry_safe(req, next, &imp->imp_sending_list,
+                                    rq_list) {
+               LASSERTF((long)req > PAGE_CACHE_SIZE && req != LP_POISON,
+                        "req %p bad\n", req);
+               LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req);
+               if (!ptlrpc_no_resend(req))
+                       ptlrpc_resend_req(req);
+       }
+       spin_unlock(&imp->imp_lock);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_resend);
+
+/**
+ * Go through all requests in the delayed list and wake their threads
+ * for resending.
+ */
+void ptlrpc_wake_delayed(struct obd_import *imp)
+{
+       struct list_head *tmp, *pos;
+       struct ptlrpc_request *req;
+
+       spin_lock(&imp->imp_lock);
+       list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
+               req = list_entry(tmp, struct ptlrpc_request, rq_list);
+
+               DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
+               ptlrpc_client_wake_req(req);
+       }
+       spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_wake_delayed);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
+{
+       struct obd_import *imp = failed_req->rq_import;
+       ENTRY;
+
+       CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
+              imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+              imp->imp_connection->c_remote_uuid.uuid);
+
+       if (ptlrpc_set_import_discon(imp,
+                             lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) {
+               if (!imp->imp_replayable) {
+                       CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+                              "auto-deactivating\n",
+                              obd2cli_tgt(imp->imp_obd),
+                              imp->imp_connection->c_remote_uuid.uuid,
+                              imp->imp_obd->obd_name);
+                       ptlrpc_deactivate_import(imp);
+               }
+               /* to control recovery via lctl {disable|enable}_recovery */
+               if (imp->imp_deactive == 0)
+                       ptlrpc_connect_import(imp);
+       }
+
+       /* Wait for recovery to complete and resend. If evicted, then
+        * this request will be errored out later. */
+       spin_lock(&failed_req->rq_lock);
+       if (!failed_req->rq_no_resend)
+               failed_req->rq_resend = 1;
+       spin_unlock(&failed_req->rq_lock);
+
+       EXIT;
+}
+
+/**
+ * Administratively activate/deactivate a client.
+ * This should only be called by the ioctl interface, currently
+ *  - the lctl deactivate and activate commands
+ *  - echo 0/1 >> /proc/osc/XXX/active
+ *  - client umount -f (ll_umount_begin)
+ */
+int ptlrpc_set_import_active(struct obd_import *imp, int active)
+{
+       struct obd_device *obd = imp->imp_obd;
+       int rc = 0;
+
+       ENTRY;
+       LASSERT(obd);
+
+       /* When deactivating, mark import invalid, and abort in-flight
+        * requests. */
+       if (!active) {
+               LCONSOLE_WARN("setting import %s INACTIVE by administrator "
+                             "request\n", obd2cli_tgt(imp->imp_obd));
+
+               /* set before invalidate to avoid messages about imp_inval
+                * set without imp_deactive in ptlrpc_import_delay_req */
+               spin_lock(&imp->imp_lock);
+               imp->imp_deactive = 1;
+               spin_unlock(&imp->imp_lock);
+
+               obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE);
+
+               ptlrpc_invalidate_import(imp);
+       }
+
+       /* When activating, mark import valid, and attempt recovery */
+       if (active) {
+               CDEBUG(D_HA, "setting import %s VALID\n",
+                      obd2cli_tgt(imp->imp_obd));
+
+               spin_lock(&imp->imp_lock);
+               imp->imp_deactive = 0;
+               spin_unlock(&imp->imp_lock);
+               obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE);
+
+               rc = ptlrpc_recover_import(imp, NULL, 0);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_set_import_active);
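The /proc "active" entry mentioned in the comment above boils down to a write handler that parses 0 or 1 and forwards it here. The handler below is a hypothetical sketch (the real osc proc code lives elsewhere); only ptlrpc_set_import_active() is taken from this file:

/* hypothetical write handler for an "active" proc file */
static int example_set_active(struct obd_import *imp, long val)
{
        if (val != 0 && val != 1)
                return -ERANGE;

        /* 0: deactivate (invalidate import, abort in-flight requests);
         * 1: reactivate and attempt recovery */
        return ptlrpc_set_import_active(imp, val);
}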
+
+/* Attempt to reconnect an import */
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async)
+{
+       int rc = 0;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive ||
+           atomic_read(&imp->imp_inval_count))
+               rc = -EINVAL;
+       spin_unlock(&imp->imp_lock);
+       if (rc)
+               GOTO(out, rc);
+
+       /* force import to be disconnected. */
+       ptlrpc_set_import_discon(imp, 0);
+
+       if (new_uuid) {
+               struct obd_uuid uuid;
+
+               /* instruct the import to use the new uuid */
+               obd_str2uuid(&uuid, new_uuid);
+               rc = import_set_conn_priority(imp, &uuid);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       /* Check if reconnect is already in progress */
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state != LUSTRE_IMP_DISCON) {
+               imp->imp_force_verify = 1;
+               rc = -EALREADY;
+       }
+       spin_unlock(&imp->imp_lock);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = ptlrpc_connect_import(imp);
+       if (rc)
+               GOTO(out, rc);
+
+       if (!async) {
+               struct l_wait_info lwi;
+               int secs = cfs_time_seconds(obd_timeout);
+
+               CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
+                      obd2cli_tgt(imp->imp_obd), secs);
+
+               lwi = LWI_TIMEOUT(secs, NULL, NULL);
+               rc = l_wait_event(imp->imp_recovery_waitq,
+                                 !ptlrpc_import_in_recovery(imp), &lwi);
+               CDEBUG(D_HA, "%s: recovery finished\n",
+                      obd2cli_tgt(imp->imp_obd));
+       }
+       EXIT;
+
+out:
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_recover_import);
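ptlrpc_recover_import() supports both a blocking and a fire-and-forget mode via the async flag, and can optionally switch the import to a different server UUID first. A hedged sketch of a caller (example_manual_recover() is hypothetical):

/* hypothetical "lctl recover"-style helper */
static int example_manual_recover(struct obd_import *imp, char *uuid)
{
        /* uuid may be NULL to keep the current connection; async == 0
         * blocks until the import leaves recovery or obd_timeout expires */
        return ptlrpc_recover_import(imp, uuid, 0);
}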
+
+int ptlrpc_import_in_recovery(struct obd_import *imp)
+{
+       int in_recovery = 1;
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_FULL ||
+           imp->imp_state == LUSTRE_IMP_CLOSED ||
+           imp->imp_state == LUSTRE_IMP_DISCON)
+               in_recovery = 0;
+       spin_unlock(&imp->imp_lock);
+       return in_recovery;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c
new file mode 100644 (file)
index 0000000..36e8bed
--- /dev/null
@@ -0,0 +1,2465 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/***********************************************
+ * policy registration
+ ***********************************************/
+
+static rwlock_t policy_lock;
+static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = {
+       NULL,
+};
+
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy)
+{
+       __u16 number = policy->sp_policy;
+
+       LASSERT(policy->sp_name);
+       LASSERT(policy->sp_cops);
+       LASSERT(policy->sp_sops);
+
+       if (number >= SPTLRPC_POLICY_MAX)
+               return -EINVAL;
+
+       write_lock(&policy_lock);
+       if (unlikely(policies[number])) {
+               write_unlock(&policy_lock);
+               return -EALREADY;
+       }
+       policies[number] = policy;
+       write_unlock(&policy_lock);
+
+       CDEBUG(D_SEC, "%s: registered\n", policy->sp_name);
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_register_policy);
+
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy)
+{
+       __u16 number = policy->sp_policy;
+
+       LASSERT(number < SPTLRPC_POLICY_MAX);
+
+       write_lock(&policy_lock);
+       if (unlikely(policies[number] == NULL)) {
+               write_unlock(&policy_lock);
+               CERROR("%s: already unregistered\n", policy->sp_name);
+               return -EINVAL;
+       }
+
+       LASSERT(policies[number] == policy);
+       policies[number] = NULL;
+       write_unlock(&policy_lock);
+
+       CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name);
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unregister_policy);
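A security policy module (such as ptlrpc_gss, which is loaded on demand below) registers itself in its module init and unregisters on exit. The sketch below only illustrates the registration pattern; example_policy, its empty ops structures, and the reuse of an existing policy number are hypothetical placeholders, not a loadable policy:

static struct ptlrpc_sec_cops example_cops;     /* client-side ops would go here */
static struct ptlrpc_sec_sops example_sops;     /* server-side ops would go here */

static struct ptlrpc_sec_policy example_policy = {
        .sp_name   = "example",
        .sp_owner  = THIS_MODULE,
        .sp_policy = SPTLRPC_POLICY_PLAIN,      /* a real policy owns its own number */
        .sp_cops   = &example_cops,
        .sp_sops   = &example_sops,
};

static int __init example_policy_init(void)
{
        return sptlrpc_register_policy(&example_policy);
}

static void __exit example_policy_exit(void)
{
        sptlrpc_unregister_policy(&example_policy);
}

module_init(example_policy_init);
module_exit(example_policy_exit);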
+
+static
+struct ptlrpc_sec_policy * sptlrpc_wireflavor2policy(__u32 flavor)
+{
+       static DEFINE_MUTEX(load_mutex);
+       static atomic_t       loaded = ATOMIC_INIT(0);
+       struct ptlrpc_sec_policy *policy;
+       __u16                number = SPTLRPC_FLVR_POLICY(flavor);
+       __u16                flag = 0;
+
+       if (number >= SPTLRPC_POLICY_MAX)
+               return NULL;
+
+       while (1) {
+               read_lock(&policy_lock);
+               policy = policies[number];
+               if (policy && !try_module_get(policy->sp_owner))
+                       policy = NULL;
+               if (policy == NULL)
+                       flag = atomic_read(&loaded);
+               read_unlock(&policy_lock);
+
+               if (policy != NULL || flag != 0 ||
+                   number != SPTLRPC_POLICY_GSS)
+                       break;
+
+               /* try to load gss module, once */
+               mutex_lock(&load_mutex);
+               if (atomic_read(&loaded) == 0) {
+                       if (request_module("ptlrpc_gss") == 0)
+                               CDEBUG(D_SEC,
+                                      "module ptlrpc_gss loaded on demand\n");
+                       else
+                               CERROR("Unable to load module ptlrpc_gss\n");
+
+                       atomic_set(&loaded, 1);
+               }
+               mutex_unlock(&load_mutex);
+       }
+
+       return policy;
+}
+
+__u32 sptlrpc_name2flavor_base(const char *name)
+{
+       if (!strcmp(name, "null"))
+               return SPTLRPC_FLVR_NULL;
+       if (!strcmp(name, "plain"))
+               return SPTLRPC_FLVR_PLAIN;
+       if (!strcmp(name, "krb5n"))
+               return SPTLRPC_FLVR_KRB5N;
+       if (!strcmp(name, "krb5a"))
+               return SPTLRPC_FLVR_KRB5A;
+       if (!strcmp(name, "krb5i"))
+               return SPTLRPC_FLVR_KRB5I;
+       if (!strcmp(name, "krb5p"))
+               return SPTLRPC_FLVR_KRB5P;
+
+       return SPTLRPC_FLVR_INVALID;
+}
+EXPORT_SYMBOL(sptlrpc_name2flavor_base);
+
+const char *sptlrpc_flavor2name_base(__u32 flvr)
+{
+       __u32   base = SPTLRPC_FLVR_BASE(flvr);
+
+       if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL))
+               return "null";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN))
+               return "plain";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N))
+               return "krb5n";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A))
+               return "krb5a";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I))
+               return "krb5i";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P))
+               return "krb5p";
+
+       CERROR("invalid wire flavor 0x%x\n", flvr);
+       return "invalid";
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_base);
+
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+                              char *buf, int bufsize)
+{
+       if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN)
+               snprintf(buf, bufsize, "hash:%s",
+                        sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg));
+       else
+               snprintf(buf, bufsize, "%s",
+                        sptlrpc_flavor2name_base(sf->sf_rpc));
+
+       buf[bufsize - 1] = '\0';
+       return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_bulk);
+
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize)
+{
+       snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc));
+
+       /*
+        * currently we don't support customized bulk specification for
+        * flavors other than plain
+        */
+       if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) {
+               char bspec[16];
+
+               bspec[0] = '-';
+               sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1);
+               strncat(buf, bspec, bufsize);
+       }
+
+       buf[bufsize - 1] = '\0';
+       return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name);
+
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize)
+{
+       buf[0] = '\0';
+
+       if (flags & PTLRPC_SEC_FL_REVERSE)
+               strlcat(buf, "reverse,", bufsize);
+       if (flags & PTLRPC_SEC_FL_ROOTONLY)
+               strlcat(buf, "rootonly,", bufsize);
+       if (flags & PTLRPC_SEC_FL_UDESC)
+               strlcat(buf, "udesc,", bufsize);
+       if (flags & PTLRPC_SEC_FL_BULK)
+               strlcat(buf, "bulk,", bufsize);
+       if (buf[0] == '\0')
+               strlcat(buf, "-,", bufsize);
+
+       return buf;
+}
+EXPORT_SYMBOL(sptlrpc_secflags2str);
+
+/**************************************************
+ * client context APIs
+ **************************************************/
+
+static
+struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec)
+{
+       struct vfs_cred vcred;
+       int create = 1, remove_dead = 1;
+
+       LASSERT(sec);
+       LASSERT(sec->ps_policy->sp_cops->lookup_ctx);
+
+       if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE |
+                                    PTLRPC_SEC_FL_ROOTONLY)) {
+               vcred.vc_uid = 0;
+               vcred.vc_gid = 0;
+               if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) {
+                       create = 0;
+                       remove_dead = 0;
+               }
+       } else {
+               vcred.vc_uid = current_uid();
+               vcred.vc_gid = current_gid();
+       }
+
+       return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred,
+                                                  create, remove_dead);
+}
+
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx)
+{
+       atomic_inc(&ctx->cc_refcount);
+       return ctx;
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_get);
+
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+       struct ptlrpc_sec *sec = ctx->cc_sec;
+
+       LASSERT(sec);
+       LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+       if (!atomic_dec_and_test(&ctx->cc_refcount))
+               return;
+
+       sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_put);
+
+/**
+ * Expire the client context immediately.
+ *
+ * \pre Caller must hold at least 1 reference on the \a ctx.
+ */
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(ctx->cc_ops->die);
+       ctx->cc_ops->die(ctx, 0);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_expire);
+
+/**
+ * Wake up the threads that are waiting for this client context. Called
+ * after a status change on \a ctx.
+ */
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx)
+{
+       struct ptlrpc_request *req, *next;
+
+       spin_lock(&ctx->cc_lock);
+       list_for_each_entry_safe(req, next, &ctx->cc_req_list,
+                                    rq_ctx_chain) {
+               list_del_init(&req->rq_ctx_chain);
+               ptlrpc_client_wake_req(req);
+       }
+       spin_unlock(&ctx->cc_lock);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup);
+
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize)
+{
+       LASSERT(ctx->cc_ops);
+
+       if (ctx->cc_ops->display == NULL)
+               return 0;
+
+       return ctx->cc_ops->display(ctx, buf, bufsize);
+}
+
+static int import_sec_check_expire(struct obd_import *imp)
+{
+       int     adapt = 0;
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_sec_expire &&
+           imp->imp_sec_expire < cfs_time_current_sec()) {
+               adapt = 1;
+               imp->imp_sec_expire = 0;
+       }
+       spin_unlock(&imp->imp_lock);
+
+       if (!adapt)
+               return 0;
+
+       CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n");
+       return sptlrpc_import_sec_adapt(imp, NULL, 0);
+}
+
+static int import_sec_validate_get(struct obd_import *imp,
+                                  struct ptlrpc_sec **sec)
+{
+       int     rc;
+
+       if (unlikely(imp->imp_sec_expire)) {
+               rc = import_sec_check_expire(imp);
+               if (rc)
+                       return rc;
+       }
+
+       *sec = sptlrpc_import_sec_ref(imp);
+       if (*sec == NULL) {
+               CERROR("import %p (%s) with no sec\n",
+                      imp, ptlrpc_import_state_name(imp->imp_state));
+               return -EACCES;
+       }
+
+       if (unlikely((*sec)->ps_dying)) {
+               CERROR("attempt to use dying sec %p\n", sec);
+               sptlrpc_sec_put(*sec);
+               return -EACCES;
+       }
+
+       return 0;
+}
+
+/**
+ * Given a \a req, find or allocate an appropriate context for it.
+ * \pre req->rq_cli_ctx == NULL.
+ *
+ * \retval 0 success, and req->rq_cli_ctx is set.
+ * \retval -ev error number, and req->rq_cli_ctx == NULL.
+ */
+int sptlrpc_req_get_ctx(struct ptlrpc_request *req)
+{
+       struct obd_import *imp = req->rq_import;
+       struct ptlrpc_sec *sec;
+       int             rc;
+       ENTRY;
+
+       LASSERT(!req->rq_cli_ctx);
+       LASSERT(imp);
+
+       rc = import_sec_validate_get(imp, &sec);
+       if (rc)
+               RETURN(rc);
+
+       req->rq_cli_ctx = get_my_ctx(sec);
+
+       sptlrpc_sec_put(sec);
+
+       if (!req->rq_cli_ctx) {
+               CERROR("req %p: fail to get context\n", req);
+               RETURN(-ENOMEM);
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Drop the context for \a req.
+ * \pre req->rq_cli_ctx != NULL.
+ * \post req->rq_cli_ctx == NULL.
+ *
+ * If \a sync == 0, this function should return quickly without sleep;
+ * otherwise it might trigger and wait for the whole process of sending
+ * a context-destroying rpc to the server.
+ */
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync)
+{
+       ENTRY;
+
+       LASSERT(req);
+       LASSERT(req->rq_cli_ctx);
+
+       /* The request might be asked to release its context early while it
+        * is still on the context waiting list.
+        */
+       if (!list_empty(&req->rq_ctx_chain)) {
+               spin_lock(&req->rq_cli_ctx->cc_lock);
+               list_del_init(&req->rq_ctx_chain);
+               spin_unlock(&req->rq_cli_ctx->cc_lock);
+       }
+
+       sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync);
+       req->rq_cli_ctx = NULL;
+       EXIT;
+}
+
+static
+int sptlrpc_req_ctx_switch(struct ptlrpc_request *req,
+                          struct ptlrpc_cli_ctx *oldctx,
+                          struct ptlrpc_cli_ctx *newctx)
+{
+       struct sptlrpc_flavor   old_flvr;
+       char               *reqmsg = NULL; /* to work around old gcc */
+       int                  reqmsg_size;
+       int                  rc = 0;
+
+       LASSERT(req->rq_reqmsg);
+       LASSERT(req->rq_reqlen);
+       LASSERT(req->rq_replen);
+
+       CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), "
+              "switch sec %p(%s) -> %p(%s)\n", req,
+              oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec),
+              newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec),
+              oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name,
+              newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name);
+
+       /* save flavor */
+       old_flvr = req->rq_flvr;
+
+       /* save request message */
+       reqmsg_size = req->rq_reqlen;
+       if (reqmsg_size != 0) {
+               OBD_ALLOC_LARGE(reqmsg, reqmsg_size);
+               if (reqmsg == NULL)
+                       return -ENOMEM;
+               memcpy(reqmsg, req->rq_reqmsg, reqmsg_size);
+       }
+
+       /* release old req/rep buf */
+       req->rq_cli_ctx = oldctx;
+       sptlrpc_cli_free_reqbuf(req);
+       sptlrpc_cli_free_repbuf(req);
+       req->rq_cli_ctx = newctx;
+
+       /* recalculate the flavor */
+       sptlrpc_req_set_flavor(req, 0);
+
+       /* alloc a new request buffer; we don't need to alloc the reply
+        * buffer here, leave that to the rest of the ptlrpc processing */
+       if (reqmsg_size != 0) {
+               rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size);
+               if (!rc) {
+                       LASSERT(req->rq_reqmsg);
+                       memcpy(req->rq_reqmsg, reqmsg, reqmsg_size);
+               } else {
+                       CWARN("failed to alloc reqbuf: %d\n", rc);
+                       req->rq_flvr = old_flvr;
+               }
+
+               OBD_FREE_LARGE(reqmsg, reqmsg_size);
+       }
+       return rc;
+}
+
+/**
+ * If the current context of \a req is dead somehow, e.g. we just switched
+ * flavor and thus marked the original contexts dead, we'll find a new context
+ * for it. If no switch is needed, \a req will end up with the same context.
+ *
+ * \note a request must have a context, to keep other parts of code happy.
+ * In any case of failure during the switching, we must restore the old one.
+ */
+int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx;
+       struct ptlrpc_cli_ctx *newctx;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(oldctx);
+
+       sptlrpc_cli_ctx_get(oldctx);
+       sptlrpc_req_put_ctx(req, 0);
+
+       rc = sptlrpc_req_get_ctx(req);
+       if (unlikely(rc)) {
+               LASSERT(!req->rq_cli_ctx);
+
+               /* restore old ctx */
+               req->rq_cli_ctx = oldctx;
+               RETURN(rc);
+       }
+
+       newctx = req->rq_cli_ctx;
+       LASSERT(newctx);
+
+       if (unlikely(newctx == oldctx &&
+                    test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) {
+               /*
+                * We still got the old dead ctx, which usually means the
+                * system is too busy.
+                */
+               CDEBUG(D_SEC,
+                      "ctx (%p, fl %lx) doesn't switch, relax a little bit\n",
+                      newctx, newctx->cc_flags);
+
+               schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+                                                  HZ);
+       } else {
+               /*
+                * it's possible newctx == oldctx if we're switching
+                * subflavor with the same sec.
+                */
+               rc = sptlrpc_req_ctx_switch(req, oldctx, newctx);
+               if (rc) {
+                       /* restore old ctx */
+                       sptlrpc_req_put_ctx(req, 0);
+                       req->rq_cli_ctx = oldctx;
+                       RETURN(rc);
+               }
+
+               LASSERT(req->rq_cli_ctx == newctx);
+       }
+
+       sptlrpc_cli_ctx_put(oldctx, 1);
+       RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx);
+
+static
+int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+       if (cli_ctx_is_refreshed(ctx))
+               return 1;
+       return 0;
+}
+
+static
+int ctx_refresh_timeout(void *data)
+{
+       struct ptlrpc_request *req = data;
+       int rc;
+
+       /* conn_cnt is needed in expire_one_request */
+       lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt);
+
+       rc = ptlrpc_expire_one_request(req, 1);
+       /* If we started recovery, we should mark this ctx dead; otherwise,
+        * if lgssd died, nobody would retire this ctx and the following
+        * connect would still find the same ctx, causing a deadlock.
+        * There's an assumption that the expiry time of the request is
+        * later than the context refresh expiry time.
+        */
+       if (rc == 0)
+               req->rq_cli_ctx->cc_ops->die(req->rq_cli_ctx, 0);
+       return rc;
+}
+
+static
+void ctx_refresh_interrupt(void *data)
+{
+       struct ptlrpc_request *req = data;
+
+       spin_lock(&req->rq_lock);
+       req->rq_intr = 1;
+       spin_unlock(&req->rq_lock);
+}
+
+static
+void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx)
+{
+       spin_lock(&ctx->cc_lock);
+       if (!list_empty(&req->rq_ctx_chain))
+               list_del_init(&req->rq_ctx_chain);
+       spin_unlock(&ctx->cc_lock);
+}
+
+/**
+ * Refresh the context of \a req, if it's not up to date.
+ * \param timeout
+ * - < 0: don't wait
+ * - = 0: wait until success or a fatal error occurs
+ * - > 0: timeout value (in seconds)
+ *
+ * The status of the context could be changed by other threads at any time.
+ * We allow this race, but once we return with 0 the caller will assume the
+ * context is up to date and keep using it until the owning rpc is done.
+ *
+ * \retval 0 only if the context is up to date.
+ * \retval -ev error number.
+ */
+int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout)
+{
+       struct ptlrpc_cli_ctx  *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec      *sec;
+       struct l_wait_info      lwi;
+       int                  rc;
+       ENTRY;
+
+       LASSERT(ctx);
+
+       if (req->rq_ctx_init || req->rq_ctx_fini)
+               RETURN(0);
+
+       /*
+        * During the process a request's context might even change type
+        * (e.g. from a gss ctx to a null ctx), so on each loop we need to
+        * re-check everything.
+        */
+again:
+       rc = import_sec_validate_get(req->rq_import, &sec);
+       if (rc)
+               RETURN(rc);
+
+       if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) {
+               CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n",
+                     req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc);
+               req_off_ctx_list(req, ctx);
+               sptlrpc_req_replace_dead_ctx(req);
+               ctx = req->rq_cli_ctx;
+       }
+       sptlrpc_sec_put(sec);
+
+       if (cli_ctx_is_eternal(ctx))
+               RETURN(0);
+
+       if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) {
+               LASSERT(ctx->cc_ops->refresh);
+               ctx->cc_ops->refresh(ctx);
+       }
+       LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0);
+
+       LASSERT(ctx->cc_ops->validate);
+       if (ctx->cc_ops->validate(ctx) == 0) {
+               req_off_ctx_list(req, ctx);
+               RETURN(0);
+       }
+
+       if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) {
+               spin_lock(&req->rq_lock);
+               req->rq_err = 1;
+               spin_unlock(&req->rq_lock);
+               req_off_ctx_list(req, ctx);
+               RETURN(-EPERM);
+       }
+
+       /*
+        * There's a subtle issue when resending RPCs. Suppose the following
+        * situation:
+        *  1. the request was sent to the server.
+        *  2. recovery was kicked off; after it finished, the request was
+        *     marked as resent.
+        *  3. the request is resent.
+        *  4. an old reply from the server is received; we accept and verify
+        *     the reply. This has to succeed, otherwise the error would be
+        *     visible to the application.
+        *  5. a new reply from the server is received and dropped by LNet.
+        *
+        * Note the xid of the old & new request is the same. We can't simply
+        * change the xid of the resent request because the server relies on
+        * it for reply reconstruction.
+        *
+        * Commonly the original context should be uptodate because we
+        * have a nice expiry time; the server will keep its context because
+        * we at least hold a ref on the old context, which prevents the
+        * context-destroying RPC from being sent. So the server can still
+        * accept the request and finish the RPC. But if that's not the case:
+        *  1. If the server-side context has been trimmed, NO_CONTEXT will
+        *     be returned, and gss_cli_ctx_verify/unseal will switch to the
+        *     new context by force.
+        *  2. The current context was never refreshed; then we are fine: we
+        *     never really sent a request with the old context before.
+        */
+       if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) &&
+           unlikely(req->rq_reqmsg) &&
+           lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
+               req_off_ctx_list(req, ctx);
+               RETURN(0);
+       }
+
+       if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) {
+               req_off_ctx_list(req, ctx);
+               /*
+                * don't switch ctx if import was deactivated
+                */
+               if (req->rq_import->imp_deactive) {
+                       spin_lock(&req->rq_lock);
+                       req->rq_err = 1;
+                       spin_unlock(&req->rq_lock);
+                       RETURN(-EINTR);
+               }
+
+               rc = sptlrpc_req_replace_dead_ctx(req);
+               if (rc) {
+                       LASSERT(ctx == req->rq_cli_ctx);
+                       CERROR("req %p: failed to replace dead ctx %p: %d\n",
+                              req, ctx, rc);
+                       spin_lock(&req->rq_lock);
+                       req->rq_err = 1;
+                       spin_unlock(&req->rq_lock);
+                       RETURN(rc);
+               }
+
+               ctx = req->rq_cli_ctx;
+               goto again;
+       }
+
+       /*
+        * Now we're sure this context is undergoing an upcall; add ourselves
+        * to the waiting list.
+        */
+       spin_lock(&ctx->cc_lock);
+       if (list_empty(&req->rq_ctx_chain))
+               list_add(&req->rq_ctx_chain, &ctx->cc_req_list);
+       spin_unlock(&ctx->cc_lock);
+
+       if (timeout < 0)
+               RETURN(-EWOULDBLOCK);
+
+       /* Clear any flags that may be present from previous sends */
+       LASSERT(req->rq_receiving_reply == 0);
+       spin_lock(&req->rq_lock);
+       req->rq_err = 0;
+       req->rq_timedout = 0;
+       req->rq_resend = 0;
+       req->rq_restart = 0;
+       spin_unlock(&req->rq_lock);
+
+       lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout,
+                              ctx_refresh_interrupt, req);
+       rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi);
+
+       /*
+        * The following cases could lead us here:
+        * - successfully refreshed;
+        * - interrupted;
+        * - timed out, and we don't want to recover from the failure;
+        * - timed out, and woken up when recovery finished;
+        * - someone else marked this ctx dead by force;
+        * - someone invalidated the req and called ptlrpc_client_wake_req(),
+        *   e.g. ptlrpc_abort_inflight();
+        */
+       if (!cli_ctx_is_refreshed(ctx)) {
+               /* timed out or interrupted */
+               req_off_ctx_list(req, ctx);
+
+               LASSERT(rc != 0);
+               RETURN(rc);
+       }
+
+       goto again;
+}
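Per the parameter description above, a caller that wants to block until the context is usable passes timeout == 0, while a negative timeout turns the call into a non-blocking probe. A minimal hypothetical sketch (example_refresh_before_send() is not part of the patch):

static int example_refresh_before_send(struct ptlrpc_request *req)
{
        int rc;

        /* block until the ctx is refreshed or a fatal error is hit;
         * passing -1 instead would return -EWOULDBLOCK rather than wait */
        rc = sptlrpc_req_refresh_ctx(req, 0);
        if (rc != 0)
                return rc;

        /* ... the request can now be packed and sent ... */
        return 0;
}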
+
+/**
+ * Initialize flavor settings for \a req, according to \a opcode.
+ *
+ * \note this could be called in two situations:
+ * - new request from ptlrpc_pre_req(), with proper @opcode
+ * - old request which changed ctx in the middle, with @opcode == 0
+ */
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode)
+{
+       struct ptlrpc_sec *sec;
+
+       LASSERT(req->rq_import);
+       LASSERT(req->rq_cli_ctx);
+       LASSERT(req->rq_cli_ctx->cc_sec);
+       LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0);
+
+       /* special security flags according to the opcode */
+       switch (opcode) {
+       case OST_READ:
+       case MDS_READPAGE:
+       case MGS_CONFIG_READ:
+       case OBD_IDX_READ:
+               req->rq_bulk_read = 1;
+               break;
+       case OST_WRITE:
+       case MDS_WRITEPAGE:
+               req->rq_bulk_write = 1;
+               break;
+       case SEC_CTX_INIT:
+               req->rq_ctx_init = 1;
+               break;
+       case SEC_CTX_FINI:
+               req->rq_ctx_fini = 1;
+               break;
+       case 0:
+               /* init/fini rpcs won't be resent, so they can't be here */
+               LASSERT(req->rq_ctx_init == 0);
+               LASSERT(req->rq_ctx_fini == 0);
+
+               /* cleanup flags, which should be recalculated */
+               req->rq_pack_udesc = 0;
+               req->rq_pack_bulk = 0;
+               break;
+       }
+
+       sec = req->rq_cli_ctx->cc_sec;
+
+       spin_lock(&sec->ps_lock);
+       req->rq_flvr = sec->ps_flvr;
+       spin_unlock(&sec->ps_lock);
+
+       /* force SVC_NULL for context initiation rpc, SVC_INTG for context
+        * destruction rpc */
+       if (unlikely(req->rq_ctx_init))
+               flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL);
+       else if (unlikely(req->rq_ctx_fini))
+               flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG);
+
+       /* user descriptor flag, null security can't do it anyway */
+       if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) &&
+           (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL))
+               req->rq_pack_udesc = 1;
+
+       /* bulk security flag */
+       if ((req->rq_bulk_read || req->rq_bulk_write) &&
+           sptlrpc_flavor_has_bulk(&req->rq_flvr))
+               req->rq_pack_bulk = 1;
+}
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req)
+{
+       if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV)
+               return;
+
+       LASSERT(req->rq_clrbuf);
+       if (req->rq_pool || !req->rq_reqbuf)
+               return;
+
+       OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len);
+       req->rq_reqbuf = NULL;
+       req->rq_reqbuf_len = 0;
+}
+
+/**
+ * Given an import \a imp, check whether the current user has a valid context
+ * or not. We may create a new context and try to refresh it, retrying
+ * repeatedly in case of non-fatal errors. Return 0 means success.
+ */
+int sptlrpc_import_check_ctx(struct obd_import *imp)
+{
+       struct ptlrpc_sec     *sec;
+       struct ptlrpc_cli_ctx *ctx;
+       struct ptlrpc_request *req = NULL;
+       int rc;
+       ENTRY;
+
+       might_sleep();
+
+       sec = sptlrpc_import_sec_ref(imp);
+       ctx = get_my_ctx(sec);
+       sptlrpc_sec_put(sec);
+
+       if (!ctx)
+               RETURN(-ENOMEM);
+
+       if (cli_ctx_is_eternal(ctx) ||
+           ctx->cc_ops->validate(ctx) == 0) {
+               sptlrpc_cli_ctx_put(ctx, 1);
+               RETURN(0);
+       }
+
+       if (cli_ctx_is_error(ctx)) {
+               sptlrpc_cli_ctx_put(ctx, 1);
+               RETURN(-EACCES);
+       }
+
+       OBD_ALLOC_PTR(req);
+       if (!req)
+               RETURN(-ENOMEM);
+
+       spin_lock_init(&req->rq_lock);
+       atomic_set(&req->rq_refcount, 10000);
+       INIT_LIST_HEAD(&req->rq_ctx_chain);
+       init_waitqueue_head(&req->rq_reply_waitq);
+       init_waitqueue_head(&req->rq_set_waitq);
+       req->rq_import = imp;
+       req->rq_flvr = sec->ps_flvr;
+       req->rq_cli_ctx = ctx;
+
+       rc = sptlrpc_req_refresh_ctx(req, 0);
+       LASSERT(list_empty(&req->rq_ctx_chain));
+       sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1);
+       OBD_FREE_PTR(req);
+
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client, to perform the pre-defined security transformation
+ * upon the request message of \a req. After this function is called,
+ * req->rq_reqmsg is still accessible as clear text.
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+       /* we wrap bulk request here because now we can be sure
+        * the context is uptodate.
+        */
+       if (req->rq_bulk) {
+               rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               LASSERT(ctx->cc_ops->sign);
+               rc = ctx->cc_ops->sign(ctx, req);
+               break;
+       case SPTLRPC_SVC_PRIV:
+               LASSERT(ctx->cc_ops->seal);
+               rc = ctx->cc_ops->seal(ctx, req);
+               break;
+       default:
+               LBUG();
+       }
+
+       if (rc == 0) {
+               LASSERT(req->rq_reqdata_len);
+               LASSERT(req->rq_reqdata_len % 8 == 0);
+               LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len);
+       }
+
+       RETURN(rc);
+}
+
+static int do_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(req->rq_repbuf);
+       LASSERT(req->rq_repdata);
+       LASSERT(req->rq_repmsg == NULL);
+
+       req->rq_rep_swab_mask = 0;
+
+       rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len);
+       switch (rc) {
+       case 1:
+               lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+       case 0:
+               break;
+       default:
+               CERROR("failed unpack reply: x"LPU64"\n", req->rq_xid);
+               RETURN(-EPROTO);
+       }
+
+       if (req->rq_repdata_len < sizeof(struct lustre_msg)) {
+               CERROR("replied data length %d too small\n",
+                      req->rq_repdata_len);
+               RETURN(-EPROTO);
+       }
+
+       if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) !=
+           SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
+               CERROR("reply policy %u doesn't match request policy %u\n",
+                      SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr),
+                      SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc));
+               RETURN(-EPROTO);
+       }
+
+       switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               LASSERT(ctx->cc_ops->verify);
+               rc = ctx->cc_ops->verify(ctx, req);
+               break;
+       case SPTLRPC_SVC_PRIV:
+               LASSERT(ctx->cc_ops->unseal);
+               rc = ctx->cc_ops->unseal(ctx, req);
+               break;
+       default:
+               LBUG();
+       }
+       LASSERT(rc || req->rq_repmsg || req->rq_resend);
+
+       if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL &&
+           !req->rq_ctx_init)
+               req->rq_rep_swab_mask = 0;
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client, to perform security transformation upon the reply
+ * message of \a req. After it returns successfully, req->rq_repmsg points to
+ * the reply message in clear text.
+ *
+ * \pre the reply buffer should have been un-posted from LNet, so nothing is
+ * going to change.
+ */
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_repbuf);
+       LASSERT(req->rq_repdata == NULL);
+       LASSERT(req->rq_repmsg == NULL);
+       LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len);
+
+       if (req->rq_reply_off == 0 &&
+           (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+               CERROR("real reply with offset 0\n");
+               return -EPROTO;
+       }
+
+       if (req->rq_reply_off % 8 != 0) {
+               CERROR("reply at odd offset %u\n", req->rq_reply_off);
+               return -EPROTO;
+       }
+
+       req->rq_repdata = (struct lustre_msg *)
+                               (req->rq_repbuf + req->rq_reply_off);
+       req->rq_repdata_len = req->rq_nob_received;
+
+       return do_cli_unwrap_reply(req);
+}
+
+/**
+ * Used by ptlrpc client, to perform security transformation upon the early
+ * reply message of \a req. We expect rq_reply_off to be 0 and
+ * rq_nob_received to be the early reply size.
+ *
+ * Because the receive buffer might still be posted, the reply data might
+ * change at any time, whether or not we're holding rq_lock. For this reason
+ * we allocate a separate ptlrpc_request and reply buffer for early reply
+ * processing.
+ *
+ * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request.
+ * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned
+ * \a *req_ret to release it.
+ * \retval -ev error number, and \a req_ret will not be set.
+ */
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+                                  struct ptlrpc_request **req_ret)
+{
+       struct ptlrpc_request  *early_req;
+       char               *early_buf;
+       int                  early_bufsz, early_size;
+       int                  rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(early_req);
+       if (early_req == NULL)
+               RETURN(-ENOMEM);
+
+       early_size = req->rq_nob_received;
+       early_bufsz = size_roundup_power2(early_size);
+       OBD_ALLOC_LARGE(early_buf, early_bufsz);
+       if (early_buf == NULL)
+               GOTO(err_req, rc = -ENOMEM);
+
+       /* sanity checks and data copy-out are done inside the spinlock */
+       spin_lock(&req->rq_lock);
+
+       if (req->rq_replied) {
+               spin_unlock(&req->rq_lock);
+               GOTO(err_buf, rc = -EALREADY);
+       }
+
+       LASSERT(req->rq_repbuf);
+       LASSERT(req->rq_repdata == NULL);
+       LASSERT(req->rq_repmsg == NULL);
+
+       if (req->rq_reply_off != 0) {
+               CERROR("early reply with offset %u\n", req->rq_reply_off);
+               spin_unlock(&req->rq_lock);
+               GOTO(err_buf, rc = -EPROTO);
+       }
+
+       if (req->rq_nob_received != early_size) {
+               /* even if another early reply arrived, the size should be the same */
+               CERROR("data size has changed from %u to %u\n",
+                      early_size, req->rq_nob_received);
+               spin_unlock(&req->rq_lock);
+               GOTO(err_buf, rc = -EINVAL);
+       }
+
+       if (req->rq_nob_received < sizeof(struct lustre_msg)) {
+               CERROR("early reply length %d too small\n",
+                      req->rq_nob_received);
+               spin_unlock(&req->rq_lock);
+               GOTO(err_buf, rc = -EALREADY);
+       }
+
+       memcpy(early_buf, req->rq_repbuf, early_size);
+       spin_unlock(&req->rq_lock);
+
+       spin_lock_init(&early_req->rq_lock);
+       early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx);
+       early_req->rq_flvr = req->rq_flvr;
+       early_req->rq_repbuf = early_buf;
+       early_req->rq_repbuf_len = early_bufsz;
+       early_req->rq_repdata = (struct lustre_msg *) early_buf;
+       early_req->rq_repdata_len = early_size;
+       early_req->rq_early = 1;
+       early_req->rq_reqmsg = req->rq_reqmsg;
+
+       rc = do_cli_unwrap_reply(early_req);
+       if (rc) {
+               DEBUG_REQ(D_ADAPTTO, early_req,
+                         "error %d unwrap early reply", rc);
+               GOTO(err_ctx, rc);
+       }
+
+       LASSERT(early_req->rq_repmsg);
+       *req_ret = early_req;
+       RETURN(0);
+
+err_ctx:
+       sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+err_buf:
+       OBD_FREE_LARGE(early_buf, early_bufsz);
+err_req:
+       OBD_FREE_PTR(early_req);
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client, to release a processed early reply \a early_req.
+ *
+ * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply().
+ */
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req)
+{
+       LASSERT(early_req->rq_repbuf);
+       LASSERT(early_req->rq_repdata);
+       LASSERT(early_req->rq_repmsg);
+
+       sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+       OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len);
+       OBD_FREE_PTR(early_req);
+}
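As a usage illustration only (not part of this patch), the two helpers above are
meant to be paired: unwrap to get a private copy of the early reply, inspect it,
then release it. The wrapper name handle_early_reply() below is hypothetical:

        static int handle_early_reply(struct ptlrpc_request *req)
        {
                struct ptlrpc_request *early_req;
                int rc;

                /* duplicate the request and reply buffer, since the real
                 * receive buffer may still be written to by LNet */
                rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
                if (rc)
                        return rc;

                /* early_req->rq_repmsg is now clear text; read it here */

                /* must be released with the matching helper */
                sptlrpc_cli_finish_early_reply(early_req);
                return 0;
        }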
+
+/**************************************************
+ * sec ID                                       *
+ **************************************************/
+
+/*
+ * a "fixed" sec (e.g. null) uses sec_id < 0
+ */
+static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1);
+
+int sptlrpc_get_next_secid(void)
+{
+       return atomic_inc_return(&sptlrpc_sec_id);
+}
+EXPORT_SYMBOL(sptlrpc_get_next_secid);
+
+/**************************************************
+ * client side high-level security APIs           *
+ **************************************************/
+
+static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid,
+                                  int grace, int force)
+{
+       struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+       LASSERT(policy->sp_cops);
+       LASSERT(policy->sp_cops->flush_ctx_cache);
+
+       return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force);
+}
+
+static void sec_cop_destroy_sec(struct ptlrpc_sec *sec)
+{
+       struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+       LASSERT_ATOMIC_ZERO(&sec->ps_refcount);
+       LASSERT_ATOMIC_ZERO(&sec->ps_nctx);
+       LASSERT(policy->sp_cops->destroy_sec);
+
+       CDEBUG(D_SEC, "%s@%p: being destroyed\n", sec->ps_policy->sp_name, sec);
+
+       policy->sp_cops->destroy_sec(sec);
+       sptlrpc_policy_put(policy);
+}
+
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec)
+{
+       sec_cop_destroy_sec(sec);
+}
+EXPORT_SYMBOL(sptlrpc_sec_destroy);
+
+static void sptlrpc_sec_kill(struct ptlrpc_sec *sec)
+{
+       LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+       if (sec->ps_policy->sp_cops->kill_sec) {
+               sec->ps_policy->sp_cops->kill_sec(sec);
+
+               sec_cop_flush_ctx_cache(sec, -1, 1, 1);
+       }
+}
+
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec)
+{
+       if (sec)
+               atomic_inc(&sec->ps_refcount);
+
+       return sec;
+}
+EXPORT_SYMBOL(sptlrpc_sec_get);
+
+void sptlrpc_sec_put(struct ptlrpc_sec *sec)
+{
+       if (sec) {
+               LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+               if (atomic_dec_and_test(&sec->ps_refcount)) {
+                       sptlrpc_gc_del_sec(sec);
+                       sec_cop_destroy_sec(sec);
+               }
+       }
+}
+EXPORT_SYMBOL(sptlrpc_sec_put);
+
+/*
+ * the policy module is responsible for taking a reference on the import
+ */
+static
+struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp,
+                                      struct ptlrpc_svc_ctx *svc_ctx,
+                                      struct sptlrpc_flavor *sf,
+                                      enum lustre_sec_part sp)
+{
+       struct ptlrpc_sec_policy *policy;
+       struct ptlrpc_sec       *sec;
+       char                  str[32];
+       ENTRY;
+
+       if (svc_ctx) {
+               LASSERT(imp->imp_dlm_fake == 1);
+
+               CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n",
+                      imp->imp_obd->obd_type->typ_name,
+                      imp->imp_obd->obd_name,
+                      sptlrpc_flavor2name(sf, str, sizeof(str)));
+
+               policy = sptlrpc_policy_get(svc_ctx->sc_policy);
+               sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY;
+       } else {
+               LASSERT(imp->imp_dlm_fake == 0);
+
+               CDEBUG(D_SEC, "%s %s: select security flavor %s\n",
+                      imp->imp_obd->obd_type->typ_name,
+                      imp->imp_obd->obd_name,
+                      sptlrpc_flavor2name(sf, str, sizeof(str)));
+
+               policy = sptlrpc_wireflavor2policy(sf->sf_rpc);
+               if (!policy) {
+                       CERROR("invalid flavor 0x%x\n", sf->sf_rpc);
+                       RETURN(NULL);
+               }
+       }
+
+       sec = policy->sp_cops->create_sec(imp, svc_ctx, sf);
+       if (sec) {
+               atomic_inc(&sec->ps_refcount);
+
+               sec->ps_part = sp;
+
+               if (sec->ps_gc_interval && policy->sp_cops->gc_ctx)
+                       sptlrpc_gc_add_sec(sec);
+       } else {
+               sptlrpc_policy_put(policy);
+       }
+
+       RETURN(sec);
+}
+
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp)
+{
+       struct ptlrpc_sec *sec;
+
+       spin_lock(&imp->imp_lock);
+       sec = sptlrpc_sec_get(imp->imp_sec);
+       spin_unlock(&imp->imp_lock);
+
+       return sec;
+}
+EXPORT_SYMBOL(sptlrpc_import_sec_ref);
+
+static void sptlrpc_import_sec_install(struct obd_import *imp,
+                                      struct ptlrpc_sec *sec)
+{
+       struct ptlrpc_sec *old_sec;
+
+       LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+       spin_lock(&imp->imp_lock);
+       old_sec = imp->imp_sec;
+       imp->imp_sec = sec;
+       spin_unlock(&imp->imp_lock);
+
+       if (old_sec) {
+               sptlrpc_sec_kill(old_sec);
+
+               /* balance the ref taken by this import */
+               sptlrpc_sec_put(old_sec);
+       }
+}
+
+static inline
+int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2)
+{
+       return (memcmp(sf1, sf2, sizeof(*sf1)) == 0);
+}
+
+static inline
+void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src)
+{
+       *dst = *src;
+}
+
+static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp,
+                                            struct ptlrpc_sec *sec,
+                                            struct sptlrpc_flavor *sf)
+{
+       char    str1[32], str2[32];
+
+       if (sec->ps_flvr.sf_flags != sf->sf_flags)
+               CDEBUG(D_SEC, "changing sec flags: %s -> %s\n",
+                      sptlrpc_secflags2str(sec->ps_flvr.sf_flags,
+                                           str1, sizeof(str1)),
+                      sptlrpc_secflags2str(sf->sf_flags,
+                                           str2, sizeof(str2)));
+
+       spin_lock(&sec->ps_lock);
+       flavor_copy(&sec->ps_flvr, sf);
+       spin_unlock(&sec->ps_lock);
+}
+
+/**
+ * Get an appropriate ptlrpc_sec for \a imp, according to the current
+ * configuration. When this is called, imp->imp_sec may or may not be NULL.
+ *
+ *  - regular import: \a svc_ctx should be NULL and \a flvr is ignored;
+ *  - reverse import: \a svc_ctx and \a flvr are obtained from incoming request.
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+                            struct ptlrpc_svc_ctx *svc_ctx,
+                            struct sptlrpc_flavor *flvr)
+{
+       struct ptlrpc_connection   *conn;
+       struct sptlrpc_flavor       sf;
+       struct ptlrpc_sec         *sec, *newsec;
+       enum lustre_sec_part    sp;
+       char                    str[24];
+       int                      rc = 0;
+       ENTRY;
+
+       might_sleep();
+
+       if (imp == NULL)
+               RETURN(0);
+
+       conn = imp->imp_connection;
+
+       if (svc_ctx == NULL) {
+               struct client_obd *cliobd = &imp->imp_obd->u.cli;
+               /*
+                * normal import: determine the flavor from the rule set,
+                * except for the mgc, whose flavor is predetermined.
+                */
+               if (cliobd->cl_sp_me == LUSTRE_SP_MGC)
+                       sf = cliobd->cl_flvr_mgc;
+               else
+                       sptlrpc_conf_choose_flavor(cliobd->cl_sp_me,
+                                                  cliobd->cl_sp_to,
+                                                  &cliobd->cl_target_uuid,
+                                                  conn->c_self, &sf);
+
+               sp = imp->imp_obd->u.cli.cl_sp_me;
+       } else {
+               /* reverse import: determine the flavor from the incoming request */
+               sf = *flvr;
+
+               if (sf.sf_rpc != SPTLRPC_FLVR_NULL)
+                       sf.sf_flags = PTLRPC_SEC_FL_REVERSE |
+                                     PTLRPC_SEC_FL_ROOTONLY;
+
+               sp = sptlrpc_target_sec_part(imp->imp_obd);
+       }
+
+       sec = sptlrpc_import_sec_ref(imp);
+       if (sec) {
+               char    str2[24];
+
+               if (flavor_equal(&sf, &sec->ps_flvr))
+                       GOTO(out, rc);
+
+               CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n",
+                      imp->imp_obd->obd_name,
+                      obd_uuid2str(&conn->c_remote_uuid),
+                      sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)),
+                      sptlrpc_flavor2name(&sf, str2, sizeof(str2)));
+
+               if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) ==
+                   SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) &&
+                   SPTLRPC_FLVR_MECH(sf.sf_rpc) ==
+                   SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) {
+                       sptlrpc_import_sec_adapt_inplace(imp, sec, &sf);
+                       GOTO(out, rc);
+               }
+       } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) !=
+                  SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) {
+               CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n",
+                      imp->imp_obd->obd_name,
+                      obd_uuid2str(&conn->c_remote_uuid),
+                      LNET_NIDNET(conn->c_self),
+                      sptlrpc_flavor2name(&sf, str, sizeof(str)));
+       }
+
+       mutex_lock(&imp->imp_sec_mutex);
+
+       newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp);
+       if (newsec) {
+               sptlrpc_import_sec_install(imp, newsec);
+       } else {
+               CERROR("import %s->%s: failed to create new sec\n",
+                      imp->imp_obd->obd_name,
+                      obd_uuid2str(&conn->c_remote_uuid));
+               rc = -EPERM;
+       }
+
+       mutex_unlock(&imp->imp_sec_mutex);
+out:
+       sptlrpc_sec_put(sec);
+       RETURN(rc);
+}
+
+void sptlrpc_import_sec_put(struct obd_import *imp)
+{
+       if (imp->imp_sec) {
+               sptlrpc_sec_kill(imp->imp_sec);
+
+               sptlrpc_sec_put(imp->imp_sec);
+               imp->imp_sec = NULL;
+       }
+}
+
+static void import_flush_ctx_common(struct obd_import *imp,
+                                   uid_t uid, int grace, int force)
+{
+       struct ptlrpc_sec *sec;
+
+       if (imp == NULL)
+               return;
+
+       sec = sptlrpc_import_sec_ref(imp);
+       if (sec == NULL)
+               return;
+
+       sec_cop_flush_ctx_cache(sec, uid, grace, force);
+       sptlrpc_sec_put(sec);
+}
+
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp)
+{
+       /* it's important to use grace mode; see the explanation in
+        * sptlrpc_req_refresh_ctx() */
+       import_flush_ctx_common(imp, 0, 1, 1);
+}
+
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp)
+{
+       import_flush_ctx_common(imp, current_uid(), 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx);
+
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp)
+{
+       import_flush_ctx_common(imp, -1, 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx);
+
+/**
+ * Used by ptlrpc client to allocate the request buffer of \a req. Upon
+ * successful return, req->rq_reqmsg points to a buffer of size \a msgsize.
+ */
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_policy *policy;
+       int rc;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_policy);
+       LASSERT(req->rq_reqmsg == NULL);
+       LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+       policy = ctx->cc_sec->ps_policy;
+       rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize);
+       if (!rc) {
+               LASSERT(req->rq_reqmsg);
+               LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+               /* zeroing preallocated buffer */
+               if (req->rq_pool)
+                       memset(req->rq_reqmsg, 0, msgsize);
+       }
+
+       return rc;
+}
+
+/**
+ * Used by ptlrpc client to free request buffer of \a req. After this
+ * req->rq_reqmsg is set to NULL and should not be accessed anymore.
+ */
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_policy *policy;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_policy);
+       LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+       if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL)
+               return;
+
+       policy = ctx->cc_sec->ps_policy;
+       policy->sp_cops->free_reqbuf(ctx->cc_sec, req);
+       req->rq_reqmsg = NULL;
+}
+
+/*
+ * NOTE caller must guarantee the buffer size is enough for the enlargement
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+                                 int segment, int newsize)
+{
+       void   *src, *dst;
+       int     oldsize, oldmsg_size, movesize;
+
+       LASSERT(segment < msg->lm_bufcount);
+       LASSERT(msg->lm_buflens[segment] <= newsize);
+
+       if (msg->lm_buflens[segment] == newsize)
+               return;
+
+       /* nothing to do if we are enlarging the last segment */
+       if (segment == msg->lm_bufcount - 1) {
+               msg->lm_buflens[segment] = newsize;
+               return;
+       }
+
+       oldsize = msg->lm_buflens[segment];
+
+       src = lustre_msg_buf(msg, segment + 1, 0);
+       msg->lm_buflens[segment] = newsize;
+       dst = lustre_msg_buf(msg, segment + 1, 0);
+       msg->lm_buflens[segment] = oldsize;
+
+       /* move from segment + 1 to end segment */
+       LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2);
+       oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+       movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg);
+       LASSERT(movesize >= 0);
+
+       if (movesize)
+               memmove(dst, src, movesize);
+
+       /* note we don't clear the area where the old data lived; it is not secret */
+
+       /* finally set new segment size */
+       msg->lm_buflens[segment] = newsize;
+}
+EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace);
+
+/**
+ * Used by ptlrpc client to enlarge the \a segment of the request message
+ * pointed to by req->rq_reqmsg to size \a newsize; all previously filled-in
+ * data is preserved after the enlargement. This must be called after the
+ * original request buffer has been allocated.
+ *
+ * \note After this is called, rq_reqmsg and rq_reqlen might have changed,
+ * so the caller should refresh its local pointers if needed.
+ */
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+                              int segment, int newsize)
+{
+       struct ptlrpc_cli_ctx    *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_cops   *cops;
+       struct lustre_msg       *msg = req->rq_reqmsg;
+
+       LASSERT(ctx);
+       LASSERT(msg);
+       LASSERT(msg->lm_bufcount > segment);
+       LASSERT(msg->lm_buflens[segment] <= newsize);
+
+       if (msg->lm_buflens[segment] == newsize)
+               return 0;
+
+       cops = ctx->cc_sec->ps_policy->sp_cops;
+       LASSERT(cops->enlarge_reqbuf);
+       return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize);
+}
+EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf);
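A minimal caller-side sketch (not part of this patch) of the pointer-refresh rule
from the note above; the segment index and size variables are arbitrary examples:

        rc = sptlrpc_cli_enlarge_reqbuf(req, segment, newsize);
        if (rc)
                return rc;
        /* rq_reqmsg may have been reallocated by the policy; refresh any
         * cached pointer into the request message */
        msg = req->rq_reqmsg;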
+
+/**
+ * Used by ptlrpc client to allocate reply buffer of \a req.
+ *
+ * \note After this, req->rq_repmsg is still not accessible.
+ */
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_policy *policy;
+       ENTRY;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_policy);
+
+       if (req->rq_repbuf)
+               RETURN(0);
+
+       policy = ctx->cc_sec->ps_policy;
+       RETURN(policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize));
+}
+
+/**
+ * Used by ptlrpc client to free reply buffer of \a req. After this
+ * req->rq_repmsg is set to NULL and should not be accessed anymore.
+ */
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_policy *policy;
+       ENTRY;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_policy);
+       LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+       if (req->rq_repbuf == NULL)
+               return;
+       LASSERT(req->rq_repbuf_len);
+
+       policy = ctx->cc_sec->ps_policy;
+       policy->sp_cops->free_repbuf(ctx->cc_sec, req);
+       req->rq_repmsg = NULL;
+       EXIT;
+}
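Taken together, the client-side helpers above cover a request's security
lifecycle. The sketch below is only a rough, hedged ordering of the security
calls (the actual sequencing lives elsewhere in ptlrpc and may differ in
detail); the send/receive step is elided:

        sptlrpc_req_set_flavor(req, opcode);              /* pick rq_flvr     */
        rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size);  /* rq_reqmsg usable */
        /* ... fill in rq_reqmsg ... */
        rc = sptlrpc_cli_alloc_repbuf(req, repmsg_size);  /* before posting   */
        rc = sptlrpc_cli_wrap_request(req);               /* sign or seal     */
        /* ... request sent, reply received ... */
        rc = sptlrpc_cli_unwrap_reply(req);               /* rq_repmsg usable */
        /* ... consume rq_repmsg ... */
        sptlrpc_cli_free_repbuf(req);
        sptlrpc_cli_free_reqbuf(req);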
+
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+                               struct ptlrpc_cli_ctx *ctx)
+{
+       struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy;
+
+       if (!policy->sp_cops->install_rctx)
+               return 0;
+       return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx);
+}
+
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+                               struct ptlrpc_svc_ctx *ctx)
+{
+       struct ptlrpc_sec_policy *policy = ctx->sc_policy;
+
+       if (!policy->sp_sops->install_rctx)
+               return 0;
+       return policy->sp_sops->install_rctx(imp, ctx);
+}
+
+/****************************************
+ * server side security                 *
+ ****************************************/
+
+static int flavor_allowed(struct sptlrpc_flavor *exp,
+                         struct ptlrpc_request *req)
+{
+       struct sptlrpc_flavor *flvr = &req->rq_flvr;
+
+       if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc)
+               return 1;
+
+       if ((req->rq_ctx_init || req->rq_ctx_fini) &&
+           SPTLRPC_FLVR_POLICY(exp->sf_rpc) ==
+           SPTLRPC_FLVR_POLICY(flvr->sf_rpc) &&
+           SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc))
+               return 1;
+
+       return 0;
+}
+
+#define EXP_FLVR_UPDATE_EXPIRE      (OBD_TIMEOUT_DEFAULT + 10)
+
+/**
+ * Given an export \a exp, check whether the flavor of the incoming \a req
+ * is allowed by the export \a exp. The main logic deals with handling
+ * changing configurations. Return 0 means success.
+ */
+int sptlrpc_target_export_check(struct obd_export *exp,
+                               struct ptlrpc_request *req)
+{
+       struct sptlrpc_flavor   flavor;
+
+       if (exp == NULL)
+               return 0;
+
+       /* a client-side export has no imp_reverse, so skip it.
+        * FIXME maybe we should check the flavor here as well??? */
+       if (exp->exp_imp_reverse == NULL)
+               return 0;
+
+       /* don't care about ctx fini rpc */
+       if (req->rq_ctx_fini)
+               return 0;
+
+       spin_lock(&exp->exp_lock);
+
+       /* if the flavor just changed (exp->exp_flvr_changed != 0), we wait for
+        * the first req with the new flavor, then treat it as the current
+        * flavor, and adapt the reverse sec according to it.
+        * note the first rpc with the new flavor might not carry a root ctx,
+        * in which case delay the sec_adapt by leaving exp_flvr_adapt == 1. */
+       if (unlikely(exp->exp_flvr_changed) &&
+           flavor_allowed(&exp->exp_flvr_old[1], req)) {
+               /* make the new flavor "current", and the old ones
+                * about-to-expire */
+               CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp,
+                      exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc);
+               flavor = exp->exp_flvr_old[1];
+               exp->exp_flvr_old[1] = exp->exp_flvr_old[0];
+               exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0];
+               exp->exp_flvr_old[0] = exp->exp_flvr;
+               exp->exp_flvr_expire[0] = cfs_time_current_sec() +
+                                         EXP_FLVR_UPDATE_EXPIRE;
+               exp->exp_flvr = flavor;
+
+               /* flavor change finished */
+               exp->exp_flvr_changed = 0;
+               LASSERT(exp->exp_flvr_adapt == 1);
+
+               /* if it's gss, we are only interested in root ctx init */
+               if (req->rq_auth_gss &&
+                   !(req->rq_ctx_init &&
+                     (req->rq_auth_usr_root || req->rq_auth_usr_mdt ||
+                      req->rq_auth_usr_ost))) {
+                       spin_unlock(&exp->exp_lock);
+                       CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n",
+                              req->rq_auth_gss, req->rq_ctx_init,
+                              req->rq_auth_usr_root, req->rq_auth_usr_mdt,
+                              req->rq_auth_usr_ost);
+                       return 0;
+               }
+
+               exp->exp_flvr_adapt = 0;
+               spin_unlock(&exp->exp_lock);
+
+               return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+                                               req->rq_svc_ctx, &flavor);
+       }
+
+       /* if it equals the current flavor, we accept it, but we still need
+        * to deal with the reverse sec/ctx */
+       if (likely(flavor_allowed(&exp->exp_flvr, req))) {
+               /* most cases should return here; we are only interested in
+                * gss root ctx init */
+               if (!req->rq_auth_gss || !req->rq_ctx_init ||
+                   (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+                    !req->rq_auth_usr_ost)) {
+                       spin_unlock(&exp->exp_lock);
+                       return 0;
+               }
+
+               /* if the flavor just changed, we should not proceed; just
+                * leave it, the current flavor will be discovered and
+                * replaced shortly, and let _this_ rpc pass through */
+               if (exp->exp_flvr_changed) {
+                       LASSERT(exp->exp_flvr_adapt);
+                       spin_unlock(&exp->exp_lock);
+                       return 0;
+               }
+
+               if (exp->exp_flvr_adapt) {
+                       exp->exp_flvr_adapt = 0;
+                       CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n",
+                              exp, exp->exp_flvr.sf_rpc,
+                              exp->exp_flvr_old[0].sf_rpc,
+                              exp->exp_flvr_old[1].sf_rpc);
+                       flavor = exp->exp_flvr;
+                       spin_unlock(&exp->exp_lock);
+
+                       return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+                                                       req->rq_svc_ctx,
+                                                       &flavor);
+               } else {
+                       CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, "
+                              "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc,
+                              exp->exp_flvr_old[0].sf_rpc,
+                              exp->exp_flvr_old[1].sf_rpc);
+                       spin_unlock(&exp->exp_lock);
+
+                       return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse,
+                                                          req->rq_svc_ctx);
+               }
+       }
+
+       if (exp->exp_flvr_expire[0]) {
+               if (exp->exp_flvr_expire[0] >= cfs_time_current_sec()) {
+                       if (flavor_allowed(&exp->exp_flvr_old[0], req)) {
+                               CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the "
+                                      "middle one ("CFS_DURATION_T")\n", exp,
+                                      exp->exp_flvr.sf_rpc,
+                                      exp->exp_flvr_old[0].sf_rpc,
+                                      exp->exp_flvr_old[1].sf_rpc,
+                                      exp->exp_flvr_expire[0] -
+                                               cfs_time_current_sec());
+                               spin_unlock(&exp->exp_lock);
+                               return 0;
+                       }
+               } else {
+                       CDEBUG(D_SEC, "mark middle expired\n");
+                       exp->exp_flvr_expire[0] = 0;
+               }
+               CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp,
+                      exp->exp_flvr.sf_rpc,
+                      exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+                      req->rq_flvr.sf_rpc);
+       }
+
+       /* now it doesn't match the current flavor; the only chance to accept
+        * it is if it matches an old flavor which has not expired. */
+       if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) {
+               if (exp->exp_flvr_expire[1] >= cfs_time_current_sec()) {
+                       if (flavor_allowed(&exp->exp_flvr_old[1], req)) {
+                               CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the "
+                                      "oldest one ("CFS_DURATION_T")\n", exp,
+                                      exp->exp_flvr.sf_rpc,
+                                      exp->exp_flvr_old[0].sf_rpc,
+                                      exp->exp_flvr_old[1].sf_rpc,
+                                      exp->exp_flvr_expire[1] -
+                                               cfs_time_current_sec());
+                               spin_unlock(&exp->exp_lock);
+                               return 0;
+                       }
+               } else {
+                       CDEBUG(D_SEC, "mark oldest expired\n");
+                       exp->exp_flvr_expire[1] = 0;
+               }
+               CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n",
+                      exp, exp->exp_flvr.sf_rpc,
+                      exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+                      req->rq_flvr.sf_rpc);
+       } else {
+               CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n",
+                      exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc,
+                      exp->exp_flvr_old[1].sf_rpc);
+       }
+
+       spin_unlock(&exp->exp_lock);
+
+       CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with "
+             "unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n",
+             exp, exp->exp_obd->obd_name,
+             req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini,
+             req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost,
+             req->rq_flvr.sf_rpc,
+             exp->exp_flvr.sf_rpc,
+             exp->exp_flvr_old[0].sf_rpc,
+             exp->exp_flvr_expire[0] ?
+             (unsigned long) (exp->exp_flvr_expire[0] -
+                              cfs_time_current_sec()) : 0,
+             exp->exp_flvr_old[1].sf_rpc,
+             exp->exp_flvr_expire[1] ?
+             (unsigned long) (exp->exp_flvr_expire[1] -
+                              cfs_time_current_sec()) : 0);
+       return -EACCES;
+}
+EXPORT_SYMBOL(sptlrpc_target_export_check);
+
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+                                     struct sptlrpc_rule_set *rset)
+{
+       struct obd_export       *exp;
+       struct sptlrpc_flavor    new_flvr;
+
+       LASSERT(obd);
+
+       spin_lock(&obd->obd_dev_lock);
+
+       list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
+               if (exp->exp_connection == NULL)
+                       continue;
+
+               /* note if this export just had its flavor updated
+                * (exp_flvr_changed == 1), this will override the
+                * previous one. */
+               spin_lock(&exp->exp_lock);
+               sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer,
+                                            exp->exp_connection->c_peer.nid,
+                                            &new_flvr);
+               if (exp->exp_flvr_changed ||
+                   !flavor_equal(&new_flvr, &exp->exp_flvr)) {
+                       exp->exp_flvr_old[1] = new_flvr;
+                       exp->exp_flvr_expire[1] = 0;
+                       exp->exp_flvr_changed = 1;
+                       exp->exp_flvr_adapt = 1;
+
+                       CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n",
+                              exp, sptlrpc_part2name(exp->exp_sp_peer),
+                              exp->exp_flvr.sf_rpc,
+                              exp->exp_flvr_old[1].sf_rpc);
+               }
+               spin_unlock(&exp->exp_lock);
+       }
+
+       spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor);
+
+static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc)
+{
+       /* peer's claim is unreliable unless gss is being used */
+       if (!req->rq_auth_gss || svc_rc == SECSVC_DROP)
+               return svc_rc;
+
+       switch (req->rq_sp_from) {
+       case LUSTRE_SP_CLI:
+               if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) {
+                       DEBUG_REQ(D_ERROR, req, "faked source CLI");
+                       svc_rc = SECSVC_DROP;
+               }
+               break;
+       case LUSTRE_SP_MDT:
+               if (!req->rq_auth_usr_mdt) {
+                       DEBUG_REQ(D_ERROR, req, "faked source MDT");
+                       svc_rc = SECSVC_DROP;
+               }
+               break;
+       case LUSTRE_SP_OST:
+               if (!req->rq_auth_usr_ost) {
+                       DEBUG_REQ(D_ERROR, req, "faked source OST");
+                       svc_rc = SECSVC_DROP;
+               }
+               break;
+       case LUSTRE_SP_MGS:
+       case LUSTRE_SP_MGC:
+               if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+                   !req->rq_auth_usr_ost) {
+                       DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS");
+                       svc_rc = SECSVC_DROP;
+               }
+               break;
+       case LUSTRE_SP_ANY:
+       default:
+               DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from);
+               svc_rc = SECSVC_DROP;
+       }
+
+       return svc_rc;
+}
+
+/**
+ * Used by ptlrpc server, to perform transformation upon the request message
+ * of incoming \a req. This must be the first thing done with an incoming
+ * request in the ptlrpc layer.
+ *
+ * \retval SECSVC_OK success, and req->rq_reqmsg points to the request message
+ * in clear text, whose size is req->rq_reqlen; req->rq_svc_ctx is also set.
+ * \retval SECSVC_COMPLETE success, the request has been fully processed, and
+ * reply message has been prepared.
+ * \retval SECSVC_DROP failed, this request should be dropped.
+ */
+int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req)
+{
+       struct ptlrpc_sec_policy *policy;
+       struct lustre_msg       *msg = req->rq_reqbuf;
+       int                    rc;
+       ENTRY;
+
+       LASSERT(msg);
+       LASSERT(req->rq_reqmsg == NULL);
+       LASSERT(req->rq_repmsg == NULL);
+       LASSERT(req->rq_svc_ctx == NULL);
+
+       req->rq_req_swab_mask = 0;
+
+       rc = __lustre_unpack_msg(msg, req->rq_reqdata_len);
+       switch (rc) {
+       case 1:
+               lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+       case 0:
+               break;
+       default:
+               CERROR("error unpacking request from %s x"LPU64"\n",
+                      libcfs_id2str(req->rq_peer), req->rq_xid);
+               RETURN(SECSVC_DROP);
+       }
+
+       req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr);
+       req->rq_sp_from = LUSTRE_SP_ANY;
+       req->rq_auth_uid = INVALID_UID;
+       req->rq_auth_mapped_uid = INVALID_UID;
+
+       policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc);
+       if (!policy) {
+               CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               RETURN(SECSVC_DROP);
+       }
+
+       LASSERT(policy->sp_sops->accept);
+       rc = policy->sp_sops->accept(req);
+       sptlrpc_policy_put(policy);
+       LASSERT(req->rq_reqmsg || rc != SECSVC_OK);
+       LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP);
+
+       /*
+        * if it's not the null flavor (which means an embedded packed msg),
+        * reset the swab mask for the upcoming inner msg unpacking.
+        */
+       if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL)
+               req->rq_req_swab_mask = 0;
+
+       /* sanity check for the request source */
+       rc = sptlrpc_svc_check_from(req, rc);
+       RETURN(rc);
+}
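The three return values documented above are intended to be dispatched on by the
service code. A schematic sketch (not an excerpt of the actual ptlrpc service
loop; the labels are hypothetical):

        rc = sptlrpc_svc_unwrap_request(req);
        switch (rc) {
        case SECSVC_OK:         /* rq_reqmsg is clear text, keep processing */
                break;
        case SECSVC_COMPLETE:   /* reply already prepared, just send it */
                goto send_reply;
        case SECSVC_DROP:       /* unverifiable or malformed, drop it */
        default:
                goto drop_req;
        }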
+
+/**
+ * Used by ptlrpc server, to allocate the reply buffer for \a req. On success,
+ * req->rq_reply_state is set, and req->rq_reply_state->rs_msg points to
+ * a buffer of \a msglen size.
+ */
+int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+       struct ptlrpc_sec_policy *policy;
+       struct ptlrpc_reply_state *rs;
+       int rc;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_svc_ctx->sc_policy);
+
+       policy = req->rq_svc_ctx->sc_policy;
+       LASSERT(policy->sp_sops->alloc_rs);
+
+       rc = policy->sp_sops->alloc_rs(req, msglen);
+       if (unlikely(rc == -ENOMEM)) {
+               /* failed alloc, try emergency pool */
+               rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_svcpt);
+               if (rs == NULL)
+                       RETURN(-ENOMEM);
+
+               req->rq_reply_state = rs;
+               rc = policy->sp_sops->alloc_rs(req, msglen);
+               if (rc) {
+                       lustre_put_emerg_rs(rs);
+                       req->rq_reply_state = NULL;
+               }
+       }
+
+       LASSERT(rc != 0 ||
+               (req->rq_reply_state && req->rq_reply_state->rs_msg));
+
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to perform transformation upon reply message.
+ *
+ * \post req->rq_reply_off is set to the appropriate server-controlled reply offset.
+ * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible.
+ */
+int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req)
+{
+       struct ptlrpc_sec_policy *policy;
+       int rc;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_svc_ctx->sc_policy);
+
+       policy = req->rq_svc_ctx->sc_policy;
+       LASSERT(policy->sp_sops->authorize);
+
+       rc = policy->sp_sops->authorize(req);
+       LASSERT(rc || req->rq_reply_state->rs_repdata_len);
+
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to free reply_state.
+ */
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_sec_policy *policy;
+       unsigned int prealloc;
+       ENTRY;
+
+       LASSERT(rs->rs_svc_ctx);
+       LASSERT(rs->rs_svc_ctx->sc_policy);
+
+       policy = rs->rs_svc_ctx->sc_policy;
+       LASSERT(policy->sp_sops->free_rs);
+
+       prealloc = rs->rs_prealloc;
+       policy->sp_sops->free_rs(rs);
+
+       if (prealloc)
+               lustre_put_emerg_rs(rs);
+       EXIT;
+}
+
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req)
+{
+       struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+       if (ctx != NULL)
+               atomic_inc(&ctx->sc_refcount);
+}
+
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req)
+{
+       struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+       if (ctx == NULL)
+               return;
+
+       LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+       if (atomic_dec_and_test(&ctx->sc_refcount)) {
+               if (ctx->sc_policy->sp_sops->free_ctx)
+                       ctx->sc_policy->sp_sops->free_ctx(ctx);
+       }
+       req->rq_svc_ctx = NULL;
+}
+
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req)
+{
+       struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+       if (ctx == NULL)
+               return;
+
+       LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+       if (ctx->sc_policy->sp_sops->invalidate_ctx)
+               ctx->sc_policy->sp_sops->invalidate_ctx(ctx);
+}
+EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate);
+
+/****************************************
+ * bulk security                       *
+ ****************************************/
+
+/**
+ * Perform transformation upon bulk data pointed by \a desc. This is called
+ * before transforming the request message.
+ */
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_cli_ctx *ctx;
+
+       LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+       if (!req->rq_pack_bulk)
+               return 0;
+
+       ctx = req->rq_cli_ctx;
+       if (ctx->cc_ops->wrap_bulk)
+               return ctx->cc_ops->wrap_bulk(ctx, req, desc);
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk);
+
+/**
+ * This is called after unwrapping the reply message.
+ * Returns the number of bytes of actual plain text received, or an error code.
+ */
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+                                struct ptlrpc_bulk_desc *desc,
+                                int nob)
+{
+       struct ptlrpc_cli_ctx  *ctx;
+       int                  rc;
+
+       LASSERT(req->rq_bulk_read && !req->rq_bulk_write);
+
+       if (!req->rq_pack_bulk)
+               return desc->bd_nob_transferred;
+
+       ctx = req->rq_cli_ctx;
+       if (ctx->cc_ops->unwrap_bulk) {
+               rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+               if (rc < 0)
+                       return rc;
+       }
+       return desc->bd_nob_transferred;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read);
+
+/**
+ * This is called after unwrapping the reply message.
+ * Returns 0 on success, or an error code.
+ */
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+                                 struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_cli_ctx  *ctx;
+       int                  rc;
+
+       LASSERT(!req->rq_bulk_read && req->rq_bulk_write);
+
+       if (!req->rq_pack_bulk)
+               return 0;
+
+       ctx = req->rq_cli_ctx;
+       if (ctx->cc_ops->unwrap_bulk) {
+               rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+               if (rc < 0)
+                       return rc;
+       }
+
+       /*
+        * if everything goes right, nob should equal nob_transferred.
+        * in privacy mode, nob_transferred needs to be adjusted.
+        */
+       if (desc->bd_nob != desc->bd_nob_transferred) {
+               CERROR("nob %d doesn't match transferred nob %d",
+                      desc->bd_nob, desc->bd_nob_transferred);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write);
+
+
+/****************************************
+ * user descriptor helpers           *
+ ****************************************/
+
+int sptlrpc_current_user_desc_size(void)
+{
+       int ngroups;
+
+       ngroups = current_ngroups;
+
+       if (ngroups > LUSTRE_MAX_GROUPS)
+               ngroups = LUSTRE_MAX_GROUPS;
+       return sptlrpc_user_desc_size(ngroups);
+}
+EXPORT_SYMBOL(sptlrpc_current_user_desc_size);
+
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset)
+{
+       struct ptlrpc_user_desc *pud;
+
+       pud = lustre_msg_buf(msg, offset, 0);
+
+       pud->pud_uid = current_uid();
+       pud->pud_gid = current_gid();
+       pud->pud_fsuid = current_fsuid();
+       pud->pud_fsgid = current_fsgid();
+       pud->pud_cap = cfs_curproc_cap_pack();
+       pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4;
+
+       task_lock(current);
+       if (pud->pud_ngroups > current_ngroups)
+               pud->pud_ngroups = current_ngroups;
+       memcpy(pud->pud_groups, current_cred()->group_info->blocks[0],
+              pud->pud_ngroups * sizeof(__u32));
+       task_unlock(current);
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_pack_user_desc);
+
+int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed)
+{
+       struct ptlrpc_user_desc *pud;
+       int                   i;
+
+       pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+       if (!pud)
+               return -EINVAL;
+
+       if (swabbed) {
+               __swab32s(&pud->pud_uid);
+               __swab32s(&pud->pud_gid);
+               __swab32s(&pud->pud_fsuid);
+               __swab32s(&pud->pud_fsgid);
+               __swab32s(&pud->pud_cap);
+               __swab32s(&pud->pud_ngroups);
+       }
+
+       if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) {
+               CERROR("%u groups is too large\n", pud->pud_ngroups);
+               return -EINVAL;
+       }
+
+       if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) >
+           msg->lm_buflens[offset]) {
+               CERROR("%u groups are claimed but bufsize only %u\n",
+                      pud->pud_ngroups, msg->lm_buflens[offset]);
+               return -EINVAL;
+       }
+
+       if (swabbed) {
+               for (i = 0; i < pud->pud_ngroups; i++)
+                       __swab32s(&pud->pud_groups[i]);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unpack_user_desc);
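A round-trip sketch (not part of this patch) of the three helpers above; offset,
desc_size and swabbed are hypothetical caller variables, since the actual segment
offset is policy-dependent:

        /* client side: size the descriptor segment, then fill it in after
         * the request message has been packed */
        desc_size = sptlrpc_current_user_desc_size();
        /* ... reserve a segment of desc_size bytes at `offset' ... */
        rc = sptlrpc_pack_user_desc(req->rq_reqmsg, offset);

        /* server side: validate and, if needed, byte-swap what arrived;
         * `swabbed' is the swab flag the caller determined for this msg */
        rc = sptlrpc_unpack_user_desc(msg, offset, swabbed);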
+
+/****************************************
+ * misc helpers                         *
+ ****************************************/
+
+const char * sec2target_str(struct ptlrpc_sec *sec)
+{
+       if (!sec || !sec->ps_import || !sec->ps_import->imp_obd)
+               return "*";
+       if (sec_is_reverse(sec))
+               return "c";
+       return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid);
+}
+EXPORT_SYMBOL(sec2target_str);
+
+/*
+ * return true if the bulk data is protected
+ */
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr)
+{
+       switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+       case SPTLRPC_BULK_SVC_INTG:
+       case SPTLRPC_BULK_SVC_PRIV:
+               return 1;
+       default:
+               return 0;
+       }
+}
+EXPORT_SYMBOL(sptlrpc_flavor_has_bulk);
+
+/****************************************
+ * crypto API helper/alloc blkcipher    *
+ ****************************************/
+
+/****************************************
+ * initialize/finalize           *
+ ****************************************/
+
+int sptlrpc_init(void)
+{
+       int rc;
+
+       rwlock_init(&policy_lock);
+
+       rc = sptlrpc_gc_init();
+       if (rc)
+               goto out;
+
+       rc = sptlrpc_conf_init();
+       if (rc)
+               goto out_gc;
+
+       rc = sptlrpc_enc_pool_init();
+       if (rc)
+               goto out_conf;
+
+       rc = sptlrpc_null_init();
+       if (rc)
+               goto out_pool;
+
+       rc = sptlrpc_plain_init();
+       if (rc)
+               goto out_null;
+
+       rc = sptlrpc_lproc_init();
+       if (rc)
+               goto out_plain;
+
+       return 0;
+
+out_plain:
+       sptlrpc_plain_fini();
+out_null:
+       sptlrpc_null_fini();
+out_pool:
+       sptlrpc_enc_pool_fini();
+out_conf:
+       sptlrpc_conf_fini();
+out_gc:
+       sptlrpc_gc_fini();
+out:
+       return rc;
+}
+
+void sptlrpc_fini(void)
+{
+       sptlrpc_lproc_fini();
+       sptlrpc_plain_fini();
+       sptlrpc_null_fini();
+       sptlrpc_enc_pool_fini();
+       sptlrpc_conf_fini();
+       sptlrpc_gc_fini();
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
new file mode 100644 (file)
index 0000000..60ab2ea
--- /dev/null
@@ -0,0 +1,881 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_bulk.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/****************************************
+ * bulk encryption page pools           *
+ ****************************************/
+
+
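+/* each pool is one page holding an array of struct page pointers, so a
+ * single pool can track PTRS_PER_PAGE (PAGE_CACHE_SIZE / sizeof(void *))
+ * pages */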
+#define PTRS_PER_PAGE   (PAGE_CACHE_SIZE / sizeof(void *))
+#define PAGES_PER_POOL  (PTRS_PER_PAGE)
+
+#define IDLE_IDX_MAX       (100)
+#define IDLE_IDX_WEIGHT         (3)
+
+#define CACHE_QUIESCENT_PERIOD  (20)
+
+static struct ptlrpc_enc_page_pool {
+       /*
+        * constants
+        */
+       unsigned long    epp_max_pages;   /* maximum pages the pools can hold, const */
+       unsigned int     epp_max_pools;   /* number of pools, const */
+
+       /*
+        * wait queue in case of not enough free pages.
+        */
+       wait_queue_head_t      epp_waitq;       /* waiting threads */
+       unsigned int     epp_waitqlen;    /* wait queue length */
+       unsigned long    epp_pages_short; /* # of pages wanted by in-queue users */
+       unsigned int     epp_growing:1;   /* set while pages are being added */
+
+       /*
+        * how idle the pools are, from 0 to IDLE_IDX_MAX.
+        * this is updated each time pages are taken from the pools rather
+        * than on a timer, so the idle_idx may stay low even after the
+        * system has been idle for a while, as long as there was no pool
+        * activity.
+        */
+       unsigned long    epp_idle_idx;
+
+       /* time of the last shrink due to memory pressure */
+       long         epp_last_shrink;
+       long         epp_last_access;
+
+       /*
+        * in-pool pages bookkeeping
+        */
+       spinlock_t       epp_lock;         /* protect following fields */
+       unsigned long    epp_total_pages; /* total pages in pools */
+       unsigned long    epp_free_pages;  /* current pages available */
+
+       /*
+        * statistics
+        */
+       unsigned long    epp_st_max_pages;      /* # of pages ever reached */
+       unsigned int     epp_st_grows;    /* # of grows */
+       unsigned int     epp_st_grow_fails;     /* # of add pages failures */
+       unsigned int     epp_st_shrinks;        /* # of shrinks */
+       unsigned long    epp_st_access;  /* # of accesses */
+       unsigned long    epp_st_missings;       /* # of cache misses */
+       unsigned long    epp_st_lowfree;        /* lowest free pages reached */
+       unsigned int     epp_st_max_wqlen;      /* highest waitqueue length */
+       cfs_time_t       epp_st_max_wait;       /* in jiffies */
+       /*
+        * pointers to pools
+        */
+       struct page    ***epp_pools;
+} page_pools;
+
+/*
+ * memory shrinker
+ */
+const int pools_shrinker_seeks = DEFAULT_SEEKS;
+static struct shrinker *pools_shrinker = NULL;
+
+
+/*
+ * /proc/fs/lustre/sptlrpc/encrypt_page_pools
+ */
+int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count,
+                              int *eof, void *data)
+{
+       int     rc;
+
+       spin_lock(&page_pools.epp_lock);
+
+       rc = snprintf(page, count,
+                     "physical pages:    %lu\n"
+                     "pages per pool:    %lu\n"
+                     "max pages:              %lu\n"
+                     "max pools:              %u\n"
+                     "total pages:          %lu\n"
+                     "total free:            %lu\n"
+                     "idle index:            %lu/100\n"
+                     "last shrink:          %lds\n"
+                     "last access:          %lds\n"
+                     "max pages reached:       %lu\n"
+                     "grows:              %u\n"
+                     "grows failure:      %u\n"
+                     "shrinks:          %u\n"
+                     "cache access:        %lu\n"
+                     "cache missing:      %lu\n"
+                     "low free mark:      %lu\n"
+                     "max waitqueue depth:     %u\n"
+                     "max wait time:      "CFS_TIME_T"/%u\n"
+                     ,
+                     num_physpages,
+                     PAGES_PER_POOL,
+                     page_pools.epp_max_pages,
+                     page_pools.epp_max_pools,
+                     page_pools.epp_total_pages,
+                     page_pools.epp_free_pages,
+                     page_pools.epp_idle_idx,
+                     cfs_time_current_sec() - page_pools.epp_last_shrink,
+                     cfs_time_current_sec() - page_pools.epp_last_access,
+                     page_pools.epp_st_max_pages,
+                     page_pools.epp_st_grows,
+                     page_pools.epp_st_grow_fails,
+                     page_pools.epp_st_shrinks,
+                     page_pools.epp_st_access,
+                     page_pools.epp_st_missings,
+                     page_pools.epp_st_lowfree,
+                     page_pools.epp_st_max_wqlen,
+                     page_pools.epp_st_max_wait, HZ
+                    );
+
+       spin_unlock(&page_pools.epp_lock);
+       return rc;
+}
+
+static void enc_pools_release_free_pages(long npages)
+{
+       int     p_idx, g_idx;
+       int     p_idx_max1, p_idx_max2;
+
+       LASSERT(npages > 0);
+       LASSERT(npages <= page_pools.epp_free_pages);
+       LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages);
+
+       /* max pool index before the release */
+       p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL;
+
+       page_pools.epp_free_pages -= npages;
+       page_pools.epp_total_pages -= npages;
+
+       /* max pool index after the release */
+       p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 :
+                    ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL);
+
+       p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+       g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+       LASSERT(page_pools.epp_pools[p_idx]);
+
+       while (npages--) {
+               LASSERT(page_pools.epp_pools[p_idx]);
+               LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+
+               __free_page(page_pools.epp_pools[p_idx][g_idx]);
+               page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+               if (++g_idx == PAGES_PER_POOL) {
+                       p_idx++;
+                       g_idx = 0;
+               }
+       }
+
+       /* free unused pools */
+       while (p_idx_max1 < p_idx_max2) {
+               LASSERT(page_pools.epp_pools[p_idx_max2]);
+               OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_CACHE_SIZE);
+               page_pools.epp_pools[p_idx_max2] = NULL;
+               p_idx_max2--;
+       }
+}
+
+/*
+ * could be called frequently for query (@nr_to_scan == 0).
+ * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
+ */
+static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+       if (unlikely(shrink_param(sc, nr_to_scan) != 0)) {
+               spin_lock(&page_pools.epp_lock);
+               shrink_param(sc, nr_to_scan) = min_t(unsigned long,
+                                                  shrink_param(sc, nr_to_scan),
+                                                  page_pools.epp_free_pages -
+                                                  PTLRPC_MAX_BRW_PAGES);
+               if (shrink_param(sc, nr_to_scan) > 0) {
+                       enc_pools_release_free_pages(shrink_param(sc,
+                                                                 nr_to_scan));
+                       CDEBUG(D_SEC, "released %ld pages, %ld left\n",
+                              (long)shrink_param(sc, nr_to_scan),
+                              page_pools.epp_free_pages);
+
+                       page_pools.epp_st_shrinks++;
+                       page_pools.epp_last_shrink = cfs_time_current_sec();
+               }
+               spin_unlock(&page_pools.epp_lock);
+       }
+
+       /*
+        * if there has been no pool access for a long time, we consider the
+        * pools fully idle. a little race here is fine.
+        */
+       if (unlikely(cfs_time_current_sec() - page_pools.epp_last_access >
+                    CACHE_QUIESCENT_PERIOD)) {
+               spin_lock(&page_pools.epp_lock);
+               page_pools.epp_idle_idx = IDLE_IDX_MAX;
+               spin_unlock(&page_pools.epp_lock);
+       }
+
+       LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);
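+       /* report the free pages above the PTLRPC_MAX_BRW_PAGES reserve,
+        * scaled by (IDLE_IDX_MAX - epp_idle_idx) / IDLE_IDX_MAX */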
+       return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) *
+               (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX;
+}
+
+static inline
+int npages_to_npools(unsigned long npages)
+{
+       return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL);
+}
+
+/*
+ * return how many pages were cleaned up.
+ */
+static unsigned long enc_pools_cleanup(struct page ***pools, int npools)
+{
+       unsigned long cleaned = 0;
+       int        i, j;
+
+       for (i = 0; i < npools; i++) {
+               if (pools[i]) {
+                       for (j = 0; j < PAGES_PER_POOL; j++) {
+                               if (pools[i][j]) {
+                                       __free_page(pools[i][j]);
+                                       cleaned++;
+                               }
+                       }
+                       OBD_FREE(pools[i], PAGE_CACHE_SIZE);
+                       pools[i] = NULL;
+               }
+       }
+
+       return cleaned;
+}
+
+/*
+ * merge the @npools pools pointed to by @pools, which contain @npages new
+ * pages, into the current pools.
+ *
+ * we could avoid most of the memory copying with some tricks, but we choose
+ * the simplest approach to avoid complexity. this is not called frequently.
+ */
+static void enc_pools_insert(struct page ***pools, int npools, int npages)
+{
+       int     freeslot;
+       int     op_idx, np_idx, og_idx, ng_idx;
+       int     cur_npools, end_npools;
+
+       LASSERT(npages > 0);
+       LASSERT(page_pools.epp_total_pages + npages <= page_pools.epp_max_pages);
+       LASSERT(npages_to_npools(npages) == npools);
+       LASSERT(page_pools.epp_growing);
+
+       spin_lock(&page_pools.epp_lock);
+
+       /*
+        * (1) fill all the free slots of current pools.
+        */
+       /* free slots are those left by pages currently lent out, plus the
+        * unused entries with index >= total_pages at the tail of the last
+        * pool. */
+       freeslot = page_pools.epp_total_pages % PAGES_PER_POOL;
+       if (freeslot != 0)
+               freeslot = PAGES_PER_POOL - freeslot;
+       freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages;
+
+       op_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+       og_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+       np_idx = npools - 1;
+       ng_idx = (npages - 1) % PAGES_PER_POOL;
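+       /* op_idx/og_idx point at the first empty slot of the current pools,
+        * np_idx/ng_idx at the last page of the new pools: free slots are
+        * filled from the tail of the new pools backwards */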
+
+       while (freeslot) {
+               LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL);
+               LASSERT(pools[np_idx][ng_idx] != NULL);
+
+               page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx];
+               pools[np_idx][ng_idx] = NULL;
+
+               freeslot--;
+
+               if (++og_idx == PAGES_PER_POOL) {
+                       op_idx++;
+                       og_idx = 0;
+               }
+               if (--ng_idx < 0) {
+                       if (np_idx == 0)
+                               break;
+                       np_idx--;
+                       ng_idx = PAGES_PER_POOL - 1;
+               }
+       }
+
+       /*
+        * (2) add pools if needed.
+        */
+       cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) /
+                    PAGES_PER_POOL;
+       end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL - 1) /
+                    PAGES_PER_POOL;
+       LASSERT(end_npools <= page_pools.epp_max_pools);
+
+       np_idx = 0;
+       while (cur_npools < end_npools) {
+               LASSERT(page_pools.epp_pools[cur_npools] == NULL);
+               LASSERT(np_idx < npools);
+               LASSERT(pools[np_idx] != NULL);
+
+               page_pools.epp_pools[cur_npools++] = pools[np_idx];
+               pools[np_idx++] = NULL;
+       }
+
+       page_pools.epp_total_pages += npages;
+       page_pools.epp_free_pages += npages;
+       page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+       if (page_pools.epp_total_pages > page_pools.epp_st_max_pages)
+               page_pools.epp_st_max_pages = page_pools.epp_total_pages;
+
+       CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
+              page_pools.epp_total_pages);
+
+       spin_unlock(&page_pools.epp_lock);
+}
+
+static int enc_pools_add_pages(int npages)
+{
+       static DEFINE_MUTEX(add_pages_mutex);
+       struct page   ***pools;
+       int          npools, alloced = 0;
+       int          i, j, rc = -ENOMEM;
+
+       if (npages < PTLRPC_MAX_BRW_PAGES)
+               npages = PTLRPC_MAX_BRW_PAGES;
+
+       mutex_lock(&add_pages_mutex);
+
+       if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages)
+               npages = page_pools.epp_max_pages - page_pools.epp_total_pages;
+       LASSERT(npages > 0);
+
+       page_pools.epp_st_grows++;
+
+       npools = npages_to_npools(npages);
+       OBD_ALLOC(pools, npools * sizeof(*pools));
+       if (pools == NULL)
+               goto out;
+
+       for (i = 0; i < npools; i++) {
+               OBD_ALLOC(pools[i], PAGE_CACHE_SIZE);
+               if (pools[i] == NULL)
+                       goto out_pools;
+
+               for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) {
+                       pools[i][j] = alloc_page(__GFP_IO | __GFP_HIGHMEM);
+                       if (pools[i][j] == NULL)
+                               goto out_pools;
+
+                       alloced++;
+               }
+       }
+       LASSERT(alloced == npages);
+
+       enc_pools_insert(pools, npools, npages);
+       CDEBUG(D_SEC, "added %d pages into pools\n", npages);
+       rc = 0;
+
+out_pools:
+       enc_pools_cleanup(pools, npools);
+       OBD_FREE(pools, npools * sizeof(*pools));
+out:
+       if (rc) {
+               page_pools.epp_st_grow_fails++;
+               CERROR("Failed to allocate %d enc pages\n", npages);
+       }
+
+       mutex_unlock(&add_pages_mutex);
+       return rc;
+}
+
+static inline void enc_pools_wakeup(void)
+{
+       LASSERT(spin_is_locked(&page_pools.epp_lock));
+       LASSERT(page_pools.epp_waitqlen >= 0);
+
+       if (unlikely(page_pools.epp_waitqlen)) {
+               LASSERT(waitqueue_active(&page_pools.epp_waitq));
+               wake_up_all(&page_pools.epp_waitq);
+       }
+}
+
+static int enc_pools_should_grow(int page_needed, long now)
+{
+       /* don't grow if someone else is growing the pools right now,
+        * or the pools have reached their full capacity
+        */
+       if (page_pools.epp_growing ||
+           page_pools.epp_total_pages == page_pools.epp_max_pages)
+               return 0;
+
+       /* if total pages is not enough, we need to grow */
+       if (page_pools.epp_total_pages < page_needed)
+               return 1;
+
+       /*
+        * we wanted to return 0 here if a shrink happened just a moment ago,
+        * but that may cause a deadlock if both client and OST live on a
+        * single node.
+        */
+#if 0
+       if (now - page_pools.epp_last_shrink < 2)
+               return 0;
+#endif
+
+       /*
+        * perhaps we should also consider other factors here, such as the
+        * wait queue length and the idle index.
+        */
+
+       /* grow the pools in all other cases */
+       return 1;
+}
+
+/*
+ * we allocate the requested pages atomically.
+ */
+int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
+{
+       wait_queue_t  waitlink;
+       unsigned long   this_idle = -1;
+       cfs_time_t      tick = 0;
+       long        now;
+       int          p_idx, g_idx;
+       int          i;
+
+       LASSERT(desc->bd_iov_count > 0);
+       LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages);
+
+       /* resent bulk, enc iov might have been allocated previously */
+       if (desc->bd_enc_iov != NULL)
+               return 0;
+
+       OBD_ALLOC(desc->bd_enc_iov,
+                 desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+       if (desc->bd_enc_iov == NULL)
+               return -ENOMEM;
+
+       spin_lock(&page_pools.epp_lock);
+
+       page_pools.epp_st_access++;
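+       /* if there are not enough free pages, either grow the pools here or
+        * wait on epp_waitq until pages are released or added, then retry */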
+again:
+       if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) {
+               if (tick == 0)
+                       tick = cfs_time_current();
+
+               now = cfs_time_current_sec();
+
+               page_pools.epp_st_missings++;
+               page_pools.epp_pages_short += desc->bd_iov_count;
+
+               if (enc_pools_should_grow(desc->bd_iov_count, now)) {
+                       page_pools.epp_growing = 1;
+
+                       spin_unlock(&page_pools.epp_lock);
+                       enc_pools_add_pages(page_pools.epp_pages_short / 2);
+                       spin_lock(&page_pools.epp_lock);
+
+                       page_pools.epp_growing = 0;
+
+                       enc_pools_wakeup();
+               } else {
+                       if (++page_pools.epp_waitqlen >
+                           page_pools.epp_st_max_wqlen)
+                               page_pools.epp_st_max_wqlen =
+                                               page_pools.epp_waitqlen;
+
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       init_waitqueue_entry_current(&waitlink);
+                       add_wait_queue(&page_pools.epp_waitq, &waitlink);
+
+                       spin_unlock(&page_pools.epp_lock);
+                       waitq_wait(&waitlink, TASK_UNINTERRUPTIBLE);
+                       remove_wait_queue(&page_pools.epp_waitq, &waitlink);
+                       LASSERT(page_pools.epp_waitqlen > 0);
+                       spin_lock(&page_pools.epp_lock);
+                       page_pools.epp_waitqlen--;
+               }
+
+               LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count);
+               page_pools.epp_pages_short -= desc->bd_iov_count;
+
+               this_idle = 0;
+               goto again;
+       }
+
+       /* record max wait time */
+       if (unlikely(tick != 0)) {
+               tick = cfs_time_current() - tick;
+               if (tick > page_pools.epp_st_max_wait)
+                       page_pools.epp_st_max_wait = tick;
+       }
+
+       /* proceed with rest of allocation */
+       page_pools.epp_free_pages -= desc->bd_iov_count;
+
+       p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+       g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+               desc->bd_enc_iov[i].kiov_page =
+                                       page_pools.epp_pools[p_idx][g_idx];
+               page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+               if (++g_idx == PAGES_PER_POOL) {
+                       p_idx++;
+                       g_idx = 0;
+               }
+       }
+
+       if (page_pools.epp_free_pages < page_pools.epp_st_lowfree)
+               page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+       /*
+        * new idle index = (old * weight + new) / (weight + 1)
+        */
+       if (this_idle == -1) {
+               this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX /
+                           page_pools.epp_total_pages;
+       }
+       page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT +
+                                  this_idle) /
+                                 (IDLE_IDX_WEIGHT + 1);
+
+       page_pools.epp_last_access = cfs_time_current_sec();
+
+       spin_unlock(&page_pools.epp_lock);
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages);
+
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
+{
+       int     p_idx, g_idx;
+       int     i;
+
+       if (desc->bd_enc_iov == NULL)
+               return;
+
+       LASSERT(desc->bd_iov_count > 0);
+
+       spin_lock(&page_pools.epp_lock);
+
+       p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+       g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
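+       /* returned pages are stored back starting at the current free-page
+        * boundary, right after the pages still sitting in the pools */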
+
+       LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <=
+               page_pools.epp_total_pages);
+       LASSERT(page_pools.epp_pools[p_idx]);
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               LASSERT(desc->bd_enc_iov[i].kiov_page != NULL);
+               LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]);
+               LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL);
+
+               page_pools.epp_pools[p_idx][g_idx] =
+                                       desc->bd_enc_iov[i].kiov_page;
+
+               if (++g_idx == PAGES_PER_POOL) {
+                       p_idx++;
+                       g_idx = 0;
+               }
+       }
+
+       page_pools.epp_free_pages += desc->bd_iov_count;
+
+       enc_pools_wakeup();
+
+       spin_unlock(&page_pools.epp_lock);
+
+       OBD_FREE(desc->bd_enc_iov,
+                desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+       desc->bd_enc_iov = NULL;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages);
+
+/*
+ * we don't do much for add_user/del_user anymore, except adding some
+ * initial pages in add_user() if the current pools are empty; the rest is
+ * handled by the pools' self-adaptation.
+ */
+int sptlrpc_enc_pool_add_user(void)
+{
+       int     need_grow = 0;
+
+       spin_lock(&page_pools.epp_lock);
+       if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) {
+               page_pools.epp_growing = 1;
+               need_grow = 1;
+       }
+       spin_unlock(&page_pools.epp_lock);
+
+       if (need_grow) {
+               enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES +
+                                   PTLRPC_MAX_BRW_PAGES);
+
+               spin_lock(&page_pools.epp_lock);
+               page_pools.epp_growing = 0;
+               enc_pools_wakeup();
+               spin_unlock(&page_pools.epp_lock);
+       }
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_add_user);
+
+int sptlrpc_enc_pool_del_user(void)
+{
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_del_user);
+
+static inline void enc_pools_alloc(void)
+{
+       LASSERT(page_pools.epp_max_pools);
+       OBD_ALLOC_LARGE(page_pools.epp_pools,
+                       page_pools.epp_max_pools *
+                       sizeof(*page_pools.epp_pools));
+}
+
+static inline void enc_pools_free(void)
+{
+       LASSERT(page_pools.epp_max_pools);
+       LASSERT(page_pools.epp_pools);
+
+       OBD_FREE_LARGE(page_pools.epp_pools,
+                      page_pools.epp_max_pools *
+                      sizeof(*page_pools.epp_pools));
+}
+
+int sptlrpc_enc_pool_init(void)
+{
+       /*
+        * maximum capacity is 1/8 of total physical memory.
+        * is the 1/8 a good number?
+        */
+       page_pools.epp_max_pages = num_physpages / 8;
+       page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages);
+
+       init_waitqueue_head(&page_pools.epp_waitq);
+       page_pools.epp_waitqlen = 0;
+       page_pools.epp_pages_short = 0;
+
+       page_pools.epp_growing = 0;
+
+       page_pools.epp_idle_idx = 0;
+       page_pools.epp_last_shrink = cfs_time_current_sec();
+       page_pools.epp_last_access = cfs_time_current_sec();
+
+       spin_lock_init(&page_pools.epp_lock);
+       page_pools.epp_total_pages = 0;
+       page_pools.epp_free_pages = 0;
+
+       page_pools.epp_st_max_pages = 0;
+       page_pools.epp_st_grows = 0;
+       page_pools.epp_st_grow_fails = 0;
+       page_pools.epp_st_shrinks = 0;
+       page_pools.epp_st_access = 0;
+       page_pools.epp_st_missings = 0;
+       page_pools.epp_st_lowfree = 0;
+       page_pools.epp_st_max_wqlen = 0;
+       page_pools.epp_st_max_wait = 0;
+
+       enc_pools_alloc();
+       if (page_pools.epp_pools == NULL)
+               return -ENOMEM;
+
+       pools_shrinker = set_shrinker(pools_shrinker_seeks,
+                                         enc_pools_shrink);
+       if (pools_shrinker == NULL) {
+               enc_pools_free();
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+void sptlrpc_enc_pool_fini(void)
+{
+       unsigned long cleaned, npools;
+
+       LASSERT(pools_shrinker);
+       LASSERT(page_pools.epp_pools);
+       LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages);
+
+       remove_shrinker(pools_shrinker);
+
+       npools = npages_to_npools(page_pools.epp_total_pages);
+       cleaned = enc_pools_cleanup(page_pools.epp_pools, npools);
+       LASSERT(cleaned == page_pools.epp_total_pages);
+
+       enc_pools_free();
+
+       if (page_pools.epp_st_access > 0) {
+               CDEBUG(D_SEC,
+                      "max pages %lu, grows %u, grow fails %u, shrinks %u, "
+                      "access %lu, missing %lu, max qlen %u, max wait "
+                      CFS_TIME_T"/%d\n",
+                      page_pools.epp_st_max_pages, page_pools.epp_st_grows,
+                      page_pools.epp_st_grow_fails,
+                      page_pools.epp_st_shrinks, page_pools.epp_st_access,
+                      page_pools.epp_st_missings, page_pools.epp_st_max_wqlen,
+                      page_pools.epp_st_max_wait, HZ);
+       }
+}
+
+
+static int cfs_hash_alg_id[] = {
+       [BULK_HASH_ALG_NULL]    = CFS_HASH_ALG_NULL,
+       [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32,
+       [BULK_HASH_ALG_CRC32]   = CFS_HASH_ALG_CRC32,
+       [BULK_HASH_ALG_MD5]     = CFS_HASH_ALG_MD5,
+       [BULK_HASH_ALG_SHA1]    = CFS_HASH_ALG_SHA1,
+       [BULK_HASH_ALG_SHA256]  = CFS_HASH_ALG_SHA256,
+       [BULK_HASH_ALG_SHA384]  = CFS_HASH_ALG_SHA384,
+       [BULK_HASH_ALG_SHA512]  = CFS_HASH_ALG_SHA512,
+};
+
+const char *sptlrpc_get_hash_name(__u8 hash_alg)
+{
+       return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_name);
+
+__u8 sptlrpc_get_hash_alg(const char *algname)
+{
+       return cfs_crypto_hash_alg(algname);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_alg);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed)
+{
+       struct ptlrpc_bulk_sec_desc *bsd;
+       int                       size = msg->lm_buflens[offset];
+
+       bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+       if (bsd == NULL) {
+               CERROR("Invalid bulk sec desc: size %d\n", size);
+               return -EINVAL;
+       }
+
+       if (swabbed)
+               __swab32s(&bsd->bsd_nob);
+
+       if (unlikely(bsd->bsd_version != 0)) {
+               CERROR("Unexpected version %u\n", bsd->bsd_version);
+               return -EPROTO;
+       }
+
+       if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) {
+               CERROR("Invalid type %u\n", bsd->bsd_type);
+               return -EPROTO;
+       }
+
+       /* FIXME more sanity check here */
+
+       if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+                    bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG &&
+                    bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) {
+               CERROR("Invalid svc %u\n", bsd->bsd_svc);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(bulk_sec_desc_unpack);
+
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+                             void *buf, int buflen)
+{
+       struct cfs_crypto_hash_desc     *hdesc;
+       int                             hashsize;
+       char                            hashbuf[64];
+       unsigned int                    bufsize;
+       int                             i, err;
+
+       LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX);
+       LASSERT(buflen >= 4);
+
+       hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0);
+       if (IS_ERR(hdesc)) {
+               CERROR("Unable to initialize checksum hash %s\n",
+                      cfs_crypto_hash_name(cfs_hash_alg_id[alg]));
+               return PTR_ERR(hdesc);
+       }
+
+       hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]);
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page,
+                                 desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK,
+                                 desc->bd_iov[i].kiov_len);
+       }
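+       /* if the digest is larger than the caller's buffer, finalize into a
+        * local buffer and copy back only the first buflen bytes */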
+       if (hashsize > buflen) {
+               bufsize = sizeof(hashbuf);
+               err = cfs_crypto_hash_final(hdesc, (unsigned char *)hashbuf,
+                                           &bufsize);
+               memcpy(buf, hashbuf, buflen);
+       } else {
+               bufsize = buflen;
+               err = cfs_crypto_hash_final(hdesc, (unsigned char *)buf,
+                                           &bufsize);
+       }
+
+       if (err)
+               cfs_crypto_hash_final(hdesc, NULL, NULL);
+       return err;
+}
+EXPORT_SYMBOL(sptlrpc_get_bulk_checksum);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c
new file mode 100644 (file)
index 0000000..a45a392
--- /dev/null
@@ -0,0 +1,1233 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_dlm.h>
+#include <lustre_param.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+const char *sptlrpc_part2name(enum lustre_sec_part part)
+{
+       switch (part) {
+       case LUSTRE_SP_CLI:
+               return "cli";
+       case LUSTRE_SP_MDT:
+               return "mdt";
+       case LUSTRE_SP_OST:
+               return "ost";
+       case LUSTRE_SP_MGC:
+               return "mgc";
+       case LUSTRE_SP_MGS:
+               return "mgs";
+       case LUSTRE_SP_ANY:
+               return "any";
+       default:
+               return "err";
+       }
+}
+EXPORT_SYMBOL(sptlrpc_part2name);
+
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd)
+{
+       const char *type = obd->obd_type->typ_name;
+
+       if (!strcmp(type, LUSTRE_MDT_NAME))
+               return LUSTRE_SP_MDT;
+       if (!strcmp(type, LUSTRE_OST_NAME))
+               return LUSTRE_SP_OST;
+       if (!strcmp(type, LUSTRE_MGS_NAME))
+               return LUSTRE_SP_MGS;
+
+       CERROR("unknown target %p(%s)\n", obd, type);
+       return LUSTRE_SP_ANY;
+}
+EXPORT_SYMBOL(sptlrpc_target_sec_part);
+
+/****************************************
+ * user supplied flavor string parsing  *
+ ****************************************/
+
+/*
+ * format: <base_flavor>[-<bulk_type:alg_spec>]
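+ * e.g. "null", "plain", or "plain-hash:adler32"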
+ */
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr)
+{
+       char        buf[32];
+       char       *bulk, *alg;
+
+       memset(flvr, 0, sizeof(*flvr));
+
+       if (str == NULL || str[0] == '\0') {
+               flvr->sf_rpc = SPTLRPC_FLVR_INVALID;
+               return 0;
+       }
+
+       strncpy(buf, str, sizeof(buf));
+       buf[sizeof(buf) - 1] = '\0';
+
+       bulk = strchr(buf, '-');
+       if (bulk)
+               *bulk++ = '\0';
+
+       flvr->sf_rpc = sptlrpc_name2flavor_base(buf);
+       if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID)
+               goto err_out;
+
+       /*
+        * currently only the base flavor "plain" can have a bulk specification.
+        */
+       if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) {
+               flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32;
+               if (bulk) {
+                       /*
+                        * format: plain-hash:<hash_alg>
+                        */
+                       alg = strchr(bulk, ':');
+                       if (alg == NULL)
+                               goto err_out;
+                       *alg++ = '\0';
+
+                       if (strcmp(bulk, "hash"))
+                               goto err_out;
+
+                       flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg);
+                       if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX)
+                               goto err_out;
+               }
+
+               if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL)
+                       flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL);
+               else
+                       flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG);
+       } else {
+               if (bulk)
+                       goto err_out;
+       }
+
+       flvr->sf_flags = 0;
+       return 0;
+
+err_out:
+       CERROR("invalid flavor string: %s\n", str);
+       return -EINVAL;
+}
+EXPORT_SYMBOL(sptlrpc_parse_flavor);
+
+/****************************************
+ * configuration rules                  *
+ ****************************************/
+
+static void get_default_flavor(struct sptlrpc_flavor *sf)
+{
+       memset(sf, 0, sizeof(*sf));
+
+       sf->sf_rpc = SPTLRPC_FLVR_NULL;
+       sf->sf_flags = 0;
+}
+
+static void sptlrpc_rule_init(struct sptlrpc_rule *rule)
+{
+       rule->sr_netid = LNET_NIDNET(LNET_NID_ANY);
+       rule->sr_from = LUSTRE_SP_ANY;
+       rule->sr_to = LUSTRE_SP_ANY;
+       rule->sr_padding = 0;
+
+       get_default_flavor(&rule->sr_flvr);
+}
+
+/*
+ * format: network[.direction]=flavor
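+ * e.g. "default=null" or "tcp0.cli2ost=plain"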
+ */
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule)
+{
+       char       *flavor, *dir;
+       int          rc;
+
+       sptlrpc_rule_init(rule);
+
+       flavor = strchr(param, '=');
+       if (flavor == NULL) {
+               CERROR("invalid param, no '='\n");
+               RETURN(-EINVAL);
+       }
+       *flavor++ = '\0';
+
+       dir = strchr(param, '.');
+       if (dir)
+               *dir++ = '\0';
+
+       /* 1.1 network */
+       if (strcmp(param, "default")) {
+               rule->sr_netid = libcfs_str2net(param);
+               if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) {
+                       CERROR("invalid network name: %s\n", param);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       /* 1.2 direction */
+       if (dir) {
+               if (!strcmp(dir, "mdt2ost")) {
+                       rule->sr_from = LUSTRE_SP_MDT;
+                       rule->sr_to = LUSTRE_SP_OST;
+               } else if (!strcmp(dir, "mdt2mdt")) {
+                       rule->sr_from = LUSTRE_SP_MDT;
+                       rule->sr_to = LUSTRE_SP_MDT;
+               } else if (!strcmp(dir, "cli2ost")) {
+                       rule->sr_from = LUSTRE_SP_CLI;
+                       rule->sr_to = LUSTRE_SP_OST;
+               } else if (!strcmp(dir, "cli2mdt")) {
+                       rule->sr_from = LUSTRE_SP_CLI;
+                       rule->sr_to = LUSTRE_SP_MDT;
+               } else {
+                       CERROR("invalid rule dir segment: %s\n", dir);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       /* 2.1 flavor */
+       rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr);
+       if (rc)
+               RETURN(-EINVAL);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_parse_rule);
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset)
+{
+       LASSERT(rset->srs_nslot ||
+               (rset->srs_nrule == 0 && rset->srs_rules == NULL));
+
+       if (rset->srs_nslot) {
+               OBD_FREE(rset->srs_rules,
+                        rset->srs_nslot * sizeof(*rset->srs_rules));
+               sptlrpc_rule_set_init(rset);
+       }
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_free);
+
+/*
+ * return 0 if the rule set can accommodate one more rule.
+ */
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset)
+{
+       struct sptlrpc_rule *rules;
+       int nslot;
+
+       might_sleep();
+
+       if (rset->srs_nrule < rset->srs_nslot)
+               return 0;
+
+       nslot = rset->srs_nslot + 8;
+
+       /* better use realloc() if available */
+       OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules));
+       if (rules == NULL)
+               return -ENOMEM;
+
+       if (rset->srs_nrule) {
+               LASSERT(rset->srs_nslot && rset->srs_rules);
+               memcpy(rules, rset->srs_rules,
+                      rset->srs_nrule * sizeof(*rset->srs_rules));
+
+               OBD_FREE(rset->srs_rules,
+                        rset->srs_nslot * sizeof(*rset->srs_rules));
+       }
+
+       rset->srs_rules = rules;
+       rset->srs_nslot = nslot;
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_expand);
+
+static inline int rule_spec_dir(struct sptlrpc_rule *rule)
+{
+       return (rule->sr_from != LUSTRE_SP_ANY ||
+               rule->sr_to != LUSTRE_SP_ANY);
+}
+static inline int rule_spec_net(struct sptlrpc_rule *rule)
+{
+       return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY));
+}
+static inline int rule_match_dir(struct sptlrpc_rule *r1,
+                                struct sptlrpc_rule *r2)
+{
+       return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to);
+}
+static inline int rule_match_net(struct sptlrpc_rule *r1,
+                                struct sptlrpc_rule *r2)
+{
+       return (r1->sr_netid == r2->sr_netid);
+}
+
+/*
+ * merge @rule into @rset.
+ * the @rset slots might be expanded.
+ */
+int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset,
+                          struct sptlrpc_rule *rule)
+{
+       struct sptlrpc_rule      *p = rset->srs_rules;
+       int                    spec_dir, spec_net;
+       int                    rc, n, match = 0;
+
+       might_sleep();
+
+       spec_net = rule_spec_net(rule);
+       spec_dir = rule_spec_dir(rule);
+
+       for (n = 0; n < rset->srs_nrule; n++) {
+               p = &rset->srs_rules[n];
+
+               /* test the network match; if it fails:
+                * - specific rule: skip other specific rules until we hit a
+                *   wildcard rule, which means there is no further chance
+                * - wildcard rule: skip until we reach the rule which is
+                *   also a wildcard and matches
+                */
+               if (!rule_match_net(p, rule)) {
+                       if (spec_net) {
+                               if (rule_spec_net(p))
+                                       continue;
+                               else
+                                       break;
+                       } else {
+                               continue;
+                       }
+               }
+
+               /* test dir match, same logic as net matching */
+               if (!rule_match_dir(p, rule)) {
+                       if (spec_dir) {
+                               if (rule_spec_dir(p))
+                                       continue;
+                               else
+                                       break;
+                       } else {
+                               continue;
+                       }
+               }
+
+               /* find a match */
+               match = 1;
+               break;
+       }
+
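+       /* a rule carrying SPTLRPC_FLVR_INVALID requests deletion: it removes
+        * a matched rule and is ignored otherwise; a valid flavor overrides
+        * the matched rule or is inserted at the position found above */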
+       if (match) {
+               LASSERT(n >= 0 && n < rset->srs_nrule);
+
+               if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) {
+                       /* remove this rule */
+                       if (n < rset->srs_nrule - 1)
+                               memmove(&rset->srs_rules[n],
+                                       &rset->srs_rules[n + 1],
+                                       (rset->srs_nrule - n - 1) *
+                                       sizeof(*rule));
+                       rset->srs_nrule--;
+               } else {
+                       /* override the rule */
+                       memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+               }
+       } else {
+               LASSERT(n >= 0 && n <= rset->srs_nrule);
+
+               if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) {
+                       rc = sptlrpc_rule_set_expand(rset);
+                       if (rc)
+                               return rc;
+
+                       if (n < rset->srs_nrule)
+                               memmove(&rset->srs_rules[n + 1],
+                                       &rset->srs_rules[n],
+                                       (rset->srs_nrule - n) * sizeof(*rule));
+                       memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+                       rset->srs_nrule++;
+               } else {
+                       CDEBUG(D_CONFIG, "ignore the unmatched deletion\n");
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_merge);
+
+/**
+ * given from/to/nid, determine a matching flavor in ruleset.
+ * return 1 if a match is found, otherwise return 0.
+ */
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+                           enum lustre_sec_part from,
+                           enum lustre_sec_part to,
+                           lnet_nid_t nid,
+                           struct sptlrpc_flavor *sf)
+{
+       struct sptlrpc_rule    *r;
+       int                  n;
+
+       for (n = 0; n < rset->srs_nrule; n++) {
+               r = &rset->srs_rules[n];
+
+               if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) &&
+                   r->sr_netid != LNET_NIDNET(LNET_NID_ANY) &&
+                   LNET_NIDNET(nid) != r->sr_netid)
+                       continue;
+
+               if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY &&
+                   from != r->sr_from)
+                       continue;
+
+               if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY &&
+                   to != r->sr_to)
+                       continue;
+
+               *sf = r->sr_flvr;
+               return 1;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_choose);
+
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset)
+{
+       struct sptlrpc_rule *r;
+       int     n;
+
+       for (n = 0; n < rset->srs_nrule; n++) {
+               r = &rset->srs_rules[n];
+               CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n,
+                      r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc);
+       }
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_dump);
+
+static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen,
+                                   struct sptlrpc_rule_set *tgt,
+                                   enum lustre_sec_part from,
+                                   enum lustre_sec_part to,
+                                   struct sptlrpc_rule_set *rset)
+{
+       struct sptlrpc_rule_set *src[2] = { gen, tgt };
+       struct sptlrpc_rule     *rule;
+       int                   i, n, rc;
+
+       might_sleep();
+
+       /* merge general rules first, then target-specific rules */
+       for (i = 0; i < 2; i++) {
+               if (src[i] == NULL)
+                       continue;
+
+               for (n = 0; n < src[i]->srs_nrule; n++) {
+                       rule = &src[i]->srs_rules[n];
+
+                       if (from != LUSTRE_SP_ANY &&
+                           rule->sr_from != LUSTRE_SP_ANY &&
+                           rule->sr_from != from)
+                               continue;
+                       if (to != LUSTRE_SP_ANY &&
+                           rule->sr_to != LUSTRE_SP_ANY &&
+                           rule->sr_to != to)
+                               continue;
+
+                       rc = sptlrpc_rule_set_merge(rset, rule);
+                       if (rc) {
+                               CERROR("can't merge: %d\n", rc);
+                               return rc;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/**********************************
+ * sptlrpc configuration support  *
+ **********************************/
+
+struct sptlrpc_conf_tgt {
+       struct list_head              sct_list;
+       char                sct_name[MAX_OBD_NAME];
+       struct sptlrpc_rule_set sct_rset;
+};
+
+struct sptlrpc_conf {
+       struct list_head              sc_list;
+       char                sc_fsname[MTI_NAME_MAXLEN];
+       unsigned int        sc_modified;  /* modified during updating */
+       unsigned int        sc_updated:1, /* updated copy from MGS */
+                               sc_local:1;   /* local copy from target */
+       struct sptlrpc_rule_set sc_rset;      /* fs general rules */
+       struct list_head              sc_tgts;      /* target-specific rules */
+};
+
+static struct mutex sptlrpc_conf_lock;
+static LIST_HEAD(sptlrpc_confs);
+
+static inline int is_hex(char c)
+{
+       return ((c >= '0' && c <= '9') ||
+               (c >= 'a' && c <= 'f'));
+}
+
+static void target2fsname(const char *tgt, char *fsname, int buflen)
+{
+       const char     *ptr;
+       int          len;
+
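+       /* a target name looks like "<fsname>-MDTxxxx" or "<fsname>-OSTxxxx"
+        * with a 4-digit hex index; strip that suffix to get the fsname */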
+       ptr = strrchr(tgt, '-');
+       if (ptr) {
+               if ((strncmp(ptr, "-MDT", 4) != 0 &&
+                    strncmp(ptr, "-OST", 4) != 0) ||
+                   !is_hex(ptr[4]) || !is_hex(ptr[5]) ||
+                   !is_hex(ptr[6]) || !is_hex(ptr[7]))
+                       ptr = NULL;
+       }
+
+       /* if we didn't find the pattern, treat the whole string as fsname */
+       if (ptr == NULL)
+               len = strlen(tgt);
+       else
+               len = ptr - tgt;
+
+       len = min(len, buflen - 1);
+       memcpy(fsname, tgt, len);
+       fsname[len] = '\0';
+}
+
+static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf)
+{
+       struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next;
+
+       sptlrpc_rule_set_free(&conf->sc_rset);
+
+       list_for_each_entry_safe(conf_tgt, conf_tgt_next,
+                                    &conf->sc_tgts, sct_list) {
+               sptlrpc_rule_set_free(&conf_tgt->sct_rset);
+               list_del(&conf_tgt->sct_list);
+               OBD_FREE_PTR(conf_tgt);
+       }
+       LASSERT(list_empty(&conf->sc_tgts));
+
+       conf->sc_updated = 0;
+       conf->sc_local = 0;
+}
+
+static void sptlrpc_conf_free(struct sptlrpc_conf *conf)
+{
+       CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname);
+
+       sptlrpc_conf_free_rsets(conf);
+       list_del(&conf->sc_list);
+       OBD_FREE_PTR(conf);
+}
+
+static
+struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf,
+                                             const char *name,
+                                             int create)
+{
+       struct sptlrpc_conf_tgt *conf_tgt;
+
+       list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) {
+               if (strcmp(conf_tgt->sct_name, name) == 0)
+                       return conf_tgt;
+       }
+
+       if (!create)
+               return NULL;
+
+       OBD_ALLOC_PTR(conf_tgt);
+       if (conf_tgt) {
+               strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name));
+               sptlrpc_rule_set_init(&conf_tgt->sct_rset);
+               list_add(&conf_tgt->sct_list, &conf->sc_tgts);
+       }
+
+       return conf_tgt;
+}
+
+static
+struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname,
+                                     int create)
+{
+       struct sptlrpc_conf *conf;
+
+       list_for_each_entry(conf, &sptlrpc_confs, sc_list) {
+               if (strcmp(conf->sc_fsname, fsname) == 0)
+                       return conf;
+       }
+
+       if (!create)
+               return NULL;
+
+       OBD_ALLOC_PTR(conf);
+       if (conf == NULL)
+               return NULL;
+
+       strcpy(conf->sc_fsname, fsname);
+       sptlrpc_rule_set_init(&conf->sc_rset);
+       INIT_LIST_HEAD(&conf->sc_tgts);
+       list_add(&conf->sc_list, &sptlrpc_confs);
+
+       CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname);
+       return conf;
+}
+
+/**
+ * caller must hold conf_lock already.
+ */
+static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf,
+                                  const char *target,
+                                  struct sptlrpc_rule *rule)
+{
+       struct sptlrpc_conf_tgt  *conf_tgt;
+       struct sptlrpc_rule_set  *rule_set;
+
+       /* fsname == target means general rules for the whole fs */
+       if (strcmp(conf->sc_fsname, target) == 0) {
+               rule_set = &conf->sc_rset;
+       } else {
+               conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1);
+               if (conf_tgt) {
+                       rule_set = &conf_tgt->sct_rset;
+               } else {
+                       CERROR("out of memory, can't merge rule!\n");
+                       return -ENOMEM;
+               }
+       }
+
+       return sptlrpc_rule_set_merge(rule_set, rule);
+}
+
+/**
+ * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we look one up
+ * by the target name in the record, taking conf_lock ourselves; otherwise
+ * the caller already holds conf_lock.
+ */
+static int __sptlrpc_process_config(struct lustre_cfg *lcfg,
+                                   struct sptlrpc_conf *conf)
+{
+       char               *target, *param;
+       char                fsname[MTI_NAME_MAXLEN];
+       struct sptlrpc_rule     rule;
+       int                  rc;
+       ENTRY;
+
+       target = lustre_cfg_string(lcfg, 1);
+       if (target == NULL) {
+               CERROR("missing target name\n");
+               RETURN(-EINVAL);
+       }
+
+       param = lustre_cfg_string(lcfg, 2);
+       if (param == NULL) {
+               CERROR("missing parameter\n");
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param);
+
+       /* parse rule to make sure the format is correct */
+       if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) {
+               CERROR("Invalid sptlrpc parameter: %s\n", param);
+               RETURN(-EINVAL);
+       }
+       param += sizeof(PARAM_SRPC_FLVR) - 1;
+
+       rc = sptlrpc_parse_rule(param, &rule);
+       if (rc)
+               RETURN(-EINVAL);
+
+       if (conf == NULL) {
+               target2fsname(target, fsname, sizeof(fsname));
+
+               mutex_lock(&sptlrpc_conf_lock);
+               conf = sptlrpc_conf_get(fsname, 0);
+               if (conf == NULL) {
+                       CERROR("can't find conf\n");
+                       rc = -ENOMEM;
+               } else {
+                       rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+               }
+               mutex_unlock(&sptlrpc_conf_lock);
+       } else {
+               LASSERT(mutex_is_locked(&sptlrpc_conf_lock));
+               rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+       }
+
+       if (rc == 0)
+               conf->sc_modified++;
+
+       RETURN(rc);
+}
+
+int sptlrpc_process_config(struct lustre_cfg *lcfg)
+{
+       return __sptlrpc_process_config(lcfg, NULL);
+}
+EXPORT_SYMBOL(sptlrpc_process_config);
+
+static int logname2fsname(const char *logname, char *buf, int buflen)
+{
+       char   *ptr;
+       int     len;
+
+       ptr = strrchr(logname, '-');
+       if (ptr == NULL || strcmp(ptr, "-sptlrpc")) {
+               CERROR("%s is not a sptlrpc config log\n", logname);
+               return -EINVAL;
+       }
+
+       len = min((int) (ptr - logname), buflen - 1);
+
+       memcpy(buf, logname, len);
+       buf[len] = '\0';
+       return 0;
+}
+
+void sptlrpc_conf_log_update_begin(const char *logname)
+{
+       struct sptlrpc_conf *conf;
+       char             fsname[16];
+
+       if (logname2fsname(logname, fsname, sizeof(fsname)))
+               return;
+
+       mutex_lock(&sptlrpc_conf_lock);
+
+       conf = sptlrpc_conf_get(fsname, 0);
+       /* sptlrpc_conf_get() may fail to find a conf; only touch it if the
+        * lookup succeeded */
+       if (conf) {
+               if (conf->sc_local) {
+                       LASSERT(conf->sc_updated == 0);
+                       sptlrpc_conf_free_rsets(conf);
+               }
+               conf->sc_modified = 0;
+       }
+
+       mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_begin);
+
+/**
+ * mark a config log as having been updated
+ */
+void sptlrpc_conf_log_update_end(const char *logname)
+{
+       struct sptlrpc_conf *conf;
+       char             fsname[16];
+
+       if (logname2fsname(logname, fsname, sizeof(fsname)))
+               return;
+
+       mutex_lock(&sptlrpc_conf_lock);
+
+       conf = sptlrpc_conf_get(fsname, 0);
+       if (conf) {
+               /*
+                * if the original state was not "updated", make sure the
+                * modified counter is > 0 to force updating the local copy.
+                */
+               if (conf->sc_updated == 0)
+                       conf->sc_modified++;
+
+               conf->sc_updated = 1;
+       }
+
+       mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_end);
+
+void sptlrpc_conf_log_start(const char *logname)
+{
+       char             fsname[16];
+
+       if (logname2fsname(logname, fsname, sizeof(fsname)))
+               return;
+
+       mutex_lock(&sptlrpc_conf_lock);
+       sptlrpc_conf_get(fsname, 1);
+       mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_start);
+
+void sptlrpc_conf_log_stop(const char *logname)
+{
+       struct sptlrpc_conf *conf;
+       char             fsname[16];
+
+       if (logname2fsname(logname, fsname, sizeof(fsname)))
+               return;
+
+       mutex_lock(&sptlrpc_conf_lock);
+       conf = sptlrpc_conf_get(fsname, 0);
+       if (conf)
+               sptlrpc_conf_free(conf);
+       mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_stop);
+
+static inline void flavor_set_flags(struct sptlrpc_flavor *sf,
+                                   enum lustre_sec_part from,
+                                   enum lustre_sec_part to,
+                                   unsigned int fl_udesc)
+{
+       /*
+        * the null flavor doesn't need any flags set, and in fact we'd
+        * better not set any because everybody shares a single sec.
+        */
+       if (sf->sf_rpc == SPTLRPC_FLVR_NULL)
+               return;
+
+       if (from == LUSTRE_SP_MDT) {
+               /* MDT->MDT; MDT->OST */
+               sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY;
+       } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) {
+               /* CLI->OST */
+               sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK;
+       } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) {
+               /* CLI->MDT */
+               if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL)
+                       sf->sf_flags |= PTLRPC_SEC_FL_UDESC;
+       }
+}
+
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+                               enum lustre_sec_part to,
+                               struct obd_uuid *target,
+                               lnet_nid_t nid,
+                               struct sptlrpc_flavor *sf)
+{
+       struct sptlrpc_conf     *conf;
+       struct sptlrpc_conf_tgt *conf_tgt;
+       char                 name[MTI_NAME_MAXLEN];
+       int                   len, rc = 0;
+
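+       /* look for a target-specific rule first, then fall back to the
+        * filesystem-wide rules, and finally to the default (null) flavor */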
+       target2fsname(target->uuid, name, sizeof(name));
+
+       mutex_lock(&sptlrpc_conf_lock);
+
+       conf = sptlrpc_conf_get(name, 0);
+       if (conf == NULL)
+               goto out;
+
+       /* convert uuid name (supposed to end with _UUID) to target name */
+       len = strlen(target->uuid);
+       LASSERT(len > 5);
+       memcpy(name, target->uuid, len - 5);
+       name[len - 5] = '\0';
+
+       conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0);
+       if (conf_tgt) {
+               rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset,
+                                            from, to, nid, sf);
+               if (rc)
+                       goto out;
+       }
+
+       rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf);
+out:
+       mutex_unlock(&sptlrpc_conf_lock);
+
+       if (rc == 0)
+               get_default_flavor(sf);
+
+       flavor_set_flags(sf, from, to, 1);
+}
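+
+/*
+ * Note on rule precedence in sptlrpc_conf_choose_flavor(): target-specific
+ * rules (conf_tgt->sct_rset) are consulted first, then the fs-wide rules
+ * (conf->sc_rset); if neither matches, the default flavor is used.  The
+ * flavor flags are always (re)derived at the end via flavor_set_flags().
+ */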
+
+/**
+ * Called by target devices to determine the expected flavor from a
+ * certain peer (from, nid).
+ */
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+                                 enum lustre_sec_part from,
+                                 lnet_nid_t nid,
+                                 struct sptlrpc_flavor *sf)
+{
+       if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0)
+               get_default_flavor(sf);
+}
+EXPORT_SYMBOL(sptlrpc_target_choose_flavor);
+
+#define SEC_ADAPT_DELAY         (10)
+
+/**
+ * Called by client devices to note that the sptlrpc config has changed;
+ * import_sec_adapt will be done later.
+ */
+void sptlrpc_conf_client_adapt(struct obd_device *obd)
+{
+       struct obd_import  *imp;
+       ENTRY;
+
+       LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+               strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0);
+       CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid);
+
+       /* serialize with connect/disconnect import */
+       down_read(&obd->u.cli.cl_sem);
+
+       imp = obd->u.cli.cl_import;
+       if (imp) {
+               spin_lock(&imp->imp_lock);
+               if (imp->imp_sec)
+                       imp->imp_sec_expire = cfs_time_current_sec() +
+                               SEC_ADAPT_DELAY;
+               spin_unlock(&imp->imp_lock);
+       }
+
+       up_read(&obd->u.cli.cl_sem);
+       EXIT;
+}
+EXPORT_SYMBOL(sptlrpc_conf_client_adapt);
+
+
+static void rule2string(struct sptlrpc_rule *r, char *buf, int buflen)
+{
+       char    dirbuf[8];
+       char   *net;
+       char   *ptr = buf;
+
+       if (r->sr_netid == LNET_NIDNET(LNET_NID_ANY))
+               net = "default";
+       else
+               net = libcfs_net2str(r->sr_netid);
+
+       if (r->sr_from == LUSTRE_SP_ANY && r->sr_to == LUSTRE_SP_ANY)
+               dirbuf[0] = '\0';
+       else
+               snprintf(dirbuf, sizeof(dirbuf), ".%s2%s",
+                        sptlrpc_part2name(r->sr_from),
+                        sptlrpc_part2name(r->sr_to));
+
+       ptr += snprintf(buf, buflen, "srpc.flavor.%s%s=", net, dirbuf);
+
+       sptlrpc_flavor2name(&r->sr_flvr, ptr, buflen - (ptr - buf));
+       buf[buflen - 1] = '\0';
+}
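+
+/*
+ * Example of a string produced by rule2string() (values illustrative):
+ *
+ *     srpc.flavor.default.cli2mdt=plain
+ *
+ * i.e. "srpc.flavor.<net|default>[.<from>2<to>]=<flavor name>".
+ */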
+
+static int sptlrpc_record_rule_set(struct llog_handle *llh,
+                                  char *target,
+                                  struct sptlrpc_rule_set *rset)
+{
+       struct lustre_cfg_bufs  bufs;
+       struct lustre_cfg      *lcfg;
+       struct llog_rec_hdr     rec;
+       int                  buflen;
+       char                param[48];
+       int                  i, rc;
+
+       for (i = 0; i < rset->srs_nrule; i++) {
+               rule2string(&rset->srs_rules[i], param, sizeof(param));
+
+               lustre_cfg_bufs_reset(&bufs, NULL);
+               lustre_cfg_bufs_set_string(&bufs, 1, target);
+               lustre_cfg_bufs_set_string(&bufs, 2, param);
+               lcfg = lustre_cfg_new(LCFG_SPTLRPC_CONF, &bufs);
+               LASSERT(lcfg);
+
+               buflen = lustre_cfg_len(lcfg->lcfg_bufcount,
+                                       lcfg->lcfg_buflens);
+               rec.lrh_len = llog_data_len(buflen);
+               rec.lrh_type = OBD_CFG_REC;
+               rc = llog_write(NULL, llh, &rec, NULL, 0, (void *)lcfg, -1);
+               if (rc)
+                       CERROR("failed to write a rec: rc = %d\n", rc);
+               lustre_cfg_free(lcfg);
+       }
+       return 0;
+}
+
+static int sptlrpc_record_rules(struct llog_handle *llh,
+                               struct sptlrpc_conf *conf)
+{
+       struct sptlrpc_conf_tgt *conf_tgt;
+
+       sptlrpc_record_rule_set(llh, conf->sc_fsname, &conf->sc_rset);
+
+       list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) {
+               sptlrpc_record_rule_set(llh, conf_tgt->sct_name,
+                                       &conf_tgt->sct_rset);
+       }
+       return 0;
+}
+
+#define LOG_SPTLRPC_TMP "sptlrpc.tmp"
+#define LOG_SPTLRPC     "sptlrpc"
+
+static
+int sptlrpc_target_local_copy_conf(struct obd_device *obd,
+                                  struct sptlrpc_conf *conf)
+{
+       struct llog_handle   *llh = NULL;
+       struct llog_ctxt     *ctxt;
+       struct lvfs_run_ctxt  saved;
+       struct dentry   *dentry;
+       int                rc;
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+       if (ctxt == NULL)
+               RETURN(-EINVAL);
+
+       push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+       dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+                                  strlen(MOUNT_CONFIGS_DIR));
+       if (IS_ERR(dentry)) {
+               rc = PTR_ERR(dentry);
+               CERROR("cannot lookup %s directory: rc = %d\n",
+                      MOUNT_CONFIGS_DIR, rc);
+               GOTO(out_ctx, rc);
+       }
+
+       /* erase the old tmp log */
+       rc = llog_erase(NULL, ctxt, NULL, LOG_SPTLRPC_TMP);
+       if (rc < 0 && rc != -ENOENT) {
+               CERROR("%s: cannot erase temporary sptlrpc log: rc = %d\n",
+                      obd->obd_name, rc);
+               GOTO(out_dput, rc);
+       }
+
+       /* write temporary log */
+       rc = llog_open_create(NULL, ctxt, &llh, NULL, LOG_SPTLRPC_TMP);
+       if (rc)
+               GOTO(out_dput, rc);
+       rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc)
+               GOTO(out_close, rc);
+
+       rc = sptlrpc_record_rules(llh, conf);
+
+out_close:
+       llog_close(NULL, llh);
+       if (rc == 0)
+               rc = lustre_rename(dentry, obd->obd_lvfs_ctxt.pwdmnt,
+                                  LOG_SPTLRPC_TMP, LOG_SPTLRPC);
+out_dput:
+       l_dput(dentry);
+out_ctx:
+       pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       CDEBUG(D_SEC, "target %s: write local sptlrpc conf: rc = %d\n",
+              obd->obd_name, rc);
+       RETURN(rc);
+}
+
+static int local_read_handler(const struct lu_env *env,
+                             struct llog_handle *llh,
+                             struct llog_rec_hdr *rec, void *data)
+{
+       struct sptlrpc_conf  *conf = (struct sptlrpc_conf *) data;
+       struct lustre_cfg    *lcfg = (struct lustre_cfg *)(rec + 1);
+       int                cfg_len, rc;
+       ENTRY;
+
+       if (rec->lrh_type != OBD_CFG_REC) {
+               CERROR("unhandled lrh_type: %#x\n", rec->lrh_type);
+               RETURN(-EINVAL);
+       }
+
+       cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) -
+                 sizeof(struct llog_rec_tail);
+
+       rc = lustre_cfg_sanity_check(lcfg, cfg_len);
+       if (rc) {
+               CERROR("Insane cfg\n");
+               RETURN(rc);
+       }
+
+       if (lcfg->lcfg_command != LCFG_SPTLRPC_CONF) {
+               CERROR("invalid command (%x)\n", lcfg->lcfg_command);
+               RETURN(-EINVAL);
+       }
+
+       RETURN(__sptlrpc_process_config(lcfg, conf));
+}
+
+static
+int sptlrpc_target_local_read_conf(struct obd_device *obd,
+                                  struct sptlrpc_conf *conf)
+{
+       struct llog_handle    *llh = NULL;
+       struct llog_ctxt      *ctxt;
+       struct lvfs_run_ctxt   saved;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(conf->sc_updated == 0 && conf->sc_local == 0);
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+       if (ctxt == NULL) {
+               CERROR("missing llog context\n");
+               RETURN(-EINVAL);
+       }
+
+       push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(NULL, ctxt, &llh, NULL, LOG_SPTLRPC, LLOG_OPEN_EXISTS);
+       if (rc < 0) {
+               if (rc == -ENOENT)
+                       rc = 0;
+               GOTO(out_pop, rc);
+       }
+
+       rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc)
+               GOTO(out_close, rc);
+
+       if (llog_get_size(llh) <= 1) {
+               CDEBUG(D_SEC, "no local sptlrpc copy found\n");
+               GOTO(out_close, rc = 0);
+       }
+
+       rc = llog_process(NULL, llh, local_read_handler, (void *)conf, NULL);
+
+       if (rc == 0) {
+               conf->sc_local = 1;
+       } else {
+               sptlrpc_conf_free_rsets(conf);
+       }
+
+out_close:
+       llog_close(NULL, llh);
+out_pop:
+       pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       CDEBUG(D_SEC, "target %s: read local sptlrpc conf: rc = %d\n",
+              obd->obd_name, rc);
+       RETURN(rc);
+}
+
+
+/**
+ * Called by target devices to extract the sptlrpc rules which apply to
+ * this target, to be used for future RPC flavor checking.
+ */
+int sptlrpc_conf_target_get_rules(struct obd_device *obd,
+                                 struct sptlrpc_rule_set *rset,
+                                 int initial)
+{
+       struct sptlrpc_conf      *conf;
+       struct sptlrpc_conf_tgt  *conf_tgt;
+       enum lustre_sec_part      sp_dst;
+       char                  fsname[MTI_NAME_MAXLEN];
+       int                    rc = 0;
+       ENTRY;
+
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) {
+               sp_dst = LUSTRE_SP_MDT;
+       } else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) {
+               sp_dst = LUSTRE_SP_OST;
+       } else {
+               CERROR("unexpected obd type %s\n", obd->obd_type->typ_name);
+               RETURN(-EINVAL);
+       }
+       CDEBUG(D_SEC, "get rules for target %s\n", obd->obd_uuid.uuid);
+
+       target2fsname(obd->obd_uuid.uuid, fsname, sizeof(fsname));
+
+       mutex_lock(&sptlrpc_conf_lock);
+
+       conf = sptlrpc_conf_get(fsname, 0);
+       if (conf == NULL) {
+               CERROR("missing sptlrpc config log\n");
+               GOTO(out, rc);
+       }
+
+       if (conf->sc_updated == 0) {
+               /*
+                * always read from the local copy. An alternative here would
+                * be to simply reuse a local copy that was already read by
+                * another target device hosted on the same node.
+                */
+               if (conf->sc_local)
+                       sptlrpc_conf_free_rsets(conf);
+
+               sptlrpc_target_local_read_conf(obd, conf);
+       } else {
+               LASSERT(conf->sc_local == 0);
+
+               /* write a local copy */
+               if (initial || conf->sc_modified)
+                       sptlrpc_target_local_copy_conf(obd, conf);
+               else
+                       CDEBUG(D_SEC, "unchanged, skip updating local copy\n");
+       }
+
+       /* extract rule set for this target */
+       conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0);
+
+       rc = sptlrpc_rule_set_extract(&conf->sc_rset,
+                                     conf_tgt ? &conf_tgt->sct_rset: NULL,
+                                     LUSTRE_SP_ANY, sp_dst, rset);
+out:
+       mutex_unlock(&sptlrpc_conf_lock);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(sptlrpc_conf_target_get_rules);
+
+int  sptlrpc_conf_init(void)
+{
+       mutex_init(&sptlrpc_conf_lock);
+       return 0;
+}
+
+void sptlrpc_conf_fini(void)
+{
+       struct sptlrpc_conf  *conf, *conf_next;
+
+       mutex_lock(&sptlrpc_conf_lock);
+       list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) {
+               sptlrpc_conf_free(conf);
+       }
+       LASSERT(list_empty(&sptlrpc_confs));
+       mutex_unlock(&sptlrpc_conf_lock);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c
new file mode 100644 (file)
index 0000000..4c96a14
--- /dev/null
@@ -0,0 +1,250 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_gc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+#define SEC_GC_INTERVAL (30 * 60)
+
+
+static struct mutex sec_gc_mutex;
+static LIST_HEAD(sec_gc_list);
+static spinlock_t sec_gc_list_lock;
+
+static LIST_HEAD(sec_gc_ctx_list);
+static spinlock_t sec_gc_ctx_list_lock;
+
+static struct ptlrpc_thread sec_gc_thread;
+static atomic_t sec_gc_wait_del = ATOMIC_INIT(0);
+
+
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec)
+{
+       LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+       LASSERT(sec->ps_gc_interval > 0);
+       LASSERT(list_empty(&sec->ps_gc_list));
+
+       sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+
+       spin_lock(&sec_gc_list_lock);
+       list_add_tail(&sec->ps_gc_list, &sec_gc_list);
+       spin_unlock(&sec_gc_list_lock);
+
+       CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_sec);
+
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec)
+{
+       if (list_empty(&sec->ps_gc_list))
+               return;
+
+       might_sleep();
+
+       /* signal before list_del to make iteration in gc thread safe */
+       atomic_inc(&sec_gc_wait_del);
+
+       spin_lock(&sec_gc_list_lock);
+       list_del_init(&sec->ps_gc_list);
+       spin_unlock(&sec_gc_list_lock);
+
+       /* barrier: take and drop sec_gc_mutex to wait for any gc iteration
+        * currently holding it, so the caller may then free the sec safely */
+       mutex_lock(&sec_gc_mutex);
+       mutex_unlock(&sec_gc_mutex);
+
+       atomic_dec(&sec_gc_wait_del);
+
+       CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_del_sec);
+
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(list_empty(&ctx->cc_gc_chain));
+
+       CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n",
+              ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+       spin_lock(&sec_gc_ctx_list_lock);
+       list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list);
+       spin_unlock(&sec_gc_ctx_list_lock);
+
+       thread_add_flags(&sec_gc_thread, SVC_SIGNAL);
+       wake_up(&sec_gc_thread.t_ctl_waitq);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_ctx);
+
+static void sec_process_ctx_list(void)
+{
+       struct ptlrpc_cli_ctx *ctx;
+
+       spin_lock(&sec_gc_ctx_list_lock);
+
+       while (!list_empty(&sec_gc_ctx_list)) {
+               ctx = list_entry(sec_gc_ctx_list.next,
+                                    struct ptlrpc_cli_ctx, cc_gc_chain);
+               list_del_init(&ctx->cc_gc_chain);
+               spin_unlock(&sec_gc_ctx_list_lock);
+
+               LASSERT(ctx->cc_sec);
+               LASSERT(atomic_read(&ctx->cc_refcount) == 1);
+               CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n",
+                      ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+               sptlrpc_cli_ctx_put(ctx, 1);
+
+               spin_lock(&sec_gc_ctx_list_lock);
+       }
+
+       spin_unlock(&sec_gc_ctx_list_lock);
+}
+
+static void sec_do_gc(struct ptlrpc_sec *sec)
+{
+       LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+
+       if (unlikely(sec->ps_gc_next == 0)) {
+               CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n",
+                     sec, sec->ps_policy->sp_name);
+               return;
+       }
+
+       CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+
+       if (cfs_time_after(sec->ps_gc_next, cfs_time_current_sec()))
+               return;
+
+       sec->ps_policy->sp_cops->gc_ctx(sec);
+       sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+}
+
+static int sec_gc_main(void *arg)
+{
+       struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg;
+       struct l_wait_info    lwi;
+
+       unshare_fs_struct();
+
+       /* Record that the thread is running */
+       thread_set_flags(thread, SVC_RUNNING);
+       wake_up(&thread->t_ctl_waitq);
+
+       while (1) {
+               struct ptlrpc_sec *sec;
+
+               thread_clear_flags(thread, SVC_SIGNAL);
+               sec_process_ctx_list();
+again:
+               /* go through the sec list and do gc.
+                * FIXME: we iterate through the whole list each time, which
+                * is not optimal. We could instead use a balanced binary tree
+                * to track each sec in order of expiry time.
+                * Another issue is that we wake up at a fixed interval instead
+                * of according to each sec's expiry time. */
+               mutex_lock(&sec_gc_mutex);
+               list_for_each_entry(sec, &sec_gc_list, ps_gc_list) {
+                       /* if someone is waiting to be deleted, let it
+                        * proceed as soon as possible. */
+                       if (atomic_read(&sec_gc_wait_del)) {
+                               CDEBUG(D_SEC, "deletion pending, start over\n");
+                               mutex_unlock(&sec_gc_mutex);
+                               goto again;
+                       }
+
+                       sec_do_gc(sec);
+               }
+               mutex_unlock(&sec_gc_mutex);
+
+               /* check ctx list again before sleep */
+               sec_process_ctx_list();
+
+               lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL);
+               l_wait_event(thread->t_ctl_waitq,
+                            thread_is_stopping(thread) ||
+                            thread_is_signal(thread),
+                            &lwi);
+
+               if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+                       break;
+       }
+
+       thread_set_flags(thread, SVC_STOPPED);
+       wake_up(&thread->t_ctl_waitq);
+       return 0;
+}
+
+int sptlrpc_gc_init(void)
+{
+       struct l_wait_info lwi = { 0 };
+       task_t *task;
+
+       mutex_init(&sec_gc_mutex);
+       spin_lock_init(&sec_gc_list_lock);
+       spin_lock_init(&sec_gc_ctx_list_lock);
+
+       /* initialize thread control */
+       memset(&sec_gc_thread, 0, sizeof(sec_gc_thread));
+       init_waitqueue_head(&sec_gc_thread.t_ctl_waitq);
+
+       task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc");
+       if (IS_ERR(task)) {
+               CERROR("can't start gc thread: %ld\n", PTR_ERR(task));
+               return PTR_ERR(task);
+       }
+
+       l_wait_event(sec_gc_thread.t_ctl_waitq,
+                    thread_is_running(&sec_gc_thread), &lwi);
+       return 0;
+}
+
+void sptlrpc_gc_fini(void)
+{
+       struct l_wait_info lwi = { 0 };
+
+       thread_set_flags(&sec_gc_thread, SVC_STOPPING);
+       wake_up(&sec_gc_thread.t_ctl_waitq);
+
+       l_wait_event(sec_gc_thread.t_ctl_waitq,
+                    thread_is_stopped(&sec_gc_thread), &lwi);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c
new file mode 100644 (file)
index 0000000..920591b
--- /dev/null
@@ -0,0 +1,198 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_lproc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+
+struct proc_dir_entry *sptlrpc_proc_root = NULL;
+EXPORT_SYMBOL(sptlrpc_proc_root);
+
+char *sec_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+       buf[0] = '\0';
+
+       if (flags & PTLRPC_SEC_FL_REVERSE)
+               strlcat(buf, "reverse,", bufsize);
+       if (flags & PTLRPC_SEC_FL_ROOTONLY)
+               strlcat(buf, "rootonly,", bufsize);
+       if (flags & PTLRPC_SEC_FL_UDESC)
+               strlcat(buf, "udesc,", bufsize);
+       if (flags & PTLRPC_SEC_FL_BULK)
+               strlcat(buf, "bulk,", bufsize);
+       if (buf[0] == '\0')
+               strlcat(buf, "-,", bufsize);
+
+       return buf;
+}
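+
+/*
+ * Example: for flags = PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK the
+ * buffer above ends up holding "rootonly,bulk," (a trailing comma is kept,
+ * and "-," is printed when no flag is set).
+ */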
+
+static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+       struct obd_device *dev = seq->private;
+       struct client_obd *cli = &dev->u.cli;
+       struct ptlrpc_sec *sec = NULL;
+       char           str[32];
+
+       LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+               strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+               strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+       if (cli->cl_import)
+               sec = sptlrpc_import_sec_ref(cli->cl_import);
+       if (sec == NULL)
+               goto out;
+
+       sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str));
+
+       seq_printf(seq, "rpc flavor:    %s\n",
+                  sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc));
+       seq_printf(seq, "bulk flavor:   %s\n",
+                  sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str)));
+       seq_printf(seq, "flags:  %s\n",
+                  sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)));
+       seq_printf(seq, "id:        %d\n", sec->ps_id);
+       seq_printf(seq, "refcount:      %d\n",
+                  atomic_read(&sec->ps_refcount));
+       seq_printf(seq, "nctx:    %d\n", atomic_read(&sec->ps_nctx));
+       seq_printf(seq, "gc interval    %ld\n", sec->ps_gc_interval);
+       seq_printf(seq, "gc next        %ld\n",
+                  sec->ps_gc_interval ?
+                  sec->ps_gc_next - cfs_time_current_sec() : 0);
+
+       sptlrpc_sec_put(sec);
+out:
+       return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs);
+
+static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+       struct obd_device *dev = seq->private;
+       struct client_obd *cli = &dev->u.cli;
+       struct ptlrpc_sec *sec = NULL;
+
+       LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+               strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+               strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+       if (cli->cl_import)
+               sec = sptlrpc_import_sec_ref(cli->cl_import);
+       if (sec == NULL)
+               goto out;
+
+       if (sec->ps_policy->sp_cops->display)
+               sec->ps_policy->sp_cops->display(sec, seq);
+
+       sptlrpc_sec_put(sec);
+out:
+       return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs);
+
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev)
+{
+       int     rc;
+
+       if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 &&
+           strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 &&
+           strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0) {
+               CERROR("can't register lproc for obd type %s\n",
+                      dev->obd_type->typ_name);
+               return -EINVAL;
+       }
+
+       rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444,
+                                   &sptlrpc_info_lprocfs_fops, dev);
+       if (rc) {
+               CERROR("create proc entry srpc_info for %s: %d\n",
+                      dev->obd_name, rc);
+               return rc;
+       }
+
+       rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444,
+                                   &sptlrpc_ctxs_lprocfs_fops, dev);
+       if (rc) {
+               CERROR("create proc entry srpc_contexts for %s: %d\n",
+                      dev->obd_name, rc);
+               return rc;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach);
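+
+/*
+ * The two entries registered above appear under the client obd's proc
+ * directory, e.g. (paths illustrative, depending on the obd type and name):
+ *
+ *     /proc/fs/lustre/osc/<obd name>/srpc_info
+ *     /proc/fs/lustre/osc/<obd name>/srpc_contexts
+ */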
+
+static struct lprocfs_vars sptlrpc_lprocfs_vars[] = {
+       { "encrypt_page_pools", sptlrpc_proc_read_enc_pool, NULL, NULL },
+       { NULL }
+};
+
+int sptlrpc_lproc_init(void)
+{
+       int     rc;
+
+       LASSERT(sptlrpc_proc_root == NULL);
+
+       sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root,
+                                            sptlrpc_lprocfs_vars, NULL);
+       if (IS_ERR(sptlrpc_proc_root)) {
+               rc = PTR_ERR(sptlrpc_proc_root);
+               sptlrpc_proc_root = NULL;
+               return rc;
+       }
+       return 0;
+}
+
+void sptlrpc_lproc_fini(void)
+{
+       if (sptlrpc_proc_root) {
+               lprocfs_remove(&sptlrpc_proc_root);
+               sptlrpc_proc_root = NULL;
+       }
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c
new file mode 100644 (file)
index 0000000..ff1137f
--- /dev/null
@@ -0,0 +1,464 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_null.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+static struct ptlrpc_sec_policy null_policy;
+static struct ptlrpc_sec       null_sec;
+static struct ptlrpc_cli_ctx    null_cli_ctx;
+static struct ptlrpc_svc_ctx    null_svc_ctx;
+
+/*
+ * we can temporarily use the topmost 8 bits of lm_secflvr to identify
+ * the source sec part.
+ */
+static inline
+void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp)
+{
+       msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24;
+}
+
+static inline
+enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg)
+{
+       return (msg->lm_secflvr >> 24) & 0xFF;
+}
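+
+/*
+ * The two helpers above form a round trip on the top byte of lm_secflvr:
+ * null_encode_sec_part() stores the sender's lustre_sec_part in bits 24-31
+ * and null_decode_sec_part() extracts it again on the receiving side, e.g.
+ *
+ *     null_encode_sec_part(msg, LUSTRE_SP_CLI);
+ *     sp = null_decode_sec_part(msg);         => sp == LUSTRE_SP_CLI
+ *
+ * (all lustre_sec_part values fit in those 8 bits).
+ */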
+
+static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+       /* should never reach here */
+       LBUG();
+       return 0;
+}
+
+static
+int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+       req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+
+       if (!req->rq_import->imp_dlm_fake) {
+               struct obd_device *obd = req->rq_import->imp_obd;
+               null_encode_sec_part(req->rq_reqbuf,
+                                    obd->u.cli.cl_sp_me);
+       }
+       req->rq_reqdata_len = req->rq_reqlen;
+       return 0;
+}
+
+static
+int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+       __u32   cksums, cksumc;
+
+       LASSERT(req->rq_repdata);
+
+       req->rq_repmsg = req->rq_repdata;
+       req->rq_replen = req->rq_repdata_len;
+
+       if (req->rq_early) {
+               cksums = lustre_msg_get_cksum(req->rq_repdata);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) &
+                   MSGHDR_CKSUM_INCOMPAT18)
+                       cksumc = lustre_msg_calc_cksum(req->rq_repmsg, 0);
+               else
+                       cksumc = lustre_msg_calc_cksum(req->rq_repmsg, 1);
+#else
+# warning "remove checksum compatibility support for b1_8"
+               cksumc = lustre_msg_calc_cksum(req->rq_repmsg);
+#endif
+               if (cksumc != cksums) {
+                       CDEBUG(D_SEC,
+                              "early reply checksum mismatch: %08x != %08x\n",
+                              cksumc, cksums);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+static
+struct ptlrpc_sec *null_create_sec(struct obd_import *imp,
+                                  struct ptlrpc_svc_ctx *svc_ctx,
+                                  struct sptlrpc_flavor *sf)
+{
+       LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL);
+
+       /* the generic layer has taken a module reference for us; since we
+        * never really destroy the sec, simply release that reference here.
+        */
+       sptlrpc_policy_put(&null_policy);
+       return &null_sec;
+}
+
+static
+void null_destroy_sec(struct ptlrpc_sec *sec)
+{
+       LASSERT(sec == &null_sec);
+}
+
+static
+struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec,
+                                      struct vfs_cred *vcred,
+                                      int create, int remove_dead)
+{
+       atomic_inc(&null_cli_ctx.cc_refcount);
+       return &null_cli_ctx;
+}
+
+static
+int null_flush_ctx_cache(struct ptlrpc_sec *sec,
+                        uid_t uid,
+                        int grace, int force)
+{
+       return 0;
+}
+
+static
+int null_alloc_reqbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req,
+                     int msgsize)
+{
+       if (!req->rq_reqbuf) {
+               int alloc_size = size_roundup_power2(msgsize);
+
+               LASSERT(!req->rq_pool);
+               OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size);
+               if (!req->rq_reqbuf)
+                       return -ENOMEM;
+
+               req->rq_reqbuf_len = alloc_size;
+       } else {
+               LASSERT(req->rq_pool);
+               LASSERT(req->rq_reqbuf_len >= msgsize);
+               memset(req->rq_reqbuf, 0, msgsize);
+       }
+
+       req->rq_reqmsg = req->rq_reqbuf;
+       return 0;
+}
+
+static
+void null_free_reqbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req)
+{
+       if (!req->rq_pool) {
+               LASSERTF(req->rq_reqmsg == req->rq_reqbuf,
+                        "req %p: reqmsg %p is not reqbuf %p in null sec\n",
+                        req, req->rq_reqmsg, req->rq_reqbuf);
+               LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen,
+                        "req %p: reqlen %d should be smaller than buflen %d\n",
+                        req, req->rq_reqlen, req->rq_reqbuf_len);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = NULL;
+               req->rq_reqbuf_len = 0;
+       }
+}
+
+static
+int null_alloc_repbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req,
+                     int msgsize)
+{
+       /* add space for early reply */
+       msgsize += lustre_msg_early_size();
+
+       msgsize = size_roundup_power2(msgsize);
+
+       OBD_ALLOC_LARGE(req->rq_repbuf, msgsize);
+       if (!req->rq_repbuf)
+               return -ENOMEM;
+
+       req->rq_repbuf_len = msgsize;
+       return 0;
+}
+
+static
+void null_free_repbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_repbuf);
+
+       OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+       req->rq_repbuf = NULL;
+       req->rq_repbuf_len = 0;
+}
+
+static
+int null_enlarge_reqbuf(struct ptlrpc_sec *sec,
+                       struct ptlrpc_request *req,
+                       int segment, int newsize)
+{
+       struct lustre_msg      *newbuf;
+       struct lustre_msg      *oldbuf = req->rq_reqmsg;
+       int                  oldsize, newmsg_size, alloc_size;
+
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_reqbuf == req->rq_reqmsg);
+       LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+       LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf));
+
+       /* compute new message size */
+       oldsize = req->rq_reqbuf->lm_buflens[segment];
+       req->rq_reqbuf->lm_buflens[segment] = newsize;
+       newmsg_size = lustre_packed_msg_size(oldbuf);
+       req->rq_reqbuf->lm_buflens[segment] = oldsize;
+
+       /* request from pool should always have enough buffer */
+       LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size);
+
+       if (req->rq_reqbuf_len < newmsg_size) {
+               alloc_size = size_roundup_power2(newmsg_size);
+
+               OBD_ALLOC_LARGE(newbuf, alloc_size);
+               if (newbuf == NULL)
+                       return -ENOMEM;
+
+               memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = req->rq_reqmsg = newbuf;
+               req->rq_reqbuf_len = alloc_size;
+       }
+
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+       req->rq_reqlen = newmsg_size;
+
+       return 0;
+}
+
+static struct ptlrpc_svc_ctx null_svc_ctx = {
+       .sc_refcount    = ATOMIC_INIT(1),
+       .sc_policy      = &null_policy,
+};
+
+static
+int null_accept(struct ptlrpc_request *req)
+{
+       LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+               SPTLRPC_POLICY_NULL);
+
+       if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) {
+               CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc);
+               return SECSVC_DROP;
+       }
+
+       req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf);
+
+       req->rq_reqmsg = req->rq_reqbuf;
+       req->rq_reqlen = req->rq_reqdata_len;
+
+       req->rq_svc_ctx = &null_svc_ctx;
+       atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+       return SECSVC_OK;
+}
+
+static
+int null_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+       struct ptlrpc_reply_state *rs;
+       int rs_size = sizeof(*rs) + msgsize;
+
+       LASSERT(msgsize % 8 == 0);
+
+       rs = req->rq_reply_state;
+
+       if (rs) {
+               /* pre-allocated */
+               LASSERT(rs->rs_size >= rs_size);
+       } else {
+               OBD_ALLOC_LARGE(rs, rs_size);
+               if (rs == NULL)
+                       return -ENOMEM;
+
+               rs->rs_size = rs_size;
+       }
+
+       rs->rs_svc_ctx = req->rq_svc_ctx;
+       atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+       rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+       rs->rs_repbuf_len = rs_size - sizeof(*rs);
+       rs->rs_msg = rs->rs_repbuf;
+
+       req->rq_reply_state = rs;
+       return 0;
+}
+
+static
+void null_free_rs(struct ptlrpc_reply_state *rs)
+{
+       LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1);
+       atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+       if (!rs->rs_prealloc)
+               OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+static
+int null_authorize(struct ptlrpc_request *req)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+
+       LASSERT(rs);
+
+       rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+       rs->rs_repdata_len = req->rq_replen;
+
+       if (likely(req->rq_packed_final)) {
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+                       req->rq_reply_off = lustre_msg_early_size();
+               else
+                       req->rq_reply_off = 0;
+       } else {
+               __u32 cksum;
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) &
+                   MSGHDR_CKSUM_INCOMPAT18)
+                       cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 0);
+               else
+                       cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 1);
+#else
+# warning "remove checksum compatibility support for b1_8"
+               cksum = lustre_msg_calc_cksum(rs->rs_repbuf);
+#endif
+               lustre_msg_set_cksum(rs->rs_repbuf, cksum);
+               req->rq_reply_off = 0;
+       }
+
+       return 0;
+}
+
+static struct ptlrpc_ctx_ops null_ctx_ops = {
+       .refresh                = null_ctx_refresh,
+       .sign              = null_ctx_sign,
+       .verify          = null_ctx_verify,
+};
+
+static struct ptlrpc_sec_cops null_sec_cops = {
+       .create_sec          = null_create_sec,
+       .destroy_sec        = null_destroy_sec,
+       .lookup_ctx          = null_lookup_ctx,
+       .flush_ctx_cache        = null_flush_ctx_cache,
+       .alloc_reqbuf      = null_alloc_reqbuf,
+       .alloc_repbuf      = null_alloc_repbuf,
+       .free_reqbuf        = null_free_reqbuf,
+       .free_repbuf        = null_free_repbuf,
+       .enlarge_reqbuf  = null_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops null_sec_sops = {
+       .accept          = null_accept,
+       .alloc_rs              = null_alloc_rs,
+       .authorize            = null_authorize,
+       .free_rs                = null_free_rs,
+};
+
+static struct ptlrpc_sec_policy null_policy = {
+       .sp_owner              = THIS_MODULE,
+       .sp_name                = "sec.null",
+       .sp_policy            = SPTLRPC_POLICY_NULL,
+       .sp_cops                = &null_sec_cops,
+       .sp_sops                = &null_sec_sops,
+};
+
+static void null_init_internal(void)
+{
+       static HLIST_HEAD(__list);
+
+       null_sec.ps_policy = &null_policy;
+       atomic_set(&null_sec.ps_refcount, 1);     /* always busy */
+       null_sec.ps_id = -1;
+       null_sec.ps_import = NULL;
+       null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL;
+       null_sec.ps_flvr.sf_flags = 0;
+       null_sec.ps_part = LUSTRE_SP_ANY;
+       null_sec.ps_dying = 0;
+       spin_lock_init(&null_sec.ps_lock);
+       atomic_set(&null_sec.ps_nctx, 1);        /* for "null_cli_ctx" */
+       INIT_LIST_HEAD(&null_sec.ps_gc_list);
+       null_sec.ps_gc_interval = 0;
+       null_sec.ps_gc_next = 0;
+
+       hlist_add_head(&null_cli_ctx.cc_cache, &__list);
+       atomic_set(&null_cli_ctx.cc_refcount, 1);    /* for hash */
+       null_cli_ctx.cc_sec = &null_sec;
+       null_cli_ctx.cc_ops = &null_ctx_ops;
+       null_cli_ctx.cc_expire = 0;
+       null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL |
+                               PTLRPC_CTX_UPTODATE;
+       null_cli_ctx.cc_vcred.vc_uid = 0;
+       spin_lock_init(&null_cli_ctx.cc_lock);
+       INIT_LIST_HEAD(&null_cli_ctx.cc_req_list);
+       INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain);
+}
+
+int sptlrpc_null_init(void)
+{
+       int rc;
+
+       null_init_internal();
+
+       rc = sptlrpc_register_policy(&null_policy);
+       if (rc)
+               CERROR("failed to register %s: %d\n", null_policy.sp_name, rc);
+
+       return rc;
+}
+
+void sptlrpc_null_fini(void)
+{
+       int rc;
+
+       rc = sptlrpc_unregister_policy(&null_policy);
+       if (rc)
+               CERROR("failed to unregister %s: %d\n", null_policy.sp_name, rc);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c
new file mode 100644 (file)
index 0000000..f552d2f
--- /dev/null
@@ -0,0 +1,1021 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_plain.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+struct plain_sec {
+       struct ptlrpc_sec       pls_base;
+       rwlock_t            pls_lock;
+       struct ptlrpc_cli_ctx  *pls_ctx;
+};
+
+static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec)
+{
+       return container_of(sec, struct plain_sec, pls_base);
+}
+
+static struct ptlrpc_sec_policy plain_policy;
+static struct ptlrpc_ctx_ops    plain_ctx_ops;
+static struct ptlrpc_svc_ctx    plain_svc_ctx;
+
+static unsigned int plain_at_offset;
+
+/*
+ * for simplicity, plain policy RPCs use a fixed layout.
+ */
+#define PLAIN_PACK_SEGMENTS         (4)
+
+#define PLAIN_PACK_HDR_OFF           (0)
+#define PLAIN_PACK_MSG_OFF           (1)
+#define PLAIN_PACK_USER_OFF         (2)
+#define PLAIN_PACK_BULK_OFF         (3)
+
+#define PLAIN_FL_USER             (0x01)
+#define PLAIN_FL_BULK             (0x02)
+
+struct plain_header {
+       __u8        ph_ver;         /* 0 */
+       __u8        ph_flags;
+       __u8        ph_sp;           /* source */
+       __u8        ph_bulk_hash_alg;  /* complete flavor desc */
+       __u8        ph_pad[4];
+};
+
+struct plain_bulk_token {
+       __u8        pbt_hash[8];
+};
+
+#define PLAIN_BSD_SIZE \
+       (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token))
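+
+/*
+ * Resulting layout of a plain request/reply message (always 4 segments):
+ *
+ *     [0] PLAIN_PACK_HDR_OFF  - struct plain_header
+ *     [1] PLAIN_PACK_MSG_OFF  - the wrapped lustre message
+ *     [2] PLAIN_PACK_USER_OFF - user descriptor (empty unless rq_pack_udesc)
+ *     [3] PLAIN_PACK_BULK_OFF - bulk sec desc + token (empty unless
+ *                               rq_pack_bulk)
+ */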
+
+/****************************************
+ * bulk checksum helpers               *
+ ****************************************/
+
+static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed)
+{
+       struct ptlrpc_bulk_sec_desc *bsd;
+
+       if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed))
+               return -EPROTO;
+
+       bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE);
+       if (bsd == NULL) {
+               CERROR("bulk sec desc has short size %d\n",
+                      lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF));
+               return -EPROTO;
+       }
+
+       if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+           bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) {
+               CERROR("invalid bulk svc %u\n", bsd->bsd_svc);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+
+static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc,
+                                   __u8 hash_alg,
+                                   struct plain_bulk_token *token)
+{
+       if (hash_alg == BULK_HASH_ALG_NULL)
+               return 0;
+
+       memset(token->pbt_hash, 0, sizeof(token->pbt_hash));
+       return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash,
+                                        sizeof(token->pbt_hash));
+}
+
+static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc,
+                                 __u8 hash_alg,
+                                 struct plain_bulk_token *tokenr)
+{
+       struct plain_bulk_token tokenv;
+       int                  rc;
+
+       if (hash_alg == BULK_HASH_ALG_NULL)
+               return 0;
+
+       memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash));
+       rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash,
+                                      sizeof(tokenv.pbt_hash));
+       if (rc)
+               return rc;
+
+       if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash)))
+               return -EACCES;
+       return 0;
+}
+
+static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
+{
+       char       *ptr;
+       unsigned int    off, i;
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               if (desc->bd_iov[i].kiov_len == 0)
+                       continue;
+
+               ptr = kmap(desc->bd_iov[i].kiov_page);
+               off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
+               ptr[off] ^= 0x1;
+               kunmap(desc->bd_iov[i].kiov_page);
+               return;
+       }
+}
+
+/****************************************
+ * cli_ctx apis                         *
+ ****************************************/
+
+static
+int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+       /* should never reach here */
+       LBUG();
+       return 0;
+}
+
+static
+int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx)
+{
+       return 0;
+}
+
+static
+int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+       struct lustre_msg   *msg = req->rq_reqbuf;
+       struct plain_header *phdr;
+       ENTRY;
+
+       msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+       phdr->ph_ver = 0;
+       phdr->ph_flags = 0;
+       phdr->ph_sp = ctx->cc_sec->ps_part;
+       phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+       if (req->rq_pack_udesc)
+               phdr->ph_flags |= PLAIN_FL_USER;
+       if (req->rq_pack_bulk)
+               phdr->ph_flags |= PLAIN_FL_BULK;
+
+       req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount,
+                                                msg->lm_buflens);
+       RETURN(0);
+}
+
+static
+int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+       struct lustre_msg   *msg = req->rq_repdata;
+       struct plain_header *phdr;
+       __u32           cksum;
+       int               swabbed;
+       ENTRY;
+
+       if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) {
+               CERROR("unexpected reply buf count %u\n", msg->lm_bufcount);
+               RETURN(-EPROTO);
+       }
+
+       swabbed = ptlrpc_rep_need_swab(req);
+
+       phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+       if (phdr == NULL) {
+               CERROR("missing plain header\n");
+               RETURN(-EPROTO);
+       }
+
+       if (phdr->ph_ver != 0) {
+               CERROR("Invalid header version\n");
+               RETURN(-EPROTO);
+       }
+
+       /* expect no user desc in reply */
+       if (phdr->ph_flags & PLAIN_FL_USER) {
+               CERROR("Unexpected udesc flag in reply\n");
+               RETURN(-EPROTO);
+       }
+
+       if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) {
+               CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg,
+                      req->rq_flvr.u_bulk.hash.hash_alg);
+               RETURN(-EPROTO);
+       }
+
+       if (unlikely(req->rq_early)) {
+               unsigned int hsize = 4;
+
+               cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+                               lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+                               lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+                               NULL, 0, (unsigned char *)&cksum, &hsize);
+               if (cksum != msg->lm_cksum) {
+                       CDEBUG(D_SEC,
+                              "early reply checksum mismatch: %08x != %08x\n",
+                              cpu_to_le32(cksum), msg->lm_cksum);
+                       RETURN(-EINVAL);
+               }
+       } else {
+               /* whether we sent with bulk or not, we expect the same
+                * in reply, except for early reply */
+               if (!req->rq_early &&
+                   !equi(req->rq_pack_bulk == 1,
+                         phdr->ph_flags & PLAIN_FL_BULK)) {
+                       CERROR("%s bulk checksum in reply\n",
+                              req->rq_pack_bulk ? "Missing" : "Unexpected");
+                       RETURN(-EPROTO);
+               }
+
+               if (phdr->ph_flags & PLAIN_FL_BULK) {
+                       if (plain_unpack_bsd(msg, swabbed))
+                               RETURN(-EPROTO);
+               }
+       }
+
+       req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+       req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF);
+       RETURN(0);
+}
+
+static
+int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                       struct ptlrpc_request *req,
+                       struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_bulk_sec_desc *bsd;
+       struct plain_bulk_token     *token;
+       int                       rc;
+
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+       bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+       token = (struct plain_bulk_token *) bsd->bsd_data;
+
+       bsd->bsd_version = 0;
+       bsd->bsd_flags = 0;
+       bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+       if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+               RETURN(0);
+
+       if (req->rq_bulk_read)
+               RETURN(0);
+
+       rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                     token);
+       if (rc) {
+               CERROR("bulk write: failed to compute checksum: %d\n", rc);
+       } else {
+               /*
+                * for sending, we only compute a wrong checksum instead of
+                * corrupting the data, so it is still correct on a redo
+                */
+               if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) &&
+                   req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL)
+                       token->pbt_hash[0] ^= 0x1;
+       }
+
+       return rc;
+}
+
+static
+int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                         struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_bulk_sec_desc *bsdv;
+       struct plain_bulk_token     *tokenv;
+       int                       rc;
+       int                       i, nob;
+
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+       LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+       bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0);
+       tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+       if (req->rq_bulk_write) {
+               if (bsdv->bsd_flags & BSD_FL_ERR)
+                       return -EIO;
+               return 0;
+       }
+
+       /* fix the actual data size */
+       for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+               if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) {
+                       desc->bd_iov[i].kiov_len =
+                               desc->bd_nob_transferred - nob;
+               }
+               nob += desc->bd_iov[i].kiov_len;
+       }
+
+       rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                   tokenv);
+       if (rc)
+               CERROR("bulk read: client verify failed: %d\n", rc);
+
+       return rc;
+}
+
+/****************************************
+ * sec apis                         *
+ ****************************************/
+
+static
+struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec)
+{
+       struct ptlrpc_cli_ctx  *ctx, *ctx_new;
+
+       OBD_ALLOC_PTR(ctx_new);
+
+       write_lock(&plsec->pls_lock);
+
+       ctx = plsec->pls_ctx;
+       if (ctx) {
+               atomic_inc(&ctx->cc_refcount);
+
+               if (ctx_new)
+                       OBD_FREE_PTR(ctx_new);
+       } else if (ctx_new) {
+               ctx = ctx_new;
+
+               atomic_set(&ctx->cc_refcount, 1); /* for cache */
+               ctx->cc_sec = &plsec->pls_base;
+               ctx->cc_ops = &plain_ctx_ops;
+               ctx->cc_expire = 0;
+               ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE;
+               ctx->cc_vcred.vc_uid = 0;
+               spin_lock_init(&ctx->cc_lock);
+               INIT_LIST_HEAD(&ctx->cc_req_list);
+               INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+               plsec->pls_ctx = ctx;
+               atomic_inc(&plsec->pls_base.ps_nctx);
+               atomic_inc(&plsec->pls_base.ps_refcount);
+
+               atomic_inc(&ctx->cc_refcount); /* for caller */
+       }
+
+       write_unlock(&plsec->pls_lock);
+
+       return ctx;
+}
+
+static
+void plain_destroy_sec(struct ptlrpc_sec *sec)
+{
+       struct plain_sec       *plsec = sec2plsec(sec);
+       ENTRY;
+
+       LASSERT(sec->ps_policy == &plain_policy);
+       LASSERT(sec->ps_import);
+       LASSERT(atomic_read(&sec->ps_refcount) == 0);
+       LASSERT(atomic_read(&sec->ps_nctx) == 0);
+       LASSERT(plsec->pls_ctx == NULL);
+
+       class_import_put(sec->ps_import);
+
+       OBD_FREE_PTR(plsec);
+       EXIT;
+}
+
+static
+void plain_kill_sec(struct ptlrpc_sec *sec)
+{
+       sec->ps_dying = 1;
+}
+
+static
+struct ptlrpc_sec *plain_create_sec(struct obd_import *imp,
+                                   struct ptlrpc_svc_ctx *svc_ctx,
+                                   struct sptlrpc_flavor *sf)
+{
+       struct plain_sec       *plsec;
+       struct ptlrpc_sec      *sec;
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN);
+
+       OBD_ALLOC_PTR(plsec);
+       if (plsec == NULL)
+               RETURN(NULL);
+
+       /*
+        * initialize plain_sec
+        */
+       rwlock_init(&plsec->pls_lock);
+       plsec->pls_ctx = NULL;
+
+       sec = &plsec->pls_base;
+       sec->ps_policy = &plain_policy;
+       atomic_set(&sec->ps_refcount, 0);
+       atomic_set(&sec->ps_nctx, 0);
+       sec->ps_id = sptlrpc_get_next_secid();
+       sec->ps_import = class_import_get(imp);
+       sec->ps_flvr = *sf;
+       spin_lock_init(&sec->ps_lock);
+       INIT_LIST_HEAD(&sec->ps_gc_list);
+       sec->ps_gc_interval = 0;
+       sec->ps_gc_next = 0;
+
+       /* install ctx immediately if this is a reverse sec */
+       if (svc_ctx) {
+               ctx = plain_sec_install_ctx(plsec);
+               if (ctx == NULL) {
+                       plain_destroy_sec(sec);
+                       RETURN(NULL);
+               }
+               sptlrpc_cli_ctx_put(ctx, 1);
+       }
+
+       RETURN(sec);
+}
+
+static
+struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec,
+                                       struct vfs_cred *vcred,
+                                       int create, int remove_dead)
+{
+       struct plain_sec       *plsec = sec2plsec(sec);
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       read_lock(&plsec->pls_lock);
+       ctx = plsec->pls_ctx;
+       if (ctx)
+               atomic_inc(&ctx->cc_refcount);
+       read_unlock(&plsec->pls_lock);
+
+       if (unlikely(ctx == NULL))
+               ctx = plain_sec_install_ctx(plsec);
+
+       RETURN(ctx);
+}
+
+static
+void plain_release_ctx(struct ptlrpc_sec *sec,
+                      struct ptlrpc_cli_ctx *ctx, int sync)
+{
+       LASSERT(atomic_read(&sec->ps_refcount) > 0);
+       LASSERT(atomic_read(&sec->ps_nctx) > 0);
+       LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+       LASSERT(ctx->cc_sec == sec);
+
+       OBD_FREE_PTR(ctx);
+
+       atomic_dec(&sec->ps_nctx);
+       sptlrpc_sec_put(sec);
+}
+
+static
+int plain_flush_ctx_cache(struct ptlrpc_sec *sec,
+                         uid_t uid, int grace, int force)
+{
+       struct plain_sec       *plsec = sec2plsec(sec);
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       /* do nothing unless the caller wants to flush for 'all' */
+       if (uid != -1)
+               RETURN(0);
+
+       write_lock(&plsec->pls_lock);
+       ctx = plsec->pls_ctx;
+       plsec->pls_ctx = NULL;
+       write_unlock(&plsec->pls_lock);
+
+       if (ctx)
+               sptlrpc_cli_ctx_put(ctx, 1);
+       RETURN(0);
+}
+
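+/*
+ * Allocate (or reuse a pooled) request buffer laid out as the plain policy's
+ * fixed segments: plain header, embedded message, optional user descriptor
+ * and optional bulk security descriptor.
+ */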
+static
+int plain_alloc_reqbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req,
+                      int msgsize)
+{
+       __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+       int   alloc_len;
+       ENTRY;
+
+       buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+       buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+       if (req->rq_pack_udesc)
+               buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size();
+
+       if (req->rq_pack_bulk) {
+               LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+               buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+       }
+
+       alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+       if (!req->rq_reqbuf) {
+               LASSERT(!req->rq_pool);
+
+               alloc_len = size_roundup_power2(alloc_len);
+               OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len);
+               if (!req->rq_reqbuf)
+                       RETURN(-ENOMEM);
+
+               req->rq_reqbuf_len = alloc_len;
+       } else {
+               LASSERT(req->rq_pool);
+               LASSERT(req->rq_reqbuf_len >= alloc_len);
+               memset(req->rq_reqbuf, 0, alloc_len);
+       }
+
+       lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL);
+       req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0);
+
+       if (req->rq_pack_udesc)
+               sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF);
+
+       RETURN(0);
+}
+
+static
+void plain_free_reqbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req)
+{
+       ENTRY;
+       if (!req->rq_pool) {
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = NULL;
+               req->rq_reqbuf_len = 0;
+       }
+       EXIT;
+}
+
+static
+int plain_alloc_repbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req,
+                      int msgsize)
+{
+       __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+       int alloc_len;
+       ENTRY;
+
+       buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+       buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+       if (req->rq_pack_bulk) {
+               LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+               buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+       }
+
+       alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+       /* add space for early reply */
+       alloc_len += plain_at_offset;
+
+       alloc_len = size_roundup_power2(alloc_len);
+
+       OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len);
+       if (!req->rq_repbuf)
+               RETURN(-ENOMEM);
+
+       req->rq_repbuf_len = alloc_len;
+       RETURN(0);
+}
+
+static
+void plain_free_repbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req)
+{
+       ENTRY;
+       OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+       req->rq_repbuf = NULL;
+       req->rq_repbuf_len = 0;
+       EXIT;
+}
+
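+/*
+ * Grow one segment of the embedded request message.  The new sizes of both
+ * the embedded message and the wrapper buffer are computed first; if the
+ * existing wrapper buffer is too small (never the case for pooled requests),
+ * a larger buffer is allocated and the old contents copied over before the
+ * in-place enlargement is done.
+ */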
+static
+int plain_enlarge_reqbuf(struct ptlrpc_sec *sec,
+                        struct ptlrpc_request *req,
+                        int segment, int newsize)
+{
+       struct lustre_msg      *newbuf;
+       int                  oldsize;
+       int                  newmsg_size, newbuf_size;
+       ENTRY;
+
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+       LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) ==
+               req->rq_reqmsg);
+
+       /* compute new embedded msg size.  */
+       oldsize = req->rq_reqmsg->lm_buflens[segment];
+       req->rq_reqmsg->lm_buflens[segment] = newsize;
+       newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount,
+                                        req->rq_reqmsg->lm_buflens);
+       req->rq_reqmsg->lm_buflens[segment] = oldsize;
+
+       /* compute new wrapper msg size.  */
+       oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF];
+       req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size;
+       newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount,
+                                        req->rq_reqbuf->lm_buflens);
+       req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize;
+
+       /* request from pool should always have enough buffer */
+       LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+       if (req->rq_reqbuf_len < newbuf_size) {
+               newbuf_size = size_roundup_power2(newbuf_size);
+
+               OBD_ALLOC_LARGE(newbuf, newbuf_size);
+               if (newbuf == NULL)
+                       RETURN(-ENOMEM);
+
+               memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = newbuf;
+               req->rq_reqbuf_len = newbuf_size;
+               req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf,
+                                               PLAIN_PACK_MSG_OFF, 0);
+       }
+
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF,
+                                    newmsg_size);
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+       req->rq_reqlen = newmsg_size;
+       RETURN(0);
+}
+
+/****************************************
+ * service apis                         *
+ ****************************************/
+
+static struct ptlrpc_svc_ctx plain_svc_ctx = {
+       .sc_refcount    = ATOMIC_INIT(1),
+       .sc_policy      = &plain_policy,
+};
+
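+/*
+ * Server-side acceptance of an incoming plain request: validate the flavor
+ * and the fixed segment layout, then unpack the optional user descriptor and
+ * bulk security descriptor before pointing rq_reqmsg at the embedded message.
+ */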
+static
+int plain_accept(struct ptlrpc_request *req)
+{
+       struct lustre_msg   *msg = req->rq_reqbuf;
+       struct plain_header *phdr;
+       int               swabbed;
+       ENTRY;
+
+       LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+               SPTLRPC_POLICY_PLAIN);
+
+       if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) !=
+           SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) ||
+           SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) !=
+           SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) {
+               CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               RETURN(SECSVC_DROP);
+       }
+
+       if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) {
+               CERROR("unexpected request buf count %u\n", msg->lm_bufcount);
+               RETURN(SECSVC_DROP);
+       }
+
+       swabbed = ptlrpc_req_need_swab(req);
+
+       phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+       if (phdr == NULL) {
+               CERROR("missing plain header\n");
+               RETURN(-EPROTO);
+       }
+
+       if (phdr->ph_ver != 0) {
+               CERROR("Invalid header version\n");
+               RETURN(-EPROTO);
+       }
+
+       if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) {
+               CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg);
+               RETURN(-EPROTO);
+       }
+
+       req->rq_sp_from = phdr->ph_sp;
+       req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg;
+
+       if (phdr->ph_flags & PLAIN_FL_USER) {
+               if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF,
+                                            swabbed)) {
+                       CERROR("Mal-formed user descriptor\n");
+                       RETURN(SECSVC_DROP);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0);
+       }
+
+       if (phdr->ph_flags & PLAIN_FL_BULK) {
+               if (plain_unpack_bsd(msg, swabbed))
+                       RETURN(SECSVC_DROP);
+
+               req->rq_pack_bulk = 1;
+       }
+
+       req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+       req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF];
+
+       req->rq_svc_ctx = &plain_svc_ctx;
+       atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+       RETURN(SECSVC_OK);
+}
+
+static
+int plain_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+       struct ptlrpc_reply_state   *rs;
+       __u32                   buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+       int                       rs_size = sizeof(*rs);
+       ENTRY;
+
+       LASSERT(msgsize % 8 == 0);
+
+       buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+       buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+       if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write))
+               buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+
+       rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+       rs = req->rq_reply_state;
+
+       if (rs) {
+               /* pre-allocated */
+               LASSERT(rs->rs_size >= rs_size);
+       } else {
+               OBD_ALLOC_LARGE(rs, rs_size);
+               if (rs == NULL)
+                       RETURN(-ENOMEM);
+
+               rs->rs_size = rs_size;
+       }
+
+       rs->rs_svc_ctx = req->rq_svc_ctx;
+       atomic_inc(&req->rq_svc_ctx->sc_refcount);
+       rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+       rs->rs_repbuf_len = rs_size - sizeof(*rs);
+
+       lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL);
+       rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0);
+
+       req->rq_reply_state = rs;
+       RETURN(0);
+}
+
+static
+void plain_free_rs(struct ptlrpc_reply_state *rs)
+{
+       ENTRY;
+
+       LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1);
+       atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+       if (!rs->rs_prealloc)
+               OBD_FREE_LARGE(rs, rs->rs_size);
+       EXIT;
+}
+
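+/*
+ * Fill in the plain header of the reply.  For replies that are not packed at
+ * their final location, a CRC32 checksum of the embedded message is stored in
+ * lm_cksum so the client can verify the copy it eventually reads.
+ */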
+static
+int plain_authorize(struct ptlrpc_request *req)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       struct lustre_msg_v2      *msg = rs->rs_repbuf;
+       struct plain_header       *phdr;
+       int                     len;
+       ENTRY;
+
+       LASSERT(rs);
+       LASSERT(msg);
+
+       if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF])
+               len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF,
+                                       req->rq_replen, 1);
+       else
+               len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+       msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+       phdr->ph_ver = 0;
+       phdr->ph_flags = 0;
+       phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+       if (req->rq_pack_bulk)
+               phdr->ph_flags |= PLAIN_FL_BULK;
+
+       rs->rs_repdata_len = len;
+
+       if (likely(req->rq_packed_final)) {
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+                       req->rq_reply_off = plain_at_offset;
+               else
+                       req->rq_reply_off = 0;
+       } else {
+               unsigned int hsize = 4;
+
+               cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+                       lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+                       lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+                       NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize);
+               req->rq_reply_off = 0;
+       }
+
+       RETURN(0);
+}
+
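+/*
+ * Server-side verification of an incoming bulk write: check the client's
+ * checksum token from the request and mirror the descriptor into the reply,
+ * setting BSD_FL_ERR there if verification fails.
+ */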
+static
+int plain_svc_unwrap_bulk(struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+       struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+       struct plain_bulk_token     *tokenr;
+       int                       rc;
+
+       LASSERT(req->rq_bulk_write);
+       LASSERT(req->rq_pack_bulk);
+
+       bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+       tokenr = (struct plain_bulk_token *) bsdr->bsd_data;
+       bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+
+       bsdv->bsd_version = 0;
+       bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsdv->bsd_svc = bsdr->bsd_svc;
+       bsdv->bsd_flags = 0;
+
+       if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+               return 0;
+
+       rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                   tokenr);
+       if (rc) {
+               bsdv->bsd_flags |= BSD_FL_ERR;
+               CERROR("bulk write: server verify failed: %d\n", rc);
+       }
+
+       return rc;
+}
+
+static
+int plain_svc_wrap_bulk(struct ptlrpc_request *req,
+                       struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+       struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+       struct plain_bulk_token     *tokenv;
+       int                       rc;
+
+       LASSERT(req->rq_bulk_read);
+       LASSERT(req->rq_pack_bulk);
+
+       bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+       bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+       tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+       bsdv->bsd_version = 0;
+       bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsdv->bsd_svc = bsdr->bsd_svc;
+       bsdv->bsd_flags = 0;
+
+       if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+               return 0;
+
+       rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                     tokenv);
+       if (rc) {
+               CERROR("bulk read: server failed to compute "
+                      "checksum: %d\n", rc);
+       } else {
+               if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))
+                       corrupt_bulk_data(desc);
+       }
+
+       return rc;
+}
+
+static struct ptlrpc_ctx_ops plain_ctx_ops = {
+       .refresh                = plain_ctx_refresh,
+       .validate               = plain_ctx_validate,
+       .sign                   = plain_ctx_sign,
+       .verify                 = plain_ctx_verify,
+       .wrap_bulk              = plain_cli_wrap_bulk,
+       .unwrap_bulk            = plain_cli_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops plain_sec_cops = {
+       .create_sec             = plain_create_sec,
+       .destroy_sec            = plain_destroy_sec,
+       .kill_sec               = plain_kill_sec,
+       .lookup_ctx             = plain_lookup_ctx,
+       .release_ctx            = plain_release_ctx,
+       .flush_ctx_cache        = plain_flush_ctx_cache,
+       .alloc_reqbuf           = plain_alloc_reqbuf,
+       .free_reqbuf            = plain_free_reqbuf,
+       .alloc_repbuf           = plain_alloc_repbuf,
+       .free_repbuf            = plain_free_repbuf,
+       .enlarge_reqbuf         = plain_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops plain_sec_sops = {
+       .accept                 = plain_accept,
+       .alloc_rs               = plain_alloc_rs,
+       .authorize              = plain_authorize,
+       .free_rs                = plain_free_rs,
+       .unwrap_bulk            = plain_svc_unwrap_bulk,
+       .wrap_bulk              = plain_svc_wrap_bulk,
+};
+
+static struct ptlrpc_sec_policy plain_policy = {
+       .sp_owner               = THIS_MODULE,
+       .sp_name                = "plain",
+       .sp_policy              = SPTLRPC_POLICY_PLAIN,
+       .sp_cops                = &plain_sec_cops,
+       .sp_sops                = &plain_sec_sops,
+};
+
+int sptlrpc_plain_init(void)
+{
+       __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+       int rc;
+
+       buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size();
+       plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+       rc = sptlrpc_register_policy(&plain_policy);
+       if (rc)
+               CERROR("failed to register: %d\n", rc);
+
+       return rc;
+}
+
+void sptlrpc_plain_fini(void)
+{
+       int rc;
+
+       rc = sptlrpc_unregister_policy(&plain_policy);
+       if (rc)
+               CERROR("cannot unregister: %d\n", rc);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
new file mode 100644 (file)
index 0000000..8011127
--- /dev/null
@@ -0,0 +1,3128 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lu_object.h>
+#include <linux/lnet/types.h>
+#include "ptlrpc_internal.h"
+
+/* The following are visible and mutable through /sys/module/ptlrpc */
+int test_req_buffer_pressure = 0;
+CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
+               "set non-zero to put pressure on request buffer pools");
+CFS_MODULE_PARM(at_min, "i", int, 0644,
+               "Adaptive timeout minimum (sec)");
+CFS_MODULE_PARM(at_max, "i", int, 0644,
+               "Adaptive timeout maximum (sec)");
+CFS_MODULE_PARM(at_history, "i", int, 0644,
+               "Adaptive timeouts remember the slowest event that took place "
+               "within this period (sec)");
+CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
+               "How soon before an RPC deadline to send an early reply");
+CFS_MODULE_PARM(at_extra, "i", int, 0644,
+               "How much extra time to give with each early reply");
+
+
+/* forward ref */
+static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
+static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);
+
+/** Holds a list of all PTLRPC services */
+LIST_HEAD(ptlrpc_all_services);
+/** Used to protect the \e ptlrpc_all_services list */
+struct mutex ptlrpc_all_services_mutex;
+
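+/* Allocate a request buffer descriptor plus its CPT-local buffer and put it
+ * on the partition's idle list. */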
+struct ptlrpc_request_buffer_desc *
+ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_service             *svc = svcpt->scp_service;
+       struct ptlrpc_request_buffer_desc *rqbd;
+
+       OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt);
+       if (rqbd == NULL)
+               return NULL;
+
+       rqbd->rqbd_svcpt = svcpt;
+       rqbd->rqbd_refcount = 0;
+       rqbd->rqbd_cbid.cbid_fn = request_in_callback;
+       rqbd->rqbd_cbid.cbid_arg = rqbd;
+       INIT_LIST_HEAD(&rqbd->rqbd_reqs);
+       OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable,
+                           svcpt->scp_cpt, svc->srv_buf_size);
+       if (rqbd->rqbd_buffer == NULL) {
+               OBD_FREE_PTR(rqbd);
+               return NULL;
+       }
+
+       spin_lock(&svcpt->scp_lock);
+       list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+       svcpt->scp_nrqbds_total++;
+       spin_unlock(&svcpt->scp_lock);
+
+       return rqbd;
+}
+
+void
+ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+       struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
+
+       LASSERT(rqbd->rqbd_refcount == 0);
+       LASSERT(list_empty(&rqbd->rqbd_reqs));
+
+       spin_lock(&svcpt->scp_lock);
+       list_del(&rqbd->rqbd_list);
+       svcpt->scp_nrqbds_total--;
+       spin_unlock(&svcpt->scp_lock);
+
+       OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size);
+       OBD_FREE_PTR(rqbd);
+}
+
+int
+ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
+{
+       struct ptlrpc_service             *svc = svcpt->scp_service;
+       struct ptlrpc_request_buffer_desc *rqbd;
+       int                             rc = 0;
+       int                             i;
+
+       if (svcpt->scp_rqbd_allocating)
+               goto try_post;
+
+       spin_lock(&svcpt->scp_lock);
+       /* check again with lock */
+       if (svcpt->scp_rqbd_allocating) {
+               /* NB: we might allow more than one thread in the future */
+               LASSERT(svcpt->scp_rqbd_allocating == 1);
+               spin_unlock(&svcpt->scp_lock);
+               goto try_post;
+       }
+
+       svcpt->scp_rqbd_allocating++;
+       spin_unlock(&svcpt->scp_lock);
+
+
+       for (i = 0; i < svc->srv_nbuf_per_group; i++) {
+               /* NB: another thread might have recycled enough rqbds, we
+                * need to make sure it wouldn't over-allocate, see LU-1212. */
+               if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
+                       break;
+
+               rqbd = ptlrpc_alloc_rqbd(svcpt);
+
+               if (rqbd == NULL) {
+                       CERROR("%s: Can't allocate request buffer\n",
+                              svc->srv_name);
+                       rc = -ENOMEM;
+                       break;
+               }
+       }
+
+       spin_lock(&svcpt->scp_lock);
+
+       LASSERT(svcpt->scp_rqbd_allocating == 1);
+       svcpt->scp_rqbd_allocating--;
+
+       spin_unlock(&svcpt->scp_lock);
+
+       CDEBUG(D_RPCTRACE,
+              "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
+              svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
+              svcpt->scp_nrqbds_total, rc);
+
+ try_post:
+       if (post && rc == 0)
+               rc = ptlrpc_server_post_idle_rqbds(svcpt);
+
+       return rc;
+}
+
+/**
+ * Part of Rep-Ack logic.
+ * Puts a lock and its mode into the reply state associated with the
+ * request reply.
+ */
+void
+ptlrpc_save_lock(struct ptlrpc_request *req,
+                struct lustre_handle *lock, int mode, int no_ack)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       int                     idx;
+
+       LASSERT(rs != NULL);
+       LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);
+
+       if (req->rq_export->exp_disconnected) {
+               ldlm_lock_decref(lock, mode);
+       } else {
+               idx = rs->rs_nlocks++;
+               rs->rs_locks[idx] = *lock;
+               rs->rs_modes[idx] = mode;
+               rs->rs_difficult = 1;
+               rs->rs_no_ack = !!no_ack;
+       }
+}
+EXPORT_SYMBOL(ptlrpc_save_lock);
+
+
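+/*
+ * Reply handling ("hr") infrastructure: difficult replies are queued to
+ * per-CPU-partition pools of dedicated threads, selected round-robin by
+ * ptlrpc_hr_select() below.
+ */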
+struct ptlrpc_hr_partition;
+
+struct ptlrpc_hr_thread {
+       int                             hrt_id;         /* thread ID */
+       spinlock_t                      hrt_lock;
+       wait_queue_head_t                       hrt_waitq;
+       struct list_head                        hrt_queue;      /* RS queue */
+       struct ptlrpc_hr_partition      *hrt_partition;
+};
+
+struct ptlrpc_hr_partition {
+       /* # of started threads */
+       atomic_t                        hrp_nstarted;
+       /* # of stopped threads */
+       atomic_t                        hrp_nstopped;
+       /* cpu partition id */
+       int                             hrp_cpt;
+       /* round-robin rotor for choosing thread */
+       int                             hrp_rotor;
+       /* total number of threads on this partition */
+       int                             hrp_nthrs;
+       /* threads table */
+       struct ptlrpc_hr_thread         *hrp_thrs;
+};
+
+#define HRT_RUNNING 0
+#define HRT_STOPPING 1
+
+struct ptlrpc_hr_service {
+       /* CPU partition table, it's just cfs_cpt_table for now */
+       struct cfs_cpt_table            *hr_cpt_table;
+       /** controller sleep waitq */
+       wait_queue_head_t                       hr_waitq;
+       unsigned int                    hr_stopping;
+       /** roundrobin rotor for non-affinity service */
+       unsigned int                    hr_rotor;
+       /* partition data */
+       struct ptlrpc_hr_partition      **hr_partitions;
+};
+
+struct rs_batch {
+       struct list_head                        rsb_replies;
+       unsigned int                    rsb_n_replies;
+       struct ptlrpc_service_part      *rsb_svcpt;
+};
+
+/** reply handling service. */
+static struct ptlrpc_hr_service                ptlrpc_hr;
+
+/**
+ * maximum number of replies scheduled in one batch
+ */
+#define MAX_SCHEDULED 256
+
+/**
+ * Initialize a reply batch.
+ *
+ * \param b batch
+ */
+static void rs_batch_init(struct rs_batch *b)
+{
+       memset(b, 0, sizeof *b);
+       INIT_LIST_HEAD(&b->rsb_replies);
+}
+
+/**
+ * Choose an hr thread to dispatch requests to.
+ */
+static struct ptlrpc_hr_thread *
+ptlrpc_hr_select(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       unsigned int                    rotor;
+
+       if (svcpt->scp_cpt >= 0 &&
+           svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) {
+               /* directly match partition */
+               hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt];
+
+       } else {
+               rotor = ptlrpc_hr.hr_rotor++;
+               rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table);
+
+               hrp = ptlrpc_hr.hr_partitions[rotor];
+       }
+
+       rotor = hrp->hrp_rotor++;
+       return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs];
+}
+
+/**
+ * Dispatch all replies accumulated in the batch to one of the
+ * dedicated reply handling threads.
+ *
+ * \param b batch
+ */
+static void rs_batch_dispatch(struct rs_batch *b)
+{
+       if (b->rsb_n_replies != 0) {
+               struct ptlrpc_hr_thread *hrt;
+
+               hrt = ptlrpc_hr_select(b->rsb_svcpt);
+
+               spin_lock(&hrt->hrt_lock);
+               list_splice_init(&b->rsb_replies, &hrt->hrt_queue);
+               spin_unlock(&hrt->hrt_lock);
+
+               wake_up(&hrt->hrt_waitq);
+               b->rsb_n_replies = 0;
+       }
+}
+
+/**
+ * Add a reply to a batch.
+ * Add one reply object to a batch; schedule the batched replies once
+ * the batch is full.
+ *
+ * \param b batch
+ * \param rs reply
+ */
+static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+       if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) {
+               if (b->rsb_svcpt != NULL) {
+                       rs_batch_dispatch(b);
+                       spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+               }
+               spin_lock(&svcpt->scp_rep_lock);
+               b->rsb_svcpt = svcpt;
+       }
+       spin_lock(&rs->rs_lock);
+       rs->rs_scheduled_ever = 1;
+       if (rs->rs_scheduled == 0) {
+               list_move(&rs->rs_list, &b->rsb_replies);
+               rs->rs_scheduled = 1;
+               b->rsb_n_replies++;
+       }
+       rs->rs_committed = 1;
+       spin_unlock(&rs->rs_lock);
+}
+
+/**
+ * Reply batch finalization.
+ * Dispatch remaining replies from the batch
+ * and release remaining spinlock.
+ *
+ * \param b batch
+ */
+static void rs_batch_fini(struct rs_batch *b)
+{
+       if (b->rsb_svcpt != NULL) {
+               rs_batch_dispatch(b);
+               spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+       }
+}
+
+#define DECLARE_RS_BATCH(b)     struct rs_batch b
+
+
+/**
+ * Put reply state into a queue for processing because we received
+ * ACK from the client
+ */
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_hr_thread *hrt;
+       ENTRY;
+
+       LASSERT(list_empty(&rs->rs_list));
+
+       hrt = ptlrpc_hr_select(rs->rs_svcpt);
+
+       spin_lock(&hrt->hrt_lock);
+       list_add_tail(&rs->rs_list, &hrt->hrt_queue);
+       spin_unlock(&hrt->hrt_lock);
+
+       wake_up(&hrt->hrt_waitq);
+       EXIT;
+}
+
+void
+ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+       ENTRY;
+
+       LASSERT(spin_is_locked(&rs->rs_svcpt->scp_rep_lock));
+       LASSERT(spin_is_locked(&rs->rs_lock));
+       LASSERT (rs->rs_difficult);
+       rs->rs_scheduled_ever = 1;  /* flag any notification attempt */
+
+       if (rs->rs_scheduled) {     /* being set up or already notified */
+               EXIT;
+               return;
+       }
+
+       rs->rs_scheduled = 1;
+       list_del_init(&rs->rs_list);
+       ptlrpc_dispatch_difficult_reply(rs);
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
+
+void ptlrpc_commit_replies(struct obd_export *exp)
+{
+       struct ptlrpc_reply_state *rs, *nxt;
+       DECLARE_RS_BATCH(batch);
+       ENTRY;
+
+       rs_batch_init(&batch);
+       /* Find any replies that have been committed and get their service
+        * to attend to complete them. */
+
+       /* CAVEAT EMPTOR: spinlock ordering!!! */
+       spin_lock(&exp->exp_uncommitted_replies_lock);
+       list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies,
+                                    rs_obd_list) {
+               LASSERT (rs->rs_difficult);
+               /* VBR: per-export last_committed */
+               LASSERT(rs->rs_export);
+               if (rs->rs_transno <= exp->exp_last_committed) {
+                       list_del_init(&rs->rs_obd_list);
+                       rs_batch_add(&batch, rs);
+               }
+       }
+       spin_unlock(&exp->exp_uncommitted_replies_lock);
+       rs_batch_fini(&batch);
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_commit_replies);
+
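+/*
+ * Post all idle request buffer descriptors of a service partition to LNet.
+ * Returns 1 if at least one buffer was posted, 0 if there was nothing to
+ * post, and -1 if a post failed (the failed rqbd is returned to the idle
+ * list).
+ */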
+static int
+ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_request_buffer_desc *rqbd;
+       int                               rc;
+       int                               posted = 0;
+
+       for (;;) {
+               spin_lock(&svcpt->scp_lock);
+
+               if (list_empty(&svcpt->scp_rqbd_idle)) {
+                       spin_unlock(&svcpt->scp_lock);
+                       return posted;
+               }
+
+               rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+                                     struct ptlrpc_request_buffer_desc,
+                                     rqbd_list);
+               list_del(&rqbd->rqbd_list);
+
+               /* assume we will post successfully */
+               svcpt->scp_nrqbds_posted++;
+               list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
+
+               spin_unlock(&svcpt->scp_lock);
+
+               rc = ptlrpc_register_rqbd(rqbd);
+               if (rc != 0)
+                       break;
+
+               posted = 1;
+       }
+
+       spin_lock(&svcpt->scp_lock);
+
+       svcpt->scp_nrqbds_posted--;
+       list_del(&rqbd->rqbd_list);
+       list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+
+       /* Don't complain if no request buffers are posted right now; LNET
+        * won't drop requests because we set the portal lazy! */
+
+       spin_unlock(&svcpt->scp_lock);
+
+       return -1;
+}
+
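+/* Adaptive timeout timer callback: flag that an AT check is due and wake
+ * the service threads of this partition. */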
+static void ptlrpc_at_timer(unsigned long castmeharder)
+{
+       struct ptlrpc_service_part *svcpt;
+
+       svcpt = (struct ptlrpc_service_part *)castmeharder;
+
+       svcpt->scp_at_check = 1;
+       svcpt->scp_at_checktime = cfs_time_current();
+       wake_up(&svcpt->scp_waitq);
+}
+
+static void
+ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
+                            struct ptlrpc_service_conf *conf)
+{
+       struct ptlrpc_service_thr_conf  *tc = &conf->psc_thr;
+       unsigned                        init;
+       unsigned                        total;
+       unsigned                        nthrs;
+       int                             weight;
+
+       /*
+        * Common code for estimating & validating the number of threads.
+        * A CPT-affinity service can have a per-CPT thread pool instead
+        * of a global thread pool, which means users might not always
+        * get the number of threads they ask for in conf::tc_nthrs_user,
+        * even if they did set it.  This is because we must validate the
+        * thread count for each CPT to guarantee that each pool has
+        * enough threads to keep the service healthy.
+        */
+       init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
+       init = max_t(int, init, tc->tc_nthrs_init);
+
+       /* NB: please see comments in lustre_lnet.h for definition
+        * details of these members */
+       LASSERT(tc->tc_nthrs_max != 0);
+
+       if (tc->tc_nthrs_user != 0) {
+               /* In case there is a reason to test a service with many
+                * threads, we give a less strict check here; it can
+                * be up to 8 * nthrs_max */
+               total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
+               nthrs = total / svc->srv_ncpts;
+               init  = max(init, nthrs);
+               goto out;
+       }
+
+       total = tc->tc_nthrs_max;
+       if (tc->tc_nthrs_base == 0) {
+               /* don't care about the base thread count per partition;
+                * this is mostly for non-affinity services */
+               nthrs = total / svc->srv_ncpts;
+               goto out;
+       }
+
+       nthrs = tc->tc_nthrs_base;
+       if (svc->srv_ncpts == 1) {
+               int     i;
+
+               /* NB: Increase the base number if there is a single partition
+                * and the total number of cores/HTs is greater than or equal
+                * to 4.  The result will always be < 2 * nthrs_base. */
+               weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
+               for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
+                           (tc->tc_nthrs_base >> i) != 0; i++)
+                       nthrs += tc->tc_nthrs_base >> i;
+       }
+
+       if (tc->tc_thr_factor != 0) {
+               int       factor = tc->tc_thr_factor;
+               const int fade = 4;
+
+               /*
+                * The user wants to increase the number of threads for
+                * each CPU core/HT; the factor is most likely larger than
+                * one thread per core because service threads are expected
+                * to block on locks or wait for I/O.
+                */
+               /*
+                * Amdahl's law says that adding processors does not give
+                * a linear increase in parallelism, so there is no point
+                * in having too many threads no matter how many cores/HTs
+                * there are.
+                */
+               if (cfs_cpu_ht_nsiblings(0) > 1) { /* weight is # of HTs */
+                       /* depress thread factor for hyper-thread */
+                       factor = factor - (factor >> 1) + (factor >> 3);
+               }
+
+               weight = cfs_cpt_weight(svc->srv_cptable, 0);
+               LASSERT(weight > 0);
+
+               for (; factor > 0 && weight > 0; factor--, weight -= fade)
+                       nthrs += min(weight, fade) * factor;
+       }
+
+       if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+               nthrs = max(tc->tc_nthrs_base,
+                           tc->tc_nthrs_max / svc->srv_ncpts);
+       }
+ out:
+       nthrs = max(nthrs, tc->tc_nthrs_init);
+       svc->srv_nthrs_cpt_limit = nthrs;
+       svc->srv_nthrs_cpt_init = init;
+
+       if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+               CDEBUG(D_OTHER, "%s: This service may have more threads (%d) "
+                      "than the given soft limit (%d)\n",
+                      svc->srv_name, nthrs * svc->srv_ncpts,
+                      tc->tc_nthrs_max);
+       }
+}
+
+/**
+ * Initialize percpt data for a service
+ */
+static int
+ptlrpc_service_part_init(struct ptlrpc_service *svc,
+                        struct ptlrpc_service_part *svcpt, int cpt)
+{
+       struct ptlrpc_at_array  *array;
+       int                     size;
+       int                     index;
+       int                     rc;
+
+       svcpt->scp_cpt = cpt;
+       INIT_LIST_HEAD(&svcpt->scp_threads);
+
+       /* rqbd and incoming request queue */
+       spin_lock_init(&svcpt->scp_lock);
+       INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
+       INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
+       INIT_LIST_HEAD(&svcpt->scp_req_incoming);
+       init_waitqueue_head(&svcpt->scp_waitq);
+       /* history request & rqbd list */
+       INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
+       INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);
+
+       /* active requests and hp requests */
+       spin_lock_init(&svcpt->scp_req_lock);
+
+       /* reply states */
+       spin_lock_init(&svcpt->scp_rep_lock);
+       INIT_LIST_HEAD(&svcpt->scp_rep_active);
+       INIT_LIST_HEAD(&svcpt->scp_rep_idle);
+       init_waitqueue_head(&svcpt->scp_rep_waitq);
+       atomic_set(&svcpt->scp_nreps_difficult, 0);
+
+       /* adaptive timeout */
+       spin_lock_init(&svcpt->scp_at_lock);
+       array = &svcpt->scp_at_array;
+
+       size = at_est2timeout(at_max);
+       array->paa_size     = size;
+       array->paa_count    = 0;
+       array->paa_deadline = -1;
+
+       /* allocate memory for scp_at_array (ptlrpc_at_array) */
+       OBD_CPT_ALLOC(array->paa_reqs_array,
+                     svc->srv_cptable, cpt, sizeof(struct list_head) * size);
+       if (array->paa_reqs_array == NULL)
+               return -ENOMEM;
+
+       for (index = 0; index < size; index++)
+               INIT_LIST_HEAD(&array->paa_reqs_array[index]);
+
+       OBD_CPT_ALLOC(array->paa_reqs_count,
+                     svc->srv_cptable, cpt, sizeof(__u32) * size);
+       if (array->paa_reqs_count == NULL)
+               goto failed;
+
+       cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt);
+       /* At SOW, service time should be quick; 10s seems generous. If client
+        * timeout is less than this, we'll be sending an early reply. */
+       at_init(&svcpt->scp_at_estimate, 10, 0);
+
+       /* assign this before call ptlrpc_grow_req_bufs */
+       svcpt->scp_service = svc;
+       /* Now allocate the request buffers, but don't post them now */
+       rc = ptlrpc_grow_req_bufs(svcpt, 0);
+       /* We shouldn't be under memory pressure at startup, so
+        * fail if we can't allocate all our buffers at this time. */
+       if (rc != 0)
+               goto failed;
+
+       return 0;
+
+ failed:
+       if (array->paa_reqs_count != NULL) {
+               OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size);
+               array->paa_reqs_count = NULL;
+       }
+
+       if (array->paa_reqs_array != NULL) {
+               OBD_FREE(array->paa_reqs_array,
+                        sizeof(struct list_head) * array->paa_size);
+               array->paa_reqs_array = NULL;
+       }
+
+       return -ENOMEM;
+}
+
+/**
+ * Initialize service on a given portal.
+ * This includes starting serving threads, allocating and posting rqbds,
+ * and so on.
+ */
+struct ptlrpc_service *
+ptlrpc_register_service(struct ptlrpc_service_conf *conf,
+                       proc_dir_entry_t *proc_entry)
+{
+       struct ptlrpc_service_cpt_conf  *cconf = &conf->psc_cpt;
+       struct ptlrpc_service           *service;
+       struct ptlrpc_service_part      *svcpt;
+       struct cfs_cpt_table            *cptable;
+       __u32                           *cpts = NULL;
+       int                             ncpts;
+       int                             cpt;
+       int                             rc;
+       int                             i;
+       ENTRY;
+
+       LASSERT(conf->psc_buf.bc_nbufs > 0);
+       LASSERT(conf->psc_buf.bc_buf_size >=
+               conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
+       LASSERT(conf->psc_thr.tc_ctx_tags != 0);
+
+       cptable = cconf->cc_cptable;
+       if (cptable == NULL)
+               cptable = cfs_cpt_table;
+
+       if (!conf->psc_thr.tc_cpu_affinity) {
+               ncpts = 1;
+       } else {
+               ncpts = cfs_cpt_number(cptable);
+               if (cconf->cc_pattern != NULL) {
+                       struct cfs_expr_list    *el;
+
+                       rc = cfs_expr_list_parse(cconf->cc_pattern,
+                                                strlen(cconf->cc_pattern),
+                                                0, ncpts - 1, &el);
+                       if (rc != 0) {
+                               CERROR("%s: invalid CPT pattern string: %s",
+                                      conf->psc_name, cconf->cc_pattern);
+                               RETURN(ERR_PTR(-EINVAL));
+                       }
+
+                       rc = cfs_expr_list_values(el, ncpts, &cpts);
+                       cfs_expr_list_free(el);
+                       if (rc <= 0) {
+                               CERROR("%s: failed to parse CPT array %s: %d\n",
+                                      conf->psc_name, cconf->cc_pattern, rc);
+                               if (cpts != NULL)
+                                       OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+                               RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL));
+                       }
+                       ncpts = rc;
+               }
+       }
+
+       OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts]));
+       if (service == NULL) {
+               if (cpts != NULL)
+                       OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       service->srv_cptable            = cptable;
+       service->srv_cpts               = cpts;
+       service->srv_ncpts              = ncpts;
+
+       service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
+       while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
+               service->srv_cpt_bits++;
+
+       /* public members */
+       spin_lock_init(&service->srv_lock);
+       service->srv_name               = conf->psc_name;
+       service->srv_watchdog_factor    = conf->psc_watchdog_factor;
+       INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */
+
+       /* buffer configuration */
+       service->srv_nbuf_per_group     = test_req_buffer_pressure ?
+                                         1 : conf->psc_buf.bc_nbufs;
+       service->srv_max_req_size       = conf->psc_buf.bc_req_max_size +
+                                         SPTLRPC_MAX_PAYLOAD;
+       service->srv_buf_size           = conf->psc_buf.bc_buf_size;
+       service->srv_rep_portal         = conf->psc_buf.bc_rep_portal;
+       service->srv_req_portal         = conf->psc_buf.bc_req_portal;
+
+       /* Increase max reply size to next power of two */
+       service->srv_max_reply_size = 1;
+       while (service->srv_max_reply_size <
+              conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
+               service->srv_max_reply_size <<= 1;
+
+       service->srv_thread_name        = conf->psc_thr.tc_thr_name;
+       service->srv_ctx_tags           = conf->psc_thr.tc_ctx_tags;
+       service->srv_hpreq_ratio        = PTLRPC_SVC_HP_RATIO;
+       service->srv_ops                = conf->psc_ops;
+
+       for (i = 0; i < ncpts; i++) {
+               if (!conf->psc_thr.tc_cpu_affinity)
+                       cpt = CFS_CPT_ANY;
+               else
+                       cpt = cpts != NULL ? cpts[i] : i;
+
+               OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt));
+               if (svcpt == NULL)
+                       GOTO(failed, rc = -ENOMEM);
+
+               service->srv_parts[i] = svcpt;
+               rc = ptlrpc_service_part_init(service, svcpt, cpt);
+               if (rc != 0)
+                       GOTO(failed, rc);
+       }
+
+       ptlrpc_server_nthreads_check(service, conf);
+
+       rc = LNetSetLazyPortal(service->srv_req_portal);
+       LASSERT(rc == 0);
+
+       mutex_lock(&ptlrpc_all_services_mutex);
+       list_add (&service->srv_list, &ptlrpc_all_services);
+       mutex_unlock(&ptlrpc_all_services_mutex);
+
+       if (proc_entry != NULL)
+               ptlrpc_lprocfs_register_service(proc_entry, service);
+
+       rc = ptlrpc_service_nrs_setup(service);
+       if (rc != 0)
+               GOTO(failed, rc);
+
+       CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
+              service->srv_name, service->srv_req_portal);
+
+       rc = ptlrpc_start_threads(service);
+       if (rc != 0) {
+               CERROR("Failed to start threads for service %s: %d\n",
+                      service->srv_name, rc);
+               GOTO(failed, rc);
+       }
+
+       RETURN(service);
+failed:
+       ptlrpc_unregister_service(service);
+       RETURN(ERR_PTR(rc));
+}
+EXPORT_SYMBOL(ptlrpc_register_service);
+
+/**
+ * to actually free the request, must be called without holding svc_lock.
+ * note it's caller's responsibility to unlink req->rq_list.
+ */
+static void ptlrpc_server_free_request(struct ptlrpc_request *req)
+{
+       LASSERT(atomic_read(&req->rq_refcount) == 0);
+       LASSERT(list_empty(&req->rq_timed_list));
+
+        /* DEBUG_REQ() assumes the reply state of a request with a valid
+         * ref will not be destroyed until that reference is dropped. */
+       ptlrpc_req_drop_rs(req);
+
+       sptlrpc_svc_ctx_decref(req);
+
+       if (req != &req->rq_rqbd->rqbd_req) {
+               /* NB request buffers use an embedded
+                * req if the incoming req unlinked the
+                * MD; this isn't one of them! */
+               OBD_FREE(req, sizeof(*req));
+       }
+}
+
+/**
+ * drop a reference count of the request. if it reaches 0, we either
+ * put it into history list, or free it immediately.
+ */
+void ptlrpc_server_drop_request(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
+       struct ptlrpc_service_part        *svcpt = rqbd->rqbd_svcpt;
+       struct ptlrpc_service             *svc = svcpt->scp_service;
+       int                             refcount;
+       struct list_head                        *tmp;
+       struct list_head                        *nxt;
+
+       if (!atomic_dec_and_test(&req->rq_refcount))
+               return;
+
+       if (req->rq_at_linked) {
+               spin_lock(&svcpt->scp_at_lock);
+               /* recheck with lock, in case it's unlinked by
+                * ptlrpc_at_check_timed() */
+               if (likely(req->rq_at_linked))
+                       ptlrpc_at_remove_timed(req);
+               spin_unlock(&svcpt->scp_at_lock);
+       }
+
+       LASSERT(list_empty(&req->rq_timed_list));
+
+       /* finalize request */
+       if (req->rq_export) {
+               class_export_put(req->rq_export);
+               req->rq_export = NULL;
+       }
+
+       spin_lock(&svcpt->scp_lock);
+
+       list_add(&req->rq_list, &rqbd->rqbd_reqs);
+
+       refcount = --(rqbd->rqbd_refcount);
+       if (refcount == 0) {
+               /* request buffer is now idle: add to history */
+               list_del(&rqbd->rqbd_list);
+
+               list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
+               svcpt->scp_hist_nrqbds++;
+
+               /* cull some history?
+                * I expect only about 1 or 2 rqbds need to be recycled here */
+               while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
+                       rqbd = list_entry(svcpt->scp_hist_rqbds.next,
+                                             struct ptlrpc_request_buffer_desc,
+                                             rqbd_list);
+
+                       list_del(&rqbd->rqbd_list);
+                       svcpt->scp_hist_nrqbds--;
+
+                       /* remove rqbd's reqs from svc's req history while
+                        * I've got the service lock */
+                       list_for_each(tmp, &rqbd->rqbd_reqs) {
+                               req = list_entry(tmp, struct ptlrpc_request,
+                                                    rq_list);
+                               /* Track the highest culled req seq */
+                               if (req->rq_history_seq >
+                                   svcpt->scp_hist_seq_culled) {
+                                       svcpt->scp_hist_seq_culled =
+                                               req->rq_history_seq;
+                               }
+                               list_del(&req->rq_history_list);
+                       }
+
+                       spin_unlock(&svcpt->scp_lock);
+
+                       list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) {
+                               req = list_entry(rqbd->rqbd_reqs.next,
+                                                    struct ptlrpc_request,
+                                                    rq_list);
+                               list_del(&req->rq_list);
+                               ptlrpc_server_free_request(req);
+                       }
+
+                       spin_lock(&svcpt->scp_lock);
+                       /*
+                        * now all reqs including the embedded req have been
+                        * disposed of; schedule the request buffer for re-use.
+                        */
+                       LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) ==
+                               0);
+                       list_add_tail(&rqbd->rqbd_list,
+                                         &svcpt->scp_rqbd_idle);
+               }
+
+               spin_unlock(&svcpt->scp_lock);
+       } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
+               /* If we are low on memory, we are not interested in history */
+               list_del(&req->rq_list);
+               list_del_init(&req->rq_history_list);
+
+               /* Track the highest culled req seq */
+               if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
+                       svcpt->scp_hist_seq_culled = req->rq_history_seq;
+
+               spin_unlock(&svcpt->scp_lock);
+
+               ptlrpc_server_free_request(req);
+       } else {
+               spin_unlock(&svcpt->scp_lock);
+       }
+}
+
+/** Change request export and move hp request from old export to new */
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+                                 struct obd_export *export)
+{
+       if (req->rq_export != NULL) {
+               if (!list_empty(&req->rq_exp_list)) {
+                       /* remove rq_exp_list from last export */
+                       spin_lock_bh(&req->rq_export->exp_rpc_lock);
+                       list_del_init(&req->rq_exp_list);
+                       spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+
+                       /* the export already holds one reference, so it's safe
+                        * to add the req to the export queue here and get
+                        * another reference for the request later */
+                       spin_lock_bh(&export->exp_rpc_lock);
+                       list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
+                       spin_unlock_bh(&export->exp_rpc_lock);
+               }
+               class_export_rpc_dec(req->rq_export);
+               class_export_put(req->rq_export);
+       }
+
+       /* request takes one export refcount */
+       req->rq_export = class_export_get(export);
+       class_export_rpc_inc(export);
+
+       return;
+}
+
+/**
+ * to finish a request: stop sending more early replies, and release
+ * the request.
+ */
+static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt,
+                                        struct ptlrpc_request *req)
+{
+       ptlrpc_server_hpreq_fini(req);
+
+       ptlrpc_server_drop_request(req);
+}
+
+/**
+ * To finish an active request: stop sending more early replies and release
+ * the request.  Should be called after we have finished handling the request.
+ */
+static void ptlrpc_server_finish_active_request(
+                                       struct ptlrpc_service_part *svcpt,
+                                       struct ptlrpc_request *req)
+{
+       spin_lock(&svcpt->scp_req_lock);
+       ptlrpc_nrs_req_stop_nolock(req);
+       svcpt->scp_nreqs_active--;
+       if (req->rq_hp)
+               svcpt->scp_nhreqs_active--;
+       spin_unlock(&svcpt->scp_req_lock);
+
+       ptlrpc_nrs_req_finalize(req);
+
+       if (req->rq_export != NULL)
+               class_export_rpc_dec(req->rq_export);
+
+       ptlrpc_server_finish_request(svcpt, req);
+}
+
+/**
+ * This function makes sure dead exports are evicted in a timely manner.
+ * This function is only called when some export receives a message (i.e.,
+ * the network is up.)
+ */
+static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
+{
+       struct obd_export *oldest_exp;
+       time_t oldest_time, new_time;
+
+       ENTRY;
+
+       LASSERT(exp);
+
+       /* Compensate for slow machines, etc, by faking our request time
+          into the future.  Although this can break the strict time-ordering
+          of the list, we can be really lazy here - we don't have to evict
+          at the exact right moment.  Eventually, all silent exports
+          will make it to the top of the list. */
+
+       /* Do not pay attention to renewals of 1 sec or less. */
+       new_time = cfs_time_current_sec() + extra_delay;
+       if (exp->exp_last_request_time + 1 /*second */ >= new_time)
+               RETURN_EXIT;
+
+       exp->exp_last_request_time = new_time;
+       CDEBUG(D_HA, "updating export %s at "CFS_TIME_T" exp %p\n",
+              exp->exp_client_uuid.uuid,
+              exp->exp_last_request_time, exp);
+
+       /* exports may get disconnected from the chain even though the
+          export has references, so we must keep the spin lock while
+          manipulating the lists */
+       spin_lock(&exp->exp_obd->obd_dev_lock);
+
+       if (list_empty(&exp->exp_obd_chain_timed)) {
+               /* this one is not timed */
+               spin_unlock(&exp->exp_obd->obd_dev_lock);
+               RETURN_EXIT;
+       }
+
+       list_move_tail(&exp->exp_obd_chain_timed,
+                          &exp->exp_obd->obd_exports_timed);
+
+       oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
+                                   struct obd_export, exp_obd_chain_timed);
+       oldest_time = oldest_exp->exp_last_request_time;
+       spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+       if (exp->exp_obd->obd_recovering) {
+               /* be nice to everyone during recovery */
+               EXIT;
+               return;
+       }
+
+       /* Note - racing to start/reset the obd_eviction timer is safe */
+       if (exp->exp_obd->obd_eviction_timer == 0) {
+               /* Check if the oldest entry is expired. */
+               if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT +
+                                             extra_delay)) {
+                       /* We need a second timer, in case the net was down and
+                        * it just came back. Since the pinger may skip every
+                        * other PING_INTERVAL (see note in ptlrpc_pinger_main),
+                        * we better wait for 3. */
+                       exp->exp_obd->obd_eviction_timer =
+                               cfs_time_current_sec() + 3 * PING_INTERVAL;
+                       CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n",
+                              exp->exp_obd->obd_name,
+                              obd_export_nid2str(oldest_exp), oldest_time);
+               }
+       } else {
+               if (cfs_time_current_sec() >
+                   (exp->exp_obd->obd_eviction_timer + extra_delay)) {
+                       /* The evictor won't evict anyone who we've heard from
+                        * recently, so we don't have to check before we start
+                        * it. */
+                       if (!ping_evictor_wake(exp))
+                               exp->exp_obd->obd_eviction_timer = 0;
+               }
+       }
+
+       EXIT;
+}
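
A minimal standalone sketch (not part of the patch) of the timer-arming arithmetic above; PING_INTERVAL and PING_EVICT_TIMEOUT are assumed stand-in values here, not the real Lustre tunables.

#include <stdio.h>
#include <time.h>

#define PING_INTERVAL      25                   /* assumed value, for illustration */
#define PING_EVICT_TIMEOUT (6 * PING_INTERVAL)  /* assumed value, for illustration */

static time_t eviction_timer;   /* 0 means "not armed" */

/* Arm the eviction timer once the oldest export has been silent too long. */
static void maybe_arm_eviction_timer(time_t now, time_t oldest_time,
                                     long extra_delay)
{
        if (eviction_timer == 0 &&
            now > oldest_time + PING_EVICT_TIMEOUT + extra_delay) {
                /* wait a further three ping intervals before actually
                 * evicting, in case the network only just came back */
                eviction_timer = now + 3 * PING_INTERVAL;
        }
}

int main(void)
{
        time_t now = time(NULL);

        maybe_arm_eviction_timer(now, now - 200, 0);
        printf("recheck in %lds\n", (long)(eviction_timer - now)); /* 75 */
        return 0;
}
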
+
+/**
+ * Sanity check request \a req.
+ * Return 0 if all is ok, error code otherwise.
+ */
+static int ptlrpc_check_req(struct ptlrpc_request *req)
+{
+       int rc = 0;
+
+       if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) <
+                    req->rq_export->exp_conn_cnt)) {
+               DEBUG_REQ(D_RPCTRACE, req,
+                         "DROPPING req from old connection %d < %d",
+                         lustre_msg_get_conn_cnt(req->rq_reqmsg),
+                         req->rq_export->exp_conn_cnt);
+               return -EEXIST;
+       }
+       if (unlikely(req->rq_export->exp_obd &&
+                    req->rq_export->exp_obd->obd_fail)) {
+               /* Failing over, don't handle any more reqs, send
+                * error response instead. */
+               CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
+                      req, req->rq_export->exp_obd->obd_name);
+               rc = -ENODEV;
+       } else if (lustre_msg_get_flags(req->rq_reqmsg) &
+                  (MSG_REPLAY | MSG_REQ_REPLAY_DONE) &&
+                  !(req->rq_export->exp_obd->obd_recovering)) {
+                       DEBUG_REQ(D_ERROR, req,
+                                 "Invalid replay without recovery");
+                       class_fail_export(req->rq_export);
+                       rc = -ENODEV;
+       } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 &&
+                  !(req->rq_export->exp_obd->obd_recovering)) {
+                       DEBUG_REQ(D_ERROR, req, "Invalid req with transno "
+                                 LPU64" without recovery",
+                                 lustre_msg_get_transno(req->rq_reqmsg));
+                       class_fail_export(req->rq_export);
+                       rc = -ENODEV;
+       }
+
+       if (unlikely(rc < 0)) {
+               req->rq_status = rc;
+               ptlrpc_error(req);
+       }
+       return rc;
+}
+
+static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+       __s32 next;
+
+       if (array->paa_count == 0) {
+               cfs_timer_disarm(&svcpt->scp_at_timer);
+               return;
+       }
+
+       /* Set timer for closest deadline */
+       next = (__s32)(array->paa_deadline - cfs_time_current_sec() -
+                      at_early_margin);
+       if (next <= 0) {
+               ptlrpc_at_timer((unsigned long)svcpt);
+       } else {
+               cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(next));
+               CDEBUG(D_INFO, "armed %s at %+ds\n",
+                      svcpt->scp_service->srv_name, next);
+       }
+}
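
A minimal standalone sketch (not part of the patch) of the arming arithmetic above: the timer fires at_early_margin seconds before the earliest tracked deadline, or immediately if that point has already passed. The margin value below is an assumption for illustration.

#include <stdio.h>
#include <time.h>

#define AT_EARLY_MARGIN 5       /* assumed; the real at_early_margin is a tunable */

/* How many seconds from now the early-reply timer should fire. */
static long at_timer_delay(time_t earliest_deadline, time_t now)
{
        long next = (long)(earliest_deadline - now) - AT_EARLY_MARGIN;

        return next > 0 ? next : 0;     /* 0 means "check immediately" */
}

int main(void)
{
        time_t now = time(NULL);

        printf("fire in %lds\n", at_timer_delay(now + 30, now));  /* 25 */
        return 0;
}
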
+
+/* Add rpc to early reply check list */
+static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
+{
+       struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+       struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+       struct ptlrpc_request *rq = NULL;
+       __u32 index;
+
+       if (AT_OFF)
+               return 0;
+
+       if (req->rq_no_reply)
+               return 0;
+
+       if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0)
+               return -ENOSYS;
+
+       spin_lock(&svcpt->scp_at_lock);
+       LASSERT(list_empty(&req->rq_timed_list));
+
+       index = (unsigned long)req->rq_deadline % array->paa_size;
+       if (array->paa_reqs_count[index] > 0) {
+               /* latest rpcs will have the latest deadlines in the list,
+                * so search backward. */
+               list_for_each_entry_reverse(rq,
+                                               &array->paa_reqs_array[index],
+                                               rq_timed_list) {
+                       if (req->rq_deadline >= rq->rq_deadline) {
+                               list_add(&req->rq_timed_list,
+                                            &rq->rq_timed_list);
+                               break;
+                       }
+               }
+       }
+
+       /* Add the request at the head of the list */
+       if (list_empty(&req->rq_timed_list))
+               list_add(&req->rq_timed_list,
+                            &array->paa_reqs_array[index]);
+
+       spin_lock(&req->rq_lock);
+       req->rq_at_linked = 1;
+       spin_unlock(&req->rq_lock);
+       req->rq_at_index = index;
+       array->paa_reqs_count[index]++;
+       array->paa_count++;
+       if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) {
+               array->paa_deadline = req->rq_deadline;
+               ptlrpc_at_set_timer(svcpt);
+       }
+       spin_unlock(&svcpt->scp_at_lock);
+
+       return 0;
+}
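
A minimal standalone sketch (not part of the patch) of the bucket indexing used above: second-granularity deadlines are hashed into paa_size buckets so the expiry scan only walks the buckets near the earliest deadline. PAA_SIZE below is an arbitrary illustrative value.

#include <stdio.h>

#define PAA_SIZE 8      /* illustrative; the real size is chosen at service setup */

/* Each second-granularity deadline maps to one of PAA_SIZE buckets. */
static unsigned int at_bucket(unsigned long deadline_sec)
{
        return deadline_sec % PAA_SIZE;
}

int main(void)
{
        unsigned long now = 1000;

        /* deadlines one second apart land in neighbouring buckets */
        printf("%u %u %u\n", at_bucket(now + 3), at_bucket(now + 4),
               at_bucket(now + 3 + PAA_SIZE));  /* same bucket, one lap later */
        return 0;
}
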
+
+static void
+ptlrpc_at_remove_timed(struct ptlrpc_request *req)
+{
+       struct ptlrpc_at_array *array;
+
+       array = &req->rq_rqbd->rqbd_svcpt->scp_at_array;
+
+       /* NB: must be called with svcpt::scp_at_lock held */
+       LASSERT(!list_empty(&req->rq_timed_list));
+       list_del_init(&req->rq_timed_list);
+
+       spin_lock(&req->rq_lock);
+       req->rq_at_linked = 0;
+       spin_unlock(&req->rq_lock);
+
+       array->paa_reqs_count[req->rq_at_index]--;
+       array->paa_count--;
+}
+
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
+{
+       struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+       struct ptlrpc_request *reqcopy;
+       struct lustre_msg *reqmsg;
+       cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec();
+       time_t newdl;
+       int rc;
+       ENTRY;
+
+       /* deadline is when the client expects us to reply, margin is the
+          difference between clients' and servers' expectations */
+       DEBUG_REQ(D_ADAPTTO, req,
+                 "%ssending early reply (deadline %+lds, margin %+lds) for "
+                 "%d+%d", AT_OFF ? "AT off - not " : "",
+                 olddl, olddl - at_get(&svcpt->scp_at_estimate),
+                 at_get(&svcpt->scp_at_estimate), at_extra);
+
+       if (AT_OFF)
+               RETURN(0);
+
+       if (olddl < 0) {
+               DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), "
+                         "not sending early reply. Consider increasing "
+                         "at_early_margin (%d)?", olddl, at_early_margin);
+
+               /* Return an error so we're not re-added to the timed list. */
+               RETURN(-ETIMEDOUT);
+       }
+
+       if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){
+               DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, "
+                         "but no AT support");
+               RETURN(-ENOSYS);
+       }
+
+       if (req->rq_export &&
+           lustre_msg_get_flags(req->rq_reqmsg) &
+           (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
+               /* During recovery, we don't want to send too many early
+                * replies, but on the other hand we want to make sure the
+                * client has enough time to resend if the rpc is lost. So
+                * during the recovery period send at least 4 early replies,
+                * spacing them every at_extra if we can. at_estimate should
+                * always equal this fixed value during recovery. */
+               at_measured(&svcpt->scp_at_estimate, min(at_extra,
+                           req->rq_export->exp_obd->obd_recovery_timeout / 4));
+       } else {
+               /* Fake our processing time into the future to ask the clients
+                * for some extra amount of time */
+               at_measured(&svcpt->scp_at_estimate, at_extra +
+                           cfs_time_current_sec() -
+                           req->rq_arrival_time.tv_sec);
+
+               /* Check to see if we've actually increased the deadline -
+                * we may be past adaptive_max */
+               if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
+                   at_get(&svcpt->scp_at_estimate)) {
+                       DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
+                                 "(%ld/%ld), not sending early reply\n",
+                                 olddl, req->rq_arrival_time.tv_sec +
+                                 at_get(&svcpt->scp_at_estimate) -
+                                 cfs_time_current_sec());
+                       RETURN(-ETIMEDOUT);
+               }
+       }
+       newdl = cfs_time_current_sec() + at_get(&svcpt->scp_at_estimate);
+
+       OBD_ALLOC(reqcopy, sizeof *reqcopy);
+       if (reqcopy == NULL)
+               RETURN(-ENOMEM);
+       OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen);
+       if (!reqmsg) {
+               OBD_FREE(reqcopy, sizeof *reqcopy);
+               RETURN(-ENOMEM);
+       }
+
+       *reqcopy = *req;
+       reqcopy->rq_reply_state = NULL;
+       reqcopy->rq_rep_swab_mask = 0;
+       reqcopy->rq_pack_bulk = 0;
+       reqcopy->rq_pack_udesc = 0;
+       reqcopy->rq_packed_final = 0;
+       sptlrpc_svc_ctx_addref(reqcopy);
+       /* We only need the reqmsg for the magic */
+       reqcopy->rq_reqmsg = reqmsg;
+       memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+
+       LASSERT(atomic_read(&req->rq_refcount));
+       /** If this is the last refcount then an early reply isn't needed */
+       if (atomic_read(&req->rq_refcount) == 1) {
+               DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, "
+                         "abort sending early reply\n");
+               GOTO(out, rc = -EINVAL);
+       }
+
+       /* Connection ref */
+       reqcopy->rq_export = class_conn2export(
+                                    lustre_msg_get_handle(reqcopy->rq_reqmsg));
+       if (reqcopy->rq_export == NULL)
+               GOTO(out, rc = -ENODEV);
+
+       /* RPC ref */
+       class_export_rpc_inc(reqcopy->rq_export);
+       if (reqcopy->rq_export->exp_obd &&
+           reqcopy->rq_export->exp_obd->obd_fail)
+               GOTO(out_put, rc = -ENODEV);
+
+       rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+       if (rc)
+               GOTO(out_put, rc);
+
+       rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
+
+       if (!rc) {
+               /* Adjust our own deadline to what we told the client */
+               req->rq_deadline = newdl;
+               req->rq_early_count++; /* number sent, server side */
+       } else {
+               DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+       }
+
+       /* Free the (early) reply state from lustre_pack_reply.
+          (ptlrpc_send_reply takes its own rs ref, so this is safe here) */
+       ptlrpc_req_drop_rs(reqcopy);
+
+out_put:
+       class_export_rpc_dec(reqcopy->rq_export);
+       class_export_put(reqcopy->rq_export);
+out:
+       sptlrpc_svc_ctx_decref(reqcopy);
+       OBD_FREE_LARGE(reqmsg, req->rq_reqlen);
+       OBD_FREE(reqcopy, sizeof *reqcopy);
+       RETURN(rc);
+}
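
A simplified sketch (not part of the patch) of the non-recovery arithmetic above: the service estimate is fed "time already spent plus at_extra", and the client is told the new deadline "now + estimate". The estimate update below is a crude stand-in for at_measured(), and AT_EXTRA is an assumed value.

#include <stdio.h>
#include <time.h>

#define AT_EXTRA 30                     /* assumed; the real at_extra is a tunable */

static long estimate;                   /* crude stand-in for scp_at_estimate */

static void estimate_update(long sample)
{
        if (sample > estimate)          /* at_measured() keeps a windowed history */
                estimate = sample;
}

/* The new deadline an early reply would report to the client. */
static time_t early_reply_deadline(time_t now, time_t arrival)
{
        estimate_update(AT_EXTRA + (long)(now - arrival));
        return now + estimate;
}

int main(void)
{
        time_t now = time(NULL);

        printf("new deadline is now+%lds\n",
               (long)(early_reply_deadline(now, now - 10) - now));  /* 40 */
        return 0;
}
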
+
+/* Send early replies to everybody expiring within at_early_margin
+   asking for at_extra time */
+static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+       struct ptlrpc_request *rq, *n;
+       struct list_head work_list;
+       __u32  index, count;
+       time_t deadline;
+       time_t now = cfs_time_current_sec();
+       cfs_duration_t delay;
+       int first, counter = 0;
+       ENTRY;
+
+       spin_lock(&svcpt->scp_at_lock);
+       if (svcpt->scp_at_check == 0) {
+               spin_unlock(&svcpt->scp_at_lock);
+               RETURN(0);
+       }
+       delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime);
+       svcpt->scp_at_check = 0;
+
+       if (array->paa_count == 0) {
+               spin_unlock(&svcpt->scp_at_lock);
+               RETURN(0);
+       }
+
+       /* The timer went off, but maybe the nearest rpc already completed. */
+       first = array->paa_deadline - now;
+       if (first > at_early_margin) {
+               /* We've still got plenty of time.  Reset the timer. */
+               ptlrpc_at_set_timer(svcpt);
+               spin_unlock(&svcpt->scp_at_lock);
+               RETURN(0);
+       }
+
+       /* We're close to a timeout, and we don't know how much longer the
+          server will take. Send early replies to everyone expiring soon. */
+       INIT_LIST_HEAD(&work_list);
+       deadline = -1;
+       index = (unsigned long)array->paa_deadline % array->paa_size;
+       count = array->paa_count;
+       while (count > 0) {
+               count -= array->paa_reqs_count[index];
+               list_for_each_entry_safe(rq, n,
+                                            &array->paa_reqs_array[index],
+                                            rq_timed_list) {
+                       if (rq->rq_deadline > now + at_early_margin) {
+                               /* update the earliest deadline */
+                               if (deadline == -1 ||
+                                   rq->rq_deadline < deadline)
+                                       deadline = rq->rq_deadline;
+                               break;
+                       }
+
+                       ptlrpc_at_remove_timed(rq);
+                       /**
+                        * ptlrpc_server_drop_request() may already have
+                        * dropped the refcount to 0; check for that and
+                        * don't add the entry to work_list
+                        */
+                       if (likely(atomic_inc_not_zero(&rq->rq_refcount)))
+                               list_add(&rq->rq_timed_list, &work_list);
+                       counter++;
+               }
+
+               if (++index >= array->paa_size)
+                       index = 0;
+       }
+       array->paa_deadline = deadline;
+       /* we have a new earliest deadline, restart the timer */
+       ptlrpc_at_set_timer(svcpt);
+
+       spin_unlock(&svcpt->scp_at_lock);
+
+       CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
+              "replies\n", first, at_extra, counter);
+       if (first < 0) {
+               /* We're already past request deadlines before we even get a
+                  chance to send early replies */
+               LCONSOLE_WARN("%s: This server is not able to keep up with "
+                             "request traffic (cpu-bound).\n",
+                             svcpt->scp_service->srv_name);
+               CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, "
+                     "delay="CFS_DURATION_T"(jiff)\n",
+                     counter, svcpt->scp_nreqs_incoming,
+                     svcpt->scp_nreqs_active,
+                     at_get(&svcpt->scp_at_estimate), delay);
+       }
+
+       /* we took an additional refcount so entries can't be deleted from
+        * the list; no locking is needed */
+       while (!list_empty(&work_list)) {
+               rq = list_entry(work_list.next, struct ptlrpc_request,
+                                   rq_timed_list);
+               list_del_init(&rq->rq_timed_list);
+
+               if (ptlrpc_at_send_early_reply(rq) == 0)
+                       ptlrpc_at_add_timed(rq);
+
+               ptlrpc_server_drop_request(rq);
+       }
+
+       RETURN(1); /* return "did_something" for liblustre */
+}
+
+/**
+ * Put the request on the export list if it may become
+ * a high priority one.
+ */
+static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
+                                   struct ptlrpc_request *req)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (svcpt->scp_service->srv_ops.so_hpreq_handler) {
+               rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req);
+               if (rc < 0)
+                       RETURN(rc);
+               LASSERT(rc == 0);
+       }
+       if (req->rq_export && req->rq_ops) {
+               /* Perform a request-specific check. We must do this before
+                * the request is added to the exp_hp_rpcs list, otherwise it
+                * may hit the swab race described in LU-1044. */
+               if (req->rq_ops->hpreq_check) {
+                       rc = req->rq_ops->hpreq_check(req);
+                       /**
+                        * XXX: Out of all current
+                        * ptlrpc_hpreq_ops::hpreq_check(), only
+                        * ldlm_cancel_hpreq_check() can return an error code;
+                        * other functions assert in similar places, which seems
+                        * odd. What also does not seem right is that handlers
+                        * for those RPCs do not assert on the same checks, but
+                        * rather handle the error cases, e.g. see
+                        * ost_rw_hpreq_check(), ost_brw_read() and
+                        * ost_brw_write().
+                        */
+                       if (rc < 0)
+                               RETURN(rc);
+                       LASSERT(rc == 0 || rc == 1);
+               }
+
+               spin_lock_bh(&req->rq_export->exp_rpc_lock);
+               list_add(&req->rq_exp_list,
+                            &req->rq_export->exp_hp_rpcs);
+               spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+       }
+
+       ptlrpc_nrs_req_initialize(svcpt, req, rc);
+
+       RETURN(rc);
+}
+
+/** Remove the request from the export list. */
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
+{
+       ENTRY;
+       if (req->rq_export && req->rq_ops) {
+               /* refresh the lock timeout again so that the client has more
+                * room to send a lock cancel RPC. */
+               if (req->rq_ops->hpreq_fini)
+                       req->rq_ops->hpreq_fini(req);
+
+               spin_lock_bh(&req->rq_export->exp_rpc_lock);
+               list_del_init(&req->rq_exp_list);
+               spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+       }
+       EXIT;
+}
+
+static int ptlrpc_hpreq_check(struct ptlrpc_request *req)
+{
+       return 1;
+}
+
+static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = {
+       .hpreq_check       = ptlrpc_hpreq_check,
+};
+
+/* Hi-Priority RPC check by RPC operation code. */
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req)
+{
+       int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+       /* Check the export so that only reconnects for a not yet evicted
+        * export can become HP RPCs. */
+       if ((req->rq_export != NULL) &&
+           (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT))
+               req->rq_ops = &ptlrpc_hpreq_common;
+
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_hpreq_handler);
+
+static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
+                                    struct ptlrpc_request *req)
+{
+       int     rc;
+       ENTRY;
+
+       rc = ptlrpc_server_hpreq_init(svcpt, req);
+       if (rc < 0)
+               RETURN(rc);
+
+       ptlrpc_nrs_req_add(svcpt, req, !!rc);
+
+       RETURN(0);
+}
+
+/**
+ * Decide whether handling a high priority request is allowed.
+ * May be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_req_lock to get a reliable result.
+ */
+static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt,
+                                    bool force)
+{
+       int running = svcpt->scp_nthrs_running;
+
+       if (!nrs_svcpt_has_hp(svcpt))
+               return false;
+
+       if (force)
+               return true;
+
+       if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+                    CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+               /* leave just 1 thread for normal RPCs */
+               running = PTLRPC_NTHRS_INIT;
+               if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+                       running += 1;
+       }
+
+       if (svcpt->scp_nreqs_active >= running - 1)
+               return false;
+
+       if (svcpt->scp_nhreqs_active == 0)
+               return true;
+
+       return !ptlrpc_nrs_req_pending_nolock(svcpt, false) ||
+              svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio;
+}
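
A standalone sketch (not part of the patch) of the fairness rule in the final return above: after srv_hpreq_ratio consecutive high-priority requests have been fetched, a pending normal request gets a turn even if more HP requests are queued.

#include <stdbool.h>
#include <stdio.h>

static bool allow_next_hp(int hreq_count, int hpreq_ratio, bool normal_pending)
{
        if (!normal_pending)
                return true;            /* nothing else to be fair to */
        return hreq_count < hpreq_ratio;
}

int main(void)
{
        printf("%d %d\n", allow_next_hp(1, 10, true),    /* 1: keep taking HP */
                          allow_next_hp(10, 10, true));  /* 0: give normal a turn */
        return 0;
}
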
+
+static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt,
+                                      bool force)
+{
+       return ptlrpc_server_allow_high(svcpt, force) &&
+              ptlrpc_nrs_req_pending_nolock(svcpt, true);
+}
+
+/**
+ * Only allow normal priority requests on a service that has a high-priority
+ * queue if forced (i.e. cleanup), if there are other high priority requests
+ * already being processed (i.e. those threads can service more high-priority
+ * requests), or if there are enough idle threads that a later thread can do
+ * a high priority request.
+ * May be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_req_lock to get a reliable result.
+ */
+static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt,
+                                      bool force)
+{
+       int running = svcpt->scp_nthrs_running;
+       if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+                    CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+               /* leave just 1 thread for normal RPCs */
+               running = PTLRPC_NTHRS_INIT;
+               if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+                       running += 1;
+       }
+
+       if (force ||
+           svcpt->scp_nreqs_active < running - 2)
+               return true;
+
+       if (svcpt->scp_nreqs_active >= running - 1)
+               return false;
+
+       return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt);
+}
+
+static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt,
+                                        bool force)
+{
+       return ptlrpc_server_allow_normal(svcpt, force) &&
+              ptlrpc_nrs_req_pending_nolock(svcpt, false);
+}
+
+/**
+ * Returns true if there are requests available in the incoming
+ * request queue for processing and it is allowed to fetch them.
+ * May be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_req_lock to get a reliable result.
+ * \see ptlrpc_server_allow_normal
+ * \see ptlrpc_server_allow_high
+ */
+static inline bool
+ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force)
+{
+       return ptlrpc_server_high_pending(svcpt, force) ||
+              ptlrpc_server_normal_pending(svcpt, force);
+}
+
+/**
+ * Fetch a request for processing from the queue of unprocessed requests.
+ * Favors high-priority requests.
+ * Returns a pointer to the fetched request, or NULL if nothing is pending.
+ */
+static struct ptlrpc_request *
+ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force)
+{
+       struct ptlrpc_request *req = NULL;
+       ENTRY;
+
+       spin_lock(&svcpt->scp_req_lock);
+
+       if (ptlrpc_server_high_pending(svcpt, force)) {
+               req = ptlrpc_nrs_req_get_nolock(svcpt, true, force);
+               if (req != NULL) {
+                       svcpt->scp_hreq_count++;
+                       goto got_request;
+               }
+       }
+
+       if (ptlrpc_server_normal_pending(svcpt, force)) {
+               req = ptlrpc_nrs_req_get_nolock(svcpt, false, force);
+               if (req != NULL) {
+                       svcpt->scp_hreq_count = 0;
+                       goto got_request;
+               }
+       }
+
+       spin_unlock(&svcpt->scp_req_lock);
+       RETURN(NULL);
+
+got_request:
+       svcpt->scp_nreqs_active++;
+       if (req->rq_hp)
+               svcpt->scp_nhreqs_active++;
+
+       spin_unlock(&svcpt->scp_req_lock);
+
+       if (likely(req->rq_export))
+               class_export_rpc_inc(req->rq_export);
+
+       RETURN(req);
+}
+
+/**
+ * Handle a freshly incoming request: add it to the timed early reply list
+ * and pass it on to the regular request queue.
+ * All incoming requests pass through here before getting into
+ * ptlrpc_server_handle_request later on.
+ */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
+                           struct ptlrpc_thread *thread)
+{
+       struct ptlrpc_service   *svc = svcpt->scp_service;
+       struct ptlrpc_request   *req;
+       __u32                   deadline;
+       int                     rc;
+       ENTRY;
+
+       spin_lock(&svcpt->scp_lock);
+       if (list_empty(&svcpt->scp_req_incoming)) {
+               spin_unlock(&svcpt->scp_lock);
+               RETURN(0);
+       }
+
+       req = list_entry(svcpt->scp_req_incoming.next,
+                            struct ptlrpc_request, rq_list);
+       list_del_init(&req->rq_list);
+       svcpt->scp_nreqs_incoming--;
+       /* Consider this still a "queued" request as far as stats are
+        * concerned */
+       spin_unlock(&svcpt->scp_lock);
+
+       /* go through security check/transform */
+       rc = sptlrpc_svc_unwrap_request(req);
+       switch (rc) {
+       case SECSVC_OK:
+               break;
+       case SECSVC_COMPLETE:
+               target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+               goto err_req;
+       case SECSVC_DROP:
+               goto err_req;
+       default:
+               LBUG();
+       }
+
+       /*
+        * for a null-flavored rpc, the msg has already been unpacked by
+        * sptlrpc, although redoing it wouldn't be harmful.
+        */
+       if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+               rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen);
+               if (rc != 0) {
+                       CERROR("error unpacking request: ptl %d from %s "
+                              "x"LPU64"\n", svc->srv_req_portal,
+                              libcfs_id2str(req->rq_peer), req->rq_xid);
+                       goto err_req;
+               }
+       }
+
+       rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+       if (rc) {
+               CERROR ("error unpacking ptlrpc body: ptl %d from %s x"
+                       LPU64"\n", svc->srv_req_portal,
+                       libcfs_id2str(req->rq_peer), req->rq_xid);
+               goto err_req;
+       }
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) &&
+           lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) {
+               CERROR("drop incoming rpc opc %u, x"LPU64"\n",
+                      cfs_fail_val, req->rq_xid);
+               goto err_req;
+       }
+
+       rc = -EINVAL;
+       if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+               CERROR("wrong packet type received (type=%u) from %s\n",
+                      lustre_msg_get_type(req->rq_reqmsg),
+                      libcfs_id2str(req->rq_peer));
+               goto err_req;
+       }
+
+       switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+       case MDS_WRITEPAGE:
+       case OST_WRITE:
+               req->rq_bulk_write = 1;
+               break;
+       case MDS_READPAGE:
+       case OST_READ:
+       case MGS_CONFIG_READ:
+               req->rq_bulk_read = 1;
+               break;
+       }
+
+       CDEBUG(D_RPCTRACE, "got req x"LPU64"\n", req->rq_xid);
+
+       req->rq_export = class_conn2export(
+               lustre_msg_get_handle(req->rq_reqmsg));
+       if (req->rq_export) {
+               rc = ptlrpc_check_req(req);
+               if (rc == 0) {
+                       rc = sptlrpc_target_export_check(req->rq_export, req);
+                       if (rc)
+                               DEBUG_REQ(D_ERROR, req, "DROPPING req with "
+                                         "illegal security flavor,");
+               }
+
+               if (rc)
+                       goto err_req;
+               ptlrpc_update_export_timer(req->rq_export, 0);
+       }
+
+       /* req_in handling should/must be fast */
+       if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5)
+               DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s",
+                         cfs_time_sub(cfs_time_current_sec(),
+                                      req->rq_arrival_time.tv_sec));
+
+       /* Set rpc server deadline and add it to the timed list */
+       deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
+                   MSGHDR_AT_SUPPORT) ?
+                  /* The max time the client expects us to take */
+                  lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+       req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+       if (unlikely(deadline == 0)) {
+               DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
+               goto err_req;
+       }
+
+       req->rq_svc_thread = thread;
+
+       ptlrpc_at_add_timed(req);
+
+       /* Move it over to the request processing queue */
+       rc = ptlrpc_server_request_add(svcpt, req);
+       if (rc)
+               GOTO(err_req, rc);
+
+       wake_up(&svcpt->scp_waitq);
+       RETURN(1);
+
+err_req:
+       ptlrpc_server_finish_request(svcpt, req);
+
+       RETURN(1);
+}
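
A standalone sketch (not part of the patch) of the deadline selection above: clients that advertise MSGHDR_AT_SUPPORT send their own timeout in the message header, otherwise the static obd_timeout is used.

#include <stdio.h>

/* Pick the server-side deadline for a request that arrived at arrival_sec. */
static long server_deadline(long arrival_sec, int client_supports_at,
                            long client_timeout, long obd_timeout)
{
        long timeout = client_supports_at ? client_timeout : obd_timeout;

        return arrival_sec + timeout;   /* timeout == 0 is rejected by the caller */
}

int main(void)
{
        printf("%ld %ld\n",
               server_deadline(1000, 1, 43, 100),       /* AT client:  1043 */
               server_deadline(1000, 0, 43, 100));      /* old client: 1100 */
        return 0;
}
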
+
+/**
+ * Main incoming request handling logic.
+ * Calls the handler function from the service to do the actual processing.
+ */
+static int
+ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
+                            struct ptlrpc_thread *thread)
+{
+       struct ptlrpc_service *svc = svcpt->scp_service;
+       struct ptlrpc_request *request;
+       struct timeval   work_start;
+       struct timeval   work_end;
+       long               timediff;
+       int                 rc;
+       int                 fail_opc = 0;
+       ENTRY;
+
+       request = ptlrpc_server_request_get(svcpt, false);
+       if (request == NULL)
+               RETURN(0);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
+               fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
+       else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
+               fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;
+
+       if (unlikely(fail_opc)) {
+               if (request->rq_export && request->rq_ops)
+                       OBD_FAIL_TIMEOUT(fail_opc, 4);
+       }
+
+       ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
+               libcfs_debug_dumplog();
+
+       do_gettimeofday(&work_start);
+       timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL);
+       if (likely(svc->srv_stats != NULL)) {
+               lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
+                                   timediff);
+               lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
+                                   svcpt->scp_nreqs_incoming);
+               lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
+                                   svcpt->scp_nreqs_active);
+               lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
+                                   at_get(&svcpt->scp_at_estimate));
+       }
+
+       rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF);
+       if (rc) {
+               CERROR("Failure to initialize session: %d\n", rc);
+               goto out_req;
+       }
+       request->rq_session.lc_thread = thread;
+       request->rq_session.lc_cookie = 0x5;
+       lu_context_enter(&request->rq_session);
+
+       CDEBUG(D_NET, "got req "LPU64"\n", request->rq_xid);
+
+       request->rq_svc_thread = thread;
+       if (thread)
+               request->rq_svc_thread->t_env->le_ses = &request->rq_session;
+
+       if (likely(request->rq_export)) {
+               if (unlikely(ptlrpc_check_req(request)))
+                       goto put_conn;
+               ptlrpc_update_export_timer(request->rq_export, timediff >> 19);
+       }
+
+       /* Discard requests queued for longer than the deadline.
+          The deadline is increased if we send an early reply. */
+       if (cfs_time_current_sec() > request->rq_deadline) {
+               DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s"
+                         ": deadline "CFS_DURATION_T":"CFS_DURATION_T"s ago\n",
+                         libcfs_id2str(request->rq_peer),
+                         cfs_time_sub(request->rq_deadline,
+                         request->rq_arrival_time.tv_sec),
+                         cfs_time_sub(cfs_time_current_sec(),
+                         request->rq_deadline));
+               goto put_conn;
+       }
+
+       CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
+              "%s:%s+%d:%d:x"LPU64":%s:%d\n", current_comm(),
+              (request->rq_export ?
+               (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+              (request->rq_export ?
+               atomic_read(&request->rq_export->exp_refcount) : -99),
+              lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
+              libcfs_id2str(request->rq_peer),
+              lustre_msg_get_opc(request->rq_reqmsg));
+
+       if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
+               CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
+
+       rc = svc->srv_ops.so_req_handler(request);
+
+       ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);
+
+put_conn:
+       lu_context_exit(&request->rq_session);
+       lu_context_fini(&request->rq_session);
+
+       if (unlikely(cfs_time_current_sec() > request->rq_deadline)) {
+                    DEBUG_REQ(D_WARNING, request, "Request took longer "
+                              "than estimated ("CFS_DURATION_T":"CFS_DURATION_T"s);"
+                              " client may timeout.",
+                              cfs_time_sub(request->rq_deadline,
+                                           request->rq_arrival_time.tv_sec),
+                              cfs_time_sub(cfs_time_current_sec(),
+                                           request->rq_deadline));
+       }
+
+       do_gettimeofday(&work_end);
+       timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+       CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
+              "%s:%s+%d:%d:x"LPU64":%s:%d Request processed in "
+              "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+               current_comm(),
+               (request->rq_export ?
+                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+               (request->rq_export ?
+                atomic_read(&request->rq_export->exp_refcount) : -99),
+               lustre_msg_get_status(request->rq_reqmsg),
+               request->rq_xid,
+               libcfs_id2str(request->rq_peer),
+               lustre_msg_get_opc(request->rq_reqmsg),
+               timediff,
+               cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
+               (request->rq_repmsg ?
+                lustre_msg_get_transno(request->rq_repmsg) :
+                request->rq_transno),
+               request->rq_status,
+               (request->rq_repmsg ?
+                lustre_msg_get_status(request->rq_repmsg) : -999));
+       if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
+               __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
+               int opc = opcode_offset(op);
+               if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
+                       LASSERT(opc < LUSTRE_MAX_OPCODES);
+                       lprocfs_counter_add(svc->srv_stats,
+                                           opc + EXTRA_MAX_OPCODES,
+                                           timediff);
+               }
+       }
+       if (unlikely(request->rq_early_count)) {
+               DEBUG_REQ(D_ADAPTTO, request,
+                         "sent %d early replies before finishing in "
+                         CFS_DURATION_T"s",
+                         request->rq_early_count,
+                         cfs_time_sub(work_end.tv_sec,
+                         request->rq_arrival_time.tv_sec));
+       }
+
+out_req:
+       ptlrpc_server_finish_active_request(svcpt, request);
+
+       RETURN(1);
+}
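
The "timediff >> 19" passed to ptlrpc_update_export_timer() above is a cheap unit conversion; a small sketch (not part of the patch) of the arithmetic:

#include <stdio.h>

int main(void)
{
        long timediff_us = 1500000;     /* request waited ~1.5s in the queue */

        /* >> 19 divides by 524288, a power-of-two stand-in for ~half a
         * million, so the result is the wait in rough half-second units;
         * here it prints 2. */
        printf("extra_delay = %ld\n", timediff_us >> 19);
        return 0;
}
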
+
+/**
+ * An internal function to process a single reply state object.
+ */
+static int
+ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+       struct ptlrpc_service     *svc = svcpt->scp_service;
+       struct obd_export        *exp;
+       int                     nlocks;
+       int                     been_handled;
+       ENTRY;
+
+       exp = rs->rs_export;
+
+       LASSERT (rs->rs_difficult);
+       LASSERT (rs->rs_scheduled);
+       LASSERT (list_empty(&rs->rs_list));
+
+       spin_lock(&exp->exp_lock);
+       /* Noop if removed already */
+       list_del_init (&rs->rs_exp_list);
+       spin_unlock(&exp->exp_lock);
+
+       /* The disk commit callback holds exp_uncommitted_replies_lock while it
+        * iterates over newly committed replies, removing them from
+        * exp_uncommitted_replies.  It then drops this lock and schedules the
+        * replies it found for handling here.
+        *
+        * We can avoid contention for exp_uncommitted_replies_lock between the
+        * HRT threads and further commit callbacks by checking rs_committed
+        * which is set in the commit callback while it holds both
+        * rs_lock and exp_uncommitted_replies_lock.
+        *
+        * If we see rs_committed clear, the commit callback _may_ not have
+        * handled this reply yet and we race with it to grab
+        * exp_uncommitted_replies_lock before removing the reply from
+        * exp_uncommitted_replies.  Note that if we lose the race and the
+        * reply has already been removed, list_del_init() is a noop.
+        *
+        * If we see rs_committed set, we know the commit callback is handling,
+        * or has handled this reply since store reordering might allow us to
+        * see rs_committed set out of sequence.  But since this is done
+        * holding rs_lock, we can be sure it has all completed once we hold
+        * rs_lock, which we do right next.
+        */
+       if (!rs->rs_committed) {
+               spin_lock(&exp->exp_uncommitted_replies_lock);
+               list_del_init(&rs->rs_obd_list);
+               spin_unlock(&exp->exp_uncommitted_replies_lock);
+       }
+
+       spin_lock(&rs->rs_lock);
+
+       been_handled = rs->rs_handled;
+       rs->rs_handled = 1;
+
+       nlocks = rs->rs_nlocks;          /* atomic "steal", but */
+       rs->rs_nlocks = 0;                    /* locks still on rs_locks! */
+
+       if (nlocks == 0 && !been_handled) {
+               /* If we see this, we should already have seen the warning
+                * in mds_steal_ack_locks()  */
+               CDEBUG(D_HA, "All locks stolen from rs %p x"LPD64".t"LPD64
+                      " o%d NID %s\n",
+                      rs,
+                      rs->rs_xid, rs->rs_transno, rs->rs_opc,
+                      libcfs_nid2str(exp->exp_connection->c_peer.nid));
+       }
+
+       if ((!been_handled && rs->rs_on_net) || nlocks > 0) {
+               spin_unlock(&rs->rs_lock);
+
+               if (!been_handled && rs->rs_on_net) {
+                       LNetMDUnlink(rs->rs_md_h);
+                       /* Ignore return code; we're racing with completion */
+               }
+
+               while (nlocks-- > 0)
+                       ldlm_lock_decref(&rs->rs_locks[nlocks],
+                                        rs->rs_modes[nlocks]);
+
+               spin_lock(&rs->rs_lock);
+       }
+
+       rs->rs_scheduled = 0;
+
+       if (!rs->rs_on_net) {
+               /* Off the net */
+               spin_unlock(&rs->rs_lock);
+
+               class_export_put (exp);
+               rs->rs_export = NULL;
+               ptlrpc_rs_decref (rs);
+               if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) &&
+                   svc->srv_is_stopping)
+                       wake_up_all(&svcpt->scp_waitq);
+               RETURN(1);
+       }
+
+       /* still on the net; callback will schedule */
+       spin_unlock(&rs->rs_lock);
+       RETURN(1);
+}
+
+
+static void
+ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt)
+{
+       int avail = svcpt->scp_nrqbds_posted;
+       int low_water = test_req_buffer_pressure ? 0 :
+                       svcpt->scp_service->srv_nbuf_per_group / 2;
+
+       /* NB I'm not locking; just looking. */
+
+       /* CAVEAT EMPTOR: We might be allocating buffers here because we've
+        * allowed the request history to grow out of control.  We could put a
+        * sanity check on that here and cull some history if we need the
+        * space. */
+
+       if (avail <= low_water)
+               ptlrpc_grow_req_bufs(svcpt, 1);
+
+       if (svcpt->scp_service->srv_stats) {
+               lprocfs_counter_add(svcpt->scp_service->srv_stats,
+                                   PTLRPC_REQBUF_AVAIL_CNTR, avail);
+       }
+}
+
+static int
+ptlrpc_retry_rqbds(void *arg)
+{
+       struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg;
+
+       svcpt->scp_rqbd_timeout = 0;
+       return -ETIMEDOUT;
+}
+
+static inline int
+ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt)
+{
+       return svcpt->scp_nreqs_active <
+              svcpt->scp_nthrs_running - 1 -
+              (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL);
+}
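
A standalone sketch (not part of the patch) of the reservation rule above: one running thread is always held back from request handling, plus one more when the service registers a high-priority request handler.

#include <stdbool.h>
#include <stdio.h>

static bool threads_enough(int nreqs_active, int nthrs_running, bool has_hp)
{
        return nreqs_active < nthrs_running - 1 - (has_hp ? 1 : 0);
}

int main(void)
{
        /* 2 active on 4 threads with an HP handler: not enough, grow */
        printf("%d\n", threads_enough(2, 4, true));     /* prints 0 */
        return 0;
}
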
+
+/**
+ * Check whether we are allowed to create more threads.
+ * May be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_lock to get a reliable result.
+ */
+static inline int
+ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt)
+{
+       return svcpt->scp_nthrs_running +
+              svcpt->scp_nthrs_starting <
+              svcpt->scp_service->srv_nthrs_cpt_limit;
+}
+
+/**
+ * Check whether there are too many requests and more threads may be created.
+ */
+static inline int
+ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
+{
+       return !ptlrpc_threads_enough(svcpt) &&
+               ptlrpc_threads_increasable(svcpt);
+}
+
+static inline int
+ptlrpc_thread_stopping(struct ptlrpc_thread *thread)
+{
+       return thread_is_stopping(thread) ||
+              thread->t_svcpt->scp_service->srv_is_stopping;
+}
+
+static inline int
+ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt)
+{
+       return !list_empty(&svcpt->scp_rqbd_idle) &&
+              svcpt->scp_rqbd_timeout == 0;
+}
+
+static inline int
+ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
+{
+       return svcpt->scp_at_check;
+}
+
+/**
+ * Check whether requests are waiting for preprocessing.
+ * May be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_lock to get a reliable result.
+ */
+static inline int
+ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt)
+{
+       return !list_empty(&svcpt->scp_req_incoming);
+}
+
+static __attribute__((__noinline__)) int
+ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
+                 struct ptlrpc_thread *thread)
+{
+       /* Don't exit while there are replies to be handled */
+       struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout,
+                                            ptlrpc_retry_rqbds, svcpt);
+
+       lc_watchdog_disable(thread->t_watchdog);
+
+       cond_resched();
+
+       l_wait_event_exclusive_head(svcpt->scp_waitq,
+                               ptlrpc_thread_stopping(thread) ||
+                               ptlrpc_server_request_incoming(svcpt) ||
+                               ptlrpc_server_request_pending(svcpt, false) ||
+                               ptlrpc_rqbd_pending(svcpt) ||
+                               ptlrpc_at_check(svcpt), &lwi);
+
+       if (ptlrpc_thread_stopping(thread))
+               return -EINTR;
+
+       lc_watchdog_touch(thread->t_watchdog,
+                         ptlrpc_server_get_timeout(svcpt));
+       return 0;
+}
+
+/**
+ * Main thread body for service threads.
+ * Waits in a loop for new requests to process to appear.
+ * Every time an incoming request is added to the queue, the waitq
+ * is woken up and one of the threads will handle it.
+ */
+static int ptlrpc_main(void *arg)
+{
+       struct ptlrpc_thread            *thread = (struct ptlrpc_thread *)arg;
+       struct ptlrpc_service_part      *svcpt = thread->t_svcpt;
+       struct ptlrpc_service           *svc = svcpt->scp_service;
+       struct ptlrpc_reply_state       *rs;
+#ifdef WITH_GROUP_INFO
+       group_info_t *ginfo = NULL;
+#endif
+       struct lu_env *env;
+       int counter = 0, rc = 0;
+       ENTRY;
+
+       thread->t_pid = current_pid();
+       unshare_fs_struct();
+
+       /* NB: we will call cfs_cpt_bind() for all threads, because we
+        * might want to run lustre server only on a subset of system CPUs,
+        * in that case ->scp_cpt is CFS_CPT_ANY */
+       rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
+       if (rc != 0) {
+               CWARN("%s: failed to bind %s on CPT %d\n",
+                     svc->srv_name, thread->t_name, svcpt->scp_cpt);
+       }
+
+#ifdef WITH_GROUP_INFO
+       ginfo = groups_alloc(0);
+       if (!ginfo) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       set_current_groups(ginfo);
+       put_group_info(ginfo);
+#endif
+
+       if (svc->srv_ops.so_thr_init != NULL) {
+               rc = svc->srv_ops.so_thr_init(thread);
+               if (rc)
+                       goto out;
+       }
+
+       OBD_ALLOC_PTR(env);
+       if (env == NULL) {
+               rc = -ENOMEM;
+               goto out_srv_fini;
+       }
+
+       rc = lu_context_init(&env->le_ctx,
+                            svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF);
+       if (rc)
+               goto out_srv_fini;
+
+       thread->t_env = env;
+       env->le_ctx.lc_thread = thread;
+       env->le_ctx.lc_cookie = 0x6;
+
+       while (!list_empty(&svcpt->scp_rqbd_idle)) {
+               rc = ptlrpc_server_post_idle_rqbds(svcpt);
+               if (rc >= 0)
+                       continue;
+
+               CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
+                       svc->srv_name, svcpt->scp_cpt, rc);
+               goto out_srv_fini;
+       }
+
+       /* Alloc reply state structure for this one */
+       OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size);
+       if (!rs) {
+               rc = -ENOMEM;
+               goto out_srv_fini;
+       }
+
+       spin_lock(&svcpt->scp_lock);
+
+       LASSERT(thread_is_starting(thread));
+       thread_clear_flags(thread, SVC_STARTING);
+
+       LASSERT(svcpt->scp_nthrs_starting == 1);
+       svcpt->scp_nthrs_starting--;
+
+       /* SVC_STOPPING may already be set here if someone else is trying
+        * to stop the service while this new thread has been dynamically
+        * forked. We still set SVC_RUNNING to let our creator know that
+        * we are now running; however, we will exit as soon as possible */
+       thread_add_flags(thread, SVC_RUNNING);
+       svcpt->scp_nthrs_running++;
+       spin_unlock(&svcpt->scp_lock);
+
+       /* wake up our creator in case he's still waiting. */
+       wake_up(&thread->t_ctl_waitq);
+
+       thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt),
+                                            NULL, NULL);
+
+       spin_lock(&svcpt->scp_rep_lock);
+       list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+       wake_up(&svcpt->scp_rep_waitq);
+       spin_unlock(&svcpt->scp_rep_lock);
+
+       CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
+              svcpt->scp_nthrs_running);
+
+       /* XXX maintain a list of all managed devices: insert here */
+       while (!ptlrpc_thread_stopping(thread)) {
+               if (ptlrpc_wait_event(svcpt, thread))
+                       break;
+
+               ptlrpc_check_rqbd_pool(svcpt);
+
+               if (ptlrpc_threads_need_create(svcpt)) {
+                       /* Ignore return code - we tried... */
+                       ptlrpc_start_thread(svcpt, 0);
+               }
+
+               /* Process all incoming reqs before handling any */
+               if (ptlrpc_server_request_incoming(svcpt)) {
+                       lu_context_enter(&env->le_ctx);
+                       ptlrpc_server_handle_req_in(svcpt, thread);
+                       lu_context_exit(&env->le_ctx);
+
+                       /* but limit ourselves in case of flood */
+                       if (counter++ < 100)
+                               continue;
+                       counter = 0;
+               }
+
+               if (ptlrpc_at_check(svcpt))
+                       ptlrpc_at_check_timed(svcpt);
+
+               if (ptlrpc_server_request_pending(svcpt, false)) {
+                       lu_context_enter(&env->le_ctx);
+                       ptlrpc_server_handle_request(svcpt, thread);
+                       lu_context_exit(&env->le_ctx);
+               }
+
+               if (ptlrpc_rqbd_pending(svcpt) &&
+                   ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
+                       /* I just failed to repost request buffers.
+                        * Wait for a timeout (unless something else
+                        * happens) before I try again */
+                       svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
+                       CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
+                              svcpt->scp_nrqbds_posted);
+               }
+       }
+
+       lc_watchdog_delete(thread->t_watchdog);
+       thread->t_watchdog = NULL;
+
+out_srv_fini:
+       /*
+        * deconstruct service specific state created by ptlrpc_start_thread()
+        */
+       if (svc->srv_ops.so_thr_done != NULL)
+               svc->srv_ops.so_thr_done(thread);
+
+       if (env != NULL) {
+               lu_context_fini(&env->le_ctx);
+               OBD_FREE_PTR(env);
+       }
+out:
+       CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n",
+              thread, thread->t_pid, thread->t_id, rc);
+
+       spin_lock(&svcpt->scp_lock);
+       if (thread_test_and_clear_flags(thread, SVC_STARTING))
+               svcpt->scp_nthrs_starting--;
+
+       if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
+               /* must know immediately */
+               svcpt->scp_nthrs_running--;
+       }
+
+       thread->t_id = rc;
+       thread_add_flags(thread, SVC_STOPPED);
+
+       wake_up(&thread->t_ctl_waitq);
+       spin_unlock(&svcpt->scp_lock);
+
+       return rc;
+}
+
+static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt,
+                         struct list_head *replies)
+{
+       int result;
+
+       spin_lock(&hrt->hrt_lock);
+
+       list_splice_init(&hrt->hrt_queue, replies);
+       result = ptlrpc_hr.hr_stopping || !list_empty(replies);
+
+       spin_unlock(&hrt->hrt_lock);
+       return result;
+}
+
+/**
+ * Main body of "handle reply" function.
+ * It processes acknowledged reply states.
+ */
+static int ptlrpc_hr_main(void *arg)
+{
+       struct ptlrpc_hr_thread         *hrt = (struct ptlrpc_hr_thread *)arg;
+       struct ptlrpc_hr_partition      *hrp = hrt->hrt_partition;
+       LIST_HEAD                       (replies);
+       char                            threadname[20];
+       int                             rc;
+
+       snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d",
+                hrp->hrp_cpt, hrt->hrt_id);
+       unshare_fs_struct();
+
+       rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt);
+       if (rc != 0) {
+               CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
+                     threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc);
+       }
+
+       atomic_inc(&hrp->hrp_nstarted);
+       wake_up(&ptlrpc_hr.hr_waitq);
+
+       while (!ptlrpc_hr.hr_stopping) {
+               l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies));
+
+               while (!list_empty(&replies)) {
+                       struct ptlrpc_reply_state *rs;
+
+                       rs = list_entry(replies.prev,
+                                           struct ptlrpc_reply_state,
+                                           rs_list);
+                       list_del_init(&rs->rs_list);
+                       ptlrpc_handle_rs(rs);
+               }
+       }
+
+       atomic_inc(&hrp->hrp_nstopped);
+       wake_up(&ptlrpc_hr.hr_waitq);
+
+       return 0;
+}
+
+static void ptlrpc_stop_hr_threads(void)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       int                             i;
+       int                             j;
+
+       ptlrpc_hr.hr_stopping = 1;
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               if (hrp->hrp_thrs == NULL)
+                       continue; /* uninitialized */
+               for (j = 0; j < hrp->hrp_nthrs; j++)
+                       wake_up_all(&hrp->hrp_thrs[j].hrt_waitq);
+       }
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               if (hrp->hrp_thrs == NULL)
+                       continue; /* uninitialized */
+               wait_event(ptlrpc_hr.hr_waitq,
+                              atomic_read(&hrp->hrp_nstopped) ==
+                              atomic_read(&hrp->hrp_nstarted));
+       }
+}
+
+static int ptlrpc_start_hr_threads(void)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       int                             i;
+       int                             j;
+       ENTRY;
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               int     rc = 0;
+
+               for (j = 0; j < hrp->hrp_nthrs; j++) {
+                       struct  ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j];
+                       rc = PTR_ERR(kthread_run(ptlrpc_hr_main,
+                                                &hrp->hrp_thrs[j],
+                                                "ptlrpc_hr%02d_%03d",
+                                                hrp->hrp_cpt,
+                                                hrt->hrt_id));
+                       if (IS_ERR_VALUE(rc))
+                               break;
+               }
+               wait_event(ptlrpc_hr.hr_waitq,
+                              atomic_read(&hrp->hrp_nstarted) == j);
+               if (!IS_ERR_VALUE(rc))
+                       continue;
+
+               CERROR("Reply handling thread %d:%d failed to start: "
+                      "rc = %d\n", i, j, rc);
+               ptlrpc_stop_hr_threads();
+               RETURN(rc);
+       }
+       RETURN(0);
+}
+
+static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
+{
+       struct l_wait_info      lwi = { 0 };
+       struct ptlrpc_thread    *thread;
+       LIST_HEAD               (zombie);
+
+       ENTRY;
+
+       CDEBUG(D_INFO, "Stopping threads for service %s\n",
+              svcpt->scp_service->srv_name);
+
+       spin_lock(&svcpt->scp_lock);
+       /* let the thread know that we would like it to stop asap */
+       list_for_each_entry(thread, &svcpt->scp_threads, t_link) {
+               CDEBUG(D_INFO, "Stopping thread %s #%u\n",
+                      svcpt->scp_service->srv_thread_name, thread->t_id);
+               thread_add_flags(thread, SVC_STOPPING);
+       }
+
+       wake_up_all(&svcpt->scp_waitq);
+
+       while (!list_empty(&svcpt->scp_threads)) {
+               thread = list_entry(svcpt->scp_threads.next,
+                                       struct ptlrpc_thread, t_link);
+               if (thread_is_stopped(thread)) {
+                       list_move(&thread->t_link, &zombie);
+                       continue;
+               }
+               spin_unlock(&svcpt->scp_lock);
+
+               CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
+                      svcpt->scp_service->srv_thread_name, thread->t_id);
+               l_wait_event(thread->t_ctl_waitq,
+                            thread_is_stopped(thread), &lwi);
+
+               spin_lock(&svcpt->scp_lock);
+       }
+
+       spin_unlock(&svcpt->scp_lock);
+
+       while (!list_empty(&zombie)) {
+               thread = list_entry(zombie.next,
+                                       struct ptlrpc_thread, t_link);
+               list_del(&thread->t_link);
+               OBD_FREE_PTR(thread);
+       }
+       EXIT;
+}
+
+/**
+ * Stops all threads of a particular service \a svc
+ */
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part *svcpt;
+       int                        i;
+       ENTRY;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service != NULL)
+                       ptlrpc_svcpt_stop_threads(svcpt);
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_stop_all_threads);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc)
+{
+       int     rc = 0;
+       int     i;
+       int     j;
+       ENTRY;
+
+       /* We require 2 threads min, see note in ptlrpc_server_handle_request */
+       LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT);
+
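+       /* start srv_nthrs_cpt_init threads on every CPU partition */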
+       for (i = 0; i < svc->srv_ncpts; i++) {
+               for (j = 0; j < svc->srv_nthrs_cpt_init; j++) {
+                       rc = ptlrpc_start_thread(svc->srv_parts[i], 1);
+                       if (rc == 0)
+                               continue;
+
+                       if (rc != -EMFILE)
+                               goto failed;
+                       /* We have enough threads, don't start more. b=15759 */
+                       break;
+               }
+       }
+
+       RETURN(0);
+ failed:
+       CERROR("cannot start %s thread #%d_%d: rc %d\n",
+              svc->srv_thread_name, i, j, rc);
+       ptlrpc_stop_all_threads(svc);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_start_threads);
+
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
+{
+       struct l_wait_info      lwi = { 0 };
+       struct ptlrpc_thread    *thread;
+       struct ptlrpc_service   *svc;
+       int                     rc;
+       ENTRY;
+
+       LASSERT(svcpt != NULL);
+
+       svc = svcpt->scp_service;
+
+       CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
+              svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
+              svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);
+
+ again:
+       if (unlikely(svc->srv_is_stopping))
+               RETURN(-ESRCH);
+
+       if (!ptlrpc_threads_increasable(svcpt) ||
+           (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
+            svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
+               RETURN(-EMFILE);
+
+       OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt);
+       if (thread == NULL)
+               RETURN(-ENOMEM);
+       init_waitqueue_head(&thread->t_ctl_waitq);
+
+       spin_lock(&svcpt->scp_lock);
+       if (!ptlrpc_threads_increasable(svcpt)) {
+               spin_unlock(&svcpt->scp_lock);
+               OBD_FREE_PTR(thread);
+               RETURN(-EMFILE);
+       }
+
+       if (svcpt->scp_nthrs_starting != 0) {
+               /* serialize starting because some modules (obdfilter)
+                * might require unique and contiguous t_id */
+               LASSERT(svcpt->scp_nthrs_starting == 1);
+               spin_unlock(&svcpt->scp_lock);
+               OBD_FREE_PTR(thread);
+               if (wait) {
+                       CDEBUG(D_INFO, "Waiting for thread %s #%d to be created\n",
+                              svc->srv_thread_name, svcpt->scp_thr_nextid);
+                       schedule();
+                       goto again;
+               }
+
+               CDEBUG(D_INFO, "Thread %s #%d creation raced, retry later\n",
+                      svc->srv_thread_name, svcpt->scp_thr_nextid);
+               RETURN(-EAGAIN);
+       }
+
+       svcpt->scp_nthrs_starting++;
+       thread->t_id = svcpt->scp_thr_nextid++;
+       thread_add_flags(thread, SVC_STARTING);
+       thread->t_svcpt = svcpt;
+
+       list_add(&thread->t_link, &svcpt->scp_threads);
+       spin_unlock(&svcpt->scp_lock);
+
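+       /* per-CPT services encode the partition number in the thread name */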
+       if (svcpt->scp_cpt >= 0) {
+               snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d",
+                        svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
+       } else {
+               snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d",
+                        svc->srv_thread_name, thread->t_id);
+       }
+
+       CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
+       rc = PTR_ERR(kthread_run(ptlrpc_main, thread, thread->t_name));
+       if (IS_ERR_VALUE(rc)) {
+               CERROR("cannot start thread '%s': rc %d\n",
+                      thread->t_name, rc);
+               spin_lock(&svcpt->scp_lock);
+               list_del(&thread->t_link);
+               --svcpt->scp_nthrs_starting;
+               spin_unlock(&svcpt->scp_lock);
+
+               OBD_FREE(thread, sizeof(*thread));
+               RETURN(rc);
+       }
+
+       if (!wait)
+               RETURN(0);
+
+       l_wait_event(thread->t_ctl_waitq,
+                    thread_is_running(thread) || thread_is_stopped(thread),
+                    &lwi);
+
+       rc = thread_is_stopped(thread) ? thread->t_id : 0;
+       RETURN(rc);
+}
+
+int ptlrpc_hr_init(void)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       struct ptlrpc_hr_thread         *hrt;
+       int                             rc;
+       int                             i;
+       int                             j;
+       ENTRY;
+
+       memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr));
+       ptlrpc_hr.hr_cpt_table = cfs_cpt_table;
+
+       ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table,
+                                                  sizeof(*hrp));
+       if (ptlrpc_hr.hr_partitions == NULL)
+               RETURN(-ENOMEM);
+
+       init_waitqueue_head(&ptlrpc_hr.hr_waitq);
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               hrp->hrp_cpt = i;
+
+               atomic_set(&hrp->hrp_nstarted, 0);
+               atomic_set(&hrp->hrp_nstopped, 0);
+
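+               /* one reply handler thread per physical core in this partition */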
+               hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i);
+               hrp->hrp_nthrs /= cfs_cpu_ht_nsiblings(0);
+
+               LASSERT(hrp->hrp_nthrs > 0);
+               OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i,
+                             hrp->hrp_nthrs * sizeof(*hrt));
+               if (hrp->hrp_thrs == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               for (j = 0; j < hrp->hrp_nthrs; j++) {
+                       hrt = &hrp->hrp_thrs[j];
+
+                       hrt->hrt_id = j;
+                       hrt->hrt_partition = hrp;
+                       init_waitqueue_head(&hrt->hrt_waitq);
+                       spin_lock_init(&hrt->hrt_lock);
+                       INIT_LIST_HEAD(&hrt->hrt_queue);
+               }
+       }
+
+       rc = ptlrpc_start_hr_threads();
+out:
+       if (rc != 0)
+               ptlrpc_hr_fini();
+       RETURN(rc);
+}
+
+void ptlrpc_hr_fini(void)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       int                             i;
+
+       if (ptlrpc_hr.hr_partitions == NULL)
+               return;
+
+       ptlrpc_stop_hr_threads();
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               if (hrp->hrp_thrs != NULL) {
+                       OBD_FREE(hrp->hrp_thrs,
+                                hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0]));
+               }
+       }
+
+       cfs_percpt_free(ptlrpc_hr.hr_partitions);
+       ptlrpc_hr.hr_partitions = NULL;
+}
+
+
+/**
+ * Wait until all already scheduled replies are processed.
+ */
+static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
+{
+       while (1) {
+               int rc;
+               struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10),
+                                                    NULL, NULL);
+
+               rc = l_wait_event(svcpt->scp_waitq,
+                                 atomic_read(&svcpt->scp_nreps_difficult) == 0,
+                                 &lwi);
+               if (rc == 0)
+                       break;
+               CWARN("Unexpectedly long timeout %s %p\n",
+                     svcpt->scp_service->srv_name, svcpt->scp_service);
+       }
+}
+
+static void
+ptlrpc_service_del_atimer(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part      *svcpt;
+       int                             i;
+
+       /* disarm the adaptive timeout (AT) timer early... */
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service != NULL)
+                       cfs_timer_disarm(&svcpt->scp_at_timer);
+       }
+}
+
+static void
+ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part        *svcpt;
+       struct ptlrpc_request_buffer_desc *rqbd;
+       struct l_wait_info                lwi;
+       int                               rc;
+       int                               i;
+
+       /* All history will be culled when the next request buffer is
+        * freed in ptlrpc_service_purge_all() */
+       svc->srv_hist_nrqbds_cpt_max = 0;
+
+       rc = LNetClearLazyPortal(svc->srv_req_portal);
+       LASSERT(rc == 0);
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service == NULL)
+                       break;
+
+               /* Unlink all the request buffers.  This forces a 'final'
+                * event with its 'unlink' flag set for each posted rqbd */
+               list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
+                                       rqbd_list) {
+                       rc = LNetMDUnlink(rqbd->rqbd_md_h);
+                       LASSERT(rc == 0 || rc == -ENOENT);
+               }
+       }
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service == NULL)
+                       break;
+
+               /* Wait for the network to release any buffers
+                * it's currently filling */
+               spin_lock(&svcpt->scp_lock);
+               while (svcpt->scp_nrqbds_posted != 0) {
+                       spin_unlock(&svcpt->scp_lock);
+                       /* Network access will complete in finite time but
+                        * the HUGE timeout lets us CWARN for visibility
+                        * of sluggish NALs */
+                       lwi = LWI_TIMEOUT_INTERVAL(
+                                       cfs_time_seconds(LONG_UNLINK),
+                                       cfs_time_seconds(1), NULL, NULL);
+                       rc = l_wait_event(svcpt->scp_waitq,
+                                         svcpt->scp_nrqbds_posted == 0, &lwi);
+                       if (rc == -ETIMEDOUT) {
+                               CWARN("Service %s waiting for request buffers\n",
+                                     svcpt->scp_service->srv_name);
+                       }
+                       spin_lock(&svcpt->scp_lock);
+               }
+               spin_unlock(&svcpt->scp_lock);
+       }
+}
+
+static void
+ptlrpc_service_purge_all(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part              *svcpt;
+       struct ptlrpc_request_buffer_desc       *rqbd;
+       struct ptlrpc_request                   *req;
+       struct ptlrpc_reply_state               *rs;
+       int                                     i;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service == NULL)
+                       break;
+
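+               /* queue all still-active difficult replies for final handling */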
+               spin_lock(&svcpt->scp_rep_lock);
+               while (!list_empty(&svcpt->scp_rep_active)) {
+                       rs = list_entry(svcpt->scp_rep_active.next,
+                                           struct ptlrpc_reply_state, rs_list);
+                       spin_lock(&rs->rs_lock);
+                       ptlrpc_schedule_difficult_reply(rs);
+                       spin_unlock(&rs->rs_lock);
+               }
+               spin_unlock(&svcpt->scp_rep_lock);
+
+               /* purge the request queue.  NB No new replies (rqbds
+                * all unlinked) and no service threads, so I'm the only
+                * thread noodling the request queue now */
+               while (!list_empty(&svcpt->scp_req_incoming)) {
+                       req = list_entry(svcpt->scp_req_incoming.next,
+                                            struct ptlrpc_request, rq_list);
+
+                       list_del(&req->rq_list);
+                       svcpt->scp_nreqs_incoming--;
+                       ptlrpc_server_finish_request(svcpt, req);
+               }
+
+               while (ptlrpc_server_request_pending(svcpt, true)) {
+                       req = ptlrpc_server_request_get(svcpt, true);
+                       ptlrpc_server_finish_active_request(svcpt, req);
+               }
+
+               LASSERT(list_empty(&svcpt->scp_rqbd_posted));
+               LASSERT(svcpt->scp_nreqs_incoming == 0);
+               LASSERT(svcpt->scp_nreqs_active == 0);
+               /* history should have been culled by
+                * ptlrpc_server_finish_request */
+               LASSERT(svcpt->scp_hist_nrqbds == 0);
+
+               /* Now free all the request buffers since nothing
+                * references them any more... */
+
+               while (!list_empty(&svcpt->scp_rqbd_idle)) {
+                       rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+                                             struct ptlrpc_request_buffer_desc,
+                                             rqbd_list);
+                       ptlrpc_free_rqbd(rqbd);
+               }
+               ptlrpc_wait_replies(svcpt);
+
+               while (!list_empty(&svcpt->scp_rep_idle)) {
+                       rs = list_entry(svcpt->scp_rep_idle.next,
+                                           struct ptlrpc_reply_state,
+                                           rs_list);
+                       list_del(&rs->rs_list);
+                       OBD_FREE_LARGE(rs, svc->srv_max_reply_size);
+               }
+       }
+}
+
+static void
+ptlrpc_service_free(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part      *svcpt;
+       struct ptlrpc_at_array          *array;
+       int                             i;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service == NULL)
+                       break;
+
+               /* In case somebody rearmed this in the meantime */
+               cfs_timer_disarm(&svcpt->scp_at_timer);
+               array = &svcpt->scp_at_array;
+
+               if (array->paa_reqs_array != NULL) {
+                       OBD_FREE(array->paa_reqs_array,
+                                sizeof(struct list_head) * array->paa_size);
+                       array->paa_reqs_array = NULL;
+               }
+
+               if (array->paa_reqs_count != NULL) {
+                       OBD_FREE(array->paa_reqs_count,
+                                sizeof(__u32) * array->paa_size);
+                       array->paa_reqs_count = NULL;
+               }
+       }
+
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               OBD_FREE_PTR(svcpt);
+
+       if (svc->srv_cpts != NULL)
+               cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts);
+
+       OBD_FREE(svc, offsetof(struct ptlrpc_service,
+                              srv_parts[svc->srv_ncpts]));
+}
+
+int ptlrpc_unregister_service(struct ptlrpc_service *service)
+{
+       ENTRY;
+
+       CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);
+
+       service->srv_is_stopping = 1;
+
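+       /* remove the service from the global list before tearing it down */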
+       mutex_lock(&ptlrpc_all_services_mutex);
+       list_del_init(&service->srv_list);
+       mutex_unlock(&ptlrpc_all_services_mutex);
+
+       ptlrpc_service_del_atimer(service);
+       ptlrpc_stop_all_threads(service);
+
+       ptlrpc_service_unlink_rqbd(service);
+       ptlrpc_service_purge_all(service);
+       ptlrpc_service_nrs_cleanup(service);
+
+       ptlrpc_lprocfs_unregister_service(service);
+
+       ptlrpc_service_free(service);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_service);
+
+/**
+ * Returns 0 if the service is healthy.
+ *
+ * Right now, it just checks to make sure that requests aren't languishing
+ * in the queue.  We'll use this health check to govern whether a node needs
+ * to be shot, so it's intentionally non-aggressive.
+ */
+int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_request           *request = NULL;
+       struct timeval                  right_now;
+       long                            timediff;
+
+       do_gettimeofday(&right_now);
+
+       spin_lock(&svcpt->scp_req_lock);
+       /* How long has the next entry been waiting? */
+       if (ptlrpc_server_high_pending(svcpt, true))
+               request = ptlrpc_nrs_req_peek_nolock(svcpt, true);
+       else if (ptlrpc_server_normal_pending(svcpt, true))
+               request = ptlrpc_nrs_req_peek_nolock(svcpt, false);
+
+       if (request == NULL) {
+               spin_unlock(&svcpt->scp_req_lock);
+               return 0;
+       }
+
+       timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
+       spin_unlock(&svcpt->scp_req_lock);
+
+       if ((timediff / ONE_MILLION) >
+           (AT_OFF ? obd_timeout * 3 / 2 : at_max)) {
+               CERROR("%s: unhealthy - request has been waiting %lds\n",
+                      svcpt->scp_service->srv_name, timediff / ONE_MILLION);
+               return -1;
+       }
+
+       return 0;
+}
+
+int
+ptlrpc_service_health_check(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part      *svcpt;
+       int                             i;
+
+       if (svc == NULL)
+               return 0;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               int rc = ptlrpc_svcpt_health_check(svcpt);
+
+               if (rc != 0)
+                       return rc;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_service_health_check);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c
new file mode 100644 (file)
index 0000000..93bc40b
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/fs.h>
+#  include <linux/posix_acl_xattr.h>
+# endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_disk.h>
diff --git a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c
new file mode 100644 (file)
index 0000000..9890bd9
--- /dev/null
@@ -0,0 +1,4474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/fs.h>
+#  include <linux/posix_acl_xattr.h>
+# endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_disk.h>
+void lustre_assert_wire_constants(void)
+{
+       /* Wire protocol assertions generated by 'wirecheck'
+        * (make -C lustre/utils newwiretest)
+        * running on Linux deva 2.6.32.279.lustre #5 SMP Tue Apr 9 22:52:17 CST 2013 x86_64 x86_64 x
+        * with gcc version 4.4.4 20100726 (Red Hat 4.4.4-13) (GCC)
+        */
+
+       /* Constants... */
+       LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n",
+                (long long)PTL_RPC_MSG_REQUEST);
+       LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n",
+                (long long)PTL_RPC_MSG_ERR);
+       LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n",
+                (long long)PTL_RPC_MSG_REPLY);
+       LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n",
+                MDS_DIR_END_OFF);
+       LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n",
+                DEAD_HANDLE_MAGIC);
+       CLASSERT(MTI_NAME_MAXLEN == 64);
+       LASSERTF(OST_REPLY == 0, "found %lld\n",
+                (long long)OST_REPLY);
+       LASSERTF(OST_GETATTR == 1, "found %lld\n",
+                (long long)OST_GETATTR);
+       LASSERTF(OST_SETATTR == 2, "found %lld\n",
+                (long long)OST_SETATTR);
+       LASSERTF(OST_READ == 3, "found %lld\n",
+                (long long)OST_READ);
+       LASSERTF(OST_WRITE == 4, "found %lld\n",
+                (long long)OST_WRITE);
+       LASSERTF(OST_CREATE == 5, "found %lld\n",
+                (long long)OST_CREATE);
+       LASSERTF(OST_DESTROY == 6, "found %lld\n",
+                (long long)OST_DESTROY);
+       LASSERTF(OST_GET_INFO == 7, "found %lld\n",
+                (long long)OST_GET_INFO);
+       LASSERTF(OST_CONNECT == 8, "found %lld\n",
+                (long long)OST_CONNECT);
+       LASSERTF(OST_DISCONNECT == 9, "found %lld\n",
+                (long long)OST_DISCONNECT);
+       LASSERTF(OST_PUNCH == 10, "found %lld\n",
+                (long long)OST_PUNCH);
+       LASSERTF(OST_OPEN == 11, "found %lld\n",
+                (long long)OST_OPEN);
+       LASSERTF(OST_CLOSE == 12, "found %lld\n",
+                (long long)OST_CLOSE);
+       LASSERTF(OST_STATFS == 13, "found %lld\n",
+                (long long)OST_STATFS);
+       LASSERTF(OST_SYNC == 16, "found %lld\n",
+                (long long)OST_SYNC);
+       LASSERTF(OST_SET_INFO == 17, "found %lld\n",
+                (long long)OST_SET_INFO);
+       LASSERTF(OST_QUOTACHECK == 18, "found %lld\n",
+                (long long)OST_QUOTACHECK);
+       LASSERTF(OST_QUOTACTL == 19, "found %lld\n",
+                (long long)OST_QUOTACTL);
+       LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n",
+                (long long)OST_QUOTA_ADJUST_QUNIT);
+       LASSERTF(OST_LAST_OPC == 21, "found %lld\n",
+                (long long)OST_LAST_OPC);
+       LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+                OBD_OBJECT_EOF);
+       LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n",
+                (long long)OST_MIN_PRECREATE);
+       LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n",
+                (long long)OST_MAX_PRECREATE);
+       LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n",
+                OST_LVB_ERR_INIT);
+       LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n",
+                OST_LVB_ERR_MASK);
+       LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n",
+                (long long)MDS_FIRST_OPC);
+       LASSERTF(MDS_GETATTR == 33, "found %lld\n",
+                (long long)MDS_GETATTR);
+       LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n",
+                (long long)MDS_GETATTR_NAME);
+       LASSERTF(MDS_CLOSE == 35, "found %lld\n",
+                (long long)MDS_CLOSE);
+       LASSERTF(MDS_REINT == 36, "found %lld\n",
+                (long long)MDS_REINT);
+       LASSERTF(MDS_READPAGE == 37, "found %lld\n",
+                (long long)MDS_READPAGE);
+       LASSERTF(MDS_CONNECT == 38, "found %lld\n",
+                (long long)MDS_CONNECT);
+       LASSERTF(MDS_DISCONNECT == 39, "found %lld\n",
+                (long long)MDS_DISCONNECT);
+       LASSERTF(MDS_GETSTATUS == 40, "found %lld\n",
+                (long long)MDS_GETSTATUS);
+       LASSERTF(MDS_STATFS == 41, "found %lld\n",
+                (long long)MDS_STATFS);
+       LASSERTF(MDS_PIN == 42, "found %lld\n",
+                (long long)MDS_PIN);
+       LASSERTF(MDS_UNPIN == 43, "found %lld\n",
+                (long long)MDS_UNPIN);
+       LASSERTF(MDS_SYNC == 44, "found %lld\n",
+                (long long)MDS_SYNC);
+       LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n",
+                (long long)MDS_DONE_WRITING);
+       LASSERTF(MDS_SET_INFO == 46, "found %lld\n",
+                (long long)MDS_SET_INFO);
+       LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n",
+                (long long)MDS_QUOTACHECK);
+       LASSERTF(MDS_QUOTACTL == 48, "found %lld\n",
+                (long long)MDS_QUOTACTL);
+       LASSERTF(MDS_GETXATTR == 49, "found %lld\n",
+                (long long)MDS_GETXATTR);
+       LASSERTF(MDS_SETXATTR == 50, "found %lld\n",
+                (long long)MDS_SETXATTR);
+       LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n",
+                (long long)MDS_WRITEPAGE);
+       LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n",
+                (long long)MDS_IS_SUBDIR);
+       LASSERTF(MDS_GET_INFO == 53, "found %lld\n",
+                (long long)MDS_GET_INFO);
+       LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n",
+                (long long)MDS_HSM_STATE_GET);
+       LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n",
+                (long long)MDS_HSM_STATE_SET);
+       LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n",
+                (long long)MDS_HSM_ACTION);
+       LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n",
+                (long long)MDS_HSM_PROGRESS);
+       LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n",
+                (long long)MDS_HSM_REQUEST);
+       LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n",
+                (long long)MDS_HSM_CT_REGISTER);
+       LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n",
+                (long long)MDS_HSM_CT_UNREGISTER);
+       LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n",
+                (long long)MDS_SWAP_LAYOUTS);
+       LASSERTF(MDS_LAST_OPC == 62, "found %lld\n",
+                (long long)MDS_LAST_OPC);
+       LASSERTF(REINT_SETATTR == 1, "found %lld\n",
+                (long long)REINT_SETATTR);
+       LASSERTF(REINT_CREATE == 2, "found %lld\n",
+                (long long)REINT_CREATE);
+       LASSERTF(REINT_LINK == 3, "found %lld\n",
+                (long long)REINT_LINK);
+       LASSERTF(REINT_UNLINK == 4, "found %lld\n",
+                (long long)REINT_UNLINK);
+       LASSERTF(REINT_RENAME == 5, "found %lld\n",
+                (long long)REINT_RENAME);
+       LASSERTF(REINT_OPEN == 6, "found %lld\n",
+                (long long)REINT_OPEN);
+       LASSERTF(REINT_SETXATTR == 7, "found %lld\n",
+                (long long)REINT_SETXATTR);
+       LASSERTF(REINT_RMENTRY == 8, "found %lld\n",
+                (long long)REINT_RMENTRY);
+       LASSERTF(REINT_MAX == 9, "found %lld\n",
+                (long long)REINT_MAX);
+       LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_IT_EXECD);
+       LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_LOOKUP_EXECD);
+       LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_LOOKUP_NEG);
+       LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_LOOKUP_POS);
+       LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_OPEN_CREATE);
+       LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_OPEN_OPEN);
+       LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_ENQ_COMPLETE);
+       LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_ENQ_OPEN_REF);
+       LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_ENQ_CREATE_REF);
+       LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_OPEN_LOCK);
+       LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n",
+                (long long)MDS_STATUS_CONN);
+       LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n",
+                (long long)MDS_STATUS_LOV);
+       LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n",
+                (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES);
+       LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)MF_SOM_CHANGE);
+       LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)MF_EPOCH_OPEN);
+       LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)MF_EPOCH_CLOSE);
+       LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n",
+               (unsigned)MF_MDC_CANCEL_FID1);
+       LASSERTF(MF_MDC_CANCEL_FID2 == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)MF_MDC_CANCEL_FID2);
+       LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n",
+               (unsigned)MF_MDC_CANCEL_FID3);
+       LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n",
+               (unsigned)MF_MDC_CANCEL_FID4);
+       LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n",
+               (unsigned)MF_SOM_AU);
+       LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n",
+               (unsigned)MF_GETATTR_LOCK);
+       LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_MODE);
+       LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_UID);
+       LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_GID);
+       LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_SIZE);
+       LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_ATIME);
+       LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_MTIME);
+       LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_CTIME);
+       LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_ATIME_SET);
+       LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_MTIME_SET);
+       LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_FORCE);
+       LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_ATTR_FLAG);
+       LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_KILL_SUID);
+       LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_KILL_SGID);
+       LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_CTIME_SET);
+       LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_FROM_OPEN);
+       LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_BLOCKS);
+       LASSERTF(FLD_QUERY == 900, "found %lld\n",
+                (long long)FLD_QUERY);
+       LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n",
+                (long long)FLD_FIRST_OPC);
+       LASSERTF(FLD_LAST_OPC == 901, "found %lld\n",
+                (long long)FLD_LAST_OPC);
+       LASSERTF(SEQ_QUERY == 700, "found %lld\n",
+                (long long)SEQ_QUERY);
+       LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n",
+                (long long)SEQ_FIRST_OPC);
+       LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n",
+                (long long)SEQ_LAST_OPC);
+       LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n",
+                (long long)SEQ_ALLOC_SUPER);
+       LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n",
+                (long long)SEQ_ALLOC_META);
+       LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n",
+                (long long)LDLM_ENQUEUE);
+       LASSERTF(LDLM_CONVERT == 102, "found %lld\n",
+                (long long)LDLM_CONVERT);
+       LASSERTF(LDLM_CANCEL == 103, "found %lld\n",
+                (long long)LDLM_CANCEL);
+       LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n",
+                (long long)LDLM_BL_CALLBACK);
+       LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n",
+                (long long)LDLM_CP_CALLBACK);
+       LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n",
+                (long long)LDLM_GL_CALLBACK);
+       LASSERTF(LDLM_SET_INFO == 107, "found %lld\n",
+                (long long)LDLM_SET_INFO);
+       LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n",
+                (long long)LDLM_LAST_OPC);
+       LASSERTF(LCK_MINMODE == 0, "found %lld\n",
+                (long long)LCK_MINMODE);
+       LASSERTF(LCK_EX == 1, "found %lld\n",
+                (long long)LCK_EX);
+       LASSERTF(LCK_PW == 2, "found %lld\n",
+                (long long)LCK_PW);
+       LASSERTF(LCK_PR == 4, "found %lld\n",
+                (long long)LCK_PR);
+       LASSERTF(LCK_CW == 8, "found %lld\n",
+                (long long)LCK_CW);
+       LASSERTF(LCK_CR == 16, "found %lld\n",
+                (long long)LCK_CR);
+       LASSERTF(LCK_NL == 32, "found %lld\n",
+                (long long)LCK_NL);
+       LASSERTF(LCK_GROUP == 64, "found %lld\n",
+                (long long)LCK_GROUP);
+       LASSERTF(LCK_COS == 128, "found %lld\n",
+                (long long)LCK_COS);
+       LASSERTF(LCK_MAXMODE == 129, "found %lld\n",
+                (long long)LCK_MAXMODE);
+       LASSERTF(LCK_MODE_NUM == 8, "found %lld\n",
+                (long long)LCK_MODE_NUM);
+       CLASSERT(LDLM_PLAIN == 10);
+       CLASSERT(LDLM_EXTENT == 11);
+       CLASSERT(LDLM_FLOCK == 12);
+       CLASSERT(LDLM_IBITS == 13);
+       CLASSERT(LDLM_MAX_TYPE == 14);
+       CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0);
+       CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1);
+       LASSERTF(UPDATE_OBJ == 1000, "found %lld\n",
+                (long long)UPDATE_OBJ);
+       LASSERTF(UPDATE_LAST_OPC == 1001, "found %lld\n",
+                (long long)UPDATE_LAST_OPC);
+       CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2);
+       CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3);
+       CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3);
+       CLASSERT(LQUOTA_TYPE_USR == 0);
+       CLASSERT(LQUOTA_TYPE_GRP == 1);
+       CLASSERT(LQUOTA_RES_MD == 1);
+       CLASSERT(LQUOTA_RES_DT == 2);
+       LASSERTF(OBD_PING == 400, "found %lld\n",
+                (long long)OBD_PING);
+       LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n",
+                (long long)OBD_LOG_CANCEL);
+       LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n",
+                (long long)OBD_QC_CALLBACK);
+       LASSERTF(OBD_IDX_READ == 403, "found %lld\n",
+                (long long)OBD_IDX_READ);
+       LASSERTF(OBD_LAST_OPC == 404, "found %lld\n",
+                (long long)OBD_LAST_OPC);
+       LASSERTF(QUOTA_DQACQ == 601, "found %lld\n",
+                (long long)QUOTA_DQACQ);
+       LASSERTF(QUOTA_DQREL == 602, "found %lld\n",
+                (long long)QUOTA_DQREL);
+       LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n",
+                (long long)QUOTA_LAST_OPC);
+       LASSERTF(MGS_CONNECT == 250, "found %lld\n",
+                (long long)MGS_CONNECT);
+       LASSERTF(MGS_DISCONNECT == 251, "found %lld\n",
+                (long long)MGS_DISCONNECT);
+       LASSERTF(MGS_EXCEPTION == 252, "found %lld\n",
+                (long long)MGS_EXCEPTION);
+       LASSERTF(MGS_TARGET_REG == 253, "found %lld\n",
+                (long long)MGS_TARGET_REG);
+       LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n",
+                (long long)MGS_TARGET_DEL);
+       LASSERTF(MGS_SET_INFO == 255, "found %lld\n",
+                (long long)MGS_SET_INFO);
+       LASSERTF(MGS_LAST_OPC == 257, "found %lld\n",
+                (long long)MGS_LAST_OPC);
+       LASSERTF(SEC_CTX_INIT == 801, "found %lld\n",
+                (long long)SEC_CTX_INIT);
+       LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n",
+                (long long)SEC_CTX_INIT_CONT);
+       LASSERTF(SEC_CTX_FINI == 803, "found %lld\n",
+                (long long)SEC_CTX_FINI);
+       LASSERTF(SEC_LAST_OPC == 804, "found %lld\n",
+                (long long)SEC_LAST_OPC);
+       /* Sizes and Offsets */
+
+       /* Checks for struct obd_uuid */
+       LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct obd_uuid));
+
+       /* Checks for struct lu_seq_range */
+       LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct lu_seq_range));
+       LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lu_seq_range, lsr_start));
+       LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start));
+       LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lu_seq_range, lsr_end));
+       LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end));
+       LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lu_seq_range, lsr_index));
+       LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index));
+       LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct lu_seq_range, lsr_flags));
+       LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags));
+       LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n",
+                (long long)LU_SEQ_RANGE_MDT);
+       LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n",
+                (long long)LU_SEQ_RANGE_OST);
+
+       /* Checks for struct lustre_mdt_attrs */
+       LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_mdt_attrs));
+       LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat));
+       LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat));
+       LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat));
+       LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat));
+       LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid));
+       LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid));
+       LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LMAI_RELEASED);
+       LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LMAC_HSM);
+       LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)LMAC_SOM);
+       LASSERTF(OBJ_CREATE == 1, "found %lld\n",
+                (long long)OBJ_CREATE);
+       LASSERTF(OBJ_DESTROY == 2, "found %lld\n",
+                (long long)OBJ_DESTROY);
+       LASSERTF(OBJ_REF_ADD == 3, "found %lld\n",
+                (long long)OBJ_REF_ADD);
+       LASSERTF(OBJ_REF_DEL == 4, "found %lld\n",
+                (long long)OBJ_REF_DEL);
+       LASSERTF(OBJ_ATTR_SET == 5, "found %lld\n",
+                (long long)OBJ_ATTR_SET);
+       LASSERTF(OBJ_ATTR_GET == 6, "found %lld\n",
+                (long long)OBJ_ATTR_GET);
+       LASSERTF(OBJ_XATTR_SET == 7, "found %lld\n",
+                (long long)OBJ_XATTR_SET);
+       LASSERTF(OBJ_XATTR_GET == 8, "found %lld\n",
+                (long long)OBJ_XATTR_GET);
+       LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n",
+                (long long)OBJ_INDEX_LOOKUP);
+       LASSERTF(OBJ_INDEX_INSERT == 10, "found %lld\n",
+                (long long)OBJ_INDEX_INSERT);
+       LASSERTF(OBJ_INDEX_DELETE == 11, "found %lld\n",
+                (long long)OBJ_INDEX_DELETE);
+
+       /* Checks for struct som_attrs */
+       LASSERTF((int)sizeof(struct som_attrs) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct som_attrs));
+       LASSERTF((int)offsetof(struct som_attrs, som_compat) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_compat));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_compat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_compat));
+       LASSERTF((int)offsetof(struct som_attrs, som_incompat) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_incompat));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_incompat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_incompat));
+       LASSERTF((int)offsetof(struct som_attrs, som_ioepoch) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_ioepoch));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_ioepoch));
+       LASSERTF((int)offsetof(struct som_attrs, som_size) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_size));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_size));
+       LASSERTF((int)offsetof(struct som_attrs, som_blocks) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_blocks));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_blocks));
+       LASSERTF((int)offsetof(struct som_attrs, som_mountid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_mountid));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_mountid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_mountid));
+
+       /* Checks for struct hsm_attrs */
+       LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_attrs));
+       LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_attrs, hsm_compat));
+       LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat));
+       LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_attrs, hsm_flags));
+       LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags));
+       LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id));
+       LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id));
+       LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver));
+       LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver));
+
+       /* Checks for struct ost_id */
+       LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct ost_id));
+       LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ost_id, oi));
+       LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_id *)0)->oi));
+       LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n",
+                (long long)LUSTRE_FID_INIT_OID);
+       LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n",
+                (long long)FID_SEQ_OST_MDT0);
+       LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n",
+                (long long)FID_SEQ_LLOG);
+       LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n",
+                (long long)FID_SEQ_ECHO);
+       LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n",
+                (long long)FID_SEQ_OST_MDT1);
+       LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n",
+                (long long)FID_SEQ_OST_MAX);
+       LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n",
+                (long long)FID_SEQ_RSVD);
+       LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n",
+                (long long)FID_SEQ_IGIF);
+       LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_IGIF_MAX);
+       LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_IDIF);
+       LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_IDIF_MAX);
+       LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_START);
+       LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_LOCAL_FILE);
+       LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_DOT_LUSTRE);
+       LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_SPECIAL);
+       LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_QUOTA);
+       LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_QUOTA_GLB);
+       LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_ROOT);
+       LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_NORMAL);
+       LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_LOV_DEFAULT);
+       LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)FID_OID_SPECIAL_BFL);
+       LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)FID_OID_DOT_LUSTRE);
+       LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)FID_OID_DOT_LUSTRE_OBF);
+
+       /* Checks for struct lu_dirent */
+       LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lu_dirent));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_fid));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_hash));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_reclen));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_namelen));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_attrs));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_name[0]));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0]));
+       LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LUDA_FID);
+       LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)LUDA_TYPE);
+       LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)LUDA_64BITHASH);
+
+       /* Checks for struct luda_type */
+       LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n",
+                (long long)(int)sizeof(struct luda_type));
+       LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct luda_type, lt_type));
+       LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct luda_type *)0)->lt_type));
+
+       /* Checks for struct lu_dirpage */
+       LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct lu_dirpage));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_flags));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_pad0));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0]));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]));
+       LASSERTF(LDF_EMPTY == 1, "found %lld\n",
+                (long long)LDF_EMPTY);
+       LASSERTF(LDF_COLLIDE == 2, "found %lld\n",
+                (long long)LDF_COLLIDE);
+       LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n",
+                (long long)LU_PAGE_SIZE);
+       /* Checks for union lu_page */
+       LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n",
+                (long long)(int)sizeof(union lu_page));
+
+       /* Checks for struct lustre_handle */
+       LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_handle));
+       LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_handle, cookie));
+       LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_handle *)0)->cookie));
+
+       /* Checks for struct lustre_msg_v2 */
+       LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_msg_v2));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_magic));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_flags));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0]));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]));
+       LASSERTF(LUSTRE_MSG_MAGIC_V1 == 0x0BD00BD0, "found 0x%.8x\n",
+               LUSTRE_MSG_MAGIC_V1);
+       LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n",
+               LUSTRE_MSG_MAGIC_V2);
+       LASSERTF(LUSTRE_MSG_MAGIC_V1_SWABBED == 0xD00BD00B, "found 0x%.8x\n",
+               LUSTRE_MSG_MAGIC_V1_SWABBED);
+       LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n",
+               LUSTRE_MSG_MAGIC_V2_SWABBED);
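+
+       /*
+        * Editor's illustrative sketch, not part of the generated output:
+        * pinning both the native and the byte-swapped magic values lets a
+        * receiver detect a peer of opposite endianness with a plain
+        * comparison, roughly:
+        *
+        *      swab_needed = (msg->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED);
+        *
+        * since swapping the four bytes of 0x0BD00BD3 yields 0xD30BD00B.
+        */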
+
+       /* Checks for struct ptlrpc_body_v3 (layout shared with ptlrpc_body_v2) */
+       LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n",
+                (long long)(int)sizeof(struct ptlrpc_body_v3));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_seen));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv));
+       CLASSERT(PTLRPC_NUM_VERSIONS == 4);
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding));
+       CLASSERT(JOBSTATS_JOBID_SIZE == 32);
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == (int)offsetof(struct ptlrpc_body_v2, pb_last_seen), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_last_seen), (int)offsetof(struct ptlrpc_body_v2, pb_last_seen));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == (int)offsetof(struct ptlrpc_body_v2, pb_padding), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_padding), (int)offsetof(struct ptlrpc_body_v2, pb_padding));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding));
+       LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n",
+                (long long)MSG_PTLRPC_BODY_OFF);
+       LASSERTF(REQ_REC_OFF == 1, "found %lld\n",
+                (long long)REQ_REC_OFF);
+       LASSERTF(REPLY_REC_OFF == 1, "found %lld\n",
+                (long long)REPLY_REC_OFF);
+       LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n",
+                (long long)DLM_LOCKREQ_OFF);
+       LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n",
+                (long long)DLM_REQ_REC_OFF);
+       LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n",
+                (long long)DLM_INTENT_IT_OFF);
+       LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n",
+                (long long)DLM_INTENT_REC_OFF);
+       LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n",
+                (long long)DLM_LOCKREPLY_OFF);
+       LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n",
+                (long long)DLM_REPLY_REC_OFF);
+       LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n",
+                (long long)MSG_PTLRPC_HEADER_OFF);
+       LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n",
+               PTLRPC_MSG_VERSION);
+       LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n",
+               LUSTRE_VERSION_MASK);
+       LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n",
+               LUSTRE_OBD_VERSION);
+       LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n",
+               LUSTRE_MDS_VERSION);
+       LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n",
+               LUSTRE_OST_VERSION);
+       LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n",
+               LUSTRE_DLM_VERSION);
+       LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n",
+               LUSTRE_LOG_VERSION);
+       LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n",
+               LUSTRE_MGS_VERSION);
+       LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n",
+                (long long)MSGHDR_AT_SUPPORT);
+       LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n",
+                (long long)MSGHDR_CKSUM_INCOMPAT18);
+       LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_OP_FLAG_MASK);
+       LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n",
+                (long long)MSG_OP_FLAG_SHIFT);
+       LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n",
+               (unsigned)MSG_GEN_FLAG_MASK);
+       LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_LAST_REPLAY);
+       LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_RESENT);
+       LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_REPLAY);
+       LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_DELAY_REPLAY);
+       LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_VERSION_REPLAY);
+       LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_REQ_REPLAY_DONE);
+       LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_LOCK_REPLAY_DONE);
+       LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_RECOVERING);
+       LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_RECONNECT);
+       LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_REPLAYABLE);
+       LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_LIBCLIENT);
+       LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_INITIAL);
+       LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_ASYNC);
+       LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_NEXT_VER);
+       LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_TRANSNO);
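+
+       /*
+        * Editor's illustrative sketch, not part of the generated output:
+        * MSG_GEN_FLAG_MASK and MSG_OP_FLAG_MASK/MSG_OP_FLAG_SHIFT, pinned
+        * above, split one 32-bit flags word into a generic low half and an
+        * operation-specific high half, so extraction is just:
+        *
+        *      gen_flags = flags & MSG_GEN_FLAG_MASK;
+        *      op_flags  = (flags & MSG_OP_FLAG_MASK) >> MSG_OP_FLAG_SHIFT;
+        *
+        * Freezing the masks and the individual MSG_* bit values keeps both
+        * halves interpretable by peers running different releases.
+        */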
+
+       /* Checks for struct obd_connect_data */
+       LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n",
+                (long long)(int)sizeof(struct obd_connect_data));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_version));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_grant));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_index));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_unused));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_transno));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_group));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_instance));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding1));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding2) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding2));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding3));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding4));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding5));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding6));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding7));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding8));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding9));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingA));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingB));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingC));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingD));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingE));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingF));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF));
+       LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_RDONLY);
+       LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_INDEX);
+       LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MDS);
+       LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_GRANT);
+       LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_SRVLOCK);
+       LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_VERSION);
+       LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_REQPORTAL);
+       LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_ACL);
+       LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_XATTR);
+       LASSERTF(OBD_CONNECT_CROW == 0x200ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_CROW);
+       LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_TRUNCLOCK);
+       LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_TRANSNO);
+       LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_IBITS);
+       LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_JOIN);
+       LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_ATTRFID);
+       LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_NODEVOH);
+       LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_RMT_CLIENT);
+       LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_RMT_CLIENT_FORCE);
+       LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_BRW_SIZE);
+       LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_QUOTA64);
+       LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MDS_CAPA);
+       LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_OSS_CAPA);
+       LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_CANCELSET);
+       LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_SOM);
+       LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_AT);
+       LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LRU_RESIZE);
+       LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MDS_MDS);
+       LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_REAL);
+       LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_CHANGE_QS);
+       LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_CKSUM);
+       LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_FID);
+       LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_VBR);
+       LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LOV_V3);
+       LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_GRANT_SHRINK);
+       LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_SKIP_ORPHAN);
+       LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MAX_EASIZE);
+       LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_FULL20);
+       LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LAYOUTLOCK);
+       LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_64BITHASH);
+       LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MAXBYTES);
+       LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_IMP_RECOV);
+       LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_JOBSTATS);
+       LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_UMASK);
+       LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_EINPROGRESS);
+       LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_GRANT_PARAM);
+       LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_FLOCK_OWNER);
+       LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LVB_TYPE);
+       LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_NANOSEC_TIME);
+       LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LIGHTWEIGHT);
+       LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_SHORTIO);
+       LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_PINGLESS);
+       LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)OBD_CKSUM_CRC32);
+       LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)OBD_CKSUM_ADLER);
+       LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)OBD_CKSUM_CRC32C);
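+
+       /*
+        * Editor's illustrative sketch, not part of the generated output:
+        * ocd_connect_flags is the 64-bit feature bitmap exchanged at
+        * connect time, so a capability test is a plain bit check against
+        * the OBD_CONNECT_* values pinned above, e.g.:
+        *
+        *      use_64bit_hash = !!(ocd->ocd_connect_flags &
+        *                          OBD_CONNECT_64BITHASH);
+        *
+        * Fixing every flag's numeric value (and the OBD_CKSUM_* checksum
+        * types) keeps old and new nodes agreeing on what each bit means.
+        */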
+
+       /* Checks for struct obdo */
+       LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n",
+                (long long)(int)sizeof(struct obdo));
+       LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_valid));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_valid));
+       LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_oi));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_oi));
+       LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_parent_seq));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq));
+       LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_size));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_size));
+       LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_mtime));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_mtime));
+       LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_atime));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+       LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_ctime));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_ctime));
+       LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_blocks));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_blocks));
+       LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_grant));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_grant));
+       LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_blksize));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_blksize));
+       LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_mode));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_mode));
+       LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_uid));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_uid));
+       LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_gid));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_gid));
+       LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_flags));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_flags));
+       LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_nlink));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_nlink));
+       LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_parent_oid));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid));
+       LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_misc));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_misc));
+       LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_ioepoch));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch));
+       LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_stripe_idx));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx));
+       LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_parent_ver));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver));
+       LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_handle));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_handle));
+       LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_lcookie));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_lcookie));
+       LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_uid_h));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_uid_h));
+       LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_gid_h));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_gid_h));
+       LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_data_version));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_data_version));
+       LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_padding_4));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_padding_4));
+       LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_padding_5));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_padding_5));
+       LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_padding_6));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_padding_6));
+       LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLID);
+       LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLATIME);
+       LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLMTIME);
+       LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCTIME);
+       LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLSIZE);
+       LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLBLOCKS);
+       LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLBLKSZ);
+       LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLMODE);
+       LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLTYPE);
+       LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLUID);
+       LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGID);
+       LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLFLAGS);
+       LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLNLINK);
+       LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGENER);
+       LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRDEV);
+       LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLEASIZE);
+       LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_LINKNAME);
+       LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLHANDLE);
+       LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCKSUM);
+       LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLQOS);
+       LASSERTF(OBD_MD_FLCOOKIE == (0x00800000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCOOKIE);
+       LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGROUP);
+       LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLFID);
+       LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLEPOCH);
+       LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGRANT);
+       LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLDIREA);
+       LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLUSRQUOTA);
+       LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGRPQUOTA);
+       LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLMODEASIZE);
+       LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_MDS);
+       LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_REINT);
+       LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_MEA);
+       LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLXATTR);
+       LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLXATTRLS);
+       LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLXATTRRM);
+       LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLACL);
+       LASSERTF(OBD_MD_FLRMTPERM == (0x0000010000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTPERM);
+       LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLMDSCAPA);
+       LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLOSSCAPA);
+       LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCKSPLIT);
+       LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCROSSREF);
+       LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGETATTRLOCK);
+       LASSERTF(OBD_MD_FLRMTLSETFACL == (0x0001000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTLSETFACL);
+       LASSERTF(OBD_MD_FLRMTLGETFACL == (0x0002000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTLGETFACL);
+       LASSERTF(OBD_MD_FLRMTRSETFACL == (0x0004000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTRSETFACL);
+       LASSERTF(OBD_MD_FLRMTRGETFACL == (0x0008000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTRGETFACL);
+       LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLDATAVERSION);
+       CLASSERT(OBD_FL_INLINEDATA == 0x00000001);
+       CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002);
+       CLASSERT(OBD_FL_DELORPHAN == 0x00000004);
+       CLASSERT(OBD_FL_NORPC == 0x00000008);
+       CLASSERT(OBD_FL_IDONLY == 0x00000010);
+       CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020);
+       CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040);
+       CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100);
+       CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200);
+       CLASSERT(OBD_FL_CREATE_CROW == 0x00000400);
+       CLASSERT(OBD_FL_SRVLOCK == 0x00000800);
+       CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000);
+       CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000);
+       CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000);
+       CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000);
+       CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000);
+       CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000);
+       CLASSERT(OBD_FL_MMAP == 0x00040000);
+       CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000);
+       CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000);
+       CLASSERT(OBD_FL_LOCAL_MASK == 0xf0000000);
+
+       /* Checks for struct lov_ost_data_v1 */
+       LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct lov_ost_data_v1));
+       LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi));
+       LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi));
+       LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+       LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+       LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+       LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
+
+       /* Checks for struct lov_mds_md_v1 */
+       LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lov_mds_md_v1));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0]));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]));
+       CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0);
+
+       /* Checks for struct lov_mds_md_v3 */
+       LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n",
+                (long long)(int)sizeof(struct lov_mds_md_v3));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen));
+       CLASSERT(LOV_MAXPOOLNAME == 16);
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0]));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]));
+       CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0);
+       LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LOV_PATTERN_RAID0);
+       LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)LOV_PATTERN_RAID1);
+       LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n",
+               (unsigned)LOV_PATTERN_FIRST);
+       LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n",
+               (unsigned)LOV_PATTERN_CMOBD);
+
+       /* Checks for struct obd_statfs */
+       LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n",
+                (long long)(int)sizeof(struct obd_statfs));
+       LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_type));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_type));
+       LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_blocks));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks));
+       LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_bfree));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree));
+       LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_bavail));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail));
+       LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_ffree));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree));
+       LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_fsid));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid));
+       LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_bsize));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize));
+       LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_namelen));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen));
+       LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_state));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
+       LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_fprecreated));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare2));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare3));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare4));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare5));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare6));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare7));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare8));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare9));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9));
+
+       /* Checks for struct obd_ioobj */
+       LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct obd_ioobj));
+       LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_ioobj, ioo_oid));
+       LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid));
+       LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw));
+       LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw));
+       LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt));
+       LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt));
+
+       /* Checks for union lquota_id */
+       LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n",
+                (long long)(int)sizeof(union lquota_id));
+
+       LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n",
+                (long long)QUOTABLOCK_BITS);
+       LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n",
+                (long long)QUOTABLOCK_SIZE);
+
+       /* Checks for struct obd_quotactl */
+       LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n",
+                (long long)(int)sizeof(struct obd_quotactl));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_cmd));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_type));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_id));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_stat));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_dqblk));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk));
+
+       /* Checks for struct obd_dqinfo */
+       LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct obd_dqinfo));
+       LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace));
+       LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace));
+       LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace));
+       LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace));
+       LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqinfo, dqi_flags));
+       LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags));
+       LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqinfo, dqi_valid));
+       LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid));
+
+       /* Checks for struct obd_dqblk */
+       LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n",
+                (long long)(int)sizeof(struct obd_dqblk));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_curspace));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_btime));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_itime));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_valid));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_padding));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding));
+       LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n",
+               Q_QUOTACHECK);
+       LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n",
+               Q_INITQUOTA);
+       LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n",
+               Q_GETOINFO);
+       LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n",
+               Q_GETOQUOTA);
+       LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n",
+               Q_FINVALIDATE);
+
+       /* Checks for struct lquota_acct_rec */
+       LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct lquota_acct_rec));
+       LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_acct_rec, bspace));
+       LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace));
+       LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_acct_rec, ispace));
+       LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace));
+
+       /* Checks for struct lquota_glb_rec */
+       LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lquota_glb_rec));
+       LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit));
+       LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit));
+       LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit));
+       LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit));
+       LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_glb_rec, qbr_time));
+       LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time));
+       LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted));
+       LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted));
+
+       /* Checks for struct lquota_slv_rec */
+       LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct lquota_slv_rec));
+       LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted));
+       LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted));
+
+       /* Checks for struct idx_info */
+       LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n",
+                (long long)(int)sizeof(struct idx_info));
+       LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_magic));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_magic));
+       LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_flags));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_flags));
+       LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_count));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_count));
+       LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_pad0));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0));
+       LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_attrs));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs));
+       LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_fid));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_fid));
+       LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_version));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_version));
+       LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_hash_start));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start));
+       LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_hash_end));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end));
+       LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_keysize));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize));
+       LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_recsize));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize));
+       LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_pad1));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1));
+       LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_pad2));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2));
+       LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_pad3));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3));
+       CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37);
+
+       /* Checks for struct lu_idxpage */
+       LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct lu_idxpage));
+       LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lu_idxpage, lip_magic));
+       LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic));
+       LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lu_idxpage, lip_flags));
+       LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags));
+       LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n",
+                (long long)(int)offsetof(struct lu_idxpage, lip_nr));
+       LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr));
+       LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lu_idxpage, lip_pad0));
+       LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0));
+       CLASSERT(LIP_MAGIC == 0x8A6D6B6C);
+       LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n",
+                (long long)LIP_HDR_SIZE);
+       LASSERTF(II_FL_NOHASH == 1, "found %lld\n",
+                (long long)II_FL_NOHASH);
+       LASSERTF(II_FL_VARKEY == 2, "found %lld\n",
+                (long long)II_FL_VARKEY);
+       LASSERTF(II_FL_VARREC == 4, "found %lld\n",
+                (long long)II_FL_VARREC);
+       LASSERTF(II_FL_NONUNQ == 8, "found %lld\n",
+                (long long)II_FL_NONUNQ);
+
+       /* Checks for struct niobuf_remote */
+       LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct niobuf_remote));
+       LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct niobuf_remote, offset));
+       LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct niobuf_remote *)0)->offset));
+       LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct niobuf_remote, len));
+       LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct niobuf_remote *)0)->len));
+       LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct niobuf_remote, flags));
+       LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct niobuf_remote *)0)->flags));
+       LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n",
+               OBD_BRW_READ);
+       LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n",
+               OBD_BRW_WRITE);
+       LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n",
+               OBD_BRW_SYNC);
+       LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n",
+               OBD_BRW_CHECK);
+       LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n",
+               OBD_BRW_FROM_GRANT);
+       LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n",
+               OBD_BRW_GRANTED);
+       LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n",
+               OBD_BRW_NOCACHE);
+       LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n",
+               OBD_BRW_NOQUOTA);
+       LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n",
+               OBD_BRW_SRVLOCK);
+       LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n",
+               OBD_BRW_ASYNC);
+       LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n",
+               OBD_BRW_MEMALLOC);
+
+       /* Checks for struct ost_body */
+       LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",
+                (long long)(int)sizeof(struct ost_body));
+       LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ost_body, oa));
+       LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_body *)0)->oa));
+
+       /* Checks for struct ll_fid */
+       LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct ll_fid));
+       LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fid, id));
+       LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fid *)0)->id));
+       LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fid, generation));
+       LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fid *)0)->generation));
+       LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fid, f_type));
+       LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fid *)0)->f_type));
+
+       /* Checks for struct mdt_body */
+       LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_body));
+       LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, fid1));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->fid1));
+       LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, fid2));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->fid2));
+       LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, handle));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->handle));
+       LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, valid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->valid));
+       LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, size));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->size));
+       LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, mtime));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->mtime));
+       LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, atime));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->atime));
+       LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, ctime));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->ctime));
+       LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, blocks));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->blocks));
+       LASSERTF((int)offsetof(struct mdt_body, unused1) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, unused1));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->unused1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->unused1));
+       LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, fsuid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->fsuid));
+       LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, fsgid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->fsgid));
+       LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, capability));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->capability));
+       LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, mode));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->mode));
+       LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, uid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->uid));
+       LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, gid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->gid));
+       LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, flags));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->flags));
+       LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, rdev));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->rdev));
+       LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, nlink));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->nlink));
+       LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, unused2));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->unused2));
+       LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, suppgid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->suppgid));
+       LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, eadatasize));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize));
+       LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, aclsize));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->aclsize));
+       LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, max_mdsize));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize));
+       LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, max_cookiesize));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize));
+       LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, uid_h));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->uid_h));
+       LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, gid_h));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->gid_h));
+       LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_5));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_5));
+       LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_6));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_6));
+       LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_7));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_7));
+       LASSERTF((int)offsetof(struct mdt_body, padding_8) == 192, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_8));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_8));
+       LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_9));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_9));
+       LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_10));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_10));
+       LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n",
+               MDS_FMODE_CLOSED);
+       LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n",
+               MDS_FMODE_EXEC);
+       LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n",
+               MDS_FMODE_EPOCH);
+       LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n",
+               MDS_FMODE_TRUNC);
+       LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n",
+               MDS_FMODE_SOM);
+       LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n",
+               MDS_OPEN_CREATED);
+       LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n",
+               MDS_OPEN_CROSS);
+       LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n",
+               MDS_OPEN_CREAT);
+       LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n",
+               MDS_OPEN_EXCL);
+       LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n",
+               MDS_OPEN_TRUNC);
+       LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n",
+               MDS_OPEN_APPEND);
+       LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n",
+               MDS_OPEN_SYNC);
+       LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n",
+               MDS_OPEN_DIRECTORY);
+       LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_BY_FID);
+       LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_DELAY_CREATE);
+       LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_OWNEROVERRIDE);
+       LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_JOIN_FILE);
+       LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_LOCK);
+       LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_HAS_EA);
+       LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_HAS_OBJS);
+       LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n",
+                       (long long)MDS_OPEN_NORESTORE);
+       LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n",
+                       (long long)MDS_OPEN_NEWSTRIPE);
+       LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n",
+                       (long long)MDS_OPEN_VOLATILE);
+       LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n",
+               LUSTRE_SYNC_FL);
+       LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n",
+               LUSTRE_IMMUTABLE_FL);
+       LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n",
+               LUSTRE_APPEND_FL);
+       LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n",
+               LUSTRE_NOATIME_FL);
+       LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n",
+               LUSTRE_DIRSYNC_FL);
+       LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n",
+               MDS_INODELOCK_LOOKUP);
+       LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n",
+               MDS_INODELOCK_UPDATE);
+       LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n",
+               MDS_INODELOCK_OPEN);
+       LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n",
+               MDS_INODELOCK_LAYOUT);
+
+       /* Checks for struct mdt_ioepoch */
+       LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_ioepoch));
+       LASSERTF((int)offsetof(struct mdt_ioepoch, handle) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_ioepoch, handle));
+       LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_ioepoch *)0)->handle));
+       LASSERTF((int)offsetof(struct mdt_ioepoch, ioepoch) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_ioepoch, ioepoch));
+       LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_ioepoch *)0)->ioepoch));
+       LASSERTF((int)offsetof(struct mdt_ioepoch, flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_ioepoch, flags));
+       LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_ioepoch *)0)->flags));
+       LASSERTF((int)offsetof(struct mdt_ioepoch, padding) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_ioepoch, padding));
+       LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_ioepoch *)0)->padding));
+
+       /* Checks for struct mdt_remote_perm */
+       LASSERTF((int)sizeof(struct mdt_remote_perm) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_remote_perm));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_uid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_uid));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_uid));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_gid) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_gid));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_gid));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_access_perm) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_access_perm));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_padding) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_padding));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_padding));
+       LASSERTF(CFS_SETUID_PERM == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_SETUID_PERM);
+       LASSERTF(CFS_SETGID_PERM == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_SETGID_PERM);
+       LASSERTF(CFS_SETGRP_PERM == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_SETGRP_PERM);
+       LASSERTF(CFS_RMTACL_PERM == 0x00000008UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_RMTACL_PERM);
+       LASSERTF(CFS_RMTOWN_PERM == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_RMTOWN_PERM);
+
+       /* Checks for struct mdt_rec_setattr */
+       LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_setattr));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_size));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5));
+
+       /* Checks for struct mdt_rec_create */
+       LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_create));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_rdev));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_umask));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4));
+
+       /* Checks for struct mdt_rec_link */
+       LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_link));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9));
+
+       /* Checks for struct mdt_rec_unlink */
+       LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_unlink));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9));
+
+       /* Checks for struct mdt_rec_rename */
+       LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_rename));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8));
+
+       /* Checks for struct mdt_rec_setxattr */
+       LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_setxattr));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
+
+       /* Checks for struct mdt_rec_reint */
+       LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_reint));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_atime));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_size));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_flags));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_umask));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4));
+
+       /* Checks for struct lmv_desc */
+       LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n",
+                (long long)(int)sizeof(struct lmv_desc));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_tgt_count));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_pattern));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_padding_1));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_padding_2));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_padding_3));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_padding_4));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_uuid));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid));
+
+       /* Checks for struct lmv_stripe_md */
+       LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lmv_stripe_md));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_magic));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_count));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_master));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_padding));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding));
+       CLASSERT(LOV_MAXPOOLNAME == 16);
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16]));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0]));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]));
+
+       /* Checks for struct lov_desc */
+       LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n",
+                (long long)(int)sizeof(struct lov_desc));
+       LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_tgt_count));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count));
+       LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count));
+       LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count));
+       LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_pattern));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern));
+       LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size));
+       LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+       LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_padding_0));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0));
+       LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
+       LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_padding_1));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
+       LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_padding_2));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
+       LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_uuid));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid));
+       CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C);
+
+       /* Checks for struct ldlm_res_id */
+       LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_res_id));
+       CLASSERT(RES_NAME_SIZE == 4);
+       LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_res_id, name[4]));
+       LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4]));
+
+       /* Checks for struct ldlm_extent */
+       LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_extent));
+       LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_extent, start));
+       LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_extent *)0)->start));
+       LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_extent, end));
+       LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_extent *)0)->end));
+       LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_extent, gid));
+       LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_extent *)0)->gid));
+
+       /* Checks for struct ldlm_inodebits */
+       LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_inodebits));
+       LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_inodebits, bits));
+       LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
+
+       /* Checks for struct ldlm_flock_wire */
+       LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_flock_wire));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid));
+
+       /* Checks for struct ldlm_intent */
+       LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_intent));
+       LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_intent, opc));
+       LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_intent *)0)->opc));
+
+       /* Checks for struct ldlm_resource_desc */
+       LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_resource_desc));
+       LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
+       LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
+       LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding));
+       LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding));
+       LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
+       LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name));
+
+       /* Checks for struct ldlm_lock_desc */
+       LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_lock_desc));
+       LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_lock_desc, l_resource));
+       LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource));
+       LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode));
+       LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode));
+       LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode));
+       LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode));
+       LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data));
+       LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data));
+
+       /* Checks for struct ldlm_request */
+       LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_request));
+       LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_request, lock_flags));
+       LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
+       LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_request, lock_count));
+       LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count));
+       LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_request, lock_desc));
+       LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc));
+       LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_request, lock_handle));
+       LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle));
+
+       /* Checks for struct ldlm_reply */
+       LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_reply));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_flags));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_padding));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_desc));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_handle));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2));
+
+       /* Checks for struct ost_lvb_v1 */
+       LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct ost_lvb_v1));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_size));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks));
+
+       /* Checks for struct ost_lvb */
+       LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n",
+                (long long)(int)sizeof(struct ost_lvb));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_size));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_mtime));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_atime));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_ctime));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_blocks));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_padding));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding));
+
+       /* Checks for struct lquota_lvb */
+       LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct lquota_lvb));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_flags));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_pad1));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1));
+       LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n",
+                (long long)LQUOTA_FL_EDQUOT);
+
+       /* Checks for struct ldlm_gl_lquota_desc */
+       LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_gl_lquota_desc));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2));
+
+       /* Checks for struct mgs_send_param */
+       LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n",
+                (long long)(int)sizeof(struct mgs_send_param));
+       CLASSERT(MGS_PARAM_MAXLEN == 1024);
+       LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024]));
+       LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]));
+
+       /* Checks for struct cfg_marker */
+       LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n",
+                (long long)(int)sizeof(struct cfg_marker));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_step));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_flags));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_vers));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_padding));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_comment));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
+       /* Checks for struct llog_logid */
+       LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n",
+                (long long)(int)sizeof(struct llog_logid));
+       LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid, lgl_oi));
+       LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi));
+       LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid, lgl_ogen));
+       LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
+       CLASSERT(OST_SZ_REC == 274730752);
+       CLASSERT(MDS_UNLINK_REC == 274801668);
+       CLASSERT(MDS_UNLINK64_REC == 275325956);
+       CLASSERT(MDS_SETATTR64_REC == 275325953);
+       CLASSERT(OBD_CFG_REC == 274857984);
+       CLASSERT(LLOG_GEN_REC == 274989056);
+       CLASSERT(CHANGELOG_REC == 275120128);
+       CLASSERT(CHANGELOG_USER_REC == 275185664);
+       CLASSERT(LLOG_HDR_MAGIC == 275010873);
+       CLASSERT(LLOG_LOGID_MAGIC == 275010875);
+
+       /* Checks for struct llog_catid */
+       LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct llog_catid));
+       LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_catid, lci_logid));
+       LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
+       LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct llog_catid, lci_padding1));
+       LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
+       LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_catid, lci_padding2));
+       LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
+       LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llog_catid, lci_padding3));
+       LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
+
+       /* Checks for struct llog_rec_hdr */
+       LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct llog_rec_hdr));
+       LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_hdr, lrh_len));
+       LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len));
+       LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_hdr, lrh_index));
+       LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index));
+       LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
+       LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
+       LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_hdr, lrh_id));
+       LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id));
+
+       /* Checks for struct llog_rec_tail */
+       LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct llog_rec_tail));
+       LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_tail, lrt_len));
+       LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len));
+       LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_tail, lrt_index));
+       LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index));
+
+       /* Checks for struct llog_logid_rec */
+       LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_logid_rec));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_hdr));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_id));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_padding1));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_padding2));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_padding3));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail));
+
+       /* Checks for struct llog_unlink_rec */
+       LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct llog_unlink_rec));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
+
+       /* Checks for struct llog_unlink64_rec */
+       LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_unlink64_rec));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_count));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail));
+
+       /* Checks for struct llog_setattr64_rec */
+       LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_setattr64_rec));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+
+       /* Checks for struct llog_size_change_rec */
+       LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_size_change_rec));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail));
+
+       /* Checks for struct changelog_rec */
+       LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct changelog_rec));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_namelen));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_flags));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_type));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_index));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_prev));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_time));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_tfid));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_pfid));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid));
+
+       /* Checks for struct changelog_ext_rec */
+       LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n",
+                (long long)(int)sizeof(struct changelog_ext_rec));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_namelen) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_flags));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_type));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_index));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_prev));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_time));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid));
+
+       /* Checks for struct changelog_setinfo */
+       LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n",
+                (long long)(int)sizeof(struct changelog_setinfo));
+       LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_setinfo, cs_recno));
+       LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno));
+       LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_setinfo, cs_id));
+       LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id));
+
+       /* Checks for struct llog_changelog_rec */
+       LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n",
+                (long long)(int)sizeof(struct llog_changelog_rec));
+       LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr));
+       LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr));
+       LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_rec, cr));
+       LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr));
+       LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_rec, cr_tail));
+       LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail));
+
+       /* Checks for struct llog_changelog_user_rec */
+       LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct llog_changelog_user_rec));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail));
+
+       /* Checks for struct llog_gen */
+       LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct llog_gen));
+       LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen, mnt_cnt));
+       LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt));
+       LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen, conn_cnt));
+       LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt));
+
+       /* Checks for struct llog_gen_rec */
+       LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_gen_rec));
+       LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr));
+       LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr));
+       LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen_rec, lgr_gen));
+       LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen));
+       LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen_rec, lgr_tail));
+       LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail));
+
+       /* Checks for struct llog_log_hdr */
+       LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n",
+                (long long)(int)sizeof(struct llog_log_hdr));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_hdr));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_count));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_size));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_flags));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_reserved));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_tail));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail));
+
+       /* Checks for struct llog_cookie */
+       LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct llog_cookie));
+       LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_cookie, lgc_lgl));
+       LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl));
+       LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct llog_cookie, lgc_subsys));
+       LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys));
+       LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_cookie, lgc_index));
+       LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index));
+       LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llog_cookie, lgc_padding));
+       LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding));
+
+       /* Checks for struct llogd_body */
+       LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n",
+                (long long)(int)sizeof(struct llogd_body));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_logid));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_llh_flags));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_index));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_saved_index));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_len));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_cur_offset));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset));
+       CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501);
+       CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502);
+       CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503);
+       CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504);
+       CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505);
+       CLASSERT(LLOG_ORIGIN_CONNECT == 506);
+       CLASSERT(LLOG_CATINFO == 507);
+       CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508);
+       CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509);
+       CLASSERT(LLOG_FIRST_OPC == 501);
+       CLASSERT(LLOG_LAST_OPC == 510);
+
+       /* Checks for struct llogd_conn_body */
+       LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct llogd_conn_body));
+       LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen));
+       LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen));
+       LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid));
+       LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid));
+       LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx));
+       LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
+
+       /* Checks for struct ll_fiemap_info_key */
+       LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n",
+                (long long)(int)sizeof(struct ll_fiemap_info_key));
+       LASSERTF((int)offsetof(struct ll_fiemap_info_key, name[8]) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_info_key, name[8]));
+       LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]));
+       LASSERTF((int)offsetof(struct ll_fiemap_info_key, oa) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_info_key, oa));
+       LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->oa) == 208, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->oa));
+       LASSERTF((int)offsetof(struct ll_fiemap_info_key, fiemap) == 216, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_info_key, fiemap));
+       LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap));
+
+       /* Checks for struct quota_body */
+       LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n",
+                (long long)(int)sizeof(struct quota_body));
+       LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_fid));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_fid));
+       LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_id));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_id));
+       LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_flags));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_flags));
+       LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_padding));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_padding));
+       LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_count));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_count));
+       LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_usage));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_usage));
+       LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_slv_ver));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver));
+       LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_lockh));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh));
+       LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_glb_lockh));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh));
+       LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_padding1[4]));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4]));
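+       /*
+        * Note on the qb_padding1[4] check above: taking offsetof() of the
+        * element just past the trailing array yields 112, the full size of
+        * struct quota_body, which pins the end of the array to the end of
+        * the structure; the matching sizeof() gives the size of a single
+        * array element (8 bytes).  The name[8], lc_hmac[64] and lk_key[56]
+        * checks further on use the same idiom.
+        */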
+
+       /* Checks for struct mgs_target_info */
+       LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n",
+                (long long)(int)sizeof(struct mgs_target_info));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_config_ver));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_flags));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_nid_count));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_instance));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_fsname));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_svname));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_uuid));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_nids));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_params));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params));
+
+       /* Checks for struct lustre_capa */
+       LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_capa));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_fid));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_opc));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_uid));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_gid));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_flags));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_keyid));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_timeout));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_expiry));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry));
+       CLASSERT(CAPA_HMAC_MAX_LEN == 64);
+       LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_hmac[64]));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]));
+
+       /* Checks for struct lustre_capa_key */
+       LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_capa_key));
+       LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa_key, lk_seq));
+       LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq));
+       LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa_key, lk_keyid));
+       LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid));
+       LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa_key, lk_padding));
+       LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding));
+       CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56);
+       LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa_key, lk_key[56]));
+       LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]));
+
+       /* Checks for struct getinfo_fid2path */
+       LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct getinfo_fid2path));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_fid));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_recno));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0]));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]));
+
+       /* Checks for struct ll_user_fiemap */
+       LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct ll_user_fiemap));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_start));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_length));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_flags));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_extent_count));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_extents));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents));
+       CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001);
+       CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002);
+       CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
+
+       /* Checks for struct ll_fiemap_extent */
+       LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, "found %lld\n",
+                (long long)(int)sizeof(struct ll_fiemap_extent));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
+       CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
+       CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
+       CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
+       CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008);
+       CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
+       CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
+       CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
+       CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
+       CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
+       CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
+       CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000);
+       CLASSERT(FIEMAP_EXTENT_NET == 0x80000000);
+
+       /* Checks for type posix_acl_xattr_entry */
+       LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n",
+                (long long)(int)sizeof(posix_acl_xattr_entry));
+       LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_entry, e_tag));
+       LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag));
+       LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_entry, e_perm));
+       LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm));
+       LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_entry, e_id));
+       LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id));
+
+       /* Checks for type posix_acl_xattr_header */
+       LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n",
+                (long long)(int)sizeof(posix_acl_xattr_header));
+       LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_header, a_version));
+       LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version));
+       LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_header, a_entries));
+       LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries));
+
+       /* Checks for struct link_ea_header */
+       LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct link_ea_header));
+       LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, leh_magic));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic));
+       LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, leh_reccount));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount));
+       LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, leh_len));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len));
+       LASSERTF((int)offsetof(struct link_ea_header, padding1) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, padding1));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->padding1));
+       LASSERTF((int)offsetof(struct link_ea_header, padding2) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, padding2));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->padding2));
+       CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL);
+
+       /* Checks for struct link_ea_entry */
+       LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n",
+                (long long)(int)sizeof(struct link_ea_entry));
+       LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_entry, lee_reclen));
+       LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen));
+       LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid));
+       LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid));
+       LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_entry, lee_name));
+       LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name));
+
+       /* Checks for struct layout_intent */
+       LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct layout_intent));
+       LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_opc));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_opc));
+       LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_flags));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
+       LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_start));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
+       LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_end));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+       LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
+                (long long)LAYOUT_INTENT_ACCESS);
+       LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
+                (long long)LAYOUT_INTENT_READ);
+       LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n",
+                (long long)LAYOUT_INTENT_WRITE);
+       LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n",
+                (long long)LAYOUT_INTENT_GLIMPSE);
+       LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n",
+                (long long)LAYOUT_INTENT_TRUNC);
+       LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n",
+                (long long)LAYOUT_INTENT_RELEASE);
+       LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n",
+                (long long)LAYOUT_INTENT_RESTORE);
+
+       /* Checks for struct hsm_action_item */
+       LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_action_item));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_len));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_action));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_fid));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_dfid));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_extent));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_cookie));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_gid));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_data));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data));
+
+       /* Checks for struct hsm_action_list */
+       LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_action_list));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_version));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_count));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_compound_id));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_flags));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_archive_id));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id));
+       LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, padding1));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_fsname));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname));
+
+       /* Checks for struct hsm_progress */
+       LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_progress));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_fid));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_cookie));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_extent));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_flags));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_errval));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval));
+       LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, padding));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->padding));
+       LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n",
+               HP_FLAG_COMPLETED);
+       LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n",
+               HP_FLAG_RETRY);
+
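+       /* Checks for struct hsm_copy */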
+       LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, hc_data_version));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version));
+       LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, hc_flags));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags));
+       LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, hc_errval));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval));
+       LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, padding));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->padding));
+       LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, hc_hai));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai));
+
+       /* Checks for struct hsm_progress_kernel */
+       LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_progress_kernel));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2));
+
+       /* Checks for struct hsm_user_item */
+       LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_user_item));
+       LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_item, hui_fid));
+       LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid));
+       LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_item, hui_extent));
+       LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent));
+
+       /* Checks for struct hsm_user_state */
+       LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_user_state));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_states));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_archive_id));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location));
+
+       /* Checks for struct hsm_state_set */
+       LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_state_set));
+       LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_state_set, hss_valid));
+       LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid));
+       LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_state_set, hss_archive_id));
+       LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id));
+       LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_state_set, hss_setmask));
+       LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask));
+       LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_state_set, hss_clearmask));
+       LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask));
+
+       /* Checks for struct hsm_current_action */
+       LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_current_action));
+       LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_current_action, hca_state));
+       LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state));
+       LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_current_action, hca_action));
+       LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action));
+       LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_current_action, hca_location));
+       LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location));
+
+       /* Checks for struct hsm_request */
+       LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_request));
+       LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_action));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_action));
+       LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_archive_id));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id));
+       LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_flags));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags));
+       LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_itemcount));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount));
+       LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_data_len));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len));
+       LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n",
+                (unsigned)HSM_FORCE_ACTION);
+       LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n",
+                (unsigned)HSM_GHOST_COPY);
+
+       /* Checks for struct hsm_user_request */
+       LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_user_request));
+       LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_request, hur_request));
+       LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request));
+       LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_request, hur_user_item));
+       LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item));
+
+       /* Checks for struct update_buf */
+       LASSERTF((int)sizeof(struct update_buf) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct update_buf));
+       LASSERTF((int)offsetof(struct update_buf, ub_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct update_buf, ub_magic));
+       LASSERTF((int)sizeof(((struct update_buf *)0)->ub_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update_buf *)0)->ub_magic));
+       LASSERTF((int)offsetof(struct update_buf, ub_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct update_buf, ub_count));
+       LASSERTF((int)sizeof(((struct update_buf *)0)->ub_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update_buf *)0)->ub_count));
+       LASSERTF((int)offsetof(struct update_buf, ub_bufs) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct update_buf, ub_bufs));
+       LASSERTF((int)sizeof(((struct update_buf *)0)->ub_bufs) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct update_buf *)0)->ub_bufs));
+
+       /* Checks for struct update_reply */
+       LASSERTF((int)sizeof(struct update_reply) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct update_reply));
+       LASSERTF((int)offsetof(struct update_reply, ur_version) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct update_reply, ur_version));
+       LASSERTF((int)sizeof(((struct update_reply *)0)->ur_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update_reply *)0)->ur_version));
+       LASSERTF((int)offsetof(struct update_reply, ur_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct update_reply, ur_count));
+       LASSERTF((int)sizeof(((struct update_reply *)0)->ur_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update_reply *)0)->ur_count));
+       LASSERTF((int)offsetof(struct update_reply, ur_lens) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct update_reply, ur_lens));
+       LASSERTF((int)sizeof(((struct update_reply *)0)->ur_lens) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct update_reply *)0)->ur_lens));
+
+       /* Checks for struct update */
+       LASSERTF((int)sizeof(struct update) == 56, "found %lld\n",
+                (long long)(int)sizeof(struct update));
+       LASSERTF((int)offsetof(struct update, u_type) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_type));
+       LASSERTF((int)sizeof(((struct update *)0)->u_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_type));
+       LASSERTF((int)offsetof(struct update, u_batchid) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_batchid));
+       LASSERTF((int)sizeof(((struct update *)0)->u_batchid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_batchid));
+       LASSERTF((int)offsetof(struct update, u_fid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_fid));
+       LASSERTF((int)sizeof(((struct update *)0)->u_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_fid));
+       LASSERTF((int)offsetof(struct update, u_lens) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_lens));
+       LASSERTF((int)sizeof(((struct update *)0)->u_lens) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_lens));
+       LASSERTF((int)offsetof(struct update, u_bufs) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_bufs));
+       LASSERTF((int)sizeof(((struct update *)0)->u_bufs) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_bufs));
+}
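
For context, each pair of LASSERTF() calls above pins down the byte offset and size of one field in an on-wire structure, so that a compiler, architecture, or header change that silently shifts the layout is caught at runtime rather than corrupting RPC traffic between nodes. The following is a minimal standalone sketch of the same offsetof()/sizeof() pattern; the struct "demo_hdr", its field names, and the expected values are hypothetical and only illustrate the technique, they are not part of this patch.

/*
 * Hypothetical illustration of the wire-layout check pattern used above.
 * "demo_hdr" and its expected offsets/sizes are made up for this sketch.
 */
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct demo_hdr {
	uint32_t dh_magic;   /* expected at offset 0, size 4 */
	uint32_t dh_count;   /* expected at offset 4, size 4 */
	uint64_t dh_cookie;  /* expected at offset 8, size 8 */
};

/* Stand-in for Lustre's LASSERTF(): report and count layout mismatches. */
#define CHECKF(cond, fmt, ...)						\
	do {								\
		if (!(cond)) {						\
			fprintf(stderr, "layout check failed: " fmt,	\
				__VA_ARGS__);				\
			errors++;					\
		}							\
	} while (0)

int main(void)
{
	int errors = 0;

	/* Whole-struct size, then per-field offset and size, as above. */
	CHECKF((int)sizeof(struct demo_hdr) == 16, "found %lld\n",
	       (long long)(int)sizeof(struct demo_hdr));
	CHECKF((int)offsetof(struct demo_hdr, dh_cookie) == 8, "found %lld\n",
	       (long long)(int)offsetof(struct demo_hdr, dh_cookie));
	CHECKF((int)sizeof(((struct demo_hdr *)0)->dh_cookie) == 8, "found %lld\n",
	       (long long)(int)sizeof(((struct demo_hdr *)0)->dh_cookie));

	return errors ? 1 : 0;
}

Because the expected constants are fixed at the time the wire protocol is defined, any padding or type change that moves a field shows up immediately as a failed check instead of as an interoperability bug between differently built clients and servers.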